rpm -e --allmatches --nodeps kmod-mlnx-ofa_kernel-xen-1.5.3-OFED.1.5.3.4.0.42.g3cb72fe.rhel5u8 libnes-1.1.1mlnx1-1 libcxgb3-1.3.1-1 libmverbs-0.1.0-3.15.gd28970e libibmad-1.3.8.MLNX_20120424-0.1 libmthca-1.0.6mlnx1-0.1.gbe5eef3 libibumad-1.3.7.MLNX_20130110_ff06102-0.1 libibverbs-1.1.5mlnx2-1 libmlx4-1.0.2mlnx6-1 librdmacm-1.0.15-1 kernel-mft-2.7.1-2.6.18_308.el5 libmverbs-0.1.0-3.15.gd28970e libipathverbs-1.2mlnx1-1 libibmad-1.3.8.MLNX_20120424-0.1 mlnx-ofa_kernel-1.5.3-OFED.1.5.3.4.0.42.g3cb72fe.rhel5u8 libibverbs-utils-1.1.5mlnx2-1 libcxgb3-1.3.1-1 mstflint-1.4mlnx4-1.21.gd948ddd libmlx4-1.0.2mlnx6-1 librdmacm-1.0.15-1 libmthca-1.0.6mlnx1-0.1.gbe5eef3 libibumad-1.3.7.MLNX_20130110_ff06102-0.1 libibverbs-1.1.5mlnx2-1 librdmacm-utils-1.0.15-1 mlnxofed-docs-1.5.3-4.0.42 libipathverbs-1.2mlnx1-1 kmod-mlnx-ofa_kernel-1.5.3-OFED.1.5.3.4.0.42.g3cb72fe.rhel5u8 libnes-1.1.1mlnx1-1 kernel-mft-2.7.1-2.6.18_308.el5 ofed-scripts-1.5.3-OFED.1.5.3.4.0.42 mft-2.7.1a-1
Uninstall finished successfully
[root@node33 ~]# rm -rf /etc/infiniband
[root@node33 ~]#
4 排错
4.1 查看IB工作状态
[root@node33 ~]# ibstat
CA 'mlx4_0'
CA type: MT26428
Number of ports: 1
Firmware version: 2.9.1000
Hardware version: b0
Node GUID: 0x0002c903000cc00e
System image GUID: 0x0002c903000cc011
Port 1:
State: Active
Physical state: LinkUp
Rate: 40
Base lid: 1
LMC: 0
SM lid: 1
Capability mask: 0x0251086a
Port GUID: 0x0002c903000cc00f
Link layer: InfiniBand
[root@node33 ~]#
4.2 查看hosts信息
[root@node33 ~]# ibhosts
Ca :0x0002c903000cc00a ports 1 "node34 HCA-1"
Ca :0x0002c903000cc00e ports 1 "node33 HCA-1"
[root@node33 ~]#
4.3 查看switch信息
[root@node33 ~]# ibswitches
Switch :0x0002c9020042bcc0 ports 36 "MF0;switch-1140a2:IS5030/U1" enhanced port 0 lid 4 lmc 0
[root@node33 ~]#
4.4 查看拓扑信息
[root@node33 ~]# ibnetdiscover
#
# Topology file: generated on Sun Mar 8 19:53:35 2015
#
# Initiated from node 0002c903000cc00e port 0002c903000cc00f
vendid=0x2c9
devid=0xbd36
sysimgguid=0x2c9020042bcc3
switchguid=0x2c9020042bcc0(2c9020042bcc0)
Switch 36 "S-0002c9020042bcc0" # "MF0;switch-1140a2:IS5030/U1" enhanced port 0 lid 4 lmc 0
[30] "H-0002c903000cc00e"[1](2c903000cc00f) # "node33 HCA-1" lid 1 4xQDR
[31] "H-0002c903000cc00a"[1](2c903000cc00b) # "node34 HCA-1" lid 7 4xQDR
vendid=0x2c9
devid=0x673c
sysimgguid=0x2c903000cc00d
caguid=0x2c903000cc00a
Ca 1 "H-0002c903000cc00a" # "node34 HCA-1"
[1](2c903000cc00b) "S-0002c9020042bcc0"[31] # lid 7 lmc 0 "MF0;switch-1140a2:IS5030/U1" lid 4 4xQDR
vendid=0x2c9
devid=0x673c
sysimgguid=0x2c903000cc011
caguid=0x2c903000cc00e
Ca 1 "H-0002c903000cc00e" # "node33 HCA-1"
[1](2c903000cc00f) "S-0002c9020042bcc0"[30] # lid 1 lmc 0 "MF0;switch-1140a2:IS5030/U1" lid 4 4xQDR
[root@node33 ~]#
4.5 查看报错统计信息
[root@node33 ~]# ibdiagnet -Pall=1
Loading IBDIAGNET from: /opt/ibutils/lib64/ibdiagnet1.5.7
-W- Topology file is not specified.
Reports regarding cluster links will use direct routes.
Loading IBDM from: /opt/ibutils/lib64/ibdm1.5.7
-I- Using port 1 as the local port.
-I- Discovering ... 3 nodes (1 Switches & 2 CA-s) discovered.
-I---------------------------------------------------
-I- Bad Guids/LIDs Info
-I---------------------------------------------------
-I- No bad Guids were found
-I---------------------------------------------------
-I- Links With Logical State = INIT
-I---------------------------------------------------
-I- No bad Links (with logical state = INIT) were found
-I---------------------------------------------------
-I- General Device Info
-I---------------------------------------------------
-I---------------------------------------------------
-I- PM Counters Info
-I---------------------------------------------------
-I- No illegal PM counters values were found
-I---------------------------------------------------
-I- Fabric Partitions Report (see ibdiagnet.pkey for a full hosts list)
-I---------------------------------------------------
-I- PKey:0x7fff Hosts:2 full:2 limited:0
-I---------------------------------------------------
-I- IPoIB Subnets Check
-I---------------------------------------------------
-I- Subnet: IPv4 PKey:0x7fff QKey:0x00000b1b MTU:2048Byte rate:10Gbps SL:0x00
-W- Suboptimal rate for group. Lowest member rate:40Gbps > group-rate:10Gbps
-I---------------------------------------------------
-I- Bad Links Info
-I- No bad link were found
-I---------------------------------------------------
----------------------------------------------------------------
-I- Stages Status Report:
STAGE Errors Warnings
Bad GUIDs/LIDs Check 0 0
Link State Active Check 0 0
General Devices Info Report 0 0
Performance Counters Report 0 0
Partitions Check 0 0
IPoIB Subnets Check 0 1
Please see /tmp/ibdiagnet.log for complete log
----------------------------------------------------------------
-I- Done. Run time was 1 seconds.
[root@node33 ~]#
4.6 查看全局详细报错信息
[root@node33 ~]# ibqueryerrors
Errors for 0x2c9020042bcc0 "MF0;switch-1140a2:IS5030/U1"
GUID 0x2c9020042bcc0 port ALL: [PortRcvSwitchRelayErrors == 64] [PortXmitDiscards == 29] [PortXmitWait == 240663]
GUID 0x2c9020042bcc0 port 0: [PortXmitWait == 1232]
GUID 0x2c9020042bcc0 port 1: [PortRcvSwitchRelayErrors == 2] [PortXmitDiscards == 3]
GUID 0x2c9020042bcc0 port 2: [PortRcvSwitchRelayErrors == 3] [PortXmitDiscards == 3]
GUID 0x2c9020042bcc0 port 3: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 3]
GUID 0x2c9020042bcc0 port 4: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 1]
GUID 0x2c9020042bcc0 port 5: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 2]
GUID 0x2c9020042bcc0 port 6: [PortRcvSwitchRelayErrors == 2] [PortXmitDiscards == 3]
GUID 0x2c9020042bcc0 port 7: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 2]
GUID 0x2c9020042bcc0 port 8: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 2]
GUID 0x2c9020042bcc0 port 9: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 2]
GUID 0x2c9020042bcc0 port 10: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 2]
GUID 0x2c9020042bcc0 port 11: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 2]
GUID 0x2c9020042bcc0 port 12: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 2]
GUID 0x2c9020042bcc0 port 13: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 1]
GUID 0x2c9020042bcc0 port 14: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 1]
GUID 0x2c9020042bcc0 port 30: [PortXmitWait == 4294967295]
GUID 0x2c9020042bcc0 port 31: [PortRcvSwitchRelayErrors == 46] [PortXmitWait == 295]
GUID 0x2c9020042bcc0 port 34: [PortXmitWait == 892]
GUID 0x2c9020042bcc0 port 36: [PortXmitWait == 238245]
## Summary: 17 nodes checked, 1 bad nodes found
## 53 ports checked, 19 ports have errors beyond threshold