RHEL5.8 下Infiniband驱动安装(3)

rpm -e --allmatches --nodeps kmod-mlnx-ofa_kernel-xen-1.5.3-OFED.1.5.3.4.0.42.g3cb72fe.rhel5u8 libnes-1.1.1mlnx1-1 libcxgb3-1.3.1-1 libmverbs-0.1.0-3.15.gd28970e libibmad-1.3.8.MLNX_20120424-0.1 libmthca-1.0.6mlnx1-0.1.gbe5eef3 libibumad-1.3.7.MLNX_20130110_ff06102-0.1 libibverbs-1.1.5mlnx2-1 libmlx4-1.0.2mlnx6-1 librdmacm-1.0.15-1 kernel-mft-2.7.1-2.6.18_308.el5 libmverbs-0.1.0-3.15.gd28970e libipathverbs-1.2mlnx1-1 libibmad-1.3.8.MLNX_20120424-0.1 mlnx-ofa_kernel-1.5.3-OFED.1.5.3.4.0.42.g3cb72fe.rhel5u8 libibverbs-utils-1.1.5mlnx2-1 libcxgb3-1.3.1-1 mstflint-1.4mlnx4-1.21.gd948ddd libmlx4-1.0.2mlnx6-1 librdmacm-1.0.15-1 libmthca-1.0.6mlnx1-0.1.gbe5eef3 libibumad-1.3.7.MLNX_20130110_ff06102-0.1 libibverbs-1.1.5mlnx2-1 librdmacm-utils-1.0.15-1 mlnxofed-docs-1.5.3-4.0.42 libipathverbs-1.2mlnx1-1 kmod-mlnx-ofa_kernel-1.5.3-OFED.1.5.3.4.0.42.g3cb72fe.rhel5u8 libnes-1.1.1mlnx1-1 kernel-mft-2.7.1-2.6.18_308.el5 ofed-scripts-1.5.3-OFED.1.5.3.4.0.42 mft-2.7.1a-1

Uninstall finished successfully

[root@node33 ~]# rm -rf /etc/infiniband

[root@node33 ~]#

4      排错

4.1  查看IB工作状态

[root@node33 ~]# ibstat

CA 'mlx4_0'

CA type: MT26428

Number of ports: 1

Firmware version: 2.9.1000

Hardware version: b0

Node GUID: 0x0002c903000cc00e

System image GUID: 0x0002c903000cc011

Port 1:

State: Active

Physical state: LinkUp

Rate: 40

Base lid: 1

LMC: 0

SM lid: 1

Capability mask: 0x0251086a

Port GUID: 0x0002c903000cc00f

Link layer: InfiniBand

[root@node33 ~]#

4.2  查看hosts信息

[root@node33 ~]# ibhosts

Ca    :0x0002c903000cc00a ports 1 "node34 HCA-1"

Ca    :0x0002c903000cc00e ports 1 "node33 HCA-1"

[root@node33 ~]#

4.3  查看switch信息

[root@node33 ~]# ibswitches

Switch      :0x0002c9020042bcc0 ports 36 "MF0;switch-1140a2:IS5030/U1" enhanced port 0 lid 4 lmc 0

[root@node33 ~]#

4.4  查看拓扑信息

[root@node33 ~]# ibnetdiscover

#

# Topology file: generated on Sun Mar  8 19:53:35 2015

#

# Initiated from node 0002c903000cc00e port 0002c903000cc00f

vendid=0x2c9

devid=0xbd36

sysimgguid=0x2c9020042bcc3

switchguid=0x2c9020042bcc0(2c9020042bcc0)

Switch      36 "S-0002c9020042bcc0"                # "MF0;switch-1140a2:IS5030/U1" enhanced port 0 lid 4 lmc 0

[30]  "H-0002c903000cc00e"[1](2c903000cc00f)          # "node33 HCA-1" lid 1 4xQDR

[31]  "H-0002c903000cc00a"[1](2c903000cc00b)                  # "node34 HCA-1" lid 7 4xQDR

vendid=0x2c9

devid=0x673c

sysimgguid=0x2c903000cc00d

caguid=0x2c903000cc00a

Ca    1 "H-0002c903000cc00a"                # "node34 HCA-1"

[1](2c903000cc00b)        "S-0002c9020042bcc0"[31]              # lid 7 lmc 0 "MF0;switch-1140a2:IS5030/U1" lid 4 4xQDR

vendid=0x2c9

devid=0x673c

sysimgguid=0x2c903000cc011

caguid=0x2c903000cc00e

Ca    1 "H-0002c903000cc00e"                # "node33 HCA-1"

[1](2c903000cc00f)        "S-0002c9020042bcc0"[30]              # lid 1 lmc 0 "MF0;switch-1140a2:IS5030/U1" lid 4 4xQDR

[root@node33 ~]#

4.5  查看报错统计信息

[root@node33 ~]# ibdiagnet -Pall=1

Loading IBDIAGNET from: /opt/ibutils/lib64/ibdiagnet1.5.7

-W- Topology file is not specified.

Reports regarding cluster links will use direct routes.

Loading IBDM from: /opt/ibutils/lib64/ibdm1.5.7

-I- Using port 1 as the local port.

-I- Discovering ... 3 nodes (1 Switches & 2 CA-s) discovered.

-I---------------------------------------------------

-I- Bad Guids/LIDs Info

-I---------------------------------------------------

-I- No bad Guids were found

-I---------------------------------------------------

-I- Links With Logical State = INIT

-I---------------------------------------------------

-I- No bad Links (with logical state = INIT) were found

-I---------------------------------------------------

-I- General Device Info

-I---------------------------------------------------

-I---------------------------------------------------

-I- PM Counters Info

-I---------------------------------------------------

-I- No illegal PM counters values were found

-I---------------------------------------------------

-I- Fabric Partitions Report (see ibdiagnet.pkey for a full hosts list)

-I---------------------------------------------------

-I-  PKey:0x7fff Hosts:2 full:2 limited:0

-I---------------------------------------------------

-I- IPoIB Subnets Check

-I---------------------------------------------------

-I- Subnet: IPv4 PKey:0x7fff QKey:0x00000b1b MTU:2048Byte rate:10Gbps SL:0x00

-W- Suboptimal rate for group. Lowest member rate:40Gbps > group-rate:10Gbps

-I---------------------------------------------------

-I- Bad Links Info

-I- No bad link were found

-I---------------------------------------------------

----------------------------------------------------------------

-I- Stages Status Report:

STAGE                                    Errors Warnings

Bad GUIDs/LIDs Check                    0      0

Link State Active Check                0      0

General Devices Info Report            0      0

Performance Counters Report            0      0

Partitions Check                        0      0

IPoIB Subnets Check                    0      1

Please see /tmp/ibdiagnet.log for complete log

----------------------------------------------------------------

-I- Done. Run time was 1 seconds.

[root@node33 ~]#

4.6  查看全局详细报错信息

[root@node33 ~]# ibqueryerrors

Errors for 0x2c9020042bcc0 "MF0;switch-1140a2:IS5030/U1"

GUID 0x2c9020042bcc0 port ALL: [PortRcvSwitchRelayErrors == 64] [PortXmitDiscards == 29] [PortXmitWait == 240663]

GUID 0x2c9020042bcc0 port 0: [PortXmitWait == 1232]

GUID 0x2c9020042bcc0 port 1: [PortRcvSwitchRelayErrors == 2] [PortXmitDiscards == 3]

GUID 0x2c9020042bcc0 port 2: [PortRcvSwitchRelayErrors == 3] [PortXmitDiscards == 3]

GUID 0x2c9020042bcc0 port 3: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 3]

GUID 0x2c9020042bcc0 port 4: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 1]

GUID 0x2c9020042bcc0 port 5: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 2]

GUID 0x2c9020042bcc0 port 6: [PortRcvSwitchRelayErrors == 2] [PortXmitDiscards == 3]

GUID 0x2c9020042bcc0 port 7: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 2]

GUID 0x2c9020042bcc0 port 8: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 2]

GUID 0x2c9020042bcc0 port 9: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 2]

GUID 0x2c9020042bcc0 port 10: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 2]

GUID 0x2c9020042bcc0 port 11: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 2]

GUID 0x2c9020042bcc0 port 12: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 2]

GUID 0x2c9020042bcc0 port 13: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 1]

GUID 0x2c9020042bcc0 port 14: [PortRcvSwitchRelayErrors == 1] [PortXmitDiscards == 1]

GUID 0x2c9020042bcc0 port 30: [PortXmitWait == 4294967295]

GUID 0x2c9020042bcc0 port 31: [PortRcvSwitchRelayErrors == 46] [PortXmitWait == 295]

GUID 0x2c9020042bcc0 port 34: [PortXmitWait == 892]

GUID 0x2c9020042bcc0 port 36: [PortXmitWait == 238245]

## Summary: 17 nodes checked, 1 bad nodes found

##          53 ports checked, 19 ports have errors beyond threshold

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://www.heiqu.com/16646.html