Linux并行集群的搭建(3)


7.更改队列batch部分属性,以满足实际需求
[root@admin torque-3.0.6]# qmgr -c "s q batch resources_default.walltime=24:00:00"
[root@admin torque-3.0.6]# qmgr -c "s s query_other_jobs=true"


8.建立mom配置文件,用于复制到所有计算节点
[root@admin mom_priv]# pwd
/var/spool/torque/mom_priv
[root@admin mom_priv]# cat config
$pbsserver    admin
$logevent      225


9.创建节点信息文件
[root@admin server_priv]# pwd
/var/spool/torque/server_priv
[root@admin server_priv]# cat nodes
node1
node2
[root@admin server_priv]#


10.查看目前节点信息均为down状态
[root@admin server_priv]# pbsnodes -a
node1
    state = down
    np = 1
    ntype = cluster
    mom_service_port = 15002
    mom_admin_port = 15003
    gpus = 0
 
node2
    state = down
    np = 1
    ntype = cluster
    mom_service_port = 15002
    mom_admin_port = 15003
    gpus = 0
[root@admin server_priv]#


11.复制pbs_server启动脚本,并设置开机自动启动
[root@admin torque-3.0.6]# pwd
/share/apps/torque-3.0.6
[root@admin torque-3.0.6]# cp contrib/init.d/pbs_server /etc/init.d/
[root@admin torque-3.0.6]# chmod 755 /etc/init.d/pbs_server
[root@admin torque-3.0.6]# chkconfig pbs_server on


12.复制pbs_mom脚本,方便复制到计算节点
[root@admin torque-3.0.6]# cp contrib/init.d/pbs_mom /etc/init.d/
 
13.安装maui
[root@admin parallel]# tar xzvf maui-3.3.1.tar.gz -C /usr/local/src/
[root@admin ~]# cd /usr/local/src/maui-3.3.1/
[root@admin maui-3.3.1]# ./configure --prefix=/usr/local/maui --with-pbs=/usr/local
[root@admin maui-3.3.1]# make
[root@admin maui-3.3.1]# make install


14.复制maui启动脚本,设置正确路径,并设置为开机启动
[root@admin maui-3.3.1]# cp etc/maui.d /etc/init.d/mauid
[root@admin maui-3.3.1]# vi /etc/init.d/mauid
将 MAUI_PREFIX=/opt/maui 更改为 MAUI_PREFIX=/usr/local/maui
[root@admin maui-3.3.1]# chmod 755 /etc/init.d/mauid
[root@admin maui-3.3.1]# chkconfig mauid on


15.启动maui调度服务
[root@admin maui-3.3.1]# /etc/init.d/mauid start
Starting MAUI Scheduler:                                  [  OK  ]
[root@admin maui-3.3.1]#


16.添加maui命令环境变量
[root@admin maui-3.3.1]# vi /etc/bashrc
export PATH=/share/apps/openmpi/bin:/usr/local/maui/bin:$PATH
[root@admin maui-3.3.1]# source /etc/bashrc


17.安装并行软件到共享目录
[root@admin namd]# tar xzvf NAMD_2.9_Linux-x86_64-multicore.tar.gz -C /share/apps/
[root@admin namd]# tar xzvf apoa1.tar.gz -C /share/apps/
[root@admin apps]# pwd
/share/apps
[root@admin apps]# mv NAMD_2.9_Linux-x86_64-multicore/ namd


18.添加namd命令环境变量,同时也添加到Path.sh方便计算节点添加环境变量
[root@admin maui-3.3.1]# vi /etc/bashrc
export PATH=/share/apps/openmpi/bin:/usr/local/maui/bin:/share/apps/namd:$PATH
[root@admin maui-3.3.1]# source /etc/bashrc
[root@admin scripts]# which namd2
/share/apps/namd/namd2
[root@admin scripts]# cat Path.sh
#!/bin/bash
grep openmpi /etc/bashrc || cat >>/etc/bashrc <<EOF
export PATH=/share/apps/openmpi/bin:/share/apps/namd:\$PATH
EOF
[root@admin scripts]#
至此管理端配置完成
 
三:计算节点配置torque
1.计算节点安装torque
[root@admin ~]#for i in 1 2; do ssh node$i sh /share/source/torque-3.0.6/install.sh; done


2.复制mom配置文件到计算节点

[root@admin ~]# for i in 1 2; do scp /var/spool/torque/mom_priv/config node$i:/var/spool/torque/mom_priv/; done


3.复制mom启动脚本到计算节点,启动pbs_mom服务,并设置开机启动
[root@admin ~]# for i in 1 2; do scp /etc/init.d/pbs_mom node$i:/etc/init.d/; done
[root@admin ~]# for i in 1 2; do ssh node$i /etc/init.d/pbs_mom start; done
Starting TORQUE Mom: [  OK  ]
Starting TORQUE Mom: [  OK  ]
[root@admin ~]# for i in 1 2; do ssh node$i chkconfig pbs_mom on; done


4.设置环境变量
[root@admin ~]#for i in 1 2; do ssh node$i sh /share/scripts/Path.sh; done


5.测试环境变量设置是否正确
[root@admin ~]#for i in 1 2; do ssh node$i which mpirun; done
/share/apps/openmpi/bin/mpirun
/share/apps/openmpi/bin/mpirun
[root@admin ~]#for i in 1 2; do ssh node$i which namd2; done
/share/apps/namd/namd2
/share/apps/namd/namd2
[root@admin ~]#


6.此时再观察计算节点状态,已经变成free了,即可以提交任务到计算节点了
[root@admin apps]# pbsnodes -a
node1
    state = free
    np = 1
    ntype = cluster
    status = rectime=1408751492,varattr=,jobs=,state=free,netload=12996103,gres=,loadave=0.01,ncpus=1,physmem=1024932kb,availmem=2082428kb,totmem=2165536kb,idletime=0,nusers=0,nsessions=0,uname=Linux node1 2.6.18-371.el5 #1 SMP Tue Oct 1 08:35:08 EDT 2013 x86_64,opsys=linux
    mom_service_port = 15002
    mom_admin_port = 15003
    gpus = 0
 
node2
    state = free
    np = 1
    ntype = cluster
    status = rectime=1408751482,varattr=,jobs=,state=free,netload=12983275,gres=,loadave=0.03,ncpus=1,physmem=1024932kb,availmem=2082444kb,totmem=2165536kb,idletime=0,nusers=0,nsessions=0,uname=Linux node2 2.6.18-371.el5 #1 SMP Tue Oct 1 08:35:08 EDT 2013 x86_64,opsys=linux
    mom_service_port = 15002
    mom_admin_port = 15003
    gpus = 0
 
[root@admin apps]#
 
 
四:验证并行集群是否搭建成功
1.在管理节点上以建立的linuxidc用户登录,首先设置节点间无密码互访,操作和root用户一样,只是不需要复制.ssh目录


2.复制namd测试算例apoa1到当前目录下
[linuxidc@admin ~]$ cp -r /share/apps/apoa1/ ./

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://www.heiqu.com/3d48d476a41627ab24aebc9a71d239bc.html