2、在nagios服务器端添加host等监控信息。
2.1 在hosts.cfg里面添加主机信息
define host{
use linux-server
host_name webserver
alias webserver
address 10.xx.xx.10
check_command check-host-alive
max_check_attempts 5
check_period 24x7
contact_groups ops
notification_interval 60
notification_period 24x7
notification_options d,u,r
}
2.2 在service.cfg里面添加web机器监控的命令信息
# No.007 webserver
# service definition
define service{
host_name webserver
service_description check_load
check_command check_nrpe!check_load
max_check_attempts 5
normal_check_interval 3
retry_check_interval 2
check_period 24x7
notification_interval 10
notification_period 24x7
notification_options w,u,c,r
contact_groups opsweb
}
define service{
host_name webserver
service_description check-host-alive
check_command check-host-alive
max_check_attempts 5
normal_check_interval 3
retry_check_interval 2
check_period 24x7
notification_interval 10
notification_period 24x7
notification_options w,u,c,r
contact_groups opsweb
}
define service{
host_name webserver
service_description Check Disk sda1
check_command check_nrpe!check_sda1
max_check_attempts 5
normal_check_interval 3
retry_check_interval 2
check_period 24x7
notification_interval 10
notification_period 24x7
notification_options w,u,c,r
contact_groups opsweb
}
define service{
host_name webserver
service_description Total Processes
check_command check_nrpe!check_total_procs
max_check_attempts 5
normal_check_interval 3
retry_check_interval 2
check_period 24x7
notification_interval 10
notification_period 24x7
notification_options w,u,c,r
contact_groups opsweb
}
define service{
host_name webserver
service_description Current Users
check_command check_nrpe!check_users
max_check_attempts 5
normal_check_interval 3
retry_check_interval 2
check_period 24x7
notification_interval 10
notification_period 24x7
notification_options w,u,c,r
contact_groups opsweb
}
define service{
host_name webserver
service_description Check Zombie Procs
check_command check_nrpe!check_zombie_procs
max_check_attempts 5
normal_check_interval 3
retry_check_interval 2
check_period 24x7
notification_interval 10
notification_period 24x7
notification_options w,u,c,r
contact_groups opsweb
}
define service{
host_name webserver
service_description Check Tomcat 9300 Status
check_command check_nrpe!check_tomcat_9300_status
max_check_attempts 5
normal_check_interval 3
retry_check_interval 2
check_period 24x7
notification_interval 10
notification_period 24x7
notification_options w,u,c,r
contact_groups opsweb
}
2.3 在contacts.cfg里面添加新的opsweb邮件组信息
define contactgroup{
contactgroup_name opsweb
alias pl ops team
members tim,mch,nagiosadmin
}
2.4 添加新的监控tomcat的命令,check_tomcat_9300_status
这里不采用check_tcp!8080端口的方式,是因为在实际中tomcat服务假死之后,jsp的网页都是打不开的,但是这个监控端口8080都是正常的,不会报警出来;所以采用check_http的方式,新建立一个通用的/nagios_test_0611/nagios_test_0611.jsp文件,来检测这个jsp的访问情况,如下所示:
vim commands.cfg
# add by tim on 20140611
define command{
command_name check_tomcat_9300_status
command_line $USER1$/check_http -I $HOSTADDRESS$ -p $PORT$ -u $URL$ -e $N200$ -w $Warning$ -c$Cri$
}
Jsp文件内容如下:
[root@webserver webapps]# vim ./nagios_test_0611/nagios_test_0611.jsp
<%@ page language="java" contentType="text/html; charset=gb2312"
pageEncoding="gb2312"%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">
<title>nagios test here</title>
</head>
<body>
<center>Now time is: <%=new java.util.Date()%></center>
</body>
</html>
2.5 在被监控客户端的nrpe.cfg配置文件里面添加tomcat端口配置信息:
command[check_tomcat_9300_status]=/usr/local/nagios/libexec/check_http -I 10.xx.xx.10 -p 9444 -u /nagios_test_0611/nagios_test_0611.jsp -e 200 -w 5 -c 10
command[check_tomcat_8300_status]=/usr/local/nagios/libexec/check_http -I 10.xx.xx.10 -p 8300 -u /nagios_test_0611/nagios_test_0611.jsp -e 200 -w 5 -c 10
2.6 测试报错
[root@cache-2 objects]# /usr/local/nagios/libexec/check_nrpe -H 10.xx.xx.10 -c check_load
NRPE: Unable to read output
[root@cache-2 objects]#
已经添加了tomcat9300端口,现在再添加一个tomcat8300端口
去服务器端shell命令行里面check下
/usr/local/nagios/libexec/check_nrpe -H 192.168.15.178 -c check_mysql_myisam_lock
[root@cache-2 etc]# /usr/local/nagios/libexec/check_nrpe -H 10.xx.xx.10 -c check_load
NRPE: Unable to read output
[root@cache-2 etc]#
同样报错,那么可能就是nagios被监控端的问题。
最终检查发现是nrpe.cfg里面的插件路径有误:源码安装的默认插件路径是/usr/local/nagios/libexec/,rpm安装的默认插件路径是/usr/lib/nagios/plugins/。这里是rpm安装,所以把nrpe.cfg配置文件里面的/usr/local/nagios/libexec/替换为rpm的路径/usr/lib/nagios/plugins/,再执行service nrpe restart重启nrpe之后,问题解决,如下图所示: