nagios客户端的安装
下载nagios-plugin和nrpe插件
[root@client1 ~]# mkdir /software/
[root@client1 ~]# cd /software/
[root@client1 software]# wget https://nagios-plugins.org/download/nagios-plugins-2.2.1.tar.gz
[root@client1 software]# wget https://sourceforge.net/projects/nagios/files/nrpe-3.x/nrpe-3.1.0.tar.gz
安装nagios-plugin
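安装依赖包(原文此处未给出命令;编译nagios-plugins和nrpe通常至少需要编译工具和openssl开发库,例如:yum install -y gcc glibc glibc-common make openssl openssl-devel perl)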
创建用户
[root@client1 software]# useradd nagios -M -s /sbin/nologin
解压nagios-plugin源码包
[root@client1 software]# tar zxvf nagios-plugins-2.2.1.tar.gz
进入解压后的目录进行配置
[root@client1 software]# cd nagios-plugins-2.2.1/
[root@client1 nagios-plugins-2.2.1]# ./configure --with-nagios-user=nagios --with-nagios-group=nagios --enable-perl-modules
编译及安装
[root@client1 nagios-plugins-2.2.1]# make && make install
安装nrpe插件
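解压nrpe源码包
[root@client1 nagios-plugins-2.2.1]# cd /software/
[root@client1 software]# tar zxvf nrpe-3.1.0.tar.gz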
进入解压后的目录进行配置
[root@client1 software]# cd nrpe-3.1.0/
[root@client1 nrpe-3.1.0]# ./configure
编译及安装
[root@client1 nrpe-3.1.0]# make all
[root@client1 nrpe-3.1.0]# make install-plugin
[root@client1 nrpe-3.1.0]# make install-daemon
[root@client1 nrpe-3.1.0]# make install-daemon-config
[root@client1 nrpe-3.1.0]# mkdir /usr/local/nagios/etc/
[root@client1 nrpe-3.1.0]# cp sample-config/nrpe.cfg /usr/local/nagios/etc/nrpe.cfg
安装完成后,查看下libexec下面是否有插件
[root@client1 nrpe-3.1.0]# ls /usr/local/nagios/libexec/
check_apt check_dummy check_imap check_nagios check_overcr check_ssh negate
check_breeze check_file_age check_ircd check_nntp check_ping check_ssmtp urlize
check_by_ssh check_flexlm check_jabber check_nntps check_pop check_swap utils.pm
check_clamd check_fping check_load check_nrpe check_procs check_tcp utils.sh
check_cluster check_ftp check_log check_nt check_real check_time
check_dhcp check_http check_mailq check_ntp check_rpc check_udp
check_dig check_icmp check_mrtg check_ntp_peer check_sensors check_ups
check_disk check_ide_smart check_mrtgtraf check_ntp_time check_simap check_uptime
check_disk_smb check_ifoperstatus check_mysql check_nwstat check_smtp check_users
check_dns check_ifstatus check_mysql_query check_oracle check_spop check_wave
启动nrpe,并测试服务端本地是否可以连通
[root@client1 nrpe-3.1.0]# /usr/local/nagios/bin/nrpe -d -c /usr/local/nagios/etc/nrpe.cfg
[root@client1 nrpe-3.1.0]# echo "/usr/local/nagios/bin/nrpe -d -c /usr/local/nagios/etc/nrpe.cfg" >> /etc/rc.local
[root@client1 nrpe-3.1.0]# chmod +x /etc/rc.d/rc.local   # CentOS 7下需要这一步, 不然/etc/rc.local中的内容开机可能不执行
[root@client1 nrpe-3.1.0]# netstat -lnput|grep 5666
tcp 0 0 0.0.0.0:5666 0.0.0.0:* LISTEN 28296/nrpe
tcp6 0 0 :::5666 :::* LISTEN 28296/nrpe
[root@client1 nrpe-3.1.0]# /usr/local/nagios/libexec/check_nrpe -H localhost
NRPE v3.1.0-rc1
修改配置文件
[root@client1 nrpe-3.1.0]# cd /usr/local/nagios/etc/
[root@client1 etc]# vi nrpe.cfg
允许服务端IP和本机访问,172.16.0.18是nagios服务端IP地址
allowed_hosts=127.0.0.1,::1
===> 修改为
allowed_hosts=127.0.0.1,::1,172.16.0.18
注释下面几行内容
command[check_users]=/usr/local/nagios/libexec/check_users -w 5 -c 10
command[check_load]=/usr/local/nagios/libexec/check_load -r -w .15,.10,.05 -c .30,.25,.20
command[check_hda1]=/usr/local/nagios/libexec/check_disk -w 20% -c 10% -p /dev/hda1
command[check_zombie_procs]=/usr/local/nagios/libexec/check_procs -w 5 -c 10 -s Z
command[check_total_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 200
在nrpe.cfg文件末尾增加下面几行内容
# my custom monitor items
command[check_users]=/usr/local/nagios/libexec/check_users -w 5 -c 10
command[check_load]=/usr/local/nagios/libexec/check_load -r -w .15,.10,.05 -c .30,.25,.20
command[check_disk]=/usr/local/nagios/libexec/check_disk -w 20% -c 10% -p /
command[check_mem]=/usr/local/nagios/libexec/check_mem.pl -w 90% -c 95%
command[check_swap]=/usr/local/nagios/libexec/check_swap -w 20% -c 10%
创建一个监控内存的perl脚本
[root@client1 etc]# vi /usr/local/nagios/libexec/check_mem.pl
添加下面内容
#! /usr/bin/perl -w
#
# $Id: check_mem.pl 8 2008-08-23 08:59:52Z rhomann $
#
# check_mem v1.7 plugin for nagios
#
# uses the output of `free` to find the percentage of memory used
#
# Usage: check_mem -w <MemWarn>[,<SwapWarn>] -c <MemCrit>[,<SwapCrit>] [-v] [-h]
# Thresholds are percentages; a trailing '%' is accepted. A threshold of 0
# disables that check. Exit status follows %ERRORS from the Nagios utils.pm.
#
# Copyright Notice: GPL
#
# History:
# v1.8 Rouven Homann - rouven.homann@cimt.de
# + added findbin patch from Duane Toler
# + added backward compatibility patch from Timour Ezeev
#
# v1.7 Ingo Lantschner - ingo AT boxbe DOT com
# + adapted for systems with no swap (avoiding divison through 0)
#
# v1.6 Cedric Temple - cedric DOT temple AT cedrictemple DOT info
# + add swap monitoring
# + if warning and critical threshold are 0, exit with OK
# + add a directive to exclude/include buffers
#
# v1.5 Rouven Homann - rouven.homann@cimt.de
# + perfomance tweak with free -mt (just one sub process started instead of 7)
# + more code cleanup
#
# v1.4 Garrett Honeycutt - gh@3gupload.com
# + Fixed PerfData output to adhere to standards and show crit/warn values
#
# v1.3 Rouven Homann - rouven.homann@cimt.de
# + Memory installed, used and free displayed in verbose mode
# + Bit Code Cleanup
#
# v1.2 Rouven Homann - rouven.homann@cimt.de
# + Bug fixed where verbose output was required (nrpe2)
# + Bug fixed where perfomance data was not displayed at verbose output
# + FindBin Module used for the nagios plugin path of the utils.pm
#
# v1.1 Rouven Homann - rouven.homann@cimt.de
# + Status Support (-c, -w)
# + Syntax Help Informations (-h)
# + Version Informations Output (-V)
# + Verbose Output (-v)
# + Better Error Code Output (as described in plugin guideline)
#
# v1.0 Garrett Honeycutt - gh@3gupload.com
# + Initial Release
#
use strict;
use FindBin;
FindBin::again();
# utils.pm is looked up in the directory this script is installed in.
use lib $FindBin::Bin;
use utils qw($TIMEOUT %ERRORS &print_revision &support);
use vars qw($PROGNAME $PROGVER);
use Getopt::Long;
use vars qw($opt_V $opt_h $verbose $opt_w $opt_c);

$PROGNAME = "check_mem";
$PROGVER  = "1.8";

# add a directive to exclude buffers:
my $DONT_INCLUDE_BUFFERS = 0;

sub print_help ();
sub print_usage ();

Getopt::Long::Configure('bundling');
GetOptions(
    "V"   => \$opt_V,   "version"    => \$opt_V,
    "h"   => \$opt_h,   "help"       => \$opt_h,
    "v"   => \$verbose, "verbose"    => \$verbose,
    "w=s" => \$opt_w,   "warning=s"  => \$opt_w,
    "c=s" => \$opt_c,   "critical=s" => \$opt_c,
);

if ($opt_V) {
    print_revision($PROGNAME, '$Revision: '.$PROGVER.' $');
    exit $ERRORS{'UNKNOWN'};
}

if ($opt_h) {
    print_help();
    exit $ERRORS{'UNKNOWN'};
}

# Both thresholds are mandatory; print_usage() exits with UNKNOWN here.
print_usage() unless (($opt_c) && ($opt_w));

# Each option is "<mem>[%][,<swap>[%]]".
my ($mem_critical, $swap_critical);
my ($mem_warning, $swap_warning);
($mem_critical, $swap_critical) = ($1,$2) if ($opt_c =~ /([0-9]+)[%]?(?:,([0-9]+)[%]?)?/);
($mem_warning, $swap_warning) = ($1,$2) if ($opt_w =~ /([0-9]+)[%]?(?:,([0-9]+)[%]?)?/);

# Check if swap params were supplied
$swap_critical ||= 100;
$swap_warning ||= 100;

# print threshold in output message
my $mem_threshold_output = " (";
my $swap_threshold_output = " (";

if ( $mem_warning > 0 && $mem_critical > 0) {
    $mem_threshold_output .= "W> $mem_warning, C> $mem_critical";
}
elsif ( $mem_warning > 0 ) {
    $mem_threshold_output .= "W> $mem_warning";
}
elsif ( $mem_critical > 0 ) {
    $mem_threshold_output .= "C> $mem_critical";
}

if ( $swap_warning > 0 && $swap_critical > 0) {
    $swap_threshold_output .= "W> $swap_warning, C> $swap_critical";
}
elsif ( $swap_warning > 0 ) {
    $swap_threshold_output .= "W> $swap_warning";
}
elsif ( $swap_critical > 0 ) {
    $swap_threshold_output .= "C> $swap_critical";
}

$mem_threshold_output .= ")";
$swap_threshold_output .= ")";

# The package variable $verbose (set by GetOptions) is used directly below;
# the former "my $verbose = $verbose;" copy was redundant and has been removed.
my ($mem_percent, $mem_total, $mem_used, $swap_percent, $swap_total, $swap_used) = &sys_stats();
my $free_mem = $mem_total - $mem_used;
my $free_swap = $swap_total - $swap_used;

# set output message
my $output = "Memory Usage".$mem_threshold_output.": ". $mem_percent.'% <br>';
$output .= "Swap Usage".$swap_threshold_output.": ". $swap_percent.'%';

# set verbose output message
my $verbose_output = "Memory Usage:".$mem_threshold_output.": ". $mem_percent.'% '."- Total: $mem_total MB, used: $mem_used MB, free: $free_mem MB<br>";
$verbose_output .= "Swap Usage:".$swap_threshold_output.": ". $swap_percent.'% '."- Total: $swap_total MB, used: $swap_used MB, free: $free_swap MB<br>";

# set perfdata message
my $perfdata_output = "MemUsed=$mem_percent\%;$mem_warning;$mem_critical";
$perfdata_output .= " SwapUsed=$swap_percent\%;$swap_warning;$swap_critical";

# if threshold are 0, exit with OK
# (101 can never be exceeded by a percentage, so the comparison is disabled)
if ( $mem_warning == 0 ) { $mem_warning = 101 };
if ( $swap_warning == 0 ) { $swap_warning = 101 };
if ( $mem_critical == 0 ) { $mem_critical = 101 };
if ( $swap_critical == 0 ) { $swap_critical = 101 };

if ($mem_percent>$mem_critical || $swap_percent>$swap_critical) {
    if ($verbose) { print "<b>CRITICAL: ".$verbose_output."</b>|".$perfdata_output."\n";}
    else { print "<b>CRITICAL: ".$output."</b>|".$perfdata_output."\n";}
    exit $ERRORS{'CRITICAL'};
} elsif ($mem_percent>$mem_warning || $swap_percent>$swap_warning) {
    if ($verbose) { print "<b>WARNING: ".$verbose_output."</b>|".$perfdata_output."\n";}
    else { print "<b>WARNING: ".$output."</b>|".$perfdata_output."\n";}
    exit $ERRORS{'WARNING'};
} else {
    if ($verbose) { print "OK: ".$verbose_output."|".$perfdata_output."\n";}
    else { print "OK: ".$output."|".$perfdata_output."\n";}
    exit $ERRORS{'OK'};
}

# Collect memory and swap figures (in MB) from `free -mt`.
#
# Returns: (mem_percent, mem_total, mem_used, swap_percent, swap_total, swap_used)
# with both percentages rounded to whole numbers.
#
# Rows are located by their label instead of by fixed token position. The
# fixed positions only matched the older `free` layout that has a
# "-/+ buffers/cache:" row; without that row the former swap positions
# pointed at the "Total:" row instead.
sub sys_stats {
    # Force untranslated row labels ("Mem:", "Swap:") regardless of system locale.
    local $ENV{LC_ALL} = 'C';

    my ($mem_total, $mem_used, $mem_used_nobuf, $swap_total, $swap_used);
    foreach my $line (`free -mt`) {
        my @fields = split(" ", $line);
        next unless @fields;
        if ($fields[0] eq 'Mem:') {
            ($mem_total, $mem_used) = @fields[1, 2];
        }
        elsif ($fields[0] eq '-/+') {
            # Older layout only: "-/+ buffers/cache: <used> <free>"
            $mem_used_nobuf = $fields[2];
        }
        elsif ($fields[0] eq 'Swap:') {
            ($swap_total, $swap_used) = @fields[1, 2];
        }
    }

    # Without a usable memory total no percentage can be computed.
    unless (defined $mem_total && defined $mem_used && $mem_total > 0) {
        print "UNKNOWN: could not parse memory totals from `free -mt` output\n";
        exit $ERRORS{'UNKNOWN'};
    }

    # When buffers are to be excluded, prefer the "-/+ buffers/cache" figure
    # if this version of free prints it.
    if ($DONT_INCLUDE_BUFFERS && defined $mem_used_nobuf) {
        $mem_used = $mem_used_nobuf;
    }

    $swap_total = 0 unless defined $swap_total;
    $swap_used  = 0 unless defined $swap_used;

    my $mem_percent = ($mem_used / $mem_total) * 100;
    my $swap_percent;
    if ($swap_total == 0) {
        # No swap configured: report 0% rather than dividing by zero.
        $swap_percent = 0;
    } else {
        $swap_percent = ($swap_used / $swap_total) * 100;
    }
    return (sprintf("%.0f",$mem_percent),$mem_total,$mem_used, sprintf("%.0f",$swap_percent),$swap_total,$swap_used);
}

# Print the one-line usage; exits with UNKNOWN unless called on behalf of -h.
sub print_usage () {
    print "Usage: $PROGNAME -w <warn> -c <crit> [-v] [-h]\n";
    exit $ERRORS{'UNKNOWN'} unless ($opt_h);
}

# Print revision, copyright, usage and the option descriptions.
sub print_help () {
    print_revision($PROGNAME,'$Revision: '.$PROGVER.' $');
    print "Copyright (c) 2005 Garrett Honeycutt/Rouven Homann/Cedric Temple\n";
    print "\n";
    print_usage();
    print "\n";
    print "-w <MemoryWarn>,<SwapWarn> = Memory and Swap usage to activate a warning message (eg: -w 90,25 ) .\n";
    print "-c <MemoryCrit>,<SwapCrit> = Memory and Swap usage to activate a critical message (eg: -c 95,50 ).\n";
    print "-v = Verbose Output.\n";
    print "-h = This screen.\n\n";
    support();
}
给脚本增加执行权限
[root@client1 etc]# chmod 755 /usr/local/nagios/libexec/check_mem.pl
重启nrpe服务
# 方法一
[root@client1 etc]# killall nrpe
[root@client1 etc]# /usr/local/nagios/bin/nrpe -d -c /usr/local/nagios/etc/nrpe.cfg
# 方法二
[root@client1 etc]# kill -HUP `ps -ef|grep nrpe|awk 'NR==1{print $2}'`
在本机执行两个命令看下效果
[root@client1 etc]# /usr/local/nagios/libexec/check_nrpe -H localhost -c check_mem
<b>CRITICAL: Memory Usage (W> 10, C> 3): 29% <br>Swap Usage (W> 100, C> 100): 12%</b>|MemUsed=29%;10;3 SwapUsed=12%;100;100
[root@client1 etc]# /usr/local/nagios/libexec/check_nrpe -H localhost -c check_disk
DISK OK - free space: / 4201 MB (24.15% inode=97%);| /=13192MB;13915;15654;0;17394
服务端配置
nrpe连接客户端机器测试是否可以连通,然后执行一个监控命令(如果不通的话可能是因为selinux和防火墙没关闭而造成的)
因为nagios默认把全部的权限给nagiosadmin,所以可以通过修改cgi.cfg文件赋予nagios权限,切换到/usr/local/nagios/etc目录下
[root@nagios nrpe-3.1.0]# cd /usr/local/nagios/etc
[root@nagios etc]# ll
总用量 144
-rw-rw-r-- 1 nagios nagios 12999 4月 21 17:00 cgi.cfg
-rw-r--r-- 1 root root 45 4月 24 09:28 htpasswd.users
-rw-rw-r-- 1 nagios nagios 44831 4月 21 17:00 nagios.cfg
-rw-r--r-- 1 root root 10765 4月 24 12:32 nrpe.cfg
drwxrwxr-x 2 nagios nagios 336 4月 21 17:00 objects
-rw-rw---- 1 nagios nagios 1312 4月 21 17:00 resource.cfg
[root@nagios etc]# grep nagiosadmin cgi.cfg
authorized_for_system_information=nagiosadmin
authorized_for_configuration_information=nagiosadmin
authorized_for_system_commands=nagiosadmin
authorized_for_all_services=nagiosadmin
authorized_for_all_hosts=nagiosadmin
authorized_for_all_service_commands=nagiosadmin
authorized_for_all_host_commands=nagiosadmin
[root@nagios etc]# sed -i 's/nagiosadmin/nagiosadmin,nagios/g' cgi.cfg
[root@nagios etc]# grep nagiosadmin cgi.cfg
authorized_for_system_information=nagiosadmin,nagios
authorized_for_configuration_information=nagiosadmin,nagios
authorized_for_system_commands=nagiosadmin,nagios
authorized_for_all_services=nagiosadmin,nagios
authorized_for_all_hosts=nagiosadmin,nagios
authorized_for_all_service_commands=nagiosadmin,nagios
authorized_for_all_host_commands=nagiosadmin,nagios
修改nagios.cfg(自定义一些配置)
[root@nagios etc]# vi nagios.cfg +34
#注释掉下面这行
#cfg_file=/usr/local/nagios/etc/objects/localhost.cfg
#添加下面两行内容
cfg_file=/usr/local/nagios/etc/objects/services.cfg
cfg_file=/usr/local/nagios/etc/objects/hosts.cfg
创建hosts.cfg和services.cfg这两个文件
[root@nagios etc]# cd objects/
[root@nagios objects]# pwd
/usr/local/nagios/etc/objects
[root@nagios objects]# touch services.cfg
[root@nagios objects]# head -51 localhost.cfg > hosts.cfg
[root@nagios objects]# chown -R nagios.nagios *
修改nagios检查语法脚本
[root@nagios objects]# vim /etc/init.d/nagios +181
#check_config
$NagiosBin -v $NagiosCfgFile;
vi commands.cfg 进入后按shift+g切到结尾加入下面内容。
# 'check_nrpe' command definition
define command{
        command_name    check_nrpe
        command_line    $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
        }
# 'check_ping' command definition
define command{
        command_name    check-ping
        command_line    $USER1$/check_ping -H $HOSTADDRESS$ -w 100.0,20% -c 200.0,50% -p 3 -t 2
        }
# 'check_http' command definition
define command{
        command_name    check-weburl
        command_line    $USER1$/check_http -H $HOSTADDRESS$ $ARG1$ -w 5 -c 10
        }
# 'check_tcp' command definition
define command{
        command_name    check-tcp
        command_line    $USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$ -w 0.02 -c 0.1
        }
查看有哪些cfg文件
[root@nagios objects]# pwd
/usr/local/nagios/etc/objects
[root@nagios objects]# ll
总用量 100
-rw-rw-r-- 1 nagios nagios 7860 4月 24 16:53 commands.cfg
-rw-rw-r-- 1 nagios nagios 2138 4月 21 17:00 contacts.cfg
-rw-r--r-- 1 nagios nagios 1843 4月 24 16:46 hosts.cfg
-rw-rw-r-- 1 nagios nagios 5379 4月 21 17:00 localhost.cfg
-rw-rw-r-- 1 nagios nagios 3070 4月 21 17:00 printer.cfg
-rw-r--r-- 1 nagios nagios 0 4月 24 16:46 services.cfg
-rw-rw-r-- 1 nagios nagios 3252 4月 21 17:00 switch.cfg
-rw-rw-r-- 1 nagios nagios 10595 4月 21 17:00 templates.cfg
-rw-rw-r-- 1 nagios nagios 3180 4月 21 17:00 timeperiods.cfg
-rw-rw-r-- 1 nagios nagios 3991 4月 21 17:00 windows.cfg
常用对象介绍
联系人
contact
出了问题向谁报告?一般当然是系统管理员了
监控时间段
timeperiod
7X24小时不间断还是周一至周五,或是自定义的其他时间段
被监控主机
host
所需要监控的服务器,当然可以是监控机自己
监控命令
command
nagios发出的哪个指令来执行某个监控,这也是自己定义的
被监控的服务
service
例如主机是否存活,80端口是否开,磁盘使用情况或者自定义的服务等
contacts.cfg文件介绍
contactgroups.cfg文件介绍
# contactgroup
define contactgroup{
        contactgroup_name       组名       //联系人组的名称
        alias                   别名       //别名
        members                 用户名     //组的成员,来自于上面定义的contacts.cfg,如果有多个联系人则以逗号相隔
        }
主机模板介绍(hosts.cfg)
# host
define host{
        host_name               主机名              //被监控主机的名称,最好别带空格,例如nagios-server
        alias                   别名
        address                 IP                  //被监控主机的IP地址
        check_command           check-host-alive    //监控的命令check-host-alive,这个命令来自commands.cfg,用来监控主机是否存活
        max_check_attempts      5                   //检查失败后重试的次数
        check_period            24x7                //检查的时间段24x7,同样来自于我们之前在 timeperiods.cfg中定义的
        contact_groups          组名                //联系人组,上面在contactgroups.cfg中定义的组名
        notification_interval   10                  //提醒的间隔,每隔10个时间单位提醒一次(单位由nagios.cfg中的interval_length决定,默认60秒,即每隔10分钟)
        notification_period     24x7                //提醒的周期, 24x7,同样来自于我们之前在timeperiods.cfg中定义的
        notification_options    d,u,r               //指定什么情况下提醒,具体含义见之前contacts.cfg部分的介绍
        }
主机组模板介绍(hosts.cfg)
# hostgroup
define hostgroup{
        hostgroup_name          主机组名
        alias                   别名
        members                 主机名     //组的成员主机,多个主机以逗号相隔,必须是上面hosts.cfg中定义的
        }
服务模板介绍(services.cfg)
# service definition
define service{
        host_name               主机名              //被监控的主机,hosts.cfg中定义的
        service_description     check-host-alive    //这个监控项目的描述(也可以说是这个项目的名称),可以空格,我们这里定义的是监控这个主机是不是存活
        check_command           check-host-alive    //所用的命令,是commands.cfg中定义的
        max_check_attempts      5
        normal_check_interval   3
        retry_check_interval    2
        check_period            24x7                //监控的时间段,是timeperiods.cfg中定义的
        notification_interval   10
        notification_period     24x7                //通知的时间段,是timeperiods.cfg中定义的
        notification_options    w,u,c,r             //在监控的结果是wucr时通知联系人,具体含义看前文.
        contact_groups          组名                //联系人组,是contactgroups.cfg中定义的
        }
主机模板配置
[root@nagios objects]# vi hosts.cfg +21
删除下面11行内容
添加下面内容
# Define some hosts
###########172.16.0.18##################
define host {
        use                     linux-server
        host_name               nagios
        alias                   nagios
        address                 172.16.0.18
        check_command           check-host-alive
        max_check_attempts      3
        normal_check_interval   2
        retry_check_interval    2
        check_period            24x7
        notification_interval   300
        notification_period     24x7
        notification_options    d,u,r
        contact_groups          admins
        process_perf_data       1
}
###########172.16.0.20##################
define host {
        use                     linux-server
        host_name               client1
        alias                   client1
        address                 172.16.0.20
        check_command           check-host-alive
        max_check_attempts      3
        normal_check_interval   2
        retry_check_interval    2
        check_period            24x7
        notification_interval   300
        notification_period     24x7
        notification_options    d,u,r
        contact_groups          admins
        process_perf_data       1
}
把监控的主机添加到主机组里面
[root@nagios objects]# vi hosts.cfg +76
define hostgroup{
        hostgroup_name  linux-servers   ; The name of the hostgroup
        alias           Linux Servers   ; Long name of the group
        members         nagios,client1  ; Comma separated list of hosts that belong to this group
        }