Nagios是一款企业级开源软件,专注于监控服务器上服务是否正常,不生成图形,提供报警机制,邮件或者短信发送监控状态,它通过各种插件实现不同的功能。

Nagios监控平台主程序

Nagios-plugins 必选插件

NRPE 监控远程服务器的主机资源

NSClient++ 用于监控Windows主机

NDOUtils 将数据写入数据库


实例应用:

1 监控快速部署

监控需要安装http php nagios nagios-plugins NRPE软件包

yum install -y gd gd-devel openssl openssl-devel httpd php gcc glibc glib-common make wget net-snmp

setenforce 0

iptables -F

安装nagios 源码包下载安装

wget -O nagios-3.5.0.tar.gz http://sourceforge.net/projects/nagios/files/nagios-3.x/nagios-3.5.0/nagios-3.5.0.tar.gz/download

groupadd nagios

useradd -g nagios nagios

tar -zxf nagios-3.5.0.tar.gz -C /usr/src/

cd /usr/src/nagios

./configure --with-nagios-user=nagios --with-nagios-group=nagios

make all

make install

make install-init #安装启动脚本

make install-commandmode #安装与配置目录权限

make install-config #安装配置文件模板

make install-webconf #web监控界面配置

安装nagios-plugins和nrpe

wget http://nchc.dl.sourceforge.net/project/nagiosplug/nagiosplug/1.4.16/nagios-plugins-1.4.16.tar.gz

tar -zxf nagios-plugins-1.4.16.tar.gz -C /usr/src/

cd /usr/src/nagios-plugins-1.4.16

./configure --prefix=/usr/local/nagios/

make && make install

wget http://nchc.dl.sourceforge.net/project/nagios/nrpe-2.x/nrpe-2.14/nrpe-2.14.tar.gz

tar -zxf nrpe-2.14.tar.gz -C /usr/src/

cd /usr/src/nrpe-2.14

./configure

make all

make install-plugin

make install-daemon

make install-daemon-config

chown -R nagios:nagios /usr/local/nagios

创建账户信息

htpasswd -c /usr/local/nagios/etc/htpasswd.users tomcat

iptables -I INPUT -p tcp --dport 80 -j ACCEPT

service iptables save

启动服务

service httpd start

/etc/init.d/nagios start

chkconfig httpd on

chkconfig --add nagios

chkconfig nagios on


2 修改配置文件

nagios的配置文件较多,主要位于/usr/local/nagios/etc 下

nagios.cfg 主配置文件

nrpe.cfg远程监控配置文件

cgi.cfg CGI配置文件

commands.cfg命令定义文件

contacts.cfg定义联系人文件

timeperiods.cfg 时间周期定义文件

templates.cfg 对象定义参考模板

localhost.cfg监控本机配置模板

printer.cfg监控打印机模板

switch.cfg 监控交换模板

windows.cfg 监控Windows配置模板


很多配置文件无需修改可以直接使用

修改主配置文件nagios.cfg,主要是用cfg_file配置加载其他配置文件。

vim /usr/local/nagios/etc/nagios.cfg

cfg_file=/usr/local/nagios/etc/objects/commands.cfg

cfg_file=/usr/local/nagios/etc/objects/contacts.cfg

cfg_file=/usr/local/nagios/etc/objects/templates.cfg

cfg_file=/usr/local/nagios/etc/objects/timeperiods.cfg

cfg_file=/usr/local/nagios/etc/objects/localhost.cfg

cfg_file=/usr/local/nagios/etc/web1.cfg

cfg_file=/usr/local/nagios/etc/web2.cfg


修改CGI配置文件cgi.cfg,添加tomcat账户进来

vim /usr/local/nagios/etc/cgi.cfg

default_user_name=tomcat

authorized_for_system_information=nagiosadmin,tomcat

authorized_for_configuration_information=nagiosadmin,tomcat

authorized_for_system_commands=nagiosadmin,tomcat

authorized_for_all_services=nagiosadmin,tomcat

authorized_for_all_hosts=nagiosadmin,tomcat

authorized_for_all_service_commands=nagiosadmin,tomcat

authorized_for_all_host_commands=nagiosadmin,tomcat


修改命令配置文件commands.cfg,定义命令实现的方式,如邮件报警,使用工具,内容格式等。

vim /usr/local/nagios/etc/objects/commands.cfg

define command{

command_name check_nrpe

command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -t 30 -c $ARG1$

}

define command{

command_name check_nrpe_args

command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -t 30 -c $ARG1$ -a $ARG2$

}


修改联系人配置文件contacts.cfg 报警的联系人及联系方式

define contact{

contact_name nagiosadmin

use generic-contact

alias Nagios Admin

email yourname@domain.com

}


修改报警时间周期timeperiods.cfg

vim /usr/local/nagios/etc/objects/timeperiods.cfg

define timeperiod{

timeperiod_name 24x7 ; 监控所有时间段(7*24小时)

alias 24 Hours A Day, 7 Days A Week
sunday 00:00-24:00
monday 00:00-24:00
tuesday 00:00-24:00
wednesday 00:00-24:00
thursday 00:00-24:00
friday 00:00-24:00
saturday 00:00-24:00
}


修改本机的配置localhost.cfg

define host{

use linux-server

host_name duangr-1

alias duangr-1

address 192.168.56.10

}


define service{

use local-service

host_name duangr-1

service_description Host Alive

check_command check-host-alive

}

define service{

use local-service

host_name duangr-1

service_description Users

check_command check_local_users!20!50

}

define service{

use local-service

host_name duangr-1

service_description CPU

check_command check_local_load!5.0,4.0,3.0!10.0,6.0,4.0

}

define service{

use local-service

host_name duangr-1

service_description Disk Root

check_command check_local_disk!20%!10%!/

}

define service{

use local-service

host_name duangr-1

service_description Disk Home

check_command check_local_disk!20%!10%!/export/home

}

define service{

use local-service

host_name duangr-1

service_description Zombie Procs

check_command check_local_procs!5!10!Z

}

define service{

use local-service

host_name duangr-1

service_description Total Procs

check_command check_local_procs!250!400!RSZDT

}

define service{

use local-service

host_name duangr-1

service_description Swap Usage

check_command check_local_swap!20!10

}


修改模板文件templates.cfg

vi /usr/local/nagios/etc/objects/templates.cfg

#联系人模板generic-contact

define contact{

name generic-contact

service_notification_period 24x7

host_notification_period 24x7

service_notification_options w,u,c,r,f,s

host_notification_options d,u,r,f,s

service_notification_commands notify-service-by-email

host_notification_commands notify-host-by-email

register 0

}


#定义generic-host主机模板

define host{

name generic-host

notifications_enabled 1

event_handler_enabled 1

flap_detection_enabled 1

failure_prediction_enabled 1

process_perf_data 1

retain_status_information 1

retain_nonstatus_information 1

notification_period 24x7

register 0

}


#定义Linux主机模板

define host{

name linux-server

use generic-host

check_period 24x7

check_interval 5

retry_interval 1

max_check_attempts 10

check_command check-host-alive

notification_period workhours

notification_interval 120

notification_options d,u,r

contact_groups admins

register 0

}


创建远程监控web1.cfg

vim /usr/local/nagios/etc/web1.cfg

define host{
    use        linux-server
    host_name  duangr-2
    alias      duangr-2
    address    192.168.56.11
}

define service{
    use                  local-service
    host_name            duangr-2
    service_description  Host Alive
    check_command        check-host-alive
}
define service{
    use                  local-service
    host_name            duangr-2
    service_description  Users
    check_command        check_nrpe_args!check_users!5 10
}
define service{
    use                  local-service
    host_name            duangr-2
    service_description  CPU
    check_command        check_nrpe_args!check_load!15,10,5 30,25,20
}
define service{
    use                  local-service
    host_name            duangr-2
    service_description  Disk Root
    check_command        check_nrpe_args!check_disk!20% 10% /
}
define service{
    use                  local-service
    host_name            duangr-2
    service_description  Disk /export/home
    check_command        check_nrpe_args!check_disk!20% 10% /export/home
}
define service{
    use                  local-service
    host_name            duangr-2
    service_description  Procs Zombie
    check_command        check_nrpe_args!check_procs!5 10 Z
}
define service{
    use                  local-service
    host_name            duangr-2
    service_description  Procs Total
    check_command        check_nrpe_args!check_procs_args!"-w 400 -c 600"
}
define service{
    use                  local-service
    host_name            duangr-2
    service_description  Swap Usage
    check_command        check_nrpe_args!check_swap!20% 10%
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;下面是一些常用进程的监控,主要是云平台相关进程
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;监控crond进程
define service{
    use                  local-service
    host_name            duangr-2
    service_description  PS:crond
    check_command        check_nrpe_args!check_procs_args!"-c 1:1 -C crond"
}
;;监控zookeeper进程
define service{
    use                  local-service
    host_name            duangr-2
    service_description  PS:QuorumPeerMain
    check_command        check_nrpe_args!check_procs_args!"-c 1:1 -C java -a server.quorum.QuorumPeerMain"
}
;;监控storm的从节点进程
define service{
    use                  local-service
    host_name            duangr-2
    service_description  PS:supervisor
    check_command        check_nrpe_args!check_procs_args!"-c 1:1 -C java -a daemon.supervisor"
}
;;监控storm的主节点进程
define service{
    use                  local-service
    host_name            duangr-2
    service_description  PS:nimbus
    check_command        check_nrpe_args!check_procs_args!"-c 1:1 -C java -a daemon.nimbus"
}
;;监控MetaQ进程
define service{
    use                  local-service
    host_name            duangr-2
    service_description  PS:MetaQ
    check_command        check_nrpe_args!check_procs_args!"-c 1:1 -C java -a metamorphosis-server-w"
}
;;监控Redis进程
define service{
    use                  local-service
    host_name            duangr-2
    service_description  PS:redis-server
    check_command        check_nrpe_args!check_procs_args!"-c 1:1 -C redis-server"
}
;;监控hadoop主节点NameNode进程
define service{
    use                  local-service
    host_name            duangr-2
    service_description  PS:NameNode
    check_command        check_nrpe_args!check_procs_args!"-c 1:1 -C java -a server.namenode.NameNode"
}
;;监控hadoop主节点SecondaryNameNode进程
define service{
    use                  local-service
    host_name            duangr-2
    service_description  PS:SecondaryNameNode
    check_command        check_nrpe_args!check_procs_args!"-c 1:1 -C java -a server.namenode.SecondaryNameNode"
}
;;监控hadoop主节点ResourceManager进程
define service{
    use                  local-service
    host_name            duangr-2
    service_description  PS:ResourceManager
    check_command        check_nrpe_args!check_procs_args!"-c 1:1 -C java -a server.resourcemanager.ResourceManager"
}
;;监控hadoop从节点DataNode进程
define service{
    use                  local-service
    host_name            duangr-2
    service_description  PS:DataNode
    check_command        check_nrpe_args!check_procs_args!"-c 1:1 -C java -a server.datanode.DataNode"
}
;;监控hadoop从节点NodeManager进程
define service{
    use                  local-service
    host_name            duangr-2
    service_description  PS:NodeManager
    check_command        check_nrpe_args!check_procs_args!"-c 1:1 -C java -a server.nodemanager.NodeManager"
}


由于duangr-2是远程主机,因此使用check_nrpe_args命令来监控.


/etc/init.d/nagios restart

快速定位配置文件问题所在命令

/usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg


3 被监控机安装软件 nagios-plugin nrpe

yum install -y openssl openssl-devel

groupadd nagios

useradd -g nagios -s /sbin/nologin nagios

tar -zxf nagios-plugins-2.1.6.tar.gz -C /usr/src/

cd /usr/src/nagios-plugins-2.1.6

./configure --prefix=/usr/local/nagios/ --with-nagios-user=nagios --with-nagios-group=nagios

make && make install

tar -zxf nrpe-2.14.tar.gz -C /usr/src/

cd /usr/src/nrpe-2.14

./configure

make all

make install-plugin

make install-daemon

make install-daemon-config


修改客户端的NRPE配置文件

command[check_users]=/usr/local/nagios/libexec/check_users -w 5 -c 10

command[check_load]=/usr/local/nagios/libexec/check_load -w 15,10,5 -c 30,25,20

command[check_sda2]=/usr/local/nagios/libexec/check_disk -w 20% -c 10% -p /dev/sda2

command[check_swap]=/usr/local/nagios/libexec/check_swap -w 20% -c 10%

command[check_home]=/usr/local/nagios/libexec/check_disk -w 20% -c 10% -p /dev/mapper/VolGroup00-LogVol00

command[check_zombie_procs]=/usr/local/nagios/libexec/check_procs -w 5 -c 10 -s Z

command[check_total_procs]=/usr/local/nagios/libexec/check_procs -w 200 -c 300

command[check_ping81]=/usr/local/nagios/libexec/check_ping -H 10.155.0.1 -w 100.0,20% -c 500.0,60%

command[check_hda1]=/usr/local/nagios/libexec/check_disk -w 20 -c 10 -p /dev/hda1


/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d

echo "/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d" >> /etc/rc.local

netstat -lnupt |grep 5666

iptables -I INPUT -p tcp --dport 5666 -j ACCEPT

service iptables save



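检查监控命令配置是否ok(注意:使用 -a 传递参数时,NRPE 编译时需加 --enable-command-args,nrpe.cfg 中需设置 dont_blame_nrpe=1,并且对应的命令定义要使用 $ARG1$ 等参数占位符)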

/usr/local/nagios/libexec/check_nrpe -H localhost -c check_users -a 5 10

/usr/local/nagios/libexec/check_nrpe -H localhost -c check_load -a 15,10,5 30,25,20

/usr/local/nagios/libexec/check_nrpe -H localhost -c check_disk -a 20% 10% /

/usr/local/nagios/libexec/check_nrpe -H localhost -c check_procs -a 200 400 RSZDT

/usr/local/nagios/libexec/check_nrpe -H localhost -c check_swap -a 20% 10%


没有问题就可以用浏览器访问nagios了