#!/usr/bin/python

# MySQL主从集中监控脚本

# 请在MySQL上执行如下语句,将跑脚本的机器授权一个db_monitor账号。

# grant SHOW VIEW,REPLICATION CLIENT,SHOW DATABASES,PROCESS on *.* to db_monitor@'192.168.1.10';

import MySQLdb

import os

import datetime

# 发送一个字符串告警,公司特有告警系统,可以改成其他监控方式,比如短信或邮件

def report_alarm(attr, err_msg):

os.system('/usr/local/agenttools/agent/agentRepStr %s "%s" >/dev/null' % (attr, err_msg))

# 上报一个数值型告警,也是特有告警系统

def send_repnum(attr, num):

os.system('/usr/local/agenttools/agent/agentRepNum %s "%s" >/dev/null' % (attr ,num) )

def monitor(db_host):

try:

cxn=MySQLdb.connect(db_host, 'db_monitor', '', db='mysql')

except MySQLdb.Error, e:

try:

sqlError ="Error %d:%s" % (e.args[0], e.args[1])

except IndexError:

sqlError = "MySQL Error:%s" % str(e)

report_alarm(36320, sqlError)

#get mysql version

cur=cxn.cursor()

cur.execute('select version()')

for item in cur.fetchall():

if '5.0.' in item[0]:

mysql_version = 'low'

else:

mysql_version = 'high'

#show slave status

cur=cxn.cursor()

cur.execute('show slave status')

for item in cur.fetchall():

replicate_error = item[19]

if mysql_version == 'low':

seconds_behind_master = item[-1]

else:

seconds_behind_master = item[32]

#show processlist

cur=cxn.cursor()

cur.execute('show processlist')

connect_num = len(cur.fetchall())

meta_lock_num = cur.fetchall().count("meta lock")

cxn.close()

return replicate_error,seconds_behind_master,connect_num,meta_lock_num

print datetime.datetime.now()

for host in open('%s/hosts.list'% os.path.dirname(os.path.realpath(__file__))):

host = host.strip('\n')

replicate_error = ''

seconds_behind_master = 0

connect_num = 0

meta_lock_num = 0

replicate_error,seconds_behind_master,connect_num,meta_lock_num = monitor(host)

try:

Seconds_behind_master = int(seconds_behind_master)

except:

Seconds_behind_master = seconds_behind_master

print '==============================='

print 'Infomation of %s' % host

print "The Num of metalock : %s" % meta_lock_num

print "The Num of connection : %s" % connect_num

# 上报当前连接数

send = send_repnum(36713, connect_num)

# 上报字典锁数量

send = send_repnum(36715, meta_lock_num)

if len(replicate_error) > 0:

print replicate_error

# 如果主从同步出现问题,就发送告警

report_alarm(36320, '%s: Replicate error' % host)

elif Seconds_behind_master > 600:

print "Seconds behid master: %s" % Seconds_behind_master

# 如果主从延迟超过600s,就发送告警

report_alarm(36320, '%s: Seconds behind master ge 600' % host)

#else:

# print host + " is OK!"

print '==============================='