mark: A photo of Mark kneeling on top of the Taal Volcano in the Philippines. It was a long hike. (Default)
Mark Smith ([staff profile] mark) wrote in [site community profile] changelog, 2009-04-29 09:21 am

[dw-ops] Add MySQL replication checking.

[commit: http://hg.dwscoalition.org/dw-ops/rev/b60429c76622]

Add MySQL replication checking.

Patch by [staff profile] mark.

Files modified:
  • nagios/conf.d/config/commands.cfg
  • nagios/conf.d/config/hostgroups.cfg
  • nagios/conf.d/hosts/dfw-db-a01-auto.cfg
  • nagios/conf.d/services/db.cfg
  • nagios/custom/check_mysql_replication.pl
--------------------------------------------------------------------------------
diff -r 5ba9e2aee2ac -r b60429c76622 nagios/conf.d/config/commands.cfg
--- a/nagios/conf.d/config/commands.cfg	Sun Apr 26 08:06:17 2009 +0000
+++ b/nagios/conf.d/config/commands.cfg	Wed Apr 29 09:21:03 2009 +0000
@@ -21,6 +21,13 @@ define command {
 }
 
 
+# checks that MySQL replication on a slave is running and not falling behind
+define command {
+    command_name  dw_check_mysql_repl
+    command_line  $USER1$/custom/check_mysql_replication.pl -H $HOSTADDRESS$ -u $USER3$ -p $USER4$
+}
+
+
 # checks that we are not falling behind on TheSchwartz jobs
 define command {
     command_name  dw_check_theschwartz_queue
diff -r 5ba9e2aee2ac -r b60429c76622 nagios/conf.d/config/hostgroups.cfg
--- a/nagios/conf.d/config/hostgroups.cfg	Sun Apr 26 08:06:17 2009 +0000
+++ b/nagios/conf.d/config/hostgroups.cfg	Wed Apr 29 09:21:03 2009 +0000
@@ -87,6 +87,14 @@ define hostgroup {
 
 # databases
 define hostgroup {
+    hostgroup_name  db_slave
+    alias           Database Servers - Slaves
+}
+
+
+
+# databases
+define hostgroup {
     hostgroup_name  db_schwartz
     alias           Database Servers - TheSchwartz
 }
diff -r 5ba9e2aee2ac -r b60429c76622 nagios/conf.d/hosts/dfw-db-a01-auto.cfg
--- a/nagios/conf.d/hosts/dfw-db-a01-auto.cfg	Sun Apr 26 08:06:17 2009 +0000
+++ b/nagios/conf.d/hosts/dfw-db-a01-auto.cfg	Wed Apr 29 09:21:03 2009 +0000
@@ -4,7 +4,7 @@ define host {
 define host {
     host_name  dfw-db-a01
     alias      dfw-db-a01
-    hostgroups db
+    hostgroups db,db_slave
     address    10.176.71.86
     use        generic-host
 }
diff -r 5ba9e2aee2ac -r b60429c76622 nagios/conf.d/services/db.cfg
--- a/nagios/conf.d/services/db.cfg	Sun Apr 26 08:06:17 2009 +0000
+++ b/nagios/conf.d/services/db.cfg	Wed Apr 29 09:21:03 2009 +0000
@@ -23,6 +23,15 @@ define service {
 }
 
 
+# verify that MySQL replication is running and not lagging
+define service {
+    hostgroup_name                  db_slave
+    service_description             MySQL Replication
+    check_command                   dw_check_mysql_repl
+    use                             generic-service
+}
+
+
 # in particular, some databases have theschwartz on them and should
 # be monitored for the queues
 define service {
diff -r 5ba9e2aee2ac -r b60429c76622 nagios/custom/check_mysql_replication.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nagios/custom/check_mysql_replication.pl	Wed Apr 29 09:21:03 2009 +0000
@@ -0,0 +1,104 @@
+#!/usr/bin/perl
+#
+# check_mysql_replication.pl -- Nagios check for MySQL slave replication.
+#
+# Connects to a MySQL server, runs SHOW SLAVE STATUS, prints a one-line
+# message and exits with a Nagios status code:
+#
+#   0 OK        both replication threads are running and lag is within limits
+#   1 WARNING   lag is above the warning threshold
+#   2 CRITICAL  cannot connect, server is not a slave, a replication
+#               thread is stopped, or lag is unknown / above critical
+#   3 UNKNOWN   missing or invalid command line options
+
+use strict;
+use warnings;
+use DBI;
+use Getopt::Long;
+
+# lag thresholds in seconds; very tight limits unless overridden
+my ( $warn_secs, $crit_secs ) = ( 15, 60 );
+
+# short aliases are spelled out so the -H/-u/-p invocation does not depend
+# on Getopt::Long's automatic abbreviation of the long names
+my ( $user, $pass, $host );
+GetOptions(
+        'host|H=s'     => \$host,
+        'user|u=s'     => \$user,
+        'password|p=s' => \$pass,
+        'warning|w=i'  => \$warn_secs,
+        'critical|c=i' => \$crit_secs,
+    ) or usage();
+usage()
+    if grep { !( defined($_) && length($_) ) } $user, $pass, $host;
+usage()
+    if $warn_secs < 0 || $crit_secs < $warn_secs;
+
+# connect to the database; PrintError is off so DBI does not write its own
+# message to STDERR, the failure is reported through crit() instead
+my $dbh = DBI->connect( "DBI:mysql:host=$host", $user, $pass,
+                        { PrintError => 0, RaiseError => 0 } )
+    or crit( "Unable to connect to database: $DBI::errstr" );
+
+my $rep = $dbh->selectrow_hashref( 'SHOW SLAVE STATUS' );
+$dbh->disconnect;
+
+# no row at all: the query failed or this server is not a slave.  the
+# Slave_IO_State text is deliberately not tested here, as it can be empty
+# while the I/O thread is stopped and that case is reported just below
+crit( 'Database error or database not a slave.' )
+    unless $rep;
+
+my ( $io_running, $sql_running ) =
+    map { defined $_ ? $_ : 'unknown' }
+    @{$rep}{ qw( Slave_IO_Running Slave_SQL_Running ) };
+crit( "Replication stopped: IO=$io_running, SQL=$sql_running." )
+    unless $io_running eq 'Yes' && $sql_running eq 'Yes';
+
+# Seconds_Behind_Master is NULL when MySQL cannot measure the lag; treat
+# that as a failure rather than letting undef compare as zero and pass
+my $lag = $rep->{Seconds_Behind_Master};
+crit( 'Replication lag unknown: Seconds_Behind_Master is NULL.' )
+    unless defined $lag;
+
+crit( "Replication very far behind: $lag." )
+    if $lag > $crit_secs;
+_warn( "Replication behind: $lag." )
+    if $lag > $warn_secs;
+
+ok( 'Replication okay.' );
+
+# print usage information and exit 3 (UNKNOWN)
+sub usage {
+    print <<'EOF';
+check_mysql_replication.pl -- easy way to monitor replication
+
+usage: check_mysql_replication.pl -H host -u user -p password [-w secs] [-c secs]
+
+    -H, --host      MySQL server to check (required)
+    -u, --user      MySQL user name (required)
+    -p, --password  MySQL password (required)
+    -w, --warning   warn when more than this many seconds behind (default 15)
+    -c, --critical  go critical when more than this many seconds behind (default 60)
+EOF
+    exit 3;
+}
+
+# nagios return codes: each helper prints its message and exits
+sub crit {
+    my ( $message ) = @_;
+    print "$message\n";
+    exit 2;
+}
+
+sub _warn {
+    my ( $message ) = @_;
+    print "$message\n";
+    exit 1;
+}
+
+sub ok {
+    my ( $message ) = @_;
+    print "$message\n";
+    exit 0;
+}
--------------------------------------------------------------------------------