@@ -273,7 +273,6 @@ def health():
273273 # connection until this point
274274 ctx = dict (user = config .repl_user ,
275275 password = config .repl_password ,
276- database = config .mysql_db ,
277276 timeout = cp .config ['services' ][0 ]['ttl' ])
278277 node .conn = wait_for_connection (** ctx )
279278
@@ -290,8 +289,9 @@ def health():
290289 if node .is_primary () or node .is_standby ():
291290 update_session_ttl ()
292291
293- if (node .is_snapshot_node () and
294- (is_binlog_stale (node .conn ) or is_time_for_snapshot ())):
292+ if all (node .is_snapshot_node (),
293+ (not is_backup_running ()),
294+ (is_binlog_stale (node .conn ) or is_time_for_snapshot ())):
295295 try :
296296 write_snapshot (node .conn )
297297 except Exception as ex :
@@ -319,7 +319,6 @@ def on_change():
319319
320320 ctx = dict (user = config .repl_user ,
321321 password = config .repl_password ,
322- database = config .mysql_db ,
323322 timeout = cp .config ['services' ][0 ]['ttl' ])
324323 node .conn = wait_for_connection (** ctx )
325324
@@ -372,34 +371,48 @@ def on_change():
372371
def create_snapshot():
    """
    Take a backup with innobackupex, upload it, and record it in Consul.

    Only one snapshot may run at a time: an exclusive, non-blocking
    flock is taken on /tmp/<BACKUP_TTL_KEY> for the duration of the
    backup.

    Returns False if the lock is already held or if an IOError occurs
    while taking the backup; returns None on success. Any other
    exception (e.g. subprocess.CalledProcessError, IndexError from an
    empty SHOW MASTER STATUS result) bubbles up to the caller.
    """
    log.debug('create_snapshot')
    lockfile_name = '/tmp/{}'.format(BACKUP_TTL_KEY)

    # append mode creates the lock file if it's missing and never
    # truncates it, so we don't need an open/except/re-open dance
    backup_lock = open(lockfile_name, 'a')
    try:
        fcntl.flock(backup_lock, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        # LOCK_NB makes flock fail immediately when the lock is held
        # elsewhere; we never owned the lock so there's nothing to unlock
        backup_lock.close()
        log.debug('create_snapshot: backup lock is held, skipping')
        return False

    try:
        # we don't want .isoformat() here because of URL encoding
        now = datetime.utcnow().strftime('%Y-%m-%dT%H-%M-%SZ')
        backup_id = '{}-{}'.format(BACKUP_NAME, now)

        with open('/tmp/backup.tar', 'w') as f:
            subprocess.check_call(['/usr/bin/innobackupex',
                                   '--user={}'.format(config.repl_user),
                                   '--password={}'.format(config.repl_password),
                                   '--no-timestamp',
                                   #'--compress',
                                   '--stream=tar',
                                   '/tmp/backup'], stdout=f)

        manta_config.put_backup(backup_id, '/tmp/backup.tar')
        consul.kv.put(LAST_BACKUP_KEY, backup_id)

        ctx = dict(user=config.repl_user,
                   password=config.repl_password)
        conn = wait_for_connection(**ctx)

        # write the filename of the binlog to Consul so that we know if
        # we've rotated since the last backup.
        # query lets IndexError bubble up -- something's broken
        results = mysql_query(conn, 'SHOW MASTER STATUS', ())
        binlog_file = results[0][0]
        consul.kv.put(LAST_BINLOG_KEY, binlog_file)

    except IOError:
        # keep the existing "return False on IOError" contract but leave
        # a trace in the log instead of failing silently
        log.exception('create_snapshot: I/O error while taking backup')
        return False
    finally:
        fcntl.flock(backup_lock, fcntl.LOCK_UN)
        backup_lock.close()
403416
404417
405418# ---------------------------------------------------------
@@ -641,7 +654,7 @@ def create_repl_user(conn):
641654 'GRANT SUPER, REPLICATION SLAVE, RELOAD, LOCK TABLES, REPLICATION CLIENT '
642655 'ON *.* TO `{0}`@`%%`; '
643656 'FLUSH PRIVILEGES;' .format (config .repl_user )),
644- (config .repl_password ))
657+ (config .repl_password , ))
645658
646659
647660def set_timezone_info ():
@@ -759,6 +772,17 @@ def restore_from_snapshot(filename):
759772 '/tmp/backup' ])
760773 take_ownership (config )
761774
def is_backup_running():
    """
    Report whether a snapshot currently holds the backup lock.

    Probes the flock on /tmp/<BACKUP_TTL_KEY> with a non-blocking
    exclusive lock request. If the lock can be taken, nobody else holds
    it, so it is released again straight away and False is returned. If
    the request fails, another open file description holds the lock and
    True is returned.

    Returns False as well when the lock file cannot be opened (typically
    because it does not exist yet, in which case nothing can be holding
    a lock on it).
    """
    lockfile_name = '/tmp/{}'.format(BACKUP_TTL_KEY)
    try:
        backup_lock = open(lockfile_name, 'r+')
    except IOError:
        return False

    try:
        fcntl.flock(backup_lock, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        # LOCK_NB: the lock is held elsewhere, i.e. a backup is running
        return True
    else:
        # we only wanted to probe; give the lock back immediately
        fcntl.flock(backup_lock, fcntl.LOCK_UN)
        return False
    finally:
        backup_lock.close()
762786def is_binlog_stale (conn ):
763787 results = mysql_query (conn , 'SHOW MASTER STATUS' , ())
764788 try :
@@ -793,6 +817,10 @@ def write_snapshot(conn):
793817 # create_snapshot call and return. The snapshot process will be
794818 # re-parented to ContainerPilot
795819 set_backup_ttl ()
820+ # TODO: we currently fork this off and return because otherwise
821+ # health checks will fail during backups. When periodic tasks
822+ # support lands in ContainerPilot we should move the snapshot
823+ # to a task and avoid this mess.
796824 subprocess .Popen (['python' , '/usr/local/bin/triton-mysql.py' , 'create_snapshot' ])
797825
798826def set_backup_ttl ():
@@ -910,26 +938,21 @@ def get_from_consul(key):
910938# ---------------------------------------------------------
911939# utility functions
912940
941+ # all exceptions bubble up to the caller
def mysql_exec(conn, sql, params):
    """
    Execute a statement on `conn` and commit it.

    The statement and its parameters are written to the debug log
    before execution. Nothing is caught here: any exception raised by
    the cursor, the execute call, or the commit bubbles up to the caller.

    NOTE(review): `params` is logged verbatim -- confirm that no caller
    passes credentials through here when debug logging is enabled.
    """
    with conn.cursor() as cur:
        log.debug(sql)
        log.debug(params)
        cur.execute(sql, params)
        conn.commit()
922948
949+ # all exceptions bubble up to the caller
def mysql_query(conn, sql, params):
    """
    Run a query on `conn` and return everything the cursor fetched.

    The statement and its parameters are written to the debug log
    before execution. Nothing is caught here: any exception raised by
    the cursor or the execute/fetch calls bubbles up to the caller.
    """
    with conn.cursor() as cur:
        log.debug(sql)
        log.debug(params)
        cur.execute(sql, params)
        rows = cur.fetchall()
        return rows
933956
934957def get_ip (iface = 'eth0' ):
935958 """
0 commit comments