Skip to content

Commit a21fedc

Browse files
committed
Add a file lock around creation of a snapshot (#20)
* Add a file lock around creation of a snapshot

Because we set the "binlog isn't stale" flag at the end of a backup, a newly started cluster will spawn many `create_snapshot` processes even though we're locking the BACKUP_TTL in Consul. This adds a local file lock that prevents the node from running more than one of them at a time. It also removes an unnecessary reliance on the repl user having access to the end-user's data DB just to run health checks.

* Make sure we create the backup lock file in create_snapshot

If we don't create the file here, we'll throw an IOError and catch it in the except block. This is OK in `is_backup_running`, because if the file doesn't even exist we're obviously not running the backup either. But we need to make sure we create the file in `create_snapshot`, or we end up bailing out too early.
1 parent 6ee9dc9 commit a21fedc

File tree

1 file changed

+72
-49
lines changed

1 file changed

+72
-49
lines changed

bin/triton-mysql.py

Lines changed: 72 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,6 @@ def health():
273273
# connection until this point
274274
ctx = dict(user=config.repl_user,
275275
password=config.repl_password,
276-
database=config.mysql_db,
277276
timeout=cp.config['services'][0]['ttl'])
278277
node.conn = wait_for_connection(**ctx)
279278

@@ -290,8 +289,9 @@ def health():
290289
if node.is_primary() or node.is_standby():
291290
update_session_ttl()
292291

293-
if (node.is_snapshot_node() and
294-
(is_binlog_stale(node.conn) or is_time_for_snapshot())):
292+
if all(node.is_snapshot_node(),
293+
(not is_backup_running()),
294+
(is_binlog_stale(node.conn) or is_time_for_snapshot())):
295295
try:
296296
write_snapshot(node.conn)
297297
except Exception as ex:
@@ -319,7 +319,6 @@ def on_change():
319319

320320
ctx = dict(user=config.repl_user,
321321
password=config.repl_password,
322-
database=config.mysql_db,
323322
timeout=cp.config['services'][0]['ttl'])
324323
node.conn = wait_for_connection(**ctx)
325324

@@ -372,34 +371,48 @@ def on_change():
372371

373372
def create_snapshot():
    """
    Take a backup with innobackupex, upload it, and record it in Consul.

    A non-blocking exclusive flock on /tmp/<BACKUP_TTL_KEY> guards the whole
    operation so that only one snapshot runs on this node at a time. The
    lock file is created if it cannot be opened for read/write.

    Returns False if an IOError is raised anywhere inside the locked
    section (including failure to acquire the lock); otherwise falls off
    the end and returns None. Any other exception propagates to the caller.
    The lock is released and the lock file closed in every case.
    """
    log.debug('create_snapshot')

    lock_path = '/tmp/{}'.format(BACKUP_TTL_KEY)
    try:
        backup_lock = open(lock_path, 'r+')
    except IOError:
        # couldn't open an existing lock file, so create one
        backup_lock = open(lock_path, 'w')

    try:
        fcntl.flock(backup_lock, fcntl.LOCK_EX | fcntl.LOCK_NB)

        # we don't want .isoformat() here because of URL encoding
        timestamp = datetime.utcnow().strftime('%Y-%m-%dT%H-%M-%SZ')
        backup_id = '{}-{}'.format(BACKUP_NAME, timestamp)

        # stream the backup as a tar archive into a local file
        with open('/tmp/backup.tar', 'w') as tarball:
            backup_cmd = [
                '/usr/bin/innobackupex',
                '--user={}'.format(config.repl_user),
                '--password={}'.format(config.repl_password),
                '--no-timestamp',
                #'--compress',
                '--stream=tar',
                '/tmp/backup',
            ]
            subprocess.check_call(backup_cmd, stdout=tarball)

        manta_config.put_backup(backup_id, '/tmp/backup.tar')
        consul.kv.put(LAST_BACKUP_KEY, backup_id)

        conn = wait_for_connection(user=config.repl_user,
                                   password=config.repl_password)

        # write the filename of the binlog to Consul so that we know if
        # we've rotated since the last backup.
        # an empty result lets IndexError bubble up -- something's broken
        rows = mysql_query(conn, 'SHOW MASTER STATUS', ())
        consul.kv.put(LAST_BINLOG_KEY, rows[0][0])

    except IOError:
        return False
    finally:
        fcntl.flock(backup_lock, fcntl.LOCK_UN)
        backup_lock.close()
375415

376-
# we don't want .isoformat() here because of URL encoding
377-
now = datetime.utcnow().strftime('%Y-%m-%dT%H-%M-%SZ')
378-
backup_id = '{}-{}'.format(BACKUP_NAME, now)
379-
380-
with open('/tmp/backup.tar', 'w') as f:
381-
subprocess.check_call(['/usr/bin/innobackupex',
382-
'--user={}'.format(config.repl_user),
383-
'--password={}'.format(config.repl_password),
384-
'--no-timestamp',
385-
#'--compress',
386-
'--stream=tar',
387-
'/tmp/backup'], stdout=f)
388-
389-
manta_config.put_backup(backup_id, '/tmp/backup.tar')
390-
consul.kv.put(LAST_BACKUP_KEY, backup_id)
391-
392-
ctx = dict(user=config.repl_user,
393-
password=config.repl_password,
394-
database=config.mysql_db)
395-
conn = wait_for_connection(**ctx)
396-
397-
# write the filename of the binlog to Consul so that we know if
398-
# we've rotated since the last backup.
399-
# query lets IndexError bubble up -- something's broken
400-
results = mysql_query(conn, 'SHOW MASTER STATUS', ())
401-
binlog_file = results[0][0]
402-
consul.kv.put(LAST_BINLOG_KEY, binlog_file)
403416

404417

405418
# ---------------------------------------------------------
@@ -641,7 +654,7 @@ def create_repl_user(conn):
641654
'GRANT SUPER, REPLICATION SLAVE, RELOAD, LOCK TABLES, REPLICATION CLIENT '
642655
'ON *.* TO `{0}`@`%%`; '
643656
'FLUSH PRIVILEGES;'.format(config.repl_user)),
644-
(config.repl_password))
657+
(config.repl_password,))
645658

646659

647660
def set_timezone_info():
@@ -759,6 +772,17 @@ def restore_from_snapshot(filename):
759772
'/tmp/backup'])
760773
take_ownership(config)
761774

775+
def is_backup_running():
    """
    Report whether a snapshot currently holds the local backup lock.

    Probes the lock file /tmp/<BACKUP_TTL_KEY> with a non-blocking
    exclusive flock:

    - if the lock file cannot be opened, no backup is considered to be
      running and False is returned (the file is not created here);
    - if the lock cannot be acquired, another process holds it, so a
      backup is running and True is returned;
    - if the lock is acquired, nobody else holds it; it is released
      immediately and False is returned.
    """
    try:
        backup_lock = open('/tmp/{}'.format(BACKUP_TTL_KEY), 'r+')
    except IOError:
        # Return before the try/finally below so we never try to close a
        # handle that was never opened.
        return False

    try:
        fcntl.flock(backup_lock, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        # LOCK_NB makes flock fail instead of waiting when the lock is held
        return True
    else:
        # we only wanted to probe the lock, so give it straight back
        fcntl.flock(backup_lock, fcntl.LOCK_UN)
        return False
    finally:
        backup_lock.close()
785+
762786
def is_binlog_stale(conn):
763787
results = mysql_query(conn, 'SHOW MASTER STATUS', ())
764788
try:
@@ -793,6 +817,10 @@ def write_snapshot(conn):
793817
# create_snapshot call and return. The snapshot process will be
794818
# re-parented to ContainerPilot
795819
set_backup_ttl()
820+
# TODO: we currently fork this off and return because otherwise
821+
# health checks will fail during backups. When periodic tasks
822+
# support lands in ContainerPilot we should move the snapshot
823+
# to a task and avoid this mess.
796824
subprocess.Popen(['python', '/usr/local/bin/triton-mysql.py', 'create_snapshot'])
797825

798826
def set_backup_ttl():
@@ -910,26 +938,21 @@ def get_from_consul(key):
910938
# ---------------------------------------------------------
911939
# utility functions
912940

941+
# all exceptions bubble up to the caller
913942
def mysql_exec(conn, sql, params):
    """
    Execute a statement on `conn` and commit it.

    The statement and its parameters are logged at debug level before
    execution. Nothing is returned, and no exception is caught here.
    """
    with conn.cursor() as cursor:
        for detail in (sql, params):
            log.debug(detail)
        cursor.execute(sql, params)
        conn.commit()
922948

949+
# all exceptions bubble up to the caller
923950
def mysql_query(conn, sql, params):
    """
    Execute a query on `conn` and return everything `fetchall()` yields.

    The query and its parameters are logged at debug level before
    execution. No exception is caught here.
    """
    with conn.cursor() as cursor:
        for detail in (sql, params):
            log.debug(detail)
        cursor.execute(sql, params)
        rows = cursor.fetchall()
        return rows
933956

934957
def get_ip(iface='eth0'):
935958
"""

0 commit comments

Comments
 (0)