Commit 639fc6f

Bug fixes to the numpy reader (#646)
* Bug fixes to the numpy reader
* Added extra_cols based indicator handling; these indicators are used for rulelets only and are stored on disk only
1 parent d4a8d53 commit 639fc6f

2 files changed: 66 additions & 43 deletions

smdebug/_version.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "1.0.26"
+__version__ = "1.0.27"

smdebug/profiler/system_metrics_reader.py

Lines changed: 65 additions & 42 deletions
@@ -5,6 +5,7 @@
 import os
 import io
 import sys
+import math
 import pickle
 import pandas as pd
 import numpy as np
@@ -75,7 +76,7 @@ def _get_event_files_in_the_range(
         available timestamp
         """

-        if self._startAfter_prefix is not "":
+        if self._startAfter_prefix != "":
             if end_time_microseconds >= get_utctimestamp_us_since_epoch_from_system_profiler_file(
                 self._startAfter_prefix
             ):
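Note on the fix above: 'is not' compares object identity, not value, and CPython 3.8+ emits a SyntaxWarning for identity comparisons against literals. A minimal sketch (not part of the commit) of the difference:

    a = "hello world"
    b = "".join(["hello", " ", "world"])  # equal value, built at runtime
    print(a == b)   # True  -- value equality, which is what the reader needs
    print(a is b)   # False in CPython -- b is a distinct object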
@@ -202,13 +203,14 @@ class S3NumpySystemMetricsReader(S3SystemMetricsReader):
     fill_val = 101

     def __init__(self, s3_trial_path, use_in_memory_cache,
-                 col_names, col_dict,
+                 col_names, extra_col_names, col_dict,
                  np_store, store_chunk_size, store_time_file_prefix,
                  group_size, n_nodes, frequency,
                  accumulate_forward = False, accumulate_minutes = 20,
                  n_process=None, logger = None):
         super().__init__(s3_trial_path, use_in_memory_cache)
         self.col_names = col_names # list of names of indicators
+        self.extra_col_names = extra_col_names # list of extra indicators, not stored in memory, as opposed to col_names
         self.col_dict = col_dict # mapping from name to position. Possibly will decide not needed
         self.np_store = np_store
         self.np_chunk_size = store_chunk_size
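For illustration, constructing the reader after this signature change might look as follows; every value below is invented, and only the col_names / extra_col_names split reflects the commit:

    col_names = ["cpu0_CPUUtilization", "gpu0_GPUUtilization"]  # kept in memory
    extra_col_names = ["MemoryUsedPercent_Memory"]               # disk only, for rulelets
    col_dict = {name: i for i, name in enumerate(col_names + extra_col_names)}

    reader = S3NumpySystemMetricsReader(
        "s3://bucket/trial", False,            # s3_trial_path, use_in_memory_cache
        col_names, extra_col_names, col_dict,
        np_store="/tmp/np_store", store_chunk_size=10000,
        store_time_file_prefix="t", group_size=1, n_nodes=2, frequency=100)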
@@ -273,7 +275,9 @@ def init_accumulators(self):

        num_rows = self.accu_mins * 60 * 10 # Max, at highest frequency
        num_rows = num_rows + num_rows//10 # Buffer
-        num_cols = len(self.col_names)
+        # extra_col_names are counted separately because, unlike the
+        # col_names based indicators, they are stored on disk only
+        num_cols = len(self.col_names) + len(self.extra_col_names)
        self.accu_n_rows = num_rows

        for i in range (0, self.n_nodes):
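Worked example of the sizing above, using the default accumulate_minutes = 20:

    accu_mins = 20
    num_rows = accu_mins * 60 * 10        # 12000 rows: 10 samples/s at the fastest 100 ms rate
    num_rows = num_rows + num_rows // 10  # 13200 rows after the 10% buffer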
@@ -299,9 +303,10 @@ class S3NumpySystemMetricsReader(S3SystemMetricsReader):
             self.logger.info("NumpyS3Reader: ALLOCATED ACCU MEM for node {}".format(i))

     def _json_to_numpy(node_ind, start_time, end_time, freq_delta,
-                       num_rows, num_cols, np_chunk_size, event_data_list,
+                       num_rows, num_cols, num_extra_cols,
+                       np_chunk_size, event_data_list,
                        shared_mem_id, col_dict,
-                       accu_val_mem_id, accu_time_mem_id,
+                       accu_val_mem_id, accu_time_mem_id,
                        accu_cnt_mem_id, accu_num_rows,
                        np_store, tprefix, queue, logger):
@@ -329,25 +334,26 @@ def _json_to_numpy(node_ind, start_time, end_time, freq_delta,

         accu_shm_count = shared_memory.SharedMemory(name=accu_cnt_mem_id)
         accu_counts =\
-            np.ndarray((num_cols,), dtype=np.int32, buffer=accu_shm_count.buf)
+            np.ndarray((num_cols+num_extra_cols,),
+                       dtype=np.int32, buffer=accu_shm_count.buf)

         accu_shm_val = shared_memory.SharedMemory(name=accu_val_mem_id)
-        accu_vals = np.ndarray((accu_num_rows, num_cols),
+        accu_vals = np.ndarray((accu_num_rows, num_cols+num_extra_cols),
                                dtype=np.int32, buffer=accu_shm_val.buf)

         accu_shm_time = shared_memory.SharedMemory(name=accu_time_mem_id)
-        accu_times = np.ndarray((accu_num_rows, num_cols),
+        accu_times = np.ndarray((accu_num_rows, num_cols+num_extra_cols),
                                 dtype=np.int64, buffer=accu_shm_time.buf)

         n_extra = np_chunk_size//100
         for i in range (0, num_chunks):
             np_val_chunks[i] =\
-                np.full((np_chunk_size+n_extra, num_cols),
+                np.full((np_chunk_size+n_extra, num_cols+num_extra_cols),
                         -1, dtype = np.int32)
             np_time_chunks[i] =\
-                np.full((np_chunk_size+n_extra, num_cols),
+                np.full((np_chunk_size+n_extra, num_cols+num_extra_cols),
                         -1, dtype = np.int64)
-            np_ragged_sizes[i] = np.zeros((num_cols,), dtype = np.int32)
+            np_ragged_sizes[i] = np.zeros((num_cols+num_extra_cols,), dtype = np.int32)

         separator = S3NumpySystemMetricsReader.separator

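The accumulators above are ordinary NumPy arrays backed by multiprocessing shared memory, so worker processes can fill them in place. A self-contained sketch of the pattern (sizes invented):

    from multiprocessing import shared_memory
    import numpy as np

    accu_num_rows, num_cols, num_extra_cols = 100, 4, 2
    width = num_cols + num_extra_cols

    shm = shared_memory.SharedMemory(create=True, size=accu_num_rows * width * 4)
    vals = np.ndarray((accu_num_rows, width), dtype=np.int32, buffer=shm.buf)
    vals[:] = -1                 # writes land in shared memory directly

    # another process attaches by name and sees the same bytes
    shm2 = shared_memory.SharedMemory(name=shm.name)
    vals2 = np.ndarray((accu_num_rows, width), dtype=np.int32, buffer=shm2.buf)
    assert vals2[0, 0] == -1

    shm2.close(); shm.close(); shm.unlink()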
@@ -359,23 +365,24 @@ def _json_to_numpy(node_ind, start_time, end_time, freq_delta,

         n_zeros = 0
         n_nonzeros = 0
-        network_used = []
-        cpu_memory_used = []
         for event_data in event_data_list:
             event_string = event_data.decode("utf-8")
             event_items = event_string.split("\n")
             event_items.remove("")
             for item in event_items:
                 event = json.loads(item) #ojson better
-                if event['Dimension'] == "Algorithm":
-                    network_used.append(int(event['Value']))
-                    continue
-                if event['Name'] == "MemoryUsedPercent":
-                    cpu_memory_used.append( float(event['Value']))
-                    continue
-                if event['Name'].startswith("cpu") == False and\
+                if event['Dimension'] != "Algorithm" and\
+                   event['Name'] != "MemoryUsedPercent" and\
+                   event['Name'].startswith("cpu") == False and\
                    event['Name'].startswith("gpu") == False:
                     continue
+
+                if event['Name'] == "MemoryUsedPercent":
+                    # Rounding up is more informative when very little RAM is used
+                    event['Value'] = math.ceil(event['Value'])
+
+                is_extra = (event['Dimension'] == "Algorithm" or\
+                            event['Name'] == "MemoryUsedPercent")
                 col_ind = col_dict[event['Name']+separator+event['Dimension']]
                 cur_time = int(event['Timestamp']*1000000) #fast

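The records parsed here are newline-delimited JSON with Name, Dimension, Value and Timestamp fields, per the code above; the sample values below are invented:

    import json, math

    raw = ('{"Name": "MemoryUsedPercent", "Dimension": "Memory",'
           ' "Value": 2.3, "Timestamp": 1600000000.25}\n')
    for item in raw.split("\n"):
        if item == "":
            continue
        event = json.loads(item)
        is_extra = (event['Dimension'] == "Algorithm" or
                    event['Name'] == "MemoryUsedPercent")
        if event['Name'] == "MemoryUsedPercent":
            event['Value'] = math.ceil(event['Value'])   # 2.3 -> 3
        print(event['Name'], event['Value'], is_extra)   # MemoryUsedPercent 3 True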
@@ -389,11 +396,12 @@ def _json_to_numpy(node_ind, start_time, end_time, freq_delta,
                 chunk_ind = row_ind//np_chunk_size
                 chunk_row_ind = row_ind - chunk_ind*np_chunk_size

-                np_arr[row_ind,col_ind] = event['Value']
-                if event['Value'] == 0:
-                    n_zeros += 1
-                else:
-                    n_nonzeros += 1
+                if is_extra is False:
+                    np_arr[row_ind,col_ind] = event['Value']
+                    if event['Value'] == 0:
+                        n_zeros += 1
+                    else:
+                        n_nonzeros += 1

                 try:
                     if accu_val_mem_id is not None and accu_counts[col_ind] < accu_num_rows:
@@ -435,7 +443,7 @@ def _json_to_numpy(node_ind, start_time, end_time, freq_delta,
             max_entries_in_chunk = 0
             min_time_in_chunk = max_time # This will yield a filename

-            for col_ind in range (0, num_cols):
+            for col_ind in range (0, num_cols+num_extra_cols):
                 n_entries = np_ragged_sizes[chunk_ind][col_ind]
                 max_entries_in_chunk = max(max_entries_in_chunk, n_entries)

@@ -458,8 +466,7 @@ def _json_to_numpy(node_ind, start_time, end_time, freq_delta,
                 continue

             if max_entries_in_chunk > 0:
-
-                shp = (np_chunk_size+n_extra, num_cols)
+                shp = (np_chunk_size+n_extra, num_cols+num_extra_cols)
                 S3NumpySystemMetricsReader.store_vals_times(
                     node_ind, min_time_in_chunk, np_store, tprefix,
                     shp, np_val_chunks[chunk_ind], np_time_chunks[chunk_ind])
@@ -469,12 +476,7 @@ def _json_to_numpy(node_ind, start_time, end_time, freq_delta,
         #print("RAGGED min_time_in_chunk type: {}".format(type(min_time_in_chunk.item())))
         #print("RAGGED np_ragged_sized type: {}".format(type(np_ragged_sizes[chunk_ind])))

-        network_used = np.array(network_used)
-        cpu_memory_used = np.array(cpu_memory_used)
-        S3NumpySystemMetricsReader.store_vals(node_ind, min_time, np_store, (len(network_used),), network_used, val_type="Network")
-        S3NumpySystemMetricsReader.store_vals(node_ind, min_time, np_store, (len(cpu_memory_used),), cpu_memory_used, val_type="CPUmemory", dtype=float)
-
-        logger.info("S3NumpyReader _json_to_numpy FINISHED for node {}".format(node_ind))
+        logger.info("S3NumpyReader _json_to_numpy FINISHED for node {} min_row {}, max_row {}, min_time {}, max_time {}".format(node_ind, min_row, max_row, min_time, max_time))
         queue.put((node_ind, min_row, max_row, min_time, max_time, jagged_metadata))

     def get_events(
@@ -542,6 +544,8 @@ def get_events(

         num_rows = (end_time-start_time)//(self.frequency*1000)
         num_cols = len(self.col_names)
+        # Not stored in memory, as opposed to the col_names based indicators:
+        num_extra_cols = len(self.extra_col_names)
         np_chunk_size = self.np_chunk_size
         self.logger.info("NumpyS3Reader: untrimmed DF shape ({},{})".format(num_rows,len(self.col_names)))
         np_arr = np.full((num_rows, num_cols), np.nan)
@@ -581,7 +585,7 @@ def get_events(
         for i in range (0, n_nodes):
             tasks[i] = Process(target=S3NumpySystemMetricsReader._json_to_numpy,
                         args=(i, start_time, end_time, freq_delta,
-                              num_rows, num_cols, np_chunk_size,
+                              num_rows, num_cols, num_extra_cols, np_chunk_size,
                               event_data_lists[i],
                               shared_mem_ids[i], copy.deepcopy(self.col_dict),
                               self.accu_val_mem_ids[i] if forward else None,
@@ -632,6 +636,16 @@ def get_events(

         # Could multiprocess the backfill as well. Not a bottleneck for now
         post_process = True
+        """
+        Logic for fl below.
+        We want to backfill, but not when the user has paused profiling.
+        The user may also have switched profiling frequencies, the maximum
+        being 60000 ms and ours possibly 100 ms, so we deem it an
+        "interruption" if we see no signal within
+        fudge_factor * ("max freq" / "our freq") samples, with fudge_factor = 3;
+        the worst case is 3 minutes.
+        """
+        fl = int(max(5, 3*60000/self.frequency))
         for i in range (0, n_nodes):
             shm = shared_memory.SharedMemory(name=shared_mem_ids[i])
             arr_from_sh = np.ndarray(num_rows*num_cols,
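Concretely, with the fudge factor of 3 used in the code:

    for frequency in (100, 60000):          # ms between samples
        fl = int(max(5, 3 * 60000 / frequency))
        print(frequency, fl)                # 100 -> 1800, 60000 -> 5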
@@ -646,8 +660,14 @@ def get_events(
             self.logger.info("S3NumpyReader: {} nans out of {}".format(mask.sum(), np_arr.size))
             st_loc = time.perf_counter_ns()
             temp_df = pd.DataFrame(np_arr)
-            temp_df.fillna(method='ffill', axis=0, inplace=True)
-            temp_df.fillna(method='bfill', axis=0, inplace=True)
+
+            # Fill at most fl missing values; a longer run of NaNs means a real gap
+            temp_df.fillna(method='ffill', axis=0, limit = fl, inplace=True)
+            temp_df.fillna(method='bfill', axis=0, limit = fl, inplace=True)
+            # If anything is left to fill, we had a profiling gap. Zero it
+            temp_df.fillna(0, axis=0, inplace=True)
+            temp_df.fillna(0, axis=0, inplace=True)
+
             fill_val = S3NumpySystemMetricsReader.fill_val
             temp_df.fillna(fill_val, axis=0, inplace=True)
             np_arr = temp_df.values
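A small sketch of the three-stage fill, with fl shrunk to 2 for readability (newer pandas prefers df.ffill(limit=...), but this mirrors the API used in the commit):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"v": [1.0] + [np.nan] * 6 + [8.0]})
    fl = 2

    df.fillna(method='ffill', limit=fl, inplace=True)  # rows 1-2 take 1.0
    df.fillna(method='bfill', limit=fl, inplace=True)  # rows 5-6 take 8.0
    df.fillna(0, inplace=True)                         # rows 3-4 are a gap -> 0.0
    print(df["v"].tolist())  # [1.0, 1.0, 1.0, 0.0, 0.0, 8.0, 8.0, 8.0]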
@@ -696,7 +716,8 @@ def collect_accu_metadata(self, start_time: int, end_time: int, forward: bool):

         self.logger.info ("S3NumpyReader: writing accumulated data")

-        num_cols = len(self.col_names)
+        # both col_names based and extra_col_names based indicators go to disk:
+        num_cols = len(self.col_names) + len(self.extra_col_names)
         num_rows = self.accu_n_rows
         np_store = self.np_store
         tprefix = self.tprefix
@@ -807,11 +828,10 @@ def store_vals_times(node_ind, min_time_in_chunk, np_store, tprefix,
             S3NumpySystemMetricsReader.dump_to_disk(np_store, node_name, time_filename, np_time, shp, dtype=np.int64)

     @staticmethod
-    def store_vals(node_ind, min_time, np_store, shp, np_data, val_type="", dtype=np.int64):
+    def store_vals(node_ind, np_store, shp, np_data, val_type="", dtype=np.int32):
         node_name = S3NumpySystemMetricsReader.node_name_from_index(node_ind)
         separator = S3NumpySystemMetricsReader.separator
-        filename = val_type + separator + str(min_time) + separator + str(node_ind+1) + \
-                   ".npy"
+        filename = val_type + separator + str(node_ind+1) + ".npy"
         if np_store.startswith("s3://"):
             S3NumpySystemMetricsReader.dump_to_s3(np_store, node_name, filename, np_data)
         else:
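With min_time dropped from the name, each node now rewrites one stable file per val_type. Assuming the class separator is "_" (its actual value is not shown in this diff):

    separator = "_"                         # assumed for illustration
    node_ind, val_type = 0, "Network"
    filename = val_type + separator + str(node_ind + 1) + ".npy"
    print(filename)                         # Network_1.npy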
@@ -829,9 +849,12 @@ def dump_to_s3(s3_storage_loc, node_name, filename, np_data):
             s3_client.upload_fileobj(data_stream, bucket, filepath)

     @staticmethod
-    def dump_to_disk(disk_storage_loc, node_name, filename, np_data, shp, dtype=np.int64):
+    def dump_to_disk(disk_storage_loc, node_name, filename, np_data, shp, dtype=np.int32):
+        directory = os.path.join(disk_storage_loc, node_name)
         filepath = os.path.join(disk_storage_loc, node_name, filename)

+        if not os.path.exists(directory):
+            os.makedirs(directory)
         fp_numpy = np.memmap(filepath,
                              dtype=dtype, offset=0, mode='w+', shape = shp)

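The makedirs guard matters because np.memmap with mode='w+' creates the file but not its parent directories. A minimal round-trip sketch (path and shape invented; note that memmap stores raw bytes, not .npy format, so the reader must supply the same dtype and shape):

    import numpy as np

    shp = (1000, 6)
    fp = np.memmap("/tmp/example_vals.npy", dtype=np.int32, mode='w+', shape=shp)
    fp[:] = -1
    fp.flush()                # persist before dropping the reference
    del fp

    arr = np.memmap("/tmp/example_vals.npy", dtype=np.int32, mode='r', shape=shp)
    print(arr[0, :3])         # [-1 -1 -1]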