 import os
 import io
 import sys
+import math
 import pickle
 import pandas as pd
 import numpy as np
@@ -75,7 +76,7 @@ def _get_event_files_in_the_range(
         available timestamp
         """
 
-        if self._startAfter_prefix is not "":
+        if self._startAfter_prefix != "":
             if end_time_microseconds >= get_utctimestamp_us_since_epoch_from_system_profiler_file(
                 self._startAfter_prefix
             ):
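
Aside on the fix above: `is not` compares object identity, not value, so `self._startAfter_prefix is not ""` only worked by accident of CPython's string interning, and Python 3.8+ emits a SyntaxWarning for comparisons against literals. A minimal standalone sketch of the difference (names here are illustrative, not from this repo):

a = "".join(["x", "y"])   # "xy" built at runtime, not interned
print(a == "xy")          # True: value equality, the intended check
print(a is "xy")          # usually False, plus a SyntaxWarning on 3.8+
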
@@ -202,13 +203,14 @@ class S3NumpySystemMetricsReader(S3SystemMetricsReader):
     fill_val = 101
 
     def __init__(self, s3_trial_path, use_in_memory_cache,
-                 col_names, col_dict,
+                 col_names, extra_col_names, col_dict,
                  np_store, store_chunk_size, store_time_file_prefix,
                  group_size, n_nodes, frequency,
                  accumulate_forward=False, accumulate_minutes=20,
                  n_process=None, logger=None):
         super().__init__(s3_trial_path, use_in_memory_cache)
         self.col_names = col_names  # list of names of indicators
+        self.extra_col_names = extra_col_names  # extra indicators, not kept in memory, unlike col_names
         self.col_dict = col_dict  # mapping from name to position; may turn out to be unnecessary
         self.np_store = np_store
         self.np_chunk_size = store_chunk_size
@@ -273,7 +275,9 @@ def init_accumulators(self):
 
         num_rows = self.accu_mins * 60 * 10  # Max, at highest frequency
         num_rows = num_rows + num_rows // 10  # Buffer
-        num_cols = len(self.col_names)
+        # extra_col_names is counted separately because, unlike the indices
+        # corresponding to col_names, those indicators are stored on disk only
+        num_cols = len(self.col_names) + len(self.extra_col_names)
         self.accu_n_rows = num_rows
 
         for i in range(0, self.n_nodes):
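
The sizing arithmetic above caps the accumulators at the highest sampling rate the source comment implies (10 samples per second, i.e. one every 100 ms), plus a 10% buffer. Worked through for the default accumulate_minutes=20:

num_rows = 20 * 60 * 10              # 12000 samples at 100 ms resolution
num_rows = num_rows + num_rows // 10 # 13200 rows after the 10% buffer
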
@@ -299,9 +303,10 @@ def init_accumulators(self):
             self.logger.info("NumpyS3Reader: ALLOCATED ACCU MEM for node {}".format(i))
 
     def _json_to_numpy(node_ind, start_time, end_time, freq_delta,
-                       num_rows, num_cols, np_chunk_size, event_data_list,
+                       num_rows, num_cols, num_extra_cols,
+                       np_chunk_size, event_data_list,
                        shared_mem_id, col_dict,
-                       accu_val_mem_id, accu_time_mem_id,
+                       accu_val_mem_id, accu_time_mem_id,
                        accu_cnt_mem_id, accu_num_rows,
                        np_store, tprefix, queue, logger):
 
@@ -329,25 +334,26 @@ def _json_to_numpy(node_ind, start_time, end_time, freq_delta,
 
         accu_shm_count = shared_memory.SharedMemory(name=accu_cnt_mem_id)
         accu_counts = \
-            np.ndarray((num_cols,), dtype=np.int32, buffer=accu_shm_count.buf)
+            np.ndarray((num_cols + num_extra_cols,),
+                       dtype=np.int32, buffer=accu_shm_count.buf)
 
         accu_shm_val = shared_memory.SharedMemory(name=accu_val_mem_id)
-        accu_vals = np.ndarray((accu_num_rows, num_cols),
+        accu_vals = np.ndarray((accu_num_rows, num_cols + num_extra_cols),
                                dtype=np.int32, buffer=accu_shm_val.buf)
 
         accu_shm_time = shared_memory.SharedMemory(name=accu_time_mem_id)
-        accu_times = np.ndarray((accu_num_rows, num_cols),
+        accu_times = np.ndarray((accu_num_rows, num_cols + num_extra_cols),
                                 dtype=np.int64, buffer=accu_shm_time.buf)
 
         n_extra = np_chunk_size // 100
         for i in range(0, num_chunks):
             np_val_chunks[i] = \
-                np.full((np_chunk_size + n_extra, num_cols),
+                np.full((np_chunk_size + n_extra, num_cols + num_extra_cols),
                         -1, dtype=np.int32)
             np_time_chunks[i] = \
-                np.full((np_chunk_size + n_extra, num_cols),
+                np.full((np_chunk_size + n_extra, num_cols + num_extra_cols),
                         -1, dtype=np.int64)
-            np_ragged_sizes[i] = np.zeros((num_cols,), dtype=np.int32)
+            np_ragged_sizes[i] = np.zeros((num_cols + num_extra_cols,), dtype=np.int32)
 
         separator = S3NumpySystemMetricsReader.separator
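
The hunk above widens every shared-memory view by num_extra_cols. The underlying pattern is one SharedMemory segment wrapped by an np.ndarray view in each process, so writes land in shared memory without copies. A minimal sketch of that pattern, with illustrative sizes standing in for accu_num_rows and num_cols + num_extra_cols:

import numpy as np
from multiprocessing import shared_memory

rows, cols = 8, 3
shm = shared_memory.SharedMemory(
    create=True, size=rows * cols * np.dtype(np.int32).itemsize)
vals = np.ndarray((rows, cols), dtype=np.int32, buffer=shm.buf)  # a view, no copy
vals[:] = -1                       # writes go straight into the shared segment

# a worker attaches by name, exactly as _json_to_numpy does:
worker_shm = shared_memory.SharedMemory(name=shm.name)
worker_view = np.ndarray((rows, cols), dtype=np.int32, buffer=worker_shm.buf)
assert worker_view[0, 0] == -1

worker_shm.close()
shm.close()
shm.unlink()                       # only the creating process unlinks
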
@@ -359,23 +365,24 @@ def _json_to_numpy(node_ind, start_time, end_time, freq_delta,
 
         n_zeros = 0
         n_nonzeros = 0
-        network_used = []
-        cpu_memory_used = []
         for event_data in event_data_list:
             event_string = event_data.decode("utf-8")
             event_items = event_string.split("\n")
             event_items.remove("")
             for item in event_items:
                 event = json.loads(item)  # ojson would be faster
-                if event['Dimension'] == "Algorithm":
-                    network_used.append(int(event['Value']))
-                    continue
-                if event['Name'] == "MemoryUsedPercent":
-                    cpu_memory_used.append(float(event['Value']))
-                    continue
-                if event['Name'].startswith("cpu") == False and \
+                if event['Dimension'] != "Algorithm" and \
+                   event['Name'] != "MemoryUsedPercent" and \
+                   event['Name'].startswith("cpu") == False and \
                    event['Name'].startswith("gpu") == False:
                     continue
+
+                if event['Name'] == "MemoryUsedPercent":
+                    # Rounding up is more informative when very little RAM is used
+                    event['Value'] = math.ceil(event['Value'])
+
+                is_extra = (event['Dimension'] == "Algorithm" or
+                            event['Name'] == "MemoryUsedPercent")
                 col_ind = col_dict[event['Name'] + separator + event['Dimension']]
                 cur_time = int(event['Timestamp'] * 1000000)  # fast
@@ -389,11 +396,12 @@ def _json_to_numpy(node_ind, start_time, end_time, freq_delta,
                 chunk_ind = row_ind // np_chunk_size
                 chunk_row_ind = row_ind - chunk_ind * np_chunk_size
 
-                np_arr[row_ind, col_ind] = event['Value']
-                if event['Value'] == 0:
-                    n_zeros += 1
-                else:
-                    n_nonzeros += 1
+                if is_extra is False:
+                    np_arr[row_ind, col_ind] = event['Value']
+                    if event['Value'] == 0:
+                        n_zeros += 1
+                    else:
+                        n_nonzeros += 1
 
                 try:
                     if accu_val_mem_id is not None and accu_counts[col_ind] < accu_num_rows:
@@ -435,7 +443,7 @@ def _json_to_numpy(node_ind, start_time, end_time, freq_delta,
             max_entries_in_chunk = 0
             min_time_in_chunk = max_time  # This will yield a filename
 
-            for col_ind in range(0, num_cols):
+            for col_ind in range(0, num_cols + num_extra_cols):
                 n_entries = np_ragged_sizes[chunk_ind][col_ind]
                 max_entries_in_chunk = max(max_entries_in_chunk, n_entries)
@@ -458,8 +466,7 @@ def _json_to_numpy(node_ind, start_time, end_time, freq_delta,
                 continue
 
             if max_entries_in_chunk > 0:
-
-                shp = (np_chunk_size + n_extra, num_cols)
+                shp = (np_chunk_size + n_extra, num_cols + num_extra_cols)
                 S3NumpySystemMetricsReader.store_vals_times(
                     node_ind, min_time_in_chunk, np_store, tprefix,
                     shp, np_val_chunks[chunk_ind], np_time_chunks[chunk_ind])
@@ -469,12 +476,7 @@ def _json_to_numpy(node_ind, start_time, end_time, freq_delta,
 
         #print("RAGGED min_time_in_chunk type: {}".format(type(min_time_in_chunk.item())))
         #print("RAGGED np_ragged_sized type: {}".format(type(np_ragged_sizes[chunk_ind])))
-        network_used = np.array(network_used)
-        cpu_memory_used = np.array(cpu_memory_used)
-        S3NumpySystemMetricsReader.store_vals(node_ind, min_time, np_store, (len(network_used),), network_used, val_type="Network")
-        S3NumpySystemMetricsReader.store_vals(node_ind, min_time, np_store, (len(cpu_memory_used),), cpu_memory_used, val_type="CPUmemory", dtype=float)
-
-        logger.info("S3NumpyReader _json_to_numpy FINISHED for node {}".format(node_ind))
+        logger.info("S3NumpyReader _json_to_numpy FINISHED for node {} min_row {}, max_row {}, min_time {}, max_time {}".format(node_ind, min_row, max_row, min_time, max_time))
         queue.put((node_ind, min_row, max_row, min_time, max_time, jagged_metadata))
 
     def get_events(
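
For orientation, _json_to_numpy runs once per node inside a multiprocessing.Process and hands its summary tuple back through a Queue, which get_events drains after spawning the workers. A stripped-down sketch of that producer/consumer shape (the payload is simplified to the tuple fields reported above):

from multiprocessing import Process, Queue

def fake_json_to_numpy(node_ind, queue):
    # stand-in for the real worker's final queue.put(...)
    queue.put((node_ind, 0, 99, 1_000_000, 2_000_000, {}))

if __name__ == "__main__":
    n_nodes, queue = 4, Queue()
    tasks = [Process(target=fake_json_to_numpy, args=(i, queue))
             for i in range(n_nodes)]
    for t in tasks:
        t.start()
    results = [queue.get() for _ in range(n_nodes)]  # drain before join
    for t in tasks:
        t.join()

Draining the queue before join() matters: a child process blocks on exit until its queued data has been consumed, so joining first can deadlock on large payloads.
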
@@ -542,6 +544,8 @@ def get_events(
 
         num_rows = (end_time - start_time) // (self.frequency * 1000)
         num_cols = len(self.col_names)
+        # Not stored in memory, as opposed to the col_names based indicators:
+        num_extra_cols = len(self.extra_col_names)
         np_chunk_size = self.np_chunk_size
         self.logger.info("NumpyS3Reader: untrimmed DF shape ({},{})".format(num_rows, len(self.col_names)))
         np_arr = np.full((num_rows, num_cols), np.nan)
@@ -581,7 +585,7 @@ def get_events(
         for i in range(0, n_nodes):
             tasks[i] = Process(target=S3NumpySystemMetricsReader._json_to_numpy,
                                args=(i, start_time, end_time, freq_delta,
-                                     num_rows, num_cols, np_chunk_size,
+                                     num_rows, num_cols, num_extra_cols, np_chunk_size,
                                      event_data_lists[i],
                                      shared_mem_ids[i], copy.deepcopy(self.col_dict),
                                      self.accu_val_mem_ids[i] if forward else None,
@@ -632,6 +636,16 @@ def get_events(
 
         # Could multiprocess the backfill as well. Not a bottleneck for now
         post_process = True
+        """
+        Logic for fl below.
+        We want to backfill, but not when the user pauses profiling.
+        Also, the user may have switched profiling frequencies, the max
+        being 60000 ms and ours possibly 100 ms, so we deem it an
+        "interruption" if we do not see a signal within
+            fudge_factor * 60000 / "our freq"
+        rows; the worst case is 3 minutes.
+        """
+        fl = int(max(5, 3 * 60000 / self.frequency))
         for i in range(0, n_nodes):
             shm = shared_memory.SharedMemory(name=shared_mem_ids[i])
             arr_from_sh = np.ndarray(num_rows * num_cols,
@@ -646,8 +660,13 @@ def get_events(
             self.logger.info("S3NumpyReader: {} nans out of {}".format(mask.sum(), np_arr.size))
             st_loc = time.perf_counter_ns()
             temp_df = pd.DataFrame(np_arr)
-            temp_df.fillna(method='ffill', axis=0, inplace=True)
-            temp_df.fillna(method='bfill', axis=0, inplace=True)
+
+            # Fill at most fl missing values; if more are missing, there is a gap
+            temp_df.fillna(method='ffill', axis=0, limit=fl, inplace=True)
+            temp_df.fillna(method='bfill', axis=0, limit=fl, inplace=True)
+            # If anything is left to fill, we had a profiling gap. Zero it
+            temp_df.fillna(0, axis=0, inplace=True)
+
             fill_val = S3NumpySystemMetricsReader.fill_val
             temp_df.fillna(fill_val, axis=0, inplace=True)
             np_arr = temp_df.values
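
To see what the limit=fl change buys: runs of up to fl consecutive NaNs are bridged by the forward/backward passes, while anything longer, i.e. a real profiling pause, survives both passes and is zeroed. A worked example with fl = 2, using the same pandas calls as above (newer pandas prefers df.ffill(limit=...), but the behavior is the same):

import numpy as np
import pandas as pd

fl = 2
df = pd.DataFrame({
    "cpu": [5.0, np.nan, np.nan, np.nan, np.nan, np.nan, 7.0],  # long gap
    "gpu": [1.0, np.nan, 2.0, 3.0, np.nan, np.nan, 4.0],        # short dropouts
})
df.fillna(method='ffill', axis=0, limit=fl, inplace=True)
df.fillna(method='bfill', axis=0, limit=fl, inplace=True)
df.fillna(0, axis=0, inplace=True)
print(df["cpu"].tolist())  # [5.0, 5.0, 5.0, 0.0, 7.0, 7.0, 7.0]: gap zeroed
print(df["gpu"].tolist())  # [1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 4.0]: dropouts filled
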
@@ -696,7 +716,8 @@ def collect_accu_metadata(self, start_time: int, end_time: int, forward: bool):
 
         self.logger.info("S3NumpyReader: writing accumulated data")
 
-        num_cols = len(self.col_names)
+        # both col_names based and extra_col_names based indicators go to disk:
+        num_cols = len(self.col_names) + len(self.extra_col_names)
         num_rows = self.accu_n_rows
         np_store = self.np_store
         tprefix = self.tprefix
@@ -807,11 +828,10 @@ def store_vals_times(node_ind, min_time_in_chunk, np_store, tprefix,
             S3NumpySystemMetricsReader.dump_to_disk(np_store, node_name, time_filename, np_time, shp, dtype=np.int64)
 
     @staticmethod
-    def store_vals(node_ind, min_time, np_store, shp, np_data, val_type="", dtype=np.int64):
+    def store_vals(node_ind, np_store, shp, np_data, val_type="", dtype=np.int32):
         node_name = S3NumpySystemMetricsReader.node_name_from_index(node_ind)
         separator = S3NumpySystemMetricsReader.separator
-        filename = val_type + separator + str(min_time) + separator + str(node_ind + 1) + \
-            ".npy"
+        filename = val_type + separator + str(node_ind + 1) + ".npy"
         if np_store.startswith("s3://"):
             S3NumpySystemMetricsReader.dump_to_s3(np_store, node_name, filename, np_data)
         else:
@@ -829,9 +849,12 @@ def dump_to_s3(s3_storage_loc, node_name, filename, np_data):
         s3_client.upload_fileobj(data_stream, bucket, filepath)
 
     @staticmethod
-    def dump_to_disk(disk_storage_loc, node_name, filename, np_data, shp, dtype=np.int64):
+    def dump_to_disk(disk_storage_loc, node_name, filename, np_data, shp, dtype=np.int32):
+        directory = os.path.join(disk_storage_loc, node_name)
         filepath = os.path.join(disk_storage_loc, node_name, filename)
 
+        if not os.path.exists(directory):
+            os.makedirs(directory)
         fp_numpy = np.memmap(filepath,
                              dtype=dtype, offset=0, mode='w+', shape=shp)
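
The os.makedirs guard matters because np.memmap with mode='w+' creates the file but not missing parent directories. A standalone sketch of the write/read round trip this helper performs (paths are illustrative; note that memmap writes raw bytes despite the .npy suffix, so the reader must already know shape and dtype, which is why shp travels with every call):

import os
import numpy as np

store, node = "/tmp/np_store", "node_1"           # illustrative locations
os.makedirs(os.path.join(store, node), exist_ok=True)
filepath = os.path.join(store, node, "vals.npy")  # raw bytes, not .npy format

shp = (4, 3)
out = np.memmap(filepath, dtype=np.int32, offset=0, mode='w+', shape=shp)
out[:] = np.arange(12, dtype=np.int32).reshape(shp)
out.flush()                                       # push dirty pages to disk
del out                                           # release the mapping

back = np.memmap(filepath, dtype=np.int32, mode='r', shape=shp)
assert int(back[3, 2]) == 11
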