11# Standard Library 
22from  collections  import  defaultdict 
3+ from  enum  import  Enum 
34
45# Third Party 
56import  pandas  as  pd 
89from  smdebug .core .logger  import  get_logger 
910
1011
12+ class  StatsBy (Enum ):
13+     """ 
14+     Enum to get stats by different categories. 
15+     """ 
16+ 
17+     # training phase such as TRAIN/EVAL/GLOBAL. 
18+     TRAINING_PHASE  =  "training_phase" 
19+ 
20+     # framework metrics such as function names/ operator names 
21+     FRAMEWORK_METRICS  =  "framework_metric" 
22+ 
23+     # event phase name as retrieved from events 
24+     PROCESS  =  "process" 
25+ 
26+ 
27+ class  Resource (Enum ):
28+     """ 
29+     Enum to specify the device/resource specified in system metrics 
30+     TODO: Add other resources as system metrics are updated 
31+     """ 
32+ 
33+     CPU  =  "cpu" 
34+ 
35+     GPU  =  "gpu" 
36+ 
37+ 
1138# Container class for job stats 
1239class  JobStats (dict ):
1340    def  __setitem__ (self , key , item ):
@@ -38,7 +65,7 @@ def __init__(self, system_df, framework_df):
3865
3966    def  get_job_statistics (self ):
4067        """ 
41-         Returns a dataframe  with information about runtime of training job, initilization, training loop and finalization. 
68+         Returns a Dictionary  with information about runtime of training job, initilization, training loop and finalization. 
4269        """ 
4370        job_statistics  =  JobStats ()
4471        job_statistics ["start_time" ] =  min (self .sys_metrics_df ["timestamp" ])
@@ -75,14 +102,20 @@ def get_job_statistics(self):
75102
76103        return  job_statistics 
77104
78-     def  get_step_statistics (self , by = "training_phase" ):
105+     def  get_step_statistics (self , by = StatsBy . TRAINING_PHASE ):
79106        """ 
80107        Get average, minimum, maximum, p50, p95, p99 stats on step duration 
81108        :param by: by default, stats are grouped by framework_metric. The other options are 
82-         to get stats by training phase - train/eval/global or grouped by process. 
109+         to get stats by training phase - train/eval/global or grouped by process. This parameter 
110+         should be of type StatsBy 
83111        """ 
112+         if  not  isinstance (by , StatsBy ):
113+             get_logger ("smdebug-profiler" ).info (f"{ by }   should be of type StatsBy" )
114+             return  None 
115+ 
116+         by  =  by .value 
84117        step_stats  =  None 
85-         if  by  in  ["framework_metric" ,  "process" ]:
118+         if  by  in  [StatsBy . FRAMEWORK_METRICS . value ,  StatsBy . PROCESS . value ]:
86119            # TODO: Consider that some events may be occurring in parallel 
87120            step_stats  =  (
88121                self .framework_metrics_df .groupby ([by ])["duration_us" ]
@@ -94,7 +127,7 @@ def get_step_statistics(self, by="training_phase"):
94127            step_stats .columns .name  =  "" 
95128            step_stats  =  step_stats .drop (["count" , "std" ], axis = "columns" )
96129            step_stats  =  step_stats [[by , "mean" , "min" , "max" , "50%" , "95%" , "99%" ]]
97-         elif  by  ==  "training_phase" :
130+         elif  by  ==  StatsBy . TRAINING_PHASE . value :
98131            phase_metrics_df  =  self .framework_metrics_df [
99132                self .framework_metrics_df ["framework_metric" ].str .contains ("Step:ModeKeys" )
100133            ]
@@ -139,18 +172,31 @@ def helper(start, end, phase):
139172            lambda  x : helper (x ["start_time_us" ], x ["end_time_us" ], x ["phase" ]), axis = 1 
140173        )
141174
142-     def  get_utilization_stats (self , by = None , phase = None ):
175+     def  get_utilization_stats (self , resource = None ,  by = None , phase = None ):
143176        """ 
144177        Get CPU/GPU utilization stats 
178+         :param resource: system resource for which utilization stats have to be computed. Type: Resource 
145179        :param by: By default, get overall utilization stats. When by="training_phase", 
146-         utilization stats are provided per training phase interval 
180+         utilization stats are provided per training phase interval. Type: StatsBy  
147181        :param phase: List of training phase to find intervals for. If nothing is mentioned, intervals 
148182        are determined for all training phases available. 
149183        :return: Dataframe containing utilization stats 
150184        """ 
151-         resources  =  ["cpu" , "gpu" ]
185+         if  (by  is  not   None ) and  (not  isinstance (by , StatsBy )):
186+             get_logger ("smdebug-profiler" ).info (f"{ by }   should be of type StatsBy" )
187+             return  None 
188+         if  (resource  is  not   None ) and  (not  isinstance (resource , (list , Resource ))):
189+             get_logger ("smdebug-profiler" ).info (f"{ resource }   should be of type list or Resource" )
190+             return  None 
152191
153-         if  by  ==  "training_phase" :
192+         if  resource  is  None :
193+             resources  =  [Resource .CPU .value , Resource .GPU .value ]
194+         else :
195+             if  isinstance (resource , Resource ):
196+                 resource  =  [resource ]
197+             resources  =  [x .value  for  x  in  resource ]
198+ 
199+         if  by  ==  StatsBy .TRAINING_PHASE :
154200            interval_df  =  self .get_training_phase_intervals (phase )
155201            self ._get_utilization_phase_by_time_interval (interval_df )
156202
@@ -159,7 +205,7 @@ def get_utilization_stats(self, by=None, phase=None):
159205            sys_resrc_df  =  self .sys_metrics_df [
160206                self .sys_metrics_df ["system_metric" ].str .contains (resrc )
161207            ].reset_index ()
162-             if  by  ==  "training_phase" :
208+             if  by  ==  StatsBy . TRAINING_PHASE :
163209                sys_resrc_df  =  (
164210                    sys_resrc_df .groupby ("phase" )["value" ]
165211                    .describe (percentiles = [0.5 , 0.95 , 0.99 ])
@@ -212,44 +258,74 @@ def get_utilization_stats(self, by=None, phase=None):
212258        util_stats .columns  =  columns 
213259        return  util_stats 
214260
215-     def  get_device_usage_stats (self , device = "cpu" , utilization_ranges = None ):
261+     def  get_device_usage_stats (self , device = Resource . CPU , utilization_ranges = None ):
216262        """ 
217263        Find the usage spread based on utilization ranges. If ranges are not provided, 
218264        >90, 10-90, <10 are considered 
219-         :param device: cpu/ gpu 
265+         :param device: Resource. cpu or Resource. gpu. Type: Resource  
220266        :param utilization_ranges: list of tuples 
221267        """ 
268+         if  utilization_ranges  is  None :
269+             utilization_ranges  =  [(90 , 100 ), (10 , 90 ), (0 , 10 )]
270+         if  not  isinstance (utilization_ranges , list ):
271+             get_logger ("smdebug-profiler" ).info (
272+                 f"{ utilization_ranges }   should be a list of tuples containing the ranges" 
273+             )
274+             return  {}
275+         if  len (utilization_ranges ) ==  0 :
276+             get_logger ("smdebug-profiler" ).info (f"{ utilization_ranges }   cannot be empty" )
277+             return  {}
278+         if  not  isinstance (device , Resource ):
279+             get_logger ("smdebug-profiler" ).info (f"{ device }   should be of type Resource" )
280+             return  {}
281+         device  =  device .value 
222282        device_sys_df  =  self .sys_metrics_df [
223283            self .sys_metrics_df ["system_metric" ].str .contains (device )
224284        ].reset_index ()
225-         if  utilization_ranges  is  None :
226-             utilization_ranges  =  [(90 , 100 ), (10 , 90 ), (0 , 10 )]
227285
228286        usage_dict  =  defaultdict (int )
229-         for  ranges  in  utilization_ranges :
230-             start , end  =  ranges 
231-             if  len (ranges ) <  2 :
232-                 get_logger ("smdebug-profiler" ).info (f"Invalid range { ranges }   for usage stats" )
287+         for  utilization_range  in  utilization_ranges :
288+             if  len (utilization_range ) !=  2 :
289+                 get_logger ("smdebug-profiler" ).info (
290+                     f"Invalid range { utilization_range }   for usage stats" 
291+                 )
233292                return  {}
234293            else :
294+                 start , end  =  utilization_range 
235295                between_range  =  len (
236296                    device_sys_df [(device_sys_df ["value" ].between (start , end , inclusive = True ))]
237297                )
238298
239-             usage_dict [ranges ] =  between_range 
299+             usage_dict [utilization_range ] =  between_range 
240300        return  usage_dict 
241301
242302    def  get_training_phase_intervals (self , phase = None ):
243303        """ 
244304        This function splits framework data into before train, train, between train and eval, eval, and after eval. 
245305        :param phase: List of training phase to find intervals for. If nothing is mentioned, intervals 
246-         are determined for all training phases available. 
306+         are determined for all training phases available. Type: string or List of strings  
247307        :return: DataFrame containing the intervals 
248308        """ 
309+         process_list  =  self .framework_metrics_df ["process" ].unique ()
249310        if  phase  is  None :
250-             process_list  =  self .framework_metrics_df ["process" ].unique ()
251311            phase  =  [x  for  x  in  process_list  if  "Step:ModeKeys"  in  x ]
252312
313+         if  isinstance (phase , str ):
314+             phase  =  [phase ]
315+ 
316+         if  not  isinstance (phase , list ):
317+             get_logger ("smdebug-profiler" ).info (f"{ phase }   should be a list of strings" )
318+             return  None 
319+ 
320+         # Filter out phases that are not available in process list 
321+         phase  =  [x  for  x  in  phase  if  x  in  process_list ]
322+ 
323+         if  len (phase ) ==  0 :
324+             get_logger ("smdebug-profiler" ).info (
325+                 f"None of the phase strings matched the phases available in the framework metrics DataFrame" 
326+             )
327+             return  None 
328+ 
253329        mode_df  =  self .framework_metrics_df [
254330            self .framework_metrics_df ["framework_metric" ].isin (phase )
255331        ]
0 commit comments