awslabs
diff --git a/examples/profiler/analyze_performance_bottlenecks.ipynb b/examples/profiler/analyze_performance_bottlenecks.ipynb
Lines changed: 4 additions & 4 deletions
diff --git a/smdebug/profiler/analysis/utils/pandas_data_analysis.py b/smdebug/profiler/analysis/utils/pandas_data_analysis.py
Lines changed: 97 additions & 21 deletions
@@ -408,7 +408,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from smdebug.profiler.analysis.utils.pandas_data_analysis import PandasFrameAnalysis\n",
+    "from smdebug.profiler.analysis.utils.pandas_data_analysis import PandasFrameAnalysis, StatsBy\n",
     "\n",
     "pf_analysis = PandasFrameAnalysis(system_metrics_df, framework_metrics_df)"
    ]
@@ -589,7 +589,7 @@
     }
    ],
    "source": [
-    "pf_analysis.get_step_statistics(by=\"training_phase\")"
+    "pf_analysis.get_step_statistics(by=StatsBy.TRAINING_PHASE)"
    ]
   },
   {
@@ -2590,7 +2590,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "util_stats = pf_analysis.get_utilization_stats(by=\"training_phase\")"
+    "util_stats = pf_analysis.get_utilization_stats(by=StatsBy.TRAINING_PHASE)"
    ]
   },
   {
@@ -2958,7 +2958,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.7"
+   "version": "3.6.5"
   }
  },
  "nbformat": 4,
 
@@ -1,5 +1,6 @@
 # Standard Library
 from collections import defaultdict
+from enum import Enum
 
 # Third Party
 import pandas as pd
@@ -8,6 +9,32 @@
 from smdebug.core.logger import get_logger
 
 
+class StatsBy(Enum):
+    """
+    Enum to get stats by different categories.
+    """
+
+    # training phase such as TRAIN/EVAL/GLOBAL.
+    TRAINING_PHASE = "training_phase"
+
+    # framework metrics such as function names/ operator names
+    FRAMEWORK_METRICS = "framework_metric"
+
+    # event phase name as retrieved from events
+    PROCESS = "process"
+
+
+class Resource(Enum):
+    """
+    Enum to specify the device/resource specified in system metrics
+    TODO: Add other resources as system metrics are updated
+    """
+
+    CPU = "cpu"
+
+    GPU = "gpu"
+
+
 # Container class for job stats
 class JobStats(dict):
     def __setitem__(self, key, item):
@@ -38,7 +65,7 @@ def __init__(self, system_df, framework_df):
 
     def get_job_statistics(self):
         """
-        Returns a dataframe with information about runtime of training job, initilization, training loop and finalization.
+        Returns a dictionary with information about runtime of training job, initialization, training loop and finalization.
         """
         job_statistics = JobStats()
         job_statistics["start_time"] = min(self.sys_metrics_df["timestamp"])
@@ -75,14 +102,20 @@ def get_job_statistics(self):
 
         return job_statistics
 
-    def get_step_statistics(self, by="training_phase"):
+    def get_step_statistics(self, by=StatsBy.TRAINING_PHASE):
         """
         Get average, minimum, maximum, p50, p95, p99 stats on step duration
         :param by: by default, stats are grouped by framework_metric. The other options are
-        to get stats by training phase - train/eval/global or grouped by process.
+        to get stats by training phase - train/eval/global or grouped by process. This parameter
+        should be of type StatsBy
         """
+        if not isinstance(by, StatsBy):
+            get_logger("smdebug-profiler").info(f"{by} should be of type StatsBy")
+            return None
+
+        by = by.value
         step_stats = None
-        if by in ["framework_metric", "process"]:
+        if by in [StatsBy.FRAMEWORK_METRICS.value, StatsBy.PROCESS.value]:
             # TODO: Consider that some events may be occurring in parallel
             step_stats = (
                 self.framework_metrics_df.groupby([by])["duration_us"]
@@ -94,7 +127,7 @@ def get_step_statistics(self, by="training_phase"):
             step_stats.columns.name = ""
             step_stats = step_stats.drop(["count", "std"], axis="columns")
             step_stats = step_stats[[by, "mean", "min", "max", "50%", "95%", "99%"]]
-        elif by == "training_phase":
+        elif by == StatsBy.TRAINING_PHASE.value:
             phase_metrics_df = self.framework_metrics_df[
                 self.framework_metrics_df["framework_metric"].str.contains("Step:ModeKeys")
             ]
@@ -139,18 +172,31 @@ def helper(start, end, phase):
             lambda x: helper(x["start_time_us"], x["end_time_us"], x["phase"]), axis=1
         )
 
-    def get_utilization_stats(self, by=None, phase=None):
+    def get_utilization_stats(self, resource=None, by=None, phase=None):
         """
         Get CPU/GPU utilization stats
+        :param resource: system resource for which utilization stats have to be computed. Type: Resource
         :param by: By default, get overall utilization stats. When by="training_phase",
-        utilization stats are provided per training phase interval
+        utilization stats are provided per training phase interval. Type: StatsBy
         :param phase: List of training phase to find intervals for. If nothing is mentioned, intervals
         are determined for all training phases available.
         :return: Dataframe containing utilization stats
         """
-        resources = ["cpu", "gpu"]
+        if (by is not None) and (not isinstance(by, StatsBy)):
+            get_logger("smdebug-profiler").info(f"{by} should be of type StatsBy")
+            return None
+        if (resource is not None) and (not isinstance(resource, (list, Resource))):
+            get_logger("smdebug-profiler").info(f"{resource} should be of type list or Resource")
+            return None
 
-        if by == "training_phase":
+        if resource is None:
+            resources = [Resource.CPU.value, Resource.GPU.value]
+        else:
+            if isinstance(resource, Resource):
+                resource = [resource]
+            resources = [x.value for x in resource]
+
+        if by == StatsBy.TRAINING_PHASE:
             interval_df = self.get_training_phase_intervals(phase)
             self._get_utilization_phase_by_time_interval(interval_df)
 
@@ -159,7 +205,7 @@ def get_utilization_stats(self, by=None, phase=None):
             sys_resrc_df = self.sys_metrics_df[
                 self.sys_metrics_df["system_metric"].str.contains(resrc)
             ].reset_index()
-            if by == "training_phase":
+            if by == StatsBy.TRAINING_PHASE:
                 sys_resrc_df = (
                     sys_resrc_df.groupby("phase")["value"]
                     .describe(percentiles=[0.5, 0.95, 0.99])
@@ -212,44 +258,74 @@ def get_utilization_stats(self, by=None, phase=None):
         util_stats.columns = columns
         return util_stats
 
-    def get_device_usage_stats(self, device="cpu", utilization_ranges=None):
+    def get_device_usage_stats(self, device=Resource.CPU, utilization_ranges=None):
         """
         Find the usage spread based on utilization ranges. If ranges are not provided,
         >90, 10-90, <10 are considered
-        :param device: cpu/gpu
+        :param device: Resource.CPU or Resource.GPU. Type: Resource
         :param utilization_ranges: list of tuples
         """
+        if utilization_ranges is None:
+            utilization_ranges = [(90, 100), (10, 90), (0, 10)]
+        if not isinstance(utilization_ranges, list):
+            get_logger("smdebug-profiler").info(
+                f"{utilization_ranges} should be a list of tuples containing the ranges"
+            )
+            return {}
+        if len(utilization_ranges) == 0:
+            get_logger("smdebug-profiler").info(f"{utilization_ranges} cannot be empty")
+            return {}
+        if not isinstance(device, Resource):
+            get_logger("smdebug-profiler").info(f"{device} should be of type Resource")
+            return {}
+        device = device.value
         device_sys_df = self.sys_metrics_df[
             self.sys_metrics_df["system_metric"].str.contains(device)
         ].reset_index()
-        if utilization_ranges is None:
-            utilization_ranges = [(90, 100), (10, 90), (0, 10)]
 
         usage_dict = defaultdict(int)
-        for ranges in utilization_ranges:
-            start, end = ranges
-            if len(ranges) < 2:
-                get_logger("smdebug-profiler").info(f"Invalid range {ranges} for usage stats")
+        for utilization_range in utilization_ranges:
+            if len(utilization_range) != 2:
+                get_logger("smdebug-profiler").info(
+                    f"Invalid range {utilization_range} for usage stats"
+                )
                 return {}
             else:
+                start, end = utilization_range
                 between_range = len(
                     device_sys_df[(device_sys_df["value"].between(start, end, inclusive=True))]
                 )
 
-            usage_dict[ranges] = between_range
+            usage_dict[utilization_range] = between_range
         return usage_dict
 
     def get_training_phase_intervals(self, phase=None):
         """
         This function splits framework data into before train, train, between train and eval, eval, and after eval.
         :param phase: List of training phase to find intervals for. If nothing is mentioned, intervals
-        are determined for all training phases available.
+        are determined for all training phases available. Type: string or List of strings
         :return: DataFrame containing the intervals
         """
+        process_list = self.framework_metrics_df["process"].unique()
         if phase is None:
-            process_list = self.framework_metrics_df["process"].unique()
             phase = [x for x in process_list if "Step:ModeKeys" in x]
 
+        if isinstance(phase, str):
+            phase = [phase]
+
+        if not isinstance(phase, list):
+            get_logger("smdebug-profiler").info(f"{phase} should be a list of strings")
+            return None
+
+        # Filter out phases that are not available in process list
+        phase = [x for x in phase if x in process_list]
+
+        if len(phase) == 0:
+            get_logger("smdebug-profiler").info(
+                f"None of the phase strings matched the phases available in the framework metrics DataFrame"
+            )
+            return None
+
         mode_df = self.framework_metrics_df[
             self.framework_metrics_df["framework_metric"].isin(phase)
         ]
Original file line number	Diff line number	Diff line change
`@@ -408,7 +408,7 @@`
`408`	`408`	`"metadata": {},`
`409`	`409`	`"outputs": [],`
`410`	`410`	`"source": [`
`411`		`- "from smdebug.profiler.analysis.utils.pandas_data_analysis import PandasFrameAnalysis\n",`
	`411`	`+ "from smdebug.profiler.analysis.utils.pandas_data_analysis import PandasFrameAnalysis, StatsBy\n",`
`412`	`412`	`"\n",`
`413`	`413`	`"pf_analysis = PandasFrameAnalysis(system_metrics_df, framework_metrics_df)"`
`414`	`414`	`]`
`@@ -589,7 +589,7 @@`
`589`	`589`	`}`
`590`	`590`	`],`
`591`	`591`	`"source": [`
`592`		`- "pf_analysis.get_step_statistics(by=\"training_phase\")"`
	`592`	`+ "pf_analysis.get_step_statistics(by=StatsBy.TRAINING_PHASE)"`
`593`	`593`	`]`
`594`	`594`	`},`
`595`	`595`	`{`
`@@ -2590,7 +2590,7 @@`
`2590`	`2590`	`"metadata": {},`
`2591`	`2591`	`"outputs": [],`
`2592`	`2592`	`"source": [`
`2593`		`- "util_stats = pf_analysis.get_utilization_stats(by=\"training_phase\")"`
	`2593`	`+ "util_stats = pf_analysis.get_utilization_stats(by=StatsBy.TRAINING_PHASE)"`
`2594`	`2594`	`]`
`2595`	`2595`	`},`
`2596`	`2596`	`{`
`@@ -2958,7 +2958,7 @@`
`2958`	`2958`	`"name": "python",`
`2959`	`2959`	`"nbconvert_exporter": "python",`
`2960`	`2960`	`"pygments_lexer": "ipython3",`
`2961`		`- "version": "3.7.7"`
	`2961`	`+ "version": "3.6.5"`
`2962`	`2962`	`}`
`2963`	`2963`	`},`
`2964`	`2964`	`"nbformat": 4,`