
Commit d4d1411

Followup to the PR on analysis utils (#50)
1 parent e94a40c commit d4d1411

4 files changed: +180 −72 lines changed

examples/profiler/analyze_performance_bottlenecks.ipynb

Lines changed: 4 additions & 4 deletions
@@ -408,7 +408,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from smdebug.profiler.analysis.utils.pandas_data_analysis import PandasFrameAnalysis\n",
+"from smdebug.profiler.analysis.utils.pandas_data_analysis import PandasFrameAnalysis, StatsBy\n",
 "\n",
 "pf_analysis = PandasFrameAnalysis(system_metrics_df, framework_metrics_df)"
 ]
@@ -589,7 +589,7 @@
 }
 ],
 "source": [
-"pf_analysis.get_step_statistics(by=\"training_phase\")"
+"pf_analysis.get_step_statistics(by=StatsBy.TRAINING_PHASE)"
 ]
 },
 {
@@ -2590,7 +2590,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"util_stats = pf_analysis.get_utilization_stats(by=\"training_phase\")"
+"util_stats = pf_analysis.get_utilization_stats(by=StatsBy.TRAINING_PHASE)"
 ]
 },
 {
@@ -2958,7 +2958,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.7.7"
+"version": "3.6.5"
 }
 },
 "nbformat": 4,

smdebug/profiler/analysis/utils/pandas_data_analysis.py

Lines changed: 97 additions & 21 deletions
@@ -1,5 +1,6 @@
 # Standard Library
 from collections import defaultdict
+from enum import Enum
 
 # Third Party
 import pandas as pd
@@ -8,6 +9,32 @@
 from smdebug.core.logger import get_logger
 
 
+class StatsBy(Enum):
+    """
+    Enum to get stats by different categories.
+    """
+
+    # training phase such as TRAIN/EVAL/GLOBAL.
+    TRAINING_PHASE = "training_phase"
+
+    # framework metrics such as function names/ operator names
+    FRAMEWORK_METRICS = "framework_metric"
+
+    # event phase name as retrieved from events
+    PROCESS = "process"
+
+
+class Resource(Enum):
+    """
+    Enum to specify the device/resource specified in system metrics
+    TODO: Add other resources as system metrics are updated
+    """
+
+    CPU = "cpu"
+
+    GPU = "gpu"
+
+
 # Container class for job stats
 class JobStats(dict):
     def __setitem__(self, key, item):
@@ -38,7 +65,7 @@ def __init__(self, system_df, framework_df):
 
     def get_job_statistics(self):
         """
-        Returns a dataframe with information about runtime of training job, initilization, training loop and finalization.
+        Returns a Dictionary with information about runtime of training job, initilization, training loop and finalization.
         """
         job_statistics = JobStats()
         job_statistics["start_time"] = min(self.sys_metrics_df["timestamp"])
@@ -75,14 +102,20 @@ def get_job_statistics(self):
 
         return job_statistics
 
-    def get_step_statistics(self, by="training_phase"):
+    def get_step_statistics(self, by=StatsBy.TRAINING_PHASE):
         """
         Get average, minimum, maximum, p50, p95, p99 stats on step duration
         :param by: by default, stats are grouped by framework_metric. The other options are
-        to get stats by training phase - train/eval/global or grouped by process.
+        to get stats by training phase - train/eval/global or grouped by process. This parameter
+        should be of type StatsBy
         """
+        if not isinstance(by, StatsBy):
+            get_logger("smdebug-profiler").info(f"{by} should be of type StatsBy")
+            return None
+
+        by = by.value
         step_stats = None
-        if by in ["framework_metric", "process"]:
+        if by in [StatsBy.FRAMEWORK_METRICS.value, StatsBy.PROCESS.value]:
             # TODO: Consider that some events may be occurring in parallel
             step_stats = (
                 self.framework_metrics_df.groupby([by])["duration_us"]
@@ -94,7 +127,7 @@ def get_step_statistics(self, by="training_phase"):
             step_stats.columns.name = ""
             step_stats = step_stats.drop(["count", "std"], axis="columns")
             step_stats = step_stats[[by, "mean", "min", "max", "50%", "95%", "99%"]]
-        elif by == "training_phase":
+        elif by == StatsBy.TRAINING_PHASE.value:
             phase_metrics_df = self.framework_metrics_df[
                 self.framework_metrics_df["framework_metric"].str.contains("Step:ModeKeys")
             ]
@@ -139,18 +172,31 @@ def helper(start, end, phase):
             lambda x: helper(x["start_time_us"], x["end_time_us"], x["phase"]), axis=1
         )
 
-    def get_utilization_stats(self, by=None, phase=None):
+    def get_utilization_stats(self, resource=None, by=None, phase=None):
         """
         Get CPU/GPU utilization stats
+        :param resource: system resource for which utilization stats have to be computed. Type: Resource
         :param by: By default, get overall utilization stats. When by="training_phase",
-        utilization stats are provided per training phase interval
+        utilization stats are provided per training phase interval. Type: StatsBy
         :param phase: List of training phase to find intervals for. If nothing is mentioned, intervals
         are determined for all training phases available.
         :return: Dataframe containing utilization stats
         """
-        resources = ["cpu", "gpu"]
+        if (by is not None) and (not isinstance(by, StatsBy)):
+            get_logger("smdebug-profiler").info(f"{by} should be of type StatsBy")
+            return None
+        if (resource is not None) and (not isinstance(resource, (list, Resource))):
+            get_logger("smdebug-profiler").info(f"{resource} should be of type list or Resource")
+            return None
 
-        if by == "training_phase":
+        if resource is None:
+            resources = [Resource.CPU.value, Resource.GPU.value]
+        else:
+            if isinstance(resource, Resource):
+                resource = [resource]
+            resources = [x.value for x in resource]
+
+        if by == StatsBy.TRAINING_PHASE:
             interval_df = self.get_training_phase_intervals(phase)
             self._get_utilization_phase_by_time_interval(interval_df)
 
@@ -159,7 +205,7 @@ def get_utilization_stats(self, by=None, phase=None):
             sys_resrc_df = self.sys_metrics_df[
                 self.sys_metrics_df["system_metric"].str.contains(resrc)
             ].reset_index()
-            if by == "training_phase":
+            if by == StatsBy.TRAINING_PHASE:
                 sys_resrc_df = (
                     sys_resrc_df.groupby("phase")["value"]
                     .describe(percentiles=[0.5, 0.95, 0.99])
@@ -212,44 +258,74 @@ def get_utilization_stats(self, by=None, phase=None):
         util_stats.columns = columns
         return util_stats
 
-    def get_device_usage_stats(self, device="cpu", utilization_ranges=None):
+    def get_device_usage_stats(self, device=Resource.CPU, utilization_ranges=None):
         """
         Find the usage spread based on utilization ranges. If ranges are not provided,
         >90, 10-90, <10 are considered
-        :param device: cpu/gpu
+        :param device: Resource.cpu or Resource.gpu. Type: Resource
         :param utilization_ranges: list of tuples
         """
+        if utilization_ranges is None:
+            utilization_ranges = [(90, 100), (10, 90), (0, 10)]
+        if not isinstance(utilization_ranges, list):
+            get_logger("smdebug-profiler").info(
+                f"{utilization_ranges} should be a list of tuples containing the ranges"
+            )
+            return {}
+        if len(utilization_ranges) == 0:
+            get_logger("smdebug-profiler").info(f"{utilization_ranges} cannot be empty")
+            return {}
+        if not isinstance(device, Resource):
+            get_logger("smdebug-profiler").info(f"{device} should be of type Resource")
+            return {}
+        device = device.value
         device_sys_df = self.sys_metrics_df[
             self.sys_metrics_df["system_metric"].str.contains(device)
         ].reset_index()
-        if utilization_ranges is None:
-            utilization_ranges = [(90, 100), (10, 90), (0, 10)]
 
         usage_dict = defaultdict(int)
-        for ranges in utilization_ranges:
-            start, end = ranges
-            if len(ranges) < 2:
-                get_logger("smdebug-profiler").info(f"Invalid range {ranges} for usage stats")
+        for utilization_range in utilization_ranges:
+            if len(utilization_range) != 2:
+                get_logger("smdebug-profiler").info(
+                    f"Invalid range {utilization_range} for usage stats"
+                )
                 return {}
             else:
+                start, end = utilization_range
                 between_range = len(
                     device_sys_df[(device_sys_df["value"].between(start, end, inclusive=True))]
                 )
 
-            usage_dict[ranges] = between_range
+            usage_dict[utilization_range] = between_range
         return usage_dict
 
     def get_training_phase_intervals(self, phase=None):
         """
         This function splits framework data into before train, train, between train and eval, eval, and after eval.
         :param phase: List of training phase to find intervals for. If nothing is mentioned, intervals
-        are determined for all training phases available.
+        are determined for all training phases available. Type: string or List of strings
         :return: DataFrame containing the intervals
         """
+        process_list = self.framework_metrics_df["process"].unique()
         if phase is None:
-            process_list = self.framework_metrics_df["process"].unique()
             phase = [x for x in process_list if "Step:ModeKeys" in x]
 
+        if isinstance(phase, str):
+            phase = [phase]
+
+        if not isinstance(phase, list):
+            get_logger("smdebug-profiler").info(f"{phase} should be a list of strings")
+            return None
+
+        # Filter out phases that are not available in process list
+        phase = [x for x in phase if x in process_list]
+
+        if len(phase) == 0:
+            get_logger("smdebug-profiler").info(
+                f"None of the phase strings matched the phases available in the framework metrics DataFrame"
+            )
+            return None
+
         mode_df = self.framework_metrics_df[
             self.framework_metrics_df["framework_metric"].isin(phase)
         ]
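
Taken together, a minimal usage sketch of the new enum-based API, assuming pf_analysis is a PandasFrameAnalysis instance built as in the notebook above (the metrics DataFrames are assumed to already exist):

from smdebug.profiler.analysis.utils.pandas_data_analysis import Resource, StatsBy

# Overall GPU utilization stats; resource accepts a Resource or a list of Resources
gpu_util = pf_analysis.get_utilization_stats(resource=Resource.GPU)

# CPU and GPU utilization broken down per training-phase interval
phase_util = pf_analysis.get_utilization_stats(
    resource=[Resource.CPU, Resource.GPU], by=StatsBy.TRAINING_PHASE
)

# Usage spread for the GPU across the default ranges >90, 10-90, <10
usage = pf_analysis.get_device_usage_stats(device=Resource.GPU)

# Raw strings are now rejected: the call logs a message and returns None
assert pf_analysis.get_step_statistics(by="training_phase") is None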
