Skip to content

Commit 3a6014a

Browse files
authored
Updating batch size rule (#123)
* fix for batch size rule
1 parent a5af69d commit 3a6014a

File tree

2 files changed

+101
-82
lines changed

2 files changed

+101
-82
lines changed

smdebug/profiler/analysis/rules/batch_size.py

Lines changed: 94 additions & 79 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@ def __init__(
4343
self.gpu_memory = {}
4444
self.gpu_utilization = {}
4545
self.cpu_utilization = {}
46+
self.core_ids = {}
4647
self.max_datapoints = 1000000
4748
self.last_timestamp = self.base_trial.first_timestamp
4849
self.report[
@@ -69,13 +70,13 @@ def invoke(self, step):
6970
def invoke_for_timerange(
7071
self, timestamp_start, timestamp_end, sys_events=None, framework_events=None
7172
):
72-
7373
# get system metric events
7474
if sys_events is None:
7575
events = self.base_trial.get_system_metrics(timestamp_start, timestamp_end)
7676
else:
7777
events = sys_events
78-
cpu_cores = {}
78+
79+
total_cpu = {}
7980

8081
# iterate over events
8182
for event in events:
@@ -101,27 +102,24 @@ def invoke_for_timerange(
101102

102103
# get cpu utilization values per node
103104
if event.dimension == "CPUUtilization":
104-
105-
if event.node_id not in cpu_cores:
106-
cpu_cores[event.node_id] = {}
107-
108-
if event.name not in cpu_cores[event.node_id]:
109-
cpu_cores[event.node_id][event.name] = []
110-
111-
cpu_cores[event.node_id][event.name].append(event.value)
105+
if event.name not in self.core_ids:
106+
self.core_ids[event.name] = 0
107+
if event.node_id not in total_cpu:
108+
total_cpu[event.node_id] = {}
109+
if event.timestamp not in total_cpu[event.node_id]:
110+
total_cpu[event.node_id][event.timestamp] = 0
111+
total_cpu[event.node_id][event.timestamp] += event.value
112112

113113
# compute cpu total
114-
for node_id in cpu_cores:
115-
total_cpu = 0
116-
117-
for cpu_id in cpu_cores[node_id]:
118-
total_cpu += np.array(cpu_cores[node_id][cpu_id])
114+
for node_id in total_cpu:
119115

120116
if node_id not in self.cpu_utilization:
121117
self.cpu_utilization[node_id] = []
122118

123-
avg_cpu = total_cpu / len(cpu_cores[node_id].keys())
124-
self.cpu_utilization[node_id].extend(avg_cpu.tolist())
119+
for timestamp in total_cpu[node_id]:
120+
self.cpu_utilization[node_id].append(
121+
total_cpu[node_id][timestamp] / len(self.core_ids)
122+
)
125123

126124
# iterate over values and compare thresholds
127125
for node_id in self.cpu_utilization:
@@ -157,69 +155,86 @@ def invoke_for_timerange(
157155
self.logger.info(
158156
f"Node {node_id} GPU {gpu_id} utilization p95 is {gpu_p95}% which is below the threshold of {self.gpu_threshold_p95}% and memory p95 is {gpu_memory_p95}% which is below the threshold of {self.gpu_memory_threshold_p95}%. Overall CPU utilization p95 is {cpu_p95}% which is below the threshold of {self.cpu_threshold_p95}%."
159157
)
160-
# record information for profiler report
161-
self.report["RuleTriggered"] += 1
162-
self.report["Violations"] += 1
163-
if node_id not in self.report["Details"]:
164-
self.report["Details"][node_id] = {}
165-
166-
self.report["Details"][node_id]["cpu"] = {
167-
"p25": np.quantile(self.cpu_utilization[node_id], 0.25),
168-
"p50": np.quantile(self.cpu_utilization[node_id], 0.50),
169-
"p75": np.quantile(self.cpu_utilization[node_id], 0.75),
170-
"p95": np.quantile(self.cpu_utilization[node_id], 0.95),
171-
}
172-
iqr = (
173-
self.report["Details"][node_id]["cpu"]["p75"]
174-
- self.report["Details"][node_id]["cpu"]["p25"]
175-
)
176-
upper = self.report["Details"][node_id]["cpu"]["p75"] + 1.5 * iqr
177-
lower = self.report["Details"][node_id]["cpu"]["p25"] - 1.5 * iqr
178-
self.report["Details"][node_id]["cpu"]["upper"] = min(
179-
upper, np.quantile(self.cpu_utilization[node_id], 1)
180-
)
181-
self.report["Details"][node_id]["cpu"]["lower"] = max(
182-
lower, np.quantile(self.cpu_utilization[node_id], 0.0)
183-
)
158+
# record information for profiler report
159+
self.report["RuleTriggered"] += 1
160+
self.report["Violations"] += 1
161+
if node_id not in self.report["Details"]:
162+
self.report["Details"][node_id] = {}
163+
164+
self.report["Details"][node_id]["cpu"] = {
165+
"p25": np.quantile(self.cpu_utilization[node_id], 0.25),
166+
"p50": np.quantile(self.cpu_utilization[node_id], 0.50),
167+
"p75": np.quantile(self.cpu_utilization[node_id], 0.75),
168+
"p95": np.quantile(self.cpu_utilization[node_id], 0.95),
169+
}
170+
iqr = (
171+
self.report["Details"][node_id]["cpu"]["p75"]
172+
- self.report["Details"][node_id]["cpu"]["p25"]
173+
)
174+
upper = (
175+
self.report["Details"][node_id]["cpu"]["p75"] + 1.5 * iqr
176+
)
177+
lower = (
178+
self.report["Details"][node_id]["cpu"]["p25"] - 1.5 * iqr
179+
)
180+
self.report["Details"][node_id]["cpu"]["upper"] = min(
181+
upper, np.quantile(self.cpu_utilization[node_id], 1)
182+
)
183+
self.report["Details"][node_id]["cpu"]["lower"] = max(
184+
lower, np.quantile(self.cpu_utilization[node_id], 0.0)
185+
)
184186

185-
self.report["Details"][node_id][gpu_id] = {
186-
"p25": np.quantile(self.gpu_utilization[node_id][gpu_id], 0.25),
187-
"p50": np.quantile(self.gpu_utilization[node_id][gpu_id], 0.50),
188-
"p75": np.quantile(self.gpu_utilization[node_id][gpu_id], 0.75),
189-
"p95": np.quantile(self.gpu_utilization[node_id][gpu_id], 0.95),
190-
}
191-
iqr = (
192-
self.report["Details"][node_id][gpu_id]["p75"]
193-
- self.report["Details"][node_id][gpu_id]["p25"]
194-
)
195-
upper = self.report["Details"][node_id][gpu_id]["p75"] + 1.5 * iqr
196-
lower = self.report["Details"][node_id][gpu_id]["p25"] - 1.5 * iqr
197-
self.report["Details"][node_id][gpu_id]["upper"] = min(
198-
upper, np.quantile(self.gpu_utilization[node_id][gpu_id], 1)
199-
)
200-
self.report["Details"][node_id][gpu_id]["lower"] = max(
201-
lower, np.quantile(self.gpu_utilization[node_id][gpu_id], 0.0)
202-
)
203-
key = f"{gpu_id}_memory"
204-
self.report["Details"][node_id][key] = {
205-
"p25": np.quantile(self.gpu_memory[node_id][gpu_id], 0.25),
206-
"p50": np.quantile(self.gpu_memory[node_id][gpu_id], 0.50),
207-
"p75": np.quantile(self.gpu_memory[node_id][gpu_id], 0.75),
208-
"p95": np.quantile(self.gpu_memory[node_id][gpu_id], 0.95),
209-
}
210-
iqr = (
211-
self.report["Details"][node_id][key]["p75"]
212-
- self.report["Details"][node_id][gpu_id]["p25"]
213-
)
214-
upper = self.report["Details"][node_id][key]["p75"] + 1.5 * iqr
215-
lower = self.report["Details"][node_id][key]["p25"] - 1.5 * iqr
216-
self.report["Details"][node_id][key]["upper"] = min(
217-
upper, np.quantile(self.gpu_memory[node_id][gpu_id], 1)
218-
)
219-
self.report["Details"][node_id][key]["lower"] = max(
220-
lower, np.quantile(self.gpu_memory[node_id][gpu_id], 0.0)
221-
)
222-
self.report["Details"]["last_timestamp"] = self.last_timestamp
187+
self.report["Details"][node_id][gpu_id] = {
188+
"p25": np.quantile(
189+
self.gpu_utilization[node_id][gpu_id], 0.25
190+
),
191+
"p50": np.quantile(
192+
self.gpu_utilization[node_id][gpu_id], 0.50
193+
),
194+
"p75": np.quantile(
195+
self.gpu_utilization[node_id][gpu_id], 0.75
196+
),
197+
"p95": np.quantile(
198+
self.gpu_utilization[node_id][gpu_id], 0.95
199+
),
200+
}
201+
iqr = (
202+
self.report["Details"][node_id][gpu_id]["p75"]
203+
- self.report["Details"][node_id][gpu_id]["p25"]
204+
)
205+
upper = (
206+
self.report["Details"][node_id][gpu_id]["p75"] + 1.5 * iqr
207+
)
208+
lower = (
209+
self.report["Details"][node_id][gpu_id]["p25"] - 1.5 * iqr
210+
)
211+
self.report["Details"][node_id][gpu_id]["upper"] = min(
212+
upper, np.quantile(self.gpu_utilization[node_id][gpu_id], 1)
213+
)
214+
self.report["Details"][node_id][gpu_id]["lower"] = max(
215+
lower,
216+
np.quantile(self.gpu_utilization[node_id][gpu_id], 0.0),
217+
)
218+
key = f"{gpu_id}_memory"
219+
self.report["Details"][node_id][key] = {
220+
"p25": np.quantile(self.gpu_memory[node_id][gpu_id], 0.25),
221+
"p50": np.quantile(self.gpu_memory[node_id][gpu_id], 0.50),
222+
"p75": np.quantile(self.gpu_memory[node_id][gpu_id], 0.75),
223+
"p95": np.quantile(self.gpu_memory[node_id][gpu_id], 0.95),
224+
}
225+
iqr = (
226+
self.report["Details"][node_id][key]["p75"]
227+
- self.report["Details"][node_id][gpu_id]["p25"]
228+
)
229+
upper = self.report["Details"][node_id][key]["p75"] + 1.5 * iqr
230+
lower = self.report["Details"][node_id][key]["p25"] - 1.5 * iqr
231+
self.report["Details"][node_id][key]["upper"] = min(
232+
upper, np.quantile(self.gpu_memory[node_id][gpu_id], 1)
233+
)
234+
self.report["Details"][node_id][key]["lower"] = max(
235+
lower, np.quantile(self.gpu_memory[node_id][gpu_id], 0.0)
236+
)
237+
self.report["Details"]["last_timestamp"] = self.last_timestamp
223238

224239
else:
225240
self.logger.info(f"Node {node_id} Overall CPU utilization p95 is {cpu_p95}% ")

smdebug/profiler/analysis/rules/profiler_report.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -118,9 +118,13 @@ def invoke_for_timerange(self, timestamp_start, timestamp_end):
118118
self.logger.info(
119119
f"Invoking rule:{rule.rule_name} for timestamp_start:{timestamp_start} to timestamp_end:{timestamp_end}"
120120
)
121-
rule_condition = rule.invoke_for_timerange(
122-
timestamp_start, timestamp_end, sys_events, framework_events
123-
)
121+
try:
122+
rule_condition = rule.invoke_for_timerange(
123+
timestamp_start, timestamp_end, sys_events, framework_events
124+
)
125+
except:
126+
self.logger.error(f"Error running rule {rule.name}")
127+
124128
is_condition_met = is_condition_met or rule_condition
125129
if self.report_dir:
126130
# Only dump the report if the report directory is specified.

0 commit comments

Comments
 (0)