@@ -43,6 +43,7 @@ def __init__(
4343 self .gpu_memory = {}
4444 self .gpu_utilization = {}
4545 self .cpu_utilization = {}
46+ self .core_ids = {}
4647 self .max_datapoints = 1000000
4748 self .last_timestamp = self .base_trial .first_timestamp
4849 self .report [
@@ -69,13 +70,13 @@ def invoke(self, step):
6970 def invoke_for_timerange (
7071 self , timestamp_start , timestamp_end , sys_events = None , framework_events = None
7172 ):
72-
7373 # get system metric events
7474 if sys_events is None :
7575 events = self .base_trial .get_system_metrics (timestamp_start , timestamp_end )
7676 else :
7777 events = sys_events
78- cpu_cores = {}
78+
79+ total_cpu = {}
7980
8081 # iterate over events
8182 for event in events :
@@ -101,27 +102,24 @@ def invoke_for_timerange(
101102
102103 # get cpu utilization values per node
103104 if event .dimension == "CPUUtilization" :
104-
105- if event .node_id not in cpu_cores :
106- cpu_cores [event .node_id ] = {}
107-
108- if event .name not in cpu_cores [event .node_id ]:
109- cpu_cores [event .node_id ][event .name ] = []
110-
111- cpu_cores [event .node_id ][event .name ].append (event .value )
105+ if event .name not in self .core_ids :
106+ self .core_ids [event .name ] = 0
107+ if event .node_id not in total_cpu :
108+ total_cpu [event .node_id ] = {}
109+ if event .timestamp not in total_cpu [event .node_id ]:
110+ total_cpu [event .node_id ][event .timestamp ] = 0
111+ total_cpu [event .node_id ][event .timestamp ] += event .value
112112
113113 # compute cpu total
114- for node_id in cpu_cores :
115- total_cpu = 0
116-
117- for cpu_id in cpu_cores [node_id ]:
118- total_cpu += np .array (cpu_cores [node_id ][cpu_id ])
114+ for node_id in total_cpu :
119115
120116 if node_id not in self .cpu_utilization :
121117 self .cpu_utilization [node_id ] = []
122118
123- avg_cpu = total_cpu / len (cpu_cores [node_id ].keys ())
124- self .cpu_utilization [node_id ].extend (avg_cpu .tolist ())
119+ for timestamp in total_cpu [node_id ]:
120+ self .cpu_utilization [node_id ].append (
121+ total_cpu [node_id ][timestamp ] / len (self .core_ids )
122+ )
125123
126124 # iterate over values and compare thresholds
127125 for node_id in self .cpu_utilization :
@@ -157,69 +155,86 @@ def invoke_for_timerange(
157155 self .logger .info (
158156 f"Node { node_id } GPU { gpu_id } utilization p95 is { gpu_p95 } % which is below the threshold of { self .gpu_threshold_p95 } % and memory p95 is { gpu_memory_p95 } % which is below the threshold of { self .gpu_memory_threshold_p95 } %. Overall CPU utilization p95 is { cpu_p95 } % which is below the threshold of { self .cpu_threshold_p95 } %."
159157 )
160- # record information for profiler report
161- self .report ["RuleTriggered" ] += 1
162- self .report ["Violations" ] += 1
163- if node_id not in self .report ["Details" ]:
164- self .report ["Details" ][node_id ] = {}
165-
166- self .report ["Details" ][node_id ]["cpu" ] = {
167- "p25" : np .quantile (self .cpu_utilization [node_id ], 0.25 ),
168- "p50" : np .quantile (self .cpu_utilization [node_id ], 0.50 ),
169- "p75" : np .quantile (self .cpu_utilization [node_id ], 0.75 ),
170- "p95" : np .quantile (self .cpu_utilization [node_id ], 0.95 ),
171- }
172- iqr = (
173- self .report ["Details" ][node_id ]["cpu" ]["p75" ]
174- - self .report ["Details" ][node_id ]["cpu" ]["p25" ]
175- )
176- upper = self .report ["Details" ][node_id ]["cpu" ]["p75" ] + 1.5 * iqr
177- lower = self .report ["Details" ][node_id ]["cpu" ]["p25" ] - 1.5 * iqr
178- self .report ["Details" ][node_id ]["cpu" ]["upper" ] = min (
179- upper , np .quantile (self .cpu_utilization [node_id ], 1 )
180- )
181- self .report ["Details" ][node_id ]["cpu" ]["lower" ] = max (
182- lower , np .quantile (self .cpu_utilization [node_id ], 0.0 )
183- )
158+ # record information for profiler report
159+ self .report ["RuleTriggered" ] += 1
160+ self .report ["Violations" ] += 1
161+ if node_id not in self .report ["Details" ]:
162+ self .report ["Details" ][node_id ] = {}
163+
164+ self .report ["Details" ][node_id ]["cpu" ] = {
165+ "p25" : np .quantile (self .cpu_utilization [node_id ], 0.25 ),
166+ "p50" : np .quantile (self .cpu_utilization [node_id ], 0.50 ),
167+ "p75" : np .quantile (self .cpu_utilization [node_id ], 0.75 ),
168+ "p95" : np .quantile (self .cpu_utilization [node_id ], 0.95 ),
169+ }
170+ iqr = (
171+ self .report ["Details" ][node_id ]["cpu" ]["p75" ]
172+ - self .report ["Details" ][node_id ]["cpu" ]["p25" ]
173+ )
174+ upper = (
175+ self .report ["Details" ][node_id ]["cpu" ]["p75" ] + 1.5 * iqr
176+ )
177+ lower = (
178+ self .report ["Details" ][node_id ]["cpu" ]["p25" ] - 1.5 * iqr
179+ )
180+ self .report ["Details" ][node_id ]["cpu" ]["upper" ] = min (
181+ upper , np .quantile (self .cpu_utilization [node_id ], 1 )
182+ )
183+ self .report ["Details" ][node_id ]["cpu" ]["lower" ] = max (
184+ lower , np .quantile (self .cpu_utilization [node_id ], 0.0 )
185+ )
184186
185- self .report ["Details" ][node_id ][gpu_id ] = {
186- "p25" : np .quantile (self .gpu_utilization [node_id ][gpu_id ], 0.25 ),
187- "p50" : np .quantile (self .gpu_utilization [node_id ][gpu_id ], 0.50 ),
188- "p75" : np .quantile (self .gpu_utilization [node_id ][gpu_id ], 0.75 ),
189- "p95" : np .quantile (self .gpu_utilization [node_id ][gpu_id ], 0.95 ),
190- }
191- iqr = (
192- self .report ["Details" ][node_id ][gpu_id ]["p75" ]
193- - self .report ["Details" ][node_id ][gpu_id ]["p25" ]
194- )
195- upper = self .report ["Details" ][node_id ][gpu_id ]["p75" ] + 1.5 * iqr
196- lower = self .report ["Details" ][node_id ][gpu_id ]["p25" ] - 1.5 * iqr
197- self .report ["Details" ][node_id ][gpu_id ]["upper" ] = min (
198- upper , np .quantile (self .gpu_utilization [node_id ][gpu_id ], 1 )
199- )
200- self .report ["Details" ][node_id ][gpu_id ]["lower" ] = max (
201- lower , np .quantile (self .gpu_utilization [node_id ][gpu_id ], 0.0 )
202- )
203- key = f"{ gpu_id } _memory"
204- self .report ["Details" ][node_id ][key ] = {
205- "p25" : np .quantile (self .gpu_memory [node_id ][gpu_id ], 0.25 ),
206- "p50" : np .quantile (self .gpu_memory [node_id ][gpu_id ], 0.50 ),
207- "p75" : np .quantile (self .gpu_memory [node_id ][gpu_id ], 0.75 ),
208- "p95" : np .quantile (self .gpu_memory [node_id ][gpu_id ], 0.95 ),
209- }
210- iqr = (
211- self .report ["Details" ][node_id ][key ]["p75" ]
212- - self .report ["Details" ][node_id ][gpu_id ]["p25" ]
213- )
214- upper = self .report ["Details" ][node_id ][key ]["p75" ] + 1.5 * iqr
215- lower = self .report ["Details" ][node_id ][key ]["p25" ] - 1.5 * iqr
216- self .report ["Details" ][node_id ][key ]["upper" ] = min (
217- upper , np .quantile (self .gpu_memory [node_id ][gpu_id ], 1 )
218- )
219- self .report ["Details" ][node_id ][key ]["lower" ] = max (
220- lower , np .quantile (self .gpu_memory [node_id ][gpu_id ], 0.0 )
221- )
222- self .report ["Details" ]["last_timestamp" ] = self .last_timestamp
187+ self .report ["Details" ][node_id ][gpu_id ] = {
188+ "p25" : np .quantile (
189+ self .gpu_utilization [node_id ][gpu_id ], 0.25
190+ ),
191+ "p50" : np .quantile (
192+ self .gpu_utilization [node_id ][gpu_id ], 0.50
193+ ),
194+ "p75" : np .quantile (
195+ self .gpu_utilization [node_id ][gpu_id ], 0.75
196+ ),
197+ "p95" : np .quantile (
198+ self .gpu_utilization [node_id ][gpu_id ], 0.95
199+ ),
200+ }
201+ iqr = (
202+ self .report ["Details" ][node_id ][gpu_id ]["p75" ]
203+ - self .report ["Details" ][node_id ][gpu_id ]["p25" ]
204+ )
205+ upper = (
206+ self .report ["Details" ][node_id ][gpu_id ]["p75" ] + 1.5 * iqr
207+ )
208+ lower = (
209+ self .report ["Details" ][node_id ][gpu_id ]["p25" ] - 1.5 * iqr
210+ )
211+ self .report ["Details" ][node_id ][gpu_id ]["upper" ] = min (
212+ upper , np .quantile (self .gpu_utilization [node_id ][gpu_id ], 1 )
213+ )
214+ self .report ["Details" ][node_id ][gpu_id ]["lower" ] = max (
215+ lower ,
216+ np .quantile (self .gpu_utilization [node_id ][gpu_id ], 0.0 ),
217+ )
218+ key = f"{ gpu_id } _memory"
219+ self .report ["Details" ][node_id ][key ] = {
220+ "p25" : np .quantile (self .gpu_memory [node_id ][gpu_id ], 0.25 ),
221+ "p50" : np .quantile (self .gpu_memory [node_id ][gpu_id ], 0.50 ),
222+ "p75" : np .quantile (self .gpu_memory [node_id ][gpu_id ], 0.75 ),
223+ "p95" : np .quantile (self .gpu_memory [node_id ][gpu_id ], 0.95 ),
224+ }
225+ iqr = (
226+ self .report ["Details" ][node_id ][key ]["p75" ]
227+ - self .report ["Details" ][node_id ][gpu_id ]["p25" ]
228+ )
229+ upper = self .report ["Details" ][node_id ][key ]["p75" ] + 1.5 * iqr
230+ lower = self .report ["Details" ][node_id ][key ]["p25" ] - 1.5 * iqr
231+ self .report ["Details" ][node_id ][key ]["upper" ] = min (
232+ upper , np .quantile (self .gpu_memory [node_id ][gpu_id ], 1 )
233+ )
234+ self .report ["Details" ][node_id ][key ]["lower" ] = max (
235+ lower , np .quantile (self .gpu_memory [node_id ][gpu_id ], 0.0 )
236+ )
237+ self .report ["Details" ]["last_timestamp" ] = self .last_timestamp
223238
224239 else :
225240 self .logger .info (f"Node { node_id } Overall CPU utilization p95 is { cpu_p95 } % " )
0 commit comments