Skip to content

Commit

Permalink
record task manager at regular intervals
Browse files Browse the repository at this point in the history
  • Loading branch information
fosterseth committed Jun 14, 2022
1 parent 3a2d96c commit 0046c4d
Show file tree
Hide file tree
Showing 4 changed files with 276 additions and 9 deletions.
4 changes: 4 additions & 0 deletions awx/main/analytics/subsystem_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ def __init__(self, auto_pipe_execute=False, instance_name=None):
SetFloatM('task_manager_spawn_workflow_graph_jobs_seconds', 'Time spent spawning workflow jobs'),
SetFloatM('task_manager__schedule_seconds', 'Time spent in running the entire _schedule'),
IntM('task_manager_schedule_calls', 'Number of calls to task manager schedule'),
SetFloatM('task_manager_recorded_timestamp', 'Unix timestamp when metrics were last recorded'),
SetIntM('task_manager_tasks_started', 'Number of tasks started'),
SetIntM('task_manager_running_processed', 'Number of running tasks processed'),
SetIntM('task_manager_pending_processed', 'Number of pending tasks processed'),
Expand Down Expand Up @@ -227,6 +228,9 @@ def set(self, field, value):
def get(self, field):
    """Return the value reported by the metric registered as ``field``.

    The metric object is looked up by name in ``self.METRICS`` and the
    result of its own ``get()`` is passed straight back to the caller.
    """
    metric = self.METRICS[field]
    return metric.get()

def decode(self, field):
    """Return the decoded value of the metric registered as ``field``.

    The metric object is looked up by name in ``self.METRICS`` and asked
    to decode itself using this instance's connection (``self.conn``).
    """
    metric = self.METRICS[field]
    return metric.decode(self.conn)

def observe(self, field, value):
self.METRICS[field].observe(value)
self.metrics_have_changed = True
Expand Down
22 changes: 17 additions & 5 deletions awx/main/scheduler/task_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,12 @@ def __init__(self):
self.start_task_limit = settings.START_TASK_LIMIT
self.time_delta_job_explanation = timedelta(seconds=30)
self.subsystem_metrics = s_metrics.Metrics(auto_pipe_execute=False)
# Initialize each task manager metric to 0, which also forces
# metrics_have_changed to True. This ensures each task manager metric
# will be overridden when pipe_execute is called later.
for m in self.subsystem_metrics.METRICS:
if m.startswith("task_manager"):
self.subsystem_metrics.set(m, 0)

def after_lock_init(self, all_sorted_tasks):
"""
Expand Down Expand Up @@ -682,12 +688,18 @@ def record_aggregate_metrics(self, *args):
# increment task_manager_schedule_calls regardless if the other
# metrics are recorded
s_metrics.Metrics(auto_pipe_execute=True).inc("task_manager_schedule_calls", 1)
_schedule_dur = self.subsystem_metrics.get("task_manager__schedule_seconds")
# to prevent overriding short-duration task manager calls, only
# record metrics if the total time to run task manager is greater
# than the user-defined setting
if _schedule_dur > settings.SUBSYSTEM_METRICS_TASK_MANAGER_EXECUTION_TIME:
# Only record metrics if the last recording was more than
# SUBSYSTEM_METRICS_TASK_MANAGER_RECORD_INTERVAL seconds ago. This
# prevents a short-duration task manager run that directly follows a
# long one from overriding useful metrics.
current_time = time.time()
time_last_recorded = current_time - self.subsystem_metrics.decode("task_manager_recorded_timestamp")
if time_last_recorded > settings.SUBSYSTEM_METRICS_TASK_MANAGER_RECORD_INTERVAL:
logger.debug(f"recording metrics, last recorded {time_last_recorded} seconds ago")
self.subsystem_metrics.set("task_manager_recorded_timestamp", current_time)
self.subsystem_metrics.pipe_execute()
else:
logger.debug(f"skipping recording metrics, last recorded {time_last_recorded} seconds ago")

def record_aggregate_metrics_and_exit(self, *args):
self.record_aggregate_metrics()
Expand Down
4 changes: 2 additions & 2 deletions awx/settings/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,8 @@ def IS_TESTING(argv=None):
# Interval in seconds for saving local metrics to redis
SUBSYSTEM_METRICS_INTERVAL_SAVE_TO_REDIS = 2

# Only record stats from task manager cycles that are >= this execution time (seconds)
SUBSYSTEM_METRICS_TASK_MANAGER_EXECUTION_TIME = 15
# Record task manager metrics at the following interval in seconds
SUBSYSTEM_METRICS_TASK_MANAGER_RECORD_INTERVAL = 15

# The maximum allowed jobs to start on a given task manager cycle
START_TASK_LIMIT = 100
Expand Down
255 changes: 253 additions & 2 deletions tools/grafana/dashboards/demo_dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,256 @@
"x": 0,
"y": 0
},
"id": 12,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "task_manager_running_processed",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "task_manager_pending_processed",
"hide": false,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "task_manager_tasks_blocked",
"hide": false,
"refId": "D"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "task_manager_tasks_started",
"hide": false,
"refId": "C"
}
],
"title": "Task manager workload",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 10,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "builder",
"expr": "task_manager_process_pending_tasks_seconds",
"legendFormat": "__auto",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "builder",
"expr": "task_manager_process_running_tasks_seconds",
"hide": false,
"legendFormat": "__auto",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "builder",
"expr": "task_manager_generate_dependencies_seconds",
"hide": false,
"legendFormat": "__auto",
"range": true,
"refId": "C"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "builder",
"expr": "task_manager_get_tasks_seconds",
"hide": false,
"legendFormat": "__auto",
"range": true,
"refId": "D"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "builder",
"expr": "task_manager_spawn_workflow_graph_jobs_seconds",
"hide": false,
"legendFormat": "__auto",
"range": true,
"refId": "E"
}
],
"title": "Task manager timings",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 8,
"options": {
"legend": {
Expand All @@ -115,20 +365,21 @@
"type": "timeseries"
}
],
"refresh": "5s",
"schemaVersion": 36,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-30m",
"from": "now-5m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "awx-demo",
"uid": "GISWZOXnk",
"version": 6,
"version": 2,
"weekStart": ""
}

0 comments on commit 0046c4d

Please sign in to comment.