diff --git a/docker/grafana/json-models/job-tracing.json b/docker/grafana/json-models/job-tracing.json new file mode 100644 index 00000000..dd56cb1c --- /dev/null +++ b/docker/grafana/json-models/job-tracing.json @@ -0,0 +1,2416 @@ +{ + "annotations" : { + "list" : [ + { + "builtIn" : 1, + "datasource" : { + "type" : "grafana", + "uid" : "-- Grafana --" + }, + "enable" : true, + "hide" : true, + "iconColor" : "rgba(0, 211, 255, 1)", + "name" : "Annotations & Alerts", + "type" : "dashboard" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "enable" : false, + "expr" : "rmsjob_annotations{jobid=\"$jobid\"}", + "iconColor" : "red", + "name" : "User Annotations", + "titleFormat" : "{{marker}}" + } + ] + }, + "editable" : true, + "fiscalYearStartMonth" : 0, + "graphTooltip" : 0, + "links" : [], + "liveNow" : false, + "panels" : [ + { + "collapsed" : false, + "gridPos" : { + "h" : 1, + "w" : 24, + "x" : 0, + "y" : 0 + }, + "id" : 35, + "panels" : [], + "title" : "Summary Info", + "type" : "row" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "thresholds" + }, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "text", + "value" : null + } + ] + } + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 2, + "w" : 6, + "x" : 0, + "y" : 1 + }, + "id" : 9, + "options" : { + "colorMode" : "value", + "graphMode" : "area", + "justifyMode" : "auto", + "orientation" : "auto", + "reduceOptions" : { + "calcs" : [ + "lastNotNull" + ], + "fields" : "/^jobid$/", + "values" : false + }, + "showPercentChange" : false, + "text" : { + "valueSize" : 28 + }, + "textMode" : "auto", + "wideLayout" : true + }, + "pluginVersion" : "11.0.0", + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "disableTextWrap" : false, + "editorMode" : "code", + "exemplar" : false, + "expr" : 
"rmsjob_info{jobid=\"$jobid\"}", + "format" : "table", + "fullMetaSearch" : false, + "includeNullMetadata" : true, + "instant" : true, + "legendFormat" : "__auto", + "range" : false, + "refId" : "A", + "useBackend" : false + } + ], + "title" : "Job ID", + "type" : "stat" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "thresholds" + }, + "mappings" : [ + { + "options" : { + "0" : { + "index" : 1, + "text" : "Interactive" + }, + "1" : { + "index" : 0, + "text" : "Batch" + } + }, + "type" : "value" + } + ], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green", + "value" : null + } + ] + } + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 2, + "w" : 6, + "x" : 6, + "y" : 1 + }, + "id" : 36, + "options" : { + "colorMode" : "none", + "graphMode" : "none", + "justifyMode" : "auto", + "orientation" : "auto", + "reduceOptions" : { + "calcs" : [ + "lastNotNull" + ], + "fields" : "/^batchflag$/", + "values" : false + }, + "showPercentChange" : false, + "text" : { + "valueSize" : 28 + }, + "textMode" : "auto", + "wideLayout" : true + }, + "pluginVersion" : "11.0.0", + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "exemplar" : false, + "expr" : "(rmsjob_info{jobid=\"$jobid\"})", + "format" : "table", + "instant" : false, + "legendFormat" : "__auto", + "range" : true, + "refId" : "A" + } + ], + "title" : "Job Type", + "type" : "stat" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "thresholds" + }, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green", + "value" : null + } + ] + } + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 2, + "w" : 5, + "x" : 12, + "y" : 1 + }, + "id" : 12, + "options" : { + "colorMode" : "none", + "graphMode" : "area", + 
"justifyMode" : "auto", + "orientation" : "auto", + "reduceOptions" : { + "calcs" : [ + "lastNotNull" + ], + "fields" : "/^nodes$/", + "values" : false + }, + "showPercentChange" : false, + "text" : { + "valueSize" : 32 + }, + "textMode" : "auto", + "wideLayout" : true + }, + "pluginVersion" : "11.0.0", + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "rmsjob_info{jobid=\"$jobid\"}", + "format" : "table", + "instant" : false, + "legendFormat" : "__auto", + "range" : true, + "refId" : "A" + } + ], + "title" : "# Nodes", + "type" : "stat" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "thresholds" + }, + "decimals" : 0, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "red", + "value" : null + }, + { + "color" : "#EAB839", + "value" : 33.33 + }, + { + "color" : "green", + "value" : 66.66 + } + ] + }, + "unit" : "percent" + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 4, + "w" : 7, + "x" : 17, + "y" : 1 + }, + "id" : 32, + "options" : { + "minVizHeight" : 75, + "minVizWidth" : 75, + "orientation" : "auto", + "reduceOptions" : { + "calcs" : [ + "mean" + ], + "fields" : "", + "values" : false + }, + "showThresholdLabels" : false, + "showThresholdMarkers" : false, + "sizing" : "auto" + }, + "pluginVersion" : "11.0.0", + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "avg(rocm_utilization_percentage * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})", + "instant" : false, + "legendFormat" : "GPU Core", + "range" : true, + "refId" : "A" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "avg(rocm_vram_used_percentage * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})", + "hide" : false, + "instant" : 
false, + "legendFormat" : "GPU Memory", + "range" : true, + "refId" : "B" + } + ], + "title" : "Mean Utilization Summary", + "type" : "gauge" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "thresholds" + }, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green", + "value" : null + } + ] + } + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 2, + "w" : 6, + "x" : 0, + "y" : 3 + }, + "id" : 10, + "options" : { + "colorMode" : "none", + "graphMode" : "area", + "justifyMode" : "auto", + "orientation" : "auto", + "reduceOptions" : { + "calcs" : [ + "lastNotNull" + ], + "fields" : "/^user$/", + "values" : false + }, + "showPercentChange" : false, + "text" : { + "valueSize" : 28 + }, + "textMode" : "auto", + "wideLayout" : true + }, + "pluginVersion" : "11.0.0", + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "disableTextWrap" : false, + "editorMode" : "code", + "exemplar" : false, + "expr" : "rmsjob_info{jobid=\"$jobid\"}", + "format" : "table", + "fullMetaSearch" : false, + "includeNullMetadata" : true, + "instant" : false, + "legendFormat" : "__auto", + "range" : true, + "refId" : "A", + "useBackend" : false + } + ], + "title" : "User", + "type" : "stat" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "thresholds" + }, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green", + "value" : null + } + ] + } + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 2, + "w" : 6, + "x" : 6, + "y" : 3 + }, + "id" : 11, + "options" : { + "colorMode" : "none", + "graphMode" : "area", + "justifyMode" : "auto", + "orientation" : "auto", + "reduceOptions" : { + "calcs" : [ + "lastNotNull" + ], + "fields" : "/^partition$/", + "values" : false + }, + "showPercentChange" : 
false, + "text" : { + "valueSize" : 28 + }, + "textMode" : "auto", + "wideLayout" : true + }, + "pluginVersion" : "11.0.0", + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "exemplar" : false, + "expr" : "rmsjob_info{jobid=\"$jobid\"}", + "format" : "table", + "instant" : false, + "legendFormat" : "__auto", + "range" : true, + "refId" : "A" + } + ], + "title" : "Partition", + "type" : "stat" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "thresholds" + }, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green", + "value" : null + } + ] + }, + "unit" : "dthms" + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 2, + "w" : 5, + "x" : 12, + "y" : 3 + }, + "id" : 39, + "options" : { + "colorMode" : "none", + "graphMode" : "none", + "justifyMode" : "auto", + "orientation" : "auto", + "reduceOptions" : { + "calcs" : [ + "range" + ], + "fields" : "", + "values" : false + }, + "showPercentChange" : false, + "text" : { + "valueSize" : 28 + }, + "textMode" : "auto", + "wideLayout" : true + }, + "pluginVersion" : "11.0.0", + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "max(timestamp(rmsjob_info{jobid=\"$jobid\"}))", + "hide" : false, + "instant" : false, + "interval" : "", + "legendFormat" : "__auto", + "range" : true, + "refId" : "A" + } + ], + "title" : "Job Duration", + "type" : "stat" + }, + { + "collapsed" : false, + "gridPos" : { + "h" : 1, + "w" : 24, + "x" : 0, + "y" : 5 + }, + "id" : 15, + "panels" : [], + "title" : "GPU Usage Summary", + "type" : "row" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "thresholds" + }, + "decimals" : 2, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + 
"steps" : [ + { + "color" : "red", + "value" : null + }, + { + "color" : "#EAB839", + "value" : 25 + }, + { + "color" : "green", + "value" : 75 + } + ] + }, + "unit" : "percent" + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 8, + "w" : 9, + "x" : 0, + "y" : 6 + }, + "id" : 28, + "interval" : "$interval", + "options" : { + "minVizHeight" : 75, + "minVizWidth" : 75, + "orientation" : "auto", + "reduceOptions" : { + "calcs" : [ + "mean" + ], + "fields" : "", + "values" : false + }, + "showThresholdLabels" : false, + "showThresholdMarkers" : true, + "sizing" : "auto" + }, + "pluginVersion" : "11.0.0", + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "exemplar" : false, + "expr" : "avg by (card) (rocm_utilization_percentage * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})", + "instant" : false, + "legendFormat" : "Card: {{card}}", + "range" : true, + "refId" : "A" + } + ], + "title" : "Mean GPU Utilization (%)", + "type" : "gauge" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "custom" : { + "hideFrom" : { + "legend" : false, + "tooltip" : false, + "viz" : false + }, + "scaleDistribution" : { + "type" : "linear" + } + } + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 8, + "w" : 15, + "x" : 9, + "y" : 6 + }, + "id" : 27, + "interval" : "$interval", + "options" : { + "calculate" : false, + "calculation" : { + "xBuckets" : { + "mode" : "count" + }, + "yBuckets" : { + "mode" : "size", + "scale" : { + "type" : "linear" + } + } + }, + "cellGap" : 1, + "cellValues" : { + "unit" : "percent" + }, + "color" : { + "exponent" : 0.5, + "fill" : "dark-orange", + "max" : 100, + "min" : 0, + "mode" : "scheme", + "reverse" : true, + "scale" : "exponential", + "scheme" : "RdYlGn", + "steps" : 64 + }, + "exemplars" : { + "color" : "rgba(255,0,255,0.7)" + }, + "filterValues" : { + "le" : -10 + }, + "legend" : { + "show" : true + }, + 
"rowsFrame" : { + "layout" : "auto" + }, + "tooltip" : { + "maxHeight" : 600, + "mode" : "single", + "showColorScale" : false, + "yHistogram" : false + }, + "yAxis" : { + "axisLabel" : "Card", + "axisPlacement" : "left", + "reverse" : false + } + }, + "pluginVersion" : "11.0.0", + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "exemplar" : false, + "expr" : "avg by (card) (rocm_utilization_percentage * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})", + "format" : "time_series", + "instant" : false, + "interval" : "", + "legendFormat" : "{{card}}", + "range" : true, + "refId" : "A" + } + ], + "title" : "GPU Utilization Heatmap", + "type" : "heatmap" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "thresholds" + }, + "decimals" : 2, + "mappings" : [], + "thresholds" : { + "mode" : "percentage", + "steps" : [ + { + "color" : "red", + "value" : null + }, + { + "color" : "#EAB839", + "value" : 25 + }, + { + "color" : "green", + "value" : 50 + } + ] + }, + "unit" : "percent" + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 8, + "w" : 9, + "x" : 0, + "y" : 14 + }, + "id" : 29, + "interval" : "$interval", + "options" : { + "minVizHeight" : 75, + "minVizWidth" : 75, + "orientation" : "auto", + "reduceOptions" : { + "calcs" : [ + "max" + ], + "fields" : "", + "values" : false + }, + "showThresholdLabels" : false, + "showThresholdMarkers" : true, + "sizing" : "auto" + }, + "pluginVersion" : "11.0.0", + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "exemplar" : false, + "expr" : "max by (card) (rocm_vram_used_percentage * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})", + "instant" : false, + "legendFormat" : "Card: {{card}}", + "range" : true, + "refId" : "A" + } + ], + "title" : "Max GPU Memory Utilization (%)", + "type" : 
"gauge" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "custom" : { + "hideFrom" : { + "legend" : false, + "tooltip" : false, + "viz" : false + }, + "scaleDistribution" : { + "type" : "linear" + } + } + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 8, + "w" : 15, + "x" : 9, + "y" : 14 + }, + "id" : 31, + "interval" : "$interval", + "options" : { + "calculate" : false, + "calculation" : { + "xBuckets" : { + "mode" : "count" + }, + "yBuckets" : { + "mode" : "size", + "scale" : { + "type" : "linear" + } + } + }, + "cellGap" : 1, + "cellValues" : { + "unit" : "percent" + }, + "color" : { + "exponent" : 0.5, + "fill" : "dark-orange", + "max" : 100, + "min" : 0, + "mode" : "scheme", + "reverse" : true, + "scale" : "exponential", + "scheme" : "RdYlGn", + "steps" : 64 + }, + "exemplars" : { + "color" : "rgba(255,0,255,0.7)" + }, + "filterValues" : { + "le" : 1e-09 + }, + "legend" : { + "show" : true + }, + "rowsFrame" : { + "layout" : "auto" + }, + "tooltip" : { + "maxHeight" : 600, + "mode" : "single", + "showColorScale" : false, + "yHistogram" : false + }, + "yAxis" : { + "axisLabel" : "Card", + "axisPlacement" : "left", + "reverse" : false + } + }, + "pluginVersion" : "11.0.0", + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "exemplar" : false, + "expr" : "max by (card) (rocm_vram_used_percentage * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})", + "format" : "time_series", + "instant" : false, + "interval" : "", + "legendFormat" : "{{card}}", + "range" : true, + "refId" : "A" + } + ], + "title" : "GPU Memory Utilization Heatmap", + "type" : "heatmap" + }, + { + "collapsed" : true, + "gridPos" : { + "h" : 1, + "w" : 24, + "x" : 0, + "y" : 22 + }, + "id" : 21, + "panels" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : 
"palette-classic" + }, + "custom" : { + "axisBorderShow" : false, + "axisCenteredZero" : false, + "axisColorMode" : "text", + "axisLabel" : "", + "axisPlacement" : "auto", + "barAlignment" : 0, + "drawStyle" : "line", + "fillOpacity" : 0, + "gradientMode" : "none", + "hideFrom" : { + "legend" : false, + "tooltip" : false, + "viz" : false + }, + "insertNulls" : false, + "lineInterpolation" : "linear", + "lineWidth" : 1, + "pointSize" : 5, + "scaleDistribution" : { + "type" : "linear" + }, + "showPoints" : "auto", + "spanNulls" : false, + "stacking" : { + "group" : "A", + "mode" : "none" + }, + "thresholdsStyle" : { + "mode" : "off" + } + }, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green", + "value" : null + }, + { + "color" : "red", + "value" : 80 + } + ] + }, + "unit" : "percent" + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 8, + "w" : 24, + "x" : 0, + "y" : 7 + }, + "id" : 44, + "interval" : "$interval", + "options" : { + "legend" : { + "calcs" : [], + "displayMode" : "list", + "placement" : "bottom", + "showLegend" : true + }, + "tooltip" : { + "maxHeight" : 600, + "mode" : "single", + "sort" : "none" + } + }, + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "avg by (card) (rocm_utilization_percentage * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})", + "instant" : false, + "legendFormat" : "Card: {{card}}", + "range" : true, + "refId" : "A" + } + ], + "title" : "GPU Utilization (%)", + "type" : "timeseries" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "palette-classic" + }, + "custom" : { + "axisBorderShow" : false, + "axisCenteredZero" : false, + "axisColorMode" : "text", + "axisLabel" : "", + "axisPlacement" : "auto", + "barAlignment" : 0, + "drawStyle" : "line", + "fillOpacity" : 0, + "gradientMode" : "none", + "hideFrom" : 
{ + "legend" : false, + "tooltip" : false, + "viz" : false + }, + "insertNulls" : false, + "lineInterpolation" : "linear", + "lineWidth" : 1, + "pointSize" : 5, + "scaleDistribution" : { + "type" : "linear" + }, + "showPoints" : "auto", + "spanNulls" : false, + "stacking" : { + "group" : "A", + "mode" : "none" + }, + "thresholdsStyle" : { + "mode" : "off" + } + }, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green", + "value" : null + }, + { + "color" : "red", + "value" : 80 + } + ] + }, + "unit" : "celsius" + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 8, + "w" : 24, + "x" : 0, + "y" : 15 + }, + "id" : 33, + "interval" : "$interval", + "options" : { + "legend" : { + "calcs" : [], + "displayMode" : "list", + "placement" : "bottom", + "showLegend" : true + }, + "tooltip" : { + "maxHeight" : 600, + "mode" : "single", + "sort" : "none" + } + }, + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "avg by (card) (rocm_temperature_celsius * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})", + "instant" : false, + "legendFormat" : "Card: {{card}}", + "range" : true, + "refId" : "A" + } + ], + "title" : "GPU Temperature - Die Edge (°C)", + "type" : "timeseries" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "palette-classic" + }, + "custom" : { + "axisBorderShow" : false, + "axisCenteredZero" : false, + "axisColorMode" : "text", + "axisLabel" : "", + "axisPlacement" : "auto", + "barAlignment" : 0, + "drawStyle" : "line", + "fillOpacity" : 0, + "gradientMode" : "none", + "hideFrom" : { + "legend" : false, + "tooltip" : false, + "viz" : false + }, + "insertNulls" : false, + "lineInterpolation" : "linear", + "lineWidth" : 1, + "pointSize" : 5, + "scaleDistribution" : { + "type" : "linear" + }, + "showPoints" : "auto", + "spanNulls" : false, + 
"stacking" : { + "group" : "A", + "mode" : "none" + }, + "thresholdsStyle" : { + "mode" : "off" + } + }, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green", + "value" : null + }, + { + "color" : "red", + "value" : 80 + } + ] + }, + "unit" : "watt" + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 8, + "w" : 24, + "x" : 0, + "y" : 23 + }, + "id" : 22, + "interval" : "$interval", + "options" : { + "legend" : { + "calcs" : [], + "displayMode" : "list", + "placement" : "bottom", + "showLegend" : true + }, + "tooltip" : { + "maxHeight" : 600, + "mode" : "single", + "sort" : "none" + } + }, + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "avg by (card) (rocm_average_socket_power_watts * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})", + "instant" : false, + "legendFormat" : "Card: {{card}}", + "range" : true, + "refId" : "A" + } + ], + "title" : "GPU Power (W)", + "type" : "timeseries" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "palette-classic" + }, + "custom" : { + "axisBorderShow" : false, + "axisCenteredZero" : false, + "axisColorMode" : "text", + "axisLabel" : "", + "axisPlacement" : "auto", + "barAlignment" : 0, + "drawStyle" : "line", + "fillOpacity" : 0, + "gradientMode" : "none", + "hideFrom" : { + "legend" : false, + "tooltip" : false, + "viz" : false + }, + "insertNulls" : false, + "lineInterpolation" : "linear", + "lineWidth" : 1, + "pointSize" : 5, + "scaleDistribution" : { + "type" : "linear" + }, + "showPoints" : "auto", + "spanNulls" : false, + "stacking" : { + "group" : "A", + "mode" : "none" + }, + "thresholdsStyle" : { + "mode" : "off" + } + }, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green", + "value" : null + }, + { + "color" : "red", + "value" : 80 + } + ] + }, + 
"unit" : "rotmhz" + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 8, + "w" : 12, + "x" : 0, + "y" : 31 + }, + "id" : 25, + "interval" : "$interval", + "options" : { + "legend" : { + "calcs" : [], + "displayMode" : "list", + "placement" : "bottom", + "showLegend" : true + }, + "tooltip" : { + "maxHeight" : 600, + "mode" : "single", + "sort" : "none" + } + }, + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "avg by (card) (rocm_sclk_clock_mhz * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})", + "instant" : false, + "legendFormat" : "Card: {{card}}", + "range" : true, + "refId" : "A" + } + ], + "title" : "GPU Frequency - Engine Clock (MHz)", + "type" : "timeseries" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "palette-classic" + }, + "custom" : { + "axisBorderShow" : false, + "axisCenteredZero" : false, + "axisColorMode" : "text", + "axisLabel" : "", + "axisPlacement" : "auto", + "barAlignment" : 0, + "drawStyle" : "line", + "fillOpacity" : 0, + "gradientMode" : "none", + "hideFrom" : { + "legend" : false, + "tooltip" : false, + "viz" : false + }, + "insertNulls" : false, + "lineInterpolation" : "linear", + "lineWidth" : 1, + "pointSize" : 5, + "scaleDistribution" : { + "type" : "linear" + }, + "showPoints" : "auto", + "spanNulls" : false, + "stacking" : { + "group" : "A", + "mode" : "none" + }, + "thresholdsStyle" : { + "mode" : "off" + } + }, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green", + "value" : null + }, + { + "color" : "red", + "value" : 80 + } + ] + }, + "unit" : "rotmhz" + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 8, + "w" : 12, + "x" : 12, + "y" : 31 + }, + "id" : 23, + "interval" : "$interval", + "options" : { + "legend" : { + "calcs" : [], + "displayMode" : "list", + "placement" : "bottom", + "showLegend" : 
true + }, + "tooltip" : { + "maxHeight" : 600, + "mode" : "single", + "sort" : "none" + } + }, + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "avg by (card) (rocm_mclk_clock_mhz * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})", + "instant" : false, + "legendFormat" : "Card: {{card}}", + "range" : true, + "refId" : "A" + } + ], + "title" : "GPU Frequency - Memory Clock (MHz)", + "type" : "timeseries" + } + ], + "title" : "Additional GPU telemetry: Utilization, Temperature, Power and Frequency", + "type" : "row" + }, + { + "collapsed" : true, + "gridPos" : { + "h" : 1, + "w" : 24, + "x" : 0, + "y" : 23 + }, + "id" : 40, + "panels" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "palette-classic" + }, + "custom" : { + "axisBorderShow" : false, + "axisCenteredZero" : false, + "axisColorMode" : "text", + "axisLabel" : "", + "axisPlacement" : "auto", + "barAlignment" : 0, + "drawStyle" : "line", + "fillOpacity" : 0, + "gradientMode" : "none", + "hideFrom" : { + "legend" : false, + "tooltip" : false, + "viz" : false + }, + "insertNulls" : false, + "lineInterpolation" : "linear", + "lineWidth" : 1, + "pointSize" : 5, + "scaleDistribution" : { + "type" : "linear" + }, + "showPoints" : "auto", + "spanNulls" : false, + "stacking" : { + "group" : "A", + "mode" : "none" + }, + "thresholdsStyle" : { + "mode" : "off" + } + }, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green", + "value" : null + } + ] + }, + "unit" : "percent" + }, + "overrides" : [ + { + "matcher" : { + "id" : "byName", + "options" : "Median" + }, + "properties" : [ + { + "id" : "custom.fillOpacity", + "value" : 15 + }, + { + "id" : "custom.fillBelowTo", + "value" : "Quantile 0.2 " + }, + { + "id" : "color", + "value" : { + "fixedColor" : "blue", + "mode" : "fixed" + } + }, + { + "id" : 
"custom.lineWidth", + "value" : 1 + }, + { + "id" : "displayName", + "value" : "Median" + } + ] + }, + { + "matcher" : { + "id" : "byName", + "options" : "Quantile 0.8" + }, + "properties" : [ + { + "id" : "custom.fillOpacity", + "value" : 15 + }, + { + "id" : "custom.fillBelowTo", + "value" : "Median" + }, + { + "id" : "color", + "value" : { + "fixedColor" : "blue", + "mode" : "fixed" + } + }, + { + "id" : "custom.lineWidth", + "value" : 0 + }, + { + "id" : "custom.hideFrom", + "value" : { + "legend" : true, + "tooltip" : false, + "viz" : false + } + } + ] + }, + { + "matcher" : { + "id" : "byName", + "options" : "Quantile 0.2 " + }, + "properties" : [ + { + "id" : "custom.lineWidth", + "value" : 0 + }, + { + "id" : "displayName" + }, + { + "id" : "color", + "value" : { + "fixedColor" : "blue", + "mode" : "fixed" + } + }, + { + "id" : "custom.hideFrom", + "value" : { + "legend" : true, + "tooltip" : false, + "viz" : false + } + } + ] + }, + { + "matcher" : { + "id" : "byName", + "options" : "Average" + }, + "properties" : [ + { + "id" : "color", + "value" : { + "fixedColor" : "#8080805e", + "mode" : "fixed" + } + } + ] + } + ] + }, + "gridPos" : { + "h" : 9, + "w" : 24, + "x" : 0, + "y" : 32 + }, + "id" : 24, + "interval" : "$interval", + "options" : { + "legend" : { + "calcs" : [], + "displayMode" : "list", + "placement" : "bottom", + "showLegend" : true + }, + "tooltip" : { + "maxHeight" : 600, + "mode" : "multi", + "sort" : "none" + } + }, + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "quantile(0.20, (rocm_utilization_percentage * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})) ", + "instant" : false, + "legendFormat" : "Quantile 0.2 ", + "range" : true, + "refId" : "Quantile 0.2" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "quantile(0.5, (rocm_utilization_percentage * on (instance) group_left() 
rmsjob_info{jobid=\"$jobid\"})) ", + "hide" : false, + "instant" : false, + "legendFormat" : "Median", + "range" : true, + "refId" : "Median" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "quantile(0.8, (rocm_utilization_percentage * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})) ", + "hide" : false, + "instant" : false, + "legendFormat" : "Quantile 0.8", + "range" : true, + "refId" : "Quantile 0.8" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "avg(rocm_utilization_percentage * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})", + "hide" : false, + "instant" : false, + "legendFormat" : "Average", + "range" : true, + "refId" : "Average" + } + ], + "title" : "Distribution of GPU Utilization Over Time (%)", + "type" : "timeseries" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "palette-classic" + }, + "custom" : { + "axisBorderShow" : false, + "axisCenteredZero" : false, + "axisColorMode" : "text", + "axisLabel" : "", + "axisPlacement" : "auto", + "axisSoftMax" : 100, + "axisSoftMin" : 0, + "hideFrom" : { + "legend" : false, + "tooltip" : false, + "viz" : false + }, + "pointSize" : { + "fixed" : 4 + }, + "scaleDistribution" : { + "type" : "linear" + }, + "show" : "points" + }, + "fieldMinMax" : false, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green" + }, + { + "color" : "red", + "value" : 80 + } + ] + }, + "unit" : "percent" + }, + "overrides" : [ + { + "matcher" : { + "id" : "byName", + "options" : "Node" + }, + "properties" : [ + { + "id" : "unit", + "value" : "none" + }, + { + "id" : "decimals", + "value" : 0 + } + ] + } + ] + }, + "gridPos" : { + "h" : 9, + "w" : 12, + "x" : 0, + "y" : 41 + }, + "id" : 41, + "interval" : "$interval", + "options" : { + "legend" : { + 
"calcs" : [], + "displayMode" : "list", + "placement" : "bottom", + "showLegend" : false + }, + "series" : [ + { + "x" : "Node", + "y" : "GPU Utilization (%)" + } + ], + "seriesMapping" : "manual", + "tooltip" : { + "maxHeight" : 600, + "mode" : "single", + "sort" : "none" + } + }, + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "avg by (instance) (rocm_utilization_percentage * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})", + "instant" : false, + "legendFormat" : "__auto", + "range" : true, + "refId" : "A" + } + ], + "title" : "Distribution of Average Utilization per Node", + "transformations" : [ + { + "id" : "reduce", + "options" : { + "reducers" : [ + "mean" + ] + } + }, + { + "id" : "sortBy", + "options" : { + "fields" : {}, + "sort" : [ + { + "desc" : true, + "field" : "Mean" + } + ] + } + }, + { + "id" : "calculateField", + "options" : { + "alias" : "Node", + "mode" : "index", + "reduce" : { + "reducer" : "sum" + } + } + }, + { + "id" : "organize", + "options" : { + "excludeByName" : {}, + "includeByName" : {}, + "indexByName" : {}, + "renameByName" : { + "Mean" : "GPU Utilization (%)" + } + } + } + ], + "type" : "xychart" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "palette-classic" + }, + "custom" : { + "axisBorderShow" : false, + "axisCenteredZero" : false, + "axisColorMode" : "text", + "axisLabel" : "", + "axisPlacement" : "auto", + "axisSoftMax" : 100, + "axisSoftMin" : 0, + "hideFrom" : { + "legend" : false, + "tooltip" : false, + "viz" : false + }, + "pointSize" : { + "fixed" : 4 + }, + "scaleDistribution" : { + "type" : "linear" + }, + "show" : "points" + }, + "fieldMinMax" : false, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green" + }, + { + "color" : "red", + "value" : 80 + } + ] + }, + "unit" : "percent" + }, + 
"overrides" : [ + { + "matcher" : { + "id" : "byName", + "options" : "GPU" + }, + "properties" : [ + { + "id" : "unit", + "value" : "none" + }, + { + "id" : "decimals", + "value" : 0 + } + ] + } + ] + }, + "gridPos" : { + "h" : 9, + "w" : 12, + "x" : 12, + "y" : 41 + }, + "id" : 42, + "interval" : "$interval", + "options" : { + "legend" : { + "calcs" : [], + "displayMode" : "list", + "placement" : "bottom", + "showLegend" : false + }, + "series" : [ + { + "x" : "GPU", + "y" : "GPU Utilization (%)" + } + ], + "seriesMapping" : "manual", + "tooltip" : { + "maxHeight" : 600, + "mode" : "single", + "sort" : "none" + } + }, + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "(rocm_utilization_percentage * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})", + "instant" : false, + "legendFormat" : "__auto", + "range" : true, + "refId" : "A" + } + ], + "title" : "Distribution of Average Utilization per GPU", + "transformations" : [ + { + "id" : "reduce", + "options" : { + "reducers" : [ + "mean" + ] + } + }, + { + "id" : "sortBy", + "options" : { + "fields" : {}, + "sort" : [ + { + "desc" : true, + "field" : "Mean" + } + ] + } + }, + { + "id" : "calculateField", + "options" : { + "alias" : "GPU", + "mode" : "index", + "reduce" : { + "reducer" : "sum" + } + } + }, + { + "id" : "organize", + "options" : { + "excludeByName" : {}, + "includeByName" : {}, + "indexByName" : {}, + "renameByName" : { + "Mean" : "GPU Utilization (%)" + } + } + } + ], + "type" : "xychart" + } + ], + "title" : "Multiple Nodes", + "type" : "row" + }, + { + "collapsed" : true, + "gridPos" : { + "h" : 1, + "w" : 24, + "x" : 0, + "y" : 24 + }, + "id" : 45, + "panels" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "prometheus" + }, + "description" : "", + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "palette-classic-by-name" + }, + "custom" : { + "fillOpacity" : 70, + "hideFrom" : { + "legend" : 
false, + "tooltip" : false, + "viz" : false + }, + "insertNulls" : false, + "lineWidth" : 0.1, + "spanNulls" : false + }, + "mappings" : [ + { + "options" : { + "from" : 0, + "result" : { + "color" : "transparent", + "index" : 1 + }, + "to" : 0 + }, + "type" : "range" + }, + { + "options" : { + "match" : "empty", + "result" : { + "color" : "transparent", + "index" : 2 + } + }, + "type" : "special" + } + ], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green", + "value" : null + } + ] + } + }, + "overrides" : [] + }, + "gridPos" : { + "h" : 8, + "w" : 24, + "x" : 0, + "y" : 41 + }, + "id" : 46, + "interval" : "1s", + "options" : { + "alignValue" : "left", + "legend" : { + "displayMode" : "list", + "placement" : "bottom", + "showLegend" : false + }, + "mergeValues" : true, + "rowHeight" : 1, + "showValue" : "never", + "tooltip" : { + "maxHeight" : 600, + "mode" : "single", + "sort" : "none" + } + }, + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "prometheus" + }, + "editorMode" : "code", + "expr" : "rmsjob_annotations{jobid=\"$jobid\"}", + "format" : "table", + "instant" : false, + "legendFormat" : "__auto", + "range" : true, + "refId" : "A" + } + ], + "title" : "Application Trace", + "transformations" : [ + { + "id" : "groupingToMatrix", + "options" : { + "columnField" : "marker", + "rowField" : "Time", + "valueField" : "Value" + } + }, + { + "id" : "convertFieldType", + "options" : { + "conversions" : [ + { + "destinationType" : "time", + "targetField" : "Time\\marker" + } + ], + "fields" : {} + } + } + ], + "type" : "state-timeline" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "fieldConfig" : { + "defaults" : { + "color" : { + "mode" : "thresholds" + }, + "custom" : { + "align" : "auto", + "cellOptions" : { + "type" : "auto" + }, + "inspect" : false + }, + "mappings" : [], + "thresholds" : { + "mode" : "absolute", + "steps" : [ + { + "color" : "green", + "value" : null + }, + 
{ + "color" : "red", + "value" : 80 + } + ] + }, + "unit" : "s" + }, + "overrides" : [ + { + "matcher" : { + "id" : "byName", + "options" : "First" + }, + "properties" : [ + { + "id" : "custom.hidden", + "value" : true + } + ] + }, + { + "matcher" : { + "id" : "byName", + "options" : "Last" + }, + "properties" : [ + { + "id" : "custom.hidden", + "value" : true + } + ] + }, + { + "matcher" : { + "id" : "byName", + "options" : "Range" + }, + "properties" : [ + { + "id" : "displayName", + "value" : "Elapsed Time" + } + ] + }, + { + "matcher" : { + "id" : "byName", + "options" : "Field" + }, + "properties" : [ + { + "id" : "displayName", + "value" : "Annotation Tag" + } + ] + }, + { + "matcher" : { + "id" : "byName", + "options" : "Mean" + }, + "properties" : [ + { + "id" : "displayName", + "value" : "Average GPU Utilization" + }, + { + "id" : "unit", + "value" : "percent" + } + ] + } + ] + }, + "gridPos" : { + "h" : 8, + "w" : 24, + "x" : 0, + "y" : 49 + }, + "id" : 43, + "interval" : "$interval", + "options" : { + "cellHeight" : "sm", + "footer" : { + "countRows" : false, + "fields" : "", + "reducer" : [ + "sum" + ], + "show" : false + }, + "frameIndex" : 1, + "showHeader" : true + }, + "pluginVersion" : "11.0.0", + "targets" : [ + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "timestamp(count by (marker) (rmsjob_annotations{jobid=\"$jobid\"} > 0))", + "instant" : false, + "legendFormat" : "__auto", + "range" : true, + "refId" : "A" + }, + { + "datasource" : { + "type" : "prometheus", + "uid" : "$source" + }, + "editorMode" : "code", + "expr" : "avg by (marker) ((rocm_utilization_percentage) * on (instance) group_left(jobid,marker) rmsjob_info{jobid=\"$jobid\"} * on (jobid) group_left(marker) count by (jobid,marker) (rmsjob_annotations{jobid=\"$jobid\"} > 0))", + "hide" : false, + "instant" : false, + "legendFormat" : "__auto", + "range" : true, + "refId" : "B" + } + ], + "title" : "Annotated Regions", + 
"transformations" : [ + { + "filter" : { + "id" : "byRefId", + "options" : "A" + }, + "id" : "reduce", + "options" : { + "includeTimeField" : false, + "mode" : "seriesToRows", + "reducers" : [ + "first", + "last", + "range" + ] + }, + "topic" : "series" + }, + { + "filter" : { + "id" : "byRefId", + "options" : "B" + }, + "id" : "reduce", + "options" : { + "reducers" : [ + "mean" + ] + } + }, + { + "id" : "merge", + "options" : {} + } + ], + "type" : "table" + } + ], + "title" : "Annotations", + "type" : "row" + } + ], + "refresh" : "", + "schemaVersion" : 39, + "tags" : [ + "omnistat" + ], + "templating" : { + "list" : [ + { + "current" : { + "selected" : false, + "text" : "58777", + "value" : "58777" + }, + "hide" : 2, + "includeAll" : false, + "label" : "SLURM Job ID", + "multi" : false, + "name" : "jobid", + "options" : [], + "query" : "", + "skipUrlSync" : false, + "type" : "custom" + }, + { + "current" : { + "selected" : true, + "text" : "> 1s", + "value" : "1s" + }, + "hide" : 0, + "includeAll" : false, + "label" : "Interval", + "multi" : false, + "name" : "interval", + "options" : [ + { + "selected" : false, + "text" : "60s", + "value" : "60s" + }, + { + "selected" : false, + "text" : "30s", + "value" : "30s" + }, + { + "selected" : false, + "text" : "5s", + "value" : "5s" + } + ], + "query" : "60s,30s,5s", + "queryValue" : "1s", + "skipUrlSync" : false, + "type" : "custom" + }, + { + "current" : { + "selected" : true, + "text" : "prometheus", + "value" : "prometheus" + }, + "hide" : 2, + "includeAll" : false, + "label" : "Data Source", + "multi" : false, + "name" : "source", + "options" : [], + "query" : "", + "skipUrlSync" : false, + "type" : "textbox" + } + ] + }, + "time" : { + "from" : "2024-07-25T22:30:22.858Z", + "to" : "2024-07-25T22:31:10.238Z" + }, + "timeRangeUpdatedDuringEditOrView" : false, + "timepicker" : { + "hidden" : false + }, + "timezone" : "", + "title" : "omnistat-job", + "uid" : "bccbab2a-b53a-4d32-90e7-b7613b67403c", + "version" : 1, 
+ "weekStart" : "" +} diff --git a/omnistat/annotate.py b/omnistat/annotate.py index 6f7d136b..73022b00 100755 --- a/omnistat/annotate.py +++ b/omnistat/annotate.py @@ -25,13 +25,12 @@ """annotate.py -Standalone utility for creating user annotation labels in json format. Intended -for use in conjunction with companion Slurm data collector that looks for files of the -following form: +Standalone utility for creating user annotations that relies on Omnistat +traces. Intended for use in conjunction with companion Slurm data collector +that looks for trace messages in a specific path/socket. -/tmp/omnistat_${USER}_annotate.json - -File can also be imported for direct Python usage. +For more detailed application-level traces of Python applications, import +omnistat's trace module. """ import argparse @@ -39,42 +38,24 @@ import json import os - -class omnistat_annotate: - def __init__(self): - self.filename = "/tmp/omnistat_" + os.environ.get("USER") + "_annotate.json" - - def start(self, label): - data = {} - data["annotation"] = label - data["timestamp_secs"] = int(time.time()) - - with open(self.filename, "w") as outfile: - outfile.write(json.dumps(data, indent=4)) - outfile.write("\n") - return - - def stop(self): - if os.path.exists(self.filename): - os.remove(self.filename) - return +from omnistat import trace def main(): + """Parse the command line and send a start or end message for the requested annotation.""" parser = argparse.ArgumentParser() parser.add_argument("--mode", choices=["start", "stop"], help="annotation mode", required=True) - parser.add_argument("--text", help="desired annotation", required=False) + parser.add_argument("--text", help="desired annotation", required=True) args = parser.parse_args() - if args.mode == "start" and args.text is None: - parser.error('The --text option is required for "start" mode.') - - annotate = omnistat_annotate() + # Keep this name distinct from the imported "trace" module: assigning to + # "trace" here would make it function-local and raise UnboundLocalError + tracer = trace.OmnistatTrace(trace_id="annotations") if args.mode == "start": - annotate.start(args.text) + tracer.start_span(args.text) else: - annotate.stop() + tracer.end_span(args.text) if
__name__ == "__main__": diff --git a/omnistat/collector_rms.py b/omnistat/collector_rms.py index 9d8129e9..2b66f30d 100644 --- a/omnistat/collector_rms.py +++ b/omnistat/collector_rms.py @@ -30,11 +30,13 @@ user-provided annotation timestamps. """ -import sys import json import logging import os import platform +import sys +import time +import zmq from prometheus_client import Gauge @@ -43,17 +45,21 @@ class RMSJob(Collector): - def __init__(self, annotations=False, jobDetection=None): + def __init__(self, annotations=False, annotationsPath="/tmp/omnistat_trace", jobDetection=None): logging.debug("Initializing resource manager job data collector") self.__prefix = "rmsjob_" - self.__annotationsEnabled = annotations self.__RMSMetrics = {} self.__rmsJobInfo = [] - self.__lastAnnotationLabel = None self.__rmsJobMode = jobDetection["mode"] self.__rmsJobFile = jobDetection["file"] self.__rmsJobStepFile = jobDetection["stepfile"] + self.__annotationsEnabled = annotations + self.__annotationsJobID = None + self.__spans = {} + self.__context = None + self.__socket = None + # jobMode if self.__rmsJobMode == "file-based": logging.info( @@ -81,6 +87,13 @@ def __init__(self, annotations=False, jobDetection=None): else: logging.error("Unsupported slurm job data collection mode") + if self.__annotationsEnabled: + if os.path.exists(annotationsPath): + os.remove(annotationsPath) + self.__context = zmq.Context() + self.__socket = self.__context.socket(zmq.PULL) + self.__socket.bind(f"ipc://{annotationsPath}") + def querySlurmJob(self, timeout=1, exit_on_error=False, mode="squeue"): """ Query SLURM and return job info for local host. 
@@ -139,7 +152,7 @@ def registerMetrics(self): # metric to support user annotations self.__RMSMetrics["annotations"] = Gauge( - self.__prefix + "annotations", "User job annotations", ["marker", "jobid"] + self.__prefix + "annotations", "User job annotations", ["marker", "jobid", "traceid"] ) for metric in self.__RMSMetrics: @@ -170,31 +183,7 @@ def updateMetrics(self): # Check for user supplied annotations if self.__annotationsEnabled: - userFile = "/tmp/omnistat_%s_annotate.json" % results["RMS_JOB_USER"] - - userFileExists = os.path.isfile(userFile) - if userFileExists: - with open(userFile, "r") as file: - data = json.load(file) - - # Reset existing annotation in two scenarios: - # 1. Previous annotation stopped (file no longer present) - # 2. There is a new annotation (label has changed) - if self.__lastAnnotationLabel != None and ( - not userFileExists or self.__lastAnnotationLabel != data["annotation"] - ): - self.__RMSMetrics["annotations"].labels( - marker=self.__lastAnnotationLabel, - jobid=results["RMS_JOB_ID"], - ).set(0) - self.__lastAnnotationLabel = None - - if userFileExists: - self.__RMSMetrics["annotations"].labels( - marker=data["annotation"], - jobid=results["RMS_JOB_ID"], - ).set(data["timestamp_secs"]) - self.__lastAnnotationLabel = data["annotation"] + self.updateAnnotations(results["RMS_JOB_ID"]) # Case when no job detected else: @@ -203,3 +192,79 @@ def updateMetrics(self): ).set(1) return + + def updateAnnotations(self, jobid): + """Drain pending trace messages and publish span state as annotation gauges. + + Each [kind, time_ms, trace_id, label] message updates the per-trace span + table; started spans are exported as 1 and recently ended spans as 0. + Malformed messages are logged and skipped. + """ + # Reset annotations when a new job is running + if jobid != self.__annotationsJobID: + self.__spans = {} + self.__annotationsJobID = jobid + + # Read all annotation messages until work queue is empty + messages = [] + while True: + try: + msg = self.__socket.recv_json(flags=zmq.NOBLOCK) + messages.append(msg) + except ValueError: + # Payload was not valid JSON: drop it and keep draining the queue + logging.warning("Ignoring annotation message that is not valid JSON") + except zmq.ZMQError: + break + + # Process start/end messages + for msg in messages: + # Any local process can write to the socket, so validate before use; + # a non-numeric timestamp would break the cutoff comparison below + if not isinstance(msg, list) or len(msg) != 4 or not isinstance(msg[1], (int, float)): + logging.warning("Ignoring malformed annotation message: %r", msg) + continue + kind, time_ms = msg[0], msg[1] + # Label values are exported as strings; key the spans the same way + trace_id, label = str(msg[2]), str(msg[3]) + if trace_id not in self.__spans: + self.__spans[trace_id] = {"started": {}, "ended": {}}
+ + if kind == "start": + if label in self.__spans[trace_id]["ended"]: + self.__spans[trace_id]["ended"].pop(label) + self.__spans[trace_id]["started"][label] = time_ms + elif kind == "end": + if label in self.__spans[trace_id]["started"]: + self.__spans[trace_id]["started"].pop(label) + self.__spans[trace_id]["ended"][label] = time_ms + + # Completed spans need to remain visible for a while to make + # sure Prometheus can identify the completion + time_ms = time.time_ns() // 1_000_000 + cutoff_ms = 15_000 + + # Remove old spans that are no longer needed + for trace_id, spans in self.__spans.items(): + remove = [] + for label, end_ms in spans["ended"].items(): + if end_ms < time_ms - cutoff_ms: + remove.append(label) + for label in remove: + self.__spans[trace_id]["ended"].pop(label) + + # Set annotation metrics in Prometheus + for trace_id, spans in self.__spans.items(): + for label in spans["started"].keys(): + self.__RMSMetrics["annotations"].labels( + marker=label, + jobid=jobid, + traceid=trace_id, + ).set(1) + + for label in spans["ended"].keys(): + self.__RMSMetrics["annotations"].labels( + marker=label, + jobid=jobid, + traceid=trace_id, + ).set(0) diff --git a/omnistat/config/omnistat.default b/omnistat/config/omnistat.default index ccdab053..eed731fb 100644 --- a/omnistat/config/omnistat.default +++ b/omnistat/config/omnistat.default @@ -17,6 +17,7 @@ allowed_ips = 127.0.0.1 host_skip = "login.*" enable_annotations = False +annotations_path = /tmp/omnistat_trace job_detection_mode = file-based job_detection_file = /tmp/omni_rmsjobinfo diff --git a/omnistat/monitor.py b/omnistat/monitor.py index 4f78c229..d95ccf51 100644 --- a/omnistat/monitor.py +++ b/omnistat/monitor.py @@ -82,6 +82,9 @@ def __init__(self, config): self.runtimeConfig["rms_collector_annotations"] = config["omnistat.collectors.rms"].getboolean( "enable_annotations", False ) + self.runtimeConfig["rms_collector_annotations_path"] = config["omnistat.collectors.rms"].get( +
"annotations_path", "/tmp/omnistat_trace" + ) self.jobDetection["mode"] = config["omnistat.collectors.rms"].get("job_detection_mode", "file-based") self.jobDetection["file"] = config["omnistat.collectors.rms"].get( "job_detection_file", "/tmp/omni_rmsjobinfo" @@ -132,6 +135,7 @@ def initMetrics(self): self.__collectors.append( RMSJob( annotations=self.runtimeConfig["rms_collector_annotations"], + annotationsPath=self.runtimeConfig["rms_collector_annotations_path"], jobDetection=self.jobDetection, ) ) diff --git a/omnistat/trace.py b/omnistat/trace.py new file mode 100644 index 00000000..18b4fdab --- /dev/null +++ b/omnistat/trace.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +# ------------------------------------------------------------------------------- +# MIT License +# +# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# ------------------------------------------------------------------------------- + +import time +import zmq + + +class OmnistatTrace: + """ + Tracer to annotate coarse-grained application regions. Each trace can be + used to track a single process; multiple processes connect to the same + path/socket, but require different IDs. + + Parameters + ---------- + trace_id : str + Trace identifier. In scale-out scenarios, this can be the rank of the + current process. + path : str + Path to the unix domain socket used to communicate with the collector. + + Methods + ------- + start_span(label): + Sends a "start" message, stamped in epoch milliseconds, for span `label`. + end_span(label): + Sends an "end" message, stamped in epoch milliseconds, for span `label`. + """ + + def __init__(self, trace_id="0", path="/tmp/omnistat_trace"): + self._trace_id = trace_id + self._zmq_context = zmq.Context() + self._zmq_socket = self._zmq_context.socket(zmq.PUSH) + self._zmq_socket.connect(f"ipc://{path}") + + def start_span(self, label): + time_ms = time.time_ns() // 1_000_000 + msg = ["start", time_ms, self._trace_id, label] + self._zmq_socket.send_json(msg) + + def end_span(self, label): + time_ms = time.time_ns() // 1_000_000 + msg = ["end", time_ms, self._trace_id, label] + self._zmq_socket.send_json(msg) diff --git a/requirements.txt b/requirements.txt index 8eef597b..583c47f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ -pyyaml -importlib-metadata Flask>=2.3.2 -prometheus_client>=0.17.0 gunicorn>=21.2.0 +importlib-metadata packaging>=24.1 parallel-ssh>=2.12.0 +prometheus_client>=0.17.0 +pyyaml +pyzmq setuptools-git-versioning>=2.0,<3