From 12364c537f56cdb4eb0fa7d61789b2d0cc893de0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 5 Apr 2023 17:19:55 +0000 Subject: [PATCH 1/3] bug fix for hostfile argument --- moneo.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/moneo.py b/moneo.py index a58d7d4..2635c5a 100644 --- a/moneo.py +++ b/moneo.py @@ -254,6 +254,10 @@ def check_deploy_shutdown(args, parser): print(args.host_file + " does not exist. Please provide a host file. i.e. host.ini.\n") parser.print_help() exit(1) + else: + # ensure we have the absolute path + args.host_file = os.path.abspath(args.host_file) + print(args.host_file) if args.job_id: print( "Job Id cannot be specified during deployment and shutdown. Ignoring Job Id.\n") From 9d5962660dc9339535a528b6c8f6942e3257873b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 27 Feb 2024 16:46:46 +0000 Subject: [PATCH 2/3] template update --- .../Cluster_View.json | 151 ++++++++++------- .../grafana_dashboard_templates/GPU_View.json | 157 ++++++++++-------- .../Network_View.json | 121 ++++++++------ .../Node_View.json | 112 +++++++------ 4 files changed, 309 insertions(+), 232 deletions(-) mode change 100755 => 100644 deploy_managed_infra/grafana_dashboard_templates/Cluster_View.json mode change 100755 => 100644 deploy_managed_infra/grafana_dashboard_templates/GPU_View.json mode change 100755 => 100644 deploy_managed_infra/grafana_dashboard_templates/Network_View.json mode change 100755 => 100644 deploy_managed_infra/grafana_dashboard_templates/Node_View.json diff --git a/deploy_managed_infra/grafana_dashboard_templates/Cluster_View.json b/deploy_managed_infra/grafana_dashboard_templates/Cluster_View.json old mode 100755 new mode 100644 index 1160540..3435b28 --- a/deploy_managed_infra/grafana_dashboard_templates/Cluster_View.json +++ b/deploy_managed_infra/grafana_dashboard_templates/Cluster_View.json @@ -1,4 +1,53 @@ { + "__inputs": [ + { + "name": "DS_MANAGED_PROMETHEUS_MONEO-AMW", + "label": "Managed_Prometheus_moneo-amw", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.5.16" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], "annotations": { "list": [ { @@ -24,7 +73,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, - "id": 45, + "id": null, "links": [ { "asDropdown": true, @@ -46,7 +95,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -87,12 +136,12 @@ }, "textMode": "auto" }, - "pluginVersion": "9.5.13", + "pluginVersion": "9.5.16", "targets": [ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -112,7 +161,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -157,12 +206,12 @@ }, "textMode": "auto" }, - "pluginVersion": "9.5.13", + "pluginVersion": "9.5.16", "targets": [ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -181,7 +230,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -227,12 +276,12 @@ "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "9.5.13", + "pluginVersion": "9.5.16", "targets": [ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -251,7 +300,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -297,12 +346,12 @@ "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "9.5.13", + "pluginVersion": "9.5.16", "targets": [ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -334,7 +383,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -416,7 +465,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "(\r\n average_ib_port_xmit_data{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 1) +\r\n min_ib_port_xmit_data{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 2) +\r\n max_ib_port_xmit_data{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 3)\r\n)", @@ -432,7 +481,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -513,7 +562,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "(\r\n average_ib_port_rcv_data{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 1) +\r\n min_ib_port_rcv_data{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 2) +\r\n max_ib_port_rcv_data{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 3)\r\n)", @@ -542,7 +591,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation on VM GPU device utilization", "fieldConfig": { @@ -624,7 +673,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -644,7 +693,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation on VM GPU device utilization", "fieldConfig": { @@ -728,7 +777,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -761,7 +810,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation on VM GPU device SM Clock", "fieldConfig": { @@ -843,7 +892,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -863,7 +912,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation of VM GPU device Memory Clock", "fieldConfig": { @@ -945,7 +994,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -978,7 +1027,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation of VM GPU device Temperature", "fieldConfig": { @@ -1060,7 +1109,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -1080,7 +1129,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Memory temperature (in C)", "fieldConfig": { @@ -1162,7 +1211,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -1195,7 +1244,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation of VM GPU Power", "fieldConfig": { @@ -1276,7 +1325,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -1296,7 +1345,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total energy consumption since boot (in mJ)", "fieldConfig": { @@ -1377,7 +1426,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -1410,7 +1459,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -1524,7 +1573,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "ib_port_physical_state{subscription=\"$Subscription\", cluster=\"$Cluster\", job_id=~\"$JobId\"} == 0\r\n", @@ -1615,7 +1664,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -1678,7 +1727,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "dcgm_gpu_temp{subscription=\"$Subscription\", cluster=\"$Cluster\",gpu_id=\"0\"}", @@ -1787,14 +1836,10 @@ "type": "custom" }, { - "current": { - "selected": false, - "text": "9cd0dc62-fcc2-4b6b-abd3-6010a01a8109", - "value": "9cd0dc62-fcc2-4b6b-abd3-6010a01a8109" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(up,subscription)", "hide": 0, @@ -1814,15 +1859,10 @@ "type": "query" }, { - "current": { - "isNone": true, - "selected": false, - "text": "None", - "value": "" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(dcgm_gpu_utilization{subscription=\"$Subscription\"},cluster)", "hide": 0, @@ -1842,15 +1882,10 @@ "type": "query" }, { - "current": { - "isNone": true, - "selected": false, - "text": "None", - "value": "" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(dcgm_gpu_utilization{cluster=~\"$Cluster\"},job_id)", "hide": 0, @@ -1892,4 +1927,4 @@ "uid": "e12394be-6c26-4c19-a089-f69930b17e7e", "version": 75, "weekStart": "" -} +} \ No newline at end of file diff --git a/deploy_managed_infra/grafana_dashboard_templates/GPU_View.json b/deploy_managed_infra/grafana_dashboard_templates/GPU_View.json old mode 100755 new mode 100644 index 3033e0f..7367397 --- a/deploy_managed_infra/grafana_dashboard_templates/GPU_View.json +++ b/deploy_managed_infra/grafana_dashboard_templates/GPU_View.json @@ -1,4 +1,41 @@ { + "__inputs": [ + { + "name": "DS_MANAGED_PROMETHEUS_MONEO-AMW", + "label": "Managed_Prometheus_moneo-amw", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.5.16" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], "annotations": { "list": [ { @@ -24,7 +61,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 39, + "id": null, "links": [ { "asDropdown": true, @@ -59,7 +96,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation on VM GPU device utilization", "fieldConfig": { @@ -141,7 +178,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "builder", "exemplar": false, @@ -161,7 +198,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation on VM GPU device utilization", "fieldConfig": { @@ -243,7 +280,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "builder", "exemplar": false, @@ -276,7 +313,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation on VM GPU device SM Clock", "fieldConfig": { @@ -358,7 +395,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -378,7 +415,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation of VM GPU device Memory Clock", "fieldConfig": { @@ -460,7 +497,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -493,7 +530,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "The rate of data transmitted over NVLink.", "fieldConfig": { @@ -575,7 +612,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -595,7 +632,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "The rate of data received over NVLink.", "fieldConfig": { @@ -677,7 +714,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -710,7 +747,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation of VM GPU device Temperature", "fieldConfig": { @@ -792,7 +829,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "builder", "exemplar": false, @@ -812,7 +849,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Memory temperature (in C)", "fieldConfig": { @@ -894,7 +931,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -927,7 +964,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation of VM GPU Power", "fieldConfig": { @@ -1008,7 +1045,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -1028,7 +1065,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total energy consumption since boot (in mJ)", "fieldConfig": { @@ -1109,7 +1146,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -1139,7 +1176,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Current throttle code ", "fieldConfig": { @@ -1218,7 +1255,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "dcgm_current_clock_throttle_reasons{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\", gpu_id=~\"$GPU\"}", @@ -1247,7 +1284,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of single-bit volatile ECC errors", "fieldConfig": { @@ -1327,7 +1364,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "dcgm_ecc_sbe_volatile_total{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\", gpu_id=~\"$GPU\"}", @@ -1342,7 +1379,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of double-bit volatile ECC errors", "fieldConfig": { @@ -1422,7 +1459,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "dcgm_ecc_dbe_volatile_total{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\", gpu_id=~\"$GPU\"}", @@ -1437,7 +1474,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of double-bit persistent ECC errors", "fieldConfig": { @@ -1517,7 +1554,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "dcgm_ecc_dbe_aggregate_total{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\", gpu_id=~\"$GPU\"}", @@ -1532,7 +1569,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of single-bit persistent ECC errors", "fieldConfig": { @@ -1612,7 +1649,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "dcgm_ecc_sbe_aggregate_total{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\", gpu_id=~\"$GPU\"}", @@ -1641,7 +1678,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -1703,7 +1740,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "dcgm_gpu_temp{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\",gpu_id=\"0\"}", @@ -1784,7 +1821,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -1860,7 +1897,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "node_gpu_burn_mon{subscription=~\"$Subscription\", cluster=~\"$Cluster\", instance=~\"$Instance\", gpu_id=~\"$GPU\"}", @@ -1875,7 +1912,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -1951,7 +1988,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "builder", "expr": "node_meta_seq_mon{subscription=~\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\"}", @@ -1978,14 +2015,10 @@ "templating": { "list": [ { - "current": { - "selected": false, - "text": "d71c7216-6409-45f8-be15-35cf57b8527c", - "value": "d71c7216-6409-45f8-be15-35cf57b8527c" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(dcgm_gpu_utilization,subscription)", "hide": 0, @@ -2005,14 +2038,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(dcgm_gpu_utilization{subscription=\"$Subscription\"}, cluster)", "hide": 0, @@ -2032,14 +2061,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(dcgm_gpu_utilization{cluster=~\"$Cluster\"}, job_id)", "hide": 0, @@ -2059,14 +2084,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(dcgm_gpu_utilization{cluster=\"$Cluster\", job_id=~\"$JobId\"}, instance)", "hide": 0, @@ -2086,18 +2107,10 @@ "type": "query" }, { - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(dcgm_gpu_utilization{instance=~\"$Instance\"},gpu_id)", "hide": 0, @@ -2138,4 +2151,4 @@ "uid": "dHpbWBP4z", "version": 43, "weekStart": "" -} +} \ No newline at end of file diff --git a/deploy_managed_infra/grafana_dashboard_templates/Network_View.json b/deploy_managed_infra/grafana_dashboard_templates/Network_View.json old mode 100755 new mode 100644 index d81d702..26ea0d6 --- a/deploy_managed_infra/grafana_dashboard_templates/Network_View.json +++ b/deploy_managed_infra/grafana_dashboard_templates/Network_View.json @@ -1,4 +1,41 @@ { + "__inputs": [ + { + "name": "DS_MANAGED_PROMETHEUS_MONEO-AMW", + "label": "Managed_Prometheus_moneo-amw", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.5.16" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], "annotations": { "list": [ { @@ -24,7 +61,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 40, + "id": null, "links": [ { "asDropdown": true, @@ -59,7 +96,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Indication of IB Link Flap", "fieldConfig": { @@ -158,7 +195,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "builder", "exemplar": false, @@ -178,7 +215,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "The rate of data transmitted over InfiniBand.", "fieldConfig": { @@ -260,7 +297,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -280,7 +317,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "The rate of data transmitted over InfiniBand.", "fieldConfig": { @@ -362,7 +399,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -392,7 +429,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of outbound packets discarded by the port because the port is down or congested.", "fieldConfig": { @@ -472,7 +509,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -492,7 +529,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of packets not transmitted from the switch physical port.", "fieldConfig": { @@ -572,7 +609,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -592,7 +629,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of packets containing an error that were received on the port.", "fieldConfig": { @@ -672,7 +709,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -692,7 +729,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of packets received on the switch physical port that are discarded.", "fieldConfig": { @@ -772,7 +809,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -792,7 +829,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Indication of IB Link Flap", "fieldConfig": { @@ -857,7 +894,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -936,7 +973,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -990,7 +1027,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "builder", "expr": "ib_port_rcv_errors{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\", ib_port=\"mlx5_ib0:1\"}", @@ -1069,14 +1106,10 @@ "templating": { "list": [ { - "current": { - "selected": false, - "text": "d71c7216-6409-45f8-be15-35cf57b8527c", - "value": "d71c7216-6409-45f8-be15-35cf57b8527c" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(ib_port_physical_state, subscription)", "hide": 0, @@ -1096,14 +1129,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(ib_port_physical_state{subscription=~\"$Subscription\"}, cluster)", "hide": 0, @@ -1123,14 +1152,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(ib_port_physical_state{cluster=\"$Cluster\"},job_id)", "hide": 0, @@ -1150,14 +1175,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(ib_port_physical_state{cluster=\"$Cluster\", job_id=\"$JobId\"},instance)", "hide": 0, @@ -1177,18 +1198,10 @@ "type": "query" }, { - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(ib_port_physical_state{instance=~\"$Instance\"}, ib_port)", "hide": 0, @@ -1229,4 +1242,4 @@ "uid": "IziFPI8Vk", "version": 16, "weekStart": "" -} +} \ No newline at end of file diff --git a/deploy_managed_infra/grafana_dashboard_templates/Node_View.json b/deploy_managed_infra/grafana_dashboard_templates/Node_View.json old mode 100755 new mode 100644 index a9f96a7..11464cb --- a/deploy_managed_infra/grafana_dashboard_templates/Node_View.json +++ b/deploy_managed_infra/grafana_dashboard_templates/Node_View.json @@ -1,4 +1,41 @@ { + "__inputs": [ + { + "name": "DS_MANAGED_PROMETHEUS_MONEO-AMW", + "label": "Managed_Prometheus_moneo-amw", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.5.16" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], "annotations": { "list": [ { @@ -25,7 +62,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 41, + "id": null, "links": [ { "asDropdown": true, @@ -60,7 +97,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "CPU Utilization", "fieldConfig": { @@ -148,7 +185,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -168,7 +205,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "CPU Utilization", "fieldConfig": { @@ -256,7 +293,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -286,7 +323,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Memory Utilization", "fieldConfig": { @@ -373,7 +410,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -393,7 +430,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Memory Utilization", "fieldConfig": { @@ -480,7 +517,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -514,7 +551,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "TX Rate of VM's Ethernet Interface", "fieldConfig": { @@ -598,7 +635,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -618,7 +655,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "RX Rate of VM's Ethernet Interface", "fieldConfig": { @@ -702,7 +739,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -739,7 +776,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -789,12 +826,12 @@ }, "showHeader": true }, - "pluginVersion": "9.5.13", + "pluginVersion": "9.5.16", "targets": [ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "builder", "expr": "node_mem_util{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=\"$Instance\"}", @@ -878,14 +915,10 @@ "templating": { "list": [ { - "current": { - "selected": false, - "text": "d71c7216-6409-45f8-be15-35cf57b8527c", - "value": "d71c7216-6409-45f8-be15-35cf57b8527c" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(up,subscription)", "hide": 0, @@ -905,14 +938,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(up{subscription=\"$Subscription\"},cluster)", "hide": 0, @@ -932,15 +961,10 @@ "type": "query" }, { - "current": { - "isNone": true, - "selected": false, - "text": "None", - "value": "" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(up{cluster=\"$Cluster\"},job_id)", "description": "", @@ -961,14 +985,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(up{cluster=\"$Cluster\", job_id=\"$JobId\"},instance)", "hide": 0, @@ -988,14 +1008,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "All", - "value": "$__all" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(node_cpu_util{instance=~\"$Instance\"},numa_domain)", "hide": 0, @@ -1036,4 +1052,4 @@ "uid": "DBUc8IU4k", "version": 19, "weekStart": "" -} +} \ No newline at end of file From 29a6bd79cd26bc8cc74c6f760f9ae6a669c33268 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 27 Feb 2024 17:20:10 +0000 Subject: [PATCH 3/3] modify start scripts to correct bugs and add worker only launch modify start scripts to correct bugs and add worker only launch modify start scripts to correct bugs and add worker only launch --- linux_service/README.md | 4 ++++ linux_service/start_moneo_services.sh | 11 +++++++++-- src/worker/start_geneva.sh | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/linux_service/README.md b/linux_service/README.md index 16f090e..049e319 100644 --- a/linux_service/README.md +++ b/linux_service/README.md @@ -67,6 +67,10 @@ The [start_moneo_services.sh](./start_moneo_services.sh) script is used to start or ```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh geneva"``` +#### Exporters Alone #### + +```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh workers"``` + #### Exporters with Managed Prometheus #### ```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh"``` diff --git a/linux_service/start_moneo_services.sh b/linux_service/start_moneo_services.sh index fc12fd0..678a72c 100755 --- a/linux_service/start_moneo_services.sh +++ b/linux_service/start_moneo_services.sh @@ -5,6 +5,7 @@ # Managed Prometheus deployment: ./start_moneo_services.sh # Azure Monitor: ./start_moneo_services.sh azure_monitor # Geneva (internal msft): ./start_moneo_services.sh geneva +# Only start workers: ./start_moneo_services.sh workers PublisherMethod=$1 # Modify as necessary @@ -24,14 +25,16 @@ if lspci | grep -iq NVIDIA ; then procs+=("nvidia_exporter") fi -if [[ -n $PublisherMethod ]]; then +if [[ -n $PublisherMethod ]] ; then if [ "$PublisherMethod" == "geneva" ] || [ "$PublisherMethod" == "azure_monitor" ]; then echo "PublisherMethod is valid: $PublisherMethod" + procs+=("metrics_publisher") + elif [ "$PublisherMethod" == "workers" ]; then + echo "Only starting workers" else echo "PublisherMethod is not one of the valid choices." exit 1 fi - procs+=("metrics_publisher") fi function proc_check(){ @@ -69,6 +72,10 @@ sudo systemctl start moneo@node_exporter.service sudo systemctl start moneo@net_exporter.service sudo systemctl start moneo@nvidia_exporter.service +if [ "$PublisherMethod" == "workers" ]; then + proc_check false +fi + if [[ -n $PublisherMethod ]]; then if [ "$PublisherMethod" == "geneva" ]; then sudo $MONEO_PATH/src/worker/start_geneva.sh $PUBLISHER_AUTH /tmp/moneo-worker/publisher/config diff --git a/src/worker/start_geneva.sh b/src/worker/start_geneva.sh index bc3dc13..084523a 100755 --- a/src/worker/start_geneva.sh +++ b/src/worker/start_geneva.sh @@ -6,7 +6,7 @@ CONTAINER_NAME="genevamdmagent" GENEVA_CONFIG=$CONFIG/geneva_config.json # check if the docker container is running -if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then echo "Geneva Docker is running" exit 0 fi