From e803abb3f1a0ad9d9a585955172e67a5e2ec2579 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 3 Jul 2021 06:20:22 +0300 Subject: [PATCH 1/2] Add task dashboard --- cli/cmd/lib_task_apis.go | 4 + manager/install.sh | 1 + .../grafana/grafana-dashboard-task.yaml | 1382 +++++++++++++++++ pkg/config/config.go | 7 + pkg/operator/resources/job/taskapi/api.go | 20 + pkg/operator/resources/job/taskapi/cron.go | 11 +- pkg/operator/resources/job/taskapi/metrics.go | 61 + 7 files changed, 1484 insertions(+), 2 deletions(-) create mode 100644 manager/manifests/grafana/grafana-dashboard-task.yaml create mode 100644 pkg/operator/resources/job/taskapi/metrics.go diff --git a/cli/cmd/lib_task_apis.go b/cli/cmd/lib_task_apis.go index 216e641730..50575b8516 100644 --- a/cli/cmd/lib_task_apis.go +++ b/cli/cmd/lib_task_apis.go @@ -114,6 +114,10 @@ func taskAPITable(taskAPI schema.APIResponse) string { out += t.MustFormat() } + if taskAPI.DashboardURL != nil && *taskAPI.DashboardURL != "" { + out += "\n" + console.Bold("metrics dashboard: ") + *taskAPI.DashboardURL + "\n" + } + out += "\n" + console.Bold("endpoint: ") + taskAPI.Endpoint + "\n" out += "\n" + apiHistoryTable(taskAPI.APIVersions) diff --git a/manager/install.sh b/manager/install.sh index 8d42997637..21dba663aa 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -233,6 +233,7 @@ function setup_grafana() { kubectl apply -f manifests/grafana/grafana-dashboard-realtime.yaml >/dev/null kubectl apply -f manifests/grafana/grafana-dashboard-async.yaml >/dev/null kubectl apply -f manifests/grafana/grafana-dashboard-batch.yaml >/dev/null + kubectl apply -f manifests/grafana/grafana-dashboard-task.yaml >/dev/null kubectl apply -f manifests/grafana/grafana-dashboard-cluster.yaml >/dev/null kubectl apply -f manifests/grafana/grafana-dashboard-nodes.yaml >/dev/null envsubst < manifests/grafana/grafana.yaml | kubectl apply -f - >/dev/null diff --git a/manager/manifests/grafana/grafana-dashboard-task.yaml b/manager/manifests/grafana/grafana-dashboard-task.yaml new file mode 100644 index 0000000000..112ece5eff --- /dev/null +++ b/manager/manifests/grafana/grafana-dashboard-task.yaml @@ -0,0 +1,1382 @@ +# Copyright 2021 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-task + namespace: default +data: + task.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "prometheus", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 7, + "iteration": 1625281006506, + "links": [], + "panels": [ + { + "datasource": null, + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 7, + "options": { + "content": "

TaskAPI

\n", + "mode": "markdown" + }, + "pluginVersion": "8.0.4", + "timeFrom": null, + "timeShift": null, + "transparent": true, + "type": "text" + }, + { + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 22, + "title": "API Stats", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Number of succeeded tasks for an API", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 3 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(cortex_task_succeeded{api_name=~\"$api_name\"}) by (api_name)", + "interval": "", + "legendFormat": "{{api_name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "# Succeeded Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:26", + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:27", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Number of failed tasks for an API", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 3 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(cortex_task_failed{api_name=~\"$api_name\"}) by (api_name)", + "interval": "", + "legendFormat": "{{api_name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "# Failed Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:262", + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:263", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Average time per task for an API", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(cortex_time_per_task_sum{api_name=~\"$api_name\"}) by (api_name) / sum(cortex_time_per_task_count{api_name=~\"$api_name\"}) by (api_name)", + "interval": "", + "legendFormat": "{{api_name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average Time per Task", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:161", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:162", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active Jobs": "semi-dark-green", + "Active Workers": "semi-dark-orange" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Active tasks", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "hiddenSeries": false, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "count(kube_job_status_active{job_name=~\"$api_name.+\"} != 0)", + "interval": "", + "legendFormat": "Active Tasks", + "refId": "Active Tasks" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "# Active Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:128", + "decimals": 0, + "format": "count", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:129", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 11, + "title": "Aggregate Worker Usage", + "type": "row" + }, + { + "aliasColors": { + "Total CPU Request": "semi-dark-orange", + "Total CPU Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total CPU usage across all workers of the API", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 21 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Total CPU Usage", + "refId": "CPU Usage" + }, + { + "exemplar": true, + "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})", + "hide": false, + "interval": "", + "legendFormat": "Total CPU Request", + "refId": "CPU Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "core", + "label": "cpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total Memory Request": "semi-dark-orange", + "Total Memory Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total memory usage across all workers of the API", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 21 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Total Memory Usage", + "refId": "Memory Usage" + }, + { + "exemplar": true, + "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2", + "hide": false, + "interval": "", + "legendFormat": "Total Memory Request", + "refId": "Memory Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total GPU Capacity": "semi-dark-orange", + "Total GPU Usage": "semi-dark-green", + "Total GPU Utilization": "light-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total GPU core usage across all workers of the API", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 29 + }, + "hiddenSeries": false, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100", + "hide": false, + "interval": "", + "legendFormat": "Total GPU Usage", + "refId": "GPU Usage" + }, + { + "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Total GPU Capacity", + "refId": "GPU Capacity" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total GPU Core Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "gpuCore", + "label": "gpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total Capacity GPU Memory": "semi-dark-orange", + "Total Used GPU Memory": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total GPU memory usage across all workers of the API", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 29 + }, + "hiddenSeries": false, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Total Used GPU Memory", + "refId": "GPU Used Memory" + }, + { + "exemplar": false, + "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Total Capacity GPU Memory", + "refId": "GPU Capacity Memory" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total GPU Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 37 + }, + "id": 9, + "panels": [], + "title": "Avg Worker Usage", + "type": "row" + }, + { + "aliasColors": { + "Avg CPU Request": "semi-dark-orange", + "Avg CPU Usage": "semi-dark-green", + "Total CPU Request": "semi-dark-orange", + "Total CPU Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg CPU usage across all workers of the API", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 38 + }, + "hiddenSeries": false, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg CPU Usage", + "refId": "CPU Usage" + }, + { + "exemplar": true, + "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg CPU Request", + "refId": "CPU Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "core", + "label": "cpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg Memory Request": "semi-dark-orange", + "Avg Memory Usage": "semi-dark-green", + "Total Memory Request": "semi-dark-orange", + "Total Memory Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg memory usage across all workers of the API", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 38 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg Memory Usage", + "refId": "Memory Usage" + }, + { + "exemplar": true, + "expr": "sum(kube_pod_container_resource{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg Memory Request", + "refId": "Memory Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg GPU Capacity": "semi-dark-orange", + "Avg GPU Usage": "semi-dark-green", + "Total GPU Capacity": "semi-dark-orange", + "Total GPU Usage": "semi-dark-green", + "Total GPU Utilization": "light-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg GPU core usage across all workers of the API", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 46 + }, + "hiddenSeries": false, + "id": 25, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Avg GPU Usage", + "refId": "GPU Usage" + }, + { + "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) by (exported_pod))", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Avg GPU Capacity", + "refId": "GPU Capacity" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg GPU Core Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "gpuCore", + "label": "gpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg Capacity GPU Memory": "semi-dark-orange", + "Avg Used GPU Memory": "semi-dark-green", + "Total Capacity GPU Memory": "semi-dark-orange", + "Total Used GPU Memory": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg GPU memory usage across all workers of the API", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 46 + }, + "hiddenSeries": false, + "id": 26, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg Used GPU Memory", + "refId": "GPU Used Memory" + }, + { + "exemplar": false, + "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg Capacity GPU Memory", + "refId": "GPU Capacity Memory" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg GPU Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "30s", + "schemaVersion": 30, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "selected": false, + "text": "None", + "value": "None" + }, + "datasource": null, + "definition": "label_values({__name__=~\"cortex_task_.+\"}, api_name)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "API Name", + "multi": true, + "name": "api_name", + "options": [], + "query": { + "query": "label_values({__name__=~\"cortex_task_.+\"}, api_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "TaskAPI", + "uid": "taskapi", + "version": 5 + } diff --git a/pkg/config/config.go b/pkg/config/config.go index c442ec6f19..c2f46c101a 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -21,6 +21,7 @@ import ( "os" "strings" + "github.com/DataDog/datadog-go/statsd" "github.com/cortexlabs/cortex/pkg/consts" batch "github.com/cortexlabs/cortex/pkg/crds/apis/batch/v1alpha1" "github.com/cortexlabs/cortex/pkg/lib/aws" @@ -46,6 +47,7 @@ var ( K8s *k8s.Client K8sIstio *k8s.Client K8sAllNamspaces *k8s.Client + MetricsClient *statsd.Client Prometheus promv1.API scheme = runtime.NewScheme() ) @@ -170,5 +172,10 @@ func Init() error { return err } + MetricsClient, err = statsd.New("prometheus-statsd-exporter.default:9125") + if err != nil { + return errors.Wrap(errors.WithStack(err), "unable to initialize metrics client") + } + return nil } diff --git a/pkg/operator/resources/job/taskapi/api.go b/pkg/operator/resources/job/taskapi/api.go index bd6b631af8..9261cc16a9 100644 --- a/pkg/operator/resources/job/taskapi/api.go +++ b/pkg/operator/resources/job/taskapi/api.go @@ -25,6 +25,7 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/lib/parallel" + "github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/lib/sets/strset" "github.com/cortexlabs/cortex/pkg/operator/lib/routines" "github.com/cortexlabs/cortex/pkg/operator/operator" @@ -39,6 +40,8 @@ import ( kcore "k8s.io/api/core/v1" ) +const _taskDashboardUID = "taskapi" + // UpdateAPI deploys or update a task api without triggering any task func UpdateAPI(apiConfig *userconfig.API) (*spec.API, string, error) { prevVirtualService, err := config.K8s.GetVirtualService(workloads.K8sName(apiConfig.Name)) @@ -288,11 +291,28 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp } } + dashboardURL := pointer.String(getDashboardURL(api.Name)) + return []schema.APIResponse{ { Spec: *api, TaskJobStatuses: jobStatuses, Endpoint: endpoint, + DashboardURL: dashboardURL, }, }, nil } + +func getDashboardURL(apiName string) string { + loadBalancerURL, err := operator.LoadBalancerURL() + if err != nil { + return "" + } + + dashboardURL := fmt.Sprintf( + "%s/dashboard/d/%s/taskapi?orgId=1&refresh=30s&var-api_name=%s", + loadBalancerURL, _taskDashboardUID, apiName, + ) + + return dashboardURL +} diff --git a/pkg/operator/resources/job/taskapi/cron.go b/pkg/operator/resources/job/taskapi/cron.go index 4ea382b766..2b915b5222 100644 --- a/pkg/operator/resources/job/taskapi/cron.go +++ b/pkg/operator/resources/job/taskapi/cron.go @@ -96,6 +96,7 @@ func ManageJobResources() error { err := errors.FirstError( job.DeleteInProgressFile(jobKey), deleteJobRuntimeResources(jobKey), + recordFailure(jobKey), ) if err != nil { telemetry.Error(err) @@ -136,6 +137,7 @@ func ManageJobResources() error { err := errors.FirstError( job.DeleteInProgressFile(jobKey), deleteJobRuntimeResources(jobKey), + recordFailure(jobKey), ) if err != nil { telemetry.Error(err) @@ -152,6 +154,7 @@ func ManageJobResources() error { err := errors.FirstError( job.SetTimedOutStatus(jobKey), deleteJobRuntimeResources(jobKey), + recordFailure(jobKey), ) if err != nil { telemetry.Error(err) @@ -161,7 +164,7 @@ func ManageJobResources() error { } if jobState.Status == status.JobRunning { - err = checkIfJobCompleted(jobKey, k8sJob) + err = checkIfJobCompleted(jobKey, jobSpec.StartTime, k8sJob) if err != nil { telemetry.Error(err) operatorLogger.Error(err) @@ -201,13 +204,14 @@ func reconcileInProgressJob(jobState *job.State, jobFound bool) (status.JobCode, return jobState.Status, "" } -func checkIfJobCompleted(jobKey spec.JobKey, k8sJob kbatch.Job) error { +func checkIfJobCompleted(jobKey spec.JobKey, jobStartTime time.Time, k8sJob kbatch.Job) error { pods, _ := config.K8s.ListPodsByLabel("jobID", jobKey.ID) for i := range pods { if k8s.WasPodOOMKilled(&pods[i]) { return errors.FirstError( job.SetWorkerOOMStatus(jobKey), deleteJobRuntimeResources(jobKey), + recordFailure(jobKey), ) } } @@ -216,11 +220,14 @@ func checkIfJobCompleted(jobKey spec.JobKey, k8sJob kbatch.Job) error { return errors.FirstError( job.SetWorkerErrorStatus(jobKey), deleteJobRuntimeResources(jobKey), + recordFailure(jobKey), ) } else if int(k8sJob.Status.Succeeded) == 1 && len(pods) > 0 { return errors.FirstError( job.SetSucceededStatus(jobKey), deleteJobRuntimeResources(jobKey), + recordSuccess(jobKey), + recordTimePerTask(jobKey, time.Since(jobStartTime)), ) } diff --git a/pkg/operator/resources/job/taskapi/metrics.go b/pkg/operator/resources/job/taskapi/metrics.go new file mode 100644 index 0000000000..6cf8200467 --- /dev/null +++ b/pkg/operator/resources/job/taskapi/metrics.go @@ -0,0 +1,61 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package taskapi + +import ( + "time" + + "github.com/cortexlabs/cortex/pkg/config" + "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/types/spec" +) + +func recordSuccess(jobKey spec.JobKey) error { + tags := []string{ + "api_name:" + jobKey.APIName, + "job_id:" + jobKey.ID, + } + err := config.MetricsClient.Incr("cortex_task_succeeded", tags, 1.0) + if err != nil { + return errors.WithStack(err) + } + return nil +} + +func recordFailure(jobKey spec.JobKey) error { + tags := []string{ + "api_name:" + jobKey.APIName, + "job_id:" + jobKey.ID, + } + err := config.MetricsClient.Incr("cortex_task_failed", tags, 1.0) + if err != nil { + return errors.WithStack(err) + } + return nil +} + +func recordTimePerTask(jobKey spec.JobKey, elapsedTime time.Duration) error { + tags := []string{ + "api_name:" + jobKey.APIName, + "job_id:" + jobKey.ID, + } + err := config.MetricsClient.Histogram("cortex_time_per_task", elapsedTime.Seconds(), tags, 1.0) + if err != nil { + return errors.WithStack(err) + } + return nil +} From 0eb474b8ca71a3a2ca549e61ed73f6aa9e799033 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 3 Jul 2021 06:31:14 +0300 Subject: [PATCH 2/2] Fix CPU/Memory requests --- manager/manifests/grafana/grafana-dashboard-task.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/manager/manifests/grafana/grafana-dashboard-task.yaml b/manager/manifests/grafana/grafana-dashboard-task.yaml index 112ece5eff..1305d94407 100644 --- a/manager/manifests/grafana/grafana-dashboard-task.yaml +++ b/manager/manifests/grafana/grafana-dashboard-task.yaml @@ -37,7 +37,7 @@ data: "gnetId": null, "graphTooltip": 0, "id": 7, - "iteration": 1625281006506, + "iteration": 1625281006508, "links": [], "panels": [ { @@ -618,7 +618,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"\"}[1m])) / 1024^2", "format": "time_series", "instant": false, "interval": "", @@ -1057,7 +1057,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1378,5 +1378,5 @@ data: "timezone": "", "title": "TaskAPI", "uid": "taskapi", - "version": 5 + "version": 6 }