Support multiple datasources #37

Merged (6 commits) on Apr 21, 2022
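
The same pattern is applied to all three dashboards: declare a PROMETHEUS_DS datasource template variable, point every panel at it with datasource='$PROMETHEUS_DS' instead of relying on a hard-coded datasource, and register the variable on the dashboard via .addTemplates(templates). A minimal, self-contained grafonnet sketch of that pattern follows; the import path, panel title, and sum(up) query are illustrative placeholders rather than code from this PR.

// Minimal sketch of the datasource-variable pattern this PR applies.
// The import path and panel/query names are assumptions for illustration.
local grafana = import 'grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local graphPanel = grafana.graphPanel;
local prometheus = grafana.prometheus;
local template = grafana.template;

// 1. Declare the datasource template variable; Grafana renders it as a
//    dropdown so one dashboard can serve several Prometheus datasources.
local templates = [
  template.datasource(
    name='PROMETHEUS_DS',
    query='prometheus',
    current=null,
    hide='label',
  ),
];

// 2. Reference the variable from each panel instead of the default datasource.
local examplePanel = graphPanel.new(
  'Example panel',
  min=0,
  datasource='$PROMETHEUS_DS',
).addTarget(
  prometheus.target(
    expr='sum(up)',
    legendFormat='targets up',
  ),
);

// 3. Register the template variable on the dashboard itself.
dashboard.new(
  'Example dashboard',
  editable=true,
).addTemplates(
  templates
).addPanel(
  examplePanel, {},
)
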
31 changes: 30 additions & 1 deletion dashboards/cluster.jsonnet
@@ -11,12 +11,21 @@ local heatmapPanel = grafana.heatmapPanel;
local jupyterhub = import './jupyterhub.libsonnet';
local standardDims = jupyterhub.standardDims;

local templates = [
template.datasource(
name='PROMETHEUS_DS',
query='prometheus',
current=null,
hide='label',
),
];

// Cluster-wide stats
local userNodes = graphPanel.new(
'Node Count',
decimals=0,
min=0,
datasource='$PROMETHEUS_DS'
).addTarget(
prometheus.target(
expr='sum(kube_node_labels) by (label_cloud_google_com_gke_nodepool)',
@@ -32,6 +41,7 @@ local userPods = graphPanel.new(
decimals=0,
min=0,
stack=true,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -57,6 +67,7 @@ local clusterMemoryCommitment = graphPanel.new(
// but full is still full. This gets a better view of 'fullness' most of the time.
// If the commitment is "off the chart" it doesn't super matter by how much.
max=1,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -98,6 +109,7 @@ local clusterCPUCommitment = graphPanel.new(
// but full is still full. This gets a better view of 'fullness' most of the time.
// If the commitment is "off the chart" it doesn't super matter by how much.
max=1,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -138,6 +150,7 @@ local nodeCPUCommit = graphPanel.new(
// but full is still full. This gets a better view of 'fullness' most of the time.
// If the commitment is "off the chart" it doesn't super matter by how much.
max=1,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -171,6 +184,7 @@ local nodeMemoryCommit = graphPanel.new(
// but full is still full. This gets a better view most of the time.
// If the commitment is "off the chart" it doesn't super matter by how much.
max=1,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -203,6 +217,7 @@ local nodeMemoryUtil = graphPanel.new(
min=0,
// since this is actual measured utilization, it should not be able to exceed max=1
max=1,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -230,6 +245,7 @@ local nodeCPUUtil = graphPanel.new(
min=0,
// since this is actual measured utilization, it should not be able to exceed max=1
max=1,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -256,11 +272,11 @@ local nonRunningPods = graphPanel.new(
decimals=0,
legend_hideZero=true,
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
'sum(kube_pod_status_phase{phase!="Running"}) by (phase)',
legendFormat='{{phase}}',
),
]);

@@ -270,6 +286,7 @@ local userNodesNFSOps = graphPanel.new(
'User Nodes NFS Ops',
decimals=0,
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
'sum(rate(node_nfs_requests_total[5m])) by (kubernetes_node) > 0',
@@ -281,6 +298,7 @@ local userNodesIOWait = graphPanel.new(
'iowait % on each node',
decimals=0,
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
'sum(rate(node_nfs_requests_total[5m])) by (kubernetes_node)',
@@ -292,6 +310,7 @@ local userNodesHighNFSOps = graphPanel.new(
'NFS Operation Types on user nodes',
decimals=0,
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
'sum(rate(node_nfs_requests_total[5m])) by (method) > 0',
@@ -302,6 +321,7 @@ local userNodesHighNFSOps = graphPanel.new(
local nfsServerCPU = graphPanel.new(
'NFS Server CPU',
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
'avg(rate(node_cpu_seconds_total{job="prometheus-nfsd-server", mode!="idle"}[2m])) by (mode)',
@@ -313,6 +333,7 @@ local nfsServerIOPS = graphPanel.new(
'NFS Server Disk ops',
decimals=0,
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
'sum(rate(node_nfsd_disk_bytes_read_total[5m]))',
@@ -327,6 +348,7 @@ local nfsServerIOPS = graphPanel.new(
local nfsServerWriteLatency = graphPanel.new(
'NFS Server disk write latency',
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
'sum(rate(node_disk_write_time_seconds_total{job="prometheus-nfsd-server"}[5m])) by (device) / sum(rate(node_disk_writes_completed_total{job="prometheus-nfsd-server"}[5m])) by (device)',
@@ -337,6 +359,7 @@ local nfsServerWriteLatency = graphPanel.new(
local nfsServerReadLatency = graphPanel.new(
'NFS Server disk read latency',
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
'sum(rate(node_disk_read_time_seconds_total{job="prometheus-nfsd-server"}[5m])) by (device) / sum(rate(node_disk_reads_completed_total{job="prometheus-nfsd-server"}[5m])) by (device)',
@@ -349,6 +372,7 @@ local prometheusMemory = graphPanel.new(
'Prometheus Memory (Working Set)',
formatY1='bytes',
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
'sum(container_memory_working_set_bytes{pod=~"support-prometheus-server-.*", namespace="support"})'
@@ -358,6 +382,7 @@ local prometheusMemory = graphPanel.new(
local prometheusCPU = graphPanel.new(
'Prometheus CPU',
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
'sum(rate(container_cpu_usage_seconds_total{pod=~"support-prometheus-server-.*",namespace="support"}[5m]))'
@@ -368,6 +393,7 @@ local prometheusDiskSpace = graphPanel.new(
'Prometheus Free Disk space',
formatY1='bytes',
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
'sum(kubelet_volume_stats_available_bytes{namespace="support",persistentvolumeclaim="support-prometheus-server"})'
@@ -379,6 +405,7 @@ local prometheusNetwork = graphPanel.new(
formatY1='bytes',
decimals=0,
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
'sum(rate(container_network_receive_bytes_total{pod=~"support-prometheus-server-.*",namespace="support"}[5m]))',
@@ -394,6 +421,8 @@ dashboard.new(
'Cluster Information',
tags=['jupyterhub', 'kubernetes'],
editable=true
).addTemplates(
templates
).addPanel(
row.new('Cluster Stats'), {},
).addPanel(
19 changes: 18 additions & 1 deletion dashboards/jupyterhub.jsonnet
@@ -15,9 +15,15 @@ local jupyterhub = import 'jupyterhub.libsonnet';
local standardDims = jupyterhub.standardDims;

local templates = [
template.datasource(
name='PROMETHEUS_DS',
query='prometheus',
current=null,
hide='label',
),
template.new(
'hub',
datasource='prometheus',
datasource='$PROMETHEUS_DS',
query='label_values(kube_service_labels{service="hub"}, namespace)',
// Allow viewing dashboard for multiple combined hubs
includeAll=true,
@@ -31,6 +37,7 @@ local currentRunningUsers = graphPanel.new(
'Current running users',
decimals=0,
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -50,6 +57,7 @@ local userMemoryDistribution = heatmapPanel.new(
yAxis_format='bytes',
yAxis_min=0,
color_colorScheme='interpolateViridis',
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -73,6 +81,7 @@ local userCPUDistribution = heatmapPanel.new(
yAxis_format='percentunit',
yAxis_min=0,
color_colorScheme='interpolateViridis',
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -96,6 +105,7 @@ local userAgeDistribution = heatmapPanel.new(
yAxis_format='s',
yAxis_min=0,
color_colorScheme='interpolateViridis',
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -117,6 +127,7 @@ local hubResponseLatency = graphPanel.new(
'Hub response latency',
formatY1='s',
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
'histogram_quantile(0.99, sum(rate(jupyterhub_request_duration_seconds_bucket{app="jupyterhub", kubernetes_namespace=~"$hub"}[5m])) by (le))',
@@ -140,6 +151,7 @@ local serverStartTimes = graphPanel.new(
min=0,
points=true,
pointradius=2,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
// Metrics from hub seems to have `kubernetes_namespace` rather than just `namespace`
@@ -156,6 +168,7 @@ local usersPerNode = graphPanel.new(
'Users per node',
decimals=0,
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -181,6 +194,7 @@ local nonRunningPods = graphPanel.new(
decimalsY1=0,
min=0,
stack=true,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -213,6 +227,7 @@ local oldUserpods = tablePanel.new(
col: 2,
desc: true,
},
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -246,6 +261,7 @@ local highCPUUserPods = tablePanel.new(
col: 2,
desc: true,
},
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -279,6 +295,7 @@ local highMemoryUsagePods = tablePanel.new(
col: 2,
desc: true,
},
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
21 changes: 19 additions & 2 deletions dashboards/usage-stats.jsonnet
@@ -12,11 +12,21 @@ local heatmapPanel = grafana.heatmapPanel;

local standardDims = { w: 12, h: 12 };

local templates = [
template.datasource(
name='PROMETHEUS_DS',
query='prometheus',
current={},
hide='label',
),
];

local monthlyActiveUsers = graphPanel.new(
'Active users (over 30 days)',
bars=true,
lines=false,
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
// Removes any pods caused by stress testing
@@ -44,6 +54,7 @@ local dailyActiveUsers = graphPanel.new(
bars=true,
lines=false,
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
// count singleuser-server pods
@@ -71,6 +82,7 @@ local userDistribution = graphPanel.new(
lines=false,
min=0,
x_axis_mode='histogram',
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
// count singleuser-server pods
@@ -95,6 +107,7 @@ local currentRunningUsers = graphPanel.new(
legend_max=true,
legend_current=true,
min=0,
datasource='$PROMETHEUS_DS'
).addTargets([
prometheus.target(
|||
@@ -112,8 +125,12 @@ dashboard.new(
uid='usage-dashboard',
tags=['jupyterhub'],
editable=true,
time_from='now-30d'
).addPanel(
time_from='now-30d',
).addTemplates(
templates
)

.addPanel(
monthlyActiveUsers, {},
).addPanel(
dailyActiveUsers, {},