From 9e0e2f82c98b238d574ca6870f0fbece39c3bb55 Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Mon, 24 Apr 2023 12:03:10 +0200 Subject: [PATCH] [HWORKS-512] Read cgroup parent from database (#1347) (#1329) * Python resources fixes * Remove empty line * Take care of the . * Rename variable --- .../PythonResourcesController.java | 23 +++++++++++++------ .../hops/hopsworks/common/util/Settings.java | 9 ++++++++ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/hopsworks-common/src/main/java/io/hops/hopsworks/common/pythonresources/PythonResourcesController.java b/hopsworks-common/src/main/java/io/hops/hopsworks/common/pythonresources/PythonResourcesController.java index e4b204eea6..fb07b08143 100644 --- a/hopsworks-common/src/main/java/io/hops/hopsworks/common/pythonresources/PythonResourcesController.java +++ b/hopsworks-common/src/main/java/io/hops/hopsworks/common/pythonresources/PythonResourcesController.java @@ -152,25 +152,34 @@ private Map getNodeExporterQueriesKube() { } private Map getCadvisorQueries() { + final String dockerCgroupParent = settings.getDockerCgroupParent(); + String cGroupParentForRegex = dockerCgroupParent.replaceAll("\\.", "\\\\\\\\."); + final String headNodeInstanceQuery = "instance='" + glassfishIp + ":" + nodeExporterPort + "'"; return new HashMap() { { put(DOCKER_CURRENT_CPU_USAGE_KEY, - "sum(avg by (cpu) (rate(container_cpu_usage_seconds_total{id=~'.*/docker/.*'}[60s]) * 100))"); + "sum(avg by (cpu) (rate(container_cpu_usage_seconds_total{id=~'.*/" + cGroupParentForRegex + + "/.*'}[60s]) * 100))"); put(DOCKER_TOTAL_ALLOCATABLE_CPU_KEY, - "(container_spec_cpu_quota{id='/docker'}/" + settings.getDockerCgroupCpuPeriod() + ")*100"); - put(DOCKER_CURRENT_MEMORY_USAGE_KEY, "sum(container_memory_working_set_bytes{id=~'.*/docker/.*'})"); - put(DOCKER_TOTAL_ALLOCATABLE_MEMORY_KEY, "container_spec_memory_limit_bytes{id='/docker'}"); + "(sum(container_spec_cpu_quota{id='/" + dockerCgroupParent + "'})/(" + 
settings.getDockerCgroupCpuPeriod() + + " * (count(count(node_cpu_seconds_total{" + headNodeInstanceQuery + "}) without (mode,instance,job))" + + "without (cpu))))*100"); + put(DOCKER_CURRENT_MEMORY_USAGE_KEY, "sum(container_memory_working_set_bytes{id=~'.*/" + cGroupParentForRegex + + "/.*'})"); + put(DOCKER_TOTAL_ALLOCATABLE_MEMORY_KEY, "container_spec_memory_limit_bytes{id='/" + dockerCgroupParent + + "'}"); } }; } private Map getNodeExporterQueriesHeadNode() { - String headNodeQuery = "instance='" + glassfishIp + ":" + nodeExporterPort + "'"; + final String headNodeQuery = "instance='" + glassfishIp + ":" + nodeExporterPort + "'"; return new HashMap() { { put(CLUSTER_CURRENT_CPU_USAGE, - "100 - ((sum((avg by (instance) (rate(node_cpu_seconds_total{mode='idle', " + headNodeQuery + "}[1m])) " + - "* 100)))/(count(node_memory_Active_bytes{" + headNodeQuery + "})))"); + "100 - ((sum((avg by (instance) (rate(node_cpu_seconds_total{mode='idle', " + headNodeQuery + "}[1m])) " + + "* 100)))/(count(node_memory_Active_bytes{" + headNodeQuery + "})))"); put(CLUSTER_CURRENT_MEMORY_USAGE, "sum(node_memory_Active_bytes{" + headNodeQuery + "})"); put(CLUSTER_TOTAL_MEMORY_CAPACITY, "sum(node_memory_MemTotal_bytes{" + headNodeQuery + "})"); } diff --git a/hopsworks-common/src/main/java/io/hops/hopsworks/common/util/Settings.java b/hopsworks-common/src/main/java/io/hops/hopsworks/common/util/Settings.java index ce879d5d9f..b61005fad1 100644 --- a/hopsworks-common/src/main/java/io/hops/hopsworks/common/util/Settings.java +++ b/hopsworks-common/src/main/java/io/hops/hopsworks/common/util/Settings.java @@ -405,6 +405,7 @@ public class Settings implements Serializable { private static final String VARIABLE_DOCKER_CGROUP_CPU_QUOTA = "docker_cgroup_cpu_quota_percentage"; private static final String VARIABLE_DOCKER_CGROUP_CPU_PERIOD = "docker_cgroup_cpu_period"; private static final String VARIABLE_DOCKER_CGROUP_MONITOR_INTERVAL = "docker_cgroup_monitor_interval"; + private static 
final String VARIABLE_DOCKER_CGROUP_PARENT = "docker_cgroup_parent"; private static final String VARIABLE_PROMETHEUS_PORT = "prometheus_port"; @@ -902,6 +903,7 @@ private void populateCache() { ENABLE_GIT_READ_ONLY_REPOSITORIES = setBoolVar(VARIABLE_ENABLE_GIT_READ_ONLY_REPOSITORIES, ENABLE_GIT_READ_ONLY_REPOSITORIES); + //Docker cgroups DOCKER_CGROUP_ENABLED = setBoolVar(VARIABLE_DOCKER_CGROUP_ENABLED, DOCKER_CGROUP_ENABLED); DOCKER_CGROUP_MEMORY_LIMIT = setStrVar(VARIABLE_DOCKER_CGROUP_HARD_LIMIT_MEMORY, DOCKER_CGROUP_MEMORY_LIMIT); @@ -911,6 +913,7 @@ private void populateCache() { DOCKER_CGROUP_CPU_PERIOD = setIntVar(VARIABLE_DOCKER_CGROUP_CPU_PERIOD, DOCKER_CGROUP_CPU_PERIOD); DOCKER_CGROUP_MONITOR_INTERVAL = setStrVar(VARIABLE_DOCKER_CGROUP_MONITOR_INTERVAL, DOCKER_CGROUP_MONITOR_INTERVAL); + DOCKER_CGROUP_PARENT = setStrVar(VARIABLE_DOCKER_CGROUP_PARENT, DOCKER_CGROUP_PARENT); PROMETHEUS_PORT = setIntVar(VARIABLE_PROMETHEUS_PORT, PROMETHEUS_PORT); @@ -1785,6 +1788,12 @@ public synchronized String getDockerCgroupIntervalMonitor() { return DOCKER_CGROUP_MONITOR_INTERVAL; } + private String DOCKER_CGROUP_PARENT = "docker.slice"; + public synchronized String getDockerCgroupParent() { + checkCache(); + return DOCKER_CGROUP_PARENT; + } + // Service key rotation interval private static final String JUPYTER_SHUTDOWN_TIMER_INTERVAL = "jupyter_shutdown_timer_interval";