From 12bf7c486f0925150e571d4bf75962822e86ff97 Mon Sep 17 00:00:00 2001 From: WangDian Date: Thu, 31 Jan 2019 18:25:20 +0800 Subject: [PATCH 01/31] Cleaner logic --- .gitignore | 4 + .../services-configuration.yaml.template | 4 + src/cleaner/cleaner_main.py | 25 ++---- src/cleaner/config/cleaner.py | 22 +++-- src/cleaner/config/cleaner.yaml | 1 + src/cleaner/deploy/cleaner.yaml.template | 35 ++------ src/cleaner/scripts/clean_docker.py | 85 +++++++++++++++++++ .../scripts/reclaimable_docker_cache.sh | 6 +- src/cleaner/test/cleaner-test-job.json | 22 +++++ src/job-exporter/src/collector.py | 3 +- .../templates/dockerContainerScript.mustache | 13 +-- 11 files changed, 158 insertions(+), 62 deletions(-) create mode 100644 src/cleaner/scripts/clean_docker.py create mode 100644 src/cleaner/test/cleaner-test-job.json diff --git a/.gitignore b/.gitignore index 30c060ec47..37c834877e 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,7 @@ __pycache__ *.swp *.swo .DS_Store +*.sln +*.pyproj.user +*.pyproj +*.vs diff --git a/deployment/quick-start/services-configuration.yaml.template b/deployment/quick-start/services-configuration.yaml.template index 842e455214..295bb62793 100644 --- a/deployment/quick-start/services-configuration.yaml.template +++ b/deployment/quick-start/services-configuration.yaml.template @@ -128,3 +128,7 @@ rest-server: # uncomment following section if you want to customize the port of pylon # pylon: # port: 80 + +# uncomment following section if you want to customize the docker path of cleaner +# cleaner: +# threshold: 80 \ No newline at end of file diff --git a/src/cleaner/cleaner_main.py b/src/cleaner/cleaner_main.py index d4e8806d7e..53b3091bf9 100644 --- a/src/cleaner/cleaner_main.py +++ b/src/cleaner/cleaner_main.py @@ -19,7 +19,7 @@ import argparse import os from datetime import timedelta -from cleaner.scripts import clean_docker_cache, check_deleted_files +from cleaner.scripts import clean_docker_cache, check_deleted_files, clean_docker from cleaner.worker import Worker from cleaner.utils.logger import LoggerMixin from cleaner.utils import common @@ -76,31 +76,20 @@ def sync(self): time.sleep(1) -def get_worker(arg): - if arg == "docker_cache": - worker = Worker(clean_docker_cache.check_and_clean, 50, timeout=timedelta(minutes=10), cool_down_time=1800) - elif arg == "deleted_files": - worker = Worker(check_deleted_files.list_and_check_files, None, timeout=timedelta(minutes=10), cool_down_time=1800) - else: - raise ValueError("arguments %s is not supported.", arg) - return worker - - -liveness_files = { - "docker_cache": "docker-cache-cleaner-healthy", - "deleted_files": "deleted-files-cleaner-healthy" -} +def get_worker(threshold): + worker = Worker(clean_docker.check_and_clean, threshold, timeout=timedelta(minutes=10), cool_down_time=60) + return worker; def main(): parser = argparse.ArgumentParser() - parser.add_argument("option", help="the functions currently supported: [docker_cache | deleted_files]") + parser.add_argument("-t", "--threshold", help="the disk usage precent to start cleaner") args = parser.parse_args() common.setup_logging() - cleaner = Cleaner(liveness_files[args.option]) - cleaner.add_worker(args.option, get_worker(args.option)) + cleaner = Cleaner("docker-cleaner-healthy") + cleaner.add_worker("docker-cleaner", get_worker(args.threshold)) cleaner.start() cleaner.sync() diff --git a/src/cleaner/config/cleaner.py b/src/cleaner/config/cleaner.py index 71c8050638..6c6245d305 100644 --- a/src/cleaner/config/cleaner.py +++ 
b/src/cleaner/config/cleaner.py @@ -22,19 +22,25 @@ class Cleaner: - def __init__(self, cluster_configuration, service_configuration, default_service_configuraiton): + def __init__(self, cluster_conf, service_conf, default_service_conf): self.logger = logging.getLogger(__name__) - - self.cluster_configuration = cluster_configuration + self.cluster_conf = cluster_conf + self.service_conf = service_conf + self.default_service_conf = default_service_conf def validation_pre(self): return True, None def run(self): - com = {} - - return com - - def validation_post(self, cluster_object_model): + result = copy.deepcopy(self.default_service_conf) + result.update(self.service_conf) + return result + + def validation_post(self, conf): + threshold = conf["cleaner"].get("threshold") + if type(threshold) != int: + msg = "expect threshold in cleaner to be int but get %s with type %s" % \ + (threshold, type(threshold)) + return False, msg return True, None diff --git a/src/cleaner/config/cleaner.yaml b/src/cleaner/config/cleaner.yaml index dbde8d9a14..4684ccd1e8 100644 --- a/src/cleaner/config/cleaner.yaml +++ b/src/cleaner/config/cleaner.yaml @@ -15,3 +15,4 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +threshold: 80 \ No newline at end of file diff --git a/src/cleaner/deploy/cleaner.yaml.template b/src/cleaner/deploy/cleaner.yaml.template index 147729677b..ba7d4811ae 100644 --- a/src/cleaner/deploy/cleaner.yaml.template +++ b/src/cleaner/deploy/cleaner.yaml.template @@ -31,42 +31,16 @@ spec: hostPID: true hostNetwork: true containers: - - name: docker-cache-cleaner + - name: docker-cleaner image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}cleaner:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }} args: - - 'docker_cache' + - -t {{ cluster_cfg["cleaner"]["threshold"] }} imagePullPolicy: Always securityContext: privileged: True volumeMounts: - mountPath: /var/run/docker.sock name: docker-socket - livenessProbe: - exec: - command: - - test - - '`find /tmp/docker-cache-cleaner-healthy -mmin -1`' - initialDelaySeconds: 60 - periodSeconds: 30 - {%- if cluster_cfg['cluster']['common']['qos-switch'] == "true" %} - resources: - limits: - memory: "1Gi" - {%- endif %} - - name: deleted-files-cleaner - image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}cleaner:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }} - args: - - 'deleted_files' - imagePullPolicy: Always - securityContext: - privileged: True - livenessProbe: - exec: - command: - - test - - '`find /tmp/deleted-files-cleaner-healthy -mmin -1`' - initialDelaySeconds: 60 - periodSeconds: 30 {%- if cluster_cfg['cluster']['common']['qos-switch'] == "true" %} resources: limits: @@ -78,3 +52,8 @@ spec: - name: docker-socket hostPath: path: /var/run/docker.sock + tolerations: + - key: node.kubernetes.io/memory-pressure + operator: "Exists" + - key: node.kubernetes.io/disk-pressure + operator: "Exists" \ No newline at end of file diff --git a/src/cleaner/scripts/clean_docker.py b/src/cleaner/scripts/clean_docker.py new file mode 100644 index 0000000000..fd7a4e2ab7 --- /dev/null +++ b/src/cleaner/scripts/clean_docker.py @@ -0,0 +1,85 @@ +# Copyright (c) Microsoft Corporation +# All rights reserved. 
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from cleaner.utils import common +import subprocess +import multiprocessing +import re + +logger = multiprocessing.get_logger() + + +def check_disk_usage(partition): + df = subprocess.Popen(["df","-h", partition], stdout=subprocess.PIPE) + size = 0 + try: + for line in df.stdout: + splitline = line.decode().split() + if splitline[5] == partition: + size = int(splitline[4][:-1]) + except ValueError: + logger.error("cannot get disk size, reset size to 0") + size = 0 + logger.info("Checking disk, disk usage = {0}%".format(size)) + return size + + +def check_and_clean(threshold): + if check_disk_usage("/") > int(threshold): + logger.info("Disk usage is above {0}%, Try to remove containers".format(threshold)) + kill_largest_container() + + +# Clean logic v1: kill largest container +white_list = ["k8s_kube", "k8s_pylon", "k8s_zookeeper", "k8s_rest-server", "k8s_yarn", "k8s_hadoop", "k8s_job-exporter", "k8s_watchdog", "k8s_grafana", "k8s_node-exporter", "k8s_webportal", "k8s_prometheus", "k8s_nvidia-drivers", "k8s_etcd-container", "k8s_apiserver-container", "k8s_docker-cleaner", "kubelet"] +def kill_largest_container(): + containers = [] + # Only try to stop PAI jobs and user created containers + containers_source = subprocess.Popen(["docker", "ps", "-a", "--format", r'{{.ID}}\t{{.Image}}\t{{.Size}}\t{{.Names}}'], stdout=subprocess.PIPE) + for line in containers_source.stdout: + splitline = line.split("\t") + ignore = False + for prefix in white_list: + if (splitline[3].startswith(prefix)): + ignore = True + break + if ignore == False: + size = calculate_size(splitline[2].split()[0]) + containers.append([size, splitline[0], splitline[1]]) + + containers.sort(key=lambda x:x[0], reverse=True) + + if containers.count > 0 and containers[0][0] > 1024**3: + logger.warning("Kill container {0} due to disk pressure. 
Container size: {1}".format(containers[0][1], containers[0][0])) + subprocess.Popen(["docker", "container", "stop", containers[0][1]]) + + # Because docker stop will not immedicately stop container, we can not remove docker image right after stop container + #container_image = subprocess.Popen(["docker", "inspect", containers[0][1], r"--format='{{.Image}}'"], stdout=subprocess.PIPE).stdout.readline() + #subprocess.Popen(["docker", "image", "rmi", container_image]) + return True + else: + return False + +size_defs={'B':1, 'K':1024, 'M':1024**2, 'G':1024**3, 'T':1024**4, 'b':1, 'k':1024, 'm':1024**2, 'g':1024**3, 't':1024**4} +def calculate_size(size_str): + size_search = re.search(r"[BbKkMmGgTt]", size_str) + return float(size_str[0:size_search.start()]) * size_defs[size_search.group()] + + +if __name__ == "__main__": + common.setup_logging() + check_and_clean(60) \ No newline at end of file diff --git a/src/cleaner/scripts/reclaimable_docker_cache.sh b/src/cleaner/scripts/reclaimable_docker_cache.sh index baa85db0f2..a907394d0e 100644 --- a/src/cleaner/scripts/reclaimable_docker_cache.sh +++ b/src/cleaner/scripts/reclaimable_docker_cache.sh @@ -28,14 +28,16 @@ # # We summer up the result in column 5 (RECLAIMABLE) and return the size in gigabytes. -docker system df | \ +docker system df --format "{{.Reclaimable}}" | \ gawk 'BEGIN {s=0} END {print s} - match($5, /([0-9]+\.?[0-9]*)(M|G|B)/, a) { + match($1, /([0-9]+\.?[0-9]*)(M|G|B|T)/, a) { if(a[2] == "M") s += a[1]/1024; else if(a[2] == "B") s += a[1]/1024/1024; + else if(a[2] == "T") + s += a[1]*1024; else s += a[1]; }' diff --git a/src/cleaner/test/cleaner-test-job.json b/src/cleaner/test/cleaner-test-job.json new file mode 100644 index 0000000000..c71a71e375 --- /dev/null +++ b/src/cleaner/test/cleaner-test-job.json @@ -0,0 +1,22 @@ +{ + "jobName": "cleaner-test-job", + "image": "ubuntu", + "authFile": "", + "dataDir": "", + "outputDir": "", + "codeDir": "", + "virtualCluster": "default", + "gpuType": "", + "retryCount": 0, + "taskRoles": [ + { + "name": "main", + "taskNumber": 1, + "cpuNumber": 4, + "memoryMB": 8192, + "gpuNumber": 0, + "command": "fallocate -l 200G \"fake_base\"; for var in {1..100}; do fallocate -l 1G \"fake$var\"; sleep 5; done" + } + ], + "jobEnvs": {} +} \ No newline at end of file diff --git a/src/job-exporter/src/collector.py b/src/job-exporter/src/collector.py index 2df34dc29d..6e85e68fa9 100644 --- a/src/job-exporter/src/collector.py +++ b/src/job-exporter/src/collector.py @@ -331,7 +331,8 @@ class ContainerCollector(Collector): "node-exporter", "job-exporter", "yarn-exporter", - "nvidia-drivers" + "nvidia-drivers", + "docker-cleaner" ])) def __init__(self, name, sleep_time, atomic_ref, iteration_counter, gpu_info_ref, diff --git a/src/rest-server/src/templates/dockerContainerScript.mustache b/src/rest-server/src/templates/dockerContainerScript.mustache index 753e768163..83bfde7b06 100644 --- a/src/rest-server/src/templates/dockerContainerScript.mustache +++ b/src/rest-server/src/templates/dockerContainerScript.mustache @@ -29,10 +29,17 @@ function exit_handler() "[DEBUG]" "Docker container exit handler: EXIT signal received in docker container, exiting ..." } +function kill_handler() +{ + printf "%s %s\n" \ + "[DEBUG]" "Docker container kill handler: SIGTERM signal received in docker container, exiting ..." 
+ exit 0 +} + set -x PS4="+[\t] " trap exit_handler EXIT - +trap kill_handler SIGTERM touch "/alive/docker_$PAI_CONTAINER_ID" @@ -44,7 +51,6 @@ HDFS_LAUNCHER_PREFIX=$PAI_DEFAULT_FS_URI/Container export CLASSPATH="$(hadoop classpath --glob)" task_role_no={{ idx }} - printf "%s %s\n%s\n\n" "[INFO]" "ENV" "$(printenv | sort)" cp -r /pai/code/* ./ @@ -101,7 +107,6 @@ function get_ssh_key_files() info_source="webhdfs" localKeyPath=/root/.ssh/{{ jobData.jobName }}.pub localPrivateKeyPath=/root/.ssh/{{ jobData.jobName }} - if [[ -f $localKeyPath ]]; then rm -f $localKeyPath fi @@ -164,14 +169,12 @@ function azure_rdma_preparation() {{# paiMachineList }} echo {{hostip}} {{hostname}} >> /hosts.tmp {{/ paiMachineList }} - cat /hosts.tmp > /etc/hosts } azure_rdma_preparation {{/ azRDMA }} {{/ reqAzRDMA }} - # Write env to system-wide environment env | grep -E "^PAI|PATH|PREFIX|JAVA|HADOOP|NVIDIA|CUDA" > /etc/environment From 3fbc7e849f7cbcfa16b6b3806d5e31b81dde4a36 Mon Sep 17 00:00:00 2001 From: WangDian Date: Thu, 31 Jan 2019 18:32:42 +0800 Subject: [PATCH 02/31] Fix bug --- src/cleaner/config/cleaner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cleaner/config/cleaner.py b/src/cleaner/config/cleaner.py index 6c6245d305..dafa3221d0 100644 --- a/src/cleaner/config/cleaner.py +++ b/src/cleaner/config/cleaner.py @@ -18,9 +18,9 @@ import logging import logging.config +import copy - -class Cleaner: +class Cleaner(object): def __init__(self, cluster_conf, service_conf, default_service_conf): self.logger = logging.getLogger(__name__) From dae2b65363ad09db7e86ba3866253449fbeac5b6 Mon Sep 17 00:00:00 2001 From: WangDian Date: Sun, 3 Feb 2019 14:04:20 +0800 Subject: [PATCH 03/31] 1. Remove worker 2. Add interval as var 3. Kill docker, send signal --- .../services-configuration.yaml.template | 5 +- .../services-configuration.yaml | 5 + src/cleaner/cleaner_main.py | 9 +- src/cleaner/config/cleaner.py | 11 ++ src/cleaner/config/cleaner.yaml | 3 +- src/cleaner/deploy/cleaner.yaml.template | 1 + src/cleaner/scripts/clean_docker.py | 120 ++++++++++-------- src/cleaner/utils/common.py | 6 + .../templates/dockerContainerScript.mustache | 6 +- 9 files changed, 104 insertions(+), 62 deletions(-) diff --git a/deployment/quick-start/services-configuration.yaml.template b/deployment/quick-start/services-configuration.yaml.template index 295bb62793..81f0993f70 100644 --- a/deployment/quick-start/services-configuration.yaml.template +++ b/deployment/quick-start/services-configuration.yaml.template @@ -129,6 +129,7 @@ rest-server: # pylon: # port: 80 -# uncomment following section if you want to customize the docker path of cleaner +# uncomment following section if you want to customize the threshold of cleaner # cleaner: -# threshold: 80 \ No newline at end of file +# threshold: 94 +# interval: 60 \ No newline at end of file diff --git a/examples/cluster-configuration/services-configuration.yaml b/examples/cluster-configuration/services-configuration.yaml index 8e22bdc8d0..2c6fed8bba 100644 --- a/examples/cluster-configuration/services-configuration.yaml +++ b/examples/cluster-configuration/services-configuration.yaml @@ -128,3 +128,8 @@ rest-server: # uncomment following section if you want to customize the port of pylon # pylon: # port: 80 + +# uncomment following section if you want to customize the threshold of cleaner +# cleaner: +# threshold: 94 +# interval: 60 \ No newline at end of file diff --git a/src/cleaner/cleaner_main.py b/src/cleaner/cleaner_main.py index 53b3091bf9..2af78875ff 
100644 --- a/src/cleaner/cleaner_main.py +++ b/src/cleaner/cleaner_main.py @@ -19,7 +19,7 @@ import argparse import os from datetime import timedelta -from cleaner.scripts import clean_docker_cache, check_deleted_files, clean_docker +from cleaner.scripts.clean_docker import DockerCleaner from cleaner.worker import Worker from cleaner.utils.logger import LoggerMixin from cleaner.utils import common @@ -84,14 +84,13 @@ def get_worker(threshold): def main(): parser = argparse.ArgumentParser() parser.add_argument("-t", "--threshold", help="the disk usage precent to start cleaner") + parser.add_argument("-i", "--interval", help="the base interval to check disk usage") args = parser.parse_args() common.setup_logging() - cleaner = Cleaner("docker-cleaner-healthy") - cleaner.add_worker("docker-cleaner", get_worker(args.threshold)) - cleaner.start() - cleaner.sync() + cleaner = DockerCleaner(args.threshold, args.interval, timedelta(minutes=10)) + cleaner.run() if __name__ == "__main__": diff --git a/src/cleaner/config/cleaner.py b/src/cleaner/config/cleaner.py index dafa3221d0..2eccd24f8a 100644 --- a/src/cleaner/config/cleaner.py +++ b/src/cleaner/config/cleaner.py @@ -42,5 +42,16 @@ def validation_post(self, conf): msg = "expect threshold in cleaner to be int but get %s with type %s" % \ (threshold, type(threshold)) return False, msg + else: + if threshold < 0 or threshold > 100: + msg = "expect threshold in [0, 100]" + return False, msg + + interval = conf["cleaner"].get("interval") + if type(interval) != int: + msg = "expect interval in cleaner to be int but get %s with type %s" % \ + (interval, type(interval)) + return False, msg + return True, None diff --git a/src/cleaner/config/cleaner.yaml b/src/cleaner/config/cleaner.yaml index 4684ccd1e8..4f9a94e88b 100644 --- a/src/cleaner/config/cleaner.yaml +++ b/src/cleaner/config/cleaner.yaml @@ -15,4 +15,5 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -threshold: 80 \ No newline at end of file +threshold: 94 +interval: 60 \ No newline at end of file diff --git a/src/cleaner/deploy/cleaner.yaml.template b/src/cleaner/deploy/cleaner.yaml.template index ba7d4811ae..4c568bf3de 100644 --- a/src/cleaner/deploy/cleaner.yaml.template +++ b/src/cleaner/deploy/cleaner.yaml.template @@ -35,6 +35,7 @@ spec: image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}cleaner:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }} args: - -t {{ cluster_cfg["cleaner"]["threshold"] }} + - -i {{ cluster_cfg["cleaner"]["interval"] }} imagePullPolicy: Always securityContext: privileged: True diff --git a/src/cleaner/scripts/clean_docker.py b/src/cleaner/scripts/clean_docker.py index fd7a4e2ab7..6d743b9ffa 100644 --- a/src/cleaner/scripts/clean_docker.py +++ b/src/cleaner/scripts/clean_docker.py @@ -15,71 +15,89 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
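+# The DockerCleaner class introduced below periodically checks disk usage of the
+# root partition and, once usage reaches the configured threshold, signals the
+# largest container that is not on the white list so the job inside can exit cleanly.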
+from cleaner.utils.logger import LoggerMixin +from cleaner.utils.timer import CountdownTimer, Timeout from cleaner.utils import common +from datetime import timedelta import subprocess import multiprocessing import re +import time -logger = multiprocessing.get_logger() +class DockerCleaner(LoggerMixin): + def __init__(self, threshold, interval, timeout=timedelta(hours=1)): + self.__threshold = int(threshold) + self.__interval = int(interval) + self.__timeout = timeout + def _exec(self): + exc = None + try: + with CountdownTimer(duration=self.__timeout): + self.check_and_clean(self.__threshold, self.__timespan) + except Timeout as e: + self.logger.error("Cleaner timeout.") + exc = e + except Exception as e: + self.logger.error("Unexpected error to run cleaner.") + exc = e -def check_disk_usage(partition): - df = subprocess.Popen(["df","-h", partition], stdout=subprocess.PIPE) - size = 0 - try: - for line in df.stdout: - splitline = line.decode().split() - if splitline[5] == partition: - size = int(splitline[4][:-1]) - except ValueError: - logger.error("cannot get disk size, reset size to 0") - size = 0 - logger.info("Checking disk, disk usage = {0}%".format(size)) - return size + if exc is not None: + self.logger.exception(exc) + + def run(self): + while True: + # allow a delay before the cleaning + time.sleep(self.__timespan) + self._exec() -def check_and_clean(threshold): - if check_disk_usage("/") > int(threshold): - logger.info("Disk usage is above {0}%, Try to remove containers".format(threshold)) - kill_largest_container() + def check_disk_usage(self, partition): + df = subprocess.Popen(["df","-h", partition], stdout=subprocess.PIPE) + size = 0 + try: + for line in df.stdout: + splitline = line.decode().split() + if splitline[5] == partition: + size = int(splitline[4][:-1]) + except ValueError: + self.logger.error("cannot get disk size, reset size to 0") + size = 0 + self.logger.info("Checking disk, disk usage = {0}%".format(size)) + return size -# Clean logic v1: kill largest container -white_list = ["k8s_kube", "k8s_pylon", "k8s_zookeeper", "k8s_rest-server", "k8s_yarn", "k8s_hadoop", "k8s_job-exporter", "k8s_watchdog", "k8s_grafana", "k8s_node-exporter", "k8s_webportal", "k8s_prometheus", "k8s_nvidia-drivers", "k8s_etcd-container", "k8s_apiserver-container", "k8s_docker-cleaner", "kubelet"] -def kill_largest_container(): - containers = [] - # Only try to stop PAI jobs and user created containers - containers_source = subprocess.Popen(["docker", "ps", "-a", "--format", r'{{.ID}}\t{{.Image}}\t{{.Size}}\t{{.Names}}'], stdout=subprocess.PIPE) - for line in containers_source.stdout: - splitline = line.split("\t") - ignore = False - for prefix in white_list: - if (splitline[3].startswith(prefix)): - ignore = True - break - if ignore == False: - size = calculate_size(splitline[2].split()[0]) - containers.append([size, splitline[0], splitline[1]]) + def check_and_clean(self, threshold): + if self.check_disk_usage("/") > self.___threshold: + self.logger.info("Disk usage is above {0}%, Try to remove containers".format(self.__threshold)) + self.kill_largest_container() - containers.sort(key=lambda x:x[0], reverse=True) - if containers.count > 0 and containers[0][0] > 1024**3: - logger.warning("Kill container {0} due to disk pressure. 
Container size: {1}".format(containers[0][1], containers[0][0])) - subprocess.Popen(["docker", "container", "stop", containers[0][1]]) + # Clean logic v1: kill largest container + white_list = ["k8s_kube", "k8s_pylon", "k8s_zookeeper", "k8s_rest-server", "k8s_yarn", "k8s_hadoop", "k8s_job-exporter", "k8s_watchdog", "k8s_grafana", "k8s_node-exporter", "k8s_webportal", "k8s_prometheus", "k8s_nvidia-drivers", "k8s_etcd-container", "k8s_apiserver-container", "k8s_docker-cleaner", "kubelet", "dev-box"] + def kill_largest_container(self): + containers = [] + # Only try to stop PAI jobs and user created containers + containers_source = subprocess.Popen(["docker", "ps", "-a", "--format", r'{{.ID}}\t{{.Image}}\t{{.Size}}\t{{.Names}}'], stdout=subprocess.PIPE) + for line in containers_source.stdout: + splitline = line.split("\t") + for prefix in white_list: + if (splitline[3].startswith(prefix)): + break + else: + size = calculate_size(splitline[2].split()[0]) + containers.append([size, splitline[0], splitline[1]]) - # Because docker stop will not immedicately stop container, we can not remove docker image right after stop container - #container_image = subprocess.Popen(["docker", "inspect", containers[0][1], r"--format='{{.Image}}'"], stdout=subprocess.PIPE).stdout.readline() - #subprocess.Popen(["docker", "image", "rmi", container_image]) - return True - else: - return False + containers.sort(key=lambda x:x[0], reverse=True) -size_defs={'B':1, 'K':1024, 'M':1024**2, 'G':1024**3, 'T':1024**4, 'b':1, 'k':1024, 'm':1024**2, 'g':1024**3, 't':1024**4} -def calculate_size(size_str): - size_search = re.search(r"[BbKkMmGgTt]", size_str) - return float(size_str[0:size_search.start()]) * size_defs[size_search.group()] + if containers.count > 0 and containers[0][0] > 1024**3: + logger.warning("Kill container {0} due to disk pressure. 
Container size: {1}".format(containers[0][1], containers[0][0])) + subprocess.Popen(["docker", "kill", "--signal=SIGRTMIN+8", containers[0][1]]) + # Because docker stop will not immedicately stop container, we can not remove docker image right after stop container + #container_image = subprocess.Popen(["docker", "inspect", containers[0][1], r"--format='{{.Image}}'"], stdout=subprocess.PIPE).stdout.readline() + #subprocess.Popen(["docker", "image", "rmi", container_image]) + return True + else: + return False -if __name__ == "__main__": - common.setup_logging() - check_and_clean(60) \ No newline at end of file diff --git a/src/cleaner/utils/common.py b/src/cleaner/utils/common.py index 70b98942e7..d09c8bb061 100644 --- a/src/cleaner/utils/common.py +++ b/src/cleaner/utils/common.py @@ -129,3 +129,9 @@ def setup_logging(): handler.setFormatter(formatter) logger.addHandler(handler) logger.setLevel(logging.INFO) + + +size_defs={'B':1, 'K':1024, 'M':1024**2, 'G':1024**3, 'T':1024**4, 'b':1, 'k':1024, 'm':1024**2, 'g':1024**3, 't':1024**4} +def calculate_size(size_str): + size_search = re.search(r"[BbKkMmGgTt]", size_str) + return float(size_str[0:size_search.start()]) * size_defs[size_search.group()] \ No newline at end of file diff --git a/src/rest-server/src/templates/dockerContainerScript.mustache b/src/rest-server/src/templates/dockerContainerScript.mustache index 83bfde7b06..8c8c1b3e7e 100644 --- a/src/rest-server/src/templates/dockerContainerScript.mustache +++ b/src/rest-server/src/templates/dockerContainerScript.mustache @@ -32,14 +32,14 @@ function exit_handler() function kill_handler() { printf "%s %s\n" \ - "[DEBUG]" "Docker container kill handler: SIGTERM signal received in docker container, exiting ..." - exit 0 + "[INFO]" "Docker container killed due to disk pressure. If your job needs large disk space, please use HDFS or NFS to store your data." 
+ exit 1 } set -x PS4="+[\t] " trap exit_handler EXIT -trap kill_handler SIGTERM +trap kill_handler SIGRTMIN+8 touch "/alive/docker_$PAI_CONTAINER_ID" From 2e9482b5bf0006f6ce744a343e17f841fbdecaaf Mon Sep 17 00:00:00 2001 From: WangDian Date: Sun, 3 Feb 2019 14:40:59 +0800 Subject: [PATCH 04/31] Fix '\t' --- src/cleaner/deploy/cleaner.yaml.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cleaner/deploy/cleaner.yaml.template b/src/cleaner/deploy/cleaner.yaml.template index 4c568bf3de..0eb259f576 100644 --- a/src/cleaner/deploy/cleaner.yaml.template +++ b/src/cleaner/deploy/cleaner.yaml.template @@ -35,7 +35,7 @@ spec: image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}cleaner:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }} args: - -t {{ cluster_cfg["cleaner"]["threshold"] }} - - -i {{ cluster_cfg["cleaner"]["interval"] }} + - -i {{ cluster_cfg["cleaner"]["interval"] }} imagePullPolicy: Always securityContext: privileged: True From faa18f0e91155d42c134b3df75742fd2bd084a97 Mon Sep 17 00:00:00 2001 From: WangDian Date: Sun, 3 Feb 2019 15:41:33 +0800 Subject: [PATCH 05/31] Bug fix --- src/cleaner/scripts/clean_docker.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cleaner/scripts/clean_docker.py b/src/cleaner/scripts/clean_docker.py index 6d743b9ffa..37f3eb7e80 100644 --- a/src/cleaner/scripts/clean_docker.py +++ b/src/cleaner/scripts/clean_docker.py @@ -34,7 +34,7 @@ def _exec(self): exc = None try: with CountdownTimer(duration=self.__timeout): - self.check_and_clean(self.__threshold, self.__timespan) + self.check_and_clean() except Timeout as e: self.logger.error("Cleaner timeout.") exc = e @@ -48,7 +48,7 @@ def _exec(self): def run(self): while True: # allow a delay before the cleaning - time.sleep(self.__timespan) + time.sleep(self.__interval) self._exec() @@ -67,8 +67,8 @@ def check_disk_usage(self, partition): return size - def check_and_clean(self, threshold): - if self.check_disk_usage("/") > self.___threshold: + def check_and_clean(self): + if self.check_disk_usage("/") >= self.__threshold: self.logger.info("Disk usage is above {0}%, Try to remove containers".format(self.__threshold)) self.kill_largest_container() @@ -81,11 +81,11 @@ def kill_largest_container(self): containers_source = subprocess.Popen(["docker", "ps", "-a", "--format", r'{{.ID}}\t{{.Image}}\t{{.Size}}\t{{.Names}}'], stdout=subprocess.PIPE) for line in containers_source.stdout: splitline = line.split("\t") - for prefix in white_list: + for prefix in self.white_list: if (splitline[3].startswith(prefix)): break else: - size = calculate_size(splitline[2].split()[0]) + size = common.calculate_size(splitline[2].split()[0]) containers.append([size, splitline[0], splitline[1]]) containers.sort(key=lambda x:x[0], reverse=True) From f88c51e66f22e0b5547e6d0875e880e3d53efbcb Mon Sep 17 00:00:00 2001 From: WangDian Date: Sun, 3 Feb 2019 15:50:38 +0800 Subject: [PATCH 06/31] Import re in common --- src/cleaner/utils/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cleaner/utils/common.py b/src/cleaner/utils/common.py index d09c8bb061..875d74c097 100644 --- a/src/cleaner/utils/common.py +++ b/src/cleaner/utils/common.py @@ -22,7 +22,7 @@ import os import psutil import signal - +import re def kill_process_tree(pid, time_to_die, logger): """ From 04b67df698c19b98d3cbcd4d2b58294e7c9f9963 Mon Sep 17 00:00:00 2001 From: WangDian Date: Mon, 11 Feb 2019 13:41:24 +0800 Subject: [PATCH 07/31] Update openjdk build version 
--- src/base-image/build/base-image.dockerfile | 4 ++-- src/cleaner/scripts/clean_docker.py | 2 +- src/dev-box/build/dev-box.dockerfile | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/base-image/build/base-image.dockerfile b/src/base-image/build/base-image.dockerfile index 4ffaef3d3f..42d724e503 100644 --- a/src/base-image/build/base-image.dockerfile +++ b/src/base-image/build/base-image.dockerfile @@ -38,8 +38,8 @@ RUN apt-get -y update && \ python-dev \ python-pip \ python-mysqldb \ - openjdk-8-jre=8u191-b12-0ubuntu0.16.04.1 \ - openjdk-8-jdk=8u191-b12-0ubuntu0.16.04.1 \ + openjdk-8-jre=8u191-b12-2ubuntu0.16.04.1 \ + openjdk-8-jdk=8u191-b12-2ubuntu0.16.04.1 \ openssh-server \ openssh-client \ git \ diff --git a/src/cleaner/scripts/clean_docker.py b/src/cleaner/scripts/clean_docker.py index 37f3eb7e80..f44996a246 100644 --- a/src/cleaner/scripts/clean_docker.py +++ b/src/cleaner/scripts/clean_docker.py @@ -91,7 +91,7 @@ def kill_largest_container(self): containers.sort(key=lambda x:x[0], reverse=True) if containers.count > 0 and containers[0][0] > 1024**3: - logger.warning("Kill container {0} due to disk pressure. Container size: {1}".format(containers[0][1], containers[0][0])) + self.logger.warning("Kill container {0} due to disk pressure. Container size: {1}".format(containers[0][1], containers[0][0])) subprocess.Popen(["docker", "kill", "--signal=SIGRTMIN+8", containers[0][1]]) # Because docker stop will not immedicately stop container, we can not remove docker image right after stop container diff --git a/src/dev-box/build/dev-box.dockerfile b/src/dev-box/build/dev-box.dockerfile index 8d0e97ff29..b2b2dc13f0 100644 --- a/src/dev-box/build/dev-box.dockerfile +++ b/src/dev-box/build/dev-box.dockerfile @@ -39,8 +39,8 @@ RUN apt-get -y update && \ python-dev \ python-pip \ python-mysqldb \ - openjdk-8-jre=8u191-b12-0ubuntu0.16.04.1 \ - openjdk-8-jdk=8u191-b12-0ubuntu0.16.04.1 \ + openjdk-8-jre=8u191-b12-2ubuntu0.16.04.1 \ + openjdk-8-jdk=8u191-b12-2ubuntu0.16.04.1 \ openssh-server \ openssh-client \ git \ From f0c3c1973bdc46fd399af916358958533060322d Mon Sep 17 00:00:00 2001 From: YundongYe Date: Mon, 11 Feb 2019 16:19:30 +0800 Subject: [PATCH 08/31] [JDK] Remove JDK version hardcode and add print in pai_build for debuging (#2123) --- build/core/build_center.py | 8 +++----- .../k8sPaiLibrary/maintaintool/restart-etcd-server.sh | 1 + paictl.py | 1 - src/base-image/build/base-image.dockerfile | 4 ++-- src/dev-box/build/dev-box.dockerfile | 4 ++-- src/hadoop-ai/build/hadoop-ai | 4 ++-- 6 files changed, 10 insertions(+), 12 deletions(-) diff --git a/build/core/build_center.py b/build/core/build_center.py index 73d89530aa..d1c4f437f1 100644 --- a/build/core/build_center.py +++ b/build/core/build_center.py @@ -23,6 +23,7 @@ import os import sys +import traceback import logging import logging.config @@ -122,8 +123,9 @@ def build_center(self): build_worker.build_single_component(self.graph.services[item]) self.logger.info("Build all components succeed") - except: + except Exception, err: self.logger.error("Build all components failed") + traceback.print_exc() sys.exit(1) finally: @@ -156,7 +158,3 @@ def push_center(self): self.docker_cli.docker_image_tag(image,self.build_config['dockerRegistryInfo']['dockerTag']) self.docker_cli.docker_image_push(image,self.build_config['dockerRegistryInfo']['dockerTag']) self.logger.info("Push image:{0} successfully".format(image)) - - - - diff --git a/deployment/k8sPaiLibrary/maintaintool/restart-etcd-server.sh 
b/deployment/k8sPaiLibrary/maintaintool/restart-etcd-server.sh index 70f6c377e3..34212318f3 100755 --- a/deployment/k8sPaiLibrary/maintaintool/restart-etcd-server.sh +++ b/deployment/k8sPaiLibrary/maintaintool/restart-etcd-server.sh @@ -18,3 +18,4 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. cp etcd-reconfiguration-restart/etcd.yaml /etc/kubernetes/manifests/ + diff --git a/paictl.py b/paictl.py index 1f68602c38..ba05218549 100755 --- a/paictl.py +++ b/paictl.py @@ -99,4 +99,3 @@ def main(args): setup_logging() main(sys.argv[1:]) - diff --git a/src/base-image/build/base-image.dockerfile b/src/base-image/build/base-image.dockerfile index 4ffaef3d3f..8d76018a81 100644 --- a/src/base-image/build/base-image.dockerfile +++ b/src/base-image/build/base-image.dockerfile @@ -38,8 +38,8 @@ RUN apt-get -y update && \ python-dev \ python-pip \ python-mysqldb \ - openjdk-8-jre=8u191-b12-0ubuntu0.16.04.1 \ - openjdk-8-jdk=8u191-b12-0ubuntu0.16.04.1 \ + openjdk-8-jre \ + openjdk-8-jdk \ openssh-server \ openssh-client \ git \ diff --git a/src/dev-box/build/dev-box.dockerfile b/src/dev-box/build/dev-box.dockerfile index 8d0e97ff29..3427011a20 100644 --- a/src/dev-box/build/dev-box.dockerfile +++ b/src/dev-box/build/dev-box.dockerfile @@ -39,8 +39,8 @@ RUN apt-get -y update && \ python-dev \ python-pip \ python-mysqldb \ - openjdk-8-jre=8u191-b12-0ubuntu0.16.04.1 \ - openjdk-8-jdk=8u191-b12-0ubuntu0.16.04.1 \ + openjdk-8-jre \ + openjdk-8-jdk \ openssh-server \ openssh-client \ git \ diff --git a/src/hadoop-ai/build/hadoop-ai b/src/hadoop-ai/build/hadoop-ai index dda11091e1..3a9d0709ce 100644 --- a/src/hadoop-ai/build/hadoop-ai +++ b/src/hadoop-ai/build/hadoop-ai @@ -38,8 +38,8 @@ RUN apt-get -y update && \ python-dev \ python-pip \ python-mysqldb \ - openjdk-8-jre=8u191-b12-0ubuntu0.16.04.1 \ - openjdk-8-jdk=8u191-b12-0ubuntu0.16.04.1 \ + openjdk-8-jre \ + openjdk-8-jdk \ openssh-server \ openssh-client \ git \ From 3398ab19659acaaf5cb53fc7f4146e25afd1b85f Mon Sep 17 00:00:00 2001 From: WangDian Date: Mon, 11 Feb 2019 16:54:16 +0800 Subject: [PATCH 09/31] Change kill signal to 10 (SIGUSR1) --- src/cleaner/scripts/clean_docker.py | 2 +- src/rest-server/src/templates/dockerContainerScript.mustache | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cleaner/scripts/clean_docker.py b/src/cleaner/scripts/clean_docker.py index f44996a246..8e3bcbf4cf 100644 --- a/src/cleaner/scripts/clean_docker.py +++ b/src/cleaner/scripts/clean_docker.py @@ -92,7 +92,7 @@ def kill_largest_container(self): if containers.count > 0 and containers[0][0] > 1024**3: self.logger.warning("Kill container {0} due to disk pressure. 
Container size: {1}".format(containers[0][1], containers[0][0])) - subprocess.Popen(["docker", "kill", "--signal=SIGRTMIN+8", containers[0][1]]) + subprocess.Popen(["docker", "kill", "--signal=10", containers[0][1]]) # Because docker stop will not immedicately stop container, we can not remove docker image right after stop container #container_image = subprocess.Popen(["docker", "inspect", containers[0][1], r"--format='{{.Image}}'"], stdout=subprocess.PIPE).stdout.readline() diff --git a/src/rest-server/src/templates/dockerContainerScript.mustache b/src/rest-server/src/templates/dockerContainerScript.mustache index 8c8c1b3e7e..92f7e69397 100644 --- a/src/rest-server/src/templates/dockerContainerScript.mustache +++ b/src/rest-server/src/templates/dockerContainerScript.mustache @@ -33,13 +33,13 @@ function kill_handler() { printf "%s %s\n" \ "[INFO]" "Docker container killed due to disk pressure. If your job needs large disk space, please use HDFS or NFS to store your data." - exit 1 + exit 1 } set -x PS4="+[\t] " trap exit_handler EXIT -trap kill_handler SIGRTMIN+8 +trap kill_handler 10 touch "/alive/docker_$PAI_CONTAINER_ID" From 6b2842f616b4b1219226a2c007a0fa20ab432ec5 Mon Sep 17 00:00:00 2001 From: YundongYe Date: Mon, 11 Feb 2019 17:27:19 +0800 Subject: [PATCH 10/31] [Pod Eviction] Disable kubernetes's pod eviction (#2124) * According to https://github.com/kubernetes/kubernetes/issues/71661 * add imagegc threshold --- deployment/k8sPaiLibrary/template/kubelet.sh.template | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deployment/k8sPaiLibrary/template/kubelet.sh.template b/deployment/k8sPaiLibrary/template/kubelet.sh.template index 22e6125bb4..8cb2d59b79 100644 --- a/deployment/k8sPaiLibrary/template/kubelet.sh.template +++ b/deployment/k8sPaiLibrary/template/kubelet.sh.template @@ -66,7 +66,9 @@ docker run \ --image-pull-progress-deadline=10m \ --docker-root=${DOCKER_ROOT_DIR_FOR_KUBELET} \ --system-reserved=memory=3Gi \ - --eviction-hard="memory.available<5%,nodefs.available<5%,imagefs.available<5%,nodefs.inodesFree<5%,imagefs.inodesFree<5%" \ + --eviction-hard= \ + --image-gc-high-threshold=100 \ + --image-gc-low-threshold=95 \ --healthz-bind-address="0.0.0.0" \ --healthz-port="10248" \ --feature-gates="DevicePlugins=true,TaintBasedEvictions=true" \ From c7f8261a965cbd371ecb76729b595e107efa2e56 Mon Sep 17 00:00:00 2001 From: WangDian Date: Mon, 11 Feb 2019 18:01:49 +0800 Subject: [PATCH 11/31] Add k8s_POD to white list Add markdown document for cleaner configuration --- src/base-image/build/base-image.dockerfile | 4 +- src/cleaner/config/cleaner.md | 54 ++++++++++++++++++++++ src/cleaner/scripts/clean_docker.py | 2 +- src/dev-box/build/dev-box.dockerfile | 4 +- 4 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 src/cleaner/config/cleaner.md diff --git a/src/base-image/build/base-image.dockerfile b/src/base-image/build/base-image.dockerfile index 42d724e503..8d76018a81 100644 --- a/src/base-image/build/base-image.dockerfile +++ b/src/base-image/build/base-image.dockerfile @@ -38,8 +38,8 @@ RUN apt-get -y update && \ python-dev \ python-pip \ python-mysqldb \ - openjdk-8-jre=8u191-b12-2ubuntu0.16.04.1 \ - openjdk-8-jdk=8u191-b12-2ubuntu0.16.04.1 \ + openjdk-8-jre \ + openjdk-8-jdk \ openssh-server \ openssh-client \ git \ diff --git a/src/cleaner/config/cleaner.md b/src/cleaner/config/cleaner.md new file mode 100644 index 0000000000..4acd44a15b --- /dev/null +++ b/src/cleaner/config/cleaner.md @@ -0,0 +1,54 @@ +## Cleaner section parser + +- 
[Default Configuration](#D_Config)
- [How to Configure](#HT_Config)
- [Generated Configuration](#G_Config)
- [Data Table](#T_config)

#### Default configuration

[cleaner default configuration](cleaner.yaml)

#### How to configure the cleaner section in service-configuration.yaml

All configurations in this section are optional. If you want to customize these values, configure them in service-configuration.yaml.

For example, if you want to use a threshold different from the default value 94, add the following to your service-configuration.yaml:
```yaml
cleaner:
  threshold: new-value
  interval: new-value
```

#### Generated Configuration

After parsing, the object model looks like:
```yaml
cleaner:
  threshold: 94
  interval: 60
```

#### Table

| Data in Configuration File | Data in Cluster Object Model | Data in Jinja2 Template | Data type |
| -------------------------- | ---------------------------- | ----------------------- | --------- |
| cleaner.threshold | com["cleaner"]["threshold"] | cluster_cfg["cleaner"]["threshold"] | Int |
| cleaner.interval | com["cleaner"]["interval"] | cluster_cfg["cleaner"]["interval"] | Int |
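The two settings above drive a simple loop: every `interval` seconds the cleaner samples disk usage and acts once it reaches `threshold`. The sketch below only illustrates that loop; the function names and the `shutil`-based check are illustrative assumptions, while the actual cleaner shells out to `df` and `docker`.

```python
# Illustrative sketch only (not the cleaner's actual code): shows how `threshold`
# and `interval` drive a periodic disk-usage check. The real service shells out to
# `df` and `docker`; shutil is used here just to keep the example self-contained.
import shutil
import time


def disk_usage_percent(path="/"):
    # shutil.disk_usage returns total, used and free bytes for the given path.
    usage = shutil.disk_usage(path)
    return 100.0 * usage.used / usage.total


def run_cleaner(threshold=94, interval=60):
    while True:
        time.sleep(interval)  # wait `interval` seconds between checks
        if disk_usage_percent("/") >= threshold:
            # the real cleaner stops the largest non-whitelisted container here
            print("disk usage reached {0}%, reclaiming space".format(threshold))


if __name__ == "__main__":
    # print a single sample; the service itself would loop via run_cleaner()
    print("current disk usage: {0:.1f}%".format(disk_usage_percent("/")))
```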
diff --git a/src/cleaner/scripts/clean_docker.py b/src/cleaner/scripts/clean_docker.py index 8e3bcbf4cf..d70251686d 100644 --- a/src/cleaner/scripts/clean_docker.py +++ b/src/cleaner/scripts/clean_docker.py @@ -74,7 +74,7 @@ def check_and_clean(self): # Clean logic v1: kill largest container - white_list = ["k8s_kube", "k8s_pylon", "k8s_zookeeper", "k8s_rest-server", "k8s_yarn", "k8s_hadoop", "k8s_job-exporter", "k8s_watchdog", "k8s_grafana", "k8s_node-exporter", "k8s_webportal", "k8s_prometheus", "k8s_nvidia-drivers", "k8s_etcd-container", "k8s_apiserver-container", "k8s_docker-cleaner", "kubelet", "dev-box"] + white_list = ["k8s_POD", "k8s_kube", "k8s_pylon", "k8s_zookeeper", "k8s_rest-server", "k8s_yarn", "k8s_hadoop", "k8s_job-exporter", "k8s_watchdog", "k8s_grafana", "k8s_node-exporter", "k8s_webportal", "k8s_prometheus", "k8s_nvidia-drivers", "k8s_etcd-container", "k8s_apiserver-container", "k8s_docker-cleaner", "kubelet", "dev-box"] def kill_largest_container(self): containers = [] # Only try to stop PAI jobs and user created containers diff --git a/src/dev-box/build/dev-box.dockerfile b/src/dev-box/build/dev-box.dockerfile index b2b2dc13f0..b1e22e4047 100644 --- a/src/dev-box/build/dev-box.dockerfile +++ b/src/dev-box/build/dev-box.dockerfile @@ -39,8 +39,8 @@ RUN apt-get -y update && \ python-dev \ python-pip \ python-mysqldb \ - openjdk-8-jre=8u191-b12-2ubuntu0.16.04.1 \ - openjdk-8-jdk=8u191-b12-2ubuntu0.16.04.1 \ + openjdk-8-jre \ + openjdk-8-jdk1 \ openssh-server \ openssh-client \ git \ From 1f05b00444c568d8c92d1971e7b0a90d5acab24d Mon Sep 17 00:00:00 2001 From: WangDian Date: Mon, 11 Feb 2019 18:03:25 +0800 Subject: [PATCH 12/31] Minor bug fix --- src/dev-box/build/dev-box.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dev-box/build/dev-box.dockerfile b/src/dev-box/build/dev-box.dockerfile index b1e22e4047..3427011a20 100644 --- a/src/dev-box/build/dev-box.dockerfile +++ b/src/dev-box/build/dev-box.dockerfile @@ -40,7 +40,7 @@ RUN apt-get -y update && \ python-pip \ python-mysqldb \ openjdk-8-jre \ - openjdk-8-jdk1 \ + openjdk-8-jdk \ openssh-server \ openssh-client \ git \ From d45083d0902691a449b2e8b2c22390d3b0540380 Mon Sep 17 00:00:00 2001 From: WangDian Date: Thu, 31 Jan 2019 18:25:20 +0800 Subject: [PATCH 13/31] Cleaner logic --- .gitignore | 4 + .../services-configuration.yaml.template | 4 + src/cleaner/cleaner_main.py | 25 ++---- src/cleaner/config/cleaner.py | 22 +++-- src/cleaner/config/cleaner.yaml | 1 + src/cleaner/deploy/cleaner.yaml.template | 35 ++------ src/cleaner/scripts/clean_docker.py | 85 +++++++++++++++++++ .../scripts/reclaimable_docker_cache.sh | 6 +- src/cleaner/test/cleaner-test-job.json | 22 +++++ src/job-exporter/src/collector.py | 3 +- .../templates/dockerContainerScript.mustache | 13 +-- 11 files changed, 158 insertions(+), 62 deletions(-) create mode 100644 src/cleaner/scripts/clean_docker.py create mode 100644 src/cleaner/test/cleaner-test-job.json diff --git a/.gitignore b/.gitignore index 30c060ec47..37c834877e 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,7 @@ __pycache__ *.swp *.swo .DS_Store +*.sln +*.pyproj.user +*.pyproj +*.vs diff --git a/deployment/quick-start/services-configuration.yaml.template b/deployment/quick-start/services-configuration.yaml.template index 842e455214..295bb62793 100644 --- a/deployment/quick-start/services-configuration.yaml.template +++ b/deployment/quick-start/services-configuration.yaml.template @@ -128,3 +128,7 @@ rest-server: # uncomment following section if you 
want to customize the port of pylon # pylon: # port: 80 + +# uncomment following section if you want to customize the docker path of cleaner +# cleaner: +# threshold: 80 \ No newline at end of file diff --git a/src/cleaner/cleaner_main.py b/src/cleaner/cleaner_main.py index d4e8806d7e..53b3091bf9 100644 --- a/src/cleaner/cleaner_main.py +++ b/src/cleaner/cleaner_main.py @@ -19,7 +19,7 @@ import argparse import os from datetime import timedelta -from cleaner.scripts import clean_docker_cache, check_deleted_files +from cleaner.scripts import clean_docker_cache, check_deleted_files, clean_docker from cleaner.worker import Worker from cleaner.utils.logger import LoggerMixin from cleaner.utils import common @@ -76,31 +76,20 @@ def sync(self): time.sleep(1) -def get_worker(arg): - if arg == "docker_cache": - worker = Worker(clean_docker_cache.check_and_clean, 50, timeout=timedelta(minutes=10), cool_down_time=1800) - elif arg == "deleted_files": - worker = Worker(check_deleted_files.list_and_check_files, None, timeout=timedelta(minutes=10), cool_down_time=1800) - else: - raise ValueError("arguments %s is not supported.", arg) - return worker - - -liveness_files = { - "docker_cache": "docker-cache-cleaner-healthy", - "deleted_files": "deleted-files-cleaner-healthy" -} +def get_worker(threshold): + worker = Worker(clean_docker.check_and_clean, threshold, timeout=timedelta(minutes=10), cool_down_time=60) + return worker; def main(): parser = argparse.ArgumentParser() - parser.add_argument("option", help="the functions currently supported: [docker_cache | deleted_files]") + parser.add_argument("-t", "--threshold", help="the disk usage precent to start cleaner") args = parser.parse_args() common.setup_logging() - cleaner = Cleaner(liveness_files[args.option]) - cleaner.add_worker(args.option, get_worker(args.option)) + cleaner = Cleaner("docker-cleaner-healthy") + cleaner.add_worker("docker-cleaner", get_worker(args.threshold)) cleaner.start() cleaner.sync() diff --git a/src/cleaner/config/cleaner.py b/src/cleaner/config/cleaner.py index 71c8050638..6c6245d305 100644 --- a/src/cleaner/config/cleaner.py +++ b/src/cleaner/config/cleaner.py @@ -22,19 +22,25 @@ class Cleaner: - def __init__(self, cluster_configuration, service_configuration, default_service_configuraiton): + def __init__(self, cluster_conf, service_conf, default_service_conf): self.logger = logging.getLogger(__name__) - - self.cluster_configuration = cluster_configuration + self.cluster_conf = cluster_conf + self.service_conf = service_conf + self.default_service_conf = default_service_conf def validation_pre(self): return True, None def run(self): - com = {} - - return com - - def validation_post(self, cluster_object_model): + result = copy.deepcopy(self.default_service_conf) + result.update(self.service_conf) + return result + + def validation_post(self, conf): + threshold = conf["cleaner"].get("threshold") + if type(threshold) != int: + msg = "expect threshold in cleaner to be int but get %s with type %s" % \ + (threshold, type(threshold)) + return False, msg return True, None diff --git a/src/cleaner/config/cleaner.yaml b/src/cleaner/config/cleaner.yaml index dbde8d9a14..4684ccd1e8 100644 --- a/src/cleaner/config/cleaner.yaml +++ b/src/cleaner/config/cleaner.yaml @@ -15,3 +15,4 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+threshold: 80 \ No newline at end of file diff --git a/src/cleaner/deploy/cleaner.yaml.template b/src/cleaner/deploy/cleaner.yaml.template index 147729677b..ba7d4811ae 100644 --- a/src/cleaner/deploy/cleaner.yaml.template +++ b/src/cleaner/deploy/cleaner.yaml.template @@ -31,42 +31,16 @@ spec: hostPID: true hostNetwork: true containers: - - name: docker-cache-cleaner + - name: docker-cleaner image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}cleaner:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }} args: - - 'docker_cache' + - -t {{ cluster_cfg["cleaner"]["threshold"] }} imagePullPolicy: Always securityContext: privileged: True volumeMounts: - mountPath: /var/run/docker.sock name: docker-socket - livenessProbe: - exec: - command: - - test - - '`find /tmp/docker-cache-cleaner-healthy -mmin -1`' - initialDelaySeconds: 60 - periodSeconds: 30 - {%- if cluster_cfg['cluster']['common']['qos-switch'] == "true" %} - resources: - limits: - memory: "1Gi" - {%- endif %} - - name: deleted-files-cleaner - image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}cleaner:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }} - args: - - 'deleted_files' - imagePullPolicy: Always - securityContext: - privileged: True - livenessProbe: - exec: - command: - - test - - '`find /tmp/deleted-files-cleaner-healthy -mmin -1`' - initialDelaySeconds: 60 - periodSeconds: 30 {%- if cluster_cfg['cluster']['common']['qos-switch'] == "true" %} resources: limits: @@ -78,3 +52,8 @@ spec: - name: docker-socket hostPath: path: /var/run/docker.sock + tolerations: + - key: node.kubernetes.io/memory-pressure + operator: "Exists" + - key: node.kubernetes.io/disk-pressure + operator: "Exists" \ No newline at end of file diff --git a/src/cleaner/scripts/clean_docker.py b/src/cleaner/scripts/clean_docker.py new file mode 100644 index 0000000000..fd7a4e2ab7 --- /dev/null +++ b/src/cleaner/scripts/clean_docker.py @@ -0,0 +1,85 @@ +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +from cleaner.utils import common +import subprocess +import multiprocessing +import re + +logger = multiprocessing.get_logger() + + +def check_disk_usage(partition): + df = subprocess.Popen(["df","-h", partition], stdout=subprocess.PIPE) + size = 0 + try: + for line in df.stdout: + splitline = line.decode().split() + if splitline[5] == partition: + size = int(splitline[4][:-1]) + except ValueError: + logger.error("cannot get disk size, reset size to 0") + size = 0 + logger.info("Checking disk, disk usage = {0}%".format(size)) + return size + + +def check_and_clean(threshold): + if check_disk_usage("/") > int(threshold): + logger.info("Disk usage is above {0}%, Try to remove containers".format(threshold)) + kill_largest_container() + + +# Clean logic v1: kill largest container +white_list = ["k8s_kube", "k8s_pylon", "k8s_zookeeper", "k8s_rest-server", "k8s_yarn", "k8s_hadoop", "k8s_job-exporter", "k8s_watchdog", "k8s_grafana", "k8s_node-exporter", "k8s_webportal", "k8s_prometheus", "k8s_nvidia-drivers", "k8s_etcd-container", "k8s_apiserver-container", "k8s_docker-cleaner", "kubelet"] +def kill_largest_container(): + containers = [] + # Only try to stop PAI jobs and user created containers + containers_source = subprocess.Popen(["docker", "ps", "-a", "--format", r'{{.ID}}\t{{.Image}}\t{{.Size}}\t{{.Names}}'], stdout=subprocess.PIPE) + for line in containers_source.stdout: + splitline = line.split("\t") + ignore = False + for prefix in white_list: + if (splitline[3].startswith(prefix)): + ignore = True + break + if ignore == False: + size = calculate_size(splitline[2].split()[0]) + containers.append([size, splitline[0], splitline[1]]) + + containers.sort(key=lambda x:x[0], reverse=True) + + if containers.count > 0 and containers[0][0] > 1024**3: + logger.warning("Kill container {0} due to disk pressure. Container size: {1}".format(containers[0][1], containers[0][0])) + subprocess.Popen(["docker", "container", "stop", containers[0][1]]) + + # Because docker stop will not immedicately stop container, we can not remove docker image right after stop container + #container_image = subprocess.Popen(["docker", "inspect", containers[0][1], r"--format='{{.Image}}'"], stdout=subprocess.PIPE).stdout.readline() + #subprocess.Popen(["docker", "image", "rmi", container_image]) + return True + else: + return False + +size_defs={'B':1, 'K':1024, 'M':1024**2, 'G':1024**3, 'T':1024**4, 'b':1, 'k':1024, 'm':1024**2, 'g':1024**3, 't':1024**4} +def calculate_size(size_str): + size_search = re.search(r"[BbKkMmGgTt]", size_str) + return float(size_str[0:size_search.start()]) * size_defs[size_search.group()] + + +if __name__ == "__main__": + common.setup_logging() + check_and_clean(60) \ No newline at end of file diff --git a/src/cleaner/scripts/reclaimable_docker_cache.sh b/src/cleaner/scripts/reclaimable_docker_cache.sh index baa85db0f2..a907394d0e 100644 --- a/src/cleaner/scripts/reclaimable_docker_cache.sh +++ b/src/cleaner/scripts/reclaimable_docker_cache.sh @@ -28,14 +28,16 @@ # # We summer up the result in column 5 (RECLAIMABLE) and return the size in gigabytes. 
-docker system df | \ +docker system df --format "{{.Reclaimable}}" | \ gawk 'BEGIN {s=0} END {print s} - match($5, /([0-9]+\.?[0-9]*)(M|G|B)/, a) { + match($1, /([0-9]+\.?[0-9]*)(M|G|B|T)/, a) { if(a[2] == "M") s += a[1]/1024; else if(a[2] == "B") s += a[1]/1024/1024; + else if(a[2] == "T") + s += a[1]*1024; else s += a[1]; }' diff --git a/src/cleaner/test/cleaner-test-job.json b/src/cleaner/test/cleaner-test-job.json new file mode 100644 index 0000000000..c71a71e375 --- /dev/null +++ b/src/cleaner/test/cleaner-test-job.json @@ -0,0 +1,22 @@ +{ + "jobName": "cleaner-test-job", + "image": "ubuntu", + "authFile": "", + "dataDir": "", + "outputDir": "", + "codeDir": "", + "virtualCluster": "default", + "gpuType": "", + "retryCount": 0, + "taskRoles": [ + { + "name": "main", + "taskNumber": 1, + "cpuNumber": 4, + "memoryMB": 8192, + "gpuNumber": 0, + "command": "fallocate -l 200G \"fake_base\"; for var in {1..100}; do fallocate -l 1G \"fake$var\"; sleep 5; done" + } + ], + "jobEnvs": {} +} \ No newline at end of file diff --git a/src/job-exporter/src/collector.py b/src/job-exporter/src/collector.py index 2df34dc29d..6e85e68fa9 100644 --- a/src/job-exporter/src/collector.py +++ b/src/job-exporter/src/collector.py @@ -331,7 +331,8 @@ class ContainerCollector(Collector): "node-exporter", "job-exporter", "yarn-exporter", - "nvidia-drivers" + "nvidia-drivers", + "docker-cleaner" ])) def __init__(self, name, sleep_time, atomic_ref, iteration_counter, gpu_info_ref, diff --git a/src/rest-server/src/templates/dockerContainerScript.mustache b/src/rest-server/src/templates/dockerContainerScript.mustache index 753e768163..83bfde7b06 100644 --- a/src/rest-server/src/templates/dockerContainerScript.mustache +++ b/src/rest-server/src/templates/dockerContainerScript.mustache @@ -29,10 +29,17 @@ function exit_handler() "[DEBUG]" "Docker container exit handler: EXIT signal received in docker container, exiting ..." } +function kill_handler() +{ + printf "%s %s\n" \ + "[DEBUG]" "Docker container kill handler: SIGTERM signal received in docker container, exiting ..." 
+ exit 0 +} + set -x PS4="+[\t] " trap exit_handler EXIT - +trap kill_handler SIGTERM touch "/alive/docker_$PAI_CONTAINER_ID" @@ -44,7 +51,6 @@ HDFS_LAUNCHER_PREFIX=$PAI_DEFAULT_FS_URI/Container export CLASSPATH="$(hadoop classpath --glob)" task_role_no={{ idx }} - printf "%s %s\n%s\n\n" "[INFO]" "ENV" "$(printenv | sort)" cp -r /pai/code/* ./ @@ -101,7 +107,6 @@ function get_ssh_key_files() info_source="webhdfs" localKeyPath=/root/.ssh/{{ jobData.jobName }}.pub localPrivateKeyPath=/root/.ssh/{{ jobData.jobName }} - if [[ -f $localKeyPath ]]; then rm -f $localKeyPath fi @@ -164,14 +169,12 @@ function azure_rdma_preparation() {{# paiMachineList }} echo {{hostip}} {{hostname}} >> /hosts.tmp {{/ paiMachineList }} - cat /hosts.tmp > /etc/hosts } azure_rdma_preparation {{/ azRDMA }} {{/ reqAzRDMA }} - # Write env to system-wide environment env | grep -E "^PAI|PATH|PREFIX|JAVA|HADOOP|NVIDIA|CUDA" > /etc/environment From 467e4dde512cb1851bfb91d1e72ef83c9effb4a5 Mon Sep 17 00:00:00 2001 From: WangDian Date: Thu, 31 Jan 2019 18:32:42 +0800 Subject: [PATCH 14/31] Fix bug --- src/cleaner/config/cleaner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cleaner/config/cleaner.py b/src/cleaner/config/cleaner.py index 6c6245d305..dafa3221d0 100644 --- a/src/cleaner/config/cleaner.py +++ b/src/cleaner/config/cleaner.py @@ -18,9 +18,9 @@ import logging import logging.config +import copy - -class Cleaner: +class Cleaner(object): def __init__(self, cluster_conf, service_conf, default_service_conf): self.logger = logging.getLogger(__name__) From 02907703e7ec46200db7476b9f1be04af3ea7707 Mon Sep 17 00:00:00 2001 From: WangDian Date: Sun, 3 Feb 2019 14:04:20 +0800 Subject: [PATCH 15/31] 1. Remove worker 2. Add interval as var 3. Kill docker, send signal --- .../services-configuration.yaml.template | 5 +- .../services-configuration.yaml | 5 + src/cleaner/cleaner_main.py | 9 +- src/cleaner/config/cleaner.py | 11 ++ src/cleaner/config/cleaner.yaml | 3 +- src/cleaner/deploy/cleaner.yaml.template | 1 + src/cleaner/scripts/clean_docker.py | 120 ++++++++++-------- src/cleaner/utils/common.py | 6 + .../templates/dockerContainerScript.mustache | 6 +- 9 files changed, 104 insertions(+), 62 deletions(-) diff --git a/deployment/quick-start/services-configuration.yaml.template b/deployment/quick-start/services-configuration.yaml.template index 295bb62793..81f0993f70 100644 --- a/deployment/quick-start/services-configuration.yaml.template +++ b/deployment/quick-start/services-configuration.yaml.template @@ -129,6 +129,7 @@ rest-server: # pylon: # port: 80 -# uncomment following section if you want to customize the docker path of cleaner +# uncomment following section if you want to customize the threshold of cleaner # cleaner: -# threshold: 80 \ No newline at end of file +# threshold: 94 +# interval: 60 \ No newline at end of file diff --git a/examples/cluster-configuration/services-configuration.yaml b/examples/cluster-configuration/services-configuration.yaml index 8e22bdc8d0..2c6fed8bba 100644 --- a/examples/cluster-configuration/services-configuration.yaml +++ b/examples/cluster-configuration/services-configuration.yaml @@ -128,3 +128,8 @@ rest-server: # uncomment following section if you want to customize the port of pylon # pylon: # port: 80 + +# uncomment following section if you want to customize the threshold of cleaner +# cleaner: +# threshold: 94 +# interval: 60 \ No newline at end of file diff --git a/src/cleaner/cleaner_main.py b/src/cleaner/cleaner_main.py index 53b3091bf9..2af78875ff 
100644 --- a/src/cleaner/cleaner_main.py +++ b/src/cleaner/cleaner_main.py @@ -19,7 +19,7 @@ import argparse import os from datetime import timedelta -from cleaner.scripts import clean_docker_cache, check_deleted_files, clean_docker +from cleaner.scripts.clean_docker import DockerCleaner from cleaner.worker import Worker from cleaner.utils.logger import LoggerMixin from cleaner.utils import common @@ -84,14 +84,13 @@ def get_worker(threshold): def main(): parser = argparse.ArgumentParser() parser.add_argument("-t", "--threshold", help="the disk usage precent to start cleaner") + parser.add_argument("-i", "--interval", help="the base interval to check disk usage") args = parser.parse_args() common.setup_logging() - cleaner = Cleaner("docker-cleaner-healthy") - cleaner.add_worker("docker-cleaner", get_worker(args.threshold)) - cleaner.start() - cleaner.sync() + cleaner = DockerCleaner(args.threshold, args.interval, timedelta(minutes=10)) + cleaner.run() if __name__ == "__main__": diff --git a/src/cleaner/config/cleaner.py b/src/cleaner/config/cleaner.py index dafa3221d0..2eccd24f8a 100644 --- a/src/cleaner/config/cleaner.py +++ b/src/cleaner/config/cleaner.py @@ -42,5 +42,16 @@ def validation_post(self, conf): msg = "expect threshold in cleaner to be int but get %s with type %s" % \ (threshold, type(threshold)) return False, msg + else: + if threshold < 0 or threshold > 100: + msg = "expect threshold in [0, 100]" + return False, msg + + interval = conf["cleaner"].get("interval") + if type(interval) != int: + msg = "expect interval in cleaner to be int but get %s with type %s" % \ + (interval, type(interval)) + return False, msg + return True, None diff --git a/src/cleaner/config/cleaner.yaml b/src/cleaner/config/cleaner.yaml index 4684ccd1e8..4f9a94e88b 100644 --- a/src/cleaner/config/cleaner.yaml +++ b/src/cleaner/config/cleaner.yaml @@ -15,4 +15,5 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -threshold: 80 \ No newline at end of file +threshold: 94 +interval: 60 \ No newline at end of file diff --git a/src/cleaner/deploy/cleaner.yaml.template b/src/cleaner/deploy/cleaner.yaml.template index ba7d4811ae..4c568bf3de 100644 --- a/src/cleaner/deploy/cleaner.yaml.template +++ b/src/cleaner/deploy/cleaner.yaml.template @@ -35,6 +35,7 @@ spec: image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}cleaner:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }} args: - -t {{ cluster_cfg["cleaner"]["threshold"] }} + - -i {{ cluster_cfg["cleaner"]["interval"] }} imagePullPolicy: Always securityContext: privileged: True diff --git a/src/cleaner/scripts/clean_docker.py b/src/cleaner/scripts/clean_docker.py index fd7a4e2ab7..6d743b9ffa 100644 --- a/src/cleaner/scripts/clean_docker.py +++ b/src/cleaner/scripts/clean_docker.py @@ -15,71 +15,89 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+from cleaner.utils.logger import LoggerMixin +from cleaner.utils.timer import CountdownTimer, Timeout from cleaner.utils import common +from datetime import timedelta import subprocess import multiprocessing import re +import time -logger = multiprocessing.get_logger() +class DockerCleaner(LoggerMixin): + def __init__(self, threshold, interval, timeout=timedelta(hours=1)): + self.__threshold = int(threshold) + self.__interval = int(interval) + self.__timeout = timeout + def _exec(self): + exc = None + try: + with CountdownTimer(duration=self.__timeout): + self.check_and_clean(self.__threshold, self.__timespan) + except Timeout as e: + self.logger.error("Cleaner timeout.") + exc = e + except Exception as e: + self.logger.error("Unexpected error to run cleaner.") + exc = e -def check_disk_usage(partition): - df = subprocess.Popen(["df","-h", partition], stdout=subprocess.PIPE) - size = 0 - try: - for line in df.stdout: - splitline = line.decode().split() - if splitline[5] == partition: - size = int(splitline[4][:-1]) - except ValueError: - logger.error("cannot get disk size, reset size to 0") - size = 0 - logger.info("Checking disk, disk usage = {0}%".format(size)) - return size + if exc is not None: + self.logger.exception(exc) + + def run(self): + while True: + # allow a delay before the cleaning + time.sleep(self.__timespan) + self._exec() -def check_and_clean(threshold): - if check_disk_usage("/") > int(threshold): - logger.info("Disk usage is above {0}%, Try to remove containers".format(threshold)) - kill_largest_container() + def check_disk_usage(self, partition): + df = subprocess.Popen(["df","-h", partition], stdout=subprocess.PIPE) + size = 0 + try: + for line in df.stdout: + splitline = line.decode().split() + if splitline[5] == partition: + size = int(splitline[4][:-1]) + except ValueError: + self.logger.error("cannot get disk size, reset size to 0") + size = 0 + self.logger.info("Checking disk, disk usage = {0}%".format(size)) + return size -# Clean logic v1: kill largest container -white_list = ["k8s_kube", "k8s_pylon", "k8s_zookeeper", "k8s_rest-server", "k8s_yarn", "k8s_hadoop", "k8s_job-exporter", "k8s_watchdog", "k8s_grafana", "k8s_node-exporter", "k8s_webportal", "k8s_prometheus", "k8s_nvidia-drivers", "k8s_etcd-container", "k8s_apiserver-container", "k8s_docker-cleaner", "kubelet"] -def kill_largest_container(): - containers = [] - # Only try to stop PAI jobs and user created containers - containers_source = subprocess.Popen(["docker", "ps", "-a", "--format", r'{{.ID}}\t{{.Image}}\t{{.Size}}\t{{.Names}}'], stdout=subprocess.PIPE) - for line in containers_source.stdout: - splitline = line.split("\t") - ignore = False - for prefix in white_list: - if (splitline[3].startswith(prefix)): - ignore = True - break - if ignore == False: - size = calculate_size(splitline[2].split()[0]) - containers.append([size, splitline[0], splitline[1]]) + def check_and_clean(self, threshold): + if self.check_disk_usage("/") > self.___threshold: + self.logger.info("Disk usage is above {0}%, Try to remove containers".format(self.__threshold)) + self.kill_largest_container() - containers.sort(key=lambda x:x[0], reverse=True) - if containers.count > 0 and containers[0][0] > 1024**3: - logger.warning("Kill container {0} due to disk pressure. 
Container size: {1}".format(containers[0][1], containers[0][0])) - subprocess.Popen(["docker", "container", "stop", containers[0][1]]) + # Clean logic v1: kill largest container + white_list = ["k8s_kube", "k8s_pylon", "k8s_zookeeper", "k8s_rest-server", "k8s_yarn", "k8s_hadoop", "k8s_job-exporter", "k8s_watchdog", "k8s_grafana", "k8s_node-exporter", "k8s_webportal", "k8s_prometheus", "k8s_nvidia-drivers", "k8s_etcd-container", "k8s_apiserver-container", "k8s_docker-cleaner", "kubelet", "dev-box"] + def kill_largest_container(self): + containers = [] + # Only try to stop PAI jobs and user created containers + containers_source = subprocess.Popen(["docker", "ps", "-a", "--format", r'{{.ID}}\t{{.Image}}\t{{.Size}}\t{{.Names}}'], stdout=subprocess.PIPE) + for line in containers_source.stdout: + splitline = line.split("\t") + for prefix in white_list: + if (splitline[3].startswith(prefix)): + break + else: + size = calculate_size(splitline[2].split()[0]) + containers.append([size, splitline[0], splitline[1]]) - # Because docker stop will not immedicately stop container, we can not remove docker image right after stop container - #container_image = subprocess.Popen(["docker", "inspect", containers[0][1], r"--format='{{.Image}}'"], stdout=subprocess.PIPE).stdout.readline() - #subprocess.Popen(["docker", "image", "rmi", container_image]) - return True - else: - return False + containers.sort(key=lambda x:x[0], reverse=True) -size_defs={'B':1, 'K':1024, 'M':1024**2, 'G':1024**3, 'T':1024**4, 'b':1, 'k':1024, 'm':1024**2, 'g':1024**3, 't':1024**4} -def calculate_size(size_str): - size_search = re.search(r"[BbKkMmGgTt]", size_str) - return float(size_str[0:size_search.start()]) * size_defs[size_search.group()] + if containers.count > 0 and containers[0][0] > 1024**3: + logger.warning("Kill container {0} due to disk pressure. 
Container size: {1}".format(containers[0][1], containers[0][0])) + subprocess.Popen(["docker", "kill", "--signal=SIGRTMIN+8", containers[0][1]]) + # Because docker stop will not immedicately stop container, we can not remove docker image right after stop container + #container_image = subprocess.Popen(["docker", "inspect", containers[0][1], r"--format='{{.Image}}'"], stdout=subprocess.PIPE).stdout.readline() + #subprocess.Popen(["docker", "image", "rmi", container_image]) + return True + else: + return False -if __name__ == "__main__": - common.setup_logging() - check_and_clean(60) \ No newline at end of file diff --git a/src/cleaner/utils/common.py b/src/cleaner/utils/common.py index 70b98942e7..d09c8bb061 100644 --- a/src/cleaner/utils/common.py +++ b/src/cleaner/utils/common.py @@ -129,3 +129,9 @@ def setup_logging(): handler.setFormatter(formatter) logger.addHandler(handler) logger.setLevel(logging.INFO) + + +size_defs={'B':1, 'K':1024, 'M':1024**2, 'G':1024**3, 'T':1024**4, 'b':1, 'k':1024, 'm':1024**2, 'g':1024**3, 't':1024**4} +def calculate_size(size_str): + size_search = re.search(r"[BbKkMmGgTt]", size_str) + return float(size_str[0:size_search.start()]) * size_defs[size_search.group()] \ No newline at end of file diff --git a/src/rest-server/src/templates/dockerContainerScript.mustache b/src/rest-server/src/templates/dockerContainerScript.mustache index 83bfde7b06..8c8c1b3e7e 100644 --- a/src/rest-server/src/templates/dockerContainerScript.mustache +++ b/src/rest-server/src/templates/dockerContainerScript.mustache @@ -32,14 +32,14 @@ function exit_handler() function kill_handler() { printf "%s %s\n" \ - "[DEBUG]" "Docker container kill handler: SIGTERM signal received in docker container, exiting ..." - exit 0 + "[INFO]" "Docker container killed due to disk pressure. If your job needs large disk space, please use HDFS or NFS to store your data." 
+ exit 1 } set -x PS4="+[\t] " trap exit_handler EXIT -trap kill_handler SIGTERM +trap kill_handler SIGRTMIN+8 touch "/alive/docker_$PAI_CONTAINER_ID" From b64da73bce326dedd39bffada0e18e16c35d40ce Mon Sep 17 00:00:00 2001 From: WangDian Date: Sun, 3 Feb 2019 14:40:59 +0800 Subject: [PATCH 16/31] Fix '\t' --- src/cleaner/deploy/cleaner.yaml.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cleaner/deploy/cleaner.yaml.template b/src/cleaner/deploy/cleaner.yaml.template index 4c568bf3de..0eb259f576 100644 --- a/src/cleaner/deploy/cleaner.yaml.template +++ b/src/cleaner/deploy/cleaner.yaml.template @@ -35,7 +35,7 @@ spec: image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}cleaner:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }} args: - -t {{ cluster_cfg["cleaner"]["threshold"] }} - - -i {{ cluster_cfg["cleaner"]["interval"] }} + - -i {{ cluster_cfg["cleaner"]["interval"] }} imagePullPolicy: Always securityContext: privileged: True From b6e2d04077ece9bca4b434293873a8d64772d2e6 Mon Sep 17 00:00:00 2001 From: WangDian Date: Sun, 3 Feb 2019 15:41:33 +0800 Subject: [PATCH 17/31] Bug fix --- src/cleaner/scripts/clean_docker.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cleaner/scripts/clean_docker.py b/src/cleaner/scripts/clean_docker.py index 6d743b9ffa..37f3eb7e80 100644 --- a/src/cleaner/scripts/clean_docker.py +++ b/src/cleaner/scripts/clean_docker.py @@ -34,7 +34,7 @@ def _exec(self): exc = None try: with CountdownTimer(duration=self.__timeout): - self.check_and_clean(self.__threshold, self.__timespan) + self.check_and_clean() except Timeout as e: self.logger.error("Cleaner timeout.") exc = e @@ -48,7 +48,7 @@ def _exec(self): def run(self): while True: # allow a delay before the cleaning - time.sleep(self.__timespan) + time.sleep(self.__interval) self._exec() @@ -67,8 +67,8 @@ def check_disk_usage(self, partition): return size - def check_and_clean(self, threshold): - if self.check_disk_usage("/") > self.___threshold: + def check_and_clean(self): + if self.check_disk_usage("/") >= self.__threshold: self.logger.info("Disk usage is above {0}%, Try to remove containers".format(self.__threshold)) self.kill_largest_container() @@ -81,11 +81,11 @@ def kill_largest_container(self): containers_source = subprocess.Popen(["docker", "ps", "-a", "--format", r'{{.ID}}\t{{.Image}}\t{{.Size}}\t{{.Names}}'], stdout=subprocess.PIPE) for line in containers_source.stdout: splitline = line.split("\t") - for prefix in white_list: + for prefix in self.white_list: if (splitline[3].startswith(prefix)): break else: - size = calculate_size(splitline[2].split()[0]) + size = common.calculate_size(splitline[2].split()[0]) containers.append([size, splitline[0], splitline[1]]) containers.sort(key=lambda x:x[0], reverse=True) From bd6dc38291dc63fb861b6ff5512ce6b27bb9ac11 Mon Sep 17 00:00:00 2001 From: WangDian Date: Sun, 3 Feb 2019 15:50:38 +0800 Subject: [PATCH 18/31] Import re in common --- src/cleaner/utils/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cleaner/utils/common.py b/src/cleaner/utils/common.py index d09c8bb061..875d74c097 100644 --- a/src/cleaner/utils/common.py +++ b/src/cleaner/utils/common.py @@ -22,7 +22,7 @@ import os import psutil import signal - +import re def kill_process_tree(pid, time_to_die, logger): """ From 43598a093eba61e79d843fb5cb8d838a2025d1df Mon Sep 17 00:00:00 2001 From: WangDian Date: Mon, 11 Feb 2019 13:41:24 +0800 Subject: [PATCH 19/31] Update openjdk build version 
--- src/cleaner/scripts/clean_docker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cleaner/scripts/clean_docker.py b/src/cleaner/scripts/clean_docker.py index 37f3eb7e80..f44996a246 100644 --- a/src/cleaner/scripts/clean_docker.py +++ b/src/cleaner/scripts/clean_docker.py @@ -91,7 +91,7 @@ def kill_largest_container(self): containers.sort(key=lambda x:x[0], reverse=True) if containers.count > 0 and containers[0][0] > 1024**3: - logger.warning("Kill container {0} due to disk pressure. Container size: {1}".format(containers[0][1], containers[0][0])) + self.logger.warning("Kill container {0} due to disk pressure. Container size: {1}".format(containers[0][1], containers[0][0])) subprocess.Popen(["docker", "kill", "--signal=SIGRTMIN+8", containers[0][1]]) # Because docker stop will not immedicately stop container, we can not remove docker image right after stop container From ad36c98d1e9945c30dac4afa16648457c69f5838 Mon Sep 17 00:00:00 2001 From: WangDian Date: Mon, 11 Feb 2019 16:54:16 +0800 Subject: [PATCH 20/31] Change kill signal to 10 (SIGUSR1) --- src/cleaner/scripts/clean_docker.py | 2 +- src/rest-server/src/templates/dockerContainerScript.mustache | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cleaner/scripts/clean_docker.py b/src/cleaner/scripts/clean_docker.py index f44996a246..8e3bcbf4cf 100644 --- a/src/cleaner/scripts/clean_docker.py +++ b/src/cleaner/scripts/clean_docker.py @@ -92,7 +92,7 @@ def kill_largest_container(self): if containers.count > 0 and containers[0][0] > 1024**3: self.logger.warning("Kill container {0} due to disk pressure. Container size: {1}".format(containers[0][1], containers[0][0])) - subprocess.Popen(["docker", "kill", "--signal=SIGRTMIN+8", containers[0][1]]) + subprocess.Popen(["docker", "kill", "--signal=10", containers[0][1]]) # Because docker stop will not immedicately stop container, we can not remove docker image right after stop container #container_image = subprocess.Popen(["docker", "inspect", containers[0][1], r"--format='{{.Image}}'"], stdout=subprocess.PIPE).stdout.readline() diff --git a/src/rest-server/src/templates/dockerContainerScript.mustache b/src/rest-server/src/templates/dockerContainerScript.mustache index 8c8c1b3e7e..92f7e69397 100644 --- a/src/rest-server/src/templates/dockerContainerScript.mustache +++ b/src/rest-server/src/templates/dockerContainerScript.mustache @@ -33,13 +33,13 @@ function kill_handler() { printf "%s %s\n" \ "[INFO]" "Docker container killed due to disk pressure. If your job needs large disk space, please use HDFS or NFS to store your data." 
- exit 1 + exit 1 } set -x PS4="+[\t] " trap exit_handler EXIT -trap kill_handler SIGRTMIN+8 +trap kill_handler 10 touch "/alive/docker_$PAI_CONTAINER_ID" From 130f811cd084585a37eb2913e2bb8d291c52fae1 Mon Sep 17 00:00:00 2001 From: WangDian Date: Mon, 11 Feb 2019 18:01:49 +0800 Subject: [PATCH 21/31] Add k8s_POD to white list Add markdown document for cleaner configuration --- src/cleaner/config/cleaner.md | 54 +++++++++++++++++++++++++++++ src/cleaner/scripts/clean_docker.py | 2 +- 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 src/cleaner/config/cleaner.md diff --git a/src/cleaner/config/cleaner.md b/src/cleaner/config/cleaner.md new file mode 100644 index 0000000000..4acd44a15b --- /dev/null +++ b/src/cleaner/config/cleaner.md @@ -0,0 +1,54 @@ +## Cleaner section parser + +- [Default Configuration](#D_Config) +- [How to Configure](#HT_Config) +- [Generated Configuraiton](#G_Config) +- [Data Table](#T_config) + +#### Default configuration + +[cleaner default configuration](cleaner.yaml) + +#### How to configure cluster section in service-configuraiton.yaml + +All configurations in this section is optional. If you want to customized these value, you can configure it in service-configuration.yaml. + +For example, if you want to use different threshold than the default value 94, add following to your service-configuration.yaml as following: +```yaml +cleaner: + threshold: new-value + interval: new-value +``` + +#### Generated Configuration + +After parsing, object model looks like: +```yaml +cleaner: + threshold: 94 + interval: 60 +``` + + +#### Table + + + + + + + + + + + + + + + + + + + + +
+| Data in Configuration File | Data in Cluster Object Model | Data in Jinja2 Template | Data type |
+| --- | --- | --- | --- |
+| cleaner.threshold | com["cleaner"]["threshold"] | cluster_cfg["cleaner"]["threshold"] | Int |
+| cleaner.interval | com["cleaner"]["interval"] | cluster_cfg["cleaner"]["interval"] | Int |
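As a reference for the table above, the sketch below mirrors the rules that `validation_post` in `src/cleaner/config/cleaner.py` enforces in this series (threshold must be an integer in [0, 100], interval an integer). It is an illustrative, standalone snippet only; `validate_cleaner_conf` is a hypothetical helper name and is not part of the repository, and it uses `isinstance` rather than the patch's `type()` comparison.

```python
# Standalone sketch of the cleaner config validation rules described above.
# validate_cleaner_conf is a hypothetical helper, not code from src/cleaner.
def validate_cleaner_conf(conf):
    cleaner = conf.get("cleaner", {})
    threshold = cleaner.get("threshold")
    interval = cleaner.get("interval")
    if not isinstance(threshold, int):
        return False, "expect threshold in cleaner to be int but got %r" % (threshold,)
    if threshold < 0 or threshold > 100:
        return False, "expect threshold in [0, 100] but got %d" % threshold
    if not isinstance(interval, int):
        return False, "expect interval in cleaner to be int but got %r" % (interval,)
    return True, None

print(validate_cleaner_conf({"cleaner": {"threshold": 94, "interval": 60}}))   # (True, None)
print(validate_cleaner_conf({"cleaner": {"threshold": "94", "interval": 60}})) # (False, ...)
```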
diff --git a/src/cleaner/scripts/clean_docker.py b/src/cleaner/scripts/clean_docker.py index 8e3bcbf4cf..d70251686d 100644 --- a/src/cleaner/scripts/clean_docker.py +++ b/src/cleaner/scripts/clean_docker.py @@ -74,7 +74,7 @@ def check_and_clean(self): # Clean logic v1: kill largest container - white_list = ["k8s_kube", "k8s_pylon", "k8s_zookeeper", "k8s_rest-server", "k8s_yarn", "k8s_hadoop", "k8s_job-exporter", "k8s_watchdog", "k8s_grafana", "k8s_node-exporter", "k8s_webportal", "k8s_prometheus", "k8s_nvidia-drivers", "k8s_etcd-container", "k8s_apiserver-container", "k8s_docker-cleaner", "kubelet", "dev-box"] + white_list = ["k8s_POD", "k8s_kube", "k8s_pylon", "k8s_zookeeper", "k8s_rest-server", "k8s_yarn", "k8s_hadoop", "k8s_job-exporter", "k8s_watchdog", "k8s_grafana", "k8s_node-exporter", "k8s_webportal", "k8s_prometheus", "k8s_nvidia-drivers", "k8s_etcd-container", "k8s_apiserver-container", "k8s_docker-cleaner", "kubelet", "dev-box"] def kill_largest_container(self): containers = [] # Only try to stop PAI jobs and user created containers
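Taken together, patches 15 through 21 converge on a simple control loop: poll `df` for root-partition usage, and once it crosses the configured threshold, pick the largest non-whitelisted container and send it signal 10 via `docker kill`. The sketch below condenses that flow for reviewers; it is not the shipped module, and the helper names `disk_usage_percent` and `parse_docker_size` are invented for illustration.

```python
import re
import subprocess

# Size suffixes as docker prints them (e.g. "1.2GB", "345MB"), mapped to bytes.
SIZE_UNITS = {'b': 1, 'k': 1024, 'm': 1024**2, 'g': 1024**3, 't': 1024**4}

def parse_docker_size(size_str):
    """Convert a human-readable docker size string to bytes."""
    match = re.search(r"([0-9.]+)\s*([bkmgt])", size_str.lower())
    return float(match.group(1)) * SIZE_UNITS[match.group(2)] if match else 0.0

def disk_usage_percent(partition="/"):
    """Return the Use% value that `df -h` reports for the given mount point."""
    output = subprocess.check_output(["df", "-h", partition]).decode()
    for line in output.splitlines()[1:]:
        fields = line.split()
        if fields and fields[-1] == partition:
            return int(fields[-2].rstrip("%"))
    return 0

if __name__ == "__main__":
    threshold = 94
    if disk_usage_percent("/") >= threshold:
        # The real cleaner would pick the largest non-whitelisted container here
        # and run: docker kill --signal=10 <container-id>
        print("disk usage above %d%%, a container would be killed" % threshold)
```

The design choice worth noting is that the cleaner signals the container instead of calling `docker stop`, so the job's entrypoint gets a chance to report why it is exiting before the disk fills up completely.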
From 1353fc9f098eeb71fd1ecf088a68180cd398e161 Mon Sep 17 00:00:00 2001 From: WangDian Date: Wed, 13 Feb 2019 11:58:09 +0800 Subject: [PATCH 31/31] Add comment on cleaner kill handler in rest-server --- src/rest-server/src/templates/dockerContainerScript.mustache | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/rest-server/src/templates/dockerContainerScript.mustache b/src/rest-server/src/templates/dockerContainerScript.mustache index 92f7e69397..fce0b9bb60 100644 --- a/src/rest-server/src/templates/dockerContainerScript.mustache +++ b/src/rest-server/src/templates/dockerContainerScript.mustache @@ -29,10 +29,11 @@ function exit_handler() "[DEBUG]" "Docker container exit handler: EXIT signal received in docker container, exiting ..." } +# The cleaner will send SIGUSR1(10) to container to kill it. Trap the signal here and exit with code 1. function kill_handler() { printf "%s %s\n" \ - "[INFO]" "Docker container killed due to disk pressure. If your job needs large disk space, please use HDFS or NFS to store your data." + "[INFO]" "Docker container killed by cleaner due to disk pressure." exit 1 }
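The last patches settle on signal 10 (SIGUSR1) as the contract between the cleaner and job containers: the cleaner sends it with `docker kill --signal=10`, and the bash handler added to dockerContainerScript.mustache traps it, logs the disk-pressure hint, and exits with code 1. A job process written in Python could observe the same signal directly, as in this hedged sketch (the handler name and message below are illustrative, not part of the template):

```python
import signal
import sys

def on_cleaner_kill(signum, frame):
    # Mirrors the kill_handler in dockerContainerScript.mustache: the cleaner
    # signals disk pressure, so log a hint and fail the task with exit code 1.
    print("[INFO] killed by cleaner due to disk pressure; "
          "store large data on HDFS/NFS instead of the container disk")
    sys.exit(1)

# SIGUSR1 is signal number 10 on x86-64 Linux, matching `docker kill --signal=10`.
signal.signal(signal.SIGUSR1, on_cleaner_kill)

# ... long-running job work would go here ...
```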