From 83b61bea06709041e84aac9b7c67f8540e16fe1a Mon Sep 17 00:00:00 2001
From: suiguoxin
Date: Tue, 15 Dec 2020 21:45:03 +0800
Subject: [PATCH 1/3] generate hived config from layout.yaml

---
 .../services-configuration.yaml.template      |  58 ++--
 contrib/kubespray/script/openpai-generator.py | 265 +++++-------------
 contrib/kubespray/script/service-boot.sh      |  14 -
 3 files changed, 107 insertions(+), 230 deletions(-)

diff --git a/contrib/kubespray/quick-start/services-configuration.yaml.template b/contrib/kubespray/quick-start/services-configuration.yaml.template
index b215de327b..cfc66449a3 100644
--- a/contrib/kubespray/quick-start/services-configuration.yaml.template
+++ b/contrib/kubespray/quick-start/services-configuration.yaml.template
@@ -5,15 +5,13 @@ cluster:
     k8s-rbac: "true"
     job-history: "true"
     data-path: "/datastorage"
-    qos-switch: "{{ env["cfg"]["qos-switch"] }} | default('false') "
+    qos-switch: "{{ env["cfg"]["qos-switch"] | default('false') }}"
     docker-data-root: "{{ env['cfg']['docker_data_root'] | default('/mnt/docker') }}"
     # the docker registry to store docker images that contain system services like frameworklauncher, hadoop, etc.
     docker-registry:
-      # The namespace in your registry. If the registry is docker.io, the namespace will be your user account.
       namespace: {{ env["cfg"]['docker_registry_namespace'] | default('openpai') }}
-      # E.g., gcr.io.
       # if the registry is hub.docker, please fill this value with docker.io
       domain: {{ env["cfg"]['docker_registry_domain'] | default('docker.io') }}
@@ -48,36 +46,50 @@ rest-server:
 webportal:
   server-port: 9286
 
-#If you want to customize the scheduling config, such add more virtual clusters or more gpu types, check:
-#https://github.com/microsoft/pai/blob/master/docs/hivedscheduler/devops.md
+# If you want to customize the scheduling config, such as adding more virtual clusters or more gpu types, check:
+# https://github.com/microsoft/pai/blob/master/docs/manual/cluster-admin/how-to-set-up-virtual-clusters.md
 hivedscheduler:
   config: |
     physicalCluster:
       skuTypes:
-        DT:
+        {% for sku_name, sku_spec in env["hived"]["skus"].items() -%}
+        {{ sku_name }}:
+        {%- if sku_spec.gpu %}
           gpu: 1
-          cpu: {{ env["hived"]["unit-cpu"] }}
-          memory: {{ env["hived"]["unit-mem"] }}Mi
+        {%- endif %}
+          cpu: {{ sku_spec.cpu }}
+          memory: {{ sku_spec.memory }}Mi
+        {% endfor %}
      cellTypes:
-        DT-NODE:
-          childCellType: DT
-          childCellNumber: {{ env["hived"]["min-gpu"] }}
+        {% for sku_name, sku_spec in env["hived"]["skus"].items() -%}
+        {{ sku_name }}-NODE:
+          childCellType: {{ sku_name }}
+          childCellNumber: {{ sku_spec.gpuCount }}
           isNodeLevel: true
-        DT-NODE-POOL:
-          childCellType: DT-NODE
-          childCellNumber: {{ env["hived"]["nodelist"]|length }}
+        {{ sku_name }}-NODE-POOL:
+          childCellType: {{ sku_name }}-NODE
+          childCellNumber: {{ sku_spec.workers|length }}
+        {% endfor %}
       physicalCells:
-      - cellType: DT-NODE-POOL
+      {% for sku_name, sku_spec in env["hived"]["skus"].items() -%}
+      - cellType: {{ sku_name }}-NODE-POOL.{{ sku_name }}-NODE
         cellChildren:
-        {%- for nodename in env["hived"]["nodelist"] %}
-        - cellAddress: {{nodename}}
-        {%- endfor %}
-    virtualClusters:
-      default:
-        virtualCells:
-        - cellType: DT-NODE-POOL.DT-NODE
-          cellNumber: {{ env["hived"]["nodelist"]|length }}
+        {% for worker in sku_spec.workers -%}
+        - cellAddress: {{ worker }}
+        {% endfor %}
+      {% endfor -%}
+    virtualClusters:
+      {% for sku_name, sku_spec in env["hived"]["skus"].items() -%}
+      {% if loop.index0 == 0 %}
+      default:
+      {% else %}
+      {{ sku_name }}:
+      {% endif %}
+        virtualCells:
+        - cellType: {{ sku_name }}-NODE-POOL.{{ sku_name }}-NODE
+          cellNumber: {{ sku_spec.workers|length }}
+      {% endfor %}
 
 # uncomment following section, if you want to customize the authentication solution.
 authentication:
diff --git a/contrib/kubespray/script/openpai-generator.py b/contrib/kubespray/script/openpai-generator.py
index 092cf07a69..8daaee83a9 100644
--- a/contrib/kubespray/script/openpai-generator.py
+++ b/contrib/kubespray/script/openpai-generator.py
@@ -1,24 +1,25 @@
-import copy
-from decimal import *
-import logging
-import logging.config
 import os
-import argparse
-import re
 import sys
-import time
-
+import re
+import argparse
+import logging
+import logging.config
+from decimal import Decimal
+import yaml
 import jinja2
-from kubernetes import client, config
 from kubernetes.utils import parse_quantity
-from kubernetes.client.rest import ApiException
-import yaml
-
+# reserved resources
 PAI_RESERVE_RESOURCE_PERCENTAGE = 0.01
 PAI_MAX_RESERVE_CPU_PER_NODE = 0.5
 PAI_MAX_RESERVE_MEMORY_PER_NODE = 1024 # 1Gi
+KUBE_RESERVED_CPU = 0.01 # 100m
+KUBE_RESERVED_MEM = 256 # Mi
+SYSTEM_RESERVED_CPU = 0
+SYSTEM_RESERVED_MEM = 0
+EVICTION_HARD_MEM = 100 #Mi
+
 
 def setup_logger_config(logger):
     """
@@ -68,122 +69,6 @@ def generate_template_file(template_file_path, output_path, map_table):
     write_generated_file(output_path, generated_template)
 
 
-def pod_is_ready_or_not(label_key, label_value, service_name):
-
-    label_selector_str="{0}={1}".format(label_key, label_value)
-
-    config.load_kube_config()
-    v1 = client.CoreV1Api()
-
-    try:
-        pod_list = v1.list_pod_for_all_namespaces(label_selector=label_selector_str, watch=False)
-    except ApiException as e:
-        logger.error("Exception when calling CoreV1Api->list_pod_for_all_namespaces: %s\n" % e)
-        return False
-
-    if len(pod_list.items) == 0:
-        logger.warning("No pod can be dectected.")
-        return False
-
-    ready = 0
-    unready = 0
-    for pod in pod_list.items:
-        if pod.status.container_statuses is None:
-            unready = unready + 1
-            continue
-        flag = True
-        for container in pod.status.container_statuses:
-            if container.ready != True:
-                unready = unready + 1
-                flag = False
-                break
-        if flag:
-            ready = ready + 1
-
-    if unready != 0:
-        logger.info("{0} is not ready.".format(service_name))
-        logger.info("Total: {0}".format(ready + unready))
-        logger.info("Ready: {0}".format(ready))
-        return False
-
-    return True
-
-
-def get_kubernetes_node_info_from_API():
-    config.load_kube_config()
-    api_instance = client.CoreV1Api()
-
-    # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/CoreV1Api.md#list_node
-    pretty = 'true'
-    timeout_seconds = 56
-
-    ret = dict()
-    try:
-        api_response = api_instance.list_node(pretty=pretty, timeout_seconds=timeout_seconds)
-        for node in api_response.items:
-            gpu_resource = 0
-            if 'nvidia.com/gpu' in node.status.allocatable:
-                gpu_resource = int(parse_quantity(node.status.allocatable['nvidia.com/gpu']))
-            if 'amd.com/gpu' in node.status.allocatable:
-                gpu_resource = int(parse_quantity(node.status.allocatable['amd.com/gpu']))
-            ret[node.metadata.name] = {
-                "cpu-resource": parse_quantity(node.status.allocatable['cpu']),
-                "mem-resource": parse_quantity(node.status.allocatable['memory']) / 1024 / 1024,
-                "gpu-resource": gpu_resource,
-            }
-    except ApiException as e:
-        logger.error("Exception when calling CoreV1Api->list_node: %s\n" % e)
-        raise
-
-    return ret
-
-
-def get_pod_requests(pod):
-    ret = {
-        "cpu-resource": 0,
-        "mem-resource": 0,
-    }
-    for container in pod.spec.containers:
-        if container.resources.requests is None:
-            continue
ret["cpu-resource"] += parse_quantity(container.resources.requests.get("cpu", 0)) - ret["mem-resource"] += parse_quantity(container.resources.requests.get("memory", 0)) / 1024 / 1024 - return ret - - -def get_kubernetes_pod_info_from_API(): - config.load_kube_config() - api_instance = client.CoreV1Api() - - timeout_seconds = 56 - - ret = dict() - try: - api_response = api_instance.list_pod_for_all_namespaces(timeout_seconds=timeout_seconds) - for pod in api_response.items: - if pod.spec.node_name not in ret: - ret[pod.spec.node_name] = [get_pod_requests(pod)] - else: - ret[pod.spec.node_name].append(get_pod_requests(pod)) - except ApiException: - logger.error("Exception when calling CoreV1Api->list_pod", exc_info=True) - raise - return ret - - -def get_node_resources(): - node_allocatable_resources = get_kubernetes_node_info_from_API() - node_free_resources = copy.deepcopy(node_allocatable_resources) - pod_resources_dict = get_kubernetes_pod_info_from_API() - for node_name in node_free_resources: - if node_name not in pod_resources_dict: - continue - for pod in pod_resources_dict[node_name]: - node_free_resources[node_name]["cpu-resource"] -= pod["cpu-resource"] - node_free_resources[node_name]["mem-resource"] -= pod["mem-resource"] - return {"allocatable": node_allocatable_resources, "free": node_free_resources} - - def get_pai_daemon_resource_request(cfg): ret = { "cpu-resource": 0, @@ -222,73 +107,70 @@ def get_pai_daemon_resource_request(cfg): return ret -def wait_nvidia_device_plugin_ready(total_time=3600): - while pod_is_ready_or_not("name", "nvidia-device-plugin-ds", "Nvidia-Device-Plugin") != True: - logger.info("Nvidia-Device-Plugin is not ready yet. Please wait for a moment!") - time.sleep(10) - total_time = total_time - 10 - if total_time < 0: - logger.error("An issue occur when starting up Nvidia-Device-Plugin") - sys.exit(1) - - -def wait_amd_device_plugin_ready(total_time=3600): - while pod_is_ready_or_not("name", "amdgpu-dp-ds", "AMD-Device-Plugin") != True: - logger.info("AMD-Device-Plugin is not ready yet. Please wait for a moment!") - time.sleep(10) - total_time = total_time - 10 - if total_time < 0: - logger.error("An issue occure when starting up AMD-Device-Plugin") +def get_hived_config(layout, config): + """ + generate hived config from layout.yaml and config.yaml + Resources (gpu/cpu/mem) specified in layout.yaml is considered as the total resources. + + Parameters: + ----------- + layout: dict + layout + config: dict + config + + Returns: + -------- + dict + hived config, used to render hived config template + """ + pai_daemon_resource_dict = get_pai_daemon_resource_request(config) + sku_specs = {} + for sku_name, sku_spec in layout['machine-sku'].items(): + # save memory with unit Mi + sku_spec['mem'] = parse_quantity(sku_spec['mem']) / 1024 / 1024 + + # calculate reserved resources + pai_reserved_cpu = min(sku_spec['cpu']['vcore'] * Decimal(PAI_RESERVE_RESOURCE_PERCENTAGE), Decimal(PAI_MAX_RESERVE_CPU_PER_NODE)) + pai_reserved_mem = min(sku_spec['mem'] * Decimal(PAI_RESERVE_RESOURCE_PERCENTAGE), Decimal(PAI_MAX_RESERVE_MEMORY_PER_NODE)) + reserved_cpu = SYSTEM_RESERVED_CPU + KUBE_RESERVED_CPU + pai_reserved_cpu + pai_daemon_resource_dict["cpu-resource"] + reserved_mem = SYSTEM_RESERVED_MEM + KUBE_RESERVED_MEM + EVICTION_HARD_MEM + pai_reserved_mem + pai_daemon_resource_dict["mem-resource"] + + if sku_spec['cpu']['vcore'] <= reserved_cpu or sku_spec['mem'] <= reserved_mem: + logger.error("The node resource does not satisfy minmal requests. 
+                         sku_spec['cpu']['vcore'], sku_spec['mem'], reserved_cpu, reserved_mem)
             sys.exit(1)
 
+        # check if the machine has GPUs
+        if 'computing-device' in sku_spec:
+            sku_specs[sku_name] = {
+                'cpu': int((sku_spec['cpu']['vcore'] - reserved_cpu) / sku_spec['computing-device']['count']),
+                'mem': int((sku_spec['mem'] - reserved_mem) / sku_spec['computing-device']['count']),
+                'gpu': True,
+                'gpuCount': sku_spec['computing-device']['count'],
+            }
+        else:
+            sku_specs[sku_name] = {
+                'cpu': int(sku_spec['cpu']['vcore'] - reserved_cpu),
+                'mem': int(sku_spec['mem'] - reserved_mem),
+            }
 
-def hived_config_prepare(workers, node_resource_dict, pai_daemon_resource_dict):
-    # convert workers to hived worker_dict
-    worker_dict = {}
-    for worker in workers:
-        worker_dict[worker['hostname']] = worker['hostip']
-
-    hived_config = dict()
-    hived_config["nodelist"] = []
-
-    min_mem = 100000000
-    min_gpu = 100000000
-    min_cpu = 100000000
-
+    skus = {}
+    for machine in layout['machine-list']:
+        if 'pai-worker' in machine and machine['pai-worker'] == 'true':
+            sku_name = machine['machine-type']
+            sku_spec = sku_specs[sku_name]
+            if sku_name not in skus:
+                skus[sku_name] = sku_spec.copy()
+                skus[sku_name]['workers'] = [machine['hostname']]
+            else:
+                skus[sku_name]['workers'].append(machine['hostname'])
 
-    node_resource_free = node_resource_dict["free"]
-    node_resource_allocatable = node_resource_dict["allocatable"]
-    for key in node_resource_dict["free"]:
-        if key not in worker_dict:
-            continue
-        if node_resource_free[key]["gpu-resource"] == 0:
-            logger.error("Allocatable GPU number in {0} is 0, current quick start script does not allow.".format(key))
-            logger.error("Please remove {0} from your workerlist, or check if the device plugin is running healthy on the node.".format(key))
-            sys.exit(1)
-        reserved_cpu = min(node_resource_allocatable[key]["cpu-resource"] * Decimal(PAI_RESERVE_RESOURCE_PERCENTAGE), Decimal(PAI_MAX_RESERVE_CPU_PER_NODE))
-        reserved_mem = min(node_resource_allocatable[key]["mem-resource"] * Decimal(PAI_RESERVE_RESOURCE_PERCENTAGE), Decimal(PAI_MAX_RESERVE_MEMORY_PER_NODE))
-        min_cpu = min(min_cpu, node_resource_free[key]["cpu-resource"] - pai_daemon_resource_dict["cpu-resource"] - reserved_cpu)
-        min_mem = min(min_mem, node_resource_free[key]["mem-resource"] - pai_daemon_resource_dict["mem-resource"] - reserved_mem)
-        min_gpu = min(min_gpu, node_resource_free[key]["gpu-resource"])
-        if min_cpu <= 0 or min_mem <= 0:
-            logger.error("The node resource is not satisfy minmal requests. Requests cpu: %s, mem: %sMB.\
-                Allcoatable cpu: %s, mem: %sMB. Reserved cpu:%s, mem: %sMB.",
-                node_resource_allocatable[key]["cpu-resource"] + abs(min_cpu),
-                node_resource_allocatable[key]["mem-resource"] + abs(min_mem),
-                node_resource_allocatable[key]["cpu-resource"],
-                node_resource_allocatable[key]["mem-resource"],
-                reserved_cpu, reserved_mem)
-            sys.exit(1)
-        hived_config["nodelist"].append(key)
-    if not hived_config["nodelist"]:
+    if not bool(skus):
         logger.error("No worker node is detected.")
         sys.exit(1)
-    hived_config["min-gpu"] = min_gpu
-    hived_config["unit-cpu"] = int(min_cpu / min_gpu)
-    hived_config["unit-mem"] = int(min_mem / min_gpu)
-
-    return hived_config
+    return { "skus": skus }
 
 
 def main():
@@ -308,11 +190,8 @@ def main():
     masters = list(filter(lambda elem: 'pai-master' in elem and elem["pai-master"] == 'true', layout['machine-list']))
     workers = list(filter(lambda elem: 'pai-worker' in elem and elem["pai-worker"] == 'true', layout['machine-list']))
     head_node = masters[0]
-    wait_nvidia_device_plugin_ready()
-    wait_amd_device_plugin_ready()
-    node_resource_dict = get_node_resources()
-    pai_daemon_resource_dict = get_pai_daemon_resource_request(config)
-    hived_config = hived_config_prepare(workers, node_resource_dict, pai_daemon_resource_dict)
+
+    hived_config = get_hived_config(layout, config)
 
     environment = {
         'masters': masters,
diff --git a/contrib/kubespray/script/service-boot.sh b/contrib/kubespray/script/service-boot.sh
index c579566add..d1ca070007 100644
--- a/contrib/kubespray/script/service-boot.sh
+++ b/contrib/kubespray/script/service-boot.sh
@@ -43,24 +43,10 @@ sudo docker exec -it dev-box-quick-start kubectl get node || { cleanup; exit 1; }
 sudo docker exec -i dev-box-quick-start /bin/bash << EOF_DEV_BOX
 set -e
 
-echo "Starting nvidia device plugin to detect nvidia gpu resource..."
-svn cat https://github.com/NVIDIA/k8s-device-plugin.git/tags/1.0.0-beta4/nvidia-device-plugin.yml \
-    | kubectl apply --overwrite=true -f -
-sleep 5
-
-echo "Starting AMD device plugin to detect AMD gpu resource..."
-svn cat https://github.com/RadeonOpenCompute/k8s-device-plugin.git/trunk/k8s-ds-amdgpu-dp.yaml \
-    | kubectl apply --overwrite=true -f -
-sleep 5
-
 echo "Generating services configurations..."
 cd /pai && git checkout ${OPENPAI_BRANCH_NAME}
 python3 /pai/contrib/kubespray/script/openpai-generator.py -l /cluster-configuration/layout.yaml -c /cluster-configuration/config.yaml -o /cluster-configuration
 
-kubectl delete ds nvidia-device-plugin-daemonset -n kube-system
-kubectl delete ds amdgpu-device-plugin-daemonset -n kube-system
-sleep 5
-
 # TODO: This should be done at our source code.
 kubectl create namespace pai-storage

From 466a7e50d45b65b04642a328e59ac8200878f404 Mon Sep 17 00:00:00 2001
From: suiguoxin
Date: Wed, 16 Dec 2020 18:01:49 +0800
Subject: [PATCH 2/3] keep min mem/cpu set logic, read only gpu count from
 layout.yaml

---
 contrib/kubespray/script/openpai-generator.py | 178 ++++++++++++++----
 1 file changed, 137 insertions(+), 41 deletions(-)

diff --git a/contrib/kubespray/script/openpai-generator.py b/contrib/kubespray/script/openpai-generator.py
index 8daaee83a9..6e91ff8e1e 100644
--- a/contrib/kubespray/script/openpai-generator.py
+++ b/contrib/kubespray/script/openpai-generator.py
@@ -1,25 +1,23 @@
 import os
 import sys
 import re
+import copy
 import argparse
 import logging
 import logging.config
+import math
 from decimal import Decimal
 import yaml
 import jinja2
+from kubernetes import client, config
 from kubernetes.utils import parse_quantity
+from kubernetes.client.rest import ApiException
 
 # reserved resources
 PAI_RESERVE_RESOURCE_PERCENTAGE = 0.01
 PAI_MAX_RESERVE_CPU_PER_NODE = 0.5
 PAI_MAX_RESERVE_MEMORY_PER_NODE = 1024 # 1Gi
-KUBE_RESERVED_CPU = 0.01 # 100m
-KUBE_RESERVED_MEM = 256 # Mi
-SYSTEM_RESERVED_CPU = 0
-SYSTEM_RESERVED_MEM = 0
-EVICTION_HARD_MEM = 100 #Mi
-
 
 def setup_logger_config(logger):
     """
@@ -69,6 +67,73 @@ def generate_template_file(template_file_path, output_path, map_table):
     write_generated_file(output_path, generated_template)
 
 
+def get_kubernetes_node_info_from_API():
+    config.load_kube_config()
+    api_instance = client.CoreV1Api()
+    # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/CoreV1Api.md#list_node
+    pretty = 'true'
+    timeout_seconds = 56
+    ret = dict()
+    try:
+        api_response = api_instance.list_node(pretty=pretty, timeout_seconds=timeout_seconds)
+        for node in api_response.items:
+            ret[node.metadata.name] = {
+                "cpu-resource": parse_quantity(node.status.allocatable['cpu']),
+                "mem-resource": parse_quantity(node.status.allocatable['memory']) / 1024 / 1024,
+            }
+    except ApiException as e:
+        logger.error("Exception when calling CoreV1Api->list_node: %s\n" % e)
+        raise
+
+    return ret
+
+
+def get_pod_requests(pod):
+    ret = {
+        "cpu-resource": 0,
+        "mem-resource": 0,
+    }
+    for container in pod.spec.containers:
+        if container.resources.requests is None:
+            continue
+        ret["cpu-resource"] += parse_quantity(container.resources.requests.get("cpu", 0))
+        ret["mem-resource"] += parse_quantity(container.resources.requests.get("memory", 0)) / 1024 / 1024
+    return ret
+
+
+def get_kubernetes_pod_info_from_API():
+    config.load_kube_config()
+    api_instance = client.CoreV1Api()
+
+    timeout_seconds = 56
+
+    ret = dict()
+    try:
+        api_response = api_instance.list_pod_for_all_namespaces(timeout_seconds=timeout_seconds)
+        for pod in api_response.items:
+            if pod.spec.node_name not in ret:
+                ret[pod.spec.node_name] = [get_pod_requests(pod)]
+            else:
+                ret[pod.spec.node_name].append(get_pod_requests(pod))
+    except ApiException:
+        logger.error("Exception when calling CoreV1Api->list_pod", exc_info=True)
+        raise
+    return ret
+
+
+def get_node_resources():
+    node_allocatable_resources = get_kubernetes_node_info_from_API()
+    node_free_resources = copy.deepcopy(node_allocatable_resources)
+    pod_resources_dict = get_kubernetes_pod_info_from_API()
+    for node_name in node_free_resources:
+        if node_name not in pod_resources_dict:
+            continue
+        for pod in pod_resources_dict[node_name]:
+            node_free_resources[node_name]["cpu-resource"] -= pod["cpu-resource"]
+            node_free_resources[node_name]["mem-resource"] -= pod["mem-resource"]
+    return {"allocatable": node_allocatable_resources, "free": node_free_resources}
{"allocatable": node_allocatable_resources, "free": node_free_resources} + + def get_pai_daemon_resource_request(cfg): ret = { "cpu-resource": 0, @@ -107,6 +172,31 @@ def get_pai_daemon_resource_request(cfg): return ret +def get_min_free_resource(workers, node_resource_dict, pai_daemon_resource_dict): + """ + get the minimum free memory and cpu resource among a list of workers + """ + min_mem = math.inf + min_cpu = math.inf + + for node_name in workers: + reserved_cpu = min(node_resource_dict["allocatable"][node_name]["cpu-resource"] * Decimal(PAI_RESERVE_RESOURCE_PERCENTAGE), Decimal(PAI_MAX_RESERVE_CPU_PER_NODE)) + reserved_mem = min(node_resource_dict["allocatable"][node_name]["mem-resource"] * Decimal(PAI_RESERVE_RESOURCE_PERCENTAGE), Decimal(PAI_MAX_RESERVE_MEMORY_PER_NODE)) + min_cpu = min(min_cpu, node_resource_dict["free"][node_name]["cpu-resource"] - pai_daemon_resource_dict["cpu-resource"] - reserved_cpu) + min_mem = min(min_mem, node_resource_dict["free"][node_name]["mem-resource"] - pai_daemon_resource_dict["mem-resource"] - reserved_mem) + if min_cpu <= 0 or min_mem <= 0: + logger.error("The node resource does not satisfy minmal requests. Requests cpu: %s, mem: %sMB.\ + Allcoatable cpu: %s, mem: %sMB. Reserved cpu:%s, mem: %sMB.", + node_resource_dict["allocatable"][node_name]["cpu-resource"] + abs(min_cpu), + node_resource_dict["allocatable"][node_name]["mem-resource"] + abs(min_mem), + node_resource_dict["allocatable"][node_name]["cpu-resource"], + node_resource_dict["allocatable"][node_name]["mem-resource"], + reserved_cpu, reserved_mem) + sys.exit(1) + + return min_mem, min_cpu + + def get_hived_config(layout, config): """ generate hived config from layout.yaml and config.yaml @@ -123,53 +213,60 @@ def get_hived_config(layout, config): -------- dict hived config, used to render hived config template - """ - pai_daemon_resource_dict = get_pai_daemon_resource_request(config) - sku_specs = {} - for sku_name, sku_spec in layout['machine-sku'].items(): - # save memory with unit Mi - sku_spec['mem'] = parse_quantity(sku_spec['mem']) / 1024 / 1024 - - # calculate reserved resources - pai_reserved_cpu = min(sku_spec['cpu']['vcore'] * Decimal(PAI_RESERVE_RESOURCE_PERCENTAGE), Decimal(PAI_MAX_RESERVE_CPU_PER_NODE)) - pai_reserved_mem = min(sku_spec['mem'] * Decimal(PAI_RESERVE_RESOURCE_PERCENTAGE), Decimal(PAI_MAX_RESERVE_MEMORY_PER_NODE)) - reserved_cpu = SYSTEM_RESERVED_CPU + KUBE_RESERVED_CPU + pai_reserved_cpu + pai_daemon_resource_dict["cpu-resource"] - reserved_mem = SYSTEM_RESERVED_MEM + KUBE_RESERVED_MEM + EVICTION_HARD_MEM + pai_reserved_mem + pai_daemon_resource_dict["mem-resource"] - - if sku_spec['cpu']['vcore'] <= reserved_cpu or sku_spec['mem'] <= reserved_mem: - logger.error("The node resource does not satisfy minmal requests. 
-                     sku_spec['cpu']['vcore'], sku_spec['mem'], reserved_cpu, reserved_mem)
-            sys.exit(1)
-
-        # check if the machine has GPUs
-        if 'computing-device' in sku_spec:
-            sku_specs[sku_name] = {
-                'cpu': int((sku_spec['cpu']['vcore'] - reserved_cpu) / sku_spec['computing-device']['count']),
-                'mem': int((sku_spec['mem'] - reserved_mem) / sku_spec['computing-device']['count']),
-                'gpu': True,
-                'gpuCount': sku_spec['computing-device']['count'],
+    Example:
+    {
+        "skus": {
+            "gpu-machine": {
+                "mem": 500,
+                "cpu": 2,
+                "gpu": True,
+                "gpuCount": 4,
+                "workers": [
+                    "pai-gpu-worker0",
+                    "pai-gpu-worker1"
+                ]
+            },
+            "cpu-machine": {
+                "mem": 500,
+                "cpu": 2,
+                "workers": [
+                    "pai-cpu-worker0",
+                    "pai-cpu-worker1"
+                ]
             }
-        else:
-            sku_specs[sku_name] = {
-                'cpu': int(sku_spec['cpu']['vcore'] - reserved_cpu),
-                'mem': int(sku_spec['mem'] - reserved_mem),
-            }
-
+        }
+    }
+    """
+    # set `workers` field
     skus = {}
     for machine in layout['machine-list']:
         if 'pai-worker' in machine and machine['pai-worker'] == 'true':
             sku_name = machine['machine-type']
-            sku_spec = sku_specs[sku_name]
             if sku_name not in skus:
-                skus[sku_name] = sku_spec.copy()
                 skus[sku_name]['workers'] = [machine['hostname']]
             else:
                 skus[sku_name]['workers'].append(machine['hostname'])
 
     if not bool(skus):
-        logger.error("No worker node is detected.")
+        logger.error("No worker node detected.")
         sys.exit(1)
 
+    node_resource_dict = get_node_resources()
+    pai_daemon_resource_dict = get_pai_daemon_resource_request(config)
+
+    for sku_name in skus:
+        sku_mem_free, sku_cpu_free = get_min_free_resource(skus[sku_name]['workers'], node_resource_dict, pai_daemon_resource_dict)
+        sku_spec = layout['machine-sku'][sku_name]
+        # check if the machine has GPUs
+        if 'computing-device' in sku_spec:
+            skus[sku_name]['gpu'] = True
+            skus[sku_name]['gpuCount'] = sku_spec['computing-device']['count']
+            skus[sku_name]['mem'] = int(sku_mem_free / sku_spec['computing-device']['count'])
+            skus[sku_name]['cpu'] = int(sku_cpu_free / sku_spec['computing-device']['count'])
+        else:
+            skus[sku_name]['mem'] = int(sku_mem_free)
+            skus[sku_name]['cpu'] = int(sku_cpu_free)
+
     return { "skus": skus }
 
 
@@ -190,7 +287,6 @@ def main():
     masters = list(filter(lambda elem: 'pai-master' in elem and elem["pai-master"] == 'true', layout['machine-list']))
     workers = list(filter(lambda elem: 'pai-worker' in elem and elem["pai-worker"] == 'true', layout['machine-list']))
     head_node = masters[0]
-
     hived_config = get_hived_config(layout, config)
 
     environment = {

From 67d489bdfa4586252325f3fdf02cf7df83b26a2e Mon Sep 17 00:00:00 2001
From: suiguoxin
Date: Thu, 17 Dec 2020 10:59:25 +0800
Subject: [PATCH 3/3] fix

---
 .../services-configuration.yaml.template      |  8 +++-
 contrib/kubespray/script/openpai-generator.py | 40 ++++++++++---------
 2 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/contrib/kubespray/quick-start/services-configuration.yaml.template b/contrib/kubespray/quick-start/services-configuration.yaml.template
index cfc66449a3..2139443e58 100644
--- a/contrib/kubespray/quick-start/services-configuration.yaml.template
+++ b/contrib/kubespray/quick-start/services-configuration.yaml.template
@@ -56,15 +56,21 @@ hivedscheduler:
         {{ sku_name }}:
         {%- if sku_spec.gpu %}
           gpu: 1
-        {%- endif %}
           cpu: {{ sku_spec.cpu }}
+        {%- else %}
+          cpu: 1
+        {%- endif %}
           memory: {{ sku_spec.memory }}Mi
         {% endfor %}
       cellTypes:
         {% for sku_name, sku_spec in env["hived"]["skus"].items() -%}
         {{ sku_name }}-NODE:
           childCellType: {{ sku_name }}
+          {%- if sku_spec.gpu %}
           childCellNumber: {{ sku_spec.gpuCount }}
+          {%- else %}
+          childCellNumber: {{ sku_spec.cpu }}
+          {%- endif %}
           isNodeLevel: true
         {{ sku_name }}-NODE-POOL:
           childCellType: {{ sku_name }}-NODE
diff --git a/contrib/kubespray/script/openpai-generator.py b/contrib/kubespray/script/openpai-generator.py
index 6e91ff8e1e..a6876776a0 100644
--- a/contrib/kubespray/script/openpai-generator.py
+++ b/contrib/kubespray/script/openpai-generator.py
@@ -215,23 +215,24 @@ def get_hived_config(layout, config):
         hived config, used to render hived config template
     Example:
     {
-        "skus": {
-            "gpu-machine": {
-                "mem": 500,
-                "cpu": 2,
-                "gpu": True,
-                "gpuCount": 4,
-                "workers": [
-                    "pai-gpu-worker0",
-                    "pai-gpu-worker1"
+        'skus': {
+            'gpu-machine': {
+                'memory': 500,
+                'cpu': 2,
+                'gpu': True,
+                'gpuCount': 4,
+                'workers': [
+                    'pai-gpu-worker0',
+                    'pai-gpu-worker1'
                 ]
             },
-            "cpu-machine": {
-                "mem": 500,
-                "cpu": 2,
-                "workers": [
-                    "pai-cpu-worker0",
-                    "pai-cpu-worker1"
+            'cpu-machine': {
+                'memory': 500,
+                'cpu': 2,
+                'gpu': False,
+                'workers': [
+                    'pai-cpu-worker0',
+                    'pai-cpu-worker1'
                 ]
             }
         }
@@ -243,7 +244,9 @@ def get_hived_config(layout, config):
         if 'pai-worker' in machine and machine['pai-worker'] == 'true':
             sku_name = machine['machine-type']
             if sku_name not in skus:
-                skus[sku_name]['workers'] = [machine['hostname']]
+                skus[sku_name] = {
+                    'workers' : [machine['hostname']]
+                }
             else:
                 skus[sku_name]['workers'].append(machine['hostname'])
 
@@ -261,10 +264,11 @@ def get_hived_config(layout, config):
         if 'computing-device' in sku_spec:
             skus[sku_name]['gpu'] = True
             skus[sku_name]['gpuCount'] = sku_spec['computing-device']['count']
-            skus[sku_name]['mem'] = int(sku_mem_free / sku_spec['computing-device']['count'])
+            skus[sku_name]['memory'] = int(sku_mem_free / sku_spec['computing-device']['count'])
             skus[sku_name]['cpu'] = int(sku_cpu_free / sku_spec['computing-device']['count'])
         else:
-            skus[sku_name]['mem'] = int(sku_mem_free)
+            skus[sku_name]['gpu'] = False
+            skus[sku_name]['memory'] = int(sku_mem_free / sku_cpu_free)
             skus[sku_name]['cpu'] = int(sku_cpu_free)
 
     return { "skus": skus }
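Editor's note (not part of the patches): to make the template/generator contract concrete, the following is a hand-derived sketch of what the patch-3 template should render for the docstring's example skus dict. The hostnames and resource sizes are the hypothetical values from that example, not output from a real cluster, and the exact whitespace produced by the Jinja trim markers may differ slightly.

    hivedscheduler:
      config: |
        physicalCluster:
          skuTypes:
            gpu-machine:
              gpu: 1
              cpu: 2          # per-GPU share of free CPU
              memory: 500Mi   # per-GPU share of free memory
            cpu-machine:
              cpu: 1          # one cell per CPU core
              memory: 500Mi   # per-CPU share of free memory
          cellTypes:
            gpu-machine-NODE:
              childCellType: gpu-machine
              childCellNumber: 4   # gpuCount
              isNodeLevel: true
            gpu-machine-NODE-POOL:
              childCellType: gpu-machine-NODE
              childCellNumber: 2   # number of workers of this SKU
            cpu-machine-NODE:
              childCellType: cpu-machine
              childCellNumber: 2   # cpu count for a non-GPU SKU
              isNodeLevel: true
            cpu-machine-NODE-POOL:
              childCellType: cpu-machine-NODE
              childCellNumber: 2
          physicalCells:
          - cellType: gpu-machine-NODE-POOL.gpu-machine-NODE
            cellChildren:
            - cellAddress: pai-gpu-worker0
            - cellAddress: pai-gpu-worker1
          - cellType: cpu-machine-NODE-POOL.cpu-machine-NODE
            cellChildren:
            - cellAddress: pai-cpu-worker0
            - cellAddress: pai-cpu-worker1
          virtualClusters:
            default:
              virtualCells:
              - cellType: gpu-machine-NODE-POOL.gpu-machine-NODE
                cellNumber: 2
            cpu-machine:
              virtualCells:
              - cellType: cpu-machine-NODE-POOL.cpu-machine-NODE
                cellNumber: 2

Note the naming rule in the virtualClusters loop: the first SKU in iteration order becomes the "default" virtual cluster, and every subsequent SKU gets a virtual cluster named after itself.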