Dev/v0.7.0 #2121

Merged
merged 37 commits on May 21, 2024
Changes from all commits
Commits (37)
3cc9bb2
Add Util classes
alaydshah May 11, 2024
e184286
Rename util files for pattern matching
alaydshah May 11, 2024
8c8c1da
MVP
alaydshah May 11, 2024
6b987ed
Add GPU Type Registry
alaydshah May 11, 2024
436233b
Rolling back GPU Registry change
alaydshah May 11, 2024
502c031
Qualcomm Util -> get gpus
alaydshah May 12, 2024
fba65b2
Qualcomm Util -> get_available_gpu_card_ids
alaydshah May 12, 2024
63c682c
Add sys path in init
alaydshah May 12, 2024
d3c081d
Replace GPUtil with Hardware Util
alaydshah May 12, 2024
8b2ff80
Make fedml env hardware agnostic
alaydshah May 12, 2024
f9aaee9
Nit
alaydshah May 12, 2024
906e8b0
Minor Bug
alaydshah May 12, 2024
3a5daf6
Add Hardware specific docker device mapping
alaydshah May 12, 2024
e57af1e
Bug Fix
alaydshah May 12, 2024
e70f56f
Add util function to get gpu_ids from container name
alaydshah May 12, 2024
33da11e
Make gpu stats fetching hardware agnostic
alaydshah May 12, 2024
2d458a8
Nits
alaydshah May 12, 2024
57c5dc4
Merge pull request #2087 from FedML-AI/alaydshah/gpu_utils/qualcomm
chaoyanghe May 12, 2024
be635db
Update container creation during deployment
alaydshah May 12, 2024
3792255
Merge pull request #2090 from FedML-AI/alaydshah/gpu_utils/qualcomm2
chaoyanghe May 12, 2024
9d5b54f
Nit: Update naming
alaydshah May 13, 2024
2fcf57d
Add check as device_mapping can be None
alaydshah May 13, 2024
2bbe2f9
Merge pull request #2095 from FedML-AI/alexleung/dev_v070_for_refactor
chaoyanghe May 15, 2024
69b3956
Merge pull request #2091 from FedML-AI/alaydshah/gpu_utils/qualcomm3
chaoyanghe May 16, 2024
f4332b1
Merge pull request #2099 from FedML-AI/alexleung/dev_v070_for_refactor
fedml-alex May 16, 2024
056a546
Merge pull request #2107 from FedML-AI/alexleung/dev_v070_for_refactor
fedml-alex May 16, 2024
7c0e08c
Merge pull request #2109 from FedML-AI/alexleung/dev_v070_for_refactor
fedml-alex May 16, 2024
d7f2423
Merge pull request #2111 from FedML-AI/alexleung/dev_v070_for_refactor
fedml-alex May 16, 2024
41e62ac
Merge pull request #2115 from FedML-AI/alexleung/dev_v070_for_refactor
fedml-alex May 17, 2024
c26eaff
Add timestamp in status payload
alaydshah May 17, 2024
43c26e5
Merge pull request #2117 from FedML-AI/alaydshah/run_status/timestamp
alaydshah May 17, 2024
7b576cb
Fix Import
alaydshah May 17, 2024
a34535b
Merge pull request #2118 from FedML-AI/alaydshah/fix/import
alaydshah May 17, 2024
3f40511
Fix run logs cli command
alaydshah May 18, 2024
9a16aa4
Merge pull request #2119 from FedML-AI/alaydshah/run/logs
chaoyanghe May 18, 2024
a0664d8
Add support for GPU Utilization
May 20, 2024
cc7b9c3
Merge pull request #2120 from FedML-AI/alaydshah/qualcomm/add/gpu_uti…
chaoyanghe May 21, 2024
11 changes: 6 additions & 5 deletions python/fedml/api/modules/run.py
@@ -51,7 +51,7 @@ def start(platform: str, create_run_result: FedMLRunStartedModel, device_server:

     run_start_result = FedMLRunManager.get_instance().start_run(platform=platform, create_run_result=create_run_result,
                                                                 device_server=device_server, device_edges=device_edges,
-                                                                api_key=api_key,
+                                                                api_key=get_api_key(),
                                                                 feature_entry_point=feature_entry_point)

     return run_start_result
@@ -79,7 +79,7 @@ def status(run_name: Optional[str], run_id: str, platform: str, api_key: str) ->
     _authenticate_and_validate_platform(api_key, platform)

     run_status = None
-    run_list_obj = list_run(run_name=run_name, run_id=run_id, platform=platform, api_key=api_key)
+    run_list_obj = list_run(run_name=run_name, run_id=run_id, platform=platform, api_key=get_api_key())

     if run_list_obj is not None:
         if len(run_list_obj.run_list) > 1:
@@ -93,12 +93,13 @@ def status(run_name: Optional[str], run_id: str, platform: str, api_key: str) ->
 # input: run_id, page_num, page_size, need_all_logs, platform, api_key
 # return RunLogResult(run_status, total_log_lines, total_log_pages, log_line_list, run_logs)
 def logs(run_id: str, page_num: int, page_size: int, need_all_logs: bool, platform: str, api_key: str) -> RunLogResult:
-    _authenticate_and_validate_platform(api_key, platform)
+    api_key = authenticate(api_key)
+    validate_platform(platform)

     if run_id is None:
         raise Exception("Please specify run id.")

-    _, run_status = status(run_name=None, run_id=run_id, platform=platform, api_key=get_api_key())
+    _, run_status = status(run_name=None, run_id=run_id, platform=platform, api_key=api_key)

     total_log_nums, total_log_pages, log_line_list, run_logs = 0, 0, list(), None

@@ -110,7 +111,7 @@ def logs(run_id: str, page_num: int, page_size: int, need_all_logs: bool, platfo
                                                                      user_api_key=api_key)

     if run_logs is not None:
-        total_log_pages, total_log_nums = run_logs.total_num, run_logs.total_pages
+        total_log_pages, total_log_nums = run_logs.total_pages, run_logs.total_num
         _parse_logs(log_line_list, run_logs)

     return RunLogResult(run_status=run_status, total_log_lines=total_log_nums, total_log_pages=total_log_pages,
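
Two behavioral fixes are visible in this file: logs() now authenticates the caller-supplied api_key instead of silently re-reading the stored key, and the tuple assignment that had swapped total_log_pages and total_log_nums is corrected. A minimal usage sketch of the module-level API after this change; the run id, platform, and key below are placeholders rather than values from this PR, and it assumes RunLogResult exposes its constructor arguments as attributes:

    from fedml.api.modules.run import logs

    # Fetch the second page of ten log lines for a run (hypothetical values).
    result = logs(run_id="12345", page_num=2, page_size=10,
                  need_all_logs=False, platform="falcon", api_key="YOUR_API_KEY")
    print(result.run_status, result.total_log_lines, result.total_log_pages)
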
12 changes: 6 additions & 6 deletions python/fedml/cli/modules/run.py
@@ -184,21 +184,21 @@ def status(platform, run_name, run_id, api_key, version):
     "--page_num",
     "-pn",
     type=int,
-    default=0,
+    default=1,
     help="request page num for logs. --need_all_logs should be set to False if you want to use this option.",
 )
 @click.option(
     "--page_size",
     "-ps",
     type=int,
-    default=0,
+    default=10,
     help="request page size for logs, --need_all_logs should be set to False if you want to use this option.",
 )
 @click.option(
     "--need_all_logs",
     "-a",
     type=bool,
-    default=True,
+    default=False,
     help="boolean value representing if all logs are needed. Default to True",
 )
 def logs(platform, run_id, api_key, version, page_num, page_size, need_all_logs):
@@ -217,8 +217,8 @@ def logs(platform, run_id, api_key, version, page_num, page_size, need_all_logs)
         return

     # Show run log summary info
-    log_head_table = PrettyTable(['Run ID', 'Total Log Lines', 'Log URL'])
-    log_head_table.add_row([run_id, run_log_result.total_log_lines, run_logs.log_full_url])
+    log_head_table = PrettyTable(['Run ID', 'Printed Log Lines', 'Total Log Lines', 'Log URL'])
+    log_head_table.add_row([run_id, len(run_log_result.log_line_list), run_logs.total_num, run_logs.log_full_url])
     click.echo("\nLogs summary info is as follows.")
     print(log_head_table)

@@ -234,7 +234,7 @@ def logs(platform, run_id, api_key, version, page_num, page_size, need_all_logs)
     if len(run_log_result.log_line_list) > 0:
         click.echo("\nAll logs is as follows.")
         for log_line in run_log_result.log_line_list:
-            click.echo(log_line.rstrip('\n'))
+            click.echo(log_line)


 def _print_run_table(run_list_obj):
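
The summary table gains a Printed Log Lines column, so paged output is distinguishable from the run's total. A standalone sketch of the new layout with made-up values; prettytable is already a dependency of this CLI module:

    from prettytable import PrettyTable

    # Mirror of the new summary header, populated with illustrative numbers.
    log_head_table = PrettyTable(['Run ID', 'Printed Log Lines', 'Total Log Lines', 'Log URL'])
    log_head_table.add_row(['12345', 10, 347, 'https://example.com/logs'])
    print(log_head_table)
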
53 changes: 21 additions & 32 deletions python/fedml/computing/scheduler/comm_utils/container_utils.py
@@ -2,16 +2,18 @@
 import os
 import traceback
 import datetime
+from typing import List

 from dateutil.parser import isoparse

 import docker
 from docker import errors

 from fedml.computing.scheduler.comm_utils import sys_utils
+from fedml.computing.scheduler.comm_utils.hardware_utils import HardwareUtil
 from fedml.core.common.singleton import Singleton
 from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants
 import time
-from GPUtil import getGPUs


 class ContainerUtils(Singleton):
@@ -225,9 +227,8 @@ def pull_image_with_policy(self, image_pull_policy, image_name, client=None):
             raise Exception(f"Unsupported image pull policy: {image_pull_policy}")

     class ContainerMetrics:
-        def __init__(self, cpu_percent, mem_used_megabytes, mem_avail_megabytes, network_recv_megabytes, network_sent_megabytes,
-                     blk_read_megabytes, blk_write_megabytes, timestamp, gpus_stat
-                     ):
+        def __init__(self, cpu_percent, mem_used_megabytes, mem_avail_megabytes, network_recv_megabytes,
+                     network_sent_megabytes, blk_read_megabytes, blk_write_megabytes, timestamp, gpus_stat):
             self.cpu_percent = cpu_percent
             self.mem_used_megabytes = mem_used_megabytes
             self.mem_avail_megabytes = mem_avail_megabytes
@@ -252,7 +253,7 @@ def get_container_perf(self, c_name) -> ContainerMetrics:
         CPU % MEM USAGE / LIMIT MEM % NET I/O BLOCK I/O
         0.26% 8.703GiB / 503.5GiB 1.73% 17.4GB / 176MB 545kB / 20.9GB

-        GPU: We currently use GPUtil to get the GPU stats on host machine since one GPU is not
+        GPU: We currently use HardwareUtil to get the GPU stats on host machine since one GPU is not
         shared by multiple containers
         (TODO: get the GPU stats inside the container)
         """
@@ -320,47 +321,35 @@ def get_container_perf(self, c_name) -> ContainerMetrics:
                                                round(blk_read_bytes / (1024 * 1024), 1), round(blk_write_bytes / (1024 * 1024), 1))

         # Calculate the gpu usage
-        gpus_stat = self.generate_container_gpu_stats(c_name)
+        gpus_stat = self.generate_container_gpu_stats(container_name=c_name)

         # Record timestamp
         timestamp = stats["read"]

         return ContainerUtils.ContainerMetrics(cpu_percent, mem_gb_used, mem_gb_avail, recv_megabytes, sent_megabytes,
                                                blk_read_bytes, blk_write_bytes, timestamp, gpus_stat)

-    def generate_container_gpu_stats(self, c_name):
-        gpu_ids = self.get_gpu_ids_by_container_name(c_name)
+    def generate_container_gpu_stats(self, container_name):
+        client = self.get_docker_client()
+        gpu_ids = HardwareUtil.get_docker_gpu_ids_by_container_name(container_name=container_name, docker_client=client)
         gpu_stats = self.gpu_stats(gpu_ids)
         return gpu_stats

-    def get_gpu_ids_by_container_name(self, c_name):
-        client = self.get_docker_client()
-        gpu_ids = []
-        try:
-            gpu_ids = client.api.inspect_container(c_name)["HostConfig"]["DeviceRequests"][0]["DeviceIDs"]
-            gpu_ids = list(map(int, gpu_ids))
-        except Exception as e:
-            logging.error(f"Failed to get GPU IDs: {e}")
-            pass
-
-        return gpu_ids
-
     @staticmethod
-    def gpu_stats(gpu_ids):
+    def gpu_stats(gpu_ids: List[int]):
         utilz, memory, temp = None, None, None
         gpu_stats_map = {}  # gpu_id: int -> {"gpu_utilization", "gpu_memory_allocated", "gpu_temp"}
+        gpu_ids = set(gpu_ids)
         try:
-            gpus = getGPUs()
-
-            for i in gpu_ids:
-                gpu = gpus[i]
-                gpu_stats_map[i] = {
-                    "gpu_utilization": gpu.load*100,
-                    "gpu_memory_allocated": gpu.memoryUtil*100,
-                    "gpu_temp": gpu.temperature,
-                    # "gpu_power_usage": pynvml.nvmlDeviceGetPowerUsage(handle) / 1000, # in watts
-                    # "gpu_time_spent_accessing_memory": utilz.memory # in ms
-                }
+            for gpu in HardwareUtil.get_gpus():
+                if gpu.id in gpu_ids:
+                    gpu_stats_map[gpu.id] = {
+                        "gpu_utilization": gpu.load * 100,
+                        "gpu_memory_allocated": gpu.memoryUsed / gpu.memoryTotal * 100,
+                        "gpu_temp": gpu.temperature,
+                        # "gpu_power_usage": pynvml.nvmlDeviceGetPowerUsage(handle) / 1000, # in watts
+                        # "gpu_time_spent_accessing_memory": utilz.memory # in ms
+                    }
         except Exception as e:
             logging.error(f"Failed to get GPU stats: {e}")
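
Note that gpu_memory_allocated is now computed as memoryUsed / memoryTotal * 100 from the vendor-neutral GPUCard fields; for NVIDIA cards this matches GPUtil's memoryUtil * 100, since GPUtil defines memoryUtil as the same ratio. The return shape of gpu_stats(), with illustrative numbers only:

    stats_map = ContainerUtils.gpu_stats([0, 1])
    # e.g. {0: {"gpu_utilization": 37.0, "gpu_memory_allocated": 52.4, "gpu_temp": 61.0},
    #       1: {"gpu_utilization": 0.0, "gpu_memory_allocated": 3.1, "gpu_temp": 42.0}}
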
Empty file.
61 changes: 61 additions & 0 deletions python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py
@@ -0,0 +1,61 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional, List, Dict

from docker import DockerClient


class GPUCardType(Enum):
NVIDIA = auto()
QUALCOMM = auto()
UNKNOWN = auto()

def __str__(self):
return self.name


@dataclass
class GPUCard:
id: int
name: str
driver: str
serial: str
vendor: str
memoryTotal: float
memoryFree: float
memoryUsed: float
memoryUtil: float
load: Optional[float] = 0.0
uuid: Optional[str] = ""
display_mode: Optional[str] = ""
display_active: Optional[str] = ""
temperature: Optional[float] = 0.0


class GPUCardUtil(ABC):

@classmethod
@abstractmethod
def detect_gpu_card_type(cls) -> Optional[GPUCardType]:
raise NotImplementedError

@staticmethod
@abstractmethod
def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]:
raise NotImplementedError

@staticmethod
@abstractmethod
def get_gpu_cards() -> List[GPUCard]:
raise NotImplementedError

@staticmethod
@abstractmethod
def get_docker_gpu_device_mapping(gpu_ids: Optional[List[int]], num_gpus: int = 0) -> Optional[Dict]:
raise NotImplementedError

@staticmethod
@abstractmethod
def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]:
raise NotImplementedError
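
GPUCardUtil is the vendor-neutral contract that the NVIDIA implementation below (and, per the commit list, a Qualcomm one) fills in. hardware_utils.py itself is not part of this diff, so the dispatcher sketched here only illustrates how HardwareUtil might select an implementation; it is not the PR's actual code:

    from typing import List, Optional, Type

    def detect_gpu_util(candidates: List[Type[GPUCardUtil]]) -> Optional[GPUCardUtil]:
        # Ask each vendor util to probe the host (e.g. NvidiaGPUtil shells out
        # to nvidia-smi) and instantiate the first one that recognizes a card.
        for cls in candidates:
            if cls.detect_gpu_card_type() is not None:
                return cls()
        return None
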
65 changes: 65 additions & 0 deletions (new file)
@@ -0,0 +1,65 @@
import logging
import subprocess
from typing import List, Optional, Dict

import docker
from docker import types, DockerClient
from GPUtil import GPUtil, GPU

from fedml.computing.scheduler.comm_utils.gpu_utils.gpu_utils import GPUCard, GPUCardUtil, GPUCardType


class NvidiaGPUtil(GPUCardUtil):

@classmethod
def detect_gpu_card_type(cls) -> Optional[GPUCardType]:
try:
subprocess.check_output(["nvidia-smi"], universal_newlines=True)
return GPUCardType.NVIDIA
except Exception:
return None

@staticmethod
def get_gpu_cards() -> List[GPUCard]:
return [NvidiaGPUtil.__convert(gpu) for gpu in GPUtil.getGPUs()]

@staticmethod
def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]:
return GPUtil.getAvailable(order=order, limit=limit, maxLoad=max_load, maxMemory=max_memory)

@staticmethod
def get_docker_gpu_device_mapping(gpu_ids: List[int], num_gpus: int = 0) -> Optional[Dict]:
if gpu_ids is not None and len(gpu_ids):
gpu_id_list = list(map(lambda x: str(x), gpu_ids))
return {"device_requests": [docker.types.DeviceRequest(device_ids=gpu_id_list, capabilities=[["gpu"]])]}
else:
return {"device_requests": [docker.types.DeviceRequest(count=num_gpus, capabilities=[['gpu']])]}

@staticmethod
def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]:
try:
gpu_ids = docker_client.api.inspect_container(container_name)["HostConfig"]["DeviceRequests"][0]["DeviceIDs"]
return list(map(int, gpu_ids))
except Exception as e:
logging.error(f"Failed to get GPU IDs: {e}")
pass
return []

@staticmethod
def __convert(gpu: GPU) -> GPUCard:
return GPUCard(
id=gpu.id,
name=gpu.name,
driver=gpu.driver,
serial=gpu.serial,
vendor=GPUCardType.NVIDIA.name,
memoryTotal=gpu.memoryTotal,
memoryFree=gpu.memoryFree,
memoryUsed=gpu.memoryUsed,
memoryUtil=gpu.memoryUtil,
load=gpu.load,
uuid=gpu.uuid,
display_mode=gpu.display_mode,
display_active=gpu.display_active,
temperature=gpu.temperature,
)
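
A quick usage sketch of the class above, assuming an NVIDIA host with nvidia-smi on the PATH:

    if NvidiaGPUtil.detect_gpu_card_type() is GPUCardType.NVIDIA:
        for card in NvidiaGPUtil.get_gpu_cards():
            # GPUtil reports memory in megabytes, so memoryUsed/memoryTotal are MB.
            print(card.id, card.name, f"{card.memoryUsed:.0f}/{card.memoryTotal:.0f} MB used")
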