Dev/v0.7.0 #2121

Merged
merged 37 commits on May 21, 2024
Changes from all commits
Commits (37)
3cc9bb2
Add Util classes
alaydshah May 11, 2024
e184286
Rename util files for pattern matching
alaydshah May 11, 2024
8c8c1da
MVP
alaydshah May 11, 2024
6b987ed
Add GPU Type Registry
alaydshah May 11, 2024
436233b
Rolling back GPU Registry change
alaydshah May 11, 2024
502c031
Qualcomm Util -> get gpus
alaydshah May 12, 2024
fba65b2
Qualcomm Util -> get_available_gpu_card_ids
alaydshah May 12, 2024
63c682c
Add sys path in init
alaydshah May 12, 2024
d3c081d
Replace GPUtil with Hardware Util
alaydshah May 12, 2024
8b2ff80
Make fedml env hardware agnostic
alaydshah May 12, 2024
f9aaee9
Nit
alaydshah May 12, 2024
906e8b0
Minor Bug
alaydshah May 12, 2024
3a5daf6
Add Hardware specific docker device mapping
alaydshah May 12, 2024
e57af1e
Bug Fix
alaydshah May 12, 2024
e70f56f
Add util function to get gpu_ids from container name
alaydshah May 12, 2024
33da11e
Make gpu stats fetching hardware agnostic
alaydshah May 12, 2024
2d458a8
Nits
alaydshah May 12, 2024
57c5dc4
Merge pull request #2087 from FedML-AI/alaydshah/gpu_utils/qualcomm
chaoyanghe May 12, 2024
be635db
Update container creation during deployment
alaydshah May 12, 2024
3792255
Merge pull request #2090 from FedML-AI/alaydshah/gpu_utils/qualcomm2
chaoyanghe May 12, 2024
9d5b54f
Nit: Update naming
alaydshah May 13, 2024
2fcf57d
Add check as device_mapping can be None
alaydshah May 13, 2024
2bbe2f9
Merge pull request #2095 from FedML-AI/alexleung/dev_v070_for_refactor
chaoyanghe May 15, 2024
69b3956
Merge pull request #2091 from FedML-AI/alaydshah/gpu_utils/qualcomm3
chaoyanghe May 16, 2024
f4332b1
Merge pull request #2099 from FedML-AI/alexleung/dev_v070_for_refactor
fedml-alex May 16, 2024
056a546
Merge pull request #2107 from FedML-AI/alexleung/dev_v070_for_refactor
fedml-alex May 16, 2024
7c0e08c
Merge pull request #2109 from FedML-AI/alexleung/dev_v070_for_refactor
fedml-alex May 16, 2024
d7f2423
Merge pull request #2111 from FedML-AI/alexleung/dev_v070_for_refactor
fedml-alex May 16, 2024
41e62ac
Merge pull request #2115 from FedML-AI/alexleung/dev_v070_for_refactor
fedml-alex May 17, 2024
c26eaff
Add timestamp in status payload
alaydshah May 17, 2024
43c26e5
Merge pull request #2117 from FedML-AI/alaydshah/run_status/timestamp
alaydshah May 17, 2024
7b576cb
Fix Import
alaydshah May 17, 2024
a34535b
Merge pull request #2118 from FedML-AI/alaydshah/fix/import
alaydshah May 17, 2024
3f40511
Fix run logs cli command
alaydshah May 18, 2024
9a16aa4
Merge pull request #2119 from FedML-AI/alaydshah/run/logs
chaoyanghe May 18, 2024
a0664d8
Add support for GPU Utilization
May 20, 2024
cc7b9c3
Merge pull request #2120 from FedML-AI/alaydshah/qualcomm/add/gpu_uti…
chaoyanghe May 21, 2024
11 changes: 6 additions & 5 deletions python/fedml/api/modules/run.py
@@ -51,7 +51,7 @@ def start(platform: str, create_run_result: FedMLRunStartedModel, device_server:

     run_start_result = FedMLRunManager.get_instance().start_run(platform=platform, create_run_result=create_run_result,
                                                                 device_server=device_server, device_edges=device_edges,
-                                                                api_key=api_key,
+                                                                api_key=get_api_key(),
                                                                 feature_entry_point=feature_entry_point)

     return run_start_result
@@ -79,7 +79,7 @@ def status(run_name: Optional[str], run_id: str, platform: str, api_key: str) ->
     _authenticate_and_validate_platform(api_key, platform)

     run_status = None
-    run_list_obj = list_run(run_name=run_name, run_id=run_id, platform=platform, api_key=api_key)
+    run_list_obj = list_run(run_name=run_name, run_id=run_id, platform=platform, api_key=get_api_key())

     if run_list_obj is not None:
         if len(run_list_obj.run_list) > 1:
@@ -93,12 +93,13 @@ def status(run_name: Optional[str], run_id: str, platform: str, api_key: str) ->
 # input: run_id, page_num, page_size, need_all_logs, platform, api_key
 # return RunLogResult(run_status, total_log_lines, total_log_pages, log_line_list, run_logs)
 def logs(run_id: str, page_num: int, page_size: int, need_all_logs: bool, platform: str, api_key: str) -> RunLogResult:
-    _authenticate_and_validate_platform(api_key, platform)
+    api_key = authenticate(api_key)
+    validate_platform(platform)

     if run_id is None:
         raise Exception("Please specify run id.")

-    _, run_status = status(run_name=None, run_id=run_id, platform=platform, api_key=get_api_key())
+    _, run_status = status(run_name=None, run_id=run_id, platform=platform, api_key=api_key)

     total_log_nums, total_log_pages, log_line_list, run_logs = 0, 0, list(), None

@@ -110,7 +111,7 @@ def logs(run_id: str, page_num: int, page_size: int, need_all_logs: bool, platfo
                                                                      user_api_key=api_key)

     if run_logs is not None:
-        total_log_pages, total_log_nums = run_logs.total_num, run_logs.total_pages
+        total_log_pages, total_log_nums = run_logs.total_pages, run_logs.total_num
         _parse_logs(log_line_list, run_logs)

     return RunLogResult(run_status=run_status, total_log_lines=total_log_nums, total_log_pages=total_log_pages,
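
Two behavioral fixes are visible in this file: logs() now authenticates the caller-supplied api_key instead of silently re-reading the stored key, and the tuple assignment that had swapped total_log_pages and total_log_nums is corrected. A minimal usage sketch of the module-level API after this change; the run id, platform, and key below are placeholders rather than values from this PR, and it assumes RunLogResult exposes its constructor arguments as attributes:

    from fedml.api.modules.run import logs

    # Fetch the second page of ten log lines for a run (hypothetical values).
    result = logs(run_id="12345", page_num=2, page_size=10,
                  need_all_logs=False, platform="falcon", api_key="YOUR_API_KEY")
    print(result.run_status, result.total_log_lines, result.total_log_pages)
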
12 changes: 6 additions & 6 deletions python/fedml/cli/modules/run.py
@@ -184,21 +184,21 @@ def status(platform, run_name, run_id, api_key, version):
     "--page_num",
     "-pn",
     type=int,
-    default=0,
+    default=1,
     help="request page num for logs. --need_all_logs should be set to False if you want to use this option.",
 )
 @click.option(
     "--page_size",
     "-ps",
     type=int,
-    default=0,
+    default=10,
     help="request page size for logs, --need_all_logs should be set to False if you want to use this option.",
 )
 @click.option(
     "--need_all_logs",
     "-a",
     type=bool,
-    default=True,
+    default=False,
     help="boolean value representing if all logs are needed. Default to True",
 )
 def logs(platform, run_id, api_key, version, page_num, page_size, need_all_logs):
@@ -217,8 +217,8 @@ def logs(platform, run_id, api_key, version, page_num, page_size, need_all_logs)
         return

     # Show run log summary info
-    log_head_table = PrettyTable(['Run ID', 'Total Log Lines', 'Log URL'])
-    log_head_table.add_row([run_id, run_log_result.total_log_lines, run_logs.log_full_url])
+    log_head_table = PrettyTable(['Run ID', 'Printed Log Lines', 'Total Log Lines', 'Log URL'])
+    log_head_table.add_row([run_id, len(run_log_result.log_line_list), run_logs.total_num, run_logs.log_full_url])
     click.echo("\nLogs summary info is as follows.")
     print(log_head_table)

@@ -234,7 +234,7 @@ def logs(platform, run_id, api_key, version, page_num, page_size, need_all_logs)
     if len(run_log_result.log_line_list) > 0:
         click.echo("\nAll logs is as follows.")
         for log_line in run_log_result.log_line_list:
-            click.echo(log_line.rstrip('\n'))
+            click.echo(log_line)


 def _print_run_table(run_list_obj):
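
The summary table gains a Printed Log Lines column, so paged output is distinguishable from the run's total. A standalone sketch of the new layout with made-up values; prettytable is already a dependency of this CLI module:

    from prettytable import PrettyTable

    # Mirror of the new summary header, populated with illustrative numbers.
    log_head_table = PrettyTable(['Run ID', 'Printed Log Lines', 'Total Log Lines', 'Log URL'])
    log_head_table.add_row(['12345', 10, 347, 'https://example.com/logs'])
    print(log_head_table)
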
53 changes: 21 additions & 32 deletions python/fedml/computing/scheduler/comm_utils/container_utils.py
@@ -2,16 +2,18 @@
 import os
 import traceback
 import datetime
+from typing import List

 from dateutil.parser import isoparse

 import docker
 from docker import errors

 from fedml.computing.scheduler.comm_utils import sys_utils
+from fedml.computing.scheduler.comm_utils.hardware_utils import HardwareUtil
 from fedml.core.common.singleton import Singleton
 from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants
 import time
-from GPUtil import getGPUs


 class ContainerUtils(Singleton):
@@ -225,9 +227,8 @@ def pull_image_with_policy(self, image_pull_policy, image_name, client=None):
             raise Exception(f"Unsupported image pull policy: {image_pull_policy}")

     class ContainerMetrics:
-        def __init__(self, cpu_percent, mem_used_megabytes, mem_avail_megabytes, network_recv_megabytes, network_sent_megabytes,
-                     blk_read_megabytes, blk_write_megabytes, timestamp, gpus_stat
-                     ):
+        def __init__(self, cpu_percent, mem_used_megabytes, mem_avail_megabytes, network_recv_megabytes,
+                     network_sent_megabytes, blk_read_megabytes, blk_write_megabytes, timestamp, gpus_stat):
             self.cpu_percent = cpu_percent
             self.mem_used_megabytes = mem_used_megabytes
             self.mem_avail_megabytes = mem_avail_megabytes
@@ -252,7 +253,7 @@ def get_container_perf(self, c_name) -> ContainerMetrics:
         CPU % MEM USAGE / LIMIT MEM % NET I/O BLOCK I/O
         0.26% 8.703GiB / 503.5GiB 1.73% 17.4GB / 176MB 545kB / 20.9GB

-        GPU: We currently use GPUtil to get the GPU stats on host machine since one GPU is not
+        GPU: We currently use HardwareUtil to get the GPU stats on host machine since one GPU is not
         shared by multiple containers
         (TODO: get the GPU stats inside the container)
         """
@@ -320,47 +321,35 @@ def get_container_perf(self, c_name) -> ContainerMetrics:
                                                round(blk_read_bytes / (1024 * 1024), 1), round(blk_write_bytes / (1024 * 1024), 1))

         # Calculate the gpu usage
-        gpus_stat = self.generate_container_gpu_stats(c_name)
+        gpus_stat = self.generate_container_gpu_stats(container_name=c_name)

         # Record timestamp
         timestamp = stats["read"]

         return ContainerUtils.ContainerMetrics(cpu_percent, mem_gb_used, mem_gb_avail, recv_megabytes, sent_megabytes,
                                                blk_read_bytes, blk_write_bytes, timestamp, gpus_stat)

-    def generate_container_gpu_stats(self, c_name):
-        gpu_ids = self.get_gpu_ids_by_container_name(c_name)
+    def generate_container_gpu_stats(self, container_name):
+        client = self.get_docker_client()
+        gpu_ids = HardwareUtil.get_docker_gpu_ids_by_container_name(container_name=container_name, docker_client=client)
         gpu_stats = self.gpu_stats(gpu_ids)
         return gpu_stats

-    def get_gpu_ids_by_container_name(self, c_name):
-        client = self.get_docker_client()
-        gpu_ids = []
-        try:
-            gpu_ids = client.api.inspect_container(c_name)["HostConfig"]["DeviceRequests"][0]["DeviceIDs"]
-            gpu_ids = list(map(int, gpu_ids))
-        except Exception as e:
-            logging.error(f"Failed to get GPU IDs: {e}")
-            pass
-
-        return gpu_ids
-
     @staticmethod
-    def gpu_stats(gpu_ids):
+    def gpu_stats(gpu_ids: List[int]):
         utilz, memory, temp = None, None, None
         gpu_stats_map = {}  # gpu_id: int -> {"gpu_utilization", "gpu_memory_allocated", "gpu_temp"}
+        gpu_ids = set(gpu_ids)
         try:
-            gpus = getGPUs()
-
-            for i in gpu_ids:
-                gpu = gpus[i]
-                gpu_stats_map[i] = {
-                    "gpu_utilization": gpu.load*100,
-                    "gpu_memory_allocated": gpu.memoryUtil*100,
-                    "gpu_temp": gpu.temperature,
-                    # "gpu_power_usage": pynvml.nvmlDeviceGetPowerUsage(handle) / 1000, # in watts
-                    # "gpu_time_spent_accessing_memory": utilz.memory # in ms
-                }
+            for gpu in HardwareUtil.get_gpus():
+                if gpu.id in gpu_ids:
+                    gpu_stats_map[gpu.id] = {
+                        "gpu_utilization": gpu.load * 100,
+                        "gpu_memory_allocated": gpu.memoryUsed / gpu.memoryTotal * 100,
+                        "gpu_temp": gpu.temperature,
+                        # "gpu_power_usage": pynvml.nvmlDeviceGetPowerUsage(handle) / 1000, # in watts
+                        # "gpu_time_spent_accessing_memory": utilz.memory # in ms
+                    }
         except Exception as e:
             logging.error(f"Failed to get GPU stats: {e}")
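
Note that gpu_memory_allocated is now computed as memoryUsed / memoryTotal * 100 from the vendor-neutral GPUCard fields; for NVIDIA cards this matches GPUtil's memoryUtil * 100, since GPUtil defines memoryUtil as the same ratio. The return shape of gpu_stats(), with illustrative numbers only:

    stats_map = ContainerUtils.gpu_stats([0, 1])
    # e.g. {0: {"gpu_utilization": 37.0, "gpu_memory_allocated": 52.4, "gpu_temp": 61.0},
    #       1: {"gpu_utilization": 0.0, "gpu_memory_allocated": 3.1, "gpu_temp": 42.0}}
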
Empty file.
61 changes: 61 additions & 0 deletions python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py
@@ -0,0 +1,61 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional, List, Dict

from docker import DockerClient


class GPUCardType(Enum):
NVIDIA = auto()
QUALCOMM = auto()
UNKNOWN = auto()

def __str__(self):
return self.name


@dataclass
class GPUCard:
id: int
name: str
driver: str
serial: str
vendor: str
memoryTotal: float
memoryFree: float
memoryUsed: float
memoryUtil: float
load: Optional[float] = 0.0
uuid: Optional[str] = ""
display_mode: Optional[str] = ""
display_active: Optional[str] = ""
temperature: Optional[float] = 0.0


class GPUCardUtil(ABC):

@classmethod
@abstractmethod
def detect_gpu_card_type(cls) -> Optional[GPUCardType]:
raise NotImplementedError

@staticmethod
@abstractmethod
def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]:
raise NotImplementedError

@staticmethod
@abstractmethod
def get_gpu_cards() -> List[GPUCard]:
raise NotImplementedError

@staticmethod
@abstractmethod
def get_docker_gpu_device_mapping(gpu_ids: Optional[List[int]], num_gpus: int = 0) -> Optional[Dict]:
raise NotImplementedError

@staticmethod
@abstractmethod
def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]:
raise NotImplementedError
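
GPUCardUtil is the vendor-neutral contract that the NVIDIA implementation below (and, per the commit list, a Qualcomm one) fills in. hardware_utils.py itself is not part of this diff, so the dispatcher sketched here only illustrates how HardwareUtil might select an implementation; it is not the PR's actual code:

    from typing import List, Optional, Type

    def detect_gpu_util(candidates: List[Type[GPUCardUtil]]) -> Optional[GPUCardUtil]:
        # Ask each vendor util to probe the host (e.g. NvidiaGPUtil shells out
        # to nvidia-smi) and instantiate the first one that recognizes a card.
        for cls in candidates:
            if cls.detect_gpu_card_type() is not None:
                return cls()
        return None
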
65 changes: 65 additions & 0 deletions (new file)
@@ -0,0 +1,65 @@
import logging
import subprocess
from typing import List, Optional, Dict

import docker
from docker import types, DockerClient
from GPUtil import GPUtil, GPU

from fedml.computing.scheduler.comm_utils.gpu_utils.gpu_utils import GPUCard, GPUCardUtil, GPUCardType


class NvidiaGPUtil(GPUCardUtil):

@classmethod
def detect_gpu_card_type(cls) -> Optional[GPUCardType]:
try:
subprocess.check_output(["nvidia-smi"], universal_newlines=True)
return GPUCardType.NVIDIA
except Exception:
return None

@staticmethod
def get_gpu_cards() -> List[GPUCard]:
return [NvidiaGPUtil.__convert(gpu) for gpu in GPUtil.getGPUs()]

@staticmethod
def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]:
return GPUtil.getAvailable(order=order, limit=limit, maxLoad=max_load, maxMemory=max_memory)

@staticmethod
def get_docker_gpu_device_mapping(gpu_ids: List[int], num_gpus: int = 0) -> Optional[Dict]:
if gpu_ids is not None and len(gpu_ids):
gpu_id_list = list(map(lambda x: str(x), gpu_ids))
return {"device_requests": [docker.types.DeviceRequest(device_ids=gpu_id_list, capabilities=[["gpu"]])]}
else:
return {"device_requests": [docker.types.DeviceRequest(count=num_gpus, capabilities=[['gpu']])]}

@staticmethod
def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]:
try:
gpu_ids = docker_client.api.inspect_container(container_name)["HostConfig"]["DeviceRequests"][0]["DeviceIDs"]
return list(map(int, gpu_ids))
except Exception as e:
logging.error(f"Failed to get GPU IDs: {e}")
pass
return []

@staticmethod
def __convert(gpu: GPU) -> GPUCard:
return GPUCard(
id=gpu.id,
name=gpu.name,
driver=gpu.driver,
serial=gpu.serial,
vendor=GPUCardType.NVIDIA.name,
memoryTotal=gpu.memoryTotal,
memoryFree=gpu.memoryFree,
memoryUsed=gpu.memoryUsed,
memoryUtil=gpu.memoryUtil,
load=gpu.load,
uuid=gpu.uuid,
display_mode=gpu.display_mode,
display_active=gpu.display_active,
temperature=gpu.temperature,
)
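
A quick usage sketch of the class above, assuming an NVIDIA host with nvidia-smi on the PATH:

    if NvidiaGPUtil.detect_gpu_card_type() is GPUCardType.NVIDIA:
        for card in NvidiaGPUtil.get_gpu_cards():
            # GPUtil reports memory in megabytes, so memoryUsed/memoryTotal are MB.
            print(card.id, card.name, f"{card.memoryUsed:.0f}/{card.memoryTotal:.0f} MB used")
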