From 92b7e162b66fa09b22a6af2dcfc22acf46ddf5cb Mon Sep 17 00:00:00 2001
From: Raphael Jin
Date: Tue, 28 May 2024 17:46:52 +0000
Subject: [PATCH] [Deploy] Try to convert the gpu_topology value type to int.

---
 .../model_scheduler/device_replica_controller.py         | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py b/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py
index 667d57c4f4..ea19efb8b6 100644
--- a/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py
+++ b/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py
@@ -67,7 +67,9 @@ def __init__(self, master_id, request_json: dict):
     def calc_total_gpu_num(self):
         total_gpu_num = 0
         for device_id, gpu_num in self.devices_avail_gpus.items():
-            total_gpu_num += gpu_num
+            if type(gpu_num) is not int:
+                logging.warning(f"The value in gpu_topology should be int, but got {type(gpu_num)}. Try to convert it.")
+            total_gpu_num += int(gpu_num)
         return total_gpu_num
 
     def init_id_replica_num(self):
@@ -77,6 +79,11 @@ def init_id_replica_num(self):
         """
         id_replica_num = {}
         for id, avail_num in self.devices_avail_gpus.items():
+            if type(avail_num) is not int:
+                logging.warning(f"The value in gpu_topology should be int, "
+                                f"but got {type(avail_num)}. Try to convert it.")
+                avail_num = int(avail_num)
+
             if avail_num % self.gpu_per_replica != 0:
                 raise ValueError("The number of gpus for each device should be divisible by gpu_per_replica")
             id_replica_num[str(id)] = avail_num // self.gpu_per_replica