diff --git a/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py b/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py index 667d57c4f4..ea19efb8b6 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py @@ -67,7 +67,9 @@ def __init__(self, master_id, request_json: dict): def calc_total_gpu_num(self): total_gpu_num = 0 for device_id, gpu_num in self.devices_avail_gpus.items(): - total_gpu_num += gpu_num + if type(gpu_num) is not int: + logging.warning(f"The value in gpu_topology should be int, but got {type(gpu_num)}. Try to convert it.") + total_gpu_num += int(gpu_num) return total_gpu_num def init_id_replica_num(self): @@ -77,6 +79,11 @@ def init_id_replica_num(self): """ id_replica_num = {} for id, avail_num in self.devices_avail_gpus.items(): + if type(avail_num) is not int: + logging.warning(f"The value in gpu_topology should be int, " + f"but got {type(avail_num)}. Try to convert it.") + avail_num = int(avail_num) + if avail_num % self.gpu_per_replica != 0: raise ValueError("The number of gpus for each device should be divisible by gpu_per_replica") id_replica_num[str(id)] = avail_num // self.gpu_per_replica