From c4a87149e3af296310f0a7ca04cd467e0bc9b06f Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 11 Jun 2024 00:06:14 +0000 Subject: [PATCH 01/38] [Deploy] Report worker's connectivity when it finished. --- .../scheduler/comm_utils/network_util.py | 16 +++++ .../device_client_constants.py | 5 ++ .../model_scheduler/device_model_inference.py | 60 +++++++++++-------- .../model_scheduler/master_job_runner.py | 8 --- .../model_scheduler/worker_job_runner.py | 33 +++++++--- .../scheduler_core/general_constants.py | 16 ++--- 6 files changed, 87 insertions(+), 51 deletions(-) create mode 100644 python/fedml/computing/scheduler/comm_utils/network_util.py diff --git a/python/fedml/computing/scheduler/comm_utils/network_util.py b/python/fedml/computing/scheduler/comm_utils/network_util.py new file mode 100644 index 0000000000..13674840c5 --- /dev/null +++ b/python/fedml/computing/scheduler/comm_utils/network_util.py @@ -0,0 +1,16 @@ +import os +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants + + +def return_this_device_connectivity_type() -> str: + """ + Return -> "http" | "http_proxy" |"mqtt" + """ + if os.environ.get(ClientConstants.ENV_CONNECTION_TYPE_KEY) == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP: + return ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP + elif os.environ.get(ClientConstants.ENV_CONNECTION_TYPE_KEY) == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY: + return ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY + elif os.environ.get(ClientConstants.ENV_CONNECTION_TYPE_KEY) == ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT: + return ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT + else: + return ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index 7894f2c73e..d66c2f966a 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -97,6 +97,11 @@ class ClientConstants(object): INFERENCE_INFERENCE_SERVER_VERSION = "v2" INFERENCE_REQUEST_TIMEOUT = 30 + ENV_CONNECTION_TYPE_KEY = "FEDML_CONNECTION_TYPE" + WORKER_CONNECTIVITY_TYPE_HTTP = "http" + WORKER_CONNECTIVITY_TYPE_HTTP_PROXY = "http_proxy" + WORKER_CONNECTIVITY_TYPE_MQTT = "mqtt" + MSG_MODELOPS_DEPLOYMENT_STATUS_INITIALIZING = "INITIALIZING" MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYING = "DEPLOYING" MSG_MODELOPS_DEPLOYMENT_STATUS_INFERRING = "INFERRING" diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index d073533b72..a9205ceb9a 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -210,7 +210,8 @@ async def _predict( return inference_response # Found idle inference device - idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url = \ + idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url,\ + connectivity_type = \ found_idle_inference_device(in_end_point_id, in_end_point_name, in_model_name, in_model_version) if idle_device is None or idle_device == "": FEDML_MODEL_CACHE.update_pending_requests_counter(end_point_id, decrease=True) @@ -235,13 +236,16 @@ async def _predict( stream_flag = 
input_json.get("stream", False) input_list["stream"] = input_list.get("stream", stream_flag) output_list = input_json.get("outputs", []) + + # main execution of redirecting the inference request to the idle device inference_response = await send_inference_request( idle_device, end_point_id, inference_output_url, input_list, output_list, - inference_type=in_return_type) + inference_type=in_return_type, + connectivity_type=connectivity_type) # Calculate model metrics try: @@ -304,11 +308,12 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ inference_host = "" inference_output_url = "" model_version = "" + connectivity_type = "" + # Found idle device (TODO: optimize the algorithm to search best device for inference) payload, idle_device = FEDML_MODEL_CACHE. \ get_idle_device(end_point_id, end_point_name, in_model_name, in_model_version) if payload is not None: - logging.info("found idle deployment result {}".format(payload)) deployment_result = payload model_name = deployment_result["model_name"] model_version = deployment_result["model_version"] @@ -317,24 +322,25 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ inference_output_url = deployment_result["model_url"] url_parsed = urlparse(inference_output_url) inference_host = url_parsed.hostname + connectivity_type = deployment_result.get("connectivity_type", ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP) else: logging.info("not found idle deployment result") - return idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url + res = (idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url, + connectivity_type) + logging.info(f"found idle device with metrics: {res}") + + return res async def send_inference_request(idle_device, end_point_id, inference_url, input_list, output_list, - inference_type="default", has_public_ip=True): + inference_type="default", + connectivity_type=ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP): request_timeout_sec = FEDML_MODEL_CACHE.get_endpoint_settings(end_point_id) \ .get("request_timeout_sec", ClientConstants.INFERENCE_REQUEST_TIMEOUT) try: - http_infer_available = os.getenv("FEDML_INFERENCE_HTTP_AVAILABLE", True) - if not http_infer_available: - if http_infer_available == "False" or http_infer_available == "false": - http_infer_available = False - - if http_infer_available: + if connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP: response_ok = await FedMLHttpInference.is_inference_ready( inference_url, timeout=request_timeout_sec) @@ -347,22 +353,23 @@ async def send_inference_request(idle_device, end_point_id, inference_url, input timeout=request_timeout_sec) logging.info(f"Use http inference. return {response_ok}") return inference_response - - response_ok = await FedMLHttpProxyInference.is_inference_ready( - inference_url, - timeout=request_timeout_sec) - if response_ok: - response_ok, inference_response = await FedMLHttpProxyInference.run_http_proxy_inference_with_request( - end_point_id, + elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY: + logging.warning("Use http proxy inference.") + response_ok = await FedMLHttpProxyInference.is_inference_ready( inference_url, - input_list, - output_list, - inference_type=inference_type, timeout=request_timeout_sec) - logging.info(f"Use http proxy inference. 
return {response_ok}") - return inference_response - - if not has_public_ip: + if response_ok: + response_ok, inference_response = await FedMLHttpProxyInference.run_http_proxy_inference_with_request( + end_point_id, + inference_url, + input_list, + output_list, + inference_type=inference_type, + timeout=request_timeout_sec) + logging.info(f"Use http proxy inference. return {response_ok}") + return inference_response + elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT: + logging.warning("Use mqtt inference.") agent_config = {"mqtt_config": Settings.mqtt_config} mqtt_inference = FedMLMqttInference( agent_config=agent_config, @@ -385,7 +392,8 @@ async def send_inference_request(idle_device, end_point_id, inference_url, input logging.info(f"Use mqtt inference. return {response_ok}.") return inference_response - return {"error": True, "message": "Failed to use http, http-proxy for inference, no response from replica."} + else: + return {"error": True, "message": "Failed to use http, http-proxy for inference, no response from replica."} except Exception as e: inference_response = {"error": True, "message": f"Exception when using http, http-proxy and mqtt " diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index a10bd2c559..b9b9b4c356 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -250,14 +250,6 @@ def process_deployment_result_message(self, topic=None, payload=None): logging.info(f"Endpoint {end_point_id}; Device {device_id}; replica {replica_no}; " f"run_operation {run_operation} model status {model_status}.") - # OPTIONAL DEBUG PARAMS - # this_run_controller = self.model_runner_mapping[run_id_str].replica_controller - # logging.info(f"The current replica controller state is " - # f"Total version diff num {this_run_controller.total_replica_version_diff_num}") - # logging.info(f"self.request_json now {self.request_json}") # request_json will be deprecated - # this_run_request_json = self.request_json - # logging.info(f"self.request_json now {this_run_request_json}") - # Set redis + sqlite deployment result FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py index 3c357e9dab..9e178228b2 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py @@ -9,6 +9,8 @@ from abc import ABC import yaml from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils +from fedml.computing.scheduler.comm_utils.network_util import return_this_device_connectivity_type + from fedml.core.mlops import MLOpsRuntimeLog from fedml.computing.scheduler.comm_utils import file_utils from .device_client_constants import ClientConstants @@ -234,8 +236,11 @@ def run_impl(self, run_extend_queue_list, sender_message_center, running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \ "", "", model_version, {}, {} + # ip and connectivity + worker_ip = GeneralConstants.get_ip_address(self.request_json) + connectivity = return_this_device_connectivity_type() + if op == "add": - worker_ip = GeneralConstants.get_ip_address(self.request_json) for 
rank in range(prev_rank + 1, prev_rank + 1 + op_num): try: running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \ @@ -269,7 +274,9 @@ def run_impl(self, run_extend_queue_list, sender_message_center, result_payload = self.send_deployment_results( end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, model_id, model_name, inference_output_url, model_version, inference_port_external, - inference_engine, model_metadata, model_config, replica_no=rank + 1) + inference_engine, model_metadata, model_config, replica_no=rank + 1, + connectivity=connectivity + ) if inference_port_external != inference_port: # Save internal port to local db @@ -278,7 +285,9 @@ def run_impl(self, run_extend_queue_list, sender_message_center, result_payload = self.construct_deployment_results( end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, model_id, model_name, inference_output_url, model_version, inference_port, - inference_engine, model_metadata, model_config, replica_no=rank + 1) + inference_engine, model_metadata, model_config, replica_no=rank + 1, + connectivity=connectivity + ) FedMLModelDatabase.get_instance().set_deployment_result( run_id, end_point_name, model_name, model_version, self.edge_id, @@ -326,7 +335,6 @@ def run_impl(self, run_extend_queue_list, sender_message_center, return True elif op == "update" or op == "rollback": # Update is combine of delete and add - worker_ip = GeneralConstants.get_ip_address(self.request_json) for rank in replica_rank_to_update: # Delete a replica (container) if exists self.replica_handler.remove_replica(rank) @@ -402,7 +410,9 @@ def run_impl(self, run_extend_queue_list, sender_message_center, result_payload = self.send_deployment_results( end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, model_id, model_name, inference_output_url, model_version, inference_port_external, - inference_engine, model_metadata, model_config, replica_no=rank + 1) + inference_engine, model_metadata, model_config, replica_no=rank + 1, + connectivity=connectivity + ) if inference_port_external != inference_port: # Save internal port to local db logging.info("inference_port_external {} != inference_port {}".format( @@ -410,7 +420,9 @@ def run_impl(self, run_extend_queue_list, sender_message_center, result_payload = self.construct_deployment_results( end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, model_id, model_name, inference_output_url, model_version, inference_port, - inference_engine, model_metadata, model_config, replica_no=rank + 1) + inference_engine, model_metadata, model_config, replica_no=rank + 1, + connectivity=connectivity + ) FedMLModelDatabase.get_instance().set_deployment_result( run_id, end_point_name, model_name, model_version, self.edge_id, @@ -433,7 +445,8 @@ def run_impl(self, run_extend_queue_list, sender_message_center, def construct_deployment_results(self, end_point_name, device_id, model_status, model_id, model_name, model_inference_url, model_version, inference_port, inference_engine, - model_metadata, model_config, replica_no=1): + model_metadata, model_config, replica_no=1, + connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP): deployment_results_payload = {"end_point_id": self.run_id, "end_point_name": end_point_name, "model_id": model_id, "model_name": model_name, "model_url": model_inference_url, "model_version": model_version, @@ -444,6 +457,7 @@ def 
construct_deployment_results(self, end_point_name, device_id, model_status, "model_status": model_status, "inference_port": inference_port, "replica_no": replica_no, + "connectivity_type": connectivity, } return deployment_results_payload @@ -466,7 +480,8 @@ def construct_deployment_status(self, end_point_name, device_id, def send_deployment_results(self, end_point_name, device_id, model_status, model_id, model_name, model_inference_url, model_version, inference_port, inference_engine, - model_metadata, model_config, replica_no=1): + model_metadata, model_config, replica_no=1, + connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP): deployment_results_topic = "model_device/model_device/return_deployment_result/{}/{}".format( self.run_id, device_id) @@ -474,7 +489,7 @@ def send_deployment_results(self, end_point_name, device_id, model_status, end_point_name, device_id, model_status, model_id, model_name, model_inference_url, model_version, inference_port, inference_engine, - model_metadata, model_config, replica_no=replica_no) + model_metadata, model_config, replica_no=replica_no, connectivity=connectivity) logging.info("[client] send_deployment_results: topic {}, payload {}.".format(deployment_results_topic, deployment_results_payload)) diff --git a/python/fedml/computing/scheduler/scheduler_core/general_constants.py b/python/fedml/computing/scheduler/scheduler_core/general_constants.py index 68c1a8e09d..8c60b17bdf 100755 --- a/python/fedml/computing/scheduler/scheduler_core/general_constants.py +++ b/python/fedml/computing/scheduler/scheduler_core/general_constants.py @@ -192,14 +192,14 @@ def get_public_ip(): @staticmethod def get_ip_address(request_json, infer_host=None): # OPTION 1: Use local ip - ip = GeneralConstants.get_local_ip() - - # OPTION 2: Auto detect public ip - if "parameters" in request_json and \ - GeneralConstants.CONFIG_KEY_AUTO_DETECT_PUBLIC_IP in request_json["parameters"] and \ - request_json["parameters"][GeneralConstants.CONFIG_KEY_AUTO_DETECT_PUBLIC_IP]: - ip = GeneralConstants.get_public_ip() - logging.info("Auto detect public ip for master: " + ip) + # ip = GeneralConstants.get_local_ip() + # + # # OPTION 2: Auto detect public ip + # if "parameters" in request_json and \ + # GeneralConstants.CONFIG_KEY_AUTO_DETECT_PUBLIC_IP in request_json["parameters"] and \ + # request_json["parameters"][GeneralConstants.CONFIG_KEY_AUTO_DETECT_PUBLIC_IP]: + ip = GeneralConstants.get_public_ip() + logging.info("Auto detect public ip for master: " + ip) # OPTION 3: Use user indicated ip if infer_host is not None and infer_host != "127.0.0.1" and infer_host != "localhost": From 4a9622c439f4368a4111490aef8722145825c659 Mon Sep 17 00:00:00 2001 From: fedml-dimitris Date: Tue, 11 Jun 2024 15:53:08 -0400 Subject: [PATCH 02/38] Adding default http connectivity type constant. Fixing minor typos and reducing condition checks. 
--- .../scheduler/comm_utils/network_util.py | 16 +++++++++------- .../device_client_constants.py | 1 + .../model_scheduler/device_model_cache.py | 10 +++++++--- .../model_scheduler/device_model_inference.py | 19 ++++++++++--------- .../model_scheduler/worker_job_runner.py | 4 ++-- 5 files changed, 29 insertions(+), 21 deletions(-) diff --git a/python/fedml/computing/scheduler/comm_utils/network_util.py b/python/fedml/computing/scheduler/comm_utils/network_util.py index 13674840c5..48e478f23f 100644 --- a/python/fedml/computing/scheduler/comm_utils/network_util.py +++ b/python/fedml/computing/scheduler/comm_utils/network_util.py @@ -6,11 +6,13 @@ def return_this_device_connectivity_type() -> str: """ Return -> "http" | "http_proxy" |"mqtt" """ - if os.environ.get(ClientConstants.ENV_CONNECTION_TYPE_KEY) == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP: - return ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP - elif os.environ.get(ClientConstants.ENV_CONNECTION_TYPE_KEY) == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY: - return ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY - elif os.environ.get(ClientConstants.ENV_CONNECTION_TYPE_KEY) == ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT: - return ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT + # Get the environmental variable's value and convert to lower case. + env_conn_type = os.getenv(ClientConstants.ENV_CONNECTION_TYPE_KEY, "").lower() + if env_conn_type in [ + ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP, + ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY, + ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT + ]: + return env_conn_type else: - return ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP + return ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index d66c2f966a..2c06189d2e 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -101,6 +101,7 @@ class ClientConstants(object): WORKER_CONNECTIVITY_TYPE_HTTP = "http" WORKER_CONNECTIVITY_TYPE_HTTP_PROXY = "http_proxy" WORKER_CONNECTIVITY_TYPE_MQTT = "mqtt" + WORKER_CONNECTIVITY_TYPE_DEFAULT = WORKER_CONNECTIVITY_TYPE_HTTP MSG_MODELOPS_DEPLOYMENT_STATUS_INITIALIZING = "INITIALIZING" MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYING = "DEPLOYING" diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py index 30e4f460e6..6c90944277 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py @@ -344,9 +344,13 @@ def get_result_item_info(self, result_item): result_payload = result_item_json["result"] return device_id, replica_no, result_payload - def get_idle_device(self, end_point_id, end_point_name, - model_name, model_version, - check_end_point_status=True, limit_specific_model_version=False): + def get_idle_device(self, + end_point_id, + end_point_name, + model_name, + model_version, + check_end_point_status=True, + limit_specific_model_version=False): # Deprecated the model status logic, query directly from the deployment result list idle_device_list = list() diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py 
b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index a9205ceb9a..3aeec67932 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -313,16 +313,17 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ # Found idle device (TODO: optimize the algorithm to search best device for inference) payload, idle_device = FEDML_MODEL_CACHE. \ get_idle_device(end_point_id, end_point_name, in_model_name, in_model_version) - if payload is not None: - deployment_result = payload - model_name = deployment_result["model_name"] - model_version = deployment_result["model_version"] - model_id = deployment_result["model_id"] - end_point_id = deployment_result["end_point_id"] - inference_output_url = deployment_result["model_url"] + if payload: + model_name = payload["model_name"] + model_version = payload["model_version"] + model_id = payload["model_id"] + end_point_id = payload["end_point_id"] + inference_output_url = payload["model_url"] + connectivity_type = \ + payload.get("connectivity_type", + ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT) url_parsed = urlparse(inference_output_url) inference_host = url_parsed.hostname - connectivity_type = deployment_result.get("connectivity_type", ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP) else: logging.info("not found idle deployment result") @@ -335,7 +336,7 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ async def send_inference_request(idle_device, end_point_id, inference_url, input_list, output_list, inference_type="default", - connectivity_type=ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP): + connectivity_type=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT): request_timeout_sec = FEDML_MODEL_CACHE.get_endpoint_settings(end_point_id) \ .get("request_timeout_sec", ClientConstants.INFERENCE_REQUEST_TIMEOUT) diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py index 9e178228b2..ef65e37904 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py @@ -446,7 +446,7 @@ def construct_deployment_results(self, end_point_name, device_id, model_status, model_id, model_name, model_inference_url, model_version, inference_port, inference_engine, model_metadata, model_config, replica_no=1, - connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP): + connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT): deployment_results_payload = {"end_point_id": self.run_id, "end_point_name": end_point_name, "model_id": model_id, "model_name": model_name, "model_url": model_inference_url, "model_version": model_version, @@ -481,7 +481,7 @@ def send_deployment_results(self, end_point_name, device_id, model_status, model_id, model_name, model_inference_url, model_version, inference_port, inference_engine, model_metadata, model_config, replica_no=1, - connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP): + connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT): deployment_results_topic = "model_device/model_device/return_deployment_result/{}/{}".format( self.run_id, device_id) From 23d88fc7dcfdbe9f9b319a08b72b39f0c58fdbb3 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 11 Jun 2024 11:48:20 -0700 Subject: [PATCH 03/38] [Deploy] Remove unnecessary 
logic. --- .../device_model_deployment.py | 232 +----------------- .../model_scheduler/master_job_runner.py | 1 - .../model_scheduler/worker_job_runner.py | 16 +- 3 files changed, 10 insertions(+), 239 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 1876373d25..5d3ba9873d 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -1,12 +1,13 @@ +import fedml + import logging import os -import pickle -import platform import shutil import time import traceback import yaml import datetime +import docker import requests import torch @@ -15,27 +16,18 @@ import collections.abc -import fedml from fedml.computing.scheduler.comm_utils import sys_utils, security_utils -from fedml.computing.scheduler.comm_utils.container_utils import ContainerUtils from fedml.computing.scheduler.comm_utils.hardware_utils import HardwareUtil from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils - -for type_name in collections.abc.__all__: - setattr(collections, type_name, getattr(collections.abc, type_name)) - from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants -import io - -import docker -from ..scheduler_core.compute_cache_manager import ComputeCacheManager +from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache from ..scheduler_core.compute_utils import ComputeUtils from ..comm_utils.container_utils import ContainerUtils - from .device_http_inference_protocol import FedMLHttpInference -from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache +for type_name in collections.abc.__all__: + setattr(collections, type_name, getattr(collections.abc, type_name)) no_real_gpu_allocation = None @@ -432,8 +424,6 @@ def should_exit_logs(end_point_id, model_id, cmd_type, model_name, inference_eng if cmd_type == ClientConstants.CMD_TYPE_RUN_DEFAULT_SERVER: # TODO: Exited Quickly if the container is Exited or Removed # If the container has exited, return True, means we should exit the logs - # container_name = "{}".format(ClientConstants.FEDML_DEFAULT_SERVER_CONTAINER_NAME_PREFIX) + "__" + \ - # security_utils.get_content_hash(model_name) try: inference_output_url, model_version, model_metadata, model_config = \ get_model_info(model_name, inference_engine, inference_port, infer_host, @@ -554,8 +544,6 @@ def log_deployment_result(end_point_id, model_id, cmd_container_name, cmd_type, def is_client_inference_container_ready(infer_url_host, inference_http_port, inference_model_name, local_infer_url, inference_type="default", model_version="", request_input_example=None): - # logging.info(f"Inference type: {inference_type}, infer_url_host {infer_url_host}, \ - # inference_http_port: {inference_http_port}, local_infer_url {local_infer_url}") if inference_type == "default": default_client_container_ready_url = "http://{}:{}/ready".format("0.0.0.0", inference_http_port) @@ -631,211 +619,5 @@ def run_http_inference_with_curl_request(inference_url, inference_input_list, in inference_type=inference_type, engine_type=engine_type, timeout=timeout) -def convert_model_to_onnx( - torch_model, output_path: str, dummy_input_list, input_size: int, input_is_tensor=True -) -> None: - from collections 
import OrderedDict - import torch - from torch.onnx import TrainingMode - - torch.onnx.export(torch_model, # model being run - dummy_input_list if input_is_tensor else tuple(dummy_input_list), - # model input (or a tuple for multiple inputs) - f=output_path, # where to save the model (can be a file or file-like object) - export_params=True, # store the trained parameter weights inside the model file - opset_version=11, # the ONNX version to export the model to - do_constant_folding=False, # whether to execute constant folding for optimization - input_names=["input1", "input2"], - # the model's input names - output_names=['output'], # the model's output names - training=TrainingMode.EVAL, - verbose=True, - dynamic_axes={"input1": {0: "batch_size"}, - "input2": {0: "batch_size"}, - "output": {0: "batch_size"}} - ) - - -def test_start_triton_server(model_serving_dir): - sudo_prefix = "sudo " - sys_name = platform.system() - if sys_name == "Darwin": - sudo_prefix = "" - gpu_attach_cmd = "" - - triton_server_container_name = "{}".format(ClientConstants.FEDML_TRITON_SERVER_CONTAINER_NAME_PREFIX) - triton_server_cmd = "{}docker stop {}; {}docker rm {}; {}docker run --name {} {} -p{}:8000 " \ - "-p{}:8001 -p{}:8002 " \ - "--shm-size {} " \ - "-v {}:/models {} " \ - "bash -c \"pip install transformers && tritonserver --strict-model-config=false " \ - "--model-control-mode=poll --repository-poll-secs={} " \ - "--model-repository=/models\" ".format(sudo_prefix, triton_server_container_name, - sudo_prefix, triton_server_container_name, - sudo_prefix, triton_server_container_name, - gpu_attach_cmd, - ClientConstants.INFERENCE_HTTP_PORT, - ClientConstants.INFERENCE_GRPC_PORT, - 8002, - "4096m", - model_serving_dir, - ClientConstants.INFERENCE_SERVER_IMAGE, - ClientConstants.FEDML_MODEL_SERVING_REPO_SCAN_INTERVAL) - logging.info("Run triton inference server: {}".format(triton_server_cmd)) - triton_server_process = ClientConstants.exec_console_with_script(triton_server_cmd, - should_capture_stdout=False, - should_capture_stderr=False, - no_sys_out_err=True) - - -def test_convert_pytorch_model_to_onnx(model_net_file, model_bin_file, model_name, model_in_params): - torch_model = torch.jit.load(model_net_file) - with open(model_bin_file, 'rb') as model_pkl_file: - model_state_dict = pickle.load(model_pkl_file) - torch_model.load_state_dict(model_state_dict) - torch_model.eval() - - input_size = model_in_params["input_size"] - input_types = model_in_params["input_types"] - - dummy_input_list = [] - for index, input_i in enumerate(input_size): - if input_types[index] == "int": - this_input = torch.tensor(torch.randint(0, 1, input_i)) - else: - this_input = torch.tensor(torch.zeros(input_i)) - dummy_input_list.append(this_input) - - onnx_model_dir = os.path.join(ClientConstants.get_model_cache_dir(), - ClientConstants.FEDML_CONVERTED_MODEL_DIR_NAME, - model_name, ClientConstants.INFERENCE_MODEL_VERSION) - if not os.path.exists(onnx_model_dir): - os.makedirs(onnx_model_dir, exist_ok=True) - onnx_model_path = os.path.join(onnx_model_dir, "model.onnx") - - convert_model_to_onnx(torch_model, onnx_model_path, dummy_input_list, input_size, - input_is_tensor=True) - - model_serving_dir = os.path.join(ClientConstants.get_model_cache_dir(), - ClientConstants.FEDML_CONVERTED_MODEL_DIR_NAME) - return model_serving_dir - - -def start_gpu_model_load_process(): - from multiprocessing import Process - import time - process = Process(target=load_gpu_model_to_cpu_device) - process.start() - while True: - time.sleep(1) - - -def 
load_gpu_model_to_cpu_device(): - import pickle - import io - import torch - - class CPU_Unpickler(pickle.Unpickler): - def find_class(self, module, name): - if module == 'torch.storage' and name == '_load_from_bytes': - return lambda b: torch.load(io.BytesIO(b), map_location='cpu') - else: - return super().find_class(module, name) - - model_file = "/home/fedml/.fedml/fedml-client/fedml/models/theta_rec_auc_81_single_label/theta_rec_auc_81_single_label" - with open(model_file, "rb") as model_pkl_file: - if not torch.cuda.is_available(): - model = CPU_Unpickler(model_pkl_file).load() - if model is None: - print("Failed to load gpu model to cpu device") - else: - print("Succeeded to load gpu model to cpu device") - - if __name__ == "__main__": - start_gpu_model_load_process() - - model_serving_dir = test_convert_pytorch_model_to_onnx("./sample-open-training-model-net", - "./sample-open-training-model", - "rec-model", - {"input_size": [[1, 24], [1, 2]], - "input_types": ["int", "float"]}) - - test_start_triton_server(model_serving_dir) - - # input_data = {"model_version": "v0-Sun Feb 05 12:17:16 GMT 2023", - # "model_name": "model_414_45_open-model-test_v0-Sun-Feb-05-12-17-16-GMT-2023", - # # "data": "file:///Users/alexliang/fedml_data/mnist-image.png", - # "data": "https://raw.githubusercontent.com/niyazed/triton-mnist-example/master/images/sample_image.png", - # "end_point_id": 414, "model_id": 45, "token": "a09a18a14c4c4d89a8d5f9515704c073"} - # - # data_list = list() - # data_list.append(input_data["data"]) - # run_http_inference_with_lib_http_api_with_image_data(input_data["model_name"], - # 5001, 1, data_list, "") - # - # - # class LogisticRegression(torch.nn.Module): - # def __init__(self, input_dim, output_dim): - # super(LogisticRegression, self).__init__() - # self.linear = torch.nn.Linear(input_dim, output_dim) - # - # def forward(self, x): - # outputs = torch.sigmoid(self.linear(x)) - # return outputs - # - # - # model = LogisticRegression(28 * 28, 10) - # checkpoint = {'model': model} - # model_net_file = "/Users/alexliang/fedml-client/fedml/models/open-model-test/model-net.pt" - # torch.save(checkpoint, model_net_file) - # - # with open("/Users/alexliang/fedml-client/fedml/models/open-model-test/open-model-test", 'rb') as model_pkl_file: - # model_params = pickle.load(model_pkl_file) - # # torch.save(model_params, "/Users/alexliang/fedml-client/fedml/models/open-model-test/a.pt") - # # model = torch.load("/Users/alexliang/fedml-client/fedml/models/open-model-test/a.pt") - # loaded_checkpoint = torch.load(model_net_file) - # loaded_model = loaded_checkpoint["model"] - # loaded_model.load_state_dict(model_params) - # for parameter in loaded_model.parameters(): - # parameter.requires_grad = False - # loaded_model.eval() - # input_names = {"x": 0} - # convert_model_to_onnx(loaded_model, "/Users/alexliang/fedml-client/fedml/models/open-model-test/a.onnx", - # input_names, 28 * 28) - - # parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - # parser.add_argument("--cf", "-c", help="config file") - # parser.add_argument("--role", "-r", type=str, default="client", help="role") - # parser.add_argument("--model_storage_local_path", "-url", type=str, default="/home/ubuntu", - # help="model storage local path") - # parser.add_argument("--inference_model_name", "-n", type=str, default="fedml-model", - # help="inference model name") - # parser.add_argument("--inference_engine", "-engine", type=str, default="ONNX", help="inference engine") - # 
parser.add_argument("--inference_http_port", "-http", type=int, default=8000, help="inference http port") - # parser.add_argument("--inference_grpc_port", "-gprc", type=int, default=8001, help="inference grpc port") - # parser.add_argument("--inference_metric_port", "-metric", type=int, default=8002, help="inference metric port") - # parser.add_argument("--inference_use_gpu", "-gpu", type=str, default="gpu", help="inference use gpu") - # parser.add_argument("--inference_memory_size", "-mem", type=str, default="256m", help="inference memory size") - # parser.add_argument("--inference_convertor_image", "-convertor", type=str, - # default=ClientConstants.INFERENCE_CONVERTOR_IMAGE, help="inference convertor image") - # parser.add_argument("--inference_server_image", "-server", type=str, - # default=ClientConstants.INFERENCE_SERVER_IMAGE, help="inference server image") - # args = parser.parse_args() - # args.user = args.user - # - # pip_source_dir = os.path.dirname(__file__) - # __running_model_name, __inference_output_url, __model_version, __model_metadata, __model_config = \ - # start_deployment( - # args.model_storage_local_path, - # args.inference_model_name, - # args.inference_engine, - # args.inference_http_port, - # args.inference_grpc_port, - # args.inference_metric_port, - # args.inference_use_gpu, - # args.inference_memory_size, - # args.inference_convertor_image, - # args.inference_server_image) - # print("Model deployment results, running model name: {}, url: {}, model metadata: {}, model config: {}".format( - # __running_model_name, __inference_output_url, __model_metadata, __model_config)) + pass diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index b9b9b4c356..ef2c01c49d 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -453,7 +453,6 @@ def process_deployment_result_message(self, topic=None, payload=None): time.sleep(3) self.trigger_completed_event() - def cleanup_runner_process(self, run_id): ServerConstants.cleanup_run_process(run_id, not_kill_subprocess=True) diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py index ef65e37904..8100707386 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py @@ -294,9 +294,7 @@ def run_impl(self, run_extend_queue_list, sender_message_center, json.dumps(result_payload), replica_no=rank + 1) logging.info(f"Deploy replica {rank + 1} / {prev_rank + 1 + op_num} successfully.") - time.sleep(5) - time.sleep(1) self.status_reporter.run_id = self.run_id self.status_reporter.report_client_id_status( self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, @@ -348,7 +346,8 @@ def run_impl(self, run_extend_queue_list, sender_message_center, # TODO (Raphael) check if this will allow another job to seize the gpu during high concurrency: try: - JobRunnerUtils.get_instance().release_partial_job_gpu(run_id, self.edge_id, replica_occupied_gpu_ids) + JobRunnerUtils.get_instance().release_partial_job_gpu( + run_id, self.edge_id, replica_occupied_gpu_ids) except Exception as e: if op == "rollback": pass @@ -395,7 +394,7 @@ def run_impl(self, run_extend_queue_list, sender_message_center, 
JobRunnerUtils.get_instance().release_partial_job_gpu( run_id, self.edge_id, replica_occupied_gpu_ids) - result_payload = self.send_deployment_results( + self.send_deployment_results( end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED, model_id, model_name, inference_output_url, inference_model_version, inference_port, inference_engine, model_metadata, model_config) @@ -496,15 +495,6 @@ def send_deployment_results(self, end_point_name, device_id, model_status, self.message_center.send_message_json(deployment_results_topic, json.dumps(deployment_results_payload)) return deployment_results_payload - def send_deployment_status(self, end_point_name, device_id, - model_id, model_name, model_version, - model_inference_url, model_status, - inference_port=ClientConstants.MODEL_INFERENCE_DEFAULT_PORT, - replica_no=1, # start from 1 - ): - # Deprecated - pass - def reset_devices_status(self, edge_id, status): self.status_reporter.run_id = self.run_id self.status_reporter.edge_id = edge_id From e0ad9b5bef5bcea1eaefe3458a3d6b49aa399d46 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 11 Jun 2024 12:15:22 -0700 Subject: [PATCH 04/38] [Deploy] Remove unnecessary logic; Rename readiness check function; Forbidden user level control of host post. --- .../device_model_deployment.py | 150 +++++------------- 1 file changed, 40 insertions(+), 110 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 5d3ba9873d..edd2ebea9a 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -68,6 +68,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, num_gpus = gpu_per_replica gpu_ids, gpu_attach_cmd = None, "" + # Concatenate the model name running_model_name = ClientConstants.get_running_model_name( end_point_name, inference_model_name, model_version, end_point_id, model_id, edge_id=edge_id) @@ -77,6 +78,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, config = yaml.safe_load(file) # Resource related + inference_type = "default" use_gpu = config.get('use_gpu', True) num_gpus_frm_yml = config.get('num_gpus', None) if not use_gpu: @@ -85,9 +87,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, if num_gpus_frm_yml is not None: num_gpus = int(num_gpus_frm_yml) usr_indicated_wait_time = config.get('deploy_timeout', 900) - usr_indicated_worker_port = config.get('worker_port', "") - if usr_indicated_worker_port == "": - usr_indicated_worker_port = os.environ.get("FEDML_WORKER_PORT", "") + usr_indicated_retry_cnt = max(int(usr_indicated_wait_time) // 10, 1) shm_size = config.get('shm_size', None) storage_opt = config.get('storage_opt', None) tmpfs = config.get('tmpfs', None) @@ -96,17 +96,6 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, cpus = int(cpus) memory = config.get('memory', None) - if usr_indicated_worker_port == "": - usr_indicated_worker_port = None - else: - usr_indicated_worker_port = int(usr_indicated_worker_port) - - worker_port_env = os.environ.get("FEDML_WORKER_PORT", "") - worker_port_from_config = config.get('worker_port', "") - logging.info(f"usr_indicated_worker_port {usr_indicated_worker_port}, worker port env {worker_port_env}, " - f"worker port from config {worker_port_from_config}") - - 
usr_indicated_retry_cnt = max(int(usr_indicated_wait_time) // 10, 1) inference_image_name = config.get('inference_image_name', ClientConstants.INFERENCE_SERVER_CUSTOME_IMAGE) image_pull_policy = config.get('image_pull_policy', SchedulerConstants.IMAGE_PULL_POLICY_IF_NOT_PRESENT) @@ -144,6 +133,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, # If using customized image, then bootstrap + job will be the entry point enable_custom_image = config.get("enable_custom_image", False) + # inference_type = "custom" customized_image_entry_cmd = \ "/bin/bash /home/fedml/models_serving/fedml-deploy-bootstrap-entry-auto-gen.sh" @@ -151,18 +141,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, docker_registry_user_password = config.get("docker_registry_user_password", "") docker_registry = config.get("docker_registry", "") - port_inside_container = int(config.get("port_inside_container", 2345)) - use_triton = config.get("use_triton", False) - if use_triton: - inference_type = "triton" - else: - inference_type = "default" - - # Config check - if src_code_dir == "": - raise Exception("Please indicate source_code_dir in the fedml_model_config.yaml") - if relative_entry == "": - logging.warning("You missed main_entry in the fedml_model_config.yaml") + port_inside_container = int(config.get("port", 2345)) # Request the GPU ids for the deployment if num_gpus > 0: @@ -175,22 +154,10 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, end_point_id, end_point_name, inference_model_name, edge_id, replica_rank+1, gpu_ids) logging.info("GPU ids allocated: {}".format(gpu_ids)) + # Create the model serving dir if not exists model_serving_dir = ClientConstants.get_model_serving_dir() if not os.path.exists(model_serving_dir): os.makedirs(model_serving_dir, exist_ok=True) - converted_model_path = os.path.join(model_storage_local_path, ClientConstants.FEDML_CONVERTED_MODEL_DIR_NAME) - if os.path.exists(converted_model_path): - model_file_list = os.listdir(converted_model_path) - for model_file in model_file_list: - src_model_file = os.path.join(converted_model_path, model_file) - dst_model_file = os.path.join(model_serving_dir, model_file) - if os.path.isdir(src_model_file): - if not os.path.exists(dst_model_file): - shutil.copytree(src_model_file, dst_model_file, copy_function=shutil.copy, - ignore_dangling_symlinks=True) - else: - if not os.path.exists(dst_model_file): - shutil.copyfile(src_model_file, dst_model_file) if inference_engine != ClientConstants.INFERENCE_ENGINE_TYPE_INT_DEFAULT: raise Exception(f"inference engine {inference_engine} is not supported") @@ -228,13 +195,12 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, logging.info(f"Start pulling the inference image {inference_image_name}... 
with policy {image_pull_policy}") ContainerUtils.get_instance().pull_image_with_policy(image_pull_policy, inference_image_name) - volumns = [] + volumes = [] binds = {} environment = {} # data_cache_dir mounting - assert type(data_cache_dir_input) == dict or type(data_cache_dir_input) == str - if type(data_cache_dir_input) == str: + if isinstance(data_cache_dir_input, str): # In this case, we mount to the same folder, if it has ~, we replace it with /home/fedml src_data_cache_dir, dst_data_cache_dir = "", "" if data_cache_dir_input != "": @@ -253,28 +219,30 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, if type(src_data_cache_dir) == str and src_data_cache_dir != "": logging.info("Start copying the data cache to the container...") if os.path.exists(src_data_cache_dir): - volumns.append(src_data_cache_dir) + volumes.append(src_data_cache_dir) binds[src_data_cache_dir] = { "bind": dst_data_cache_dir, "mode": "rw" } environment["DATA_CACHE_FOLDER"] = dst_data_cache_dir - else: + elif isinstance(data_cache_dir_input, dict): for k, v in data_cache_dir_input.items(): if os.path.exists(k): - volumns.append(v) + volumes.append(v) binds[k] = { "bind": v, "mode": "rw" } else: logging.warning(f"{k} does not exist, skip mounting it to the container") - logging.info(f"Data cache mount: {volumns}, {binds}") + logging.info(f"Data cache mount: {volumes}, {binds}") + else: + logging.warning("data_cache_dir_input is not a string or a dictionary, skip mounting it to the container") # Default mounting if not enable_custom_image or (enable_custom_image and relative_entry != ""): logging.info("Start copying the source code to the container...") - volumns.append(src_code_dir) + volumes.append(src_code_dir) binds[src_code_dir] = { "bind": dst_model_serving_dir, "mode": "rw" @@ -284,7 +252,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, host_config_dict = { "binds": binds, "port_bindings": { - port_inside_container: usr_indicated_worker_port + port_inside_container: None }, "shm_size": shm_size, "storage_opt": storage_opt, @@ -312,7 +280,6 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, if not enable_custom_image: # For some image, the default user is root. Unified to fedml. 
environment["HOME"] = "/home/fedml" - environment["BOOTSTRAP_DIR"] = dst_bootstrap_dir environment["FEDML_CURRENT_RUN_ID"] = end_point_id environment["FEDML_CURRENT_EDGE_ID"] = edge_id @@ -326,12 +293,13 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, for key in extra_envs: environment[key] = extra_envs[key] + # Create the container try: host_config = client.api.create_host_config(**host_config_dict) new_container = client.api.create_container( image=inference_image_name, name=default_server_container_name, - volumes=volumns, + volumes=volumes, ports=[port_inside_container], # port open inside the container environment=environment, host_config=host_config, @@ -349,22 +317,18 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, while True: cnt += 1 try: - if usr_indicated_worker_port is not None: - inference_http_port = usr_indicated_worker_port - break - else: - # Find the random port - port_info = client.api.port(new_container.get("Id"), port_inside_container) - inference_http_port = port_info[0]["HostPort"] - logging.info("inference_http_port: {}".format(inference_http_port)) - break + # Find the random port + port_info = client.api.port(new_container.get("Id"), port_inside_container) + inference_http_port = port_info[0]["HostPort"] + logging.info("host port allocated: {}".format(inference_http_port)) + break except: if cnt >= 5: raise Exception("Failed to get the port allocation") time.sleep(3) # Logging the info from the container when starting - log_deployment_result(end_point_id, model_id, default_server_container_name, + log_deployment_output(end_point_id, model_id, default_server_container_name, ClientConstants.CMD_TYPE_RUN_DEFAULT_SERVER, inference_model_name, inference_engine, inference_http_port, inference_type, retry_interval=10, deploy_attempt_threshold=usr_indicated_retry_cnt, @@ -373,9 +337,8 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, # Return the running model name and the inference output url inference_output_url, running_model_version, ret_model_metadata, ret_model_config = \ - get_model_info(inference_model_name, inference_engine, inference_http_port, - infer_host, False, inference_type, request_input_example=request_input_example, - enable_custom_image=enable_custom_image) + check_container_readiness(inference_http_port=inference_http_port, infer_host=infer_host, + request_input_example=request_input_example) if inference_output_url == "": return running_model_name, "", None, None, None @@ -426,9 +389,8 @@ def should_exit_logs(end_point_id, model_id, cmd_type, model_name, inference_eng # If the container has exited, return True, means we should exit the logs try: inference_output_url, model_version, model_metadata, model_config = \ - get_model_info(model_name, inference_engine, inference_port, infer_host, - inference_type=inference_type, request_input_example=request_input_example, - enable_custom_image=enable_custom_image) + check_container_readiness(inference_http_port=inference_port, infer_host=infer_host, + request_input_example=request_input_example) if inference_output_url != "": logging.info("Log test for deploying model successfully, inference url: {}, " "model metadata: {}, model config: {}". 
@@ -443,7 +405,7 @@ def should_exit_logs(end_point_id, model_id, cmd_type, model_name, inference_eng return False -def log_deployment_result(end_point_id, model_id, cmd_container_name, cmd_type, +def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type, inference_model_name, inference_engine, inference_http_port, inference_type="default", retry_interval=10, deploy_attempt_threshold=10, @@ -542,10 +504,10 @@ def log_deployment_result(end_point_id, model_id, cmd_container_name, cmd_type, time.sleep(retry_interval) -def is_client_inference_container_ready(infer_url_host, inference_http_port, inference_model_name, local_infer_url, - inference_type="default", model_version="", request_input_example=None): +def is_client_inference_container_ready(infer_url_host, inference_http_port, readiness_check_type="default", + readiness_check_cmd=None, request_input_example=None): - if inference_type == "default": + if readiness_check_type == "default": default_client_container_ready_url = "http://{}:{}/ready".format("0.0.0.0", inference_http_port) response = None try: @@ -555,7 +517,7 @@ def is_client_inference_container_ready(infer_url_host, inference_http_port, inf if not response or response.status_code != 200: return "", "", {}, {} - # Report the deployed model info + # Construct the model metadata (input and output) model_metadata = {} if request_input_example is not None and len(request_input_example) > 0: model_metadata["inputs"] = request_input_example @@ -563,51 +525,19 @@ def is_client_inference_container_ready(infer_url_host, inference_http_port, inf model_metadata["inputs"] = {"text": "What is a good cure for hiccups?"} model_metadata["outputs"] = [] model_metadata["type"] = "default" + return "http://{}:{}/predict".format(infer_url_host, inference_http_port), None, model_metadata, None else: - triton_server_url = "{}:{}".format(infer_url_host, inference_http_port) - if model_version == "" or model_version is None: - model_version = ClientConstants.INFERENCE_MODEL_VERSION - logging.info( - f"triton_server_url: {triton_server_url} model_version: {model_version} model_name: {inference_model_name}") - triton_client = http_client.InferenceServerClient(url=triton_server_url, verbose=False) - if not triton_client.is_model_ready( - model_name=inference_model_name, model_version=model_version - ): - return "", model_version, {}, {} - logging.info(f"Model {inference_model_name} is ready, start to get model metadata...") - model_metadata = triton_client.get_model_metadata(model_name=inference_model_name, model_version=model_version) - model_config = triton_client.get_model_config(model_name=inference_model_name, model_version=model_version) - version_list = model_metadata.get("versions", None) - if version_list is not None and len(version_list) > 0: - model_version = version_list[0] - else: - model_version = ClientConstants.INFERENCE_MODEL_VERSION - - inference_output_url = "http://{}:{}/{}/models/{}/versions/{}/infer".format(infer_url_host, - inference_http_port, - ClientConstants.INFERENCE_INFERENCE_SERVER_VERSION, - inference_model_name, - model_version) - - return inference_output_url, model_version, model_metadata, model_config - - -def get_model_info(model_name, inference_engine, inference_http_port, infer_host="127.0.0.1", is_hg_model=False, - inference_type="default", request_input_example=None, enable_custom_image=False): - if model_name is None: + # TODO(Raphael): Support arbitrary readiness check command + logging.error(f"Unknown readiness check type: 
{readiness_check_type}") return "", "", {}, {} - local_infer_url = "{}:{}".format(infer_host, inference_http_port) - - if is_hg_model: - inference_model_name = "{}_{}_inference".format(model_name, str(inference_engine)) - else: - inference_model_name = model_name +def check_container_readiness(inference_http_port, infer_host="127.0.0.1", request_input_example=None, + readiness_check_type="default", readiness_check_cmd=None): response_from_client_container = is_client_inference_container_ready( - infer_host, inference_http_port, inference_model_name, local_infer_url, - inference_type, model_version="", request_input_example=request_input_example) + infer_host, inference_http_port, readiness_check_type, readiness_check_cmd, + request_input_example=request_input_example) return response_from_client_container From 64e8c779c61edfecf7ca8e638b6b54ff31d7983b Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 11 Jun 2024 16:29:37 -0700 Subject: [PATCH 05/38] [Deploy] Nit --- .../computing/scheduler/model_scheduler/device_model_cards.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cards.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cards.py index 8feb757a63..c2f11a2917 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cards.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cards.py @@ -14,7 +14,6 @@ from fedml.core.common.singleton import Singleton from fedml.computing.scheduler.model_scheduler.modelops_configs import ModelOpsConfigs -from fedml.computing.scheduler.model_scheduler.device_model_deployment import get_model_info from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants from fedml.computing.scheduler.model_scheduler.device_model_object import FedMLModelList, FedMLEndpointDetail from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants From 9194f8424f77008b49a48908ee72f19fe59ba23d Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 11 Jun 2024 16:42:46 -0700 Subject: [PATCH 06/38] [Deploy] Hide unnecessary log. 
--- .../scheduler/model_scheduler/device_model_cache.py | 8 ++++---- .../scheduler/model_scheduler/device_model_inference.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py index 6c90944277..c941c42102 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py @@ -369,7 +369,7 @@ def get_idle_device(self, if "model_status" in result_payload and result_payload["model_status"] == "DEPLOYED": idle_device_list.append({"device_id": device_id, "end_point_id": end_point_id}) - logging.info(f"{len(idle_device_list)} devices this model has on it: {idle_device_list}") + logging.debug(f"{len(idle_device_list)} devices this model has on it: {idle_device_list}") if len(idle_device_list) <= 0: return None, None @@ -398,7 +398,7 @@ def get_idle_device(self, logging.info("Inference Device selection Failed:") logging.info(e) - logging.info(f"Using Round Robin, the device index is {selected_device_index}") + logging.debug(f"Using Round Robin, the device index is {selected_device_index}") idle_device_dict = idle_device_list[selected_device_index] # Note that within the same endpoint_id, there could be one device with multiple same models @@ -411,7 +411,7 @@ def get_idle_device(self, # Find deployment result from the target idle device. try: for result_item in result_list: - logging.info("enter the for loop") + logging.debug("enter the for loop") device_id, _, result_payload = self.get_result_item_info(result_item) found_end_point_id = result_payload["end_point_id"] found_end_point_name = result_payload["end_point_name"] @@ -425,7 +425,7 @@ def get_idle_device(self, if same_model_device_rank > 0: same_model_device_rank -= 1 continue - logging.info(f"The chosen device is {device_id}") + logging.debug(f"The chosen device is {device_id}") return result_payload, device_id except Exception as e: logging.info(str(e)) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index 3aeec67932..ba13006245 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -230,7 +230,7 @@ async def _predict( model_metrics.set_start_time(start_time) # Send inference request to idle device - logging.info("inference url {}.".format(inference_output_url)) + logging.debug("inference url {}.".format(inference_output_url)) if inference_output_url != "": input_list = input_json.get("inputs", input_json) stream_flag = input_json.get("stream", False) @@ -329,7 +329,7 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ res = (idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url, connectivity_type) - logging.info(f"found idle device with metrics: {res}") + logging.debug(f"found idle device with metrics: {res}") return res @@ -352,7 +352,7 @@ async def send_inference_request(idle_device, end_point_id, inference_url, input output_list, inference_type=inference_type, timeout=request_timeout_sec) - logging.info(f"Use http inference. return {response_ok}") + logging.debug(f"Use http inference. 
return {response_ok}") return inference_response elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY: logging.warning("Use http proxy inference.") From 243be07831c7ffd078203f402efae339ed0b58a3 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 11 Jun 2024 17:50:30 -0700 Subject: [PATCH 07/38] [Deploy] Read port info from env. --- .../scheduler/model_scheduler/device_client_constants.py | 1 + .../scheduler/model_scheduler/device_server_constants.py | 1 + .../scheduler/model_scheduler/master_job_runner.py | 6 +++--- .../scheduler/model_scheduler/worker_protocol_manager.py | 8 +++++++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index 2c06189d2e..f1e7dea91f 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -74,6 +74,7 @@ class ClientConstants(object): K8S_DEPLOYMENT_SLAVE_MOUNT_HOME_DIR = "/home/fedml/fedml-client" LOCAL_CLIENT_API_PORT = 22030 + ENV_CLIENT_PROXY_PORT_KEY = "FEDML_WORKER_INFERENCE_PROXY_PORT" INFERENCE_HTTP_PORT = 8000 INFERENCE_GRPC_PORT = 8001 diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py index 243c197b2f..a868d03b41 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py @@ -103,6 +103,7 @@ class ServerConstants(object): AUTO_DETECT_PUBLIC_IP = "auto_detect_public_ip" MODEL_INFERENCE_DEFAULT_PORT = 2203 + ENV_MASTER_INFERENCE_PORT_KEY = "FEDML_MASTER_INFERENCE_GATEWAY_PORT" MODEL_CACHE_KEY_EXPIRE_TIME = 1 * 10 INFERENCE_REQUEST_TIMEOUT_KEY = "request_timeout_sec" diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index ef2c01c49d..d7565d7647 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -460,9 +460,9 @@ def cleanup_runner_process(self, run_id): def start_device_inference_gateway(inference_port=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT): # start unified inference server python_program = get_python_program() - master_port = os.getenv("FEDML_MASTER_PORT", None) - if master_port is not None: - inference_port = int(master_port) + master_port_frm_env = os.getenv(ServerConstants.MODEL_INFERENCE_DEFAULT_PORT, None) + if master_port_frm_env is not None: + inference_port = int(master_port_frm_env) if not ServerConstants.is_running_on_k8s(): logging.info(f"start the model inference gateway...") inference_gw_cmd = "fedml.computing.scheduler.model_scheduler.device_model_inference:api" diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py index f9bc70452d..ee59f87441 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py @@ -79,6 +79,12 @@ def _init_extra_items(self): client_api_cmd = "fedml.computing.scheduler.model_scheduler.device_client_api:api" client_api_pids = 
RunProcessUtils.get_pid_from_cmd_line(client_api_cmd) + + worker_proxy_port = ClientConstants.LOCAL_CLIENT_API_PORT + worker_proxy_port_frm_env = os.environ.get(ClientConstants.ENV_CLIENT_PROXY_PORT_KEY, None) + if worker_proxy_port_frm_env is not None: + worker_proxy_port = int(worker_proxy_port_frm_env) + if client_api_pids is None or len(client_api_pids) <= 0: # Start local API services cur_dir = os.path.dirname(__file__) @@ -88,7 +94,7 @@ def _init_extra_items(self): "{} -m uvicorn {} --host 0.0.0.0 --port {} --reload --reload-delay 3 --reload-dir {} " "--log-level critical".format( python_program, client_api_cmd, - ClientConstants.LOCAL_CLIENT_API_PORT, fedml_base_dir + worker_proxy_port, fedml_base_dir ), should_capture_stdout=False, should_capture_stderr=False From 3a034717f7ebc43ff035e73cc49c13ea1c2e7d79 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Wed, 12 Jun 2024 12:04:36 -0700 Subject: [PATCH 08/38] [Deploy] Nit. --- .../computing/scheduler/model_scheduler/master_job_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index d7565d7647..67a3e8bb82 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -460,7 +460,7 @@ def cleanup_runner_process(self, run_id): def start_device_inference_gateway(inference_port=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT): # start unified inference server python_program = get_python_program() - master_port_frm_env = os.getenv(ServerConstants.MODEL_INFERENCE_DEFAULT_PORT, None) + master_port_frm_env = os.getenv(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, None) if master_port_frm_env is not None: inference_port = int(master_port_frm_env) if not ServerConstants.is_running_on_k8s(): From f0dd29e04fa600339c0efb74526694d4dee2842e Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Wed, 12 Jun 2024 12:12:25 -0700 Subject: [PATCH 09/38] [Deploy] Nit. 
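The unified inference URL should always use the master gateway port, which is now resolved from the environment instead of each deployment's "server_internal_port" / "server_external_port" parameters. A minimal sketch of the lookup; the key string and the 2203 fallback are the values defined in device_server_constants.py:

    import os

    # Gateway port resolution: environment override first, otherwise the
    # shipped default (ServerConstants.MODEL_INFERENCE_DEFAULT_PORT == 2203).
    inference_port_external = int(
        os.environ.get("FEDML_MASTER_INFERENCE_GATEWAY_PORT", 2203))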
--- .../scheduler/model_scheduler/master_job_runner.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index 67a3e8bb82..5f82a6c046 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -369,12 +369,8 @@ def process_deployment_result_message(self, topic=None, payload=None): """ When all the devices have finished the add / delete / update operation """ - # Generate one unified inference api - # Note that here we use the gateway port instead of the inference port that is used by the slave device - model_config_parameters = request_json["parameters"] - inference_port = model_config_parameters.get("server_internal_port", - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) - inference_port_external = model_config_parameters.get("server_external_port", inference_port) + inference_port_external = os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, + ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) ip = GeneralConstants.get_ip_address(request_json) if ip.startswith("http://") or ip.startswith("https://"): From 21a8a4c9d97e712f029f0e7abe39e0b5e56954a2 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Wed, 12 Jun 2024 12:18:58 -0700 Subject: [PATCH 10/38] [Deploy] Change few more places relate to gateway port. --- .../scheduler/model_scheduler/master_job_runner.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index 5f82a6c046..50d902b933 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -144,7 +144,8 @@ def run_impl( # No device is added, updated or removed logging.info("No device is added, updated or removed. 
No action needed for reconciliation.") ip = GeneralConstants.get_ip_address(self.request_json) - master_port = os.getenv("FEDML_MASTER_PORT", None) + master_port = os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, + ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) if master_port is not None: inference_port = int(master_port) model_inference_port = inference_port @@ -299,9 +300,8 @@ def process_deployment_result_message(self, topic=None, payload=None): else: # This is the last worker that failed, so we should continue to "ABORTED" status model_config_parameters = self.request_json["parameters"] - inference_port = model_config_parameters.get("server_internal_port", - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) - inference_port_external = model_config_parameters.get("server_external_port", inference_port) + inference_port_external = os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, + ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) ip = GeneralConstants.get_ip_address(self.request_json) if ip.startswith("http://") or ip.startswith("https://"): model_inference_url = "{}/inference/{}".format(ip, end_point_id) @@ -753,9 +753,8 @@ def parse_model_run_params(running_json): model_version = model_config["model_version"] model_config_parameters = running_json.get("parameters", {}) - inference_port = model_config_parameters.get("server_internal_port", # Internal port is for the gateway - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) - inference_port_external = model_config_parameters.get("server_external_port", inference_port) + inference_port = int(os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, + ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)) return run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ From e7e974d24f510a47e2ee5e9df1a6161665fffa1e Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Wed, 12 Jun 2024 15:29:53 -0700 Subject: [PATCH 11/38] [Deploy] Write port info into env file. 
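fedml login now accepts -mgp/--master_inference_gateway_port and -wpp/--worker_inference_proxy_port, persists both values into the FedML env file through fedml.set_env_kv(), and adds getters so later processes read the same ports back instead of hard-coding the defaults. A minimal usage sketch with the helpers introduced here:

    from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants
    from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants

    # Both helpers call fedml.load_env() internally, so they pick up the ports
    # persisted at login time (e.g. `fedml login <api_key> -mgp 2203 -wpp 22030`)
    # and fall back to the defaults when nothing was written.
    gateway_port = ServerConstants.get_inference_master_gateway_port()
    proxy_port = ClientConstants.get_inference_worker_proxy_port()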
--- python/fedml/api/__init__.py | 12 ++++++---- python/fedml/api/modules/device.py | 13 ++++++++--- python/fedml/cli/modules/login.py | 22 +++++++++++++++++-- .../device_client_constants.py | 8 +++++++ .../device_server_constants.py | 9 ++++++++ .../model_scheduler/master_job_runner.py | 22 +++++++------------ .../worker_protocol_manager.py | 3 +-- 7 files changed, 64 insertions(+), 25 deletions(-) diff --git a/python/fedml/api/__init__.py b/python/fedml/api/__init__.py index 3e75b987d6..f753e4255b 100755 --- a/python/fedml/api/__init__.py +++ b/python/fedml/api/__init__.py @@ -24,6 +24,8 @@ from fedml.computing.scheduler.scheduler_entry.cluster_manager import FedMLClusterModelList from fedml.computing.scheduler.scheduler_entry.run_manager import FedMLRunStartedModel, FedMLGpuDevices, \ FedMLRunModelList, FeatureEntryPoint +from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants def fedml_login(api_key: str = None): @@ -209,16 +211,18 @@ def fedml_build(platform, type, source_folder, entry_point, config_folder, dest_ return build.build(platform, type, source_folder, entry_point, config_folder, dest_folder, ignore) -def login(api_key, computing, server, supplier): - device_bind(api_key, computing, server, supplier) +def login(api_key, computing, server, supplier, + master_inference_gateway_port: int = ServerConstants.MODEL_INFERENCE_DEFAULT_PORT, + worker_inference_proxy_port: int = ClientConstants.LOCAL_CLIENT_API_PORT): + device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port) def logout(computing, server): device_unbind(computing, server) -def device_bind(api_key, computing, server, supplier): - device.bind(api_key, computing, server, supplier) +def device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port): + device.bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port) def device_unbind(computing, server): diff --git a/python/fedml/api/modules/device.py b/python/fedml/api/modules/device.py index a853d538d0..14591147a6 100644 --- a/python/fedml/api/modules/device.py +++ b/python/fedml/api/modules/device.py @@ -10,14 +10,18 @@ from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils from fedml.computing.scheduler.master.server_constants import ServerConstants +from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants as DeviceServerConstants from fedml.computing.scheduler.master.server_login import logout as server_logout from fedml.computing.scheduler.slave.client_constants import ClientConstants +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants as DeviceClientConstants from fedml.computing.scheduler.slave.client_login import logout as client_logout from fedml.computing.scheduler.scheduler_entry.resource_manager import FedMLResourceManager def bind( - api_key, computing, server, supplier + api_key, computing, server, supplier, + master_inference_gateway_port=DeviceServerConstants.MODEL_INFERENCE_DEFAULT_PORT, + worker_inference_proxy_port=DeviceClientConstants.LOCAL_CLIENT_API_PORT ): userid = api_key runner_cmd = "{}" @@ -43,13 +47,13 @@ def bind( _bind( userid, computing, server, api_key, role, runner_cmd, device_id, 
os_name, - docker) + docker, master_inference_gateway_port, worker_inference_proxy_port) def _bind( userid, computing, server, api_key, role, runner_cmd, device_id, os_name, - docker): + docker, master_inference_gateway_port, worker_inference_proxy_port): fedml.load_env() if os.getenv(ModuleConstants.ENV_FEDML_INFER_HOST) is None: fedml.set_env_kv(ModuleConstants.ENV_FEDML_INFER_HOST, SchedulerConstants.REDIS_INFER_HOST) @@ -60,6 +64,9 @@ def _bind( if os.getenv(ModuleConstants.ENV_FEDML_INFER_REDIS_PASSWORD) is None: fedml.set_env_kv(ModuleConstants.ENV_FEDML_INFER_REDIS_PASSWORD, SchedulerConstants.REDIS_PASSWORD) + fedml.set_env_kv(DeviceServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, str(master_inference_gateway_port)) + fedml.set_env_kv(DeviceClientConstants.ENV_CLIENT_PROXY_PORT_KEY, str(worker_inference_proxy_port)) + url = fedml._get_backend_service() platform_name = platform.system() docker_config_text = None diff --git a/python/fedml/cli/modules/login.py b/python/fedml/cli/modules/login.py index f2e4d76322..f3c982f456 100644 --- a/python/fedml/cli/modules/login.py +++ b/python/fedml/cli/modules/login.py @@ -4,6 +4,8 @@ import fedml.api from fedml.api.modules.utils import authenticate +from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants @click.command("login", help="Login the FedML® Nexus AI Platform") @@ -51,9 +53,25 @@ default=80, help="The port for local on-premise Nexus AI Platform.", ) +@click.option( + "--master_inference_gateway_port", + "-mgp", + type=int, + default=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT, + help="The port for master inference gateway.", +) +@click.option( + "--worker_inference_proxy_port", + "-wpp", + type=int, + default=ClientConstants.LOCAL_CLIENT_API_PORT, + help="The port for worker inference proxy.", +) def fedml_login( api_key, version, compute_node, server, provider, deploy_worker_num, - local_on_premise_platform, local_on_premise_platform_port): + local_on_premise_platform, local_on_premise_platform_port, + master_inference_gateway_port, worker_inference_proxy_port +): fedml.set_env_version(version) fedml.set_local_on_premise_platform_host(local_on_premise_platform) fedml.set_local_on_premise_platform_port(local_on_premise_platform_port) @@ -66,4 +84,4 @@ def fedml_login( print(f"Maybe you are using account id to login, we will try to login with account {api_key}.") pass os.environ["FEDML_MODEL_WORKER_NUM"] = str(deploy_worker_num) - fedml.api.login(api_key, compute_node, server, provider) + fedml.api.login(api_key, compute_node, server, provider, master_inference_gateway_port, worker_inference_proxy_port) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index f1e7dea91f..fdcbdf0a34 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -458,6 +458,14 @@ def get_public_ip(): logging.info("Failed to get public ip: {}".format(e)) return ip + @staticmethod + def get_inference_worker_proxy_port() -> int: + # Use dotenv to load the environment variables + fedml.load_env() + worker_proxy_port = int(os.getenv(ClientConstants.ENV_CLIENT_PROXY_PORT_KEY, + default=ClientConstants.LOCAL_CLIENT_API_PORT)) + return worker_proxy_port + @staticmethod def check_process_is_running(process_id): 
for proc in psutil.process_iter(): diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py index a868d03b41..a5048c26a6 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py @@ -349,6 +349,15 @@ def get_runner_infos(): logging.error(f"Failed to parse runner info: {e}") return runner_info + @staticmethod + def get_inference_master_gateway_port(): + # Use dotenv to load the environment variables + fedml.load_env() + master_inference_port = int(os.getenv(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, + default=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)) + return master_inference_port + + @staticmethod def save_runner_infos(unique_device_id, edge_id, run_id=None): local_pkg_data_dir = ServerConstants.get_data_dir() diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index 50d902b933..eff26684b7 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -115,7 +115,7 @@ def run_impl( message_center=self.message_center) # start unified inference gateway process if not started - FedMLDeployMasterJobRunner.start_device_inference_gateway(inference_port=inference_port) + FedMLDeployMasterJobRunner.start_device_inference_gateway() # start inference monitor process FedMLDeployMasterJobRunner.stop_device_inference_monitor( @@ -144,8 +144,7 @@ def run_impl( # No device is added, updated or removed logging.info("No device is added, updated or removed. 
No action needed for reconciliation.") ip = GeneralConstants.get_ip_address(self.request_json) - master_port = os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) + master_port = ServerConstants.get_inference_master_gateway_port() if master_port is not None: inference_port = int(master_port) model_inference_port = inference_port @@ -300,8 +299,7 @@ def process_deployment_result_message(self, topic=None, payload=None): else: # This is the last worker that failed, so we should continue to "ABORTED" status model_config_parameters = self.request_json["parameters"] - inference_port_external = os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) + inference_port_external = ServerConstants.get_inference_master_gateway_port() ip = GeneralConstants.get_ip_address(self.request_json) if ip.startswith("http://") or ip.startswith("https://"): model_inference_url = "{}/inference/{}".format(ip, end_point_id) @@ -369,8 +367,7 @@ def process_deployment_result_message(self, topic=None, payload=None): """ When all the devices have finished the add / delete / update operation """ - inference_port_external = os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) + inference_port_external = ServerConstants.get_inference_master_gateway_port() ip = GeneralConstants.get_ip_address(request_json) if ip.startswith("http://") or ip.startswith("https://"): @@ -453,12 +450,10 @@ def cleanup_runner_process(self, run_id): ServerConstants.cleanup_run_process(run_id, not_kill_subprocess=True) @staticmethod - def start_device_inference_gateway(inference_port=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT): + def start_device_inference_gateway(): # start unified inference server python_program = get_python_program() - master_port_frm_env = os.getenv(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, None) - if master_port_frm_env is not None: - inference_port = int(master_port_frm_env) + inference_port = ServerConstants.get_inference_master_gateway_port() if not ServerConstants.is_running_on_k8s(): logging.info(f"start the model inference gateway...") inference_gw_cmd = "fedml.computing.scheduler.model_scheduler.device_model_inference:api" @@ -539,7 +534,7 @@ def recover_inference_and_monitor(): if not is_activated: continue - FedMLDeployMasterJobRunner.start_device_inference_gateway(inference_port=inference_port) + FedMLDeployMasterJobRunner.start_device_inference_gateway() FedMLDeployMasterJobRunner.stop_device_inference_monitor( run_id, end_point_name, model_id, model_name, model_version) @@ -753,8 +748,7 @@ def parse_model_run_params(running_json): model_version = model_config["model_version"] model_config_parameters = running_json.get("parameters", {}) - inference_port = int(os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)) + inference_port = ServerConstants.get_inference_master_gateway_port() return run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py index ee59f87441..cdfa43c33b 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py +++ 
b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py @@ -80,8 +80,7 @@ def _init_extra_items(self): client_api_cmd = "fedml.computing.scheduler.model_scheduler.device_client_api:api" client_api_pids = RunProcessUtils.get_pid_from_cmd_line(client_api_cmd) - worker_proxy_port = ClientConstants.LOCAL_CLIENT_API_PORT - worker_proxy_port_frm_env = os.environ.get(ClientConstants.ENV_CLIENT_PROXY_PORT_KEY, None) + worker_proxy_port = ClientConstants.get_inference_worker_proxy_port() if worker_proxy_port_frm_env is not None: worker_proxy_port = int(worker_proxy_port_frm_env) From 9c8ce99c41e6bf8df8f38fe88a6f782141d3a19e Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Wed, 12 Jun 2024 15:33:47 -0700 Subject: [PATCH 12/38] [Deploy] Nit. --- .../scheduler/model_scheduler/worker_protocol_manager.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py index cdfa43c33b..b1d0bebc47 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py @@ -81,8 +81,6 @@ def _init_extra_items(self): client_api_pids = RunProcessUtils.get_pid_from_cmd_line(client_api_cmd) worker_proxy_port = ClientConstants.get_inference_worker_proxy_port() - if worker_proxy_port_frm_env is not None: - worker_proxy_port = int(worker_proxy_port_frm_env) if client_api_pids is None or len(client_api_pids) <= 0: # Start local API services From 505103f9f05106712de4ea7078441526ee33b9f7 Mon Sep 17 00:00:00 2001 From: bhargav191098 Date: Thu, 13 Jun 2024 17:08:51 -0700 Subject: [PATCH 13/38] removing zip from upload --- python/fedml/api/modules/storage.py | 45 ++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/python/fedml/api/modules/storage.py b/python/fedml/api/modules/storage.py index e7d492c999..a928b325b2 100644 --- a/python/fedml/api/modules/storage.py +++ b/python/fedml/api/modules/storage.py @@ -38,27 +38,42 @@ def upload(data_path, api_key, name, description, tag_list, service, show_progre if user_id is None: return FedMLResponse(code=ResponseCode.FAILURE, message=message) + + data_type = _get_data_type(data_path) - if(not _check_data_path(data_path)): + if(data_type == "invalid"): return FedMLResponse(code=ResponseCode.FAILURE,message="Invalid data path") - archive_path, message = _archive_data(data_path) - if not archive_path: + if(data_type == "dir"): + to_upload_path, message = _archive_data(data_path) + name = os.path.splitext(os.path.basename(to_upload_path))[0] if name is None else name + file_name = name + ".zip" + else: + to_upload_path = data_path + base_name = os.path.basename(to_upload_path) + given_extension = os.path.splitext(name)[1] + if given_extension is None or given_extension == "": + given_extension = os.path.splitext(base_name)[1] + name = base_name if name is None else name + given_extension + file_name = name + + if not to_upload_path: return FedMLResponse(code=ResponseCode.FAILURE, message=message) - name = os.path.splitext(os.path.basename(archive_path))[0] if name is None else name - file_name = name + ".zip" + dest_path = os.path.join(user_id, file_name) - file_size = os.path.getsize(archive_path) + file_size = os.path.getsize(to_upload_path) - file_uploaded_url, message = _upload_multipart(api_key, file_name, archive_path, show_progress, + file_uploaded_url, message = 
_upload_multipart(api_key, file_name, to_upload_path, show_progress, out_progress_to_err, progress_desc, metadata) - - os.remove(archive_path) + if(data_type == "dir"): + os.remove(to_upload_path) if not file_uploaded_url: - return FedMLResponse(code=ResponseCode.FAILURE, message=f"Failed to upload file: {archive_path}") + return FedMLResponse(code=ResponseCode.FAILURE, message=f"Failed to upload file: {to_upload_path}") + + print("url: ",file_uploaded_url) json_data = { "datasetName": name, @@ -438,10 +453,12 @@ def _get_storage_service(service): else: raise NotImplementedError(f"Service {service} not implemented") -def _check_data_path(data_path): - if os.path.isdir(data_path) or os.path.isfile(data_path): - return True - return False +def _get_data_type(data_path): + if os.path.isdir(data_path): + return "dir" + elif os.path.isfile(data_path): + return "file" + return "invalid" def _archive_data(data_path: str) -> (str, str): From 03c58a2a42d8b43b3adf6331b38e38de92cc69d2 Mon Sep 17 00:00:00 2001 From: bhargav191098 Date: Thu, 13 Jun 2024 17:32:36 -0700 Subject: [PATCH 14/38] changes in the download to support files --- python/fedml/api/modules/storage.py | 23 ++++++++++++++++++----- python/fedml/cli/modules/storage.py | 2 +- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/python/fedml/api/modules/storage.py b/python/fedml/api/modules/storage.py index a928b325b2..94031c163e 100644 --- a/python/fedml/api/modules/storage.py +++ b/python/fedml/api/modules/storage.py @@ -110,13 +110,26 @@ def download(data_name, api_key, service, dest_path, show_progress=True) -> FedM logging.error(error_message) return FedMLResponse(code=ResponseCode.FAILURE, message=error_message) download_url = metadata.download_url - zip_file_name = data_name + ".zip" - path_local = os.path.abspath(zip_file_name) + given_extension = os.path.splitext(data_name)[1] + is_file = True + if(given_extension is None or given_extension ==""): + is_file = False + + if not is_file: + download_file_name = data_name + ".zip" + else: + download_file_name = data_name + path_local = os.path.abspath(download_file_name) dest_path = os.path.abspath(dest_path) if dest_path else data_name - if _download_using_presigned_url(download_url, zip_file_name, show_progress=show_progress): + if _download_using_presigned_url(download_url, download_file_name, show_progress=show_progress): try: - shutil.unpack_archive(path_local, dest_path) - os.remove(path_local) + if not is_file: + shutil.unpack_archive(path_local, dest_path) + os.remove(path_local) + else: + if not os.path.exists(dest_path): + os.makedirs(dest_path) + shutil.move(path_local,dest_path) abs_dest_path = os.path.abspath(dest_path) return FedMLResponse(code=ResponseCode.SUCCESS, message=f"Successfully downloaded and unzipped data at " f"{abs_dest_path}", data=abs_dest_path) diff --git a/python/fedml/cli/modules/storage.py b/python/fedml/cli/modules/storage.py index af75cda85f..7e060fc12e 100644 --- a/python/fedml/cli/modules/storage.py +++ b/python/fedml/cli/modules/storage.py @@ -47,7 +47,7 @@ def validate_argument(ctx, param, value): @click.help_option("--help", "-h") @click.argument("data_path", nargs=1, callback=validate_argument) @click.option("--name", "-n", type=str, help="Name your data to store. If not provided, the name will be the same as " - "the data file or directory name.") + "the data file or directory name. For files, extension need not be mentioned!") @click.option("--description", "-d", type=str, help="Add description to your data to store. 
If not provided, " "the description will be empty.") @click.option("--user_metadata", "-um", type=str, help="User-defined metadata in the form of a dictionary, for instance, " From cb7da7009f13fdf0191ba7710fdb0b100d90796f Mon Sep 17 00:00:00 2001 From: bhargav191098 Date: Thu, 13 Jun 2024 17:38:05 -0700 Subject: [PATCH 15/38] print statement removal --- python/fedml/api/modules/storage.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/fedml/api/modules/storage.py b/python/fedml/api/modules/storage.py index 94031c163e..2d10ff2588 100644 --- a/python/fedml/api/modules/storage.py +++ b/python/fedml/api/modules/storage.py @@ -73,8 +73,6 @@ def upload(data_path, api_key, name, description, tag_list, service, show_progre if not file_uploaded_url: return FedMLResponse(code=ResponseCode.FAILURE, message=f"Failed to upload file: {to_upload_path}") - print("url: ",file_uploaded_url) - json_data = { "datasetName": name, "description": description, From 394906ecf03fe2e221bfba4a7a46c87105d26a35 Mon Sep 17 00:00:00 2001 From: bhargav191098 Date: Fri, 14 Jun 2024 12:33:23 -0700 Subject: [PATCH 16/38] name issue --- python/fedml/api/modules/storage.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/fedml/api/modules/storage.py b/python/fedml/api/modules/storage.py index 2d10ff2588..3e4219775d 100644 --- a/python/fedml/api/modules/storage.py +++ b/python/fedml/api/modules/storage.py @@ -51,10 +51,15 @@ def upload(data_path, api_key, name, description, tag_list, service, show_progre else: to_upload_path = data_path base_name = os.path.basename(to_upload_path) - given_extension = os.path.splitext(name)[1] - if given_extension is None or given_extension == "": - given_extension = os.path.splitext(base_name)[1] - name = base_name if name is None else name + given_extension + file_extension = os.path.splitext(base_name)[1] + given_extension = None + if name is not None: + given_extension = os.path.splitext(name)[1] + if given_extension is None or given_extension == "": + name = name + file_extension + else: + name = base_name + file_name = name if not to_upload_path: From 2170797de1235e78f9a92722b495cb01af8d92c2 Mon Sep 17 00:00:00 2001 From: bhargav191098 Date: Fri, 14 Jun 2024 18:13:08 -0700 Subject: [PATCH 17/38] \Adding Enum for data type --- python/fedml/api/modules/storage.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/fedml/api/modules/storage.py b/python/fedml/api/modules/storage.py index 3e4219775d..0729c09edc 100644 --- a/python/fedml/api/modules/storage.py +++ b/python/fedml/api/modules/storage.py @@ -4,6 +4,7 @@ import requests import math +from enum import Enum, unique import requests.exceptions import tqdm @@ -26,6 +27,10 @@ def __init__(self, data: dict): self.tag_list = data.get("tags", None) self.download_url = data.get("fileUrl", None) +class DataType(Enum): + FILE = "file" + DIRECTORY = "directory" + INVALID = "invalid" # Todo (alaydshah): Store service name in metadata # Todo (alaydshah): If data already exists, don't upload again. 
Instead suggest to use update command @@ -41,10 +46,10 @@ def upload(data_path, api_key, name, description, tag_list, service, show_progre data_type = _get_data_type(data_path) - if(data_type == "invalid"): + if(data_type == DataType.INVALID): return FedMLResponse(code=ResponseCode.FAILURE,message="Invalid data path") - if(data_type == "dir"): + if(data_type == DataType.DIRECTORY): to_upload_path, message = _archive_data(data_path) name = os.path.splitext(os.path.basename(to_upload_path))[0] if name is None else name file_name = name + ".zip" @@ -471,10 +476,10 @@ def _get_storage_service(service): def _get_data_type(data_path): if os.path.isdir(data_path): - return "dir" + return DataType.DIRECTORY elif os.path.isfile(data_path): - return "file" - return "invalid" + return DataType.FILE + return DataType.INVALID def _archive_data(data_path: str) -> (str, str): From 5fb5ed43d42f54b0c47e9a0ae802bcab29197052 Mon Sep 17 00:00:00 2001 From: bhargav191098 Date: Fri, 14 Jun 2024 18:32:05 -0700 Subject: [PATCH 18/38] adding user_id to bucket path --- python/fedml/api/modules/storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/fedml/api/modules/storage.py b/python/fedml/api/modules/storage.py index 0729c09edc..33e781be08 100644 --- a/python/fedml/api/modules/storage.py +++ b/python/fedml/api/modules/storage.py @@ -70,11 +70,11 @@ def upload(data_path, api_key, name, description, tag_list, service, show_progre if not to_upload_path: return FedMLResponse(code=ResponseCode.FAILURE, message=message) - + #TODO(bhargav191098) - Better done on the backend. Remove and pass file_name once completed on backend. dest_path = os.path.join(user_id, file_name) file_size = os.path.getsize(to_upload_path) - file_uploaded_url, message = _upload_multipart(api_key, file_name, to_upload_path, show_progress, + file_uploaded_url, message = _upload_multipart(api_key, dest_path, to_upload_path, show_progress, out_progress_to_err, progress_desc, metadata) From aecafb80f9d6731b6b15e4cfca7b15035b82cf84 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Mon, 17 Jun 2024 14:39:16 -0700 Subject: [PATCH 19/38] Fix compatibility by limiting numpy latest version. 
--- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index 9651465d32..4757c10a17 100644 --- a/python/setup.py +++ b/python/setup.py @@ -40,7 +40,7 @@ def finalize_options(self): 'multiprocess', 'networkx<3.0', 'ntplib', - 'numpy>=1.21', + 'numpy<2.0.0', 'onnx', 'paho-mqtt<2.0.0', 'pandas', From 89219fb3c20972ff94badc76f8e90d71592e5647 Mon Sep 17 00:00:00 2001 From: alaydshah Date: Tue, 18 Jun 2024 07:00:43 +0000 Subject: [PATCH 20/38] Workaround device mapping inconsistency --- .../comm_utils/gpu_utils/gpu_utils.py | 1 + .../comm_utils/gpu_utils/qualcomm_utils.py | 36 +++++++++++++------ .../scheduler/comm_utils/hardware_utils.py | 2 ++ .../scheduler_core/account_manager.py | 2 +- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py b/python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py index bc7a3b8216..b48a3e85b7 100644 --- a/python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py @@ -27,6 +27,7 @@ class GPUCard: memoryUsed: float memoryUtil: float load: Optional[float] = 0.0 + device_path: Optional[str] = "" uuid: Optional[str] = "" display_mode: Optional[str] = "" display_active: Optional[str] = "" diff --git a/python/fedml/computing/scheduler/comm_utils/gpu_utils/qualcomm_utils.py b/python/fedml/computing/scheduler/comm_utils/gpu_utils/qualcomm_utils.py index 88114cf2ad..13131e362d 100644 --- a/python/fedml/computing/scheduler/comm_utils/gpu_utils/qualcomm_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/gpu_utils/qualcomm_utils.py @@ -26,19 +26,22 @@ def detect_gpu_card_type(cls) -> Optional[GPUCardType]: @staticmethod def get_gpu_cards() -> List[GPUCard]: - from qaicrt import Util, QIDList, QDevInfo, QStatus + return list(QualcommNPUtil.__get_gpu_cards().values()) - cards = [] + @staticmethod + def __get_gpu_cards() -> Dict[int, GPUCard]: + from qaicrt import Util, QIDList, QDevInfo, QStatus + cards = dict() util = Util() status, card_list = util.getDeviceIds() if status.value == 0: for card in card_list: status, card_info = util.getDeviceInfo(card) if status.value == 0 and card_info.devStatus.value == 1: - cards.append(QualcommNPUtil.__convert(card_info)) - + gpu_card = QualcommNPUtil.__convert(card_info) + cards[gpu_card.id] = gpu_card else: - logging.error("Qualcomm Card Status not Healthy") + logging.error("Qualcomm Cards Status not Healthy") return cards @staticmethod @@ -58,11 +61,21 @@ def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memo @staticmethod def get_docker_gpu_device_mapping(gpu_ids: Optional[List[int]], num_gpus: int = 0) -> Optional[Dict]: - if gpu_ids is not None and len(gpu_ids): - return { - "devices": [f"{QualcommNPUtil.NPU_CARD_PATH}{gpu_id}:{QualcommNPUtil.NPU_CARD_PATH}{gpu_id}" for gpu_id - in gpu_ids]} - return None + if gpu_ids is None or not len(gpu_ids): + return None + + devices = [] + gpu_cards = QualcommNPUtil.__get_gpu_cards() + + for gpu_id in gpu_ids: + if not (gpu_id in gpu_cards and gpu_cards[gpu_id].device_path): + logging.error("Failed to get gpu device mapping for docker") + break + else: + device_path = gpu_cards[gpu_id].device_path + devices.append(f"{device_path}:{device_path}") + + return {"devices": devices} if len(devices) == len(gpu_ids) else None @staticmethod def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> 
List[int]: @@ -87,7 +100,8 @@ def __convert(npu) -> GPUCard: load = (nsp_total - nsp_free) / nsp_total return GPUCard( - id=npu.qid, + id=npu.mhiId, + device_path=npu.name, name=npu.pciInfo.devicename, driver=npu.devData.fwQCImageVersionString, serial=npu.devData.serial, diff --git a/python/fedml/computing/scheduler/comm_utils/hardware_utils.py b/python/fedml/computing/scheduler/comm_utils/hardware_utils.py index e73809955e..c876948145 100644 --- a/python/fedml/computing/scheduler/comm_utils/hardware_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/hardware_utils.py @@ -60,5 +60,7 @@ def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: Doc if __name__ == "__main__": gpus = HardwareUtil.get_gpus() get_available_gpu_cards = HardwareUtil.get_available_gpu_ids(limit=len(gpus)) + device_mapping = HardwareUtil.get_docker_gpu_device_mapping(get_available_gpu_cards, len(get_available_gpu_cards)) print(gpus) print(get_available_gpu_cards) + print(device_mapping) diff --git a/python/fedml/computing/scheduler/scheduler_core/account_manager.py b/python/fedml/computing/scheduler/scheduler_core/account_manager.py index 3491e102f6..3b80511d12 100755 --- a/python/fedml/computing/scheduler/scheduler_core/account_manager.py +++ b/python/fedml/computing/scheduler/scheduler_core/account_manager.py @@ -266,7 +266,7 @@ def get_uuid(): if not use_machine_id: device_id = hex(uuid.getnode()) else: - device_id = device_id = FedMLAccountManager.get_gpu_machine_id() + device_id = FedMLAccountManager.get_gpu_machine_id() else: device_id = sys_utils.run_subprocess_open( "hal-get-property --udi /org/freedesktop/Hal/devices/computer --key system.hardware.uuid".split() From 1d5a05db71ba3943cb42eea0836fabc181af7ac6 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 18 Jun 2024 18:23:40 -0700 Subject: [PATCH 21/38] [Deploy][Autoscale] Bug fix: continue the for loop if no scale op. --- python/fedml/computing/scheduler/comm_utils/job_monitor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/fedml/computing/scheduler/comm_utils/job_monitor.py b/python/fedml/computing/scheduler/comm_utils/job_monitor.py index a7d5214a02..97a4cb6ebc 100644 --- a/python/fedml/computing/scheduler/comm_utils/job_monitor.py +++ b/python/fedml/computing/scheduler/comm_utils/job_monitor.py @@ -148,7 +148,7 @@ def autoscaler_reconcile_after_interval(self): if current_replicas == new_replicas: # Basically the autoscaler decided that no scaling operation should take place. logging.info(f"No scaling operation for endpoint {e_id}.") - return + continue # Should scale in / out curr_version = fedml.get_env_version() @@ -159,7 +159,7 @@ def autoscaler_reconcile_after_interval(self): mlops_prefix = "https://open-test.fedml.ai/" else: logging.error(f"Do not support the version {curr_version}.") - return + continue autoscale_url_path = "fedmlModelServer/api/v1/endpoint/auto-scale" url = f"{mlops_prefix}{autoscale_url_path}" @@ -167,7 +167,7 @@ def autoscaler_reconcile_after_interval(self): cached_token = fedml_model_cache.get_end_point_token(e_id, e_name, model_name) if cached_token is None: logging.error(f"Failed to get the cached token for endpoint {e_id}.") - return + continue req_header = { "Authorization": f"Bearer {cached_token}" From 31c57e01d426a82127fd3cff2ae45ee36f6bbe14 Mon Sep 17 00:00:00 2001 From: fedml-dimitris Date: Tue, 18 Jun 2024 21:32:41 -0400 Subject: [PATCH 22/38] Polishing the autoscaler real test. 
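The real test now drives the autoscaler with a ConcurrentQueryPolicy instead of the latency/qps ReactivePolicy variants, so the per-endpoint settings loop and the --metric flag are no longer needed. The sketch below is only a rough illustration of how a concurrent-query policy can be reasoned about, not the implementation in autoscaler/policies.py; the observed load value is hypothetical and the limits mirror the autoscaling_policy_config used in the updated test:

    import math

    # Illustrative sizing rule (assumption): keep at most `queries_per_replica`
    # concurrent queries per replica over the observation window, clamped to
    # [min_replicas, max_replicas].
    observed_concurrent_queries = 5                   # hypothetical measurement
    queries_per_replica, min_replicas, max_replicas = 2, 1, 3

    target_replicas = math.ceil(observed_concurrent_queries / queries_per_replica)
    target_replicas = max(min_replicas, min(max_replicas, target_replicas))  # -> 3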
--- .../test/scaling_algorithm_real_test.py | 64 +++++-------------- 1 file changed, 15 insertions(+), 49 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py b/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py index 34721d9002..0fae77c3f3 100644 --- a/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py +++ b/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py @@ -2,9 +2,10 @@ import logging from collections import namedtuple -from fedml.computing.scheduler.model_scheduler.autoscaler.autoscaler import Autoscaler, ReactivePolicy +from fedml.computing.scheduler.model_scheduler.autoscaler.autoscaler import Autoscaler from fedml.core.mlops.mlops_runtime_log import MLOpsRuntimeLog from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache +from fedml.computing.scheduler.model_scheduler.autoscaler.policies import ConcurrentQueryPolicy if __name__ == "__main__": @@ -18,9 +19,6 @@ parser.add_argument('--redis_addr', default="local") parser.add_argument('--redis_port', default=6379) parser.add_argument('--redis_password', default="fedml_default") - parser.add_argument('--metric', - default="latency", - help="Either latency or qps") args = parser.parse_args() fedml_model_cache = FedMLModelCache.get_instance() @@ -32,50 +30,18 @@ # Init the autoscaler autoscaler = Autoscaler(args.redis_addr, args.redis_port, args.redis_password) - latency_reactive_policy_default = { - "metric": "latency", - "ewm_mins": 15, - "ewm_alpha": 0.5, - "ub_threshold": 0.5, - "lb_threshold": 0.99, - "triggering_value": 1.6561916828471053 + autoscaling_policy_config = { + "current_replicas": 1, + "min_replicas": 1, + "max_replicas": 3, + "queries_per_replica": 2, + "window_size_secs": 60, + "scaledown_delay_secs": 120, } - qps_reactive_policy_default = { - "metric": "qps", - "ewm_mins": 15, - "ewm_alpha": 0.5, - "ub_threshold": 2, - "lb_threshold": 0.5 - } - policy_config = latency_reactive_policy_default \ - if args.metric == "latency" else qps_reactive_policy_default - autoscaling_policy = ReactivePolicy(**policy_config) - - for endpoint_settings in endpoints_settings_list: - endpoint_state = endpoint_settings["state"] - if endpoint_state == "DEPLOYED" and endpoint_settings["enable_auto_scaling"]: - - e_id, e_name, model_name = \ - endpoint_settings["endpoint_id"], \ - endpoint_settings["endpoint_name"], \ - endpoint_settings["model_name"] - logging.info(f"Querying the autoscaler for endpoint {e_id} with user settings {endpoint_settings}.") - - # For every endpoint we just update the policy configuration. - autoscaling_policy.min_replicas = endpoint_settings["scale_min"] - autoscaling_policy.max_replicas = endpoint_settings["scale_max"] - # We retrieve a list of replicas for every endpoint. The number - # of running replicas is the length of that list. 
- current_replicas = len(fedml_model_cache.get_endpoint_replicas_results(e_id)) - autoscaling_policy.current_replicas = current_replicas - logging.info(f"Endpoint {e_id} autoscaling policy: {autoscaling_policy}.") - - scale_op = autoscaler.scale_operation_endpoint( - autoscaling_policy, - str(e_id)) - - new_replicas = current_replicas + scale_op.value + autoscaling_policy = ConcurrentQueryPolicy(**autoscaling_policy_config) - logging.info(f"Scaling operation {scale_op.value} for endpoint {e_id} .") - logging.info(f"New Replicas {new_replicas} for endpoint {e_id} .") - logging.info(f"Current Replicas {current_replicas} for endpoint {e_id} .") + e_id = 1821952311 + scale_op = autoscaler.scale_operation_endpoint( + autoscaling_policy, + str(e_id)) + logging.info(f"Scaling operation {scale_op.value} for endpoint {e_id} .") From 4cb53fe55f5a4e748af0daaf27eb53773cdde2d6 Mon Sep 17 00:00:00 2001 From: fedml-dimitris Date: Tue, 18 Jun 2024 21:37:12 -0400 Subject: [PATCH 23/38] Replacing e_id. --- .../autoscaler/test/scaling_algorithm_real_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py b/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py index 0fae77c3f3..78a1231abf 100644 --- a/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py +++ b/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py @@ -40,7 +40,8 @@ } autoscaling_policy = ConcurrentQueryPolicy(**autoscaling_policy_config) - e_id = 1821952311 + # Please replace the `e_id` below with a proper e_id value. + e_id = 1111 scale_op = autoscaler.scale_operation_endpoint( autoscaling_policy, str(e_id)) From 31b7ae05772060e589d65b7b07788366d2b6eb4a Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Thu, 20 Jun 2024 00:26:14 +0000 Subject: [PATCH 24/38] [Deploy] Hotfix: job runner context lost when logout. 
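After a logout or restart, the master protocol manager can receive a delete-deployment request for a run whose job runner is no longer present in self.job_runners, so the request used to be dropped silently. The fix re-instantiates the runner on demand before forwarding the request. A minimal standalone sketch of the pattern; the class and factory names below are illustrative, the real code lives in FedMLDeployJobRunnerManager:

    class RunnerManagerSketch:
        # Illustrative only: lazily rebuild a lost job-runner context.
        def __init__(self, runner_factory):
            self.job_runners = {}
            self.runner_factory = runner_factory

        def send_deployment_delete_request_to_edges(self, run_id, payload, args=None):
            run_id_str = str(run_id)
            runner = self.job_runners.get(run_id_str)
            if runner is None:
                # Runner context was lost (e.g. after logout); rebuild it so
                # the delete request still reaches the edge devices.
                runner = self.runner_factory(args)
                self.job_runners[run_id_str] = runner
            runner.send_deployment_delete_request_to_edges(payload)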
--- .../model_scheduler/master_job_runner_manager.py | 9 ++++++++- .../scheduler/model_scheduler/master_protocol_manager.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py index c761cd6d8f..0c674cb5f0 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py @@ -42,11 +42,18 @@ def send_deployment_stages( message_center=message_center ) - def send_deployment_delete_request_to_edges(self, end_point_id, payload, model_msg_object, message_center=None): + def send_deployment_delete_request_to_edges(self, end_point_id, payload, model_msg_object, message_center=None, + args=None): run_id_str = str(end_point_id) if self.job_runners.get(run_id_str, None) is not None: self.job_runners[run_id_str].send_deployment_delete_request_to_edges( payload, model_msg_object, message_center=message_center) + else: + # Hotfix: re-instantiate the job runner + # TODO(Alay, Raphael): Try to dig into whether re-instantiate the job runner is necessary + self.job_runners[run_id_str] = self._generate_job_runner_instance(args) + self.job_runners[run_id_str].send_deployment_delete_request_to_edges( + payload, model_msg_object, message_center=message_center) def stop_device_inference_monitor(self, run_id, end_point_name, model_id, model_name, model_version): run_id_str = str(run_id) diff --git a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py index 668d1192ce..7bfad2f3eb 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py @@ -132,7 +132,7 @@ def callback_delete_deployment(self, topic, payload): # Send delete deployment request to the edge devices FedMLDeployJobRunnerManager.get_instance().send_deployment_delete_request_to_edges( - model_msg_object.run_id, payload, model_msg_object, message_center=self.message_center) + model_msg_object.run_id, payload, model_msg_object, message_center=self.message_center, args=self.args) # Stop processes on master FedMLDeployJobRunnerManager.get_instance().stop_job_runner(model_msg_object.run_id) From 7ccf195113d4c5dccfe3fabc3aae9fea71f33e0d Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Sat, 15 Jun 2024 00:17:04 +0000 Subject: [PATCH 25/38] [Deploy] Support arbitrary container image onboarding. 
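A deployment can now point at any prebuilt image: inference_image_name plus container_run_command replace the FedML-format entry_point/job/bootstrap flow, environment_variables from the YAML are passed through to the container, and readiness_probe / liveness_probe describe how the replica should be health-checked. With a custom readiness path, the check polls http://<host>:<port>/<path> instead of the default /ready. A minimal sketch of that probe, reusing the httpx client pattern from FedMLHttpInference; probe_ready and its defaults are illustrative, not part of the patch:

    import asyncio
    import httpx

    async def probe_ready(host: str, port: int, path: str = "health") -> bool:
        # GET http://<host>:<port>/<path>; HTTP 200 means the replica is ready,
        # anything else (or a timeout) means keep waiting.
        url = f"http://{host}:{port}/{path}"
        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(url, timeout=5)
            return response.status_code == 200
        except Exception:
            return False

    if __name__ == "__main__":
        # For the lorax example config in this patch (port 80, readiness path "health"):
        print(asyncio.run(probe_ready("127.0.0.1", 80)))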
--- .../custom_inference_image.yaml | 19 +- .../custom_inference_image/serve_main.py | 16 -- .../scheduler/comm_utils/job_monitor.py | 27 +- .../device_client_constants.py | 4 + .../device_http_inference_protocol.py | 8 +- .../device_model_deployment.py | 265 +++++++++--------- .../model_scheduler/device_model_inference.py | 56 ++-- .../model_scheduler/worker_job_runner.py | 16 -- python/fedml/core/mlops/mlops_device_perfs.py | 2 +- 9 files changed, 201 insertions(+), 212 deletions(-) delete mode 100644 python/examples/deploy/custom_inference_image/serve_main.py diff --git a/python/examples/deploy/custom_inference_image/custom_inference_image.yaml b/python/examples/deploy/custom_inference_image/custom_inference_image.yaml index 0c62767b40..467c7c48b0 100644 --- a/python/examples/deploy/custom_inference_image/custom_inference_image.yaml +++ b/python/examples/deploy/custom_inference_image/custom_inference_image.yaml @@ -1,13 +1,14 @@ workspace: "./" -job: | - echo "Start serving..." - python3 serve_main.py -bootstrap: | - echo "Bootstrap start..." - echo "Bootstrap finished!" +inference_image_name: "ghcr.io/predibase/lorax:main" +container_run_command: "--model-id mistralai/Mistral-7B-Instruct-v0.1" -enable_custom_image: true -inference_image_name: "fedml/fedml-default-inference-backend" -deploy_timeout: 1000 +environment_variables: + HUGGING_FACE_HUB_TOKEN: "" +readiness_probe: + path: "health" + +port: 80 + +deploy_timeout: 1600 diff --git a/python/examples/deploy/custom_inference_image/serve_main.py b/python/examples/deploy/custom_inference_image/serve_main.py deleted file mode 100644 index a7a1dd84f3..0000000000 --- a/python/examples/deploy/custom_inference_image/serve_main.py +++ /dev/null @@ -1,16 +0,0 @@ -from fedml.serving import FedMLPredictor -from fedml.serving import FedMLInferenceRunner - - -class DummyPredictor(FedMLPredictor): - def __init__(self): - super().__init__() - - def predict(self, request): - return {"Aloha": request} - - -if __name__ == "__main__": - predictor = DummyPredictor() - fedml_inference_runner = FedMLInferenceRunner(predictor) - fedml_inference_runner.run() \ No newline at end of file diff --git a/python/fedml/computing/scheduler/comm_utils/job_monitor.py b/python/fedml/computing/scheduler/comm_utils/job_monitor.py index 97a4cb6ebc..d216b46dad 100644 --- a/python/fedml/computing/scheduler/comm_utils/job_monitor.py +++ b/python/fedml/computing/scheduler/comm_utils/job_monitor.py @@ -40,6 +40,7 @@ from fedml.core.mlops.mlops_runtime_log import MLOpsRuntimeLog from fedml.core.mlops.mlops_utils import MLOpsLoggingUtils from fedml.core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants from ..scheduler_core.endpoint_sync_protocol import FedMLEndpointSyncProtocol from ..model_scheduler.device_server_constants import ServerConstants @@ -758,9 +759,8 @@ def monitor_slave_endpoint_status(self): except Exception as e: pass - def _lenient_check_replica_ready( - self, deployment_result - ): + @staticmethod + def _lenient_check_replica_ready(deployment_result): """ Double-check the replica's liveness using /ready api: if 200 -> return True @@ -769,8 +769,27 @@ def _lenient_check_replica_ready( """ result_json = deployment_result inference_url = result_json.get("model_url", None) + liveliness_check = result_json.get("model_metadata", {}).get("liveliness_check", None) + readiness_check = result_json.get("model_metadata", {}).get("readiness_check", None) + + if 
liveliness_check is not None: + if liveliness_check == ClientConstants.LIVENESS_PROBE_DEFAULT: + liveliness_check = readiness_check # Follow the readiness check pattern + if not isinstance(liveliness_check, dict): + logging.warning(f"Healthiness check is not a dict. {liveliness_check}") + return True + if "path" not in liveliness_check: + logging.warning(f"Healthiness check does not have path. {liveliness_check}") + return True + response_ok = asyncio.run(FedMLHttpInference.is_inference_ready( + inference_url, timeout=SchedulerConstants.ENDPOINT_INFERENCE_READY_TIMEOUT, + path=liveliness_check["path"])) + if response_ok is None: + # This means the server return 202 + return False + return True - # Make a curl get to inference_url with timeout 5s + # Make a curl get to inference_url/ready with timeout 5s # TODO(Raphael): Also support PROXY and MQTT to check the readiness response_ok = asyncio.run(FedMLHttpInference.is_inference_ready(inference_url, timeout=5)) if response_ok is None: diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index fdcbdf0a34..cd21de2e04 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -143,6 +143,10 @@ class ClientConstants(object): DEVICE_DIFF_DELETE_OPERATION = "op: delete" DEVICE_DIFF_REPLACE_OPERATION = "op: replace" + READINESS_PROBE_DEFAULT = "DEFAULT" + LIVENESS_PROBE_DEFAULT = "DEFAULT" + + LOGIN_MODE_ON_PREMISE_INDEX = 0 LOGIN_MODE_FEDML_CLOUD_INDEX = 1 LOGIN_MODE_PUBLIC_CLOUD_INDEX = 2 diff --git a/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py b/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py index 7e4c06ea5d..41c565d5d8 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py @@ -14,14 +14,14 @@ def __init__(self): pass @staticmethod - async def is_inference_ready(inference_url, timeout=None): - ''' + async def is_inference_ready(inference_url, path="ready", timeout=None): + """ True: inference is ready False: cannot be reached, will try other protocols None: can be reached, but not ready - ''' + """ url_parsed = urlparse(inference_url) - ready_url = f"http://{url_parsed.hostname}:{url_parsed.port}/ready" + ready_url = f"http://{url_parsed.hostname}:{url_parsed.port}/{path}" response_ok = False try: async with httpx.AsyncClient() as client: diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index edd2ebea9a..71f0c8032a 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -68,42 +68,26 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, num_gpus = gpu_per_replica gpu_ids, gpu_attach_cmd = None, "" - # Concatenate the model name + # Concatenate the full model name running_model_name = ClientConstants.get_running_model_name( end_point_name, inference_model_name, model_version, end_point_id, model_id, edge_id=edge_id) - # Parse the model config file and get the necessary information for the deployment + # Parse the model config file model_config_path = 
os.path.join(model_storage_local_path, "fedml_model_config.yaml") with open(model_config_path, 'r') as file: config = yaml.safe_load(file) + inference_type = "default" # Resource related - inference_type = "default" - use_gpu = config.get('use_gpu', True) - num_gpus_frm_yml = config.get('num_gpus', None) - if not use_gpu: - num_gpus = 0 - else: - if num_gpus_frm_yml is not None: - num_gpus = int(num_gpus_frm_yml) - usr_indicated_wait_time = config.get('deploy_timeout', 900) - usr_indicated_retry_cnt = max(int(usr_indicated_wait_time) // 10, 1) - shm_size = config.get('shm_size', None) - storage_opt = config.get('storage_opt', None) - tmpfs = config.get('tmpfs', None) - cpus = config.get('cpus', None) - if cpus is not None: - cpus = int(cpus) - memory = config.get('memory', None) - - inference_image_name = config.get('inference_image_name', - ClientConstants.INFERENCE_SERVER_CUSTOME_IMAGE) - image_pull_policy = config.get('image_pull_policy', SchedulerConstants.IMAGE_PULL_POLICY_IF_NOT_PRESENT) - - # Source code dir, bootstrap dir, data cache dir - src_code_dir = os.path.join(model_storage_local_path, config.get('source_code_dir', "")) + use_gpu, num_gpus, shm_size, storage_opt, tmpfs, cpus, memory, port_inside_container = \ + parse_resource_related_config(config, gpu_per_replica) - # Get the bootstrap and job commands inside the yaml file + # Image related + inference_image_name, image_pull_policy, registry_name, registry_provider, \ + registry_user_name, registry_user_password = parse_image_registry_related_config(config) + + # Bootstrap, job and entrypoint related + dst_model_serving_dir = "/home/fedml/models_serving" bootstrap_cmds_str_frm_yaml = config.get('bootstrap', "") job_cmds_str_frm_yaml = config.get('job', "") @@ -119,36 +103,37 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, else: src_bootstrap_file_path = "" - data_cache_dir_input = config.get('data_cache_dir', "") - request_input_example = config.get('request_input_example', None) - extra_envs = config.get('environment_variables', None) - - # Serving dir inside docker - dst_model_serving_dir = "/home/fedml/models_serving" - relative_entry = config.get('entry_point') if src_bootstrap_file_path != "": dst_bootstrap_dir = os.path.join(dst_model_serving_dir, auto_gen_bootstrap_file_name) else: dst_bootstrap_dir = "" - # If using customized image, then bootstrap + job will be the entry point - enable_custom_image = config.get("enable_custom_image", False) - # inference_type = "custom" - customized_image_entry_cmd = \ - "/bin/bash /home/fedml/models_serving/fedml-deploy-bootstrap-entry-auto-gen.sh" + # If the entry point is in fedml format (e.g., "main.py") + relative_entry_fedml_format = config.get('entry_point', "") + + # User indicate either fedml format python main entry filename or entry command + customized_image_entry_cmd = config.get('container_run_command', None) + customized_readiness_check = config.get('readiness_probe', ClientConstants.READINESS_PROBE_DEFAULT) + customized_liveliness_check = config.get('liveness_probe', ClientConstants.LIVENESS_PROBE_DEFAULT) + + # Storage related + src_code_dir = os.path.join(model_storage_local_path, config.get('source_code_dir', "")) + data_cache_dir_input = config.get('data_cache_dir', "") - docker_registry_user_name = config.get("docker_registry_user_name", "") - docker_registry_user_password = config.get("docker_registry_user_password", "") - docker_registry = config.get("docker_registry", "") + # Others + extra_envs = 
config.get('environment_variables', None) + usr_indicated_wait_time = config.get('deploy_timeout', 900) + usr_indicated_retry_cnt = max(int(usr_indicated_wait_time) // 10, 1) + request_input_example = config.get('request_input_example', None) - port_inside_container = int(config.get("port", 2345)) + # Parameter's check + if inference_engine != ClientConstants.INFERENCE_ENGINE_TYPE_INT_DEFAULT: + raise Exception(f"inference engine {inference_engine} is not supported") - # Request the GPU ids for the deployment + # Request the GPU if num_gpus > 0: gpu_ids, gpu_attach_cmd = request_gpu_ids_on_deployment( edge_id, end_point_id, num_gpus=num_gpus, master_device_id=master_device_id) - - # set replica and their gpu ids FedMLModelCache.get_instance().set_redis_params() FedMLModelCache.get_instance().set_replica_gpu_ids( end_point_id, end_point_name, inference_model_name, edge_id, replica_rank+1, gpu_ids) @@ -159,50 +144,51 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, if not os.path.exists(model_serving_dir): os.makedirs(model_serving_dir, exist_ok=True) - if inference_engine != ClientConstants.INFERENCE_ENGINE_TYPE_INT_DEFAULT: - raise Exception(f"inference engine {inference_engine} is not supported") - - # Get the master device id - logging.info(f"master ip: {master_ip}, worker ip: {infer_host}") + # Determine whether to report public ip or localhost if infer_host == master_ip: logging.info("infer_host is the same as master ip, will use 127.0.0.1 to avoid firewall issue") infer_host = "127.0.0.1" + else: + logging.info("Master and worker are located in different machines, will use the public ip for inference") + # Init container interface client try: client = docker.from_env() - if enable_custom_image and docker_registry_user_name != "" and docker_registry_user_password != "" \ - and docker_registry != "": - client.login(username=docker_registry_user_name, password=docker_registry_user_password, - registry=docker_registry) + if registry_provider == "Docker" and registry_user_name != "" and registry_user_password != "" \ + and registry_name != "": + client.login(username=registry_user_name, password=registry_user_password, + registry=registry_name) except Exception: logging.error("Failed to connect to the docker daemon, please ensure that you have " "installed Docker Desktop or Docker Engine, and the docker is running") return "", "", None, None, None + # Pull the inference image + logging.info(f"Start pulling the inference image {inference_image_name}... with policy {image_pull_policy}") + ContainerUtils.get_instance().pull_image_with_policy(image_pull_policy, inference_image_name) + + # Remove if the container exists container_prefix = ("{}".format(ClientConstants.FEDML_DEFAULT_SERVER_CONTAINER_NAME_PREFIX) + "__" + security_utils.get_content_hash(running_model_name)) - default_server_container_name = container_prefix + "__" + str(replica_rank) - try: exist_container_obj = client.containers.get(default_server_container_name) except docker.errors.NotFound: exist_container_obj = None except docker.errors.APIError: raise Exception("Failed to get the container object") + # Allocate the GPU + # TODO: Make sure no competition for each replica in a single deployment + if exist_container_obj is not None: + client.api.remove_container(exist_container_obj.id, v=True, force=True) - # Pull the inference image - logging.info(f"Start pulling the inference image {inference_image_name}... 
with policy {image_pull_policy}") - ContainerUtils.get_instance().pull_image_with_policy(image_pull_policy, inference_image_name) - + # Build host config volumes = [] binds = {} environment = {} - # data_cache_dir mounting if isinstance(data_cache_dir_input, str): # In this case, we mount to the same folder, if it has ~, we replace it with /home/fedml - src_data_cache_dir, dst_data_cache_dir = "", "" if data_cache_dir_input != "": if data_cache_dir_input[0] == "~": src_data_cache_dir = os.path.expanduser(data_cache_dir_input) @@ -239,16 +225,17 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, else: logging.warning("data_cache_dir_input is not a string or a dictionary, skip mounting it to the container") - # Default mounting - if not enable_custom_image or (enable_custom_image and relative_entry != ""): + # FedML format main entry filename, e.g., main.py + if relative_entry_fedml_format != "": logging.info("Start copying the source code to the container...") volumes.append(src_code_dir) binds[src_code_dir] = { "bind": dst_model_serving_dir, "mode": "rw" } - environment["MAIN_ENTRY"] = relative_entry + environment["MAIN_ENTRY"] = relative_entry_fedml_format + # Host config host_config_dict = { "binds": binds, "port_bindings": { @@ -261,10 +248,6 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, "mem_limit": memory } - # Allocate the GPU - # TODO: Make sure no competition for each replica in a single deployment - if exist_container_obj is not None: - client.api.remove_container(exist_container_obj.id, v=True, force=True) device_mapping = {} if no_real_gpu_allocation is not None: use_gpu = not no_real_gpu_allocation @@ -277,6 +260,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, host_config_dict.update(device_mapping) # Environment variables + enable_custom_image = False if relative_entry_fedml_format != "" else True if not enable_custom_image: # For some image, the default user is root. Unified to fedml. 
environment["HOME"] = "/home/fedml" @@ -288,7 +272,6 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, environment["FEDML_ENV_VERSION"] = fedml.get_env_version() environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST"] = fedml.get_local_on_premise_platform_host() environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_PORT"] = fedml.get_local_on_premise_platform_port() - if extra_envs is not None: for key in extra_envs: environment[key] = extra_envs[key] @@ -304,8 +287,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, environment=environment, host_config=host_config, detach=True, - command=customized_image_entry_cmd if enable_custom_image else None, - entrypoint=customized_image_entry_cmd if enable_custom_image else None + command=customized_image_entry_cmd, ) client.api.start(container=new_container.get("Id")) except Exception as e: @@ -333,11 +315,12 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, inference_model_name, inference_engine, inference_http_port, inference_type, retry_interval=10, deploy_attempt_threshold=usr_indicated_retry_cnt, request_input_example=request_input_example, infer_host=infer_host, - enable_custom_image=enable_custom_image) + readiness_check=customized_readiness_check) # Return the running model name and the inference output url inference_output_url, running_model_version, ret_model_metadata, ret_model_config = \ check_container_readiness(inference_http_port=inference_http_port, infer_host=infer_host, + readiness_check=customized_readiness_check, request_input_example=request_input_example) if inference_output_url == "": @@ -345,51 +328,24 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, # Successfully get the result from the container model_metadata = ret_model_metadata + model_metadata["liveliness_check"] = customized_liveliness_check + model_metadata["readiness_check"] = customized_readiness_check logging.info(f"[Worker][Replica{replica_rank}] Model deployment is successful with inference_output_url: " f"{inference_output_url}, model_metadata: {model_metadata}, model_config: {ret_model_config}") return running_model_name, inference_output_url, model_version, model_metadata, ret_model_config -def build_inference_req(end_point_name, model_name, token, in_model_metadata): - model_inputs = in_model_metadata["inputs"] - ret_inputs = list() - - for input_item in model_inputs: - ret_item = input_item - shape = ret_item["shape"] - data_type = ret_item["datatype"] - if ClientConstants.MODEL_DATA_TYPE_MAPPING[data_type] == ClientConstants.MODEL_DATA_TYPE_INT: - for i in range(len(shape)): - if shape[i] == -1: # if input shape is dynamic, we set a default value 1 - shape[i] = 1 - ret_item["data"] = torch.randint(0, 1, shape).tolist() - else: - for i in range(len(shape)): - if shape[i] == -1: # if input shape is dynamic, we set a default value 1 - shape[i] = 1 - ret_item["data"] = torch.zeros(shape).tolist() - ret_inputs.append(ret_item) - - input_json = {"end_point_name": end_point_name, - "model_name": model_name, - "token": str(token), - "inputs": ret_inputs, - "outputs": in_model_metadata["outputs"]} - output_json = in_model_metadata["outputs"] - - return input_json, output_json - - def should_exit_logs(end_point_id, model_id, cmd_type, model_name, inference_engine, inference_port, inference_type="default", request_input_example=None, infer_host="127.0.0.1", - enable_custom_image=False): + readiness_check=ClientConstants.READINESS_PROBE_DEFAULT): if 
cmd_type == ClientConstants.CMD_TYPE_RUN_DEFAULT_SERVER: # TODO: Exited Quickly if the container is Exited or Removed # If the container has exited, return True, means we should exit the logs try: inference_output_url, model_version, model_metadata, model_config = \ check_container_readiness(inference_http_port=inference_port, infer_host=infer_host, + readiness_check=readiness_check, request_input_example=request_input_example) if inference_output_url != "": logging.info("Log test for deploying model successfully, inference url: {}, " @@ -410,7 +366,7 @@ def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type, inference_http_port, inference_type="default", retry_interval=10, deploy_attempt_threshold=10, request_input_example=None, infer_host="127.0.0.1", - enable_custom_image=False): + readiness_check=ClientConstants.READINESS_PROBE_DEFAULT): deploy_attempt = 0 last_log_time = datetime.datetime.now() @@ -478,11 +434,10 @@ def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type, client.api.remove_container(container_obj.id, v=True, force=True) break - # should_exit_logs will ping the inference container - # return True if ready + # should_exit_logs will ping the inference container, return True if ready if should_exit_logs(end_point_id, model_id, cmd_type, inference_model_name, inference_engine, inference_http_port, inference_type, request_input_example, - infer_host, enable_custom_image=enable_custom_image): + infer_host, readiness_check=readiness_check): break # Not yet ready, retry @@ -504,10 +459,58 @@ def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type, time.sleep(retry_interval) -def is_client_inference_container_ready(infer_url_host, inference_http_port, readiness_check_type="default", - readiness_check_cmd=None, request_input_example=None): +def parse_resource_related_config(config, gpu_num_frm_platform=0): + use_gpu = config.get('use_gpu', True) + num_gpus_frm_yml = config.get('num_gpus', None) + + num_gpus = gpu_num_frm_platform + # Priority: num_gpus from yaml > num_gpus from platform + if use_gpu: + if num_gpus_frm_yml is not None: + num_gpus = int(num_gpus_frm_yml) + else: + num_gpus = 0 + + shm_size = config.get('shm_size', None) + storage_opt = config.get('storage_opt', None) + tmpfs = config.get('tmpfs', None) + cpus = config.get('cpus', None) + if cpus is not None: + cpus = int(cpus) + memory = config.get('memory', None) + port_inside_container = int(config.get("port", 2345)) + + return use_gpu, num_gpus, shm_size, storage_opt, tmpfs, cpus, memory, port_inside_container + + +def parse_image_registry_related_config(config): + inference_image_name = config.get('inference_image_name', ClientConstants.INFERENCE_SERVER_CUSTOME_IMAGE) + image_pull_policy = config.get('image_pull_policy', SchedulerConstants.IMAGE_PULL_POLICY_IF_NOT_PRESENT) + + # Optional + registry_specs = config.get('registry_specs', {}) + registry_name = registry_specs.get("docker_registry_user_name", "") + registry_provider = registry_specs.get("registry_provider", "") + registry_user_name = config.get("registry_user_name", "") + registry_user_password = config.get("registry_user_password", "") + + return (inference_image_name, image_pull_policy, registry_name, registry_provider, + registry_user_name, registry_user_password) + + +def is_client_inference_container_ready(infer_url_host, inference_http_port, + readiness_check=ClientConstants.READINESS_PROBE_DEFAULT, + request_input_example=None, container_id=None): + # Construct the model 
metadata (input and output) + model_metadata = {} + if request_input_example is not None and len(request_input_example) > 0: + model_metadata["inputs"] = request_input_example + else: + model_metadata["inputs"] = {"text": "What is a good cure for hiccups?"} + model_metadata["outputs"] = [] + model_metadata["type"] = "default" - if readiness_check_type == "default": + if readiness_check == ClientConstants.READINESS_PROBE_DEFAULT: default_client_container_ready_url = "http://{}:{}/ready".format("0.0.0.0", inference_http_port) response = None try: @@ -517,26 +520,36 @@ def is_client_inference_container_ready(infer_url_host, inference_http_port, rea if not response or response.status_code != 200: return "", "", {}, {} - # Construct the model metadata (input and output) - model_metadata = {} - if request_input_example is not None and len(request_input_example) > 0: - model_metadata["inputs"] = request_input_example - else: - model_metadata["inputs"] = {"text": "What is a good cure for hiccups?"} - model_metadata["outputs"] = [] - model_metadata["type"] = "default" - return "http://{}:{}/predict".format(infer_url_host, inference_http_port), None, model_metadata, None else: - # TODO(Raphael): Support arbitrary readiness check command - logging.error(f"Unknown readiness check type: {readiness_check_type}") - return "", "", {}, {} + if not isinstance(readiness_check, dict): + logging.error(f"Unknown readiness check type: {readiness_check}") + return "", "", {}, {} + + if "path" in readiness_check: + readiness_check_url = f"http://{infer_url_host}:{inference_http_port}/{readiness_check['path']}" + response = None + try: + response = requests.get(readiness_check_url) + except: + pass + if not response or response.status_code != 200: + return "", "", {}, {} + + return "http://{}:{}/".format(infer_url_host, inference_http_port), None, model_metadata, None + elif "command" in readiness_check: + # TODO(raphael): Support arbitrary readiness check command by using + # container id and docker exec + return "http://{}:{}/".format(infer_url_host, inference_http_port), None, model_metadata, None + else: + logging.error(f"Unknown readiness check type: {readiness_check}") + return "", "", {}, {} def check_container_readiness(inference_http_port, infer_host="127.0.0.1", request_input_example=None, - readiness_check_type="default", readiness_check_cmd=None): + readiness_check=ClientConstants.READINESS_PROBE_DEFAULT): response_from_client_container = is_client_inference_container_ready( - infer_host, inference_http_port, readiness_check_type, readiness_check_cmd, + infer_host, inference_http_port, readiness_check=readiness_check, request_input_example=request_input_example) return response_from_client_container diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index ba13006245..84141851b0 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -342,56 +342,40 @@ async def send_inference_request(idle_device, end_point_id, inference_url, input try: if connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP: - response_ok = await FedMLHttpInference.is_inference_ready( + response_ok, inference_response = await FedMLHttpInference.run_http_inference_with_curl_request( inference_url, + input_list, + output_list, + inference_type=inference_type, timeout=request_timeout_sec) - if 
response_ok: - response_ok, inference_response = await FedMLHttpInference.run_http_inference_with_curl_request( - inference_url, - input_list, - output_list, - inference_type=inference_type, - timeout=request_timeout_sec) - logging.debug(f"Use http inference. return {response_ok}") - return inference_response + logging.debug(f"Use http inference. return {response_ok}") + return inference_response elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY: - logging.warning("Use http proxy inference.") - response_ok = await FedMLHttpProxyInference.is_inference_ready( + logging.debug("Use http proxy inference.") + response_ok, inference_response = await FedMLHttpProxyInference.run_http_proxy_inference_with_request( + end_point_id, inference_url, + input_list, + output_list, + inference_type=inference_type, timeout=request_timeout_sec) - if response_ok: - response_ok, inference_response = await FedMLHttpProxyInference.run_http_proxy_inference_with_request( - end_point_id, - inference_url, - input_list, - output_list, - inference_type=inference_type, - timeout=request_timeout_sec) - logging.info(f"Use http proxy inference. return {response_ok}") - return inference_response + logging.debug(f"Use http proxy inference. return {response_ok}") + return inference_response elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT: - logging.warning("Use mqtt inference.") + logging.debug("Use mqtt inference.") agent_config = {"mqtt_config": Settings.mqtt_config} mqtt_inference = FedMLMqttInference( agent_config=agent_config, run_id=end_point_id) - response_ok = mqtt_inference.run_mqtt_health_check_with_request( + response_ok, inference_response = mqtt_inference.run_mqtt_inference_with_request( idle_device, end_point_id, inference_url, + input_list, + output_list, + inference_type=inference_type, timeout=request_timeout_sec) - inference_response = {"error": True, "message": "Failed to use http, http-proxy and mqtt for inference."} - if response_ok: - response_ok, inference_response = mqtt_inference.run_mqtt_inference_with_request( - idle_device, - end_point_id, - inference_url, - input_list, - output_list, - inference_type=inference_type, - timeout=request_timeout_sec) - - logging.info(f"Use mqtt inference. return {response_ok}.") + logging.debug(f"Use mqtt inference. 
return {response_ok}.") return inference_response else: return {"error": True, "message": "Failed to use http, http-proxy for inference, no response from replica."} diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py index 8100707386..a892412d29 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py @@ -460,22 +460,6 @@ def construct_deployment_results(self, end_point_name, device_id, model_status, } return deployment_results_payload - def construct_deployment_status(self, end_point_name, device_id, - model_id, model_name, model_version, - model_inference_url, model_status, - inference_port=ClientConstants.MODEL_INFERENCE_DEFAULT_PORT, - replica_no=1, # start from 1 - ): - deployment_status_payload = {"end_point_id": self.run_id, "end_point_name": end_point_name, - "device_id": device_id, - "model_id": model_id, "model_name": model_name, - "model_version": model_version, - "model_url": model_inference_url, "model_status": model_status, - "inference_port": inference_port, - "replica_no": replica_no, - } - return deployment_status_payload - def send_deployment_results(self, end_point_name, device_id, model_status, model_id, model_name, model_inference_url, model_version, inference_port, inference_engine, diff --git a/python/fedml/core/mlops/mlops_device_perfs.py b/python/fedml/core/mlops/mlops_device_perfs.py index 29183a6e78..4bb41df73f 100644 --- a/python/fedml/core/mlops/mlops_device_perfs.py +++ b/python/fedml/core/mlops/mlops_device_perfs.py @@ -42,7 +42,7 @@ def __init__(self): self.monitor_replica_num_process = None self.monitor_replica_perf_process = None self.job_total_monitor_process = None - self.enable_job_total_monitor = False + self.enable_job_total_monitor = False # TODO(Raphael): Enable the healthiness check by this job total monitor self.args = None self.device_id = None self.run_id = None From 9ca6ecc1d23166223e7788adfbe688379a18f193 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Mon, 17 Jun 2024 18:25:59 -0700 Subject: [PATCH 26/38] [Deploy] Add LoraX and Triton examples; Add url match pattern. 
--- .../deploy/custom_inference_image/README.md | 48 ------------------- .../{ => lorax}/custom_inference_image.yaml | 0 .../template/custom_inference_image.yaml | 16 +++++++ .../model_repository/dummy/1/model.py | 25 ++++++++++ .../scheduler/comm_utils/network_util.py | 11 +++++ .../device_model_deployment.py | 19 ++++---- .../model_scheduler/device_model_inference.py | 28 +++++++++-- 7 files changed, 87 insertions(+), 60 deletions(-) delete mode 100644 python/examples/deploy/custom_inference_image/README.md rename python/examples/deploy/custom_inference_image/{ => lorax}/custom_inference_image.yaml (100%) create mode 100644 python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml create mode 100644 python/examples/deploy/custom_inference_image/triton_inference_server/template/model_repository/dummy/1/model.py diff --git a/python/examples/deploy/custom_inference_image/README.md b/python/examples/deploy/custom_inference_image/README.md deleted file mode 100644 index 1269e4c064..0000000000 --- a/python/examples/deploy/custom_inference_image/README.md +++ /dev/null @@ -1,48 +0,0 @@ -## Create a model card at local -First, create a model card at local -```bash -fedml model create -n custom_inference_image -cf custom_inference_image.yaml -``` - -## Low Code UI Deploy -Push the model to nexus ai platform -```bash -fedml model push -n custom_inference_image -``` -Do the following docs to deploy the model on nexus ai platform -https://docs-dev.fedml.ai/deploy/low_code_ui - -## CLI Deploy -### Deploy to current machine -Docs: https://docs-dev.fedml.ai/deploy/deploy_local -```bash -fedml model deploy -n custom_inference_image --local -``` - -### Deploy to On-premise -Docs: https://docs-dev.fedml.ai/deploy/deploy_on_premise -```bash -fedml device bind $api_key -``` -```bash -fedml model deploy -n my_model -m $master_ids -w $worker_ids -``` - -### Deploy to GPU Cloud -Docs: https://docs-dev.fedml.ai/deploy/deploy_cloud - -Change the `custom_inference_image.yaml` file, adding following lines -```yaml -computing: - minimum_num_gpus: 1 # minimum # of GPUs to provision - maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card - #allow_cross_cloud_resources: true # true, false - #device_type: CPU # options: GPU, CPU, hybrid - resource_type: A100-80G # e.g., A100-80G, - # please check the resource type list by "fedml show-resource-type" - # or visiting URL: https://fedml.ai/accelerator_resource_type -``` - -```bash -fedml model deploy -n custom_inference_image -``` \ No newline at end of file diff --git a/python/examples/deploy/custom_inference_image/custom_inference_image.yaml b/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml similarity index 100% rename from python/examples/deploy/custom_inference_image/custom_inference_image.yaml rename to python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml diff --git a/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml b/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml new file mode 100644 index 0000000000..02dca147ce --- /dev/null +++ b/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml @@ -0,0 +1,16 @@ +workspace: "./" + +inference_image_name: "nvcr.io/nvidia/tritonserver:24.05-py3" + +# If you put the model repository in $workspace/model_repository, it will be mounted to 
/home/fedml/models_serving/model_repository +container_run_command: "tritonserver --model-repository=/home/fedml/models_serving/model_repository" + +# If your image has the repository inside it, say in /my_models_dir/model_repository, you can do: +#container_run_command: "tritonserver --model-repository=/my_models_dir/model_repository" + +readiness_probe: + path: "v2/health/ready" + +port: 8000 + +deploy_timeout: 1600 diff --git a/python/examples/deploy/custom_inference_image/triton_inference_server/template/model_repository/dummy/1/model.py b/python/examples/deploy/custom_inference_image/triton_inference_server/template/model_repository/dummy/1/model.py new file mode 100644 index 0000000000..0404a127ff --- /dev/null +++ b/python/examples/deploy/custom_inference_image/triton_inference_server/template/model_repository/dummy/1/model.py @@ -0,0 +1,25 @@ +import json +import numpy as np +import triton_python_backend_utils as pb_utils + +class TritonPythonModel: + def initialize(self, args): + self.model_name = args['model_name'] + + @staticmethod + def auto_complete_config(auto_complete_model_config): + auto_complete_model_config.add_input( {"name": "text_input", "data_type": "TYPE_STRING", "dims": [-1]}) + auto_complete_model_config.add_output({"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]}) + auto_complete_model_config.set_max_batch_size(0) + return auto_complete_model_config + + def execute(self, requests): + responses = [] + for request in requests: + in_numpy = pb_utils.get_input_tensor_by_name(request, "text_input").as_numpy() + assert np.object_ == in_numpy.dtype, 'in this demo, triton passes in a numpy array of size 1 with object_ dtype, this dtype encapsulates a python bytes-array' + print('in this demo len(in_numpy) is 1:', len(in_numpy.tolist())) + out_numpy = np.array([ (self.model_name + ': ' + python_byte_array.decode('utf-8') + ' World').encode('utf-8') for python_byte_array in in_numpy.tolist()], dtype = np.object_) + out_pb = pb_utils.Tensor("text_output", out_numpy) + responses.append(pb_utils.InferenceResponse(output_tensors = [out_pb])) + return responses diff --git a/python/fedml/computing/scheduler/comm_utils/network_util.py b/python/fedml/computing/scheduler/comm_utils/network_util.py index 48e478f23f..b03b0428d0 100644 --- a/python/fedml/computing/scheduler/comm_utils/network_util.py +++ b/python/fedml/computing/scheduler/comm_utils/network_util.py @@ -1,4 +1,5 @@ import os +from urllib.parse import urlparse from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants @@ -16,3 +17,13 @@ def return_this_device_connectivity_type() -> str: return env_conn_type else: return ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT + + +def replace_url_with_path(url: str, path: str) -> str: + """ + Replace the path of the URL with the given path. 
+ """ + if path is None: + return url + url_parsed = urlparse(url) + return f"{url_parsed.scheme}://{url_parsed.netloc}/{path}" diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 71f0c8032a..1aef8c09f1 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -118,6 +118,8 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, # Storage related src_code_dir = os.path.join(model_storage_local_path, config.get('source_code_dir', "")) + # TODO(Raphael): In the future, the data_cache_dir should not be controlled by the user. It only + # used for internal avoiding checkpoint re-download. e.g. ~/.cache/huggingface/hub/ data_cache_dir_input = config.get('data_cache_dir', "") # Others @@ -225,15 +227,14 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, else: logging.warning("data_cache_dir_input is not a string or a dictionary, skip mounting it to the container") - # FedML format main entry filename, e.g., main.py - if relative_entry_fedml_format != "": - logging.info("Start copying the source code to the container...") - volumes.append(src_code_dir) - binds[src_code_dir] = { - "bind": dst_model_serving_dir, - "mode": "rw" - } - environment["MAIN_ENTRY"] = relative_entry_fedml_format + # Inject the source code + logging.info("Start copying the source code to the container...") + volumes.append(src_code_dir) + binds[src_code_dir] = { + "bind": dst_model_serving_dir, + "mode": "rw" + } + environment["MAIN_ENTRY"] = relative_entry_fedml_format # Host config host_config_dict = { diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index 84141851b0..f6fa99d6d4 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -21,6 +21,7 @@ from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache from fedml.computing.scheduler.model_scheduler.device_mqtt_inference_protocol import FedMLMqttInference from fedml.computing.scheduler.model_scheduler.device_http_proxy_inference_protocol import FedMLHttpProxyInference +from fedml.computing.scheduler.comm_utils.network_util import replace_url_with_path from fedml.core.mlops.mlops_configs import MLOpsConfigs from fedml.core.mlops import MLOpsRuntimeLog, MLOpsRuntimeLogDaemon @@ -168,10 +169,27 @@ async def predict_with_end_point_id(end_point_id, request: Request, response: Re return inference_response +@api.post('/custom_inference/{end_point_id}/{path:path}') +async def custom_inference(end_point_id, path: str, request: Request): + # Get json data + input_json = await request.json() + + # Get header + header = request.headers + + try: + inference_response = await _predict(end_point_id, input_json, header, path) + except Exception as e: + inference_response = {"error": True, "message": f"{traceback.format_exc()}"} + + return inference_response + + async def _predict( end_point_id, input_json, - header=None + header=None, + path=None, ) -> Union[MutableMapping[str, Any], Response, StreamingResponse]: # Always increase the pending requests counter on a new incoming request. 
FEDML_MODEL_CACHE.update_pending_requests_counter(end_point_id, increase=True) @@ -245,7 +263,8 @@ async def _predict( input_list, output_list, inference_type=in_return_type, - connectivity_type=connectivity_type) + connectivity_type=connectivity_type, + path=path) # Calculate model metrics try: @@ -336,10 +355,13 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ async def send_inference_request(idle_device, end_point_id, inference_url, input_list, output_list, inference_type="default", - connectivity_type=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT): + connectivity_type=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT, + path=None): request_timeout_sec = FEDML_MODEL_CACHE.get_endpoint_settings(end_point_id) \ .get("request_timeout_sec", ClientConstants.INFERENCE_REQUEST_TIMEOUT) + inference_url = replace_url_with_path(inference_url, path) + try: if connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP: response_ok, inference_response = await FedMLHttpInference.run_http_inference_with_curl_request( From 786718bc6b61508b239a4106738724c458ed8c38 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Thu, 20 Jun 2024 10:37:19 -0700 Subject: [PATCH 27/38] [Deploy] Support serverless container. --- .../lorax/custom_inference_image.yaml | 6 +- .../tensorrt_llm/tensorrtllm.yaml | 17 ++ .../template/custom_inference_image.yaml | 16 +- .../device_client_constants.py | 25 +-- .../device_http_inference_protocol.py | 42 ++-- .../device_http_proxy_inference_protocol.py | 1 + .../device_model_deployment.py | 191 ++++++++++++------ .../model_scheduler/device_model_inference.py | 18 +- .../device_server_constants.py | 4 + .../model_scheduler/master_job_runner.py | 38 ++-- 10 files changed, 233 insertions(+), 125 deletions(-) create mode 100644 python/examples/deploy/custom_inference_image/tensorrt_llm/tensorrtllm.yaml diff --git a/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml b/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml index 467c7c48b0..41cbe501d2 100644 --- a/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml +++ b/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml @@ -1,5 +1,6 @@ workspace: "./" +enable_serverless_container: true inference_image_name: "ghcr.io/predibase/lorax:main" container_run_command: "--model-id mistralai/Mistral-7B-Instruct-v0.1" @@ -7,8 +8,9 @@ environment_variables: HUGGING_FACE_HUB_TOKEN: "" readiness_probe: - path: "health" + httpGet: + path: "/health" port: 80 -deploy_timeout: 1600 +deploy_timeout_sec: 1600 diff --git a/python/examples/deploy/custom_inference_image/tensorrt_llm/tensorrtllm.yaml b/python/examples/deploy/custom_inference_image/tensorrt_llm/tensorrtllm.yaml new file mode 100644 index 0000000000..d41dba7983 --- /dev/null +++ b/python/examples/deploy/custom_inference_image/tensorrt_llm/tensorrtllm.yaml @@ -0,0 +1,17 @@ +workspace: "./" + +enable_serverless_container: true +inference_image_name: "fedml/llama3-8b-tensorrtllm" + +# If you put the model repository in $workspace/model_repository, it will be mounted to /home/fedml/models_serving/model_repository +container_run_command: ["sh", "-c", "cd / && huggingface-cli login --token $your_hf_token && pip install sentencepiece protobuf && python3 tensorrtllm_backend/scripts/launch_triton_server.py --model_repo tensorrtllm_backend/all_models/inflight_batcher_llm --world_size 1 && tail -f /dev/null"] + +readiness_probe: + httpGet: + path: 
"/v2/health/ready" + +port: 8000 + +deploy_timeout_sec: 1600 + + diff --git a/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml b/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml index 02dca147ce..eb02e3904a 100644 --- a/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml +++ b/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml @@ -1,16 +1,20 @@ workspace: "./" +enable_serverless_container: true inference_image_name: "nvcr.io/nvidia/tritonserver:24.05-py3" -# If you put the model repository in $workspace/model_repository, it will be mounted to /home/fedml/models_serving/model_repository -container_run_command: "tritonserver --model-repository=/home/fedml/models_serving/model_repository" +volumes: + - workspace_path: "./model_repository" + mount_path: "/repo_inside_container" -# If your image has the repository inside it, say in /my_models_dir/model_repository, you can do: -#container_run_command: "tritonserver --model-repository=/my_models_dir/model_repository" +container_run_command: "tritonserver --model-repository=/repo_inside_container" readiness_probe: - path: "v2/health/ready" + httpGet: + path: "/v2/health/ready" port: 8000 -deploy_timeout: 1600 +deploy_timeout_sec: 1600 + +request_input_example: {"text_input": "Hello"} diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index cd21de2e04..e18c9f730b 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -146,7 +146,6 @@ class ClientConstants(object): READINESS_PROBE_DEFAULT = "DEFAULT" LIVENESS_PROBE_DEFAULT = "DEFAULT" - LOGIN_MODE_ON_PREMISE_INDEX = 0 LOGIN_MODE_FEDML_CLOUD_INDEX = 1 LOGIN_MODE_PUBLIC_CLOUD_INDEX = 2 @@ -155,20 +154,16 @@ class ClientConstants(object): MODEL_DATA_TYPE_INT = "int" MODEL_DATA_TYPE_FLOAT = "float" MODEL_DATA_TYPE_STR = "str" - MODEL_DATA_TYPE_MAPPING = {"TYPE_BOOL": MODEL_DATA_TYPE_INT, "TYPE_UINT8": MODEL_DATA_TYPE_INT, - "TYPE_UINT16": MODEL_DATA_TYPE_INT, "TYPE_UINT32": MODEL_DATA_TYPE_INT, - "TYPE_UINT64": MODEL_DATA_TYPE_INT, "TYPE_INT8": MODEL_DATA_TYPE_INT, - "TYPE_INT16": MODEL_DATA_TYPE_INT, "TYPE_INT32": MODEL_DATA_TYPE_INT, - "TYPE_INT64": MODEL_DATA_TYPE_INT, "TYPE_FP16": MODEL_DATA_TYPE_FLOAT, - "TYPE_FP32": MODEL_DATA_TYPE_FLOAT, "TYPE_FP64": MODEL_DATA_TYPE_FLOAT, - "TYPE_STRING": MODEL_DATA_TYPE_STR, "TYPE_BF16": MODEL_DATA_TYPE_INT, - "BOOL": MODEL_DATA_TYPE_INT, "UINT8": MODEL_DATA_TYPE_INT, - "UINT16": MODEL_DATA_TYPE_INT, "UINT32": MODEL_DATA_TYPE_INT, - "UINT64": MODEL_DATA_TYPE_INT, "INT8": MODEL_DATA_TYPE_INT, - "INT16": MODEL_DATA_TYPE_INT, "INT32": MODEL_DATA_TYPE_INT, - "INT64": MODEL_DATA_TYPE_INT, "FP16": MODEL_DATA_TYPE_FLOAT, - "FP32": MODEL_DATA_TYPE_FLOAT, "FP64": MODEL_DATA_TYPE_FLOAT, - "STRING": MODEL_DATA_TYPE_STR, "BF16": MODEL_DATA_TYPE_INT} + + # Model config yaml related + DEPLOY_TIMEOUT_SEC_KEY = "deploy_timeout_sec" + DEPLOY_TIMEOUT_SEC_DEFAULT = 600 + + ENABLE_SERVERLESS_CONTAINER_KEY = "enable_serverless_container" + + CUSTOMIZED_VOLUMES_MOUNT_KEY = "volumes" + CUSTOMIZED_VOLUMES_PATH_FROM_WORKSPACE_KEY = "workspace_path" + CUSTOMIZED_VOLUMES_PATH_FROM_CONTAINER_KEY = "mount_path" @staticmethod def 
get_fedml_home_dir(): diff --git a/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py b/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py index 41c565d5d8..5b2658f0b3 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py @@ -1,3 +1,5 @@ +import logging + import httpx import traceback @@ -46,9 +48,8 @@ async def is_inference_ready(inference_url, path="ready", timeout=None): @staticmethod async def run_http_inference_with_curl_request( inference_url, inference_input_list, inference_output_list, - inference_type="default", engine_type="default", timeout=None + inference_type="default", engine_type="default", timeout=None, method="POST" ): - model_inference_result = {} if inference_type == "default": model_api_headers = {'Content-Type': 'application/json', 'Connection': 'close', 'Accept': 'application/json'} @@ -63,11 +64,10 @@ async def run_http_inference_with_curl_request( "outputs": inference_output_list } - response_ok = False try: if model_inference_json.get("stream", False): model_inference_result = StreamingResponse( - stream_generator(inference_url, input_json=model_inference_json), + stream_generator(inference_url, input_json=model_inference_json, method=method), media_type="text/event-stream", headers={ "Content-Type": model_api_headers.get("Accept", "text/event-stream"), @@ -76,8 +76,8 @@ async def run_http_inference_with_curl_request( ) response_ok = True else: - response_ok, model_inference_result = await redirect_request_to_worker( - inference_type, inference_url, model_api_headers, model_inference_json, timeout) + response_ok, model_inference_result = await redirect_non_stream_req_to_worker( + inference_type, inference_url, model_api_headers, model_inference_json, timeout, method=method) except Exception as e: response_ok = False model_inference_result = {"response": f"{traceback.format_exc()}"} @@ -85,21 +85,22 @@ async def run_http_inference_with_curl_request( return response_ok, model_inference_result -async def stream_generator(inference_url, input_json): +async def stream_generator(inference_url, input_json, method="POST"): async with httpx.AsyncClient() as client: - async with client.stream("POST", inference_url, json=input_json, + async with client.stream(method, inference_url, json=input_json, timeout=ClientConstants.WORKER_STREAM_API_TIMEOUT) as response: async for chunk in response.aiter_lines(): # we consumed a newline, need to put it back yield f"{chunk}\n" -async def redirect_request_to_worker(inference_type, inference_url, model_api_headers, model_inference_json, timeout=None): +async def redirect_non_stream_req_to_worker(inference_type, inference_url, model_api_headers, model_inference_json, + timeout=None, method="POST"): response_ok = True try: async with httpx.AsyncClient() as client: - response = await client.post( - url=inference_url, headers=model_api_headers, json=model_inference_json, timeout=timeout + response = await client.request( + method=method, url=inference_url, headers=model_api_headers, json=model_inference_json, timeout=timeout ) except Exception as e: response_ok = False @@ -107,13 +108,18 @@ async def redirect_request_to_worker(inference_type, inference_url, model_api_he return response_ok, model_inference_result if response.status_code == 200: - if inference_type == "default": - model_inference_result = response.json() - elif inference_type == 
"image/png": - binary_content: bytes = response.content - model_inference_result = Response(content=binary_content, media_type="image/png") - else: - model_inference_result = response.json() + try: + if inference_type == "default": + model_inference_result = response.json() + elif inference_type == "image/png": + binary_content: bytes = response.content + model_inference_result = Response(content=binary_content, media_type="image/png") + else: + model_inference_result = response.json() + except Exception as e: + response_ok = True + logging.warning(f"Status code 200, but cannot trans response to json due to: {e}.") + model_inference_result = {"response": f"{response.content}"} else: model_inference_result = {"response": f"{response.content}"} diff --git a/python/fedml/computing/scheduler/model_scheduler/device_http_proxy_inference_protocol.py b/python/fedml/computing/scheduler/model_scheduler/device_http_proxy_inference_protocol.py index 53f5a002eb..746d17bb7c 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_http_proxy_inference_protocol.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_http_proxy_inference_protocol.py @@ -50,6 +50,7 @@ async def run_http_proxy_inference_with_request( endpoint_id, inference_url, inference_input_list, inference_output_list, inference_type="default", timeout=None + # TODO(Raphael): Add support for GET and other methods ): inference_response = {} http_proxy_url = f"http://{urlparse(inference_url).hostname}:{ClientConstants.LOCAL_CLIENT_API_PORT}/api/v1/predict" diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 1aef8c09f1..e18081c324 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -112,19 +112,20 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, relative_entry_fedml_format = config.get('entry_point', "") # User indicate either fedml format python main entry filename or entry command - customized_image_entry_cmd = config.get('container_run_command', None) + enable_serverless_container = config.get(ClientConstants.ENABLE_SERVERLESS_CONTAINER_KEY, False) + customized_image_entry_cmd = config.get('container_run_command', None) # Could be str or list customized_readiness_check = config.get('readiness_probe', ClientConstants.READINESS_PROBE_DEFAULT) customized_liveliness_check = config.get('liveness_probe', ClientConstants.LIVENESS_PROBE_DEFAULT) # Storage related src_code_dir = os.path.join(model_storage_local_path, config.get('source_code_dir', "")) - # TODO(Raphael): In the future, the data_cache_dir should not be controlled by the user. It only - # used for internal avoiding checkpoint re-download. e.g. 
~/.cache/huggingface/hub/ data_cache_dir_input = config.get('data_cache_dir', "") + usr_customized_mount_rule = config.get(ClientConstants.CUSTOMIZED_VOLUMES_MOUNT_KEY, None) # Others extra_envs = config.get('environment_variables', None) - usr_indicated_wait_time = config.get('deploy_timeout', 900) + usr_indicated_wait_time = config.get(ClientConstants.DEPLOY_TIMEOUT_SEC_KEY, + config.get("deploy_timeout", ClientConstants.DEPLOY_TIMEOUT_SEC_DEFAULT)) usr_indicated_retry_cnt = max(int(usr_indicated_wait_time) // 10, 1) request_input_example = config.get('request_input_example', None) @@ -189,52 +190,12 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, binds = {} environment = {} - if isinstance(data_cache_dir_input, str): - # In this case, we mount to the same folder, if it has ~, we replace it with /home/fedml - if data_cache_dir_input != "": - if data_cache_dir_input[0] == "~": - src_data_cache_dir = os.path.expanduser(data_cache_dir_input) - dst_data_cache_dir = data_cache_dir_input.replace("~", "/home/fedml") - else: - # check if the data_cache_dir is a relative path - if data_cache_dir_input[0] != "/": - raise "data_cache_dir_input has to be an absolute path or start with ~" - else: - src_data_cache_dir = data_cache_dir_input - dst_data_cache_dir = data_cache_dir_input - logging.info(f"src_data_cache_dir: {src_data_cache_dir}, dst_data_cache_dir: {dst_data_cache_dir}") + # Handle the union volume mount + _handle_union_volume_mount(binds, volumes, environment, data_cache_dir_input) - if type(src_data_cache_dir) == str and src_data_cache_dir != "": - logging.info("Start copying the data cache to the container...") - if os.path.exists(src_data_cache_dir): - volumes.append(src_data_cache_dir) - binds[src_data_cache_dir] = { - "bind": dst_data_cache_dir, - "mode": "rw" - } - environment["DATA_CACHE_FOLDER"] = dst_data_cache_dir - elif isinstance(data_cache_dir_input, dict): - for k, v in data_cache_dir_input.items(): - if os.path.exists(k): - volumes.append(v) - binds[k] = { - "bind": v, - "mode": "rw" - } - else: - logging.warning(f"{k} does not exist, skip mounting it to the container") - logging.info(f"Data cache mount: {volumes}, {binds}") - else: - logging.warning("data_cache_dir_input is not a string or a dictionary, skip mounting it to the container") - - # Inject the source code - logging.info("Start copying the source code to the container...") - volumes.append(src_code_dir) - binds[src_code_dir] = { - "bind": dst_model_serving_dir, - "mode": "rw" - } - environment["MAIN_ENTRY"] = relative_entry_fedml_format + # Handle the default volume mount + handle_volume_mount(volumes, binds, environment, relative_entry_fedml_format, src_code_dir, + dst_model_serving_dir, usr_customized_mount_rule, host_workspace_root=model_storage_local_path) # Host config host_config_dict = { @@ -331,6 +292,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, model_metadata = ret_model_metadata model_metadata["liveliness_check"] = customized_liveliness_check model_metadata["readiness_check"] = customized_readiness_check + model_metadata[ClientConstants.ENABLE_SERVERLESS_CONTAINER_KEY] = enable_serverless_container logging.info(f"[Worker][Replica{replica_rank}] Model deployment is successful with inference_output_url: " f"{inference_output_url}, model_metadata: {model_metadata}, model_config: {ret_model_config}") @@ -527,24 +489,129 @@ def is_client_inference_container_ready(infer_url_host, inference_http_port, logging.error(f"Unknown readiness check 
type: {readiness_check}") return "", "", {}, {} - if "path" in readiness_check: - readiness_check_url = f"http://{infer_url_host}:{inference_http_port}/{readiness_check['path']}" - response = None - try: - response = requests.get(readiness_check_url) - except: - pass - if not response or response.status_code != 200: - return "", "", {}, {} + if "httpGet" in readiness_check: + if "path" in readiness_check["httpGet"]: + check_path = readiness_check["httpGet"]["path"] + if not isinstance(check_path, str): + logging.error(f"Invalid path type: {check_path}, expected str") + return "", "", {}, {} + else: + if not check_path.startswith("/"): + check_path = "/" + check_path + readiness_check_url = f"http://{infer_url_host}:{inference_http_port}{check_path}" - return "http://{}:{}/".format(infer_url_host, inference_http_port), None, model_metadata, None - elif "command" in readiness_check: + response = None + try: + response = requests.get(readiness_check_url) + except: + pass + if not response or response.status_code != 200: + return "", "", {}, {} + + return readiness_check_url, None, model_metadata, None + else: + logging.error("'path' is not specified in httpGet readiness check") + return "", "", {}, {} + elif "exec" in readiness_check: # TODO(raphael): Support arbitrary readiness check command by using # container id and docker exec return "http://{}:{}/".format(infer_url_host, inference_http_port), None, model_metadata, None else: - logging.error(f"Unknown readiness check type: {readiness_check}") - return "", "", {}, {} + # Ref K8S, if no readiness check, we assume the container is ready immediately + return "http://{}:{}/".format(infer_url_host, inference_http_port), None, model_metadata, None + + +def _handle_union_volume_mount(binds, volumes, environment, data_cache_dir_input=None): + """ + Private: data_cache_dir is the union folder on host machine, which will be shard across different containers, + the control of this folder should be handled by the platform. 
+ """ + if isinstance(data_cache_dir_input, str): + # In this case, we mount to the same folder, if it has ~, we replace it with /home/fedml + if data_cache_dir_input != "": + if data_cache_dir_input[0] == "~": + src_data_cache_dir = os.path.expanduser(data_cache_dir_input) + dst_data_cache_dir = data_cache_dir_input.replace("~", "/home/fedml") + else: + # check if the data_cache_dir is a relative path + if data_cache_dir_input[0] != "/": + raise "data_cache_dir_input has to be an absolute path or start with ~" + else: + src_data_cache_dir = data_cache_dir_input + dst_data_cache_dir = data_cache_dir_input + logging.info(f"src_data_cache_dir: {src_data_cache_dir}, dst_data_cache_dir: {dst_data_cache_dir}") + + if isinstance(src_data_cache_dir, str) and src_data_cache_dir != "": + logging.info("Start copying the data cache to the container...") + if os.path.exists(src_data_cache_dir): + volumes.append(src_data_cache_dir) + binds[src_data_cache_dir] = { + "bind": dst_data_cache_dir, + "mode": "rw" + } + environment["DATA_CACHE_FOLDER"] = dst_data_cache_dir + elif isinstance(data_cache_dir_input, dict): + for k, v in data_cache_dir_input.items(): + if os.path.exists(k): + volumes.append(v) + binds[k] = { + "bind": v, + "mode": "rw" + } + else: + logging.warning(f"{k} does not exist, skip mounting it to the container") + logging.info(f"Data cache mount: {volumes}, {binds}") + else: + logging.info("data_cache_dir_input is not a string or a dictionary, skip mounting it to the container") + + +def handle_volume_mount(volumes, binds, environment, relative_entry_fedml_format="", src_code_dir="", + dst_model_serving_dir="", customized_volumes_mount_rule=None, host_workspace_root=""): + # If fedml format entry point is specified, inject the source code, e.g., main.py (FedMLPredictor inside) + if relative_entry_fedml_format != "": + logging.info("Using FedML format entry point, mounting the source code...") + volumes.append(src_code_dir) + binds[src_code_dir] = { + "bind": dst_model_serving_dir, + "mode": "rw" + } + environment["MAIN_ENTRY"] = relative_entry_fedml_format + return # The reason we return here is that we don't need to mount the source code again + + # If customized volume mount rule is specified, just follow the mount rule + """ + e.g., + volumes: + - workspace_path: "./model_repository" + mount_path: "/repo_inside_container" + """ + mount_list = [] + if not isinstance(customized_volumes_mount_rule, list): + if not isinstance(customized_volumes_mount_rule, dict): + logging.warning("customized_volumes_mount_rule is not a list or a dictionary, " + "skip mounting it to the container") + return + + # transform the dict to list + for k, v in customized_volumes_mount_rule.items(): + mount_list.append({ClientConstants.CUSTOMIZED_VOLUMES_PATH_FROM_WORKSPACE_KEY: k, + ClientConstants.CUSTOMIZED_VOLUMES_PATH_FROM_CONTAINER_KEY: v}) + else: + mount_list = customized_volumes_mount_rule if customized_volumes_mount_rule is not None else [] + + for mount in mount_list: + workspace_relative_path = mount[ClientConstants.CUSTOMIZED_VOLUMES_PATH_FROM_WORKSPACE_KEY] + mount_path = mount[ClientConstants.CUSTOMIZED_VOLUMES_PATH_FROM_CONTAINER_KEY] + + workspace_path = os.path.join(host_workspace_root, workspace_relative_path) + if os.path.exists(workspace_path): + volumes.append(workspace_path) + binds[workspace_path] = { + "bind": mount_path, + "mode": "rw" + } + else: + logging.warning(f"{workspace_path} does not exist, skip mounting it to the container") def check_container_readiness(inference_http_port, 
infer_host="127.0.0.1", request_input_example=None, diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index f6fa99d6d4..7ef9689c1c 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -8,7 +8,7 @@ from typing import Any, Mapping, MutableMapping, Union from urllib.parse import urlparse -from fastapi import FastAPI, Request, Response, status +from fastapi import FastAPI, Request, Response, status, APIRouter from fastapi.responses import StreamingResponse, JSONResponse import fedml @@ -38,6 +38,7 @@ class Settings: api = FastAPI() +router = APIRouter() FEDML_MODEL_CACHE = FedMLModelCache.get_instance() FEDML_MODEL_CACHE.set_redis_params(redis_addr=Settings.redis_addr, @@ -169,7 +170,8 @@ async def predict_with_end_point_id(end_point_id, request: Request, response: Re return inference_response -@api.post('/custom_inference/{end_point_id}/{path:path}') +# @api.post('/custom_inference/{end_point_id}/{path:path}') +@router.api_route("/custom_inference/{end_point_id}/{path:path}", methods=["POST", "GET"]) async def custom_inference(end_point_id, path: str, request: Request): # Get json data input_json = await request.json() @@ -178,18 +180,21 @@ async def custom_inference(end_point_id, path: str, request: Request): header = request.headers try: - inference_response = await _predict(end_point_id, input_json, header, path) + inference_response = await _predict(end_point_id, input_json, header, path, request.method) except Exception as e: inference_response = {"error": True, "message": f"{traceback.format_exc()}"} return inference_response +api.include_router(router) + async def _predict( end_point_id, input_json, header=None, path=None, + request_method="POST" ) -> Union[MutableMapping[str, Any], Response, StreamingResponse]: # Always increase the pending requests counter on a new incoming request. FEDML_MODEL_CACHE.update_pending_requests_counter(end_point_id, increase=True) @@ -264,7 +269,7 @@ async def _predict( output_list, inference_type=in_return_type, connectivity_type=connectivity_type, - path=path) + path=path, request_method=request_method) # Calculate model metrics try: @@ -356,7 +361,7 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ async def send_inference_request(idle_device, end_point_id, inference_url, input_list, output_list, inference_type="default", connectivity_type=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT, - path=None): + path=None, request_method="POST"): request_timeout_sec = FEDML_MODEL_CACHE.get_endpoint_settings(end_point_id) \ .get("request_timeout_sec", ClientConstants.INFERENCE_REQUEST_TIMEOUT) @@ -369,7 +374,8 @@ async def send_inference_request(idle_device, end_point_id, inference_url, input input_list, output_list, inference_type=inference_type, - timeout=request_timeout_sec) + timeout=request_timeout_sec, + method=request_method) logging.debug(f"Use http inference. 
return {response_ok}") return inference_response elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY: diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py index a5048c26a6..b58b8fae72 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py @@ -147,6 +147,10 @@ class ServerConstants(object): DEVICE_DIFF_ADD_OPERATION = "op: add" DEVICE_DIFF_DELETE_OPERATION = "op: delete" DEVICE_DIFF_REPLACE_OPERATION = "op: replace" + + # Worker comfig yaml related + ENABLE_SERVERLESS_CONTAINER_KEY = "enable_serverless_container" + @staticmethod def get_fedml_home_dir(): home_dir = expanduser("~") diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index eff26684b7..f95dd8e176 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -298,14 +298,7 @@ def process_deployment_result_message(self, topic=None, payload=None): return else: # This is the last worker that failed, so we should continue to "ABORTED" status - model_config_parameters = self.request_json["parameters"] - inference_port_external = ServerConstants.get_inference_master_gateway_port() - ip = GeneralConstants.get_ip_address(self.request_json) - if ip.startswith("http://") or ip.startswith("https://"): - model_inference_url = "{}/inference/{}".format(ip, end_point_id) - else: - model_inference_url = "http://{}:{}/inference/{}".format(ip, inference_port_external, - end_point_id) + model_inference_url = self.construct_final_gateway_url(end_point_id) self.send_deployment_status( end_point_id, end_point_name, payload_json["model_name"], model_inference_url, @@ -367,13 +360,7 @@ def process_deployment_result_message(self, topic=None, payload=None): """ When all the devices have finished the add / delete / update operation """ - inference_port_external = ServerConstants.get_inference_master_gateway_port() - ip = GeneralConstants.get_ip_address(request_json) - - if ip.startswith("http://") or ip.startswith("https://"): - model_inference_url = "{}/inference/{}".format(ip, end_point_id) - else: - model_inference_url = "http://{}:{}/inference/{}".format(ip, inference_port_external, end_point_id) + model_inference_url, inference_port_external = self.construct_final_gateway_url(end_point_id) # Send stage: MODEL_DEPLOYMENT_STAGE5 = "StartInferenceIngress" self.send_deployment_stages(end_point_id, model_name, model_id, @@ -394,7 +381,7 @@ def process_deployment_result_message(self, topic=None, payload=None): model_metadata = payload_json["model_metadata"] model_inputs = model_metadata["inputs"] - ret_inputs = list() + if "type" in model_metadata and model_metadata["type"] == "default": payload_json["input_json"] = {"end_point_name": end_point_name, "model_name": model_name, @@ -768,3 +755,22 @@ def build_dynamic_args(self, run_id, run_config, package_conf_object, base_dir): def build_dynamic_constrain_variables(self, run_id, run_config): pass + def construct_final_gateway_url(self, end_point_id): + inference_port_external = ServerConstants.get_inference_master_gateway_port() + ip = GeneralConstants.get_ip_address(self.request_json) + + identifier = "inference" + if self.deployed_replica_payload is not 
None: + payload_json = self.deployed_replica_payload + enable_custom_path = payload_json["model_metadata"].get( + ServerConstants.ENABLE_SERVERLESS_CONTAINER_KEY, False) + if enable_custom_path: + identifier = "custom_inference" + + if ip.startswith("http://") or ip.startswith("https://"): + model_inference_url = "{}/{}/{}".format(ip, identifier, end_point_id) + else: + model_inference_url = "http://{}:{}/{}/{}".format(ip, inference_port_external, identifier, + end_point_id) + return model_inference_url, inference_port_external + From c0f691c7fd468549ee311c8ae260ba9c5599a43e Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Thu, 20 Jun 2024 14:27:35 -0700 Subject: [PATCH 28/38] [Deploy] Nit. --- .../custom_inference_image/template.yaml | 22 +++++++++++++++++++ .../scheduler/comm_utils/job_monitor.py | 2 +- .../device_http_inference_protocol.py | 9 ++++---- .../device_model_deployment.py | 2 -- .../model_scheduler/device_model_inference.py | 1 - .../device_server_constants.py | 2 +- .../model_scheduler/master_job_runner.py | 12 ++++------ 7 files changed, 33 insertions(+), 17 deletions(-) create mode 100644 python/examples/deploy/custom_inference_image/template.yaml diff --git a/python/examples/deploy/custom_inference_image/template.yaml b/python/examples/deploy/custom_inference_image/template.yaml new file mode 100644 index 0000000000..10e6580bcf --- /dev/null +++ b/python/examples/deploy/custom_inference_image/template.yaml @@ -0,0 +1,22 @@ +# Required +workspace: "./" # We will package all the files in the workspace directory +enable_serverless_container: true # Indicate whether to use serverless container +inference_image_name: "" # Container image name +container_run_command: "" # str or list, similar to CMD in the dockerfile +port: 80 # Service port, currently you can only indicate one arbitrary port + +# Optional, these are the default values +readiness_probe: # Probe for checking whether a container is ready for inference + httpGet: + path: "" +environment_variables: {} # Environment variables inside the container +volumes: # Volumes to mount to the container + - workspace_path: "" # Path to the volume in the workspace + mount_path: "" # Path to mount the volume inside the container +deploy_timeout_sec: 900 # Maximum time waiting for deployment to finish (Does not include the time to pull the image) +request_input_example: {} # Example of input request, will be shown in the UI +registry_specs: # Registry information for pulling the image + registry_name: "" + registry_provider: "DockerHub" + registry_user_name: "" + registry_user_password: "" \ No newline at end of file diff --git a/python/fedml/computing/scheduler/comm_utils/job_monitor.py b/python/fedml/computing/scheduler/comm_utils/job_monitor.py index d216b46dad..667a54e565 100644 --- a/python/fedml/computing/scheduler/comm_utils/job_monitor.py +++ b/python/fedml/computing/scheduler/comm_utils/job_monitor.py @@ -772,7 +772,7 @@ def _lenient_check_replica_ready(deployment_result): liveliness_check = result_json.get("model_metadata", {}).get("liveliness_check", None) readiness_check = result_json.get("model_metadata", {}).get("readiness_check", None) - if liveliness_check is not None: + if liveliness_check: if liveliness_check == ClientConstants.LIVENESS_PROBE_DEFAULT: liveliness_check = readiness_check # Follow the readiness check pattern if not isinstance(liveliness_check, dict): diff --git a/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py
b/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py index 5b2658f0b3..28d50d5a50 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py @@ -25,6 +25,8 @@ async def is_inference_ready(inference_url, path="ready", timeout=None): url_parsed = urlparse(inference_url) ready_url = f"http://{url_parsed.hostname}:{url_parsed.port}/{path}" response_ok = False + + # TODO (Raphael): Support more methods and return codes rules. try: async with httpx.AsyncClient() as client: ready_response = await client.get(url=ready_url, timeout=timeout) @@ -109,11 +111,10 @@ async def redirect_non_stream_req_to_worker(inference_type, inference_url, model if response.status_code == 200: try: - if inference_type == "default": - model_inference_result = response.json() - elif inference_type == "image/png": + if inference_type == "image/png": + # wrapped media type for image binary_content: bytes = response.content - model_inference_result = Response(content=binary_content, media_type="image/png") + model_inference_result = Response(content=binary_content, media_type=inference_type) else: model_inference_result = response.json() except Exception as e: diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index e18081c324..552d7ffaca 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -2,7 +2,6 @@ import logging import os -import shutil import time import traceback import yaml @@ -12,7 +11,6 @@ import requests import torch import torch.nn -import tritonclient.http as http_client import collections.abc diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index 7ef9689c1c..9adc17538d 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -170,7 +170,6 @@ async def predict_with_end_point_id(end_point_id, request: Request, response: Re return inference_response -# @api.post('/custom_inference/{end_point_id}/{path:path}') @router.api_route("/custom_inference/{end_point_id}/{path:path}", methods=["POST", "GET"]) async def custom_inference(end_point_id, path: str, request: Request): # Get json data diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py index b58b8fae72..f86056229e 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py @@ -358,7 +358,7 @@ def get_inference_master_gateway_port(): # Use dotenv to load the environment variables fedml.load_env() master_inference_port = int(os.getenv(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, - default=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)) + default=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)) return master_inference_port diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index f95dd8e176..ab6bc4c895 100755 --- 
a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -360,7 +360,7 @@ def process_deployment_result_message(self, topic=None, payload=None): """ When all the devices have finished the add / delete / update operation """ - model_inference_url, inference_port_external = self.construct_final_gateway_url(end_point_id) + model_inference_url = self.construct_final_gateway_url(end_point_id) # Send stage: MODEL_DEPLOYMENT_STAGE5 = "StartInferenceIngress" self.send_deployment_stages(end_point_id, model_name, model_id, @@ -375,7 +375,7 @@ def process_deployment_result_message(self, topic=None, payload=None): payload_json = self.deployed_replica_payload model_slave_url = payload_json["model_url"] payload_json["model_url"] = model_inference_url - payload_json["port"] = inference_port_external + payload_json["port"] = ServerConstants.get_inference_master_gateway_port() token = FedMLModelCache.get_instance(self.redis_addr, self.redis_port).get_end_point_token( end_point_id, end_point_name, model_name) @@ -767,10 +767,6 @@ def construct_final_gateway_url(self, end_point_id): if enable_custom_path: identifier = "custom_inference" - if ip.startswith("http://") or ip.startswith("https://"): - model_inference_url = "{}/{}/{}".format(ip, identifier, end_point_id) - else: - model_inference_url = "http://{}:{}/{}/{}".format(ip, inference_port_external, identifier, - end_point_id) - return model_inference_url, inference_port_external + model_inference_url = "http://{}:{}/{}/{}".format(ip, inference_port_external, identifier, end_point_id) + return model_inference_url From 33fb5b45fc674d18d74e7f435d41e69ebfde703d Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Fri, 21 Jun 2024 14:21:16 -0700 Subject: [PATCH 29/38] [Deploy] Pass down the api key to container. 
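Once the worker injects the key, a custom serving container can read it back from its environment. A minimal, illustrative sketch (the variable name matches the constant added below; how the key is decrypted and used is left to the user image):

    import os

    # The worker sets FEDML_USER_ENCRYPTED_API_KEY when it creates the container.
    encrypted_key = os.environ.get("FEDML_USER_ENCRYPTED_API_KEY", "")
    if not encrypted_key:
        print("No encrypted API key was passed down to this container.")
    else:
        # Decryption and usage are up to the custom image; here we only confirm receipt.
        print(f"Received an encrypted API key of length {len(encrypted_key)}.")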
--- .../device_client_constants.py | 2 + .../model_scheduler/device_model_cache.py | 15 +++++- .../device_model_deployment.py | 47 ++++++++++++------- .../device_server_constants.py | 2 + .../master_protocol_manager.py | 25 +++++----- .../model_scheduler/worker_job_runner.py | 4 +- 6 files changed, 62 insertions(+), 33 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index e18c9f730b..4aee592fca 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -165,6 +165,8 @@ class ClientConstants(object): CUSTOMIZED_VOLUMES_PATH_FROM_WORKSPACE_KEY = "workspace_path" CUSTOMIZED_VOLUMES_PATH_FROM_CONTAINER_KEY = "mount_path" + ENV_USER_ENCRYPTED_API_KEY = "FEDML_USER_ENCRYPTED_API_KEY" + @staticmethod def get_fedml_home_dir(): home_dir = expanduser("~") diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py index c941c42102..b0021aa7df 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py @@ -112,7 +112,8 @@ def set_user_setting_replica_num(self, end_point_id, replica_num: int, enable_auto_scaling: bool = False, scale_min: int = 0, scale_max: int = 0, state: str = "UNKNOWN", target_queries_per_replica: int = 60, aggregation_window_size_seconds: int = 60, - scale_down_delay_seconds: int = 120, timeout_s: int = 30 + scale_down_delay_seconds: int = 120, timeout_s: int = 30, + user_encrypted_api_key: str = "" ) -> bool: """ Key: FEDML_MODEL_ENDPOINT_REPLICA_USER_SETTING_TAG-- @@ -139,7 +140,8 @@ def set_user_setting_replica_num(self, end_point_id, "target_queries_per_replica": target_queries_per_replica, "aggregation_window_size_seconds": aggregation_window_size_seconds, "scale_down_delay_seconds": scale_down_delay_seconds, - ServerConstants.INFERENCE_REQUEST_TIMEOUT_KEY: timeout_s + ServerConstants.INFERENCE_REQUEST_TIMEOUT_KEY: timeout_s, + ServerConstants.USER_ENCRYPTED_API_KEY: user_encrypted_api_key } try: self.redis_connection.set(self.get_user_setting_replica_num_key(end_point_id), json.dumps(replica_num_dict)) @@ -169,6 +171,15 @@ def update_user_setting_replica_num(self, end_point_id: str, state: str = "UNKNO return False return True + def get_user_encrypted_api_key(self, end_point_id: str) -> str: + try: + replica_num_dict = self.redis_connection.get(self.get_user_setting_replica_num_key(end_point_id)) + replica_num_dict = json.loads(replica_num_dict) + return replica_num_dict.get(ServerConstants.USER_ENCRYPTED_API_KEY, "") + except Exception as e: + logging.error(e) + return "" + def get_all_endpoints_user_setting(self) -> List[dict]: """ Return a list of dict, each dict is the user setting of an endpoint. 
diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 552d7ffaca..9416d243d2 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -19,6 +19,7 @@ from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants +from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache from ..scheduler_core.compute_utils import ComputeUtils from ..comm_utils.container_utils import ContainerUtils @@ -59,7 +60,9 @@ def request_gpu_ids_on_deployment(edge_id, end_point_id, num_gpus=None, master_d def start_deployment(end_point_id, end_point_name, model_id, model_version, model_storage_local_path, inference_model_name, inference_engine, infer_host, master_ip, edge_id, master_device_id=None, replica_rank=0, - gpu_per_replica=1): + gpu_per_replica=1, request_json=None): + if request_json is None: + request_json = dict() logging.info("[Worker] Model deployment is starting...") # Real gpu per replica (container-level) @@ -219,22 +222,9 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, if device_mapping: host_config_dict.update(device_mapping) - # Environment variables - enable_custom_image = False if relative_entry_fedml_format != "" else True - if not enable_custom_image: - # For some image, the default user is root. Unified to fedml. - environment["HOME"] = "/home/fedml" - environment["BOOTSTRAP_DIR"] = dst_bootstrap_dir - environment["FEDML_CURRENT_RUN_ID"] = end_point_id - environment["FEDML_CURRENT_EDGE_ID"] = edge_id - environment["FEDML_REPLICA_RANK"] = replica_rank - environment["FEDML_CURRENT_VERSION"] = fedml.get_env_version() - environment["FEDML_ENV_VERSION"] = fedml.get_env_version() - environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST"] = fedml.get_local_on_premise_platform_host() - environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_PORT"] = fedml.get_local_on_premise_platform_port() - if extra_envs is not None: - for key in extra_envs: - environment[key] = extra_envs[key] + # Handle the environment variables + handle_env_vars(environment, relative_entry_fedml_format, extra_envs, dst_bootstrap_dir, + end_point_id, edge_id, replica_rank, request_json) # Create the container try: @@ -612,6 +602,29 @@ def handle_volume_mount(volumes, binds, environment, relative_entry_fedml_format logging.warning(f"{workspace_path} does not exist, skip mounting it to the container") +def handle_env_vars(environment, relative_entry_fedml_format, extra_envs, dst_bootstrap_dir, end_point_id, edge_id, + replica_rank, request_json): + enable_custom_image = False if relative_entry_fedml_format != "" else True + if not enable_custom_image: + # For some image, the default user is root. Unified to fedml. 
+ environment["HOME"] = "/home/fedml" + + if request_json and ServerConstants.USER_ENCRYPTED_API_KEY in request_json: + environment[ClientConstants.ENV_USER_ENCRYPTED_API_KEY] = request_json[ServerConstants.USER_ENCRYPTED_API_KEY] + + environment["BOOTSTRAP_DIR"] = dst_bootstrap_dir + environment["FEDML_CURRENT_RUN_ID"] = end_point_id + environment["FEDML_CURRENT_EDGE_ID"] = edge_id + environment["FEDML_REPLICA_RANK"] = replica_rank + environment["FEDML_CURRENT_VERSION"] = fedml.get_env_version() + environment["FEDML_ENV_VERSION"] = fedml.get_env_version() + environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST"] = fedml.get_local_on_premise_platform_host() + environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_PORT"] = fedml.get_local_on_premise_platform_port() + if extra_envs is not None: + for key in extra_envs: + environment[key] = extra_envs[key] + + def check_container_readiness(inference_http_port, infer_host="127.0.0.1", request_input_example=None, readiness_check=ClientConstants.READINESS_PROBE_DEFAULT): response_from_client_container = is_client_inference_container_ready( diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py index f86056229e..c41b150bc2 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py @@ -108,6 +108,8 @@ class ServerConstants(object): INFERENCE_REQUEST_TIMEOUT_KEY = "request_timeout_sec" INFERENCE_REQUEST_TIMEOUT_DEFAULT = 30 + + USER_ENCRYPTED_API_KEY = "user_encrypted_api_key" # -----End----- MODEL_DEPLOYMENT_STAGE1 = {"index": 1, "text": "ReceivedRequest"} diff --git a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py index 7bfad2f3eb..5e16d5a02a 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py @@ -158,25 +158,20 @@ def callback_start_deployment(self, topic, payload): run_id = request_json["end_point_id"] end_point_name = request_json["end_point_name"] token = request_json["token"] - user_id = request_json["user_id"] - user_name = request_json["user_name"] - device_ids = request_json["device_ids"] device_objs = request_json["device_objs"] + enable_auto_scaling = request_json.get("enable_auto_scaling", False) + desired_replica_num = request_json.get("desired_replica_num", 1) + target_queries_per_replica = request_json.get("target_queries_per_replica", 10) + aggregation_window_size_seconds = request_json.get("aggregation_window_size_seconds", 60) + scale_down_delay_seconds = request_json.get("scale_down_delay_seconds", 120) + user_encrypted_api_key = request_json.get("encrypted_api_key", "") model_config = request_json["model_config"] model_name = model_config["model_name"] model_version = model_config["model_version"] model_id = model_config["model_id"] - model_storage_url = model_config["model_storage_url"] scale_min = model_config.get("instance_scale_min", 0) scale_max = model_config.get("instance_scale_max", 0) - inference_engine = model_config.get("inference_engine", 0) - enable_auto_scaling = request_json.get("enable_auto_scaling", False) - desired_replica_num = request_json.get("desired_replica_num", 1) - - target_queries_per_replica = request_json.get("target_queries_per_replica", 10) - 
aggregation_window_size_seconds = request_json.get("aggregation_window_size_seconds", 60) - scale_down_delay_seconds = request_json.get("scale_down_delay_seconds", 120) model_config_parameters = request_json.get("parameters", {}) timeout_s = model_config_parameters.get("request_timeout_sec", 30) @@ -193,6 +188,12 @@ def callback_start_deployment(self, topic, payload): request_json["end_point_id"]) request_json["is_fresh_endpoint"] = True if endpoint_device_info is None else False + if user_encrypted_api_key == "": + user_encrypted_api_key = (FedMLModelCache.get_instance(self.redis_addr, self.redis_port). + get_user_encrypted_api_key(run_id)) + if user_encrypted_api_key != "": # Pass the cached key to the workers + request_json[ServerConstants.USER_ENCRYPTED_API_KEY] = user_encrypted_api_key + # Save the user setting (about replica number) of this run to Redis, if existed, update it FedMLModelCache.get_instance(self.redis_addr, self.redis_port).set_user_setting_replica_num( end_point_id=run_id, end_point_name=end_point_name, model_name=model_name, model_version=model_version, @@ -201,7 +202,7 @@ def callback_start_deployment(self, topic, payload): aggregation_window_size_seconds=aggregation_window_size_seconds, target_queries_per_replica=target_queries_per_replica, scale_down_delay_seconds=int(scale_down_delay_seconds), - timeout_s=timeout_s + timeout_s=timeout_s, user_encrypted_api_key=user_encrypted_api_key ) # Start log processor for current run diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py index a892412d29..113a20e825 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py @@ -250,7 +250,7 @@ def run_impl(self, run_extend_queue_list, sender_message_center, inference_model_name=model_name, inference_engine=inference_engine, infer_host=worker_ip, master_ip=master_ip, edge_id=self.edge_id, master_device_id=device_ids[0], replica_rank=rank, - gpu_per_replica=int(self.replica_handler.gpu_per_replica) + gpu_per_replica=int(self.replica_handler.gpu_per_replica), request_json=self.request_json ) except Exception as e: inference_output_url = "" @@ -373,7 +373,7 @@ def run_impl(self, run_extend_queue_list, sender_message_center, inference_model_name=model_name, inference_engine=inference_engine, infer_host=worker_ip, master_ip=master_ip, edge_id=self.edge_id, master_device_id=device_ids[0], replica_rank=rank, - gpu_per_replica=int(self.replica_handler.gpu_per_replica) + gpu_per_replica=int(self.replica_handler.gpu_per_replica), request_json=self.request_json ) except Exception as e: inference_output_url = "" From f412a2637b6ae83f9fc1ecaa60b5205d4d43507d Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Fri, 21 Jun 2024 21:36:12 +0000 Subject: [PATCH 30/38] [Deploy] Nit. 
--- .../scheduler/model_scheduler/device_server_constants.py | 2 +- .../scheduler/model_scheduler/master_protocol_manager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py index c41b150bc2..00f0fe73bf 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py @@ -109,7 +109,7 @@ class ServerConstants(object): INFERENCE_REQUEST_TIMEOUT_KEY = "request_timeout_sec" INFERENCE_REQUEST_TIMEOUT_DEFAULT = 30 - USER_ENCRYPTED_API_KEY = "user_encrypted_api_key" + USER_ENCRYPTED_API_KEY = "encrypted_api_key" # -----End----- MODEL_DEPLOYMENT_STAGE1 = {"index": 1, "text": "ReceivedRequest"} diff --git a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py index 5e16d5a02a..9e0d51b588 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py @@ -164,7 +164,7 @@ def callback_start_deployment(self, topic, payload): target_queries_per_replica = request_json.get("target_queries_per_replica", 10) aggregation_window_size_seconds = request_json.get("aggregation_window_size_seconds", 60) scale_down_delay_seconds = request_json.get("scale_down_delay_seconds", 120) - user_encrypted_api_key = request_json.get("encrypted_api_key", "") + user_encrypted_api_key = request_json.get(ServerConstants.USER_ENCRYPTED_API_KEY, "") model_config = request_json["model_config"] model_name = model_config["model_name"] From d6c9411774318e812e7f0b4dd73478f2a88e4cb3 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Fri, 21 Jun 2024 15:00:33 -0700 Subject: [PATCH 31/38] [Deploy] Remove example. --- .../lorax/custom_inference_image.yaml | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml diff --git a/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml b/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml deleted file mode 100644 index 41cbe501d2..0000000000 --- a/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml +++ /dev/null @@ -1,16 +0,0 @@ -workspace: "./" - -enable_serverless_container: true -inference_image_name: "ghcr.io/predibase/lorax:main" -container_run_command: "--model-id mistralai/Mistral-7B-Instruct-v0.1" - -environment_variables: - HUGGING_FACE_HUB_TOKEN: "" - -readiness_probe: - httpGet: - path: "/health" - -port: 80 - -deploy_timeout_sec: 1600 From fa44ccce0a553f7c7d7dcceb5312b830061e718f Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 25 Jun 2024 11:48:17 -0700 Subject: [PATCH 32/38] [Deploy] Return custom path other than /predict. 
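In short, when the packaging YAML declares a service httpPost path, the worker should report that path instead of the default root. A rough sketch of the URL handling, with assumed names and example values rather than the exact code in the diff below:

    def build_worker_inference_url(host: str, port: int, customized_uri: dict = None) -> str:
        # Fall back to the container root when no customized service path is configured.
        path = ""
        if customized_uri and "httpPost" in customized_uri:
            path = customized_uri["httpPost"].get("path", "")
            if path and not path.startswith("/"):
                path = "/" + path
        return f"http://{host}:{port}{path}"

    # e.g. an OpenAI-compatible image that serves /v1/chat/completions
    print(build_worker_inference_url("127.0.0.1", 2345, {"httpPost": {"path": "v1/chat/completions"}}))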
--- .../device_client_constants.py | 1 + .../device_model_deployment.py | 108 +++++++++++------- 2 files changed, 67 insertions(+), 42 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index 4aee592fca..4006e50726 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -164,6 +164,7 @@ class ClientConstants(object): CUSTOMIZED_VOLUMES_MOUNT_KEY = "volumes" CUSTOMIZED_VOLUMES_PATH_FROM_WORKSPACE_KEY = "workspace_path" CUSTOMIZED_VOLUMES_PATH_FROM_CONTAINER_KEY = "mount_path" + CUSTOMIZED_SERVICE_KEY = "service" ENV_USER_ENCRYPTED_API_KEY = "FEDML_USER_ENCRYPTED_API_KEY" diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 9416d243d2..25fc1e1d64 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -87,36 +87,10 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, inference_image_name, image_pull_policy, registry_name, registry_provider, \ registry_user_name, registry_user_password = parse_image_registry_related_config(config) - # Bootstrap, job and entrypoint related - dst_model_serving_dir = "/home/fedml/models_serving" - bootstrap_cmds_str_frm_yaml = config.get('bootstrap', "") - job_cmds_str_frm_yaml = config.get('job', "") - - if bootstrap_cmds_str_frm_yaml != "" or job_cmds_str_frm_yaml != "": - auto_gen_bootstrap_file_name = "fedml-deploy-bootstrap-entry-auto-gen.sh" - src_bootstrap_file_path = os.path.join(model_storage_local_path, auto_gen_bootstrap_file_name) - with open(src_bootstrap_file_path, 'w') as f: - f.write("cd /home/fedml/models_serving/\n") - f.write(bootstrap_cmds_str_frm_yaml) - f.write("\n") - f.write("cd /home/fedml/models_serving/\n") - f.write(job_cmds_str_frm_yaml) - else: - src_bootstrap_file_path = "" - - if src_bootstrap_file_path != "": - dst_bootstrap_dir = os.path.join(dst_model_serving_dir, auto_gen_bootstrap_file_name) - else: - dst_bootstrap_dir = "" - - # If the entry point is in fedml format (e.g., "main.py") - relative_entry_fedml_format = config.get('entry_point', "") - - # User indicate either fedml format python main entry filename or entry command - enable_serverless_container = config.get(ClientConstants.ENABLE_SERVERLESS_CONTAINER_KEY, False) - customized_image_entry_cmd = config.get('container_run_command', None) # Could be str or list - customized_readiness_check = config.get('readiness_probe', ClientConstants.READINESS_PROBE_DEFAULT) - customized_liveliness_check = config.get('liveness_probe', ClientConstants.LIVENESS_PROBE_DEFAULT) + # Service app related + dst_bootstrap_dir, dst_model_serving_dir, relative_entry_fedml_format, enable_serverless_container, \ + customized_image_entry_cmd, customized_readiness_check, customized_liveliness_check, customized_uri = \ + handle_container_service_app(config, model_storage_local_path) # Storage related src_code_dir = os.path.join(model_storage_local_path, config.get('source_code_dir', "")) @@ -451,7 +425,7 @@ def parse_image_registry_related_config(config): def is_client_inference_container_ready(infer_url_host, inference_http_port, readiness_check=ClientConstants.READINESS_PROBE_DEFAULT, - 
request_input_example=None, container_id=None): + request_input_example=None, container_id=None, customized_uri=None): # Construct the model metadata (input and output) model_metadata = {} if request_input_example is not None and len(request_input_example) > 0: @@ -461,6 +435,7 @@ def is_client_inference_container_ready(infer_url_host, inference_http_port, model_metadata["outputs"] = [] model_metadata["type"] = "default" + # Check the readiness of the container if readiness_check == ClientConstants.READINESS_PROBE_DEFAULT: default_client_container_ready_url = "http://{}:{}/ready".format("0.0.0.0", inference_http_port) response = None @@ -486,27 +461,38 @@ def is_client_inference_container_ready(infer_url_host, inference_http_port, else: if not check_path.startswith("/"): check_path = "/" + check_path - readiness_check_url = f"http://{infer_url_host}:{inference_http_port}{check_path}" - response = None try: - response = requests.get(readiness_check_url) + response = requests.get(f"http://{infer_url_host}:{inference_http_port}{check_path}") except: pass if not response or response.status_code != 200: return "", "", {}, {} - - return readiness_check_url, None, model_metadata, None else: logging.error("'path' is not specified in httpGet readiness check") return "", "", {}, {} elif "exec" in readiness_check: - # TODO(raphael): Support arbitrary readiness check command by using - # container id and docker exec - return "http://{}:{}/".format(infer_url_host, inference_http_port), None, model_metadata, None + # TODO(raphael): Support arbitrary readiness check command by using container id and docker exec + pass else: # Ref K8S, if no readiness check, we assume the container is ready immediately - return "http://{}:{}/".format(infer_url_host, inference_http_port), None, model_metadata, None + pass + + # Construct the customized URI + path = "" + if customized_uri is not None: + if "httpPost" in customized_uri and "path" in customized_uri["httpPost"]: + path = customized_uri["httpPost"]["path"] + if not isinstance(path, str): + logging.error(f"Invalid path type: {path}, expected str") + return "", "", {}, {} + else: + if not path.startswith("/"): + path = "/" + path + # TODO(raphael): Finalized more customized URI types + readiness_check_url = f"http://{infer_url_host}:{inference_http_port}{path}" + + return readiness_check_url, None, model_metadata, None def _handle_union_volume_mount(binds, volumes, environment, data_cache_dir_input=None): @@ -602,6 +588,43 @@ def handle_volume_mount(volumes, binds, environment, relative_entry_fedml_format logging.warning(f"{workspace_path} does not exist, skip mounting it to the container") +def handle_container_service_app(config, model_storage_local_path): + # Bootstrap, job and entrypoint related + dst_model_serving_dir = "/home/fedml/models_serving" + bootstrap_cmds_str_frm_yaml = config.get('bootstrap', "") + job_cmds_str_frm_yaml = config.get('job', "") + + auto_gen_bootstrap_file_name = "fedml-deploy-bootstrap-entry-auto-gen.sh" + if bootstrap_cmds_str_frm_yaml != "" or job_cmds_str_frm_yaml != "": + src_bootstrap_file_path = os.path.join(model_storage_local_path, auto_gen_bootstrap_file_name) + with open(src_bootstrap_file_path, 'w') as f: + f.write("cd /home/fedml/models_serving/\n") + f.write(bootstrap_cmds_str_frm_yaml) + f.write("\n") + f.write("cd /home/fedml/models_serving/\n") + f.write(job_cmds_str_frm_yaml) + else: + src_bootstrap_file_path = "" + + if src_bootstrap_file_path != "": + dst_bootstrap_dir = os.path.join(dst_model_serving_dir, 
auto_gen_bootstrap_file_name) + else: + dst_bootstrap_dir = "" + + # If the entry point is in fedml format (e.g., "main.py") + relative_entry_fedml_format = config.get('entry_point', "") + + # User indicate either fedml format python main entry filename or entry command + enable_serverless_container = config.get(ClientConstants.ENABLE_SERVERLESS_CONTAINER_KEY, False) + customized_image_entry_cmd = config.get('container_run_command', None) # Could be str or list + customized_readiness_check = config.get('readiness_probe', ClientConstants.READINESS_PROBE_DEFAULT) + customized_liveliness_check = config.get('liveness_probe', ClientConstants.LIVENESS_PROBE_DEFAULT) + customized_uri = config.get(ClientConstants.CUSTOMIZED_SERVICE_KEY, "") + + return (dst_bootstrap_dir, dst_model_serving_dir, relative_entry_fedml_format, enable_serverless_container, + customized_image_entry_cmd, customized_readiness_check, customized_liveliness_check, customized_uri) + + def handle_env_vars(environment, relative_entry_fedml_format, extra_envs, dst_bootstrap_dir, end_point_id, edge_id, replica_rank, request_json): enable_custom_image = False if relative_entry_fedml_format != "" else True @@ -626,10 +649,11 @@ def handle_env_vars(environment, relative_entry_fedml_format, extra_envs, dst_bo def check_container_readiness(inference_http_port, infer_host="127.0.0.1", request_input_example=None, - readiness_check=ClientConstants.READINESS_PROBE_DEFAULT): + readiness_check=ClientConstants.READINESS_PROBE_DEFAULT, + customized_uri=None): response_from_client_container = is_client_inference_container_ready( infer_host, inference_http_port, readiness_check=readiness_check, - request_input_example=request_input_example) + request_input_example=request_input_example, customized_uri=customized_uri) return response_from_client_container From bd89be1a1f01f0ff1528cd1766c2a22a25af5975 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 25 Jun 2024 11:50:04 -0700 Subject: [PATCH 33/38] [Deploy] Add sqlite backup for get_all_deployment_result_list. 
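The backup added below follows a cache-aside pattern: read the replica results from Redis first, and only when the list is empty reload them from the SQLite store and re-warm Redis (the real change also re-serializes each row's replica_info field). A simplified sketch with assumed handles (redis_conn, backup_db, key) instead of the real cache objects:

    def get_all_results(redis_conn, backup_db, key):
        # Prefer the in-memory copy.
        results = redis_conn.lrange(key, 0, -1)
        if results:
            return results
        # Redis lost the entries (restart, eviction, ...), so fall back to the persistent copy.
        results = backup_db.get_all_deployment_results_list()
        for item in results:
            redis_conn.rpush(key, item)  # re-warm the cache for subsequent readers
        return results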
--- .../scheduler/comm_utils/constants.py | 1 - .../scheduler/comm_utils/job_monitor.py | 2 +- .../model_scheduler/device_model_cache.py | 30 +++++++++-- .../model_scheduler/device_model_db.py | 51 +++++++++++++++++-- .../model_scheduler/worker_job_runner.py | 2 +- 5 files changed, 75 insertions(+), 11 deletions(-) diff --git a/python/fedml/computing/scheduler/comm_utils/constants.py b/python/fedml/computing/scheduler/comm_utils/constants.py index 67b9d8b14b..6e46ce207b 100644 --- a/python/fedml/computing/scheduler/comm_utils/constants.py +++ b/python/fedml/computing/scheduler/comm_utils/constants.py @@ -114,7 +114,6 @@ class SchedulerConstants: REDIS_PORT = "6379" REDIS_PASSWORD = "fedml_default" - @staticmethod def get_log_source(run_json): run_config = run_json.get("run_config", {}) diff --git a/python/fedml/computing/scheduler/comm_utils/job_monitor.py b/python/fedml/computing/scheduler/comm_utils/job_monitor.py index 667a54e565..b8237d93ba 100644 --- a/python/fedml/computing/scheduler/comm_utils/job_monitor.py +++ b/python/fedml/computing/scheduler/comm_utils/job_monitor.py @@ -210,6 +210,7 @@ def monitor_replicas_number(): endpoint_replicas_details = {} if isinstance(endpoint_detail, str): endpoint_replicas_details = json.loads(endpoint_detail) + # TODO: Check out this nested json if isinstance(endpoint_replicas_details, str): endpoint_replicas_details = json.loads(endpoint_replicas_details) @@ -222,7 +223,6 @@ def monitor_replicas_number(): endpoint_replica_details["end_point_id"], 0) + 1 for endpoint_id, num_replica in res_to_mlops.items(): - curr_version = fedml.get_env_version() num_replica_url_path = "fedmlModelServer/api/v1/endpoint/replica-info" mlops_prefix = fedml._get_backend_service() url = f"{mlops_prefix}/{num_replica_url_path}" diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py index b0021aa7df..1836971075 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py @@ -302,7 +302,27 @@ def get_all_deployment_result_list(self): result_list.extend(self.redis_connection.lrange(key, 0, -1)) except Exception as e: logging.error(e) - # TODO(Raphael): Use Sqlite for the replica backup + + # Get cached results from the persist sqlite database + if len(result_list) <= 0: + db_result_list = list() + try: + db_result_list = self.model_deployment_db.get_all_deployment_results_list() + except Exception as e: + logging.error(f"Failed to get all deployment results from the database due to {e}") + pass + + for result in db_result_list: + try: + self.redis_connection.rpush(self.get_deployment_result_key( + result["end_point_id"], result["end_point_name"], result["model_name"]), + json.dumps(result["replica_info"])) + except Exception as e: + logging.error(e) + pass + + for result in db_result_list: + result_list.append(result["replica_info"]) return result_list @@ -330,7 +350,8 @@ def get_deployment_status_list_size(self, end_point_id, end_point_name, model_na status_list = self.get_deployment_status_list(end_point_id, end_point_name, model_name) return len(status_list) - def get_status_item_info(self, status_item): + @staticmethod + def get_status_item_info(status_item): status_item_json = json.loads(status_item) if isinstance(status_item_json, str): status_item_json = json.loads(status_item_json) @@ -341,7 +362,8 @@ def get_status_item_info(self, status_item): status_payload = 
status_item_json["status"] return device_id, status_payload - def get_result_item_info(self, result_item): + @staticmethod + def get_result_item_info(result_item): result_item_json = json.loads(result_item) if isinstance(result_item_json, str): result_item_json = json.loads(result_item_json) @@ -386,7 +408,7 @@ def get_idle_device(self, return None, None # # Randomly shuffle - # shuffle the list of deployed devices and get the first one as the target idle device. + # the list of deployed devices and get the first one as the target idle device. # if len(idle_device_list) <= 0: # return None, None # shuffle(idle_device_list) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_db.py b/python/fedml/computing/scheduler/model_scheduler/device_model_db.py index 09573a1d1b..606d8c010b 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_db.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_db.py @@ -10,6 +10,7 @@ from sqlalchemy.ext.declarative import declarative_base from fedml.core.common.singleton import Singleton from sqlalchemy.sql import text +from typing import List, Dict Base = declarative_base() @@ -42,9 +43,11 @@ def set_deployment_status(self, end_point_id, end_point_name, model_name, model_ self.set_deployment_results_info(end_point_id, end_point_name, model_name, model_version, device_id, deployment_status=deployment_status, replica_no=replica_no) - def get_deployment_result_list(self, end_point_id, end_point_name, model_name, model_version=None): + def get_deployment_result_list(self, end_point_id, end_point_name, model_name, model_version=None) -> List[str]: """ - query from sqlite db using e_id + Get the orm use get_deployment_results_info, + but (1) nested results with cache_device_id, cache_replica_no. + (2) return a list of json string, so that redis can store it. """ result_list = self.get_deployment_results_info(end_point_id, end_point_name, model_name, model_version) ret_result_list = list() @@ -55,6 +58,39 @@ def get_deployment_result_list(self, end_point_id, end_point_name, model_name, m ret_result_list.append(json.dumps(result_dict)) return ret_result_list + def get_all_deployment_results_list(self) -> List[Dict]: + """ + Similar to _get_all_deployment_results_info, + but return a list of json string, so that redis can store it. + + return a list of dict, for each item: + [ + { + "end_point_id": "", + "end_point_name": "", + "model_name":"", + "replica_res": "" # Json string + }, + ] + value in the dict is a string that contains the deployment result. 
+ """ + flat_ep_list = self._get_all_deployment_results_info() + ret_result_list = list() + for result in flat_ep_list: + result_dict = { + "end_point_id": result.end_point_id, + "end_point_name": result.end_point_name, + "model_name": result.model_name, + "replica_info": json.dumps( + { + "cache_device_id": result.device_id, + "cache_replica_no": int(result.replica_no), + "result": result.deployment_result + } + ) + } + ret_result_list.append(result_dict) + return ret_result_list def get_deployment_status_list(self, end_point_id, end_point_name, model_name, model_version=None): result_list = self.get_deployment_results_info(end_point_id, end_point_name, model_name, model_version) @@ -156,7 +192,8 @@ def delete_deployment_run_info(self, end_point_id): end_point_id=f'{end_point_id}').delete() self.db_connection.commit() - def get_result_item_info(self, result_item): + @staticmethod + def get_result_item_info(result_item): result_item_json = json.loads(result_item) if isinstance(result_item_json, dict): result_item_json = json.loads(result_item) @@ -169,7 +206,8 @@ def get_result_item_info(self, result_item): result_payload = result_item_json["result"] return device_id, replica_no, result_payload - def get_status_item_info(self, status_item): + @staticmethod + def get_status_item_info(status_item): status_item_json = json.loads(status_item) if isinstance(status_item_json, dict): status_item_json = json.loads(status_item) @@ -320,6 +358,11 @@ def get_deployment_results_info(self, end_point_id, end_point_name, model_name, FedMLDeploymentResultInfoModel.model_version == f'{model_version}')).all() return result_info + def _get_all_deployment_results_info(self): + self.open_job_db() + result_info = self.db_connection.query(FedMLDeploymentResultInfoModel).all() + return result_info + def set_deployment_results_info(self, end_point_id, end_point_name, model_name, model_version, device_id, deployment_result=None, deployment_status=None, replica_no=None): diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py index 113a20e825..c73630fb65 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py @@ -260,7 +260,7 @@ def run_impl(self, run_extend_queue_list, sender_message_center, logging.error("[Worker] Failed to deploy the model.") # Send failed result back to master - result_payload = self.send_deployment_results( + _ = self.send_deployment_results( end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED, model_id, model_name, inference_output_url, inference_model_version, inference_port, inference_engine, model_metadata, model_config) From 43f99cf0acf9df685272fc02a5890981ac3d0ee2 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 25 Jun 2024 11:55:14 -0700 Subject: [PATCH 34/38] [Deploy] Nit. 
--- .../scheduler/model_scheduler/device_model_deployment.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 25fc1e1d64..a47f9dbc20 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -245,7 +245,8 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, inference_output_url, running_model_version, ret_model_metadata, ret_model_config = \ check_container_readiness(inference_http_port=inference_http_port, infer_host=infer_host, readiness_check=customized_readiness_check, - request_input_example=request_input_example) + request_input_example=request_input_example, + customized_uri=customized_uri) if inference_output_url == "": return running_model_name, "", None, None, None From 766c52aaf7a5b1dd567e2b91730780a05c594d36 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 25 Jun 2024 11:59:14 -0700 Subject: [PATCH 35/38] [Deploy] Nit. --- .../trt-llm-openai/config.yaml | 22 +++++++++++++++++++ .../device_model_deployment.py | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 python/examples/deploy/custom_inference_image/trt-llm-openai/config.yaml diff --git a/python/examples/deploy/custom_inference_image/trt-llm-openai/config.yaml b/python/examples/deploy/custom_inference_image/trt-llm-openai/config.yaml new file mode 100644 index 0000000000..1bdcf32f75 --- /dev/null +++ b/python/examples/deploy/custom_inference_image/trt-llm-openai/config.yaml @@ -0,0 +1,22 @@ +workspace: "./" + +inference_image_name: "fedml/trt-llm-openai" + +# The image has its self-contained cmd, no need for rewriting the command +container_run_command: null + +port: 3000 + +readiness_probe: + httpGet: + path: "/health_check" + +# If you do not use serverless container mode, and you want to indicate another resource path, +# e.g. localhost:3000/v1/chat/completions, you can set the following uri: +service: + httpPost: + path: "/v1/chat/completions" + +deploy_timeout_sec: 1600 + +endpoint_api_type: "text2text_llm_openai_chat_completions" \ No newline at end of file diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index a47f9dbc20..665bb4082e 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -233,7 +233,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, raise Exception("Failed to get the port allocation") time.sleep(3) - # Logging the info from the container when starting + # Logging the info from the container when initializing log_deployment_output(end_point_id, model_id, default_server_container_name, ClientConstants.CMD_TYPE_RUN_DEFAULT_SERVER, inference_model_name, inference_engine, inference_http_port, inference_type, From 0c29c4990d9f8f06940d3f3a658f9ffd1f0ddc86 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 25 Jun 2024 17:22:02 -0700 Subject: [PATCH 36/38] [Deploy] Hot fix hash exist. 
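Sketch of the guard introduced here (and tightened again in the last patch of this series): an empty or missing endpoint id must short-circuit to zero before any Redis hash lookup. The connection handle and hash name below are placeholders:

    def pending_requests_counter(redis_conn, counter_hash, end_point_id):
        # Never hand an empty or None field name to Redis; treat it as "no pending requests".
        if not end_point_id:
            return 0
        if redis_conn.hexists(counter_hash, end_point_id):
            return int(redis_conn.hget(counter_hash, end_point_id))
        return 0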
--- .../computing/scheduler/model_scheduler/device_model_cache.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py index b0021aa7df..0d92466169 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py @@ -990,6 +990,8 @@ def delete_endpoint_scaling_down_decision_time(self, end_point_id) -> bool: end_point_id)) def get_pending_requests_counter(self, end_point_id) -> int: + if not end_point_id: + return 0 # If the endpoint does not exist inside the Hash collection, set its counter to 0. if self.redis_connection.hexists(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id): return int(self.redis_connection.hget(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id)) From 36378f876018163508f03592fca556afa3a9ec8f Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 25 Jun 2024 17:52:33 -0700 Subject: [PATCH 37/38] [Deploy] Indicate worker connection type through cli and api. --- python/fedml/api/__init__.py | 12 ++++++++---- python/fedml/api/modules/device.py | 8 +++++--- python/fedml/cli/modules/login.py | 12 ++++++++++-- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/python/fedml/api/__init__.py b/python/fedml/api/__init__.py index f753e4255b..b03c72b675 100755 --- a/python/fedml/api/__init__.py +++ b/python/fedml/api/__init__.py @@ -213,16 +213,20 @@ def fedml_build(platform, type, source_folder, entry_point, config_folder, dest_ def login(api_key, computing, server, supplier, master_inference_gateway_port: int = ServerConstants.MODEL_INFERENCE_DEFAULT_PORT, - worker_inference_proxy_port: int = ClientConstants.LOCAL_CLIENT_API_PORT): - device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port) + worker_inference_proxy_port: int = ClientConstants.LOCAL_CLIENT_API_PORT, + worker_connection_type: str = ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT): + device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port, + worker_connection_type) def logout(computing, server): device_unbind(computing, server) -def device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port): - device.bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port) +def device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port, + worker_connection_type): + device.bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port, + worker_connection_type) def device_unbind(computing, server): diff --git a/python/fedml/api/modules/device.py b/python/fedml/api/modules/device.py index 14591147a6..7c4e52c8b5 100644 --- a/python/fedml/api/modules/device.py +++ b/python/fedml/api/modules/device.py @@ -21,7 +21,8 @@ def bind( api_key, computing, server, supplier, master_inference_gateway_port=DeviceServerConstants.MODEL_INFERENCE_DEFAULT_PORT, - worker_inference_proxy_port=DeviceClientConstants.LOCAL_CLIENT_API_PORT + worker_inference_proxy_port=DeviceClientConstants.LOCAL_CLIENT_API_PORT, + worker_connection_type=DeviceClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT ): userid = api_key runner_cmd = "{}" @@ -47,13 +48,13 @@ def bind( _bind( userid, computing, server, api_key, role, runner_cmd, 
device_id, os_name, - docker, master_inference_gateway_port, worker_inference_proxy_port) + docker, master_inference_gateway_port, worker_inference_proxy_port, worker_connection_type) def _bind( userid, computing, server, api_key, role, runner_cmd, device_id, os_name, - docker, master_inference_gateway_port, worker_inference_proxy_port): + docker, master_inference_gateway_port, worker_inference_proxy_port, worker_connection_type): fedml.load_env() if os.getenv(ModuleConstants.ENV_FEDML_INFER_HOST) is None: fedml.set_env_kv(ModuleConstants.ENV_FEDML_INFER_HOST, SchedulerConstants.REDIS_INFER_HOST) @@ -66,6 +67,7 @@ def _bind( fedml.set_env_kv(DeviceServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, str(master_inference_gateway_port)) fedml.set_env_kv(DeviceClientConstants.ENV_CLIENT_PROXY_PORT_KEY, str(worker_inference_proxy_port)) + fedml.set_env_kv(DeviceClientConstants.ENV_CONNECTION_TYPE_KEY, worker_connection_type) url = fedml._get_backend_service() platform_name = platform.system() diff --git a/python/fedml/cli/modules/login.py b/python/fedml/cli/modules/login.py index f3c982f456..7ec4191a3e 100644 --- a/python/fedml/cli/modules/login.py +++ b/python/fedml/cli/modules/login.py @@ -67,10 +67,17 @@ default=ClientConstants.LOCAL_CLIENT_API_PORT, help="The port for worker inference proxy.", ) +@click.option( + "--worker_connection_type", + "-wct", + type=str, + default=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT, + help="The connection type for worker inference proxy.", +) def fedml_login( api_key, version, compute_node, server, provider, deploy_worker_num, local_on_premise_platform, local_on_premise_platform_port, - master_inference_gateway_port, worker_inference_proxy_port + master_inference_gateway_port, worker_inference_proxy_port, worker_connection_type ): fedml.set_env_version(version) fedml.set_local_on_premise_platform_host(local_on_premise_platform) @@ -84,4 +91,5 @@ def fedml_login( print(f"Maybe you are using account id to login, we will try to login with account {api_key}.") pass os.environ["FEDML_MODEL_WORKER_NUM"] = str(deploy_worker_num) - fedml.api.login(api_key, compute_node, server, provider, master_inference_gateway_port, worker_inference_proxy_port) + fedml.api.login(api_key, compute_node, server, provider, master_inference_gateway_port, + worker_inference_proxy_port, worker_connection_type) From 5097ff29bf48b7f6d8c097721d96e44f421a4192 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 25 Jun 2024 18:01:29 -0700 Subject: [PATCH 38/38] [Deploy] Nit. --- .../scheduler/model_scheduler/device_model_cache.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py index 0d92466169..7e79126fa6 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py @@ -989,11 +989,9 @@ def delete_endpoint_scaling_down_decision_time(self, end_point_id) -> bool: self.FEDML_MODEL_ENDPOINT_SCALING_DOWN_DECISION_TIME_TAG, end_point_id)) - def get_pending_requests_counter(self, end_point_id) -> int: - if not end_point_id: - return 0 + def get_pending_requests_counter(self, end_point_id=None) -> int: # If the endpoint does not exist inside the Hash collection, set its counter to 0. 
- if self.redis_connection.hexists(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id): + if end_point_id and self.redis_connection.hexists(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id): return int(self.redis_connection.hget(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id)) return 0
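Taken together with patch 37 above, the new worker connection type can be set either from the CLI (for example: fedml login <API_KEY> -wct http_proxy) or through the Python API. A usage sketch, assuming fedml.api is importable as in the CLI module and using a placeholder key; "http_proxy" is one of the accepted values (http, http_proxy, mqtt):

    import fedml.api

    # Bind this machine as a compute node and route worker inference traffic through the HTTP proxy.
    fedml.api.login(
        api_key="YOUR_API_KEY",  # placeholder
        computing=True,
        server=False,
        supplier=False,
        worker_connection_type="http_proxy",
    )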