From c4a87149e3af296310f0a7ca04cd467e0bc9b06f Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 11 Jun 2024 00:06:14 +0000 Subject: [PATCH 01/38] [Deploy] Report worker's connectivity when it finished. --- .../scheduler/comm_utils/network_util.py | 16 +++++ .../device_client_constants.py | 5 ++ .../model_scheduler/device_model_inference.py | 60 +++++++++++-------- .../model_scheduler/master_job_runner.py | 8 --- .../model_scheduler/worker_job_runner.py | 33 +++++++--- .../scheduler_core/general_constants.py | 16 ++--- 6 files changed, 87 insertions(+), 51 deletions(-) create mode 100644 python/fedml/computing/scheduler/comm_utils/network_util.py diff --git a/python/fedml/computing/scheduler/comm_utils/network_util.py b/python/fedml/computing/scheduler/comm_utils/network_util.py new file mode 100644 index 0000000000..13674840c5 --- /dev/null +++ b/python/fedml/computing/scheduler/comm_utils/network_util.py @@ -0,0 +1,16 @@ +import os +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants + + +def return_this_device_connectivity_type() -> str: + """ + Return -> "http" | "http_proxy" |"mqtt" + """ + if os.environ.get(ClientConstants.ENV_CONNECTION_TYPE_KEY) == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP: + return ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP + elif os.environ.get(ClientConstants.ENV_CONNECTION_TYPE_KEY) == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY: + return ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY + elif os.environ.get(ClientConstants.ENV_CONNECTION_TYPE_KEY) == ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT: + return ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT + else: + return ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index 7894f2c73e..d66c2f966a 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -97,6 +97,11 @@ class ClientConstants(object): INFERENCE_INFERENCE_SERVER_VERSION = "v2" INFERENCE_REQUEST_TIMEOUT = 30 + ENV_CONNECTION_TYPE_KEY = "FEDML_CONNECTION_TYPE" + WORKER_CONNECTIVITY_TYPE_HTTP = "http" + WORKER_CONNECTIVITY_TYPE_HTTP_PROXY = "http_proxy" + WORKER_CONNECTIVITY_TYPE_MQTT = "mqtt" + MSG_MODELOPS_DEPLOYMENT_STATUS_INITIALIZING = "INITIALIZING" MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYING = "DEPLOYING" MSG_MODELOPS_DEPLOYMENT_STATUS_INFERRING = "INFERRING" diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index d073533b72..a9205ceb9a 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -210,7 +210,8 @@ async def _predict( return inference_response # Found idle inference device - idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url = \ + idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url,\ + connectivity_type = \ found_idle_inference_device(in_end_point_id, in_end_point_name, in_model_name, in_model_version) if idle_device is None or idle_device == "": FEDML_MODEL_CACHE.update_pending_requests_counter(end_point_id, decrease=True) @@ -235,13 +236,16 @@ async def _predict( stream_flag = 
input_json.get("stream", False) input_list["stream"] = input_list.get("stream", stream_flag) output_list = input_json.get("outputs", []) + + # main execution of redirecting the inference request to the idle device inference_response = await send_inference_request( idle_device, end_point_id, inference_output_url, input_list, output_list, - inference_type=in_return_type) + inference_type=in_return_type, + connectivity_type=connectivity_type) # Calculate model metrics try: @@ -304,11 +308,12 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ inference_host = "" inference_output_url = "" model_version = "" + connectivity_type = "" + # Found idle device (TODO: optimize the algorithm to search best device for inference) payload, idle_device = FEDML_MODEL_CACHE. \ get_idle_device(end_point_id, end_point_name, in_model_name, in_model_version) if payload is not None: - logging.info("found idle deployment result {}".format(payload)) deployment_result = payload model_name = deployment_result["model_name"] model_version = deployment_result["model_version"] @@ -317,24 +322,25 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ inference_output_url = deployment_result["model_url"] url_parsed = urlparse(inference_output_url) inference_host = url_parsed.hostname + connectivity_type = deployment_result.get("connectivity_type", ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP) else: logging.info("not found idle deployment result") - return idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url + res = (idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url, + connectivity_type) + logging.info(f"found idle device with metrics: {res}") + + return res async def send_inference_request(idle_device, end_point_id, inference_url, input_list, output_list, - inference_type="default", has_public_ip=True): + inference_type="default", + connectivity_type=ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP): request_timeout_sec = FEDML_MODEL_CACHE.get_endpoint_settings(end_point_id) \ .get("request_timeout_sec", ClientConstants.INFERENCE_REQUEST_TIMEOUT) try: - http_infer_available = os.getenv("FEDML_INFERENCE_HTTP_AVAILABLE", True) - if not http_infer_available: - if http_infer_available == "False" or http_infer_available == "false": - http_infer_available = False - - if http_infer_available: + if connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP: response_ok = await FedMLHttpInference.is_inference_ready( inference_url, timeout=request_timeout_sec) @@ -347,22 +353,23 @@ async def send_inference_request(idle_device, end_point_id, inference_url, input timeout=request_timeout_sec) logging.info(f"Use http inference. return {response_ok}") return inference_response - - response_ok = await FedMLHttpProxyInference.is_inference_ready( - inference_url, - timeout=request_timeout_sec) - if response_ok: - response_ok, inference_response = await FedMLHttpProxyInference.run_http_proxy_inference_with_request( - end_point_id, + elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY: + logging.warning("Use http proxy inference.") + response_ok = await FedMLHttpProxyInference.is_inference_ready( inference_url, - input_list, - output_list, - inference_type=inference_type, timeout=request_timeout_sec) - logging.info(f"Use http proxy inference. 
return {response_ok}") - return inference_response - - if not has_public_ip: + if response_ok: + response_ok, inference_response = await FedMLHttpProxyInference.run_http_proxy_inference_with_request( + end_point_id, + inference_url, + input_list, + output_list, + inference_type=inference_type, + timeout=request_timeout_sec) + logging.info(f"Use http proxy inference. return {response_ok}") + return inference_response + elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT: + logging.warning("Use mqtt inference.") agent_config = {"mqtt_config": Settings.mqtt_config} mqtt_inference = FedMLMqttInference( agent_config=agent_config, @@ -385,7 +392,8 @@ async def send_inference_request(idle_device, end_point_id, inference_url, input logging.info(f"Use mqtt inference. return {response_ok}.") return inference_response - return {"error": True, "message": "Failed to use http, http-proxy for inference, no response from replica."} + else: + return {"error": True, "message": "Failed to use http, http-proxy for inference, no response from replica."} except Exception as e: inference_response = {"error": True, "message": f"Exception when using http, http-proxy and mqtt " diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index a10bd2c559..b9b9b4c356 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -250,14 +250,6 @@ def process_deployment_result_message(self, topic=None, payload=None): logging.info(f"Endpoint {end_point_id}; Device {device_id}; replica {replica_no}; " f"run_operation {run_operation} model status {model_status}.") - # OPTIONAL DEBUG PARAMS - # this_run_controller = self.model_runner_mapping[run_id_str].replica_controller - # logging.info(f"The current replica controller state is " - # f"Total version diff num {this_run_controller.total_replica_version_diff_num}") - # logging.info(f"self.request_json now {self.request_json}") # request_json will be deprecated - # this_run_request_json = self.request_json - # logging.info(f"self.request_json now {this_run_request_json}") - # Set redis + sqlite deployment result FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py index 3c357e9dab..9e178228b2 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py @@ -9,6 +9,8 @@ from abc import ABC import yaml from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils +from fedml.computing.scheduler.comm_utils.network_util import return_this_device_connectivity_type + from fedml.core.mlops import MLOpsRuntimeLog from fedml.computing.scheduler.comm_utils import file_utils from .device_client_constants import ClientConstants @@ -234,8 +236,11 @@ def run_impl(self, run_extend_queue_list, sender_message_center, running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \ "", "", model_version, {}, {} + # ip and connectivity + worker_ip = GeneralConstants.get_ip_address(self.request_json) + connectivity = return_this_device_connectivity_type() + if op == "add": - worker_ip = GeneralConstants.get_ip_address(self.request_json) for 
rank in range(prev_rank + 1, prev_rank + 1 + op_num): try: running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \ @@ -269,7 +274,9 @@ def run_impl(self, run_extend_queue_list, sender_message_center, result_payload = self.send_deployment_results( end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, model_id, model_name, inference_output_url, model_version, inference_port_external, - inference_engine, model_metadata, model_config, replica_no=rank + 1) + inference_engine, model_metadata, model_config, replica_no=rank + 1, + connectivity=connectivity + ) if inference_port_external != inference_port: # Save internal port to local db @@ -278,7 +285,9 @@ def run_impl(self, run_extend_queue_list, sender_message_center, result_payload = self.construct_deployment_results( end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, model_id, model_name, inference_output_url, model_version, inference_port, - inference_engine, model_metadata, model_config, replica_no=rank + 1) + inference_engine, model_metadata, model_config, replica_no=rank + 1, + connectivity=connectivity + ) FedMLModelDatabase.get_instance().set_deployment_result( run_id, end_point_name, model_name, model_version, self.edge_id, @@ -326,7 +335,6 @@ def run_impl(self, run_extend_queue_list, sender_message_center, return True elif op == "update" or op == "rollback": # Update is combine of delete and add - worker_ip = GeneralConstants.get_ip_address(self.request_json) for rank in replica_rank_to_update: # Delete a replica (container) if exists self.replica_handler.remove_replica(rank) @@ -402,7 +410,9 @@ def run_impl(self, run_extend_queue_list, sender_message_center, result_payload = self.send_deployment_results( end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, model_id, model_name, inference_output_url, model_version, inference_port_external, - inference_engine, model_metadata, model_config, replica_no=rank + 1) + inference_engine, model_metadata, model_config, replica_no=rank + 1, + connectivity=connectivity + ) if inference_port_external != inference_port: # Save internal port to local db logging.info("inference_port_external {} != inference_port {}".format( @@ -410,7 +420,9 @@ def run_impl(self, run_extend_queue_list, sender_message_center, result_payload = self.construct_deployment_results( end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, model_id, model_name, inference_output_url, model_version, inference_port, - inference_engine, model_metadata, model_config, replica_no=rank + 1) + inference_engine, model_metadata, model_config, replica_no=rank + 1, + connectivity=connectivity + ) FedMLModelDatabase.get_instance().set_deployment_result( run_id, end_point_name, model_name, model_version, self.edge_id, @@ -433,7 +445,8 @@ def run_impl(self, run_extend_queue_list, sender_message_center, def construct_deployment_results(self, end_point_name, device_id, model_status, model_id, model_name, model_inference_url, model_version, inference_port, inference_engine, - model_metadata, model_config, replica_no=1): + model_metadata, model_config, replica_no=1, + connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP): deployment_results_payload = {"end_point_id": self.run_id, "end_point_name": end_point_name, "model_id": model_id, "model_name": model_name, "model_url": model_inference_url, "model_version": model_version, @@ -444,6 +457,7 @@ def 
construct_deployment_results(self, end_point_name, device_id, model_status, "model_status": model_status, "inference_port": inference_port, "replica_no": replica_no, + "connectivity_type": connectivity, } return deployment_results_payload @@ -466,7 +480,8 @@ def construct_deployment_status(self, end_point_name, device_id, def send_deployment_results(self, end_point_name, device_id, model_status, model_id, model_name, model_inference_url, model_version, inference_port, inference_engine, - model_metadata, model_config, replica_no=1): + model_metadata, model_config, replica_no=1, + connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP): deployment_results_topic = "model_device/model_device/return_deployment_result/{}/{}".format( self.run_id, device_id) @@ -474,7 +489,7 @@ def send_deployment_results(self, end_point_name, device_id, model_status, end_point_name, device_id, model_status, model_id, model_name, model_inference_url, model_version, inference_port, inference_engine, - model_metadata, model_config, replica_no=replica_no) + model_metadata, model_config, replica_no=replica_no, connectivity=connectivity) logging.info("[client] send_deployment_results: topic {}, payload {}.".format(deployment_results_topic, deployment_results_payload)) diff --git a/python/fedml/computing/scheduler/scheduler_core/general_constants.py b/python/fedml/computing/scheduler/scheduler_core/general_constants.py index 68c1a8e09d..8c60b17bdf 100755 --- a/python/fedml/computing/scheduler/scheduler_core/general_constants.py +++ b/python/fedml/computing/scheduler/scheduler_core/general_constants.py @@ -192,14 +192,14 @@ def get_public_ip(): @staticmethod def get_ip_address(request_json, infer_host=None): # OPTION 1: Use local ip - ip = GeneralConstants.get_local_ip() - - # OPTION 2: Auto detect public ip - if "parameters" in request_json and \ - GeneralConstants.CONFIG_KEY_AUTO_DETECT_PUBLIC_IP in request_json["parameters"] and \ - request_json["parameters"][GeneralConstants.CONFIG_KEY_AUTO_DETECT_PUBLIC_IP]: - ip = GeneralConstants.get_public_ip() - logging.info("Auto detect public ip for master: " + ip) + # ip = GeneralConstants.get_local_ip() + # + # # OPTION 2: Auto detect public ip + # if "parameters" in request_json and \ + # GeneralConstants.CONFIG_KEY_AUTO_DETECT_PUBLIC_IP in request_json["parameters"] and \ + # request_json["parameters"][GeneralConstants.CONFIG_KEY_AUTO_DETECT_PUBLIC_IP]: + ip = GeneralConstants.get_public_ip() + logging.info("Auto detect public ip for master: " + ip) # OPTION 3: Use user indicated ip if infer_host is not None and infer_host != "127.0.0.1" and infer_host != "localhost": From 4a9622c439f4368a4111490aef8722145825c659 Mon Sep 17 00:00:00 2001 From: fedml-dimitris Date: Tue, 11 Jun 2024 15:53:08 -0400 Subject: [PATCH 02/38] Adding default http connectivity type constant. Fixing minor typos and reducing condition checks. 
--- .../scheduler/comm_utils/network_util.py | 16 +++++++++------- .../device_client_constants.py | 1 + .../model_scheduler/device_model_cache.py | 10 +++++++--- .../model_scheduler/device_model_inference.py | 19 ++++++++++--------- .../model_scheduler/worker_job_runner.py | 4 ++-- 5 files changed, 29 insertions(+), 21 deletions(-) diff --git a/python/fedml/computing/scheduler/comm_utils/network_util.py b/python/fedml/computing/scheduler/comm_utils/network_util.py index 13674840c5..48e478f23f 100644 --- a/python/fedml/computing/scheduler/comm_utils/network_util.py +++ b/python/fedml/computing/scheduler/comm_utils/network_util.py @@ -6,11 +6,13 @@ def return_this_device_connectivity_type() -> str: """ Return -> "http" | "http_proxy" |"mqtt" """ - if os.environ.get(ClientConstants.ENV_CONNECTION_TYPE_KEY) == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP: - return ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP - elif os.environ.get(ClientConstants.ENV_CONNECTION_TYPE_KEY) == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY: - return ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY - elif os.environ.get(ClientConstants.ENV_CONNECTION_TYPE_KEY) == ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT: - return ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT + # Get the environmental variable's value and convert to lower case. + env_conn_type = os.getenv(ClientConstants.ENV_CONNECTION_TYPE_KEY, "").lower() + if env_conn_type in [ + ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP, + ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY, + ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT + ]: + return env_conn_type else: - return ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP + return ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index d66c2f966a..2c06189d2e 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -101,6 +101,7 @@ class ClientConstants(object): WORKER_CONNECTIVITY_TYPE_HTTP = "http" WORKER_CONNECTIVITY_TYPE_HTTP_PROXY = "http_proxy" WORKER_CONNECTIVITY_TYPE_MQTT = "mqtt" + WORKER_CONNECTIVITY_TYPE_DEFAULT = WORKER_CONNECTIVITY_TYPE_HTTP MSG_MODELOPS_DEPLOYMENT_STATUS_INITIALIZING = "INITIALIZING" MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYING = "DEPLOYING" diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py index 30e4f460e6..6c90944277 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py @@ -344,9 +344,13 @@ def get_result_item_info(self, result_item): result_payload = result_item_json["result"] return device_id, replica_no, result_payload - def get_idle_device(self, end_point_id, end_point_name, - model_name, model_version, - check_end_point_status=True, limit_specific_model_version=False): + def get_idle_device(self, + end_point_id, + end_point_name, + model_name, + model_version, + check_end_point_status=True, + limit_specific_model_version=False): # Deprecated the model status logic, query directly from the deployment result list idle_device_list = list() diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py 
b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index a9205ceb9a..3aeec67932 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -313,16 +313,17 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ # Found idle device (TODO: optimize the algorithm to search best device for inference) payload, idle_device = FEDML_MODEL_CACHE. \ get_idle_device(end_point_id, end_point_name, in_model_name, in_model_version) - if payload is not None: - deployment_result = payload - model_name = deployment_result["model_name"] - model_version = deployment_result["model_version"] - model_id = deployment_result["model_id"] - end_point_id = deployment_result["end_point_id"] - inference_output_url = deployment_result["model_url"] + if payload: + model_name = payload["model_name"] + model_version = payload["model_version"] + model_id = payload["model_id"] + end_point_id = payload["end_point_id"] + inference_output_url = payload["model_url"] + connectivity_type = \ + payload.get("connectivity_type", + ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT) url_parsed = urlparse(inference_output_url) inference_host = url_parsed.hostname - connectivity_type = deployment_result.get("connectivity_type", ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP) else: logging.info("not found idle deployment result") @@ -335,7 +336,7 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ async def send_inference_request(idle_device, end_point_id, inference_url, input_list, output_list, inference_type="default", - connectivity_type=ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP): + connectivity_type=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT): request_timeout_sec = FEDML_MODEL_CACHE.get_endpoint_settings(end_point_id) \ .get("request_timeout_sec", ClientConstants.INFERENCE_REQUEST_TIMEOUT) diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py index 9e178228b2..ef65e37904 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py @@ -446,7 +446,7 @@ def construct_deployment_results(self, end_point_name, device_id, model_status, model_id, model_name, model_inference_url, model_version, inference_port, inference_engine, model_metadata, model_config, replica_no=1, - connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP): + connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT): deployment_results_payload = {"end_point_id": self.run_id, "end_point_name": end_point_name, "model_id": model_id, "model_name": model_name, "model_url": model_inference_url, "model_version": model_version, @@ -481,7 +481,7 @@ def send_deployment_results(self, end_point_name, device_id, model_status, model_id, model_name, model_inference_url, model_version, inference_port, inference_engine, model_metadata, model_config, replica_no=1, - connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP): + connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT): deployment_results_topic = "model_device/model_device/return_deployment_result/{}/{}".format( self.run_id, device_id) From 23d88fc7dcfdbe9f9b319a08b72b39f0c58fdbb3 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 11 Jun 2024 11:48:20 -0700 Subject: [PATCH 03/38] [Deploy] Remove unnecessary 
logic. --- .../device_model_deployment.py | 232 +----------------- .../model_scheduler/master_job_runner.py | 1 - .../model_scheduler/worker_job_runner.py | 16 +- 3 files changed, 10 insertions(+), 239 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 1876373d25..5d3ba9873d 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -1,12 +1,13 @@ +import fedml + import logging import os -import pickle -import platform import shutil import time import traceback import yaml import datetime +import docker import requests import torch @@ -15,27 +16,18 @@ import collections.abc -import fedml from fedml.computing.scheduler.comm_utils import sys_utils, security_utils -from fedml.computing.scheduler.comm_utils.container_utils import ContainerUtils from fedml.computing.scheduler.comm_utils.hardware_utils import HardwareUtil from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils - -for type_name in collections.abc.__all__: - setattr(collections, type_name, getattr(collections.abc, type_name)) - from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants -import io - -import docker -from ..scheduler_core.compute_cache_manager import ComputeCacheManager +from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache from ..scheduler_core.compute_utils import ComputeUtils from ..comm_utils.container_utils import ContainerUtils - from .device_http_inference_protocol import FedMLHttpInference -from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache +for type_name in collections.abc.__all__: + setattr(collections, type_name, getattr(collections.abc, type_name)) no_real_gpu_allocation = None @@ -432,8 +424,6 @@ def should_exit_logs(end_point_id, model_id, cmd_type, model_name, inference_eng if cmd_type == ClientConstants.CMD_TYPE_RUN_DEFAULT_SERVER: # TODO: Exited Quickly if the container is Exited or Removed # If the container has exited, return True, means we should exit the logs - # container_name = "{}".format(ClientConstants.FEDML_DEFAULT_SERVER_CONTAINER_NAME_PREFIX) + "__" + \ - # security_utils.get_content_hash(model_name) try: inference_output_url, model_version, model_metadata, model_config = \ get_model_info(model_name, inference_engine, inference_port, infer_host, @@ -554,8 +544,6 @@ def log_deployment_result(end_point_id, model_id, cmd_container_name, cmd_type, def is_client_inference_container_ready(infer_url_host, inference_http_port, inference_model_name, local_infer_url, inference_type="default", model_version="", request_input_example=None): - # logging.info(f"Inference type: {inference_type}, infer_url_host {infer_url_host}, \ - # inference_http_port: {inference_http_port}, local_infer_url {local_infer_url}") if inference_type == "default": default_client_container_ready_url = "http://{}:{}/ready".format("0.0.0.0", inference_http_port) @@ -631,211 +619,5 @@ def run_http_inference_with_curl_request(inference_url, inference_input_list, in inference_type=inference_type, engine_type=engine_type, timeout=timeout) -def convert_model_to_onnx( - torch_model, output_path: str, dummy_input_list, input_size: int, input_is_tensor=True -) -> None: - from collections 
import OrderedDict - import torch - from torch.onnx import TrainingMode - - torch.onnx.export(torch_model, # model being run - dummy_input_list if input_is_tensor else tuple(dummy_input_list), - # model input (or a tuple for multiple inputs) - f=output_path, # where to save the model (can be a file or file-like object) - export_params=True, # store the trained parameter weights inside the model file - opset_version=11, # the ONNX version to export the model to - do_constant_folding=False, # whether to execute constant folding for optimization - input_names=["input1", "input2"], - # the model's input names - output_names=['output'], # the model's output names - training=TrainingMode.EVAL, - verbose=True, - dynamic_axes={"input1": {0: "batch_size"}, - "input2": {0: "batch_size"}, - "output": {0: "batch_size"}} - ) - - -def test_start_triton_server(model_serving_dir): - sudo_prefix = "sudo " - sys_name = platform.system() - if sys_name == "Darwin": - sudo_prefix = "" - gpu_attach_cmd = "" - - triton_server_container_name = "{}".format(ClientConstants.FEDML_TRITON_SERVER_CONTAINER_NAME_PREFIX) - triton_server_cmd = "{}docker stop {}; {}docker rm {}; {}docker run --name {} {} -p{}:8000 " \ - "-p{}:8001 -p{}:8002 " \ - "--shm-size {} " \ - "-v {}:/models {} " \ - "bash -c \"pip install transformers && tritonserver --strict-model-config=false " \ - "--model-control-mode=poll --repository-poll-secs={} " \ - "--model-repository=/models\" ".format(sudo_prefix, triton_server_container_name, - sudo_prefix, triton_server_container_name, - sudo_prefix, triton_server_container_name, - gpu_attach_cmd, - ClientConstants.INFERENCE_HTTP_PORT, - ClientConstants.INFERENCE_GRPC_PORT, - 8002, - "4096m", - model_serving_dir, - ClientConstants.INFERENCE_SERVER_IMAGE, - ClientConstants.FEDML_MODEL_SERVING_REPO_SCAN_INTERVAL) - logging.info("Run triton inference server: {}".format(triton_server_cmd)) - triton_server_process = ClientConstants.exec_console_with_script(triton_server_cmd, - should_capture_stdout=False, - should_capture_stderr=False, - no_sys_out_err=True) - - -def test_convert_pytorch_model_to_onnx(model_net_file, model_bin_file, model_name, model_in_params): - torch_model = torch.jit.load(model_net_file) - with open(model_bin_file, 'rb') as model_pkl_file: - model_state_dict = pickle.load(model_pkl_file) - torch_model.load_state_dict(model_state_dict) - torch_model.eval() - - input_size = model_in_params["input_size"] - input_types = model_in_params["input_types"] - - dummy_input_list = [] - for index, input_i in enumerate(input_size): - if input_types[index] == "int": - this_input = torch.tensor(torch.randint(0, 1, input_i)) - else: - this_input = torch.tensor(torch.zeros(input_i)) - dummy_input_list.append(this_input) - - onnx_model_dir = os.path.join(ClientConstants.get_model_cache_dir(), - ClientConstants.FEDML_CONVERTED_MODEL_DIR_NAME, - model_name, ClientConstants.INFERENCE_MODEL_VERSION) - if not os.path.exists(onnx_model_dir): - os.makedirs(onnx_model_dir, exist_ok=True) - onnx_model_path = os.path.join(onnx_model_dir, "model.onnx") - - convert_model_to_onnx(torch_model, onnx_model_path, dummy_input_list, input_size, - input_is_tensor=True) - - model_serving_dir = os.path.join(ClientConstants.get_model_cache_dir(), - ClientConstants.FEDML_CONVERTED_MODEL_DIR_NAME) - return model_serving_dir - - -def start_gpu_model_load_process(): - from multiprocessing import Process - import time - process = Process(target=load_gpu_model_to_cpu_device) - process.start() - while True: - time.sleep(1) - - -def 
load_gpu_model_to_cpu_device(): - import pickle - import io - import torch - - class CPU_Unpickler(pickle.Unpickler): - def find_class(self, module, name): - if module == 'torch.storage' and name == '_load_from_bytes': - return lambda b: torch.load(io.BytesIO(b), map_location='cpu') - else: - return super().find_class(module, name) - - model_file = "/home/fedml/.fedml/fedml-client/fedml/models/theta_rec_auc_81_single_label/theta_rec_auc_81_single_label" - with open(model_file, "rb") as model_pkl_file: - if not torch.cuda.is_available(): - model = CPU_Unpickler(model_pkl_file).load() - if model is None: - print("Failed to load gpu model to cpu device") - else: - print("Succeeded to load gpu model to cpu device") - - if __name__ == "__main__": - start_gpu_model_load_process() - - model_serving_dir = test_convert_pytorch_model_to_onnx("./sample-open-training-model-net", - "./sample-open-training-model", - "rec-model", - {"input_size": [[1, 24], [1, 2]], - "input_types": ["int", "float"]}) - - test_start_triton_server(model_serving_dir) - - # input_data = {"model_version": "v0-Sun Feb 05 12:17:16 GMT 2023", - # "model_name": "model_414_45_open-model-test_v0-Sun-Feb-05-12-17-16-GMT-2023", - # # "data": "file:///Users/alexliang/fedml_data/mnist-image.png", - # "data": "https://raw.githubusercontent.com/niyazed/triton-mnist-example/master/images/sample_image.png", - # "end_point_id": 414, "model_id": 45, "token": "a09a18a14c4c4d89a8d5f9515704c073"} - # - # data_list = list() - # data_list.append(input_data["data"]) - # run_http_inference_with_lib_http_api_with_image_data(input_data["model_name"], - # 5001, 1, data_list, "") - # - # - # class LogisticRegression(torch.nn.Module): - # def __init__(self, input_dim, output_dim): - # super(LogisticRegression, self).__init__() - # self.linear = torch.nn.Linear(input_dim, output_dim) - # - # def forward(self, x): - # outputs = torch.sigmoid(self.linear(x)) - # return outputs - # - # - # model = LogisticRegression(28 * 28, 10) - # checkpoint = {'model': model} - # model_net_file = "/Users/alexliang/fedml-client/fedml/models/open-model-test/model-net.pt" - # torch.save(checkpoint, model_net_file) - # - # with open("/Users/alexliang/fedml-client/fedml/models/open-model-test/open-model-test", 'rb') as model_pkl_file: - # model_params = pickle.load(model_pkl_file) - # # torch.save(model_params, "/Users/alexliang/fedml-client/fedml/models/open-model-test/a.pt") - # # model = torch.load("/Users/alexliang/fedml-client/fedml/models/open-model-test/a.pt") - # loaded_checkpoint = torch.load(model_net_file) - # loaded_model = loaded_checkpoint["model"] - # loaded_model.load_state_dict(model_params) - # for parameter in loaded_model.parameters(): - # parameter.requires_grad = False - # loaded_model.eval() - # input_names = {"x": 0} - # convert_model_to_onnx(loaded_model, "/Users/alexliang/fedml-client/fedml/models/open-model-test/a.onnx", - # input_names, 28 * 28) - - # parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - # parser.add_argument("--cf", "-c", help="config file") - # parser.add_argument("--role", "-r", type=str, default="client", help="role") - # parser.add_argument("--model_storage_local_path", "-url", type=str, default="/home/ubuntu", - # help="model storage local path") - # parser.add_argument("--inference_model_name", "-n", type=str, default="fedml-model", - # help="inference model name") - # parser.add_argument("--inference_engine", "-engine", type=str, default="ONNX", help="inference engine") - # 
parser.add_argument("--inference_http_port", "-http", type=int, default=8000, help="inference http port") - # parser.add_argument("--inference_grpc_port", "-gprc", type=int, default=8001, help="inference grpc port") - # parser.add_argument("--inference_metric_port", "-metric", type=int, default=8002, help="inference metric port") - # parser.add_argument("--inference_use_gpu", "-gpu", type=str, default="gpu", help="inference use gpu") - # parser.add_argument("--inference_memory_size", "-mem", type=str, default="256m", help="inference memory size") - # parser.add_argument("--inference_convertor_image", "-convertor", type=str, - # default=ClientConstants.INFERENCE_CONVERTOR_IMAGE, help="inference convertor image") - # parser.add_argument("--inference_server_image", "-server", type=str, - # default=ClientConstants.INFERENCE_SERVER_IMAGE, help="inference server image") - # args = parser.parse_args() - # args.user = args.user - # - # pip_source_dir = os.path.dirname(__file__) - # __running_model_name, __inference_output_url, __model_version, __model_metadata, __model_config = \ - # start_deployment( - # args.model_storage_local_path, - # args.inference_model_name, - # args.inference_engine, - # args.inference_http_port, - # args.inference_grpc_port, - # args.inference_metric_port, - # args.inference_use_gpu, - # args.inference_memory_size, - # args.inference_convertor_image, - # args.inference_server_image) - # print("Model deployment results, running model name: {}, url: {}, model metadata: {}, model config: {}".format( - # __running_model_name, __inference_output_url, __model_metadata, __model_config)) + pass diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index b9b9b4c356..ef2c01c49d 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -453,7 +453,6 @@ def process_deployment_result_message(self, topic=None, payload=None): time.sleep(3) self.trigger_completed_event() - def cleanup_runner_process(self, run_id): ServerConstants.cleanup_run_process(run_id, not_kill_subprocess=True) diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py index ef65e37904..8100707386 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py @@ -294,9 +294,7 @@ def run_impl(self, run_extend_queue_list, sender_message_center, json.dumps(result_payload), replica_no=rank + 1) logging.info(f"Deploy replica {rank + 1} / {prev_rank + 1 + op_num} successfully.") - time.sleep(5) - time.sleep(1) self.status_reporter.run_id = self.run_id self.status_reporter.report_client_id_status( self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, @@ -348,7 +346,8 @@ def run_impl(self, run_extend_queue_list, sender_message_center, # TODO (Raphael) check if this will allow another job to seize the gpu during high concurrency: try: - JobRunnerUtils.get_instance().release_partial_job_gpu(run_id, self.edge_id, replica_occupied_gpu_ids) + JobRunnerUtils.get_instance().release_partial_job_gpu( + run_id, self.edge_id, replica_occupied_gpu_ids) except Exception as e: if op == "rollback": pass @@ -395,7 +394,7 @@ def run_impl(self, run_extend_queue_list, sender_message_center, 
JobRunnerUtils.get_instance().release_partial_job_gpu( run_id, self.edge_id, replica_occupied_gpu_ids) - result_payload = self.send_deployment_results( + self.send_deployment_results( end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED, model_id, model_name, inference_output_url, inference_model_version, inference_port, inference_engine, model_metadata, model_config) @@ -496,15 +495,6 @@ def send_deployment_results(self, end_point_name, device_id, model_status, self.message_center.send_message_json(deployment_results_topic, json.dumps(deployment_results_payload)) return deployment_results_payload - def send_deployment_status(self, end_point_name, device_id, - model_id, model_name, model_version, - model_inference_url, model_status, - inference_port=ClientConstants.MODEL_INFERENCE_DEFAULT_PORT, - replica_no=1, # start from 1 - ): - # Deprecated - pass - def reset_devices_status(self, edge_id, status): self.status_reporter.run_id = self.run_id self.status_reporter.edge_id = edge_id From e0ad9b5bef5bcea1eaefe3458a3d6b49aa399d46 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 11 Jun 2024 12:15:22 -0700 Subject: [PATCH 04/38] [Deploy] Remove unnecessary logic; Rename readiness check function; Forbidden user level control of host post. --- .../device_model_deployment.py | 150 +++++------------- 1 file changed, 40 insertions(+), 110 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 5d3ba9873d..edd2ebea9a 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -68,6 +68,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, num_gpus = gpu_per_replica gpu_ids, gpu_attach_cmd = None, "" + # Concatenate the model name running_model_name = ClientConstants.get_running_model_name( end_point_name, inference_model_name, model_version, end_point_id, model_id, edge_id=edge_id) @@ -77,6 +78,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, config = yaml.safe_load(file) # Resource related + inference_type = "default" use_gpu = config.get('use_gpu', True) num_gpus_frm_yml = config.get('num_gpus', None) if not use_gpu: @@ -85,9 +87,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, if num_gpus_frm_yml is not None: num_gpus = int(num_gpus_frm_yml) usr_indicated_wait_time = config.get('deploy_timeout', 900) - usr_indicated_worker_port = config.get('worker_port', "") - if usr_indicated_worker_port == "": - usr_indicated_worker_port = os.environ.get("FEDML_WORKER_PORT", "") + usr_indicated_retry_cnt = max(int(usr_indicated_wait_time) // 10, 1) shm_size = config.get('shm_size', None) storage_opt = config.get('storage_opt', None) tmpfs = config.get('tmpfs', None) @@ -96,17 +96,6 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, cpus = int(cpus) memory = config.get('memory', None) - if usr_indicated_worker_port == "": - usr_indicated_worker_port = None - else: - usr_indicated_worker_port = int(usr_indicated_worker_port) - - worker_port_env = os.environ.get("FEDML_WORKER_PORT", "") - worker_port_from_config = config.get('worker_port', "") - logging.info(f"usr_indicated_worker_port {usr_indicated_worker_port}, worker port env {worker_port_env}, " - f"worker port from config {worker_port_from_config}") - - 
usr_indicated_retry_cnt = max(int(usr_indicated_wait_time) // 10, 1) inference_image_name = config.get('inference_image_name', ClientConstants.INFERENCE_SERVER_CUSTOME_IMAGE) image_pull_policy = config.get('image_pull_policy', SchedulerConstants.IMAGE_PULL_POLICY_IF_NOT_PRESENT) @@ -144,6 +133,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, # If using customized image, then bootstrap + job will be the entry point enable_custom_image = config.get("enable_custom_image", False) + # inference_type = "custom" customized_image_entry_cmd = \ "/bin/bash /home/fedml/models_serving/fedml-deploy-bootstrap-entry-auto-gen.sh" @@ -151,18 +141,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, docker_registry_user_password = config.get("docker_registry_user_password", "") docker_registry = config.get("docker_registry", "") - port_inside_container = int(config.get("port_inside_container", 2345)) - use_triton = config.get("use_triton", False) - if use_triton: - inference_type = "triton" - else: - inference_type = "default" - - # Config check - if src_code_dir == "": - raise Exception("Please indicate source_code_dir in the fedml_model_config.yaml") - if relative_entry == "": - logging.warning("You missed main_entry in the fedml_model_config.yaml") + port_inside_container = int(config.get("port", 2345)) # Request the GPU ids for the deployment if num_gpus > 0: @@ -175,22 +154,10 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, end_point_id, end_point_name, inference_model_name, edge_id, replica_rank+1, gpu_ids) logging.info("GPU ids allocated: {}".format(gpu_ids)) + # Create the model serving dir if not exists model_serving_dir = ClientConstants.get_model_serving_dir() if not os.path.exists(model_serving_dir): os.makedirs(model_serving_dir, exist_ok=True) - converted_model_path = os.path.join(model_storage_local_path, ClientConstants.FEDML_CONVERTED_MODEL_DIR_NAME) - if os.path.exists(converted_model_path): - model_file_list = os.listdir(converted_model_path) - for model_file in model_file_list: - src_model_file = os.path.join(converted_model_path, model_file) - dst_model_file = os.path.join(model_serving_dir, model_file) - if os.path.isdir(src_model_file): - if not os.path.exists(dst_model_file): - shutil.copytree(src_model_file, dst_model_file, copy_function=shutil.copy, - ignore_dangling_symlinks=True) - else: - if not os.path.exists(dst_model_file): - shutil.copyfile(src_model_file, dst_model_file) if inference_engine != ClientConstants.INFERENCE_ENGINE_TYPE_INT_DEFAULT: raise Exception(f"inference engine {inference_engine} is not supported") @@ -228,13 +195,12 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, logging.info(f"Start pulling the inference image {inference_image_name}... 
with policy {image_pull_policy}") ContainerUtils.get_instance().pull_image_with_policy(image_pull_policy, inference_image_name) - volumns = [] + volumes = [] binds = {} environment = {} # data_cache_dir mounting - assert type(data_cache_dir_input) == dict or type(data_cache_dir_input) == str - if type(data_cache_dir_input) == str: + if isinstance(data_cache_dir_input, str): # In this case, we mount to the same folder, if it has ~, we replace it with /home/fedml src_data_cache_dir, dst_data_cache_dir = "", "" if data_cache_dir_input != "": @@ -253,28 +219,30 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, if type(src_data_cache_dir) == str and src_data_cache_dir != "": logging.info("Start copying the data cache to the container...") if os.path.exists(src_data_cache_dir): - volumns.append(src_data_cache_dir) + volumes.append(src_data_cache_dir) binds[src_data_cache_dir] = { "bind": dst_data_cache_dir, "mode": "rw" } environment["DATA_CACHE_FOLDER"] = dst_data_cache_dir - else: + elif isinstance(data_cache_dir_input, dict): for k, v in data_cache_dir_input.items(): if os.path.exists(k): - volumns.append(v) + volumes.append(v) binds[k] = { "bind": v, "mode": "rw" } else: logging.warning(f"{k} does not exist, skip mounting it to the container") - logging.info(f"Data cache mount: {volumns}, {binds}") + logging.info(f"Data cache mount: {volumes}, {binds}") + else: + logging.warning("data_cache_dir_input is not a string or a dictionary, skip mounting it to the container") # Default mounting if not enable_custom_image or (enable_custom_image and relative_entry != ""): logging.info("Start copying the source code to the container...") - volumns.append(src_code_dir) + volumes.append(src_code_dir) binds[src_code_dir] = { "bind": dst_model_serving_dir, "mode": "rw" @@ -284,7 +252,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, host_config_dict = { "binds": binds, "port_bindings": { - port_inside_container: usr_indicated_worker_port + port_inside_container: None }, "shm_size": shm_size, "storage_opt": storage_opt, @@ -312,7 +280,6 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, if not enable_custom_image: # For some image, the default user is root. Unified to fedml. 
environment["HOME"] = "/home/fedml" - environment["BOOTSTRAP_DIR"] = dst_bootstrap_dir environment["FEDML_CURRENT_RUN_ID"] = end_point_id environment["FEDML_CURRENT_EDGE_ID"] = edge_id @@ -326,12 +293,13 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, for key in extra_envs: environment[key] = extra_envs[key] + # Create the container try: host_config = client.api.create_host_config(**host_config_dict) new_container = client.api.create_container( image=inference_image_name, name=default_server_container_name, - volumes=volumns, + volumes=volumes, ports=[port_inside_container], # port open inside the container environment=environment, host_config=host_config, @@ -349,22 +317,18 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, while True: cnt += 1 try: - if usr_indicated_worker_port is not None: - inference_http_port = usr_indicated_worker_port - break - else: - # Find the random port - port_info = client.api.port(new_container.get("Id"), port_inside_container) - inference_http_port = port_info[0]["HostPort"] - logging.info("inference_http_port: {}".format(inference_http_port)) - break + # Find the random port + port_info = client.api.port(new_container.get("Id"), port_inside_container) + inference_http_port = port_info[0]["HostPort"] + logging.info("host port allocated: {}".format(inference_http_port)) + break except: if cnt >= 5: raise Exception("Failed to get the port allocation") time.sleep(3) # Logging the info from the container when starting - log_deployment_result(end_point_id, model_id, default_server_container_name, + log_deployment_output(end_point_id, model_id, default_server_container_name, ClientConstants.CMD_TYPE_RUN_DEFAULT_SERVER, inference_model_name, inference_engine, inference_http_port, inference_type, retry_interval=10, deploy_attempt_threshold=usr_indicated_retry_cnt, @@ -373,9 +337,8 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, # Return the running model name and the inference output url inference_output_url, running_model_version, ret_model_metadata, ret_model_config = \ - get_model_info(inference_model_name, inference_engine, inference_http_port, - infer_host, False, inference_type, request_input_example=request_input_example, - enable_custom_image=enable_custom_image) + check_container_readiness(inference_http_port=inference_http_port, infer_host=infer_host, + request_input_example=request_input_example) if inference_output_url == "": return running_model_name, "", None, None, None @@ -426,9 +389,8 @@ def should_exit_logs(end_point_id, model_id, cmd_type, model_name, inference_eng # If the container has exited, return True, means we should exit the logs try: inference_output_url, model_version, model_metadata, model_config = \ - get_model_info(model_name, inference_engine, inference_port, infer_host, - inference_type=inference_type, request_input_example=request_input_example, - enable_custom_image=enable_custom_image) + check_container_readiness(inference_http_port=inference_port, infer_host=infer_host, + request_input_example=request_input_example) if inference_output_url != "": logging.info("Log test for deploying model successfully, inference url: {}, " "model metadata: {}, model config: {}". 
@@ -443,7 +405,7 @@ def should_exit_logs(end_point_id, model_id, cmd_type, model_name, inference_eng return False -def log_deployment_result(end_point_id, model_id, cmd_container_name, cmd_type, +def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type, inference_model_name, inference_engine, inference_http_port, inference_type="default", retry_interval=10, deploy_attempt_threshold=10, @@ -542,10 +504,10 @@ def log_deployment_result(end_point_id, model_id, cmd_container_name, cmd_type, time.sleep(retry_interval) -def is_client_inference_container_ready(infer_url_host, inference_http_port, inference_model_name, local_infer_url, - inference_type="default", model_version="", request_input_example=None): +def is_client_inference_container_ready(infer_url_host, inference_http_port, readiness_check_type="default", + readiness_check_cmd=None, request_input_example=None): - if inference_type == "default": + if readiness_check_type == "default": default_client_container_ready_url = "http://{}:{}/ready".format("0.0.0.0", inference_http_port) response = None try: @@ -555,7 +517,7 @@ def is_client_inference_container_ready(infer_url_host, inference_http_port, inf if not response or response.status_code != 200: return "", "", {}, {} - # Report the deployed model info + # Construct the model metadata (input and output) model_metadata = {} if request_input_example is not None and len(request_input_example) > 0: model_metadata["inputs"] = request_input_example @@ -563,51 +525,19 @@ def is_client_inference_container_ready(infer_url_host, inference_http_port, inf model_metadata["inputs"] = {"text": "What is a good cure for hiccups?"} model_metadata["outputs"] = [] model_metadata["type"] = "default" + return "http://{}:{}/predict".format(infer_url_host, inference_http_port), None, model_metadata, None else: - triton_server_url = "{}:{}".format(infer_url_host, inference_http_port) - if model_version == "" or model_version is None: - model_version = ClientConstants.INFERENCE_MODEL_VERSION - logging.info( - f"triton_server_url: {triton_server_url} model_version: {model_version} model_name: {inference_model_name}") - triton_client = http_client.InferenceServerClient(url=triton_server_url, verbose=False) - if not triton_client.is_model_ready( - model_name=inference_model_name, model_version=model_version - ): - return "", model_version, {}, {} - logging.info(f"Model {inference_model_name} is ready, start to get model metadata...") - model_metadata = triton_client.get_model_metadata(model_name=inference_model_name, model_version=model_version) - model_config = triton_client.get_model_config(model_name=inference_model_name, model_version=model_version) - version_list = model_metadata.get("versions", None) - if version_list is not None and len(version_list) > 0: - model_version = version_list[0] - else: - model_version = ClientConstants.INFERENCE_MODEL_VERSION - - inference_output_url = "http://{}:{}/{}/models/{}/versions/{}/infer".format(infer_url_host, - inference_http_port, - ClientConstants.INFERENCE_INFERENCE_SERVER_VERSION, - inference_model_name, - model_version) - - return inference_output_url, model_version, model_metadata, model_config - - -def get_model_info(model_name, inference_engine, inference_http_port, infer_host="127.0.0.1", is_hg_model=False, - inference_type="default", request_input_example=None, enable_custom_image=False): - if model_name is None: + # TODO(Raphael): Support arbitrary readiness check command + logging.error(f"Unknown readiness check type: 
{readiness_check_type}") return "", "", {}, {} - local_infer_url = "{}:{}".format(infer_host, inference_http_port) - - if is_hg_model: - inference_model_name = "{}_{}_inference".format(model_name, str(inference_engine)) - else: - inference_model_name = model_name +def check_container_readiness(inference_http_port, infer_host="127.0.0.1", request_input_example=None, + readiness_check_type="default", readiness_check_cmd=None): response_from_client_container = is_client_inference_container_ready( - infer_host, inference_http_port, inference_model_name, local_infer_url, - inference_type, model_version="", request_input_example=request_input_example) + infer_host, inference_http_port, readiness_check_type, readiness_check_cmd, + request_input_example=request_input_example) return response_from_client_container From 64e8c779c61edfecf7ca8e638b6b54ff31d7983b Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 11 Jun 2024 16:29:37 -0700 Subject: [PATCH 05/38] [Deploy] Nit --- .../computing/scheduler/model_scheduler/device_model_cards.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cards.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cards.py index 8feb757a63..c2f11a2917 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cards.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cards.py @@ -14,7 +14,6 @@ from fedml.core.common.singleton import Singleton from fedml.computing.scheduler.model_scheduler.modelops_configs import ModelOpsConfigs -from fedml.computing.scheduler.model_scheduler.device_model_deployment import get_model_info from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants from fedml.computing.scheduler.model_scheduler.device_model_object import FedMLModelList, FedMLEndpointDetail from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants From 9194f8424f77008b49a48908ee72f19fe59ba23d Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 11 Jun 2024 16:42:46 -0700 Subject: [PATCH 06/38] [Deploy] Hide unnecessary log. 
--- .../scheduler/model_scheduler/device_model_cache.py | 8 ++++---- .../scheduler/model_scheduler/device_model_inference.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py index 6c90944277..c941c42102 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py @@ -369,7 +369,7 @@ def get_idle_device(self, if "model_status" in result_payload and result_payload["model_status"] == "DEPLOYED": idle_device_list.append({"device_id": device_id, "end_point_id": end_point_id}) - logging.info(f"{len(idle_device_list)} devices this model has on it: {idle_device_list}") + logging.debug(f"{len(idle_device_list)} devices this model has on it: {idle_device_list}") if len(idle_device_list) <= 0: return None, None @@ -398,7 +398,7 @@ def get_idle_device(self, logging.info("Inference Device selection Failed:") logging.info(e) - logging.info(f"Using Round Robin, the device index is {selected_device_index}") + logging.debug(f"Using Round Robin, the device index is {selected_device_index}") idle_device_dict = idle_device_list[selected_device_index] # Note that within the same endpoint_id, there could be one device with multiple same models @@ -411,7 +411,7 @@ def get_idle_device(self, # Find deployment result from the target idle device. try: for result_item in result_list: - logging.info("enter the for loop") + logging.debug("enter the for loop") device_id, _, result_payload = self.get_result_item_info(result_item) found_end_point_id = result_payload["end_point_id"] found_end_point_name = result_payload["end_point_name"] @@ -425,7 +425,7 @@ def get_idle_device(self, if same_model_device_rank > 0: same_model_device_rank -= 1 continue - logging.info(f"The chosen device is {device_id}") + logging.debug(f"The chosen device is {device_id}") return result_payload, device_id except Exception as e: logging.info(str(e)) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index 3aeec67932..ba13006245 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -230,7 +230,7 @@ async def _predict( model_metrics.set_start_time(start_time) # Send inference request to idle device - logging.info("inference url {}.".format(inference_output_url)) + logging.debug("inference url {}.".format(inference_output_url)) if inference_output_url != "": input_list = input_json.get("inputs", input_json) stream_flag = input_json.get("stream", False) @@ -329,7 +329,7 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ res = (idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url, connectivity_type) - logging.info(f"found idle device with metrics: {res}") + logging.debug(f"found idle device with metrics: {res}") return res @@ -352,7 +352,7 @@ async def send_inference_request(idle_device, end_point_id, inference_url, input output_list, inference_type=inference_type, timeout=request_timeout_sec) - logging.info(f"Use http inference. return {response_ok}") + logging.debug(f"Use http inference. 
return {response_ok}") return inference_response elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY: logging.warning("Use http proxy inference.") From 243be07831c7ffd078203f402efae339ed0b58a3 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 11 Jun 2024 17:50:30 -0700 Subject: [PATCH 07/38] [Deploy] Read port info from env. --- .../scheduler/model_scheduler/device_client_constants.py | 1 + .../scheduler/model_scheduler/device_server_constants.py | 1 + .../scheduler/model_scheduler/master_job_runner.py | 6 +++--- .../scheduler/model_scheduler/worker_protocol_manager.py | 8 +++++++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index 2c06189d2e..f1e7dea91f 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -74,6 +74,7 @@ class ClientConstants(object): K8S_DEPLOYMENT_SLAVE_MOUNT_HOME_DIR = "/home/fedml/fedml-client" LOCAL_CLIENT_API_PORT = 22030 + ENV_CLIENT_PROXY_PORT_KEY = "FEDML_WORKER_INFERENCE_PROXY_PORT" INFERENCE_HTTP_PORT = 8000 INFERENCE_GRPC_PORT = 8001 diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py index 243c197b2f..a868d03b41 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py @@ -103,6 +103,7 @@ class ServerConstants(object): AUTO_DETECT_PUBLIC_IP = "auto_detect_public_ip" MODEL_INFERENCE_DEFAULT_PORT = 2203 + ENV_MASTER_INFERENCE_PORT_KEY = "FEDML_MASTER_INFERENCE_GATEWAY_PORT" MODEL_CACHE_KEY_EXPIRE_TIME = 1 * 10 INFERENCE_REQUEST_TIMEOUT_KEY = "request_timeout_sec" diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index ef2c01c49d..d7565d7647 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -460,9 +460,9 @@ def cleanup_runner_process(self, run_id): def start_device_inference_gateway(inference_port=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT): # start unified inference server python_program = get_python_program() - master_port = os.getenv("FEDML_MASTER_PORT", None) - if master_port is not None: - inference_port = int(master_port) + master_port_frm_env = os.getenv(ServerConstants.MODEL_INFERENCE_DEFAULT_PORT, None) + if master_port_frm_env is not None: + inference_port = int(master_port_frm_env) if not ServerConstants.is_running_on_k8s(): logging.info(f"start the model inference gateway...") inference_gw_cmd = "fedml.computing.scheduler.model_scheduler.device_model_inference:api" diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py index f9bc70452d..ee59f87441 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py @@ -79,6 +79,12 @@ def _init_extra_items(self): client_api_cmd = "fedml.computing.scheduler.model_scheduler.device_client_api:api" client_api_pids = 
RunProcessUtils.get_pid_from_cmd_line(client_api_cmd) + + worker_proxy_port = ClientConstants.LOCAL_CLIENT_API_PORT + worker_proxy_port_frm_env = os.environ.get(ClientConstants.ENV_CLIENT_PROXY_PORT_KEY, None) + if worker_proxy_port_frm_env is not None: + worker_proxy_port = int(worker_proxy_port_frm_env) + if client_api_pids is None or len(client_api_pids) <= 0: # Start local API services cur_dir = os.path.dirname(__file__) @@ -88,7 +94,7 @@ def _init_extra_items(self): "{} -m uvicorn {} --host 0.0.0.0 --port {} --reload --reload-delay 3 --reload-dir {} " "--log-level critical".format( python_program, client_api_cmd, - ClientConstants.LOCAL_CLIENT_API_PORT, fedml_base_dir + worker_proxy_port, fedml_base_dir ), should_capture_stdout=False, should_capture_stderr=False From 3a034717f7ebc43ff035e73cc49c13ea1c2e7d79 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Wed, 12 Jun 2024 12:04:36 -0700 Subject: [PATCH 08/38] [Deploy] Nit. --- .../computing/scheduler/model_scheduler/master_job_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index d7565d7647..67a3e8bb82 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -460,7 +460,7 @@ def cleanup_runner_process(self, run_id): def start_device_inference_gateway(inference_port=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT): # start unified inference server python_program = get_python_program() - master_port_frm_env = os.getenv(ServerConstants.MODEL_INFERENCE_DEFAULT_PORT, None) + master_port_frm_env = os.getenv(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, None) if master_port_frm_env is not None: inference_port = int(master_port_frm_env) if not ServerConstants.is_running_on_k8s(): From f0dd29e04fa600339c0efb74526694d4dee2842e Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Wed, 12 Jun 2024 12:12:25 -0700 Subject: [PATCH 09/38] [Deploy] Nit. 
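The unified inference URL should always use the master gateway port, which is now resolved from the environment instead of each deployment's "server_internal_port" / "server_external_port" parameters. A minimal sketch of the lookup; the key string and the 2203 fallback are the values defined in device_server_constants.py:

    import os

    # Gateway port resolution: environment override first, otherwise the
    # shipped default (ServerConstants.MODEL_INFERENCE_DEFAULT_PORT == 2203).
    inference_port_external = int(
        os.environ.get("FEDML_MASTER_INFERENCE_GATEWAY_PORT", 2203))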
--- .../scheduler/model_scheduler/master_job_runner.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index 67a3e8bb82..5f82a6c046 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -369,12 +369,8 @@ def process_deployment_result_message(self, topic=None, payload=None): """ When all the devices have finished the add / delete / update operation """ - # Generate one unified inference api - # Note that here we use the gateway port instead of the inference port that is used by the slave device - model_config_parameters = request_json["parameters"] - inference_port = model_config_parameters.get("server_internal_port", - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) - inference_port_external = model_config_parameters.get("server_external_port", inference_port) + inference_port_external = os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, + ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) ip = GeneralConstants.get_ip_address(request_json) if ip.startswith("http://") or ip.startswith("https://"): From 21a8a4c9d97e712f029f0e7abe39e0b5e56954a2 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Wed, 12 Jun 2024 12:18:58 -0700 Subject: [PATCH 10/38] [Deploy] Change few more places relate to gateway port. --- .../scheduler/model_scheduler/master_job_runner.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index 5f82a6c046..50d902b933 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -144,7 +144,8 @@ def run_impl( # No device is added, updated or removed logging.info("No device is added, updated or removed. 
No action needed for reconciliation.") ip = GeneralConstants.get_ip_address(self.request_json) - master_port = os.getenv("FEDML_MASTER_PORT", None) + master_port = os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, + ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) if master_port is not None: inference_port = int(master_port) model_inference_port = inference_port @@ -299,9 +300,8 @@ def process_deployment_result_message(self, topic=None, payload=None): else: # This is the last worker that failed, so we should continue to "ABORTED" status model_config_parameters = self.request_json["parameters"] - inference_port = model_config_parameters.get("server_internal_port", - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) - inference_port_external = model_config_parameters.get("server_external_port", inference_port) + inference_port_external = os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, + ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) ip = GeneralConstants.get_ip_address(self.request_json) if ip.startswith("http://") or ip.startswith("https://"): model_inference_url = "{}/inference/{}".format(ip, end_point_id) @@ -753,9 +753,8 @@ def parse_model_run_params(running_json): model_version = model_config["model_version"] model_config_parameters = running_json.get("parameters", {}) - inference_port = model_config_parameters.get("server_internal_port", # Internal port is for the gateway - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) - inference_port_external = model_config_parameters.get("server_external_port", inference_port) + inference_port = int(os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, + ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)) return run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ From e7e974d24f510a47e2ee5e9df1a6161665fffa1e Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Wed, 12 Jun 2024 15:29:53 -0700 Subject: [PATCH 11/38] [Deploy] Write port info into env file. 
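fedml login now accepts -mgp/--master_inference_gateway_port and -wpp/--worker_inference_proxy_port, persists both values into the FedML env file through fedml.set_env_kv(), and adds getters so later processes read the same ports back instead of hard-coding the defaults. A minimal usage sketch with the helpers introduced here:

    from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants
    from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants

    # Both helpers call fedml.load_env() internally, so they pick up the ports
    # persisted at login time (e.g. `fedml login <api_key> -mgp 2203 -wpp 22030`)
    # and fall back to the defaults when nothing was written.
    gateway_port = ServerConstants.get_inference_master_gateway_port()
    proxy_port = ClientConstants.get_inference_worker_proxy_port()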
--- python/fedml/api/__init__.py | 12 ++++++---- python/fedml/api/modules/device.py | 13 ++++++++--- python/fedml/cli/modules/login.py | 22 +++++++++++++++++-- .../device_client_constants.py | 8 +++++++ .../device_server_constants.py | 9 ++++++++ .../model_scheduler/master_job_runner.py | 22 +++++++------------ .../worker_protocol_manager.py | 3 +-- 7 files changed, 64 insertions(+), 25 deletions(-) diff --git a/python/fedml/api/__init__.py b/python/fedml/api/__init__.py index 3e75b987d6..f753e4255b 100755 --- a/python/fedml/api/__init__.py +++ b/python/fedml/api/__init__.py @@ -24,6 +24,8 @@ from fedml.computing.scheduler.scheduler_entry.cluster_manager import FedMLClusterModelList from fedml.computing.scheduler.scheduler_entry.run_manager import FedMLRunStartedModel, FedMLGpuDevices, \ FedMLRunModelList, FeatureEntryPoint +from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants def fedml_login(api_key: str = None): @@ -209,16 +211,18 @@ def fedml_build(platform, type, source_folder, entry_point, config_folder, dest_ return build.build(platform, type, source_folder, entry_point, config_folder, dest_folder, ignore) -def login(api_key, computing, server, supplier): - device_bind(api_key, computing, server, supplier) +def login(api_key, computing, server, supplier, + master_inference_gateway_port: int = ServerConstants.MODEL_INFERENCE_DEFAULT_PORT, + worker_inference_proxy_port: int = ClientConstants.LOCAL_CLIENT_API_PORT): + device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port) def logout(computing, server): device_unbind(computing, server) -def device_bind(api_key, computing, server, supplier): - device.bind(api_key, computing, server, supplier) +def device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port): + device.bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port) def device_unbind(computing, server): diff --git a/python/fedml/api/modules/device.py b/python/fedml/api/modules/device.py index a853d538d0..14591147a6 100644 --- a/python/fedml/api/modules/device.py +++ b/python/fedml/api/modules/device.py @@ -10,14 +10,18 @@ from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils from fedml.computing.scheduler.master.server_constants import ServerConstants +from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants as DeviceServerConstants from fedml.computing.scheduler.master.server_login import logout as server_logout from fedml.computing.scheduler.slave.client_constants import ClientConstants +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants as DeviceClientConstants from fedml.computing.scheduler.slave.client_login import logout as client_logout from fedml.computing.scheduler.scheduler_entry.resource_manager import FedMLResourceManager def bind( - api_key, computing, server, supplier + api_key, computing, server, supplier, + master_inference_gateway_port=DeviceServerConstants.MODEL_INFERENCE_DEFAULT_PORT, + worker_inference_proxy_port=DeviceClientConstants.LOCAL_CLIENT_API_PORT ): userid = api_key runner_cmd = "{}" @@ -43,13 +47,13 @@ def bind( _bind( userid, computing, server, api_key, role, runner_cmd, device_id, 
os_name, - docker) + docker, master_inference_gateway_port, worker_inference_proxy_port) def _bind( userid, computing, server, api_key, role, runner_cmd, device_id, os_name, - docker): + docker, master_inference_gateway_port, worker_inference_proxy_port): fedml.load_env() if os.getenv(ModuleConstants.ENV_FEDML_INFER_HOST) is None: fedml.set_env_kv(ModuleConstants.ENV_FEDML_INFER_HOST, SchedulerConstants.REDIS_INFER_HOST) @@ -60,6 +64,9 @@ def _bind( if os.getenv(ModuleConstants.ENV_FEDML_INFER_REDIS_PASSWORD) is None: fedml.set_env_kv(ModuleConstants.ENV_FEDML_INFER_REDIS_PASSWORD, SchedulerConstants.REDIS_PASSWORD) + fedml.set_env_kv(DeviceServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, str(master_inference_gateway_port)) + fedml.set_env_kv(DeviceClientConstants.ENV_CLIENT_PROXY_PORT_KEY, str(worker_inference_proxy_port)) + url = fedml._get_backend_service() platform_name = platform.system() docker_config_text = None diff --git a/python/fedml/cli/modules/login.py b/python/fedml/cli/modules/login.py index f2e4d76322..f3c982f456 100644 --- a/python/fedml/cli/modules/login.py +++ b/python/fedml/cli/modules/login.py @@ -4,6 +4,8 @@ import fedml.api from fedml.api.modules.utils import authenticate +from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants @click.command("login", help="Login the FedML® Nexus AI Platform") @@ -51,9 +53,25 @@ default=80, help="The port for local on-premise Nexus AI Platform.", ) +@click.option( + "--master_inference_gateway_port", + "-mgp", + type=int, + default=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT, + help="The port for master inference gateway.", +) +@click.option( + "--worker_inference_proxy_port", + "-wpp", + type=int, + default=ClientConstants.LOCAL_CLIENT_API_PORT, + help="The port for worker inference proxy.", +) def fedml_login( api_key, version, compute_node, server, provider, deploy_worker_num, - local_on_premise_platform, local_on_premise_platform_port): + local_on_premise_platform, local_on_premise_platform_port, + master_inference_gateway_port, worker_inference_proxy_port +): fedml.set_env_version(version) fedml.set_local_on_premise_platform_host(local_on_premise_platform) fedml.set_local_on_premise_platform_port(local_on_premise_platform_port) @@ -66,4 +84,4 @@ def fedml_login( print(f"Maybe you are using account id to login, we will try to login with account {api_key}.") pass os.environ["FEDML_MODEL_WORKER_NUM"] = str(deploy_worker_num) - fedml.api.login(api_key, compute_node, server, provider) + fedml.api.login(api_key, compute_node, server, provider, master_inference_gateway_port, worker_inference_proxy_port) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index f1e7dea91f..fdcbdf0a34 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -458,6 +458,14 @@ def get_public_ip(): logging.info("Failed to get public ip: {}".format(e)) return ip + @staticmethod + def get_inference_worker_proxy_port() -> int: + # Use dotenv to load the environment variables + fedml.load_env() + worker_proxy_port = int(os.getenv(ClientConstants.ENV_CLIENT_PROXY_PORT_KEY, + default=ClientConstants.LOCAL_CLIENT_API_PORT)) + return worker_proxy_port + @staticmethod def check_process_is_running(process_id): 
for proc in psutil.process_iter(): diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py index a868d03b41..a5048c26a6 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py @@ -349,6 +349,15 @@ def get_runner_infos(): logging.error(f"Failed to parse runner info: {e}") return runner_info + @staticmethod + def get_inference_master_gateway_port(): + # Use dotenv to load the environment variables + fedml.load_env() + master_inference_port = int(os.getenv(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, + default=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)) + return master_inference_port + + @staticmethod def save_runner_infos(unique_device_id, edge_id, run_id=None): local_pkg_data_dir = ServerConstants.get_data_dir() diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index 50d902b933..eff26684b7 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -115,7 +115,7 @@ def run_impl( message_center=self.message_center) # start unified inference gateway process if not started - FedMLDeployMasterJobRunner.start_device_inference_gateway(inference_port=inference_port) + FedMLDeployMasterJobRunner.start_device_inference_gateway() # start inference monitor process FedMLDeployMasterJobRunner.stop_device_inference_monitor( @@ -144,8 +144,7 @@ def run_impl( # No device is added, updated or removed logging.info("No device is added, updated or removed. 
No action needed for reconciliation.") ip = GeneralConstants.get_ip_address(self.request_json) - master_port = os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) + master_port = ServerConstants.get_inference_master_gateway_port() if master_port is not None: inference_port = int(master_port) model_inference_port = inference_port @@ -300,8 +299,7 @@ def process_deployment_result_message(self, topic=None, payload=None): else: # This is the last worker that failed, so we should continue to "ABORTED" status model_config_parameters = self.request_json["parameters"] - inference_port_external = os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) + inference_port_external = ServerConstants.get_inference_master_gateway_port() ip = GeneralConstants.get_ip_address(self.request_json) if ip.startswith("http://") or ip.startswith("https://"): model_inference_url = "{}/inference/{}".format(ip, end_point_id) @@ -369,8 +367,7 @@ def process_deployment_result_message(self, topic=None, payload=None): """ When all the devices have finished the add / delete / update operation """ - inference_port_external = os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) + inference_port_external = ServerConstants.get_inference_master_gateway_port() ip = GeneralConstants.get_ip_address(request_json) if ip.startswith("http://") or ip.startswith("https://"): @@ -453,12 +450,10 @@ def cleanup_runner_process(self, run_id): ServerConstants.cleanup_run_process(run_id, not_kill_subprocess=True) @staticmethod - def start_device_inference_gateway(inference_port=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT): + def start_device_inference_gateway(): # start unified inference server python_program = get_python_program() - master_port_frm_env = os.getenv(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, None) - if master_port_frm_env is not None: - inference_port = int(master_port_frm_env) + inference_port = ServerConstants.get_inference_master_gateway_port() if not ServerConstants.is_running_on_k8s(): logging.info(f"start the model inference gateway...") inference_gw_cmd = "fedml.computing.scheduler.model_scheduler.device_model_inference:api" @@ -539,7 +534,7 @@ def recover_inference_and_monitor(): if not is_activated: continue - FedMLDeployMasterJobRunner.start_device_inference_gateway(inference_port=inference_port) + FedMLDeployMasterJobRunner.start_device_inference_gateway() FedMLDeployMasterJobRunner.stop_device_inference_monitor( run_id, end_point_name, model_id, model_name, model_version) @@ -753,8 +748,7 @@ def parse_model_run_params(running_json): model_version = model_config["model_version"] model_config_parameters = running_json.get("parameters", {}) - inference_port = int(os.environ.get(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)) + inference_port = ServerConstants.get_inference_master_gateway_port() return run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py index ee59f87441..cdfa43c33b 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py +++ 
b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py @@ -80,8 +80,7 @@ def _init_extra_items(self): client_api_cmd = "fedml.computing.scheduler.model_scheduler.device_client_api:api" client_api_pids = RunProcessUtils.get_pid_from_cmd_line(client_api_cmd) - worker_proxy_port = ClientConstants.LOCAL_CLIENT_API_PORT - worker_proxy_port_frm_env = os.environ.get(ClientConstants.ENV_CLIENT_PROXY_PORT_KEY, None) + worker_proxy_port = ClientConstants.get_inference_worker_proxy_port() if worker_proxy_port_frm_env is not None: worker_proxy_port = int(worker_proxy_port_frm_env) From 9c8ce99c41e6bf8df8f38fe88a6f782141d3a19e Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Wed, 12 Jun 2024 15:33:47 -0700 Subject: [PATCH 12/38] [Deploy] Nit. --- .../scheduler/model_scheduler/worker_protocol_manager.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py index cdfa43c33b..b1d0bebc47 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py @@ -81,8 +81,6 @@ def _init_extra_items(self): client_api_pids = RunProcessUtils.get_pid_from_cmd_line(client_api_cmd) worker_proxy_port = ClientConstants.get_inference_worker_proxy_port() - if worker_proxy_port_frm_env is not None: - worker_proxy_port = int(worker_proxy_port_frm_env) if client_api_pids is None or len(client_api_pids) <= 0: # Start local API services From 505103f9f05106712de4ea7078441526ee33b9f7 Mon Sep 17 00:00:00 2001 From: bhargav191098 Date: Thu, 13 Jun 2024 17:08:51 -0700 Subject: [PATCH 13/38] removing zip from upload --- python/fedml/api/modules/storage.py | 45 ++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/python/fedml/api/modules/storage.py b/python/fedml/api/modules/storage.py index e7d492c999..a928b325b2 100644 --- a/python/fedml/api/modules/storage.py +++ b/python/fedml/api/modules/storage.py @@ -38,27 +38,42 @@ def upload(data_path, api_key, name, description, tag_list, service, show_progre if user_id is None: return FedMLResponse(code=ResponseCode.FAILURE, message=message) + + data_type = _get_data_type(data_path) - if(not _check_data_path(data_path)): + if(data_type == "invalid"): return FedMLResponse(code=ResponseCode.FAILURE,message="Invalid data path") - archive_path, message = _archive_data(data_path) - if not archive_path: + if(data_type == "dir"): + to_upload_path, message = _archive_data(data_path) + name = os.path.splitext(os.path.basename(to_upload_path))[0] if name is None else name + file_name = name + ".zip" + else: + to_upload_path = data_path + base_name = os.path.basename(to_upload_path) + given_extension = os.path.splitext(name)[1] + if given_extension is None or given_extension == "": + given_extension = os.path.splitext(base_name)[1] + name = base_name if name is None else name + given_extension + file_name = name + + if not to_upload_path: return FedMLResponse(code=ResponseCode.FAILURE, message=message) - name = os.path.splitext(os.path.basename(archive_path))[0] if name is None else name - file_name = name + ".zip" + dest_path = os.path.join(user_id, file_name) - file_size = os.path.getsize(archive_path) + file_size = os.path.getsize(to_upload_path) - file_uploaded_url, message = _upload_multipart(api_key, file_name, archive_path, show_progress, + file_uploaded_url, message = 
_upload_multipart(api_key, file_name, to_upload_path, show_progress, out_progress_to_err, progress_desc, metadata) - - os.remove(archive_path) + if(data_type == "dir"): + os.remove(to_upload_path) if not file_uploaded_url: - return FedMLResponse(code=ResponseCode.FAILURE, message=f"Failed to upload file: {archive_path}") + return FedMLResponse(code=ResponseCode.FAILURE, message=f"Failed to upload file: {to_upload_path}") + + print("url: ",file_uploaded_url) json_data = { "datasetName": name, @@ -438,10 +453,12 @@ def _get_storage_service(service): else: raise NotImplementedError(f"Service {service} not implemented") -def _check_data_path(data_path): - if os.path.isdir(data_path) or os.path.isfile(data_path): - return True - return False +def _get_data_type(data_path): + if os.path.isdir(data_path): + return "dir" + elif os.path.isfile(data_path): + return "file" + return "invalid" def _archive_data(data_path: str) -> (str, str): From 03c58a2a42d8b43b3adf6331b38e38de92cc69d2 Mon Sep 17 00:00:00 2001 From: bhargav191098 Date: Thu, 13 Jun 2024 17:32:36 -0700 Subject: [PATCH 14/38] changes in the download to support files --- python/fedml/api/modules/storage.py | 23 ++++++++++++++++++----- python/fedml/cli/modules/storage.py | 2 +- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/python/fedml/api/modules/storage.py b/python/fedml/api/modules/storage.py index a928b325b2..94031c163e 100644 --- a/python/fedml/api/modules/storage.py +++ b/python/fedml/api/modules/storage.py @@ -110,13 +110,26 @@ def download(data_name, api_key, service, dest_path, show_progress=True) -> FedM logging.error(error_message) return FedMLResponse(code=ResponseCode.FAILURE, message=error_message) download_url = metadata.download_url - zip_file_name = data_name + ".zip" - path_local = os.path.abspath(zip_file_name) + given_extension = os.path.splitext(data_name)[1] + is_file = True + if(given_extension is None or given_extension ==""): + is_file = False + + if not is_file: + download_file_name = data_name + ".zip" + else: + download_file_name = data_name + path_local = os.path.abspath(download_file_name) dest_path = os.path.abspath(dest_path) if dest_path else data_name - if _download_using_presigned_url(download_url, zip_file_name, show_progress=show_progress): + if _download_using_presigned_url(download_url, download_file_name, show_progress=show_progress): try: - shutil.unpack_archive(path_local, dest_path) - os.remove(path_local) + if not is_file: + shutil.unpack_archive(path_local, dest_path) + os.remove(path_local) + else: + if not os.path.exists(dest_path): + os.makedirs(dest_path) + shutil.move(path_local,dest_path) abs_dest_path = os.path.abspath(dest_path) return FedMLResponse(code=ResponseCode.SUCCESS, message=f"Successfully downloaded and unzipped data at " f"{abs_dest_path}", data=abs_dest_path) diff --git a/python/fedml/cli/modules/storage.py b/python/fedml/cli/modules/storage.py index af75cda85f..7e060fc12e 100644 --- a/python/fedml/cli/modules/storage.py +++ b/python/fedml/cli/modules/storage.py @@ -47,7 +47,7 @@ def validate_argument(ctx, param, value): @click.help_option("--help", "-h") @click.argument("data_path", nargs=1, callback=validate_argument) @click.option("--name", "-n", type=str, help="Name your data to store. If not provided, the name will be the same as " - "the data file or directory name.") + "the data file or directory name. For files, extension need not be mentioned!") @click.option("--description", "-d", type=str, help="Add description to your data to store. 
If not provided, " "the description will be empty.") @click.option("--user_metadata", "-um", type=str, help="User-defined metadata in the form of a dictionary, for instance, " From cb7da7009f13fdf0191ba7710fdb0b100d90796f Mon Sep 17 00:00:00 2001 From: bhargav191098 Date: Thu, 13 Jun 2024 17:38:05 -0700 Subject: [PATCH 15/38] print statement removal --- python/fedml/api/modules/storage.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/fedml/api/modules/storage.py b/python/fedml/api/modules/storage.py index 94031c163e..2d10ff2588 100644 --- a/python/fedml/api/modules/storage.py +++ b/python/fedml/api/modules/storage.py @@ -73,8 +73,6 @@ def upload(data_path, api_key, name, description, tag_list, service, show_progre if not file_uploaded_url: return FedMLResponse(code=ResponseCode.FAILURE, message=f"Failed to upload file: {to_upload_path}") - print("url: ",file_uploaded_url) - json_data = { "datasetName": name, "description": description, From 394906ecf03fe2e221bfba4a7a46c87105d26a35 Mon Sep 17 00:00:00 2001 From: bhargav191098 Date: Fri, 14 Jun 2024 12:33:23 -0700 Subject: [PATCH 16/38] name issue --- python/fedml/api/modules/storage.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/fedml/api/modules/storage.py b/python/fedml/api/modules/storage.py index 2d10ff2588..3e4219775d 100644 --- a/python/fedml/api/modules/storage.py +++ b/python/fedml/api/modules/storage.py @@ -51,10 +51,15 @@ def upload(data_path, api_key, name, description, tag_list, service, show_progre else: to_upload_path = data_path base_name = os.path.basename(to_upload_path) - given_extension = os.path.splitext(name)[1] - if given_extension is None or given_extension == "": - given_extension = os.path.splitext(base_name)[1] - name = base_name if name is None else name + given_extension + file_extension = os.path.splitext(base_name)[1] + given_extension = None + if name is not None: + given_extension = os.path.splitext(name)[1] + if given_extension is None or given_extension == "": + name = name + file_extension + else: + name = base_name + file_name = name if not to_upload_path: From 2170797de1235e78f9a92722b495cb01af8d92c2 Mon Sep 17 00:00:00 2001 From: bhargav191098 Date: Fri, 14 Jun 2024 18:13:08 -0700 Subject: [PATCH 17/38] \Adding Enum for data type --- python/fedml/api/modules/storage.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/fedml/api/modules/storage.py b/python/fedml/api/modules/storage.py index 3e4219775d..0729c09edc 100644 --- a/python/fedml/api/modules/storage.py +++ b/python/fedml/api/modules/storage.py @@ -4,6 +4,7 @@ import requests import math +from enum import Enum, unique import requests.exceptions import tqdm @@ -26,6 +27,10 @@ def __init__(self, data: dict): self.tag_list = data.get("tags", None) self.download_url = data.get("fileUrl", None) +class DataType(Enum): + FILE = "file" + DIRECTORY = "directory" + INVALID = "invalid" # Todo (alaydshah): Store service name in metadata # Todo (alaydshah): If data already exists, don't upload again. 
Instead suggest to use update command @@ -41,10 +46,10 @@ def upload(data_path, api_key, name, description, tag_list, service, show_progre data_type = _get_data_type(data_path) - if(data_type == "invalid"): + if(data_type == DataType.INVALID): return FedMLResponse(code=ResponseCode.FAILURE,message="Invalid data path") - if(data_type == "dir"): + if(data_type == DataType.DIRECTORY): to_upload_path, message = _archive_data(data_path) name = os.path.splitext(os.path.basename(to_upload_path))[0] if name is None else name file_name = name + ".zip" @@ -471,10 +476,10 @@ def _get_storage_service(service): def _get_data_type(data_path): if os.path.isdir(data_path): - return "dir" + return DataType.DIRECTORY elif os.path.isfile(data_path): - return "file" - return "invalid" + return DataType.FILE + return DataType.INVALID def _archive_data(data_path: str) -> (str, str): From 5fb5ed43d42f54b0c47e9a0ae802bcab29197052 Mon Sep 17 00:00:00 2001 From: bhargav191098 Date: Fri, 14 Jun 2024 18:32:05 -0700 Subject: [PATCH 18/38] adding user_id to bucket path --- python/fedml/api/modules/storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/fedml/api/modules/storage.py b/python/fedml/api/modules/storage.py index 0729c09edc..33e781be08 100644 --- a/python/fedml/api/modules/storage.py +++ b/python/fedml/api/modules/storage.py @@ -70,11 +70,11 @@ def upload(data_path, api_key, name, description, tag_list, service, show_progre if not to_upload_path: return FedMLResponse(code=ResponseCode.FAILURE, message=message) - + #TODO(bhargav191098) - Better done on the backend. Remove and pass file_name once completed on backend. dest_path = os.path.join(user_id, file_name) file_size = os.path.getsize(to_upload_path) - file_uploaded_url, message = _upload_multipart(api_key, file_name, to_upload_path, show_progress, + file_uploaded_url, message = _upload_multipart(api_key, dest_path, to_upload_path, show_progress, out_progress_to_err, progress_desc, metadata) From aecafb80f9d6731b6b15e4cfca7b15035b82cf84 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Mon, 17 Jun 2024 14:39:16 -0700 Subject: [PATCH 19/38] Fix compatibility by limiting numpy latest version. 
--- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index 9651465d32..4757c10a17 100644 --- a/python/setup.py +++ b/python/setup.py @@ -40,7 +40,7 @@ def finalize_options(self): 'multiprocess', 'networkx<3.0', 'ntplib', - 'numpy>=1.21', + 'numpy<2.0.0', 'onnx', 'paho-mqtt<2.0.0', 'pandas', From 89219fb3c20972ff94badc76f8e90d71592e5647 Mon Sep 17 00:00:00 2001 From: alaydshah Date: Tue, 18 Jun 2024 07:00:43 +0000 Subject: [PATCH 20/38] Workaround device mapping inconsistency --- .../comm_utils/gpu_utils/gpu_utils.py | 1 + .../comm_utils/gpu_utils/qualcomm_utils.py | 36 +++++++++++++------ .../scheduler/comm_utils/hardware_utils.py | 2 ++ .../scheduler_core/account_manager.py | 2 +- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py b/python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py index bc7a3b8216..b48a3e85b7 100644 --- a/python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py @@ -27,6 +27,7 @@ class GPUCard: memoryUsed: float memoryUtil: float load: Optional[float] = 0.0 + device_path: Optional[str] = "" uuid: Optional[str] = "" display_mode: Optional[str] = "" display_active: Optional[str] = "" diff --git a/python/fedml/computing/scheduler/comm_utils/gpu_utils/qualcomm_utils.py b/python/fedml/computing/scheduler/comm_utils/gpu_utils/qualcomm_utils.py index 88114cf2ad..13131e362d 100644 --- a/python/fedml/computing/scheduler/comm_utils/gpu_utils/qualcomm_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/gpu_utils/qualcomm_utils.py @@ -26,19 +26,22 @@ def detect_gpu_card_type(cls) -> Optional[GPUCardType]: @staticmethod def get_gpu_cards() -> List[GPUCard]: - from qaicrt import Util, QIDList, QDevInfo, QStatus + return list(QualcommNPUtil.__get_gpu_cards().values()) - cards = [] + @staticmethod + def __get_gpu_cards() -> Dict[int, GPUCard]: + from qaicrt import Util, QIDList, QDevInfo, QStatus + cards = dict() util = Util() status, card_list = util.getDeviceIds() if status.value == 0: for card in card_list: status, card_info = util.getDeviceInfo(card) if status.value == 0 and card_info.devStatus.value == 1: - cards.append(QualcommNPUtil.__convert(card_info)) - + gpu_card = QualcommNPUtil.__convert(card_info) + cards[gpu_card.id] = gpu_card else: - logging.error("Qualcomm Card Status not Healthy") + logging.error("Qualcomm Cards Status not Healthy") return cards @staticmethod @@ -58,11 +61,21 @@ def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memo @staticmethod def get_docker_gpu_device_mapping(gpu_ids: Optional[List[int]], num_gpus: int = 0) -> Optional[Dict]: - if gpu_ids is not None and len(gpu_ids): - return { - "devices": [f"{QualcommNPUtil.NPU_CARD_PATH}{gpu_id}:{QualcommNPUtil.NPU_CARD_PATH}{gpu_id}" for gpu_id - in gpu_ids]} - return None + if gpu_ids is None or not len(gpu_ids): + return None + + devices = [] + gpu_cards = QualcommNPUtil.__get_gpu_cards() + + for gpu_id in gpu_ids: + if not (gpu_id in gpu_cards and gpu_cards[gpu_id].device_path): + logging.error("Failed to get gpu device mapping for docker") + break + else: + device_path = gpu_cards[gpu_id].device_path + devices.append(f"{device_path}:{device_path}") + + return {"devices": devices} if len(devices) == len(gpu_ids) else None @staticmethod def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> 
List[int]: @@ -87,7 +100,8 @@ def __convert(npu) -> GPUCard: load = (nsp_total - nsp_free) / nsp_total return GPUCard( - id=npu.qid, + id=npu.mhiId, + device_path=npu.name, name=npu.pciInfo.devicename, driver=npu.devData.fwQCImageVersionString, serial=npu.devData.serial, diff --git a/python/fedml/computing/scheduler/comm_utils/hardware_utils.py b/python/fedml/computing/scheduler/comm_utils/hardware_utils.py index e73809955e..c876948145 100644 --- a/python/fedml/computing/scheduler/comm_utils/hardware_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/hardware_utils.py @@ -60,5 +60,7 @@ def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: Doc if __name__ == "__main__": gpus = HardwareUtil.get_gpus() get_available_gpu_cards = HardwareUtil.get_available_gpu_ids(limit=len(gpus)) + device_mapping = HardwareUtil.get_docker_gpu_device_mapping(get_available_gpu_cards, len(get_available_gpu_cards)) print(gpus) print(get_available_gpu_cards) + print(device_mapping) diff --git a/python/fedml/computing/scheduler/scheduler_core/account_manager.py b/python/fedml/computing/scheduler/scheduler_core/account_manager.py index 3491e102f6..3b80511d12 100755 --- a/python/fedml/computing/scheduler/scheduler_core/account_manager.py +++ b/python/fedml/computing/scheduler/scheduler_core/account_manager.py @@ -266,7 +266,7 @@ def get_uuid(): if not use_machine_id: device_id = hex(uuid.getnode()) else: - device_id = device_id = FedMLAccountManager.get_gpu_machine_id() + device_id = FedMLAccountManager.get_gpu_machine_id() else: device_id = sys_utils.run_subprocess_open( "hal-get-property --udi /org/freedesktop/Hal/devices/computer --key system.hardware.uuid".split() From 1d5a05db71ba3943cb42eea0836fabc181af7ac6 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 18 Jun 2024 18:23:40 -0700 Subject: [PATCH 21/38] [Deploy][Autoscale] Bug fix: continue the for loop if no scale op. --- python/fedml/computing/scheduler/comm_utils/job_monitor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/fedml/computing/scheduler/comm_utils/job_monitor.py b/python/fedml/computing/scheduler/comm_utils/job_monitor.py index a7d5214a02..97a4cb6ebc 100644 --- a/python/fedml/computing/scheduler/comm_utils/job_monitor.py +++ b/python/fedml/computing/scheduler/comm_utils/job_monitor.py @@ -148,7 +148,7 @@ def autoscaler_reconcile_after_interval(self): if current_replicas == new_replicas: # Basically the autoscaler decided that no scaling operation should take place. logging.info(f"No scaling operation for endpoint {e_id}.") - return + continue # Should scale in / out curr_version = fedml.get_env_version() @@ -159,7 +159,7 @@ def autoscaler_reconcile_after_interval(self): mlops_prefix = "https://open-test.fedml.ai/" else: logging.error(f"Do not support the version {curr_version}.") - return + continue autoscale_url_path = "fedmlModelServer/api/v1/endpoint/auto-scale" url = f"{mlops_prefix}{autoscale_url_path}" @@ -167,7 +167,7 @@ def autoscaler_reconcile_after_interval(self): cached_token = fedml_model_cache.get_end_point_token(e_id, e_name, model_name) if cached_token is None: logging.error(f"Failed to get the cached token for endpoint {e_id}.") - return + continue req_header = { "Authorization": f"Bearer {cached_token}" From 31c57e01d426a82127fd3cff2ae45ee36f6bbe14 Mon Sep 17 00:00:00 2001 From: fedml-dimitris Date: Tue, 18 Jun 2024 21:32:41 -0400 Subject: [PATCH 22/38] Polishing the autoscaler real test. 
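The real test now drives the autoscaler with a ConcurrentQueryPolicy instead of the latency/qps ReactivePolicy variants, so the per-endpoint settings loop and the --metric flag are no longer needed. The sketch below is only a rough illustration of how a concurrent-query policy can be reasoned about, not the implementation in autoscaler/policies.py; the observed load value is hypothetical and the limits mirror the autoscaling_policy_config used in the updated test:

    import math

    # Illustrative sizing rule (assumption): keep at most `queries_per_replica`
    # concurrent queries per replica over the observation window, clamped to
    # [min_replicas, max_replicas].
    observed_concurrent_queries = 5                   # hypothetical measurement
    queries_per_replica, min_replicas, max_replicas = 2, 1, 3

    target_replicas = math.ceil(observed_concurrent_queries / queries_per_replica)
    target_replicas = max(min_replicas, min(max_replicas, target_replicas))  # -> 3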
--- .../test/scaling_algorithm_real_test.py | 64 +++++-------------- 1 file changed, 15 insertions(+), 49 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py b/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py index 34721d9002..0fae77c3f3 100644 --- a/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py +++ b/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py @@ -2,9 +2,10 @@ import logging from collections import namedtuple -from fedml.computing.scheduler.model_scheduler.autoscaler.autoscaler import Autoscaler, ReactivePolicy +from fedml.computing.scheduler.model_scheduler.autoscaler.autoscaler import Autoscaler from fedml.core.mlops.mlops_runtime_log import MLOpsRuntimeLog from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache +from fedml.computing.scheduler.model_scheduler.autoscaler.policies import ConcurrentQueryPolicy if __name__ == "__main__": @@ -18,9 +19,6 @@ parser.add_argument('--redis_addr', default="local") parser.add_argument('--redis_port', default=6379) parser.add_argument('--redis_password', default="fedml_default") - parser.add_argument('--metric', - default="latency", - help="Either latency or qps") args = parser.parse_args() fedml_model_cache = FedMLModelCache.get_instance() @@ -32,50 +30,18 @@ # Init the autoscaler autoscaler = Autoscaler(args.redis_addr, args.redis_port, args.redis_password) - latency_reactive_policy_default = { - "metric": "latency", - "ewm_mins": 15, - "ewm_alpha": 0.5, - "ub_threshold": 0.5, - "lb_threshold": 0.99, - "triggering_value": 1.6561916828471053 + autoscaling_policy_config = { + "current_replicas": 1, + "min_replicas": 1, + "max_replicas": 3, + "queries_per_replica": 2, + "window_size_secs": 60, + "scaledown_delay_secs": 120, } - qps_reactive_policy_default = { - "metric": "qps", - "ewm_mins": 15, - "ewm_alpha": 0.5, - "ub_threshold": 2, - "lb_threshold": 0.5 - } - policy_config = latency_reactive_policy_default \ - if args.metric == "latency" else qps_reactive_policy_default - autoscaling_policy = ReactivePolicy(**policy_config) - - for endpoint_settings in endpoints_settings_list: - endpoint_state = endpoint_settings["state"] - if endpoint_state == "DEPLOYED" and endpoint_settings["enable_auto_scaling"]: - - e_id, e_name, model_name = \ - endpoint_settings["endpoint_id"], \ - endpoint_settings["endpoint_name"], \ - endpoint_settings["model_name"] - logging.info(f"Querying the autoscaler for endpoint {e_id} with user settings {endpoint_settings}.") - - # For every endpoint we just update the policy configuration. - autoscaling_policy.min_replicas = endpoint_settings["scale_min"] - autoscaling_policy.max_replicas = endpoint_settings["scale_max"] - # We retrieve a list of replicas for every endpoint. The number - # of running replicas is the length of that list. 
- current_replicas = len(fedml_model_cache.get_endpoint_replicas_results(e_id)) - autoscaling_policy.current_replicas = current_replicas - logging.info(f"Endpoint {e_id} autoscaling policy: {autoscaling_policy}.") - - scale_op = autoscaler.scale_operation_endpoint( - autoscaling_policy, - str(e_id)) - - new_replicas = current_replicas + scale_op.value + autoscaling_policy = ConcurrentQueryPolicy(**autoscaling_policy_config) - logging.info(f"Scaling operation {scale_op.value} for endpoint {e_id} .") - logging.info(f"New Replicas {new_replicas} for endpoint {e_id} .") - logging.info(f"Current Replicas {current_replicas} for endpoint {e_id} .") + e_id = 1821952311 + scale_op = autoscaler.scale_operation_endpoint( + autoscaling_policy, + str(e_id)) + logging.info(f"Scaling operation {scale_op.value} for endpoint {e_id} .") From 4cb53fe55f5a4e748af0daaf27eb53773cdde2d6 Mon Sep 17 00:00:00 2001 From: fedml-dimitris Date: Tue, 18 Jun 2024 21:37:12 -0400 Subject: [PATCH 23/38] Replacing e_id. --- .../autoscaler/test/scaling_algorithm_real_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py b/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py index 0fae77c3f3..78a1231abf 100644 --- a/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py +++ b/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py @@ -40,7 +40,8 @@ } autoscaling_policy = ConcurrentQueryPolicy(**autoscaling_policy_config) - e_id = 1821952311 + # Please replace the `e_id` below with a proper e_id value. + e_id = 1111 scale_op = autoscaler.scale_operation_endpoint( autoscaling_policy, str(e_id)) From 31b7ae05772060e589d65b7b07788366d2b6eb4a Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Thu, 20 Jun 2024 00:26:14 +0000 Subject: [PATCH 24/38] [Deploy] Hotfix: job runner context lost when logout. 
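After a logout or restart, the master protocol manager can receive a delete-deployment request for a run whose job runner is no longer present in self.job_runners, so the request used to be dropped silently. The fix re-instantiates the runner on demand before forwarding the request. A minimal standalone sketch of the pattern; the class and factory names below are illustrative, the real code lives in FedMLDeployJobRunnerManager:

    class RunnerManagerSketch:
        # Illustrative only: lazily rebuild a lost job-runner context.
        def __init__(self, runner_factory):
            self.job_runners = {}
            self.runner_factory = runner_factory

        def send_deployment_delete_request_to_edges(self, run_id, payload, args=None):
            run_id_str = str(run_id)
            runner = self.job_runners.get(run_id_str)
            if runner is None:
                # Runner context was lost (e.g. after logout); rebuild it so
                # the delete request still reaches the edge devices.
                runner = self.runner_factory(args)
                self.job_runners[run_id_str] = runner
            runner.send_deployment_delete_request_to_edges(payload)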
--- .../model_scheduler/master_job_runner_manager.py | 9 ++++++++- .../scheduler/model_scheduler/master_protocol_manager.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py index c761cd6d8f..0c674cb5f0 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py @@ -42,11 +42,18 @@ def send_deployment_stages( message_center=message_center ) - def send_deployment_delete_request_to_edges(self, end_point_id, payload, model_msg_object, message_center=None): + def send_deployment_delete_request_to_edges(self, end_point_id, payload, model_msg_object, message_center=None, + args=None): run_id_str = str(end_point_id) if self.job_runners.get(run_id_str, None) is not None: self.job_runners[run_id_str].send_deployment_delete_request_to_edges( payload, model_msg_object, message_center=message_center) + else: + # Hotfix: re-instantiate the job runner + # TODO(Alay, Raphael): Try to dig into whether re-instantiate the job runner is necessary + self.job_runners[run_id_str] = self._generate_job_runner_instance(args) + self.job_runners[run_id_str].send_deployment_delete_request_to_edges( + payload, model_msg_object, message_center=message_center) def stop_device_inference_monitor(self, run_id, end_point_name, model_id, model_name, model_version): run_id_str = str(run_id) diff --git a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py index 668d1192ce..7bfad2f3eb 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py @@ -132,7 +132,7 @@ def callback_delete_deployment(self, topic, payload): # Send delete deployment request to the edge devices FedMLDeployJobRunnerManager.get_instance().send_deployment_delete_request_to_edges( - model_msg_object.run_id, payload, model_msg_object, message_center=self.message_center) + model_msg_object.run_id, payload, model_msg_object, message_center=self.message_center, args=self.args) # Stop processes on master FedMLDeployJobRunnerManager.get_instance().stop_job_runner(model_msg_object.run_id) From 7ccf195113d4c5dccfe3fabc3aae9fea71f33e0d Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Sat, 15 Jun 2024 00:17:04 +0000 Subject: [PATCH 25/38] [Deploy] Support arbitrary container image onboarding. 
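A deployment can now point at any prebuilt image: inference_image_name plus container_run_command replace the FedML-format entry_point/job/bootstrap flow, environment_variables from the YAML are passed through to the container, and readiness_probe / liveness_probe describe how the replica should be health-checked. With a custom readiness path, the check polls http://<host>:<port>/<path> instead of the default /ready. A minimal sketch of that probe, reusing the httpx client pattern from FedMLHttpInference; probe_ready and its defaults are illustrative, not part of the patch:

    import asyncio
    import httpx

    async def probe_ready(host: str, port: int, path: str = "health") -> bool:
        # GET http://<host>:<port>/<path>; HTTP 200 means the replica is ready,
        # anything else (or a timeout) means keep waiting.
        url = f"http://{host}:{port}/{path}"
        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(url, timeout=5)
            return response.status_code == 200
        except Exception:
            return False

    if __name__ == "__main__":
        # For the lorax example config in this patch (port 80, readiness path "health"):
        print(asyncio.run(probe_ready("127.0.0.1", 80)))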
--- .../custom_inference_image.yaml | 19 +- .../custom_inference_image/serve_main.py | 16 -- .../scheduler/comm_utils/job_monitor.py | 27 +- .../device_client_constants.py | 4 + .../device_http_inference_protocol.py | 8 +- .../device_model_deployment.py | 265 +++++++++--------- .../model_scheduler/device_model_inference.py | 56 ++-- .../model_scheduler/worker_job_runner.py | 16 -- python/fedml/core/mlops/mlops_device_perfs.py | 2 +- 9 files changed, 201 insertions(+), 212 deletions(-) delete mode 100644 python/examples/deploy/custom_inference_image/serve_main.py diff --git a/python/examples/deploy/custom_inference_image/custom_inference_image.yaml b/python/examples/deploy/custom_inference_image/custom_inference_image.yaml index 0c62767b40..467c7c48b0 100644 --- a/python/examples/deploy/custom_inference_image/custom_inference_image.yaml +++ b/python/examples/deploy/custom_inference_image/custom_inference_image.yaml @@ -1,13 +1,14 @@ workspace: "./" -job: | - echo "Start serving..." - python3 serve_main.py -bootstrap: | - echo "Bootstrap start..." - echo "Bootstrap finished!" +inference_image_name: "ghcr.io/predibase/lorax:main" +container_run_command: "--model-id mistralai/Mistral-7B-Instruct-v0.1" -enable_custom_image: true -inference_image_name: "fedml/fedml-default-inference-backend" -deploy_timeout: 1000 +environment_variables: + HUGGING_FACE_HUB_TOKEN: "" +readiness_probe: + path: "health" + +port: 80 + +deploy_timeout: 1600 diff --git a/python/examples/deploy/custom_inference_image/serve_main.py b/python/examples/deploy/custom_inference_image/serve_main.py deleted file mode 100644 index a7a1dd84f3..0000000000 --- a/python/examples/deploy/custom_inference_image/serve_main.py +++ /dev/null @@ -1,16 +0,0 @@ -from fedml.serving import FedMLPredictor -from fedml.serving import FedMLInferenceRunner - - -class DummyPredictor(FedMLPredictor): - def __init__(self): - super().__init__() - - def predict(self, request): - return {"Aloha": request} - - -if __name__ == "__main__": - predictor = DummyPredictor() - fedml_inference_runner = FedMLInferenceRunner(predictor) - fedml_inference_runner.run() \ No newline at end of file diff --git a/python/fedml/computing/scheduler/comm_utils/job_monitor.py b/python/fedml/computing/scheduler/comm_utils/job_monitor.py index 97a4cb6ebc..d216b46dad 100644 --- a/python/fedml/computing/scheduler/comm_utils/job_monitor.py +++ b/python/fedml/computing/scheduler/comm_utils/job_monitor.py @@ -40,6 +40,7 @@ from fedml.core.mlops.mlops_runtime_log import MLOpsRuntimeLog from fedml.core.mlops.mlops_utils import MLOpsLoggingUtils from fedml.core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants from ..scheduler_core.endpoint_sync_protocol import FedMLEndpointSyncProtocol from ..model_scheduler.device_server_constants import ServerConstants @@ -758,9 +759,8 @@ def monitor_slave_endpoint_status(self): except Exception as e: pass - def _lenient_check_replica_ready( - self, deployment_result - ): + @staticmethod + def _lenient_check_replica_ready(deployment_result): """ Double-check the replica's liveness using /ready api: if 200 -> return True @@ -769,8 +769,27 @@ def _lenient_check_replica_ready( """ result_json = deployment_result inference_url = result_json.get("model_url", None) + liveliness_check = result_json.get("model_metadata", {}).get("liveliness_check", None) + readiness_check = result_json.get("model_metadata", {}).get("readiness_check", None) + + if 
liveliness_check is not None: + if liveliness_check == ClientConstants.LIVENESS_PROBE_DEFAULT: + liveliness_check = readiness_check # Follow the readiness check pattern + if not isinstance(liveliness_check, dict): + logging.warning(f"Healthiness check is not a dict. {liveliness_check}") + return True + if "path" not in liveliness_check: + logging.warning(f"Healthiness check does not have path. {liveliness_check}") + return True + response_ok = asyncio.run(FedMLHttpInference.is_inference_ready( + inference_url, timeout=SchedulerConstants.ENDPOINT_INFERENCE_READY_TIMEOUT, + path=liveliness_check["path"])) + if response_ok is None: + # This means the server return 202 + return False + return True - # Make a curl get to inference_url with timeout 5s + # Make a curl get to inference_url/ready with timeout 5s # TODO(Raphael): Also support PROXY and MQTT to check the readiness response_ok = asyncio.run(FedMLHttpInference.is_inference_ready(inference_url, timeout=5)) if response_ok is None: diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index fdcbdf0a34..cd21de2e04 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -143,6 +143,10 @@ class ClientConstants(object): DEVICE_DIFF_DELETE_OPERATION = "op: delete" DEVICE_DIFF_REPLACE_OPERATION = "op: replace" + READINESS_PROBE_DEFAULT = "DEFAULT" + LIVENESS_PROBE_DEFAULT = "DEFAULT" + + LOGIN_MODE_ON_PREMISE_INDEX = 0 LOGIN_MODE_FEDML_CLOUD_INDEX = 1 LOGIN_MODE_PUBLIC_CLOUD_INDEX = 2 diff --git a/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py b/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py index 7e4c06ea5d..41c565d5d8 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py @@ -14,14 +14,14 @@ def __init__(self): pass @staticmethod - async def is_inference_ready(inference_url, timeout=None): - ''' + async def is_inference_ready(inference_url, path="ready", timeout=None): + """ True: inference is ready False: cannot be reached, will try other protocols None: can be reached, but not ready - ''' + """ url_parsed = urlparse(inference_url) - ready_url = f"http://{url_parsed.hostname}:{url_parsed.port}/ready" + ready_url = f"http://{url_parsed.hostname}:{url_parsed.port}/{path}" response_ok = False try: async with httpx.AsyncClient() as client: diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index edd2ebea9a..71f0c8032a 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -68,42 +68,26 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, num_gpus = gpu_per_replica gpu_ids, gpu_attach_cmd = None, "" - # Concatenate the model name + # Concatenate the full model name running_model_name = ClientConstants.get_running_model_name( end_point_name, inference_model_name, model_version, end_point_id, model_id, edge_id=edge_id) - # Parse the model config file and get the necessary information for the deployment + # Parse the model config file model_config_path = 
os.path.join(model_storage_local_path, "fedml_model_config.yaml") with open(model_config_path, 'r') as file: config = yaml.safe_load(file) + inference_type = "default" # Resource related - inference_type = "default" - use_gpu = config.get('use_gpu', True) - num_gpus_frm_yml = config.get('num_gpus', None) - if not use_gpu: - num_gpus = 0 - else: - if num_gpus_frm_yml is not None: - num_gpus = int(num_gpus_frm_yml) - usr_indicated_wait_time = config.get('deploy_timeout', 900) - usr_indicated_retry_cnt = max(int(usr_indicated_wait_time) // 10, 1) - shm_size = config.get('shm_size', None) - storage_opt = config.get('storage_opt', None) - tmpfs = config.get('tmpfs', None) - cpus = config.get('cpus', None) - if cpus is not None: - cpus = int(cpus) - memory = config.get('memory', None) - - inference_image_name = config.get('inference_image_name', - ClientConstants.INFERENCE_SERVER_CUSTOME_IMAGE) - image_pull_policy = config.get('image_pull_policy', SchedulerConstants.IMAGE_PULL_POLICY_IF_NOT_PRESENT) - - # Source code dir, bootstrap dir, data cache dir - src_code_dir = os.path.join(model_storage_local_path, config.get('source_code_dir', "")) + use_gpu, num_gpus, shm_size, storage_opt, tmpfs, cpus, memory, port_inside_container = \ + parse_resource_related_config(config, gpu_per_replica) - # Get the bootstrap and job commands inside the yaml file + # Image related + inference_image_name, image_pull_policy, registry_name, registry_provider, \ + registry_user_name, registry_user_password = parse_image_registry_related_config(config) + + # Bootstrap, job and entrypoint related + dst_model_serving_dir = "/home/fedml/models_serving" bootstrap_cmds_str_frm_yaml = config.get('bootstrap', "") job_cmds_str_frm_yaml = config.get('job', "") @@ -119,36 +103,37 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, else: src_bootstrap_file_path = "" - data_cache_dir_input = config.get('data_cache_dir', "") - request_input_example = config.get('request_input_example', None) - extra_envs = config.get('environment_variables', None) - - # Serving dir inside docker - dst_model_serving_dir = "/home/fedml/models_serving" - relative_entry = config.get('entry_point') if src_bootstrap_file_path != "": dst_bootstrap_dir = os.path.join(dst_model_serving_dir, auto_gen_bootstrap_file_name) else: dst_bootstrap_dir = "" - # If using customized image, then bootstrap + job will be the entry point - enable_custom_image = config.get("enable_custom_image", False) - # inference_type = "custom" - customized_image_entry_cmd = \ - "/bin/bash /home/fedml/models_serving/fedml-deploy-bootstrap-entry-auto-gen.sh" + # If the entry point is in fedml format (e.g., "main.py") + relative_entry_fedml_format = config.get('entry_point', "") + + # User indicate either fedml format python main entry filename or entry command + customized_image_entry_cmd = config.get('container_run_command', None) + customized_readiness_check = config.get('readiness_probe', ClientConstants.READINESS_PROBE_DEFAULT) + customized_liveliness_check = config.get('liveness_probe', ClientConstants.LIVENESS_PROBE_DEFAULT) + + # Storage related + src_code_dir = os.path.join(model_storage_local_path, config.get('source_code_dir', "")) + data_cache_dir_input = config.get('data_cache_dir', "") - docker_registry_user_name = config.get("docker_registry_user_name", "") - docker_registry_user_password = config.get("docker_registry_user_password", "") - docker_registry = config.get("docker_registry", "") + # Others + extra_envs = 
config.get('environment_variables', None) + usr_indicated_wait_time = config.get('deploy_timeout', 900) + usr_indicated_retry_cnt = max(int(usr_indicated_wait_time) // 10, 1) + request_input_example = config.get('request_input_example', None) - port_inside_container = int(config.get("port", 2345)) + # Parameter's check + if inference_engine != ClientConstants.INFERENCE_ENGINE_TYPE_INT_DEFAULT: + raise Exception(f"inference engine {inference_engine} is not supported") - # Request the GPU ids for the deployment + # Request the GPU if num_gpus > 0: gpu_ids, gpu_attach_cmd = request_gpu_ids_on_deployment( edge_id, end_point_id, num_gpus=num_gpus, master_device_id=master_device_id) - - # set replica and their gpu ids FedMLModelCache.get_instance().set_redis_params() FedMLModelCache.get_instance().set_replica_gpu_ids( end_point_id, end_point_name, inference_model_name, edge_id, replica_rank+1, gpu_ids) @@ -159,50 +144,51 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, if not os.path.exists(model_serving_dir): os.makedirs(model_serving_dir, exist_ok=True) - if inference_engine != ClientConstants.INFERENCE_ENGINE_TYPE_INT_DEFAULT: - raise Exception(f"inference engine {inference_engine} is not supported") - - # Get the master device id - logging.info(f"master ip: {master_ip}, worker ip: {infer_host}") + # Determine whether to report public ip or localhost if infer_host == master_ip: logging.info("infer_host is the same as master ip, will use 127.0.0.1 to avoid firewall issue") infer_host = "127.0.0.1" + else: + logging.info("Master and worker are located in different machines, will use the public ip for inference") + # Init container interface client try: client = docker.from_env() - if enable_custom_image and docker_registry_user_name != "" and docker_registry_user_password != "" \ - and docker_registry != "": - client.login(username=docker_registry_user_name, password=docker_registry_user_password, - registry=docker_registry) + if registry_provider == "Docker" and registry_user_name != "" and registry_user_password != "" \ + and registry_name != "": + client.login(username=registry_user_name, password=registry_user_password, + registry=registry_name) except Exception: logging.error("Failed to connect to the docker daemon, please ensure that you have " "installed Docker Desktop or Docker Engine, and the docker is running") return "", "", None, None, None + # Pull the inference image + logging.info(f"Start pulling the inference image {inference_image_name}... with policy {image_pull_policy}") + ContainerUtils.get_instance().pull_image_with_policy(image_pull_policy, inference_image_name) + + # Remove if the container exists container_prefix = ("{}".format(ClientConstants.FEDML_DEFAULT_SERVER_CONTAINER_NAME_PREFIX) + "__" + security_utils.get_content_hash(running_model_name)) - default_server_container_name = container_prefix + "__" + str(replica_rank) - try: exist_container_obj = client.containers.get(default_server_container_name) except docker.errors.NotFound: exist_container_obj = None except docker.errors.APIError: raise Exception("Failed to get the container object") + # Allocate the GPU + # TODO: Make sure no competition for each replica in a single deployment + if exist_container_obj is not None: + client.api.remove_container(exist_container_obj.id, v=True, force=True) - # Pull the inference image - logging.info(f"Start pulling the inference image {inference_image_name}... 
with policy {image_pull_policy}") - ContainerUtils.get_instance().pull_image_with_policy(image_pull_policy, inference_image_name) - + # Build host config volumes = [] binds = {} environment = {} - # data_cache_dir mounting if isinstance(data_cache_dir_input, str): # In this case, we mount to the same folder, if it has ~, we replace it with /home/fedml - src_data_cache_dir, dst_data_cache_dir = "", "" if data_cache_dir_input != "": if data_cache_dir_input[0] == "~": src_data_cache_dir = os.path.expanduser(data_cache_dir_input) @@ -239,16 +225,17 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, else: logging.warning("data_cache_dir_input is not a string or a dictionary, skip mounting it to the container") - # Default mounting - if not enable_custom_image or (enable_custom_image and relative_entry != ""): + # FedML format main entry filename, e.g., main.py + if relative_entry_fedml_format != "": logging.info("Start copying the source code to the container...") volumes.append(src_code_dir) binds[src_code_dir] = { "bind": dst_model_serving_dir, "mode": "rw" } - environment["MAIN_ENTRY"] = relative_entry + environment["MAIN_ENTRY"] = relative_entry_fedml_format + # Host config host_config_dict = { "binds": binds, "port_bindings": { @@ -261,10 +248,6 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, "mem_limit": memory } - # Allocate the GPU - # TODO: Make sure no competition for each replica in a single deployment - if exist_container_obj is not None: - client.api.remove_container(exist_container_obj.id, v=True, force=True) device_mapping = {} if no_real_gpu_allocation is not None: use_gpu = not no_real_gpu_allocation @@ -277,6 +260,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, host_config_dict.update(device_mapping) # Environment variables + enable_custom_image = False if relative_entry_fedml_format != "" else True if not enable_custom_image: # For some image, the default user is root. Unified to fedml. 
environment["HOME"] = "/home/fedml" @@ -288,7 +272,6 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, environment["FEDML_ENV_VERSION"] = fedml.get_env_version() environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST"] = fedml.get_local_on_premise_platform_host() environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_PORT"] = fedml.get_local_on_premise_platform_port() - if extra_envs is not None: for key in extra_envs: environment[key] = extra_envs[key] @@ -304,8 +287,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, environment=environment, host_config=host_config, detach=True, - command=customized_image_entry_cmd if enable_custom_image else None, - entrypoint=customized_image_entry_cmd if enable_custom_image else None + command=customized_image_entry_cmd, ) client.api.start(container=new_container.get("Id")) except Exception as e: @@ -333,11 +315,12 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, inference_model_name, inference_engine, inference_http_port, inference_type, retry_interval=10, deploy_attempt_threshold=usr_indicated_retry_cnt, request_input_example=request_input_example, infer_host=infer_host, - enable_custom_image=enable_custom_image) + readiness_check=customized_readiness_check) # Return the running model name and the inference output url inference_output_url, running_model_version, ret_model_metadata, ret_model_config = \ check_container_readiness(inference_http_port=inference_http_port, infer_host=infer_host, + readiness_check=customized_readiness_check, request_input_example=request_input_example) if inference_output_url == "": @@ -345,51 +328,24 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, # Successfully get the result from the container model_metadata = ret_model_metadata + model_metadata["liveliness_check"] = customized_liveliness_check + model_metadata["readiness_check"] = customized_readiness_check logging.info(f"[Worker][Replica{replica_rank}] Model deployment is successful with inference_output_url: " f"{inference_output_url}, model_metadata: {model_metadata}, model_config: {ret_model_config}") return running_model_name, inference_output_url, model_version, model_metadata, ret_model_config -def build_inference_req(end_point_name, model_name, token, in_model_metadata): - model_inputs = in_model_metadata["inputs"] - ret_inputs = list() - - for input_item in model_inputs: - ret_item = input_item - shape = ret_item["shape"] - data_type = ret_item["datatype"] - if ClientConstants.MODEL_DATA_TYPE_MAPPING[data_type] == ClientConstants.MODEL_DATA_TYPE_INT: - for i in range(len(shape)): - if shape[i] == -1: # if input shape is dynamic, we set a default value 1 - shape[i] = 1 - ret_item["data"] = torch.randint(0, 1, shape).tolist() - else: - for i in range(len(shape)): - if shape[i] == -1: # if input shape is dynamic, we set a default value 1 - shape[i] = 1 - ret_item["data"] = torch.zeros(shape).tolist() - ret_inputs.append(ret_item) - - input_json = {"end_point_name": end_point_name, - "model_name": model_name, - "token": str(token), - "inputs": ret_inputs, - "outputs": in_model_metadata["outputs"]} - output_json = in_model_metadata["outputs"] - - return input_json, output_json - - def should_exit_logs(end_point_id, model_id, cmd_type, model_name, inference_engine, inference_port, inference_type="default", request_input_example=None, infer_host="127.0.0.1", - enable_custom_image=False): + readiness_check=ClientConstants.READINESS_PROBE_DEFAULT): if 
cmd_type == ClientConstants.CMD_TYPE_RUN_DEFAULT_SERVER: # TODO: Exited Quickly if the container is Exited or Removed # If the container has exited, return True, means we should exit the logs try: inference_output_url, model_version, model_metadata, model_config = \ check_container_readiness(inference_http_port=inference_port, infer_host=infer_host, + readiness_check=readiness_check, request_input_example=request_input_example) if inference_output_url != "": logging.info("Log test for deploying model successfully, inference url: {}, " @@ -410,7 +366,7 @@ def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type, inference_http_port, inference_type="default", retry_interval=10, deploy_attempt_threshold=10, request_input_example=None, infer_host="127.0.0.1", - enable_custom_image=False): + readiness_check=ClientConstants.READINESS_PROBE_DEFAULT): deploy_attempt = 0 last_log_time = datetime.datetime.now() @@ -478,11 +434,10 @@ def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type, client.api.remove_container(container_obj.id, v=True, force=True) break - # should_exit_logs will ping the inference container - # return True if ready + # should_exit_logs will ping the inference container, return True if ready if should_exit_logs(end_point_id, model_id, cmd_type, inference_model_name, inference_engine, inference_http_port, inference_type, request_input_example, - infer_host, enable_custom_image=enable_custom_image): + infer_host, readiness_check=readiness_check): break # Not yet ready, retry @@ -504,10 +459,58 @@ def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type, time.sleep(retry_interval) -def is_client_inference_container_ready(infer_url_host, inference_http_port, readiness_check_type="default", - readiness_check_cmd=None, request_input_example=None): +def parse_resource_related_config(config, gpu_num_frm_platform=0): + use_gpu = config.get('use_gpu', True) + num_gpus_frm_yml = config.get('num_gpus', None) + + num_gpus = gpu_num_frm_platform + # Priority: num_gpus from yaml > num_gpus from platform + if use_gpu: + if num_gpus_frm_yml is not None: + num_gpus = int(num_gpus_frm_yml) + else: + num_gpus = 0 + + shm_size = config.get('shm_size', None) + storage_opt = config.get('storage_opt', None) + tmpfs = config.get('tmpfs', None) + cpus = config.get('cpus', None) + if cpus is not None: + cpus = int(cpus) + memory = config.get('memory', None) + port_inside_container = int(config.get("port", 2345)) + + return use_gpu, num_gpus, shm_size, storage_opt, tmpfs, cpus, memory, port_inside_container + + +def parse_image_registry_related_config(config): + inference_image_name = config.get('inference_image_name', ClientConstants.INFERENCE_SERVER_CUSTOME_IMAGE) + image_pull_policy = config.get('image_pull_policy', SchedulerConstants.IMAGE_PULL_POLICY_IF_NOT_PRESENT) + + # Optional + registry_specs = config.get('registry_specs', {}) + registry_name = registry_specs.get("docker_registry_user_name", "") + registry_provider = registry_specs.get("registry_provider", "") + registry_user_name = config.get("registry_user_name", "") + registry_user_password = config.get("registry_user_password", "") + + return (inference_image_name, image_pull_policy, registry_name, registry_provider, + registry_user_name, registry_user_password) + + +def is_client_inference_container_ready(infer_url_host, inference_http_port, + readiness_check=ClientConstants.READINESS_PROBE_DEFAULT, + request_input_example=None, container_id=None): + # Construct the model 
metadata (input and output) + model_metadata = {} + if request_input_example is not None and len(request_input_example) > 0: + model_metadata["inputs"] = request_input_example + else: + model_metadata["inputs"] = {"text": "What is a good cure for hiccups?"} + model_metadata["outputs"] = [] + model_metadata["type"] = "default" - if readiness_check_type == "default": + if readiness_check == ClientConstants.READINESS_PROBE_DEFAULT: default_client_container_ready_url = "http://{}:{}/ready".format("0.0.0.0", inference_http_port) response = None try: @@ -517,26 +520,36 @@ def is_client_inference_container_ready(infer_url_host, inference_http_port, rea if not response or response.status_code != 200: return "", "", {}, {} - # Construct the model metadata (input and output) - model_metadata = {} - if request_input_example is not None and len(request_input_example) > 0: - model_metadata["inputs"] = request_input_example - else: - model_metadata["inputs"] = {"text": "What is a good cure for hiccups?"} - model_metadata["outputs"] = [] - model_metadata["type"] = "default" - return "http://{}:{}/predict".format(infer_url_host, inference_http_port), None, model_metadata, None else: - # TODO(Raphael): Support arbitrary readiness check command - logging.error(f"Unknown readiness check type: {readiness_check_type}") - return "", "", {}, {} + if not isinstance(readiness_check, dict): + logging.error(f"Unknown readiness check type: {readiness_check}") + return "", "", {}, {} + + if "path" in readiness_check: + readiness_check_url = f"http://{infer_url_host}:{inference_http_port}/{readiness_check['path']}" + response = None + try: + response = requests.get(readiness_check_url) + except: + pass + if not response or response.status_code != 200: + return "", "", {}, {} + + return "http://{}:{}/".format(infer_url_host, inference_http_port), None, model_metadata, None + elif "command" in readiness_check: + # TODO(raphael): Support arbitrary readiness check command by using + # container id and docker exec + return "http://{}:{}/".format(infer_url_host, inference_http_port), None, model_metadata, None + else: + logging.error(f"Unknown readiness check type: {readiness_check}") + return "", "", {}, {} def check_container_readiness(inference_http_port, infer_host="127.0.0.1", request_input_example=None, - readiness_check_type="default", readiness_check_cmd=None): + readiness_check=ClientConstants.READINESS_PROBE_DEFAULT): response_from_client_container = is_client_inference_container_ready( - infer_host, inference_http_port, readiness_check_type, readiness_check_cmd, + infer_host, inference_http_port, readiness_check=readiness_check, request_input_example=request_input_example) return response_from_client_container diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index ba13006245..84141851b0 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -342,56 +342,40 @@ async def send_inference_request(idle_device, end_point_id, inference_url, input try: if connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP: - response_ok = await FedMLHttpInference.is_inference_ready( + response_ok, inference_response = await FedMLHttpInference.run_http_inference_with_curl_request( inference_url, + input_list, + output_list, + inference_type=inference_type, timeout=request_timeout_sec) - if 
response_ok: - response_ok, inference_response = await FedMLHttpInference.run_http_inference_with_curl_request( - inference_url, - input_list, - output_list, - inference_type=inference_type, - timeout=request_timeout_sec) - logging.debug(f"Use http inference. return {response_ok}") - return inference_response + logging.debug(f"Use http inference. return {response_ok}") + return inference_response elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY: - logging.warning("Use http proxy inference.") - response_ok = await FedMLHttpProxyInference.is_inference_ready( + logging.debug("Use http proxy inference.") + response_ok, inference_response = await FedMLHttpProxyInference.run_http_proxy_inference_with_request( + end_point_id, inference_url, + input_list, + output_list, + inference_type=inference_type, timeout=request_timeout_sec) - if response_ok: - response_ok, inference_response = await FedMLHttpProxyInference.run_http_proxy_inference_with_request( - end_point_id, - inference_url, - input_list, - output_list, - inference_type=inference_type, - timeout=request_timeout_sec) - logging.info(f"Use http proxy inference. return {response_ok}") - return inference_response + logging.debug(f"Use http proxy inference. return {response_ok}") + return inference_response elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT: - logging.warning("Use mqtt inference.") + logging.debug("Use mqtt inference.") agent_config = {"mqtt_config": Settings.mqtt_config} mqtt_inference = FedMLMqttInference( agent_config=agent_config, run_id=end_point_id) - response_ok = mqtt_inference.run_mqtt_health_check_with_request( + response_ok, inference_response = mqtt_inference.run_mqtt_inference_with_request( idle_device, end_point_id, inference_url, + input_list, + output_list, + inference_type=inference_type, timeout=request_timeout_sec) - inference_response = {"error": True, "message": "Failed to use http, http-proxy and mqtt for inference."} - if response_ok: - response_ok, inference_response = mqtt_inference.run_mqtt_inference_with_request( - idle_device, - end_point_id, - inference_url, - input_list, - output_list, - inference_type=inference_type, - timeout=request_timeout_sec) - - logging.info(f"Use mqtt inference. return {response_ok}.") + logging.debug(f"Use mqtt inference. 
return {response_ok}.") return inference_response else: return {"error": True, "message": "Failed to use http, http-proxy for inference, no response from replica."} diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py index 8100707386..a892412d29 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py @@ -460,22 +460,6 @@ def construct_deployment_results(self, end_point_name, device_id, model_status, } return deployment_results_payload - def construct_deployment_status(self, end_point_name, device_id, - model_id, model_name, model_version, - model_inference_url, model_status, - inference_port=ClientConstants.MODEL_INFERENCE_DEFAULT_PORT, - replica_no=1, # start from 1 - ): - deployment_status_payload = {"end_point_id": self.run_id, "end_point_name": end_point_name, - "device_id": device_id, - "model_id": model_id, "model_name": model_name, - "model_version": model_version, - "model_url": model_inference_url, "model_status": model_status, - "inference_port": inference_port, - "replica_no": replica_no, - } - return deployment_status_payload - def send_deployment_results(self, end_point_name, device_id, model_status, model_id, model_name, model_inference_url, model_version, inference_port, inference_engine, diff --git a/python/fedml/core/mlops/mlops_device_perfs.py b/python/fedml/core/mlops/mlops_device_perfs.py index 29183a6e78..4bb41df73f 100644 --- a/python/fedml/core/mlops/mlops_device_perfs.py +++ b/python/fedml/core/mlops/mlops_device_perfs.py @@ -42,7 +42,7 @@ def __init__(self): self.monitor_replica_num_process = None self.monitor_replica_perf_process = None self.job_total_monitor_process = None - self.enable_job_total_monitor = False + self.enable_job_total_monitor = False # TODO(Raphael): Enable the healthiness check by this job total monitor self.args = None self.device_id = None self.run_id = None From 9ca6ecc1d23166223e7788adfbe688379a18f193 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Mon, 17 Jun 2024 18:25:59 -0700 Subject: [PATCH 26/38] [Deploy] Add LoraX and Triton examples; Add url match pattern. 
--- .../deploy/custom_inference_image/README.md | 48 ------------------- .../{ => lorax}/custom_inference_image.yaml | 0 .../template/custom_inference_image.yaml | 16 +++++++ .../model_repository/dummy/1/model.py | 25 ++++++++++ .../scheduler/comm_utils/network_util.py | 11 +++++ .../device_model_deployment.py | 19 ++++---- .../model_scheduler/device_model_inference.py | 28 +++++++++-- 7 files changed, 87 insertions(+), 60 deletions(-) delete mode 100644 python/examples/deploy/custom_inference_image/README.md rename python/examples/deploy/custom_inference_image/{ => lorax}/custom_inference_image.yaml (100%) create mode 100644 python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml create mode 100644 python/examples/deploy/custom_inference_image/triton_inference_server/template/model_repository/dummy/1/model.py diff --git a/python/examples/deploy/custom_inference_image/README.md b/python/examples/deploy/custom_inference_image/README.md deleted file mode 100644 index 1269e4c064..0000000000 --- a/python/examples/deploy/custom_inference_image/README.md +++ /dev/null @@ -1,48 +0,0 @@ -## Create a model card at local -First, create a model card at local -```bash -fedml model create -n custom_inference_image -cf custom_inference_image.yaml -``` - -## Low Code UI Deploy -Push the model to nexus ai platform -```bash -fedml model push -n custom_inference_image -``` -Do the following docs to deploy the model on nexus ai platform -https://docs-dev.fedml.ai/deploy/low_code_ui - -## CLI Deploy -### Deploy to current machine -Docs: https://docs-dev.fedml.ai/deploy/deploy_local -```bash -fedml model deploy -n custom_inference_image --local -``` - -### Deploy to On-premise -Docs: https://docs-dev.fedml.ai/deploy/deploy_on_premise -```bash -fedml device bind $api_key -``` -```bash -fedml model deploy -n my_model -m $master_ids -w $worker_ids -``` - -### Deploy to GPU Cloud -Docs: https://docs-dev.fedml.ai/deploy/deploy_cloud - -Change the `custom_inference_image.yaml` file, adding following lines -```yaml -computing: - minimum_num_gpus: 1 # minimum # of GPUs to provision - maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card - #allow_cross_cloud_resources: true # true, false - #device_type: CPU # options: GPU, CPU, hybrid - resource_type: A100-80G # e.g., A100-80G, - # please check the resource type list by "fedml show-resource-type" - # or visiting URL: https://fedml.ai/accelerator_resource_type -``` - -```bash -fedml model deploy -n custom_inference_image -``` \ No newline at end of file diff --git a/python/examples/deploy/custom_inference_image/custom_inference_image.yaml b/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml similarity index 100% rename from python/examples/deploy/custom_inference_image/custom_inference_image.yaml rename to python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml diff --git a/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml b/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml new file mode 100644 index 0000000000..02dca147ce --- /dev/null +++ b/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml @@ -0,0 +1,16 @@ +workspace: "./" + +inference_image_name: "nvcr.io/nvidia/tritonserver:24.05-py3" + +# If you put the model repository in $workspace/model_repository, it will be mounted to 
/home/fedml/models_serving/model_repository +container_run_command: "tritonserver --model-repository=/home/fedml/models_serving/model_repository" + +# If your image has the repository inside it, say in /my_models_dir/model_repository, you can do: +#container_run_command: "tritonserver --model-repository=/my_models_dir/model_repository" + +readiness_probe: + path: "v2/health/ready" + +port: 8000 + +deploy_timeout: 1600 diff --git a/python/examples/deploy/custom_inference_image/triton_inference_server/template/model_repository/dummy/1/model.py b/python/examples/deploy/custom_inference_image/triton_inference_server/template/model_repository/dummy/1/model.py new file mode 100644 index 0000000000..0404a127ff --- /dev/null +++ b/python/examples/deploy/custom_inference_image/triton_inference_server/template/model_repository/dummy/1/model.py @@ -0,0 +1,25 @@ +import json +import numpy as np +import triton_python_backend_utils as pb_utils + +class TritonPythonModel: + def initialize(self, args): + self.model_name = args['model_name'] + + @staticmethod + def auto_complete_config(auto_complete_model_config): + auto_complete_model_config.add_input( {"name": "text_input", "data_type": "TYPE_STRING", "dims": [-1]}) + auto_complete_model_config.add_output({"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]}) + auto_complete_model_config.set_max_batch_size(0) + return auto_complete_model_config + + def execute(self, requests): + responses = [] + for request in requests: + in_numpy = pb_utils.get_input_tensor_by_name(request, "text_input").as_numpy() + assert np.object_ == in_numpy.dtype, 'in this demo, triton passes in a numpy array of size 1 with object_ dtype, this dtype encapsulates a python bytes-array' + print('in this demo len(in_numpy) is 1:', len(in_numpy.tolist())) + out_numpy = np.array([ (self.model_name + ': ' + python_byte_array.decode('utf-8') + ' World').encode('utf-8') for python_byte_array in in_numpy.tolist()], dtype = np.object_) + out_pb = pb_utils.Tensor("text_output", out_numpy) + responses.append(pb_utils.InferenceResponse(output_tensors = [out_pb])) + return responses diff --git a/python/fedml/computing/scheduler/comm_utils/network_util.py b/python/fedml/computing/scheduler/comm_utils/network_util.py index 48e478f23f..b03b0428d0 100644 --- a/python/fedml/computing/scheduler/comm_utils/network_util.py +++ b/python/fedml/computing/scheduler/comm_utils/network_util.py @@ -1,4 +1,5 @@ import os +from urllib.parse import urlparse from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants @@ -16,3 +17,13 @@ def return_this_device_connectivity_type() -> str: return env_conn_type else: return ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT + + +def replace_url_with_path(url: str, path: str) -> str: + """ + Replace the path of the URL with the given path. 
+ """ + if path is None: + return url + url_parsed = urlparse(url) + return f"{url_parsed.scheme}://{url_parsed.netloc}/{path}" diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 71f0c8032a..1aef8c09f1 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -118,6 +118,8 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, # Storage related src_code_dir = os.path.join(model_storage_local_path, config.get('source_code_dir', "")) + # TODO(Raphael): In the future, the data_cache_dir should not be controlled by the user. It only + # used for internal avoiding checkpoint re-download. e.g. ~/.cache/huggingface/hub/ data_cache_dir_input = config.get('data_cache_dir', "") # Others @@ -225,15 +227,14 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, else: logging.warning("data_cache_dir_input is not a string or a dictionary, skip mounting it to the container") - # FedML format main entry filename, e.g., main.py - if relative_entry_fedml_format != "": - logging.info("Start copying the source code to the container...") - volumes.append(src_code_dir) - binds[src_code_dir] = { - "bind": dst_model_serving_dir, - "mode": "rw" - } - environment["MAIN_ENTRY"] = relative_entry_fedml_format + # Inject the source code + logging.info("Start copying the source code to the container...") + volumes.append(src_code_dir) + binds[src_code_dir] = { + "bind": dst_model_serving_dir, + "mode": "rw" + } + environment["MAIN_ENTRY"] = relative_entry_fedml_format # Host config host_config_dict = { diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index 84141851b0..f6fa99d6d4 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -21,6 +21,7 @@ from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache from fedml.computing.scheduler.model_scheduler.device_mqtt_inference_protocol import FedMLMqttInference from fedml.computing.scheduler.model_scheduler.device_http_proxy_inference_protocol import FedMLHttpProxyInference +from fedml.computing.scheduler.comm_utils.network_util import replace_url_with_path from fedml.core.mlops.mlops_configs import MLOpsConfigs from fedml.core.mlops import MLOpsRuntimeLog, MLOpsRuntimeLogDaemon @@ -168,10 +169,27 @@ async def predict_with_end_point_id(end_point_id, request: Request, response: Re return inference_response +@api.post('/custom_inference/{end_point_id}/{path:path}') +async def custom_inference(end_point_id, path: str, request: Request): + # Get json data + input_json = await request.json() + + # Get header + header = request.headers + + try: + inference_response = await _predict(end_point_id, input_json, header, path) + except Exception as e: + inference_response = {"error": True, "message": f"{traceback.format_exc()}"} + + return inference_response + + async def _predict( end_point_id, input_json, - header=None + header=None, + path=None, ) -> Union[MutableMapping[str, Any], Response, StreamingResponse]: # Always increase the pending requests counter on a new incoming request. 
FEDML_MODEL_CACHE.update_pending_requests_counter(end_point_id, increase=True) @@ -245,7 +263,8 @@ async def _predict( input_list, output_list, inference_type=in_return_type, - connectivity_type=connectivity_type) + connectivity_type=connectivity_type, + path=path) # Calculate model metrics try: @@ -336,10 +355,13 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ async def send_inference_request(idle_device, end_point_id, inference_url, input_list, output_list, inference_type="default", - connectivity_type=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT): + connectivity_type=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT, + path=None): request_timeout_sec = FEDML_MODEL_CACHE.get_endpoint_settings(end_point_id) \ .get("request_timeout_sec", ClientConstants.INFERENCE_REQUEST_TIMEOUT) + inference_url = replace_url_with_path(inference_url, path) + try: if connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP: response_ok, inference_response = await FedMLHttpInference.run_http_inference_with_curl_request( From 786718bc6b61508b239a4106738724c458ed8c38 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Thu, 20 Jun 2024 10:37:19 -0700 Subject: [PATCH 27/38] [Deploy] Support serverless container. --- .../lorax/custom_inference_image.yaml | 6 +- .../tensorrt_llm/tensorrtllm.yaml | 17 ++ .../template/custom_inference_image.yaml | 16 +- .../device_client_constants.py | 25 +-- .../device_http_inference_protocol.py | 42 ++-- .../device_http_proxy_inference_protocol.py | 1 + .../device_model_deployment.py | 191 ++++++++++++------ .../model_scheduler/device_model_inference.py | 18 +- .../device_server_constants.py | 4 + .../model_scheduler/master_job_runner.py | 38 ++-- 10 files changed, 233 insertions(+), 125 deletions(-) create mode 100644 python/examples/deploy/custom_inference_image/tensorrt_llm/tensorrtllm.yaml diff --git a/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml b/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml index 467c7c48b0..41cbe501d2 100644 --- a/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml +++ b/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml @@ -1,5 +1,6 @@ workspace: "./" +enable_serverless_container: true inference_image_name: "ghcr.io/predibase/lorax:main" container_run_command: "--model-id mistralai/Mistral-7B-Instruct-v0.1" @@ -7,8 +8,9 @@ environment_variables: HUGGING_FACE_HUB_TOKEN: "" readiness_probe: - path: "health" + httpGet: + path: "/health" port: 80 -deploy_timeout: 1600 +deploy_timeout_sec: 1600 diff --git a/python/examples/deploy/custom_inference_image/tensorrt_llm/tensorrtllm.yaml b/python/examples/deploy/custom_inference_image/tensorrt_llm/tensorrtllm.yaml new file mode 100644 index 0000000000..d41dba7983 --- /dev/null +++ b/python/examples/deploy/custom_inference_image/tensorrt_llm/tensorrtllm.yaml @@ -0,0 +1,17 @@ +workspace: "./" + +enable_serverless_container: true +inference_image_name: "fedml/llama3-8b-tensorrtllm" + +# If you put the model repository in $workspace/model_repository, it will be mounted to /home/fedml/models_serving/model_repository +container_run_command: ["sh", "-c", "cd / && huggingface-cli login --token $your_hf_token && pip install sentencepiece protobuf && python3 tensorrtllm_backend/scripts/launch_triton_server.py --model_repo tensorrtllm_backend/all_models/inflight_batcher_llm --world_size 1 && tail -f /dev/null"] + +readiness_probe: + httpGet: + path: 
"/v2/health/ready" + +port: 8000 + +deploy_timeout_sec: 1600 + + diff --git a/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml b/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml index 02dca147ce..eb02e3904a 100644 --- a/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml +++ b/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml @@ -1,16 +1,20 @@ workspace: "./" +enable_serverless_container: true inference_image_name: "nvcr.io/nvidia/tritonserver:24.05-py3" -# If you put the model repository in $workspace/model_repository, it will be mounted to /home/fedml/models_serving/model_repository -container_run_command: "tritonserver --model-repository=/home/fedml/models_serving/model_repository" +volumes: + - workspace_path: "./model_repository" + mount_path: "/repo_inside_container" -# If your image has the repository inside it, say in /my_models_dir/model_repository, you can do: -#container_run_command: "tritonserver --model-repository=/my_models_dir/model_repository" +container_run_command: "tritonserver --model-repository=/repo_inside_container" readiness_probe: - path: "v2/health/ready" + httpGet: + path: "/v2/health/ready" port: 8000 -deploy_timeout: 1600 +deploy_timeout_sec: 1600 + +request_input_example: {"text_input": "Hello"} diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index cd21de2e04..e18c9f730b 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -146,7 +146,6 @@ class ClientConstants(object): READINESS_PROBE_DEFAULT = "DEFAULT" LIVENESS_PROBE_DEFAULT = "DEFAULT" - LOGIN_MODE_ON_PREMISE_INDEX = 0 LOGIN_MODE_FEDML_CLOUD_INDEX = 1 LOGIN_MODE_PUBLIC_CLOUD_INDEX = 2 @@ -155,20 +154,16 @@ class ClientConstants(object): MODEL_DATA_TYPE_INT = "int" MODEL_DATA_TYPE_FLOAT = "float" MODEL_DATA_TYPE_STR = "str" - MODEL_DATA_TYPE_MAPPING = {"TYPE_BOOL": MODEL_DATA_TYPE_INT, "TYPE_UINT8": MODEL_DATA_TYPE_INT, - "TYPE_UINT16": MODEL_DATA_TYPE_INT, "TYPE_UINT32": MODEL_DATA_TYPE_INT, - "TYPE_UINT64": MODEL_DATA_TYPE_INT, "TYPE_INT8": MODEL_DATA_TYPE_INT, - "TYPE_INT16": MODEL_DATA_TYPE_INT, "TYPE_INT32": MODEL_DATA_TYPE_INT, - "TYPE_INT64": MODEL_DATA_TYPE_INT, "TYPE_FP16": MODEL_DATA_TYPE_FLOAT, - "TYPE_FP32": MODEL_DATA_TYPE_FLOAT, "TYPE_FP64": MODEL_DATA_TYPE_FLOAT, - "TYPE_STRING": MODEL_DATA_TYPE_STR, "TYPE_BF16": MODEL_DATA_TYPE_INT, - "BOOL": MODEL_DATA_TYPE_INT, "UINT8": MODEL_DATA_TYPE_INT, - "UINT16": MODEL_DATA_TYPE_INT, "UINT32": MODEL_DATA_TYPE_INT, - "UINT64": MODEL_DATA_TYPE_INT, "INT8": MODEL_DATA_TYPE_INT, - "INT16": MODEL_DATA_TYPE_INT, "INT32": MODEL_DATA_TYPE_INT, - "INT64": MODEL_DATA_TYPE_INT, "FP16": MODEL_DATA_TYPE_FLOAT, - "FP32": MODEL_DATA_TYPE_FLOAT, "FP64": MODEL_DATA_TYPE_FLOAT, - "STRING": MODEL_DATA_TYPE_STR, "BF16": MODEL_DATA_TYPE_INT} + + # Model config yaml related + DEPLOY_TIMEOUT_SEC_KEY = "deploy_timeout_sec" + DEPLOY_TIMEOUT_SEC_DEFAULT = 600 + + ENABLE_SERVERLESS_CONTAINER_KEY = "enable_serverless_container" + + CUSTOMIZED_VOLUMES_MOUNT_KEY = "volumes" + CUSTOMIZED_VOLUMES_PATH_FROM_WORKSPACE_KEY = "workspace_path" + CUSTOMIZED_VOLUMES_PATH_FROM_CONTAINER_KEY = "mount_path" @staticmethod def 
get_fedml_home_dir(): diff --git a/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py b/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py index 41c565d5d8..5b2658f0b3 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py @@ -1,3 +1,5 @@ +import logging + import httpx import traceback @@ -46,9 +48,8 @@ async def is_inference_ready(inference_url, path="ready", timeout=None): @staticmethod async def run_http_inference_with_curl_request( inference_url, inference_input_list, inference_output_list, - inference_type="default", engine_type="default", timeout=None + inference_type="default", engine_type="default", timeout=None, method="POST" ): - model_inference_result = {} if inference_type == "default": model_api_headers = {'Content-Type': 'application/json', 'Connection': 'close', 'Accept': 'application/json'} @@ -63,11 +64,10 @@ async def run_http_inference_with_curl_request( "outputs": inference_output_list } - response_ok = False try: if model_inference_json.get("stream", False): model_inference_result = StreamingResponse( - stream_generator(inference_url, input_json=model_inference_json), + stream_generator(inference_url, input_json=model_inference_json, method=method), media_type="text/event-stream", headers={ "Content-Type": model_api_headers.get("Accept", "text/event-stream"), @@ -76,8 +76,8 @@ async def run_http_inference_with_curl_request( ) response_ok = True else: - response_ok, model_inference_result = await redirect_request_to_worker( - inference_type, inference_url, model_api_headers, model_inference_json, timeout) + response_ok, model_inference_result = await redirect_non_stream_req_to_worker( + inference_type, inference_url, model_api_headers, model_inference_json, timeout, method=method) except Exception as e: response_ok = False model_inference_result = {"response": f"{traceback.format_exc()}"} @@ -85,21 +85,22 @@ async def run_http_inference_with_curl_request( return response_ok, model_inference_result -async def stream_generator(inference_url, input_json): +async def stream_generator(inference_url, input_json, method="POST"): async with httpx.AsyncClient() as client: - async with client.stream("POST", inference_url, json=input_json, + async with client.stream(method, inference_url, json=input_json, timeout=ClientConstants.WORKER_STREAM_API_TIMEOUT) as response: async for chunk in response.aiter_lines(): # we consumed a newline, need to put it back yield f"{chunk}\n" -async def redirect_request_to_worker(inference_type, inference_url, model_api_headers, model_inference_json, timeout=None): +async def redirect_non_stream_req_to_worker(inference_type, inference_url, model_api_headers, model_inference_json, + timeout=None, method="POST"): response_ok = True try: async with httpx.AsyncClient() as client: - response = await client.post( - url=inference_url, headers=model_api_headers, json=model_inference_json, timeout=timeout + response = await client.request( + method=method, url=inference_url, headers=model_api_headers, json=model_inference_json, timeout=timeout ) except Exception as e: response_ok = False @@ -107,13 +108,18 @@ async def redirect_request_to_worker(inference_type, inference_url, model_api_he return response_ok, model_inference_result if response.status_code == 200: - if inference_type == "default": - model_inference_result = response.json() - elif inference_type == 
"image/png": - binary_content: bytes = response.content - model_inference_result = Response(content=binary_content, media_type="image/png") - else: - model_inference_result = response.json() + try: + if inference_type == "default": + model_inference_result = response.json() + elif inference_type == "image/png": + binary_content: bytes = response.content + model_inference_result = Response(content=binary_content, media_type="image/png") + else: + model_inference_result = response.json() + except Exception as e: + response_ok = True + logging.warning(f"Status code 200, but cannot trans response to json due to: {e}.") + model_inference_result = {"response": f"{response.content}"} else: model_inference_result = {"response": f"{response.content}"} diff --git a/python/fedml/computing/scheduler/model_scheduler/device_http_proxy_inference_protocol.py b/python/fedml/computing/scheduler/model_scheduler/device_http_proxy_inference_protocol.py index 53f5a002eb..746d17bb7c 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_http_proxy_inference_protocol.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_http_proxy_inference_protocol.py @@ -50,6 +50,7 @@ async def run_http_proxy_inference_with_request( endpoint_id, inference_url, inference_input_list, inference_output_list, inference_type="default", timeout=None + # TODO(Raphael): Add support for GET and other methods ): inference_response = {} http_proxy_url = f"http://{urlparse(inference_url).hostname}:{ClientConstants.LOCAL_CLIENT_API_PORT}/api/v1/predict" diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 1aef8c09f1..e18081c324 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -112,19 +112,20 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, relative_entry_fedml_format = config.get('entry_point', "") # User indicate either fedml format python main entry filename or entry command - customized_image_entry_cmd = config.get('container_run_command', None) + enable_serverless_container = config.get(ClientConstants.ENABLE_SERVERLESS_CONTAINER_KEY, False) + customized_image_entry_cmd = config.get('container_run_command', None) # Could be str or list customized_readiness_check = config.get('readiness_probe', ClientConstants.READINESS_PROBE_DEFAULT) customized_liveliness_check = config.get('liveness_probe', ClientConstants.LIVENESS_PROBE_DEFAULT) # Storage related src_code_dir = os.path.join(model_storage_local_path, config.get('source_code_dir', "")) - # TODO(Raphael): In the future, the data_cache_dir should not be controlled by the user. It only - # used for internal avoiding checkpoint re-download. e.g. 
~/.cache/huggingface/hub/ data_cache_dir_input = config.get('data_cache_dir', "") + usr_customized_mount_rule = config.get(ClientConstants.CUSTOMIZED_VOLUMES_MOUNT_KEY, None) # Others extra_envs = config.get('environment_variables', None) - usr_indicated_wait_time = config.get('deploy_timeout', 900) + usr_indicated_wait_time = config.get(ClientConstants.DEPLOY_TIMEOUT_SEC_KEY, + config.get("deploy_timeout", ClientConstants.DEPLOY_TIMEOUT_SEC_DEFAULT)) usr_indicated_retry_cnt = max(int(usr_indicated_wait_time) // 10, 1) request_input_example = config.get('request_input_example', None) @@ -189,52 +190,12 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, binds = {} environment = {} - if isinstance(data_cache_dir_input, str): - # In this case, we mount to the same folder, if it has ~, we replace it with /home/fedml - if data_cache_dir_input != "": - if data_cache_dir_input[0] == "~": - src_data_cache_dir = os.path.expanduser(data_cache_dir_input) - dst_data_cache_dir = data_cache_dir_input.replace("~", "/home/fedml") - else: - # check if the data_cache_dir is a relative path - if data_cache_dir_input[0] != "/": - raise "data_cache_dir_input has to be an absolute path or start with ~" - else: - src_data_cache_dir = data_cache_dir_input - dst_data_cache_dir = data_cache_dir_input - logging.info(f"src_data_cache_dir: {src_data_cache_dir}, dst_data_cache_dir: {dst_data_cache_dir}") + # Handle the union volume mount + _handle_union_volume_mount(binds, volumes, environment, data_cache_dir_input) - if type(src_data_cache_dir) == str and src_data_cache_dir != "": - logging.info("Start copying the data cache to the container...") - if os.path.exists(src_data_cache_dir): - volumes.append(src_data_cache_dir) - binds[src_data_cache_dir] = { - "bind": dst_data_cache_dir, - "mode": "rw" - } - environment["DATA_CACHE_FOLDER"] = dst_data_cache_dir - elif isinstance(data_cache_dir_input, dict): - for k, v in data_cache_dir_input.items(): - if os.path.exists(k): - volumes.append(v) - binds[k] = { - "bind": v, - "mode": "rw" - } - else: - logging.warning(f"{k} does not exist, skip mounting it to the container") - logging.info(f"Data cache mount: {volumes}, {binds}") - else: - logging.warning("data_cache_dir_input is not a string or a dictionary, skip mounting it to the container") - - # Inject the source code - logging.info("Start copying the source code to the container...") - volumes.append(src_code_dir) - binds[src_code_dir] = { - "bind": dst_model_serving_dir, - "mode": "rw" - } - environment["MAIN_ENTRY"] = relative_entry_fedml_format + # Handle the default volume mount + handle_volume_mount(volumes, binds, environment, relative_entry_fedml_format, src_code_dir, + dst_model_serving_dir, usr_customized_mount_rule, host_workspace_root=model_storage_local_path) # Host config host_config_dict = { @@ -331,6 +292,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, model_metadata = ret_model_metadata model_metadata["liveliness_check"] = customized_liveliness_check model_metadata["readiness_check"] = customized_readiness_check + model_metadata[ClientConstants.ENABLE_SERVERLESS_CONTAINER_KEY] = enable_serverless_container logging.info(f"[Worker][Replica{replica_rank}] Model deployment is successful with inference_output_url: " f"{inference_output_url}, model_metadata: {model_metadata}, model_config: {ret_model_config}") @@ -527,24 +489,129 @@ def is_client_inference_container_ready(infer_url_host, inference_http_port, logging.error(f"Unknown readiness check 
type: {readiness_check}") return "", "", {}, {} - if "path" in readiness_check: - readiness_check_url = f"http://{infer_url_host}:{inference_http_port}/{readiness_check['path']}" - response = None - try: - response = requests.get(readiness_check_url) - except: - pass - if not response or response.status_code != 200: - return "", "", {}, {} + if "httpGet" in readiness_check: + if "path" in readiness_check["httpGet"]: + check_path = readiness_check["httpGet"]["path"] + if not isinstance(check_path, str): + logging.error(f"Invalid path type: {check_path}, expected str") + return "", "", {}, {} + else: + if not check_path.startswith("/"): + check_path = "/" + check_path + readiness_check_url = f"http://{infer_url_host}:{inference_http_port}{check_path}" - return "http://{}:{}/".format(infer_url_host, inference_http_port), None, model_metadata, None - elif "command" in readiness_check: + response = None + try: + response = requests.get(readiness_check_url) + except: + pass + if not response or response.status_code != 200: + return "", "", {}, {} + + return readiness_check_url, None, model_metadata, None + else: + logging.error("'path' is not specified in httpGet readiness check") + return "", "", {}, {} + elif "exec" in readiness_check: # TODO(raphael): Support arbitrary readiness check command by using # container id and docker exec return "http://{}:{}/".format(infer_url_host, inference_http_port), None, model_metadata, None else: - logging.error(f"Unknown readiness check type: {readiness_check}") - return "", "", {}, {} + # Ref K8S, if no readiness check, we assume the container is ready immediately + return "http://{}:{}/".format(infer_url_host, inference_http_port), None, model_metadata, None + + +def _handle_union_volume_mount(binds, volumes, environment, data_cache_dir_input=None): + """ + Private: data_cache_dir is the union folder on host machine, which will be shard across different containers, + the control of this folder should be handled by the platform. 
+ """ + if isinstance(data_cache_dir_input, str): + # In this case, we mount to the same folder, if it has ~, we replace it with /home/fedml + if data_cache_dir_input != "": + if data_cache_dir_input[0] == "~": + src_data_cache_dir = os.path.expanduser(data_cache_dir_input) + dst_data_cache_dir = data_cache_dir_input.replace("~", "/home/fedml") + else: + # check if the data_cache_dir is a relative path + if data_cache_dir_input[0] != "/": + raise "data_cache_dir_input has to be an absolute path or start with ~" + else: + src_data_cache_dir = data_cache_dir_input + dst_data_cache_dir = data_cache_dir_input + logging.info(f"src_data_cache_dir: {src_data_cache_dir}, dst_data_cache_dir: {dst_data_cache_dir}") + + if isinstance(src_data_cache_dir, str) and src_data_cache_dir != "": + logging.info("Start copying the data cache to the container...") + if os.path.exists(src_data_cache_dir): + volumes.append(src_data_cache_dir) + binds[src_data_cache_dir] = { + "bind": dst_data_cache_dir, + "mode": "rw" + } + environment["DATA_CACHE_FOLDER"] = dst_data_cache_dir + elif isinstance(data_cache_dir_input, dict): + for k, v in data_cache_dir_input.items(): + if os.path.exists(k): + volumes.append(v) + binds[k] = { + "bind": v, + "mode": "rw" + } + else: + logging.warning(f"{k} does not exist, skip mounting it to the container") + logging.info(f"Data cache mount: {volumes}, {binds}") + else: + logging.info("data_cache_dir_input is not a string or a dictionary, skip mounting it to the container") + + +def handle_volume_mount(volumes, binds, environment, relative_entry_fedml_format="", src_code_dir="", + dst_model_serving_dir="", customized_volumes_mount_rule=None, host_workspace_root=""): + # If fedml format entry point is specified, inject the source code, e.g., main.py (FedMLPredictor inside) + if relative_entry_fedml_format != "": + logging.info("Using FedML format entry point, mounting the source code...") + volumes.append(src_code_dir) + binds[src_code_dir] = { + "bind": dst_model_serving_dir, + "mode": "rw" + } + environment["MAIN_ENTRY"] = relative_entry_fedml_format + return # The reason we return here is that we don't need to mount the source code again + + # If customized volume mount rule is specified, just follow the mount rule + """ + e.g., + volumes: + - workspace_path: "./model_repository" + mount_path: "/repo_inside_container" + """ + mount_list = [] + if not isinstance(customized_volumes_mount_rule, list): + if not isinstance(customized_volumes_mount_rule, dict): + logging.warning("customized_volumes_mount_rule is not a list or a dictionary, " + "skip mounting it to the container") + return + + # transform the dict to list + for k, v in customized_volumes_mount_rule.items(): + mount_list.append({ClientConstants.CUSTOMIZED_VOLUMES_PATH_FROM_WORKSPACE_KEY: k, + ClientConstants.CUSTOMIZED_VOLUMES_PATH_FROM_CONTAINER_KEY: v}) + else: + mount_list = customized_volumes_mount_rule if customized_volumes_mount_rule is not None else [] + + for mount in mount_list: + workspace_relative_path = mount[ClientConstants.CUSTOMIZED_VOLUMES_PATH_FROM_WORKSPACE_KEY] + mount_path = mount[ClientConstants.CUSTOMIZED_VOLUMES_PATH_FROM_CONTAINER_KEY] + + workspace_path = os.path.join(host_workspace_root, workspace_relative_path) + if os.path.exists(workspace_path): + volumes.append(workspace_path) + binds[workspace_path] = { + "bind": mount_path, + "mode": "rw" + } + else: + logging.warning(f"{workspace_path} does not exist, skip mounting it to the container") def check_container_readiness(inference_http_port, 
infer_host="127.0.0.1", request_input_example=None, diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index f6fa99d6d4..7ef9689c1c 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -8,7 +8,7 @@ from typing import Any, Mapping, MutableMapping, Union from urllib.parse import urlparse -from fastapi import FastAPI, Request, Response, status +from fastapi import FastAPI, Request, Response, status, APIRouter from fastapi.responses import StreamingResponse, JSONResponse import fedml @@ -38,6 +38,7 @@ class Settings: api = FastAPI() +router = APIRouter() FEDML_MODEL_CACHE = FedMLModelCache.get_instance() FEDML_MODEL_CACHE.set_redis_params(redis_addr=Settings.redis_addr, @@ -169,7 +170,8 @@ async def predict_with_end_point_id(end_point_id, request: Request, response: Re return inference_response -@api.post('/custom_inference/{end_point_id}/{path:path}') +# @api.post('/custom_inference/{end_point_id}/{path:path}') +@router.api_route("/custom_inference/{end_point_id}/{path:path}", methods=["POST", "GET"]) async def custom_inference(end_point_id, path: str, request: Request): # Get json data input_json = await request.json() @@ -178,18 +180,21 @@ async def custom_inference(end_point_id, path: str, request: Request): header = request.headers try: - inference_response = await _predict(end_point_id, input_json, header, path) + inference_response = await _predict(end_point_id, input_json, header, path, request.method) except Exception as e: inference_response = {"error": True, "message": f"{traceback.format_exc()}"} return inference_response +api.include_router(router) + async def _predict( end_point_id, input_json, header=None, path=None, + request_method="POST" ) -> Union[MutableMapping[str, Any], Response, StreamingResponse]: # Always increase the pending requests counter on a new incoming request. FEDML_MODEL_CACHE.update_pending_requests_counter(end_point_id, increase=True) @@ -264,7 +269,7 @@ async def _predict( output_list, inference_type=in_return_type, connectivity_type=connectivity_type, - path=path) + path=path, request_method=request_method) # Calculate model metrics try: @@ -356,7 +361,7 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ async def send_inference_request(idle_device, end_point_id, inference_url, input_list, output_list, inference_type="default", connectivity_type=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT, - path=None): + path=None, request_method="POST"): request_timeout_sec = FEDML_MODEL_CACHE.get_endpoint_settings(end_point_id) \ .get("request_timeout_sec", ClientConstants.INFERENCE_REQUEST_TIMEOUT) @@ -369,7 +374,8 @@ async def send_inference_request(idle_device, end_point_id, inference_url, input input_list, output_list, inference_type=inference_type, - timeout=request_timeout_sec) + timeout=request_timeout_sec, + method=request_method) logging.debug(f"Use http inference. 
return {response_ok}") return inference_response elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY: diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py index a5048c26a6..b58b8fae72 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py @@ -147,6 +147,10 @@ class ServerConstants(object): DEVICE_DIFF_ADD_OPERATION = "op: add" DEVICE_DIFF_DELETE_OPERATION = "op: delete" DEVICE_DIFF_REPLACE_OPERATION = "op: replace" + + # Worker comfig yaml related + ENABLE_SERVERLESS_CONTAINER_KEY = "enable_serverless_container" + @staticmethod def get_fedml_home_dir(): home_dir = expanduser("~") diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index eff26684b7..f95dd8e176 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -298,14 +298,7 @@ def process_deployment_result_message(self, topic=None, payload=None): return else: # This is the last worker that failed, so we should continue to "ABORTED" status - model_config_parameters = self.request_json["parameters"] - inference_port_external = ServerConstants.get_inference_master_gateway_port() - ip = GeneralConstants.get_ip_address(self.request_json) - if ip.startswith("http://") or ip.startswith("https://"): - model_inference_url = "{}/inference/{}".format(ip, end_point_id) - else: - model_inference_url = "http://{}:{}/inference/{}".format(ip, inference_port_external, - end_point_id) + model_inference_url = self.construct_final_gateway_url(end_point_id) self.send_deployment_status( end_point_id, end_point_name, payload_json["model_name"], model_inference_url, @@ -367,13 +360,7 @@ def process_deployment_result_message(self, topic=None, payload=None): """ When all the devices have finished the add / delete / update operation """ - inference_port_external = ServerConstants.get_inference_master_gateway_port() - ip = GeneralConstants.get_ip_address(request_json) - - if ip.startswith("http://") or ip.startswith("https://"): - model_inference_url = "{}/inference/{}".format(ip, end_point_id) - else: - model_inference_url = "http://{}:{}/inference/{}".format(ip, inference_port_external, end_point_id) + model_inference_url, inference_port_external = self.construct_final_gateway_url(end_point_id) # Send stage: MODEL_DEPLOYMENT_STAGE5 = "StartInferenceIngress" self.send_deployment_stages(end_point_id, model_name, model_id, @@ -394,7 +381,7 @@ def process_deployment_result_message(self, topic=None, payload=None): model_metadata = payload_json["model_metadata"] model_inputs = model_metadata["inputs"] - ret_inputs = list() + if "type" in model_metadata and model_metadata["type"] == "default": payload_json["input_json"] = {"end_point_name": end_point_name, "model_name": model_name, @@ -768,3 +755,22 @@ def build_dynamic_args(self, run_id, run_config, package_conf_object, base_dir): def build_dynamic_constrain_variables(self, run_id, run_config): pass + def construct_final_gateway_url(self, end_point_id): + inference_port_external = ServerConstants.get_inference_master_gateway_port() + ip = GeneralConstants.get_ip_address(self.request_json) + + identifier = "inference" + if self.deployed_replica_payload is not 
None: + payload_json = self.deployed_replica_payload + enable_custom_path = payload_json["model_metadata"].get( + ServerConstants.ENABLE_SERVERLESS_CONTAINER_KEY, False) + if enable_custom_path: + identifier = "custom_inference" + + if ip.startswith("http://") or ip.startswith("https://"): + model_inference_url = "{}/{}/{}".format(ip, identifier, end_point_id) + else: + model_inference_url = "http://{}:{}/{}/{}".format(ip, inference_port_external, identifier, + end_point_id) + return model_inference_url, inference_port_external + From c0f691c7fd468549ee311c8ae260ba9c5599a43e Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Thu, 20 Jun 2024 14:27:35 -0700 Subject: [PATCH 28/38] [Deploy] Nit. --- .../custom_inference_image/template.yaml | 22 +++++++++++++++++++ .../scheduler/comm_utils/job_monitor.py | 2 +- .../device_http_inference_protocol.py | 9 ++++---- .../device_model_deployment.py | 2 -- .../model_scheduler/device_model_inference.py | 1 - .../device_server_constants.py | 2 +- .../model_scheduler/master_job_runner.py | 12 ++++------ 7 files changed, 33 insertions(+), 17 deletions(-) create mode 100644 python/examples/deploy/custom_inference_image/template.yaml diff --git a/python/examples/deploy/custom_inference_image/template.yaml b/python/examples/deploy/custom_inference_image/template.yaml new file mode 100644 index 0000000000..10e6580bcf --- /dev/null +++ b/python/examples/deploy/custom_inference_image/template.yaml @@ -0,0 +1,22 @@ +# Required +workspace: "./" # We will package all the files in the workspace directory +enable_serverless_container: true # Indicate whether to use serverless container +inference_image_name: "" # Container image name +container_run_command: "" # str or list, similar to CMD in the dockerfile +port: 80 # Service port, currently you can only indicate one arbitrary port + +# Optional, these are the default values +readiness_probe: # Probe for checking whether a container is ready for inference + httpGet: + path: "" +environment_variables: {} # Environment variables inside the container +volumes: # Volumes to mount to the container + - workspace_path: "" # Path to the volume in the workspace + mount_path: "" # Path to mount the volume inside the container +deploy_timeout_sec: 900 # Maximum time waiting for deployment to finish (Does not include the time to pull the image) +request_input_example: {} # Example of input request, will be shown in the UI +registry_specs: # Registry information for pulling the image + registry_name: "" + registry_provider: "DockerHub" + registry_user_name: "" + registry_user_password: "" \ No newline at end of file diff --git a/python/fedml/computing/scheduler/comm_utils/job_monitor.py b/python/fedml/computing/scheduler/comm_utils/job_monitor.py index d216b46dad..667a54e565 100644 --- a/python/fedml/computing/scheduler/comm_utils/job_monitor.py +++ b/python/fedml/computing/scheduler/comm_utils/job_monitor.py @@ -772,7 +772,7 @@ def _lenient_check_replica_ready(deployment_result): liveliness_check = result_json.get("model_metadata", {}).get("liveliness_check", None) readiness_check = result_json.get("model_metadata", {}).get("readiness_check", None) - if liveliness_check is not None: + if liveliness_check: if liveliness_check == ClientConstants.LIVENESS_PROBE_DEFAULT: liveliness_check = readiness_check # Follow the readiness check pattern if not isinstance(liveliness_check, dict): diff --git a/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py
b/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py index 5b2658f0b3..28d50d5a50 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py @@ -25,6 +25,8 @@ async def is_inference_ready(inference_url, path="ready", timeout=None): url_parsed = urlparse(inference_url) ready_url = f"http://{url_parsed.hostname}:{url_parsed.port}/{path}" response_ok = False + + # TODO (Raphael): Support more methods and return codes rules. try: async with httpx.AsyncClient() as client: ready_response = await client.get(url=ready_url, timeout=timeout) @@ -109,11 +111,10 @@ async def redirect_non_stream_req_to_worker(inference_type, inference_url, model if response.status_code == 200: try: - if inference_type == "default": - model_inference_result = response.json() - elif inference_type == "image/png": + if inference_type == "image/png": + # wrapped media type for image binary_content: bytes = response.content - model_inference_result = Response(content=binary_content, media_type="image/png") + model_inference_result = Response(content=binary_content, media_type=inference_type) else: model_inference_result = response.json() except Exception as e: diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index e18081c324..552d7ffaca 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -2,7 +2,6 @@ import logging import os -import shutil import time import traceback import yaml @@ -12,7 +11,6 @@ import requests import torch import torch.nn -import tritonclient.http as http_client import collections.abc diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index 7ef9689c1c..9adc17538d 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -170,7 +170,6 @@ async def predict_with_end_point_id(end_point_id, request: Request, response: Re return inference_response -# @api.post('/custom_inference/{end_point_id}/{path:path}') @router.api_route("/custom_inference/{end_point_id}/{path:path}", methods=["POST", "GET"]) async def custom_inference(end_point_id, path: str, request: Request): # Get json data diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py index b58b8fae72..f86056229e 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py @@ -358,7 +358,7 @@ def get_inference_master_gateway_port(): # Use dotenv to load the environment variables fedml.load_env() master_inference_port = int(os.getenv(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, - default=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)) + default=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)) return master_inference_port diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index f95dd8e176..ab6bc4c895 100755 --- 
a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -360,7 +360,7 @@ def process_deployment_result_message(self, topic=None, payload=None): """ When all the devices have finished the add / delete / update operation """ - model_inference_url, inference_port_external = self.construct_final_gateway_url(end_point_id) + model_inference_url = self.construct_final_gateway_url(end_point_id) # Send stage: MODEL_DEPLOYMENT_STAGE5 = "StartInferenceIngress" self.send_deployment_stages(end_point_id, model_name, model_id, @@ -375,7 +375,7 @@ def process_deployment_result_message(self, topic=None, payload=None): payload_json = self.deployed_replica_payload model_slave_url = payload_json["model_url"] payload_json["model_url"] = model_inference_url - payload_json["port"] = inference_port_external + payload_json["port"] = ServerConstants.get_inference_master_gateway_port() token = FedMLModelCache.get_instance(self.redis_addr, self.redis_port).get_end_point_token( end_point_id, end_point_name, model_name) @@ -767,10 +767,6 @@ def construct_final_gateway_url(self, end_point_id): if enable_custom_path: identifier = "custom_inference" - if ip.startswith("http://") or ip.startswith("https://"): - model_inference_url = "{}/{}/{}".format(ip, identifier, end_point_id) - else: - model_inference_url = "http://{}:{}/{}/{}".format(ip, inference_port_external, identifier, - end_point_id) - return model_inference_url, inference_port_external + model_inference_url = "http://{}:{}/{}/{}".format(ip, inference_port_external, identifier, end_point_id) + return model_inference_url From 33fb5b45fc674d18d74e7f435d41e69ebfde703d Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Fri, 21 Jun 2024 14:21:16 -0700 Subject: [PATCH 29/38] [Deploy] Pass down the api key to container. 
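Once the worker injects the key, a custom serving container can read it back from its environment. A minimal, illustrative sketch (the variable name matches the constant added below; how the key is decrypted and used is left to the user image):

    import os

    # The worker sets FEDML_USER_ENCRYPTED_API_KEY when it creates the container.
    encrypted_key = os.environ.get("FEDML_USER_ENCRYPTED_API_KEY", "")
    if not encrypted_key:
        print("No encrypted API key was passed down to this container.")
    else:
        # Decryption and usage are up to the custom image; here we only confirm receipt.
        print(f"Received an encrypted API key of length {len(encrypted_key)}.")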
--- .../device_client_constants.py | 2 + .../model_scheduler/device_model_cache.py | 15 +++++- .../device_model_deployment.py | 47 ++++++++++++------- .../device_server_constants.py | 2 + .../master_protocol_manager.py | 25 +++++----- .../model_scheduler/worker_job_runner.py | 4 +- 6 files changed, 62 insertions(+), 33 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index e18c9f730b..4aee592fca 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -165,6 +165,8 @@ class ClientConstants(object): CUSTOMIZED_VOLUMES_PATH_FROM_WORKSPACE_KEY = "workspace_path" CUSTOMIZED_VOLUMES_PATH_FROM_CONTAINER_KEY = "mount_path" + ENV_USER_ENCRYPTED_API_KEY = "FEDML_USER_ENCRYPTED_API_KEY" + @staticmethod def get_fedml_home_dir(): home_dir = expanduser("~") diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py index c941c42102..b0021aa7df 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py @@ -112,7 +112,8 @@ def set_user_setting_replica_num(self, end_point_id, replica_num: int, enable_auto_scaling: bool = False, scale_min: int = 0, scale_max: int = 0, state: str = "UNKNOWN", target_queries_per_replica: int = 60, aggregation_window_size_seconds: int = 60, - scale_down_delay_seconds: int = 120, timeout_s: int = 30 + scale_down_delay_seconds: int = 120, timeout_s: int = 30, + user_encrypted_api_key: str = "" ) -> bool: """ Key: FEDML_MODEL_ENDPOINT_REPLICA_USER_SETTING_TAG-- @@ -139,7 +140,8 @@ def set_user_setting_replica_num(self, end_point_id, "target_queries_per_replica": target_queries_per_replica, "aggregation_window_size_seconds": aggregation_window_size_seconds, "scale_down_delay_seconds": scale_down_delay_seconds, - ServerConstants.INFERENCE_REQUEST_TIMEOUT_KEY: timeout_s + ServerConstants.INFERENCE_REQUEST_TIMEOUT_KEY: timeout_s, + ServerConstants.USER_ENCRYPTED_API_KEY: user_encrypted_api_key } try: self.redis_connection.set(self.get_user_setting_replica_num_key(end_point_id), json.dumps(replica_num_dict)) @@ -169,6 +171,15 @@ def update_user_setting_replica_num(self, end_point_id: str, state: str = "UNKNO return False return True + def get_user_encrypted_api_key(self, end_point_id: str) -> str: + try: + replica_num_dict = self.redis_connection.get(self.get_user_setting_replica_num_key(end_point_id)) + replica_num_dict = json.loads(replica_num_dict) + return replica_num_dict.get(ServerConstants.USER_ENCRYPTED_API_KEY, "") + except Exception as e: + logging.error(e) + return "" + def get_all_endpoints_user_setting(self) -> List[dict]: """ Return a list of dict, each dict is the user setting of an endpoint. 
diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 552d7ffaca..9416d243d2 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -19,6 +19,7 @@ from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants +from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache from ..scheduler_core.compute_utils import ComputeUtils from ..comm_utils.container_utils import ContainerUtils @@ -59,7 +60,9 @@ def request_gpu_ids_on_deployment(edge_id, end_point_id, num_gpus=None, master_d def start_deployment(end_point_id, end_point_name, model_id, model_version, model_storage_local_path, inference_model_name, inference_engine, infer_host, master_ip, edge_id, master_device_id=None, replica_rank=0, - gpu_per_replica=1): + gpu_per_replica=1, request_json=None): + if request_json is None: + request_json = dict() logging.info("[Worker] Model deployment is starting...") # Real gpu per replica (container-level) @@ -219,22 +222,9 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, if device_mapping: host_config_dict.update(device_mapping) - # Environment variables - enable_custom_image = False if relative_entry_fedml_format != "" else True - if not enable_custom_image: - # For some image, the default user is root. Unified to fedml. - environment["HOME"] = "/home/fedml" - environment["BOOTSTRAP_DIR"] = dst_bootstrap_dir - environment["FEDML_CURRENT_RUN_ID"] = end_point_id - environment["FEDML_CURRENT_EDGE_ID"] = edge_id - environment["FEDML_REPLICA_RANK"] = replica_rank - environment["FEDML_CURRENT_VERSION"] = fedml.get_env_version() - environment["FEDML_ENV_VERSION"] = fedml.get_env_version() - environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST"] = fedml.get_local_on_premise_platform_host() - environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_PORT"] = fedml.get_local_on_premise_platform_port() - if extra_envs is not None: - for key in extra_envs: - environment[key] = extra_envs[key] + # Handle the environment variables + handle_env_vars(environment, relative_entry_fedml_format, extra_envs, dst_bootstrap_dir, + end_point_id, edge_id, replica_rank, request_json) # Create the container try: @@ -612,6 +602,29 @@ def handle_volume_mount(volumes, binds, environment, relative_entry_fedml_format logging.warning(f"{workspace_path} does not exist, skip mounting it to the container") +def handle_env_vars(environment, relative_entry_fedml_format, extra_envs, dst_bootstrap_dir, end_point_id, edge_id, + replica_rank, request_json): + enable_custom_image = False if relative_entry_fedml_format != "" else True + if not enable_custom_image: + # For some image, the default user is root. Unified to fedml. 
+ environment["HOME"] = "/home/fedml" + + if request_json and ServerConstants.USER_ENCRYPTED_API_KEY in request_json: + environment[ClientConstants.ENV_USER_ENCRYPTED_API_KEY] = request_json[ServerConstants.USER_ENCRYPTED_API_KEY] + + environment["BOOTSTRAP_DIR"] = dst_bootstrap_dir + environment["FEDML_CURRENT_RUN_ID"] = end_point_id + environment["FEDML_CURRENT_EDGE_ID"] = edge_id + environment["FEDML_REPLICA_RANK"] = replica_rank + environment["FEDML_CURRENT_VERSION"] = fedml.get_env_version() + environment["FEDML_ENV_VERSION"] = fedml.get_env_version() + environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST"] = fedml.get_local_on_premise_platform_host() + environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_PORT"] = fedml.get_local_on_premise_platform_port() + if extra_envs is not None: + for key in extra_envs: + environment[key] = extra_envs[key] + + def check_container_readiness(inference_http_port, infer_host="127.0.0.1", request_input_example=None, readiness_check=ClientConstants.READINESS_PROBE_DEFAULT): response_from_client_container = is_client_inference_container_ready( diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py index f86056229e..c41b150bc2 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py @@ -108,6 +108,8 @@ class ServerConstants(object): INFERENCE_REQUEST_TIMEOUT_KEY = "request_timeout_sec" INFERENCE_REQUEST_TIMEOUT_DEFAULT = 30 + + USER_ENCRYPTED_API_KEY = "user_encrypted_api_key" # -----End----- MODEL_DEPLOYMENT_STAGE1 = {"index": 1, "text": "ReceivedRequest"} diff --git a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py index 7bfad2f3eb..5e16d5a02a 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py @@ -158,25 +158,20 @@ def callback_start_deployment(self, topic, payload): run_id = request_json["end_point_id"] end_point_name = request_json["end_point_name"] token = request_json["token"] - user_id = request_json["user_id"] - user_name = request_json["user_name"] - device_ids = request_json["device_ids"] device_objs = request_json["device_objs"] + enable_auto_scaling = request_json.get("enable_auto_scaling", False) + desired_replica_num = request_json.get("desired_replica_num", 1) + target_queries_per_replica = request_json.get("target_queries_per_replica", 10) + aggregation_window_size_seconds = request_json.get("aggregation_window_size_seconds", 60) + scale_down_delay_seconds = request_json.get("scale_down_delay_seconds", 120) + user_encrypted_api_key = request_json.get("encrypted_api_key", "") model_config = request_json["model_config"] model_name = model_config["model_name"] model_version = model_config["model_version"] model_id = model_config["model_id"] - model_storage_url = model_config["model_storage_url"] scale_min = model_config.get("instance_scale_min", 0) scale_max = model_config.get("instance_scale_max", 0) - inference_engine = model_config.get("inference_engine", 0) - enable_auto_scaling = request_json.get("enable_auto_scaling", False) - desired_replica_num = request_json.get("desired_replica_num", 1) - - target_queries_per_replica = request_json.get("target_queries_per_replica", 10) - 
aggregation_window_size_seconds = request_json.get("aggregation_window_size_seconds", 60) - scale_down_delay_seconds = request_json.get("scale_down_delay_seconds", 120) model_config_parameters = request_json.get("parameters", {}) timeout_s = model_config_parameters.get("request_timeout_sec", 30) @@ -193,6 +188,12 @@ def callback_start_deployment(self, topic, payload): request_json["end_point_id"]) request_json["is_fresh_endpoint"] = True if endpoint_device_info is None else False + if user_encrypted_api_key == "": + user_encrypted_api_key = (FedMLModelCache.get_instance(self.redis_addr, self.redis_port). + get_user_encrypted_api_key(run_id)) + if user_encrypted_api_key != "": # Pass the cached key to the workers + request_json[ServerConstants.USER_ENCRYPTED_API_KEY] = user_encrypted_api_key + # Save the user setting (about replica number) of this run to Redis, if existed, update it FedMLModelCache.get_instance(self.redis_addr, self.redis_port).set_user_setting_replica_num( end_point_id=run_id, end_point_name=end_point_name, model_name=model_name, model_version=model_version, @@ -201,7 +202,7 @@ def callback_start_deployment(self, topic, payload): aggregation_window_size_seconds=aggregation_window_size_seconds, target_queries_per_replica=target_queries_per_replica, scale_down_delay_seconds=int(scale_down_delay_seconds), - timeout_s=timeout_s + timeout_s=timeout_s, user_encrypted_api_key=user_encrypted_api_key ) # Start log processor for current run diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py index a892412d29..113a20e825 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py @@ -250,7 +250,7 @@ def run_impl(self, run_extend_queue_list, sender_message_center, inference_model_name=model_name, inference_engine=inference_engine, infer_host=worker_ip, master_ip=master_ip, edge_id=self.edge_id, master_device_id=device_ids[0], replica_rank=rank, - gpu_per_replica=int(self.replica_handler.gpu_per_replica) + gpu_per_replica=int(self.replica_handler.gpu_per_replica), request_json=self.request_json ) except Exception as e: inference_output_url = "" @@ -373,7 +373,7 @@ def run_impl(self, run_extend_queue_list, sender_message_center, inference_model_name=model_name, inference_engine=inference_engine, infer_host=worker_ip, master_ip=master_ip, edge_id=self.edge_id, master_device_id=device_ids[0], replica_rank=rank, - gpu_per_replica=int(self.replica_handler.gpu_per_replica) + gpu_per_replica=int(self.replica_handler.gpu_per_replica), request_json=self.request_json ) except Exception as e: inference_output_url = "" From f412a2637b6ae83f9fc1ecaa60b5205d4d43507d Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Fri, 21 Jun 2024 21:36:12 +0000 Subject: [PATCH 30/38] [Deploy] Nit. 
--- .../scheduler/model_scheduler/device_server_constants.py | 2 +- .../scheduler/model_scheduler/master_protocol_manager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py index c41b150bc2..00f0fe73bf 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py @@ -109,7 +109,7 @@ class ServerConstants(object): INFERENCE_REQUEST_TIMEOUT_KEY = "request_timeout_sec" INFERENCE_REQUEST_TIMEOUT_DEFAULT = 30 - USER_ENCRYPTED_API_KEY = "user_encrypted_api_key" + USER_ENCRYPTED_API_KEY = "encrypted_api_key" # -----End----- MODEL_DEPLOYMENT_STAGE1 = {"index": 1, "text": "ReceivedRequest"} diff --git a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py index 5e16d5a02a..9e0d51b588 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py @@ -164,7 +164,7 @@ def callback_start_deployment(self, topic, payload): target_queries_per_replica = request_json.get("target_queries_per_replica", 10) aggregation_window_size_seconds = request_json.get("aggregation_window_size_seconds", 60) scale_down_delay_seconds = request_json.get("scale_down_delay_seconds", 120) - user_encrypted_api_key = request_json.get("encrypted_api_key", "") + user_encrypted_api_key = request_json.get(ServerConstants.USER_ENCRYPTED_API_KEY, "") model_config = request_json["model_config"] model_name = model_config["model_name"] From d6c9411774318e812e7f0b4dd73478f2a88e4cb3 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Fri, 21 Jun 2024 15:00:33 -0700 Subject: [PATCH 31/38] [Deploy] Remove example. --- .../lorax/custom_inference_image.yaml | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml diff --git a/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml b/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml deleted file mode 100644 index 41cbe501d2..0000000000 --- a/python/examples/deploy/custom_inference_image/lorax/custom_inference_image.yaml +++ /dev/null @@ -1,16 +0,0 @@ -workspace: "./" - -enable_serverless_container: true -inference_image_name: "ghcr.io/predibase/lorax:main" -container_run_command: "--model-id mistralai/Mistral-7B-Instruct-v0.1" - -environment_variables: - HUGGING_FACE_HUB_TOKEN: "" - -readiness_probe: - httpGet: - path: "/health" - -port: 80 - -deploy_timeout_sec: 1600 From fa44ccce0a553f7c7d7dcceb5312b830061e718f Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 25 Jun 2024 11:48:17 -0700 Subject: [PATCH 32/38] [Deploy] Return custom path other than /predict. 
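In short, when the packaging YAML declares a service httpPost path, the worker should report that path instead of the default root. A rough sketch of the URL handling, with assumed names and example values rather than the exact code in the diff below:

    def build_worker_inference_url(host: str, port: int, customized_uri: dict = None) -> str:
        # Fall back to the container root when no customized service path is configured.
        path = ""
        if customized_uri and "httpPost" in customized_uri:
            path = customized_uri["httpPost"].get("path", "")
            if path and not path.startswith("/"):
                path = "/" + path
        return f"http://{host}:{port}{path}"

    # e.g. an OpenAI-compatible image that serves /v1/chat/completions
    print(build_worker_inference_url("127.0.0.1", 2345, {"httpPost": {"path": "v1/chat/completions"}}))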
--- .../device_client_constants.py | 1 + .../device_model_deployment.py | 108 +++++++++++------- 2 files changed, 67 insertions(+), 42 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index 4aee592fca..4006e50726 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -164,6 +164,7 @@ class ClientConstants(object): CUSTOMIZED_VOLUMES_MOUNT_KEY = "volumes" CUSTOMIZED_VOLUMES_PATH_FROM_WORKSPACE_KEY = "workspace_path" CUSTOMIZED_VOLUMES_PATH_FROM_CONTAINER_KEY = "mount_path" + CUSTOMIZED_SERVICE_KEY = "service" ENV_USER_ENCRYPTED_API_KEY = "FEDML_USER_ENCRYPTED_API_KEY" diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 9416d243d2..25fc1e1d64 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -87,36 +87,10 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, inference_image_name, image_pull_policy, registry_name, registry_provider, \ registry_user_name, registry_user_password = parse_image_registry_related_config(config) - # Bootstrap, job and entrypoint related - dst_model_serving_dir = "/home/fedml/models_serving" - bootstrap_cmds_str_frm_yaml = config.get('bootstrap', "") - job_cmds_str_frm_yaml = config.get('job', "") - - if bootstrap_cmds_str_frm_yaml != "" or job_cmds_str_frm_yaml != "": - auto_gen_bootstrap_file_name = "fedml-deploy-bootstrap-entry-auto-gen.sh" - src_bootstrap_file_path = os.path.join(model_storage_local_path, auto_gen_bootstrap_file_name) - with open(src_bootstrap_file_path, 'w') as f: - f.write("cd /home/fedml/models_serving/\n") - f.write(bootstrap_cmds_str_frm_yaml) - f.write("\n") - f.write("cd /home/fedml/models_serving/\n") - f.write(job_cmds_str_frm_yaml) - else: - src_bootstrap_file_path = "" - - if src_bootstrap_file_path != "": - dst_bootstrap_dir = os.path.join(dst_model_serving_dir, auto_gen_bootstrap_file_name) - else: - dst_bootstrap_dir = "" - - # If the entry point is in fedml format (e.g., "main.py") - relative_entry_fedml_format = config.get('entry_point', "") - - # User indicate either fedml format python main entry filename or entry command - enable_serverless_container = config.get(ClientConstants.ENABLE_SERVERLESS_CONTAINER_KEY, False) - customized_image_entry_cmd = config.get('container_run_command', None) # Could be str or list - customized_readiness_check = config.get('readiness_probe', ClientConstants.READINESS_PROBE_DEFAULT) - customized_liveliness_check = config.get('liveness_probe', ClientConstants.LIVENESS_PROBE_DEFAULT) + # Service app related + dst_bootstrap_dir, dst_model_serving_dir, relative_entry_fedml_format, enable_serverless_container, \ + customized_image_entry_cmd, customized_readiness_check, customized_liveliness_check, customized_uri = \ + handle_container_service_app(config, model_storage_local_path) # Storage related src_code_dir = os.path.join(model_storage_local_path, config.get('source_code_dir', "")) @@ -451,7 +425,7 @@ def parse_image_registry_related_config(config): def is_client_inference_container_ready(infer_url_host, inference_http_port, readiness_check=ClientConstants.READINESS_PROBE_DEFAULT, - 
request_input_example=None, container_id=None): + request_input_example=None, container_id=None, customized_uri=None): # Construct the model metadata (input and output) model_metadata = {} if request_input_example is not None and len(request_input_example) > 0: @@ -461,6 +435,7 @@ def is_client_inference_container_ready(infer_url_host, inference_http_port, model_metadata["outputs"] = [] model_metadata["type"] = "default" + # Check the readiness of the container if readiness_check == ClientConstants.READINESS_PROBE_DEFAULT: default_client_container_ready_url = "http://{}:{}/ready".format("0.0.0.0", inference_http_port) response = None @@ -486,27 +461,38 @@ def is_client_inference_container_ready(infer_url_host, inference_http_port, else: if not check_path.startswith("/"): check_path = "/" + check_path - readiness_check_url = f"http://{infer_url_host}:{inference_http_port}{check_path}" - response = None try: - response = requests.get(readiness_check_url) + response = requests.get(f"http://{infer_url_host}:{inference_http_port}{check_path}") except: pass if not response or response.status_code != 200: return "", "", {}, {} - - return readiness_check_url, None, model_metadata, None else: logging.error("'path' is not specified in httpGet readiness check") return "", "", {}, {} elif "exec" in readiness_check: - # TODO(raphael): Support arbitrary readiness check command by using - # container id and docker exec - return "http://{}:{}/".format(infer_url_host, inference_http_port), None, model_metadata, None + # TODO(raphael): Support arbitrary readiness check command by using container id and docker exec + pass else: # Ref K8S, if no readiness check, we assume the container is ready immediately - return "http://{}:{}/".format(infer_url_host, inference_http_port), None, model_metadata, None + pass + + # Construct the customized URI + path = "" + if customized_uri is not None: + if "httpPost" in customized_uri and "path" in customized_uri["httpPost"]: + path = customized_uri["httpPost"]["path"] + if not isinstance(path, str): + logging.error(f"Invalid path type: {path}, expected str") + return "", "", {}, {} + else: + if not path.startswith("/"): + path = "/" + path + # TODO(raphael): Finalized more customized URI types + readiness_check_url = f"http://{infer_url_host}:{inference_http_port}{path}" + + return readiness_check_url, None, model_metadata, None def _handle_union_volume_mount(binds, volumes, environment, data_cache_dir_input=None): @@ -602,6 +588,43 @@ def handle_volume_mount(volumes, binds, environment, relative_entry_fedml_format logging.warning(f"{workspace_path} does not exist, skip mounting it to the container") +def handle_container_service_app(config, model_storage_local_path): + # Bootstrap, job and entrypoint related + dst_model_serving_dir = "/home/fedml/models_serving" + bootstrap_cmds_str_frm_yaml = config.get('bootstrap', "") + job_cmds_str_frm_yaml = config.get('job', "") + + auto_gen_bootstrap_file_name = "fedml-deploy-bootstrap-entry-auto-gen.sh" + if bootstrap_cmds_str_frm_yaml != "" or job_cmds_str_frm_yaml != "": + src_bootstrap_file_path = os.path.join(model_storage_local_path, auto_gen_bootstrap_file_name) + with open(src_bootstrap_file_path, 'w') as f: + f.write("cd /home/fedml/models_serving/\n") + f.write(bootstrap_cmds_str_frm_yaml) + f.write("\n") + f.write("cd /home/fedml/models_serving/\n") + f.write(job_cmds_str_frm_yaml) + else: + src_bootstrap_file_path = "" + + if src_bootstrap_file_path != "": + dst_bootstrap_dir = os.path.join(dst_model_serving_dir, 
auto_gen_bootstrap_file_name) + else: + dst_bootstrap_dir = "" + + # If the entry point is in fedml format (e.g., "main.py") + relative_entry_fedml_format = config.get('entry_point', "") + + # User indicate either fedml format python main entry filename or entry command + enable_serverless_container = config.get(ClientConstants.ENABLE_SERVERLESS_CONTAINER_KEY, False) + customized_image_entry_cmd = config.get('container_run_command', None) # Could be str or list + customized_readiness_check = config.get('readiness_probe', ClientConstants.READINESS_PROBE_DEFAULT) + customized_liveliness_check = config.get('liveness_probe', ClientConstants.LIVENESS_PROBE_DEFAULT) + customized_uri = config.get(ClientConstants.CUSTOMIZED_SERVICE_KEY, "") + + return (dst_bootstrap_dir, dst_model_serving_dir, relative_entry_fedml_format, enable_serverless_container, + customized_image_entry_cmd, customized_readiness_check, customized_liveliness_check, customized_uri) + + def handle_env_vars(environment, relative_entry_fedml_format, extra_envs, dst_bootstrap_dir, end_point_id, edge_id, replica_rank, request_json): enable_custom_image = False if relative_entry_fedml_format != "" else True @@ -626,10 +649,11 @@ def handle_env_vars(environment, relative_entry_fedml_format, extra_envs, dst_bo def check_container_readiness(inference_http_port, infer_host="127.0.0.1", request_input_example=None, - readiness_check=ClientConstants.READINESS_PROBE_DEFAULT): + readiness_check=ClientConstants.READINESS_PROBE_DEFAULT, + customized_uri=None): response_from_client_container = is_client_inference_container_ready( infer_host, inference_http_port, readiness_check=readiness_check, - request_input_example=request_input_example) + request_input_example=request_input_example, customized_uri=customized_uri) return response_from_client_container From bd89be1a1f01f0ff1528cd1766c2a22a25af5975 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 25 Jun 2024 11:50:04 -0700 Subject: [PATCH 33/38] [Deploy] Add sqlite backup for get_all_deployment_result_list. 
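The backup added below follows a cache-aside pattern: read the replica results from Redis first, and only when the list is empty reload them from the SQLite store and re-warm Redis (the real change also re-serializes each row's replica_info field). A simplified sketch with assumed handles (redis_conn, backup_db, key) instead of the real cache objects:

    def get_all_results(redis_conn, backup_db, key):
        # Prefer the in-memory copy.
        results = redis_conn.lrange(key, 0, -1)
        if results:
            return results
        # Redis lost the entries (restart, eviction, ...), so fall back to the persistent copy.
        results = backup_db.get_all_deployment_results_list()
        for item in results:
            redis_conn.rpush(key, item)  # re-warm the cache for subsequent readers
        return results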
--- .../scheduler/comm_utils/constants.py | 1 - .../scheduler/comm_utils/job_monitor.py | 2 +- .../model_scheduler/device_model_cache.py | 30 +++++++++-- .../model_scheduler/device_model_db.py | 51 +++++++++++++++++-- .../model_scheduler/worker_job_runner.py | 2 +- 5 files changed, 75 insertions(+), 11 deletions(-) diff --git a/python/fedml/computing/scheduler/comm_utils/constants.py b/python/fedml/computing/scheduler/comm_utils/constants.py index 67b9d8b14b..6e46ce207b 100644 --- a/python/fedml/computing/scheduler/comm_utils/constants.py +++ b/python/fedml/computing/scheduler/comm_utils/constants.py @@ -114,7 +114,6 @@ class SchedulerConstants: REDIS_PORT = "6379" REDIS_PASSWORD = "fedml_default" - @staticmethod def get_log_source(run_json): run_config = run_json.get("run_config", {}) diff --git a/python/fedml/computing/scheduler/comm_utils/job_monitor.py b/python/fedml/computing/scheduler/comm_utils/job_monitor.py index 667a54e565..b8237d93ba 100644 --- a/python/fedml/computing/scheduler/comm_utils/job_monitor.py +++ b/python/fedml/computing/scheduler/comm_utils/job_monitor.py @@ -210,6 +210,7 @@ def monitor_replicas_number(): endpoint_replicas_details = {} if isinstance(endpoint_detail, str): endpoint_replicas_details = json.loads(endpoint_detail) + # TODO: Check out this nested json if isinstance(endpoint_replicas_details, str): endpoint_replicas_details = json.loads(endpoint_replicas_details) @@ -222,7 +223,6 @@ def monitor_replicas_number(): endpoint_replica_details["end_point_id"], 0) + 1 for endpoint_id, num_replica in res_to_mlops.items(): - curr_version = fedml.get_env_version() num_replica_url_path = "fedmlModelServer/api/v1/endpoint/replica-info" mlops_prefix = fedml._get_backend_service() url = f"{mlops_prefix}/{num_replica_url_path}" diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py index b0021aa7df..1836971075 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py @@ -302,7 +302,27 @@ def get_all_deployment_result_list(self): result_list.extend(self.redis_connection.lrange(key, 0, -1)) except Exception as e: logging.error(e) - # TODO(Raphael): Use Sqlite for the replica backup + + # Get cached results from the persist sqlite database + if len(result_list) <= 0: + db_result_list = list() + try: + db_result_list = self.model_deployment_db.get_all_deployment_results_list() + except Exception as e: + logging.error(f"Failed to get all deployment results from the database due to {e}") + pass + + for result in db_result_list: + try: + self.redis_connection.rpush(self.get_deployment_result_key( + result["end_point_id"], result["end_point_name"], result["model_name"]), + json.dumps(result["replica_info"])) + except Exception as e: + logging.error(e) + pass + + for result in db_result_list: + result_list.append(result["replica_info"]) return result_list @@ -330,7 +350,8 @@ def get_deployment_status_list_size(self, end_point_id, end_point_name, model_na status_list = self.get_deployment_status_list(end_point_id, end_point_name, model_name) return len(status_list) - def get_status_item_info(self, status_item): + @staticmethod + def get_status_item_info(status_item): status_item_json = json.loads(status_item) if isinstance(status_item_json, str): status_item_json = json.loads(status_item_json) @@ -341,7 +362,8 @@ def get_status_item_info(self, status_item): status_payload = 
status_item_json["status"] return device_id, status_payload - def get_result_item_info(self, result_item): + @staticmethod + def get_result_item_info(result_item): result_item_json = json.loads(result_item) if isinstance(result_item_json, str): result_item_json = json.loads(result_item_json) @@ -386,7 +408,7 @@ def get_idle_device(self, return None, None # # Randomly shuffle - # shuffle the list of deployed devices and get the first one as the target idle device. + # the list of deployed devices and get the first one as the target idle device. # if len(idle_device_list) <= 0: # return None, None # shuffle(idle_device_list) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_db.py b/python/fedml/computing/scheduler/model_scheduler/device_model_db.py index 09573a1d1b..606d8c010b 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_db.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_db.py @@ -10,6 +10,7 @@ from sqlalchemy.ext.declarative import declarative_base from fedml.core.common.singleton import Singleton from sqlalchemy.sql import text +from typing import List, Dict Base = declarative_base() @@ -42,9 +43,11 @@ def set_deployment_status(self, end_point_id, end_point_name, model_name, model_ self.set_deployment_results_info(end_point_id, end_point_name, model_name, model_version, device_id, deployment_status=deployment_status, replica_no=replica_no) - def get_deployment_result_list(self, end_point_id, end_point_name, model_name, model_version=None): + def get_deployment_result_list(self, end_point_id, end_point_name, model_name, model_version=None) -> List[str]: """ - query from sqlite db using e_id + Get the orm use get_deployment_results_info, + but (1) nested results with cache_device_id, cache_replica_no. + (2) return a list of json string, so that redis can store it. """ result_list = self.get_deployment_results_info(end_point_id, end_point_name, model_name, model_version) ret_result_list = list() @@ -55,6 +58,39 @@ def get_deployment_result_list(self, end_point_id, end_point_name, model_name, m ret_result_list.append(json.dumps(result_dict)) return ret_result_list + def get_all_deployment_results_list(self) -> List[Dict]: + """ + Similar to _get_all_deployment_results_info, + but return a list of json string, so that redis can store it. + + return a list of dict, for each item: + [ + { + "end_point_id": "", + "end_point_name": "", + "model_name":"", + "replica_res": "" # Json string + }, + ] + value in the dict is a string that contains the deployment result. 
+ """ + flat_ep_list = self._get_all_deployment_results_info() + ret_result_list = list() + for result in flat_ep_list: + result_dict = { + "end_point_id": result.end_point_id, + "end_point_name": result.end_point_name, + "model_name": result.model_name, + "replica_info": json.dumps( + { + "cache_device_id": result.device_id, + "cache_replica_no": int(result.replica_no), + "result": result.deployment_result + } + ) + } + ret_result_list.append(result_dict) + return ret_result_list def get_deployment_status_list(self, end_point_id, end_point_name, model_name, model_version=None): result_list = self.get_deployment_results_info(end_point_id, end_point_name, model_name, model_version) @@ -156,7 +192,8 @@ def delete_deployment_run_info(self, end_point_id): end_point_id=f'{end_point_id}').delete() self.db_connection.commit() - def get_result_item_info(self, result_item): + @staticmethod + def get_result_item_info(result_item): result_item_json = json.loads(result_item) if isinstance(result_item_json, dict): result_item_json = json.loads(result_item) @@ -169,7 +206,8 @@ def get_result_item_info(self, result_item): result_payload = result_item_json["result"] return device_id, replica_no, result_payload - def get_status_item_info(self, status_item): + @staticmethod + def get_status_item_info(status_item): status_item_json = json.loads(status_item) if isinstance(status_item_json, dict): status_item_json = json.loads(status_item) @@ -320,6 +358,11 @@ def get_deployment_results_info(self, end_point_id, end_point_name, model_name, FedMLDeploymentResultInfoModel.model_version == f'{model_version}')).all() return result_info + def _get_all_deployment_results_info(self): + self.open_job_db() + result_info = self.db_connection.query(FedMLDeploymentResultInfoModel).all() + return result_info + def set_deployment_results_info(self, end_point_id, end_point_name, model_name, model_version, device_id, deployment_result=None, deployment_status=None, replica_no=None): diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py index 113a20e825..c73630fb65 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py @@ -260,7 +260,7 @@ def run_impl(self, run_extend_queue_list, sender_message_center, logging.error("[Worker] Failed to deploy the model.") # Send failed result back to master - result_payload = self.send_deployment_results( + _ = self.send_deployment_results( end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED, model_id, model_name, inference_output_url, inference_model_version, inference_port, inference_engine, model_metadata, model_config) From 43f99cf0acf9df685272fc02a5890981ac3d0ee2 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 25 Jun 2024 11:55:14 -0700 Subject: [PATCH 34/38] [Deploy] Nit. 
--- .../scheduler/model_scheduler/device_model_deployment.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 25fc1e1d64..a47f9dbc20 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -245,7 +245,8 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, inference_output_url, running_model_version, ret_model_metadata, ret_model_config = \ check_container_readiness(inference_http_port=inference_http_port, infer_host=infer_host, readiness_check=customized_readiness_check, - request_input_example=request_input_example) + request_input_example=request_input_example, + customized_uri=customized_uri) if inference_output_url == "": return running_model_name, "", None, None, None From 766c52aaf7a5b1dd567e2b91730780a05c594d36 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 25 Jun 2024 11:59:14 -0700 Subject: [PATCH 35/38] [Deploy] Nit. --- .../trt-llm-openai/config.yaml | 22 +++++++++++++++++++ .../device_model_deployment.py | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 python/examples/deploy/custom_inference_image/trt-llm-openai/config.yaml diff --git a/python/examples/deploy/custom_inference_image/trt-llm-openai/config.yaml b/python/examples/deploy/custom_inference_image/trt-llm-openai/config.yaml new file mode 100644 index 0000000000..1bdcf32f75 --- /dev/null +++ b/python/examples/deploy/custom_inference_image/trt-llm-openai/config.yaml @@ -0,0 +1,22 @@ +workspace: "./" + +inference_image_name: "fedml/trt-llm-openai" + +# The image has its self-contained cmd, no need for rewriting the command +container_run_command: null + +port: 3000 + +readiness_probe: + httpGet: + path: "/health_check" + +# If you do not use serverless container mode, and you want to indicate another resource path, +# e.g. localhost:3000/v1/chat/completions, you can set the following uri: +service: + httpPost: + path: "/v1/chat/completions" + +deploy_timeout_sec: 1600 + +endpoint_api_type: "text2text_llm_openai_chat_completions" \ No newline at end of file diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index a47f9dbc20..665bb4082e 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -233,7 +233,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, raise Exception("Failed to get the port allocation") time.sleep(3) - # Logging the info from the container when starting + # Logging the info from the container when initializing log_deployment_output(end_point_id, model_id, default_server_container_name, ClientConstants.CMD_TYPE_RUN_DEFAULT_SERVER, inference_model_name, inference_engine, inference_http_port, inference_type, From 0c29c4990d9f8f06940d3f3a658f9ffd1f0ddc86 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 25 Jun 2024 17:22:02 -0700 Subject: [PATCH 36/38] [Deploy] Hot fix hash exist. 
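Sketch of the guard introduced here (and tightened again in the last patch of this series): an empty or missing endpoint id must short-circuit to zero before any Redis hash lookup. The connection handle and hash name below are placeholders:

    def pending_requests_counter(redis_conn, counter_hash, end_point_id):
        # Never hand an empty or None field name to Redis; treat it as "no pending requests".
        if not end_point_id:
            return 0
        if redis_conn.hexists(counter_hash, end_point_id):
            return int(redis_conn.hget(counter_hash, end_point_id))
        return 0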
--- .../computing/scheduler/model_scheduler/device_model_cache.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py index b0021aa7df..0d92466169 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py @@ -990,6 +990,8 @@ def delete_endpoint_scaling_down_decision_time(self, end_point_id) -> bool: end_point_id)) def get_pending_requests_counter(self, end_point_id) -> int: + if not end_point_id: + return 0 # If the endpoint does not exist inside the Hash collection, set its counter to 0. if self.redis_connection.hexists(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id): return int(self.redis_connection.hget(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id)) From 36378f876018163508f03592fca556afa3a9ec8f Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 25 Jun 2024 17:52:33 -0700 Subject: [PATCH 37/38] [Deploy] Indicate worker connection type through cli and api. --- python/fedml/api/__init__.py | 12 ++++++++---- python/fedml/api/modules/device.py | 8 +++++--- python/fedml/cli/modules/login.py | 12 ++++++++++-- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/python/fedml/api/__init__.py b/python/fedml/api/__init__.py index f753e4255b..b03c72b675 100755 --- a/python/fedml/api/__init__.py +++ b/python/fedml/api/__init__.py @@ -213,16 +213,20 @@ def fedml_build(platform, type, source_folder, entry_point, config_folder, dest_ def login(api_key, computing, server, supplier, master_inference_gateway_port: int = ServerConstants.MODEL_INFERENCE_DEFAULT_PORT, - worker_inference_proxy_port: int = ClientConstants.LOCAL_CLIENT_API_PORT): - device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port) + worker_inference_proxy_port: int = ClientConstants.LOCAL_CLIENT_API_PORT, + worker_connection_type: str = ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT): + device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port, + worker_connection_type) def logout(computing, server): device_unbind(computing, server) -def device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port): - device.bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port) +def device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port, + worker_connection_type): + device.bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port, + worker_connection_type) def device_unbind(computing, server): diff --git a/python/fedml/api/modules/device.py b/python/fedml/api/modules/device.py index 14591147a6..7c4e52c8b5 100644 --- a/python/fedml/api/modules/device.py +++ b/python/fedml/api/modules/device.py @@ -21,7 +21,8 @@ def bind( api_key, computing, server, supplier, master_inference_gateway_port=DeviceServerConstants.MODEL_INFERENCE_DEFAULT_PORT, - worker_inference_proxy_port=DeviceClientConstants.LOCAL_CLIENT_API_PORT + worker_inference_proxy_port=DeviceClientConstants.LOCAL_CLIENT_API_PORT, + worker_connection_type=DeviceClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT ): userid = api_key runner_cmd = "{}" @@ -47,13 +48,13 @@ def bind( _bind( userid, computing, server, api_key, role, runner_cmd, 
device_id, os_name, - docker, master_inference_gateway_port, worker_inference_proxy_port) + docker, master_inference_gateway_port, worker_inference_proxy_port, worker_connection_type) def _bind( userid, computing, server, api_key, role, runner_cmd, device_id, os_name, - docker, master_inference_gateway_port, worker_inference_proxy_port): + docker, master_inference_gateway_port, worker_inference_proxy_port, worker_connection_type): fedml.load_env() if os.getenv(ModuleConstants.ENV_FEDML_INFER_HOST) is None: fedml.set_env_kv(ModuleConstants.ENV_FEDML_INFER_HOST, SchedulerConstants.REDIS_INFER_HOST) @@ -66,6 +67,7 @@ def _bind( fedml.set_env_kv(DeviceServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, str(master_inference_gateway_port)) fedml.set_env_kv(DeviceClientConstants.ENV_CLIENT_PROXY_PORT_KEY, str(worker_inference_proxy_port)) + fedml.set_env_kv(DeviceClientConstants.ENV_CONNECTION_TYPE_KEY, worker_connection_type) url = fedml._get_backend_service() platform_name = platform.system() diff --git a/python/fedml/cli/modules/login.py b/python/fedml/cli/modules/login.py index f3c982f456..7ec4191a3e 100644 --- a/python/fedml/cli/modules/login.py +++ b/python/fedml/cli/modules/login.py @@ -67,10 +67,17 @@ default=ClientConstants.LOCAL_CLIENT_API_PORT, help="The port for worker inference proxy.", ) +@click.option( + "--worker_connection_type", + "-wct", + type=str, + default=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT, + help="The connection type for worker inference proxy.", +) def fedml_login( api_key, version, compute_node, server, provider, deploy_worker_num, local_on_premise_platform, local_on_premise_platform_port, - master_inference_gateway_port, worker_inference_proxy_port + master_inference_gateway_port, worker_inference_proxy_port, worker_connection_type ): fedml.set_env_version(version) fedml.set_local_on_premise_platform_host(local_on_premise_platform) @@ -84,4 +91,5 @@ def fedml_login( print(f"Maybe you are using account id to login, we will try to login with account {api_key}.") pass os.environ["FEDML_MODEL_WORKER_NUM"] = str(deploy_worker_num) - fedml.api.login(api_key, compute_node, server, provider, master_inference_gateway_port, worker_inference_proxy_port) + fedml.api.login(api_key, compute_node, server, provider, master_inference_gateway_port, + worker_inference_proxy_port, worker_connection_type) From 5097ff29bf48b7f6d8c097721d96e44f421a4192 Mon Sep 17 00:00:00 2001 From: Raphael Jin Date: Tue, 25 Jun 2024 18:01:29 -0700 Subject: [PATCH 38/38] [Deploy] Nit. --- .../scheduler/model_scheduler/device_model_cache.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py index 0d92466169..7e79126fa6 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py @@ -989,11 +989,9 @@ def delete_endpoint_scaling_down_decision_time(self, end_point_id) -> bool: self.FEDML_MODEL_ENDPOINT_SCALING_DOWN_DECISION_TIME_TAG, end_point_id)) - def get_pending_requests_counter(self, end_point_id) -> int: - if not end_point_id: - return 0 + def get_pending_requests_counter(self, end_point_id=None) -> int: # If the endpoint does not exist inside the Hash collection, set its counter to 0. 
- if self.redis_connection.hexists(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id): + if end_point_id and self.redis_connection.hexists(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id): return int(self.redis_connection.hget(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id)) return 0
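Taken together with patch 37 above, the new worker connection type can be set either from the CLI (for example: fedml login <API_KEY> -wct http_proxy) or through the Python API. A usage sketch, assuming fedml.api is importable as in the CLI module and using a placeholder key; "http_proxy" is one of the accepted values (http, http_proxy, mqtt):

    import fedml.api

    # Bind this machine as a compute node and route worker inference traffic through the HTTP proxy.
    fedml.api.login(
        api_key="YOUR_API_KEY",  # placeholder
        computing=True,
        server=False,
        supplier=False,
        worker_connection_type="http_proxy",
    )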