Commit
Merge branch 'dev/v0.7.0' into alexleung/dev_v070_for_refactor
Showing 37 changed files with 987 additions and 895 deletions.
13 changes: 0 additions & 13 deletions
python/examples/deploy/custom_inference_image/custom_inference_image.yaml
This file was deleted.
16 changes: 0 additions & 16 deletions
python/examples/deploy/custom_inference_image/serve_main.py
This file was deleted.
22 changes: 22 additions & 0 deletions
python/examples/deploy/custom_inference_image/template.yaml
@@ -0,0 +1,22 @@
# Required
workspace: "./"                    # We will package all the files in the workspace directory
enable_serverless_container: true  # Whether to use a serverless container
inference_image_name: ""           # Container image name
container_run_command: ""          # String or list, similar to CMD in a Dockerfile
port: 80                           # Service port; currently you can only indicate one arbitrary port

# Optional, these are the default values
readiness_probe:                   # Probe for checking whether a container is ready for inference
  httpGet:
    path: ""
environment_variables: {}          # Environment variables inside the container
volumes:                           # Volumes to mount into the container
  - workspace_path: ""             # Path to the volume in the workspace
    mount_path: ""                 # Path to mount the volume inside the container
deploy_timeout_sec: 900            # Maximum time to wait for deployment to finish (does not include the time to pull the image)
request_input_example: {}          # Example of an input request, shown in the UI
registry_specs:                    # Registry information for pulling the image
  registry_name: ""
  registry_provider: "DockerHub"
  registry_user_name: ""
  registry_user_password: ""
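For intuition, the readiness_probe fields describe a simple contract: the platform polls the httpGet path on the service port until the container answers HTTP 200, and only then routes traffic to it. The polling loop below is a minimal sketch of that contract, not FedML's actual implementation; the host, interval, and status handling are illustrative assumptions.

import time
import requests

def wait_until_ready(host="localhost", port=80, path="/", timeout_sec=900):
    """Poll http://host:port/path until it returns 200 or the timeout expires."""
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        try:
            if requests.get(f"http://{host}:{port}{path}", timeout=5).status_code == 200:
                return True
        except requests.RequestException:
            pass  # container not up yet; keep polling
        time.sleep(5)
    return False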
17 changes: 17 additions & 0 deletions
python/examples/deploy/custom_inference_image/tensorrt_llm/tensorrtllm.yaml
@@ -0,0 +1,17 @@
workspace: "./"

enable_serverless_container: true
inference_image_name: "fedml/llama3-8b-tensorrtllm"

# If you put the model repository in $workspace/model_repository, it will be mounted to /home/fedml/models_serving/model_repository
container_run_command: ["sh", "-c", "cd / && huggingface-cli login --token $your_hf_token && pip install sentencepiece protobuf && python3 tensorrtllm_backend/scripts/launch_triton_server.py --model_repo tensorrtllm_backend/all_models/inflight_batcher_llm --world_size 1 && tail -f /dev/null"]

readiness_probe:
  httpGet:
    path: "/v2/health/ready"

port: 8000

deploy_timeout_sec: 1600
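Once the probe at /v2/health/ready succeeds, the Triton server can be queried directly through its generate endpoint. A client sketch follows; the model name "ensemble" assumes the usual layout of tensorrtllm_backend's all_models/inflight_batcher_llm repository, so verify it against your own model repository before relying on it.

import requests

resp = requests.post(
    "http://localhost:8000/v2/models/ensemble/generate",
    json={"text_input": "What is machine learning?", "max_tokens": 64},
)
resp.raise_for_status()
print(resp.json()["text_output"])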
20 changes: 20 additions & 0 deletions
...eploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml
@@ -0,0 +1,20 @@
workspace: "./"

enable_serverless_container: true
inference_image_name: "nvcr.io/nvidia/tritonserver:24.05-py3"

volumes:
  - workspace_path: "./model_repository"
    mount_path: "/repo_inside_container"

container_run_command: "tritonserver --model-repository=/repo_inside_container"

readiness_probe:
  httpGet:
    path: "/v2/health/ready"

port: 8000

deploy_timeout_sec: 1600

request_input_example: {"text_input": "Hello"}
25 changes: 25 additions & 0 deletions
...custom_inference_image/triton_inference_server/template/model_repository/dummy/1/model.py
@@ -0,0 +1,25 @@
import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def initialize(self, args):
        self.model_name = args['model_name']

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        # Declare a single variable-length string input/output pair and disable batching.
        auto_complete_model_config.add_input({"name": "text_input", "data_type": "TYPE_STRING", "dims": [-1]})
        auto_complete_model_config.add_output({"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]})
        auto_complete_model_config.set_max_batch_size(0)
        return auto_complete_model_config

    def execute(self, requests):
        responses = []
        for request in requests:
            # In this demo, Triton passes in a numpy array of size 1 with object_ dtype;
            # that dtype encapsulates a Python bytes array.
            in_numpy = pb_utils.get_input_tensor_by_name(request, "text_input").as_numpy()
            assert np.object_ == in_numpy.dtype
            print('in this demo len(in_numpy) is 1:', len(in_numpy.tolist()))
            out_numpy = np.array(
                [(self.model_name + ': ' + python_byte_array.decode('utf-8') + ' World').encode('utf-8')
                 for python_byte_array in in_numpy.tolist()],
                dtype=np.object_)
            out_pb = pb_utils.Tensor("text_output", out_numpy)
            responses.append(pb_utils.InferenceResponse(output_tensors=[out_pb]))
        return responses
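A minimal client sketch for the dummy model above, using Triton's standard KServe v2 HTTP inference API on the port declared in the YAML. It assumes a local deployment and that the directory name model_repository/dummy gives the model its name.

import requests

payload = {
    "inputs": [
        {"name": "text_input", "shape": [1], "datatype": "BYTES", "data": ["Hello"]}
    ]
}
resp = requests.post("http://localhost:8000/v2/models/dummy/infer", json=payload)
resp.raise_for_status()
# Expected response data: ["dummy: Hello World"]
print(resp.json()["outputs"][0]["data"])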
22 changes: 22 additions & 0 deletions
python/examples/deploy/custom_inference_image/trt-llm-openai/config.yaml
@@ -0,0 +1,22 @@
workspace: "./"

inference_image_name: "fedml/trt-llm-openai"

# The image has a self-contained CMD; there is no need to override the run command
container_run_command: null

port: 3000

readiness_probe:
  httpGet:
    path: "/health_check"

# If you do not use serverless container mode and want to indicate another resource path,
# e.g. localhost:3000/v1/chat/completions, you can set the following URI:
service:
  httpPost:
    path: "/v1/chat/completions"

deploy_timeout_sec: 1600

endpoint_api_type: "text2text_llm_openai_chat_completions"
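Because the service path follows the OpenAI chat completions convention, any plain HTTP client can exercise the endpoint. A sketch assuming a local deployment on port 3000; the "model" field is a placeholder whose accepted values depend on the image itself.

import requests

resp = requests.post(
    "http://localhost:3000/v1/chat/completions",
    json={
        "model": "local",
        "messages": [{"role": "user", "content": "Hello"}],
    },
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])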