Merge branch 'dev/v0.7.0' into alexleung/dev_v070_for_refactor
fedml-alex authored Jun 26, 2024
2 parents 0db8666 + 7193577 commit a932082
Showing 37 changed files with 987 additions and 895 deletions.
48 changes: 0 additions & 48 deletions python/examples/deploy/custom_inference_image/README.md

This file was deleted.


16 changes: 0 additions & 16 deletions python/examples/deploy/custom_inference_image/serve_main.py

This file was deleted.

22 changes: 22 additions & 0 deletions python/examples/deploy/custom_inference_image/template.yaml
@@ -0,0 +1,22 @@
# Required
workspace: "./" # We will package all the files in the workspace directory
enable_serverless_container: true # Indicate whether to use a serverless container
inference_image_name: "" # Container image name
container_run_command: "" # str or list, similar to CMD in the dockerfile
port: 80 # Service port; currently only a single port can be specified

# Optional, these are the default values
readiness_probe: # Probe for checking whether a container is ready for inference
httpGet:
path: ""
environment_variables: {} # Environment variables inside the container
volumes: # Volumes to mount to the container
- workspace_path: "" # Path to the volume in the workspace
mount_path: "" # Path to mount the volume inside the container
deploy_timeout_sec: 900 # Maximum time waiting for deployment to finish (Does not include the time to pull the image)
request_input_example: {} # Example of input request, will be shown in the UI
registry_specs: # Registry information for pulling the image
registry_name: ""
registry_provider: "DockerHub"
registry_user_name: ""
registry_user_password: ""
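The keys listed under "# Required" above must be present in any config derived from this template. As a quick local sanity check (not part of this commit; the file name and strictness of the check are illustrative assumptions), a short Python sketch can load such a config and verify those keys. Note that container_run_command may legitimately be null, so the sketch checks for presence rather than truthiness:

import yaml

# Keys marked "# Required" in the template above.
REQUIRED_KEYS = ["workspace", "enable_serverless_container",
                 "inference_image_name", "container_run_command", "port"]

with open("template.yaml") as f:  # hypothetical local copy of the template
    cfg = yaml.safe_load(f)

missing = [k for k in REQUIRED_KEYS if k not in cfg]
if missing:
    raise ValueError(f"deployment config is missing required keys: {missing}")
print(f"image {cfg['inference_image_name']!r} will be served on port {cfg['port']}")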
@@ -0,0 +1,17 @@
workspace: "./"

enable_serverless_container: true
inference_image_name: "fedml/llama3-8b-tensorrtllm"

# If you put the model repository in $workspace/model_repository, it will be mounted to /home/fedml/models_serving/model_repository
container_run_command: ["sh", "-c", "cd / && huggingface-cli login --token $your_hf_token && pip install sentencepiece protobuf && python3 tensorrtllm_backend/scripts/launch_triton_server.py --model_repo tensorrtllm_backend/all_models/inflight_batcher_llm --world_size 1 && tail -f /dev/null"]

readiness_probe:
httpGet:
path: "/v2/health/ready"

port: 8000

deploy_timeout_sec: 1600


@@ -0,0 +1,20 @@
workspace: "./"

enable_serverless_container: true
inference_image_name: "nvcr.io/nvidia/tritonserver:24.05-py3"

volumes:
- workspace_path: "./model_repository"
mount_path: "/repo_inside_container"

container_run_command: "tritonserver --model-repository=/repo_inside_container"

readiness_probe:
httpGet:
path: "/v2/health/ready"

port: 8000

deploy_timeout_sec: 1600

request_input_example: {"text_input": "Hello"}
@@ -0,0 +1,25 @@
import json
import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def initialize(self, args):
        # Triton passes model metadata (including the model name) in the args dict.
        self.model_name = args['model_name']

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        # Declare one string input and one string output so a hand-written config.pbtxt is not required.
        auto_complete_model_config.add_input({"name": "text_input", "data_type": "TYPE_STRING", "dims": [-1]})
        auto_complete_model_config.add_output({"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]})
        auto_complete_model_config.set_max_batch_size(0)
        return auto_complete_model_config

    def execute(self, requests):
        responses = []
        for request in requests:
            in_numpy = pb_utils.get_input_tensor_by_name(request, "text_input").as_numpy()
            assert np.object_ == in_numpy.dtype, \
                'in this demo, triton passes in a numpy array of size 1 with object_ dtype, ' \
                'this dtype encapsulates a python bytes-array'
            print('in this demo len(in_numpy) is 1:', len(in_numpy.tolist()))
            # Echo back "<model_name>: <input> World" for every byte string in the input tensor.
            out_numpy = np.array(
                [(self.model_name + ': ' + python_byte_array.decode('utf-8') + ' World').encode('utf-8')
                 for python_byte_array in in_numpy.tolist()],
                dtype=np.object_)
            out_pb = pb_utils.Tensor("text_output", out_numpy)
            responses.append(pb_utils.InferenceResponse(output_tensors=[out_pb]))
        return responses
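Once the Triton server from the config above is running (port 8000), this Python model can be called through Triton's standard v2 HTTP inference API. A minimal client sketch, assuming the model directory inside model_repository is named "demo" and the endpoint is reachable on localhost (both are placeholders, since this diff does not show the directory name):

import requests

url = "http://localhost:8000/v2/models/demo/infer"  # placeholder host and model name
payload = {
    "inputs": [
        {"name": "text_input", "shape": [1], "datatype": "BYTES", "data": ["Hello"]}
    ]
}
resp = requests.post(url, json=payload, timeout=30)
resp.raise_for_status()
# The model above echoes back "<model_name>: Hello World" in text_output.
print(resp.json()["outputs"][0]["data"])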
@@ -0,0 +1,22 @@
workspace: "./"

inference_image_name: "fedml/trt-llm-openai"

# The image has its own self-contained CMD, so there is no need to override the run command
container_run_command: null

port: 3000

readiness_probe:
httpGet:
path: "/health_check"

# If you do not use serverless container mode and want to expose a different resource path,
# e.g. localhost:3000/v1/chat/completions, you can set the following URI:
service:
httpPost:
path: "/v1/chat/completions"

deploy_timeout_sec: 1600

endpoint_api_type: "text2text_llm_openai_chat_completions"
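Since the config above exposes an OpenAI-compatible chat-completions route on port 3000, a client can post a standard chat request. A hedged sketch, assuming the endpoint is reachable on localhost and accepts the usual OpenAI request body (the model field is a placeholder that depends on what the image serves):

import requests

url = "http://localhost:3000/v1/chat/completions"  # path from service.httpPost.path above
payload = {
    "model": "default",  # placeholder model name
    "messages": [{"role": "user", "content": "Hello"}],
}
resp = requests.post(url, json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])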
16 changes: 12 additions & 4 deletions python/fedml/api/__init__.py
@@ -24,6 +24,8 @@
from fedml.computing.scheduler.scheduler_entry.cluster_manager import FedMLClusterModelList
from fedml.computing.scheduler.scheduler_entry.run_manager import FedMLRunStartedModel, FedMLGpuDevices, \
FedMLRunModelList, FeatureEntryPoint
from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants
from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants


def fedml_login(api_key: str = None):
@@ -209,16 +211,22 @@ def fedml_build(platform, type, source_folder, entry_point, config_folder, dest_
return build.build(platform, type, source_folder, entry_point, config_folder, dest_folder, ignore)


def login(api_key, computing, server, supplier):
device_bind(api_key, computing, server, supplier)
def login(api_key, computing, server, supplier,
master_inference_gateway_port: int = ServerConstants.MODEL_INFERENCE_DEFAULT_PORT,
worker_inference_proxy_port: int = ClientConstants.LOCAL_CLIENT_API_PORT,
worker_connection_type: str = ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT):
device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port,
worker_connection_type)


def logout(computing, server):
device_unbind(computing, server)


def device_bind(api_key, computing, server, supplier):
device.bind(api_key, computing, server, supplier)
def device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port,
worker_connection_type):
device.bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port,
worker_connection_type)


def device_unbind(computing, server):
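The extended signature lets callers override the inference gateway and proxy ports at bind time. A hedged usage sketch (flag values and ports below are illustrative, not repository defaults):

from fedml import api

# Bind this machine with custom inference ports; the three booleans follow the
# existing positional (computing, server, supplier) signature.
api.login(
    "your_api_key",
    True,    # computing
    False,   # server
    False,   # supplier
    master_inference_gateway_port=2203,   # illustrative port
    worker_inference_proxy_port=2204,     # illustrative port
    worker_connection_type="http",        # illustrative; default is WORKER_CONNECTIVITY_TYPE_DEFAULT
)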
15 changes: 12 additions & 3 deletions python/fedml/api/modules/device.py
@@ -10,14 +10,19 @@
from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants
from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils
from fedml.computing.scheduler.master.server_constants import ServerConstants
from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants as DeviceServerConstants
from fedml.computing.scheduler.master.server_login import logout as server_logout
from fedml.computing.scheduler.slave.client_constants import ClientConstants
from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants as DeviceClientConstants
from fedml.computing.scheduler.slave.client_login import logout as client_logout
from fedml.computing.scheduler.scheduler_entry.resource_manager import FedMLResourceManager


def bind(
api_key, computing, server, supplier
api_key, computing, server, supplier,
master_inference_gateway_port=DeviceServerConstants.MODEL_INFERENCE_DEFAULT_PORT,
worker_inference_proxy_port=DeviceClientConstants.LOCAL_CLIENT_API_PORT,
worker_connection_type=DeviceClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT
):
userid = api_key
runner_cmd = "{}"
@@ -43,13 +48,13 @@ def bind(
_bind(
userid, computing, server,
api_key, role, runner_cmd, device_id, os_name,
docker)
docker, master_inference_gateway_port, worker_inference_proxy_port, worker_connection_type)


def _bind(
userid, computing, server,
api_key, role, runner_cmd, device_id, os_name,
docker):
docker, master_inference_gateway_port, worker_inference_proxy_port, worker_connection_type):
fedml.load_env()
if os.getenv(ModuleConstants.ENV_FEDML_INFER_HOST) is None:
fedml.set_env_kv(ModuleConstants.ENV_FEDML_INFER_HOST, SchedulerConstants.REDIS_INFER_HOST)
@@ -60,6 +65,10 @@ def _bind(
if os.getenv(ModuleConstants.ENV_FEDML_INFER_REDIS_PASSWORD) is None:
fedml.set_env_kv(ModuleConstants.ENV_FEDML_INFER_REDIS_PASSWORD, SchedulerConstants.REDIS_PASSWORD)

fedml.set_env_kv(DeviceServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, str(master_inference_gateway_port))
fedml.set_env_kv(DeviceClientConstants.ENV_CLIENT_PROXY_PORT_KEY, str(worker_inference_proxy_port))
fedml.set_env_kv(DeviceClientConstants.ENV_CONNECTION_TYPE_KEY, worker_connection_type)

url = fedml._get_backend_service()
platform_name = platform.system()
docker_config_text = None
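The three set_env_kv calls above persist the gateway port, proxy port, and connection type so other scheduler components can pick them up later. A hedged read-back sketch (illustrative only, not code from this commit; it assumes fedml.load_env() reloads the persisted key/value store, mirroring its use in _bind):

import os

import fedml
from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants as DeviceServerConstants
from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants as DeviceClientConstants

fedml.load_env()
gateway_port = int(os.getenv(DeviceServerConstants.ENV_MASTER_INFERENCE_PORT_KEY,
                             DeviceServerConstants.MODEL_INFERENCE_DEFAULT_PORT))
proxy_port = int(os.getenv(DeviceClientConstants.ENV_CLIENT_PROXY_PORT_KEY,
                           DeviceClientConstants.LOCAL_CLIENT_API_PORT))
connection_type = os.getenv(DeviceClientConstants.ENV_CONNECTION_TYPE_KEY,
                            DeviceClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT)
print(gateway_port, proxy_port, connection_type)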
