Commit
Merge branch 'dev/v0.7.0' into alexleung/dev_v070_for_refactor
Showing 37 changed files with 987 additions and 895 deletions.
13 changes: 0 additions & 13 deletions
python/examples/deploy/custom_inference_image/custom_inference_image.yaml
This file was deleted.
16 changes: 0 additions & 16 deletions
python/examples/deploy/custom_inference_image/serve_main.py
This file was deleted.
22 changes: 22 additions & 0 deletions
python/examples/deploy/custom_inference_image/template.yaml
@@ -0,0 +1,22 @@
# Required
workspace: "./"                    # We will package all the files in the workspace directory
enable_serverless_container: true  # Whether to use a serverless container
inference_image_name: ""           # Container image name
container_run_command: ""          # String or list, similar to CMD in a Dockerfile
port: 80                           # Service port; currently you can only indicate one arbitrary port

# Optional, these are the default values
readiness_probe:                   # Probe for checking whether a container is ready for inference
  httpGet:
    path: ""
environment_variables: {}          # Environment variables inside the container
volumes:                           # Volumes to mount into the container
  - workspace_path: ""             # Path to the volume in the workspace
    mount_path: ""                 # Path to mount the volume inside the container
deploy_timeout_sec: 900            # Maximum time to wait for deployment to finish (does not include the time to pull the image)
request_input_example: {}          # Example of an input request, shown in the UI
registry_specs:                    # Registry information for pulling the image
  registry_name: ""
  registry_provider: "DockerHub"
  registry_user_name: ""
  registry_user_password: ""
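For intuition, the readiness_probe fields describe a simple contract: the platform polls the httpGet path on the service port until the container answers HTTP 200, and only then routes traffic to it. The polling loop below is a minimal sketch of that contract, not FedML's actual implementation; the host, interval, and status handling are illustrative assumptions.

import time
import requests

def wait_until_ready(host="localhost", port=80, path="/", timeout_sec=900):
    """Poll http://host:port/path until it returns 200 or the timeout expires."""
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        try:
            if requests.get(f"http://{host}:{port}{path}", timeout=5).status_code == 200:
                return True
        except requests.RequestException:
            pass  # container not up yet; keep polling
        time.sleep(5)
    return False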
17 changes: 17 additions & 0 deletions
python/examples/deploy/custom_inference_image/tensorrt_llm/tensorrtllm.yaml
@@ -0,0 +1,17 @@
workspace: "./"

enable_serverless_container: true
inference_image_name: "fedml/llama3-8b-tensorrtllm"

# If you put the model repository in $workspace/model_repository, it will be mounted to /home/fedml/models_serving/model_repository
container_run_command: ["sh", "-c", "cd / && huggingface-cli login --token $your_hf_token && pip install sentencepiece protobuf && python3 tensorrtllm_backend/scripts/launch_triton_server.py --model_repo tensorrtllm_backend/all_models/inflight_batcher_llm --world_size 1 && tail -f /dev/null"]

readiness_probe:
  httpGet:
    path: "/v2/health/ready"

port: 8000

deploy_timeout_sec: 1600
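Once the probe at /v2/health/ready succeeds, the Triton server can be queried directly through its generate endpoint. A client sketch follows; the model name "ensemble" assumes the usual layout of tensorrtllm_backend's all_models/inflight_batcher_llm repository, so verify it against your own model repository before relying on it.

import requests

resp = requests.post(
    "http://localhost:8000/v2/models/ensemble/generate",
    json={"text_input": "What is machine learning?", "max_tokens": 64},
)
resp.raise_for_status()
print(resp.json()["text_output"])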
20 changes: 20 additions & 0 deletions
...eploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml
@@ -0,0 +1,20 @@
workspace: "./"

enable_serverless_container: true
inference_image_name: "nvcr.io/nvidia/tritonserver:24.05-py3"

volumes:
  - workspace_path: "./model_repository"
    mount_path: "/repo_inside_container"

container_run_command: "tritonserver --model-repository=/repo_inside_container"

readiness_probe:
  httpGet:
    path: "/v2/health/ready"

port: 8000

deploy_timeout_sec: 1600

request_input_example: {"text_input": "Hello"}
25 changes: 25 additions & 0 deletions
...custom_inference_image/triton_inference_server/template/model_repository/dummy/1/model.py
@@ -0,0 +1,25 @@
import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def initialize(self, args):
        self.model_name = args['model_name']

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        # Declare a single variable-length string input/output pair and disable batching.
        auto_complete_model_config.add_input({"name": "text_input", "data_type": "TYPE_STRING", "dims": [-1]})
        auto_complete_model_config.add_output({"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]})
        auto_complete_model_config.set_max_batch_size(0)
        return auto_complete_model_config

    def execute(self, requests):
        responses = []
        for request in requests:
            # In this demo, Triton passes in a numpy array of size 1 with object_ dtype;
            # that dtype encapsulates a Python bytes array.
            in_numpy = pb_utils.get_input_tensor_by_name(request, "text_input").as_numpy()
            assert np.object_ == in_numpy.dtype
            print('in this demo len(in_numpy) is 1:', len(in_numpy.tolist()))
            out_numpy = np.array(
                [(self.model_name + ': ' + python_byte_array.decode('utf-8') + ' World').encode('utf-8')
                 for python_byte_array in in_numpy.tolist()],
                dtype=np.object_)
            out_pb = pb_utils.Tensor("text_output", out_numpy)
            responses.append(pb_utils.InferenceResponse(output_tensors=[out_pb]))
        return responses
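A minimal client sketch for the dummy model above, using Triton's standard KServe v2 HTTP inference API on the port declared in the YAML. It assumes a local deployment and that the directory name model_repository/dummy gives the model its name.

import requests

payload = {
    "inputs": [
        {"name": "text_input", "shape": [1], "datatype": "BYTES", "data": ["Hello"]}
    ]
}
resp = requests.post("http://localhost:8000/v2/models/dummy/infer", json=payload)
resp.raise_for_status()
# Expected response data: ["dummy: Hello World"]
print(resp.json()["outputs"][0]["data"])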
22 changes: 22 additions & 0 deletions
python/examples/deploy/custom_inference_image/trt-llm-openai/config.yaml
@@ -0,0 +1,22 @@
workspace: "./"

inference_image_name: "fedml/trt-llm-openai"

# The image has a self-contained CMD; there is no need to override the run command
container_run_command: null

port: 3000

readiness_probe:
  httpGet:
    path: "/health_check"

# If you do not use serverless container mode and want to indicate another resource path,
# e.g. localhost:3000/v1/chat/completions, you can set the following URI:
service:
  httpPost:
    path: "/v1/chat/completions"

deploy_timeout_sec: 1600

endpoint_api_type: "text2text_llm_openai_chat_completions"
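Because the service path follows the OpenAI chat completions convention, any plain HTTP client can exercise the endpoint. A sketch assuming a local deployment on port 3000; the "model" field is a placeholder whose accepted values depend on the image itself.

import requests

resp = requests.post(
    "http://localhost:3000/v1/chat/completions",
    json={
        "model": "local",
        "messages": [{"role": "user", "content": "Hello"}],
    },
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])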