From b016e5d232d1213b5f1bdc32f47212710b667b43 Mon Sep 17 00:00:00 2001
From: Qing Lan
Date: Fri, 9 Jun 2023 14:23:58 -0700
Subject: [PATCH] update gpu memory consumption (#818)

Add GPT-J and GPT-NeoX models to tests
---
 .github/workflows/llm_integration.yml            | 15 ++++++++++++---
 tests/integration/llm/client.py                  | 11 ++++++++---
 .../integration/llm/fastertransformer-model.py   | 18 +++++++++++++-----
 tests/integration/llm/prepare.py                 | 15 ++++++++++++---
 4 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/llm_integration.yml b/.github/workflows/llm_integration.yml
index bcdf4b960cd..fa67fa503ff 100644
--- a/.github/workflows/llm_integration.yml
+++ b/.github/workflows/llm_integration.yml
@@ -358,14 +358,14 @@ jobs:
           serve -m test=file:/opt/ml/model/test/
           python3 llm/client.py fastertransformer_raw bigscience/bloom-3b
           docker rm -f $(docker ps -aq)
-      - name: Test flan-t5-xxl
+      - name: Test nomic-ai/gpt4all-j
         working-directory: tests/integration
         run: |
           rm -rf models
-          python3 llm/prepare.py fastertransformer_raw flan-t5-xxl
+          python3 llm/prepare.py fastertransformer_raw nomic-ai/gpt4all-j
           ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
           serve -m test=file:/opt/ml/model/test/
-          python3 llm/client.py fastertransformer_raw flan-t5-xxl
+          python3 llm/client.py fastertransformer_raw nomic-ai/gpt4all-j
           docker rm -f $(docker ps -aq)
       - name: On fail step
         if: ${{ failure() }}
@@ -422,6 +422,15 @@ jobs:
           serve
           python3 llm/client.py fastertransformer bigscience/bloom-3b
           docker rm -f $(docker ps -aq)
+      - name: Test EleutherAI/pythia-2.8b
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py fastertransformer EleutherAI/pythia-2.8b
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
+          serve -m test=file:/opt/ml/model/test/
+          python3 llm/client.py fastertransformer EleutherAI/pythia-2.8b
+          docker rm -f $(docker ps -aq)
       - name: On fail step
         if: ${{ failure() }}
         working-directory: tests/integration
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 7f4bebb2c30..284dd1cb4ca 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -168,6 +168,11 @@ def get_model_name():
         "batch_size": [1, 2],
         "seq_length": [64, 128],
         "max_memory_per_gpu": [15.0, 15.0, 15.0, 15.0]
+    },
+    "EleutherAI/pythia-2.8b": {
+        "batch_size": [1, 2],
+        "seq_length": [64, 128],
+        "max_memory_per_gpu": [6.0, 6.0, 6.0, 6.0]
     }
 }
 
@@ -178,7 +183,7 @@ def get_model_name():
     },
     "gpt2-xl": {
         "batch_size": [1, 2],
-        "max_memory_per_gpu": 7.0
+        "max_memory_per_gpu": 8.0
     },
     "facebook/opt-6.7b": {
         "batch_size": [1, 2],
@@ -188,9 +193,9 @@ def get_model_name():
         "batch_size": [1, 2],
         "max_memory_per_gpu": 6.0
     },
-    "flan-t5-xxl": {
+    "nomic-ai/gpt4all-j": {
         "batch_size": [1, 2],
-        "max_memory_per_gpu": 15.0
+        "max_memory_per_gpu": 6.0
     }
 }
 
diff --git a/tests/integration/llm/fastertransformer-model.py b/tests/integration/llm/fastertransformer-model.py
index f6b5a9151fb..36d7681e7ff 100644
--- a/tests/integration/llm/fastertransformer-model.py
+++ b/tests/integration/llm/fastertransformer-model.py
@@ -2,15 +2,20 @@
 import fastertransformer
 
 model = None
+use_triton = False
 
 
 def load_model(properties):
     tensor_parallel_degree = properties["tensor_parallel_degree"]
     pipeline_parallel_degree = 1  # TODO: add tests for pp_degree > 1
     model_id = properties.get('model_id') or properties.get('model_dir')
+    use_triton = properties.get("use_triton", False)
     dtype = properties.get("dtype", "fp32")
-    return fastertransformer.init_inference(model_id, tensor_parallel_degree,
-                                            pipeline_parallel_degree, dtype)
+    return fastertransformer.init_inference(model_id,
+                                            tensor_parallel_degree,
+                                            pipeline_parallel_degree,
+                                            dtype,
+                                            use_triton=use_triton), use_triton
 
 
 def partition(inputs: Input):
@@ -27,10 +32,10 @@ def partition(inputs: Input):
 
 
 def handle(inputs: Input):
-    global model
+    global model, use_triton
 
     if not model:
-        model = load_model(inputs.get_properties())
+        model, use_triton = load_model(inputs.get_properties())
 
     if inputs.is_empty():
         # Model server makes an empty call to warmup the model on startup
@@ -38,6 +43,9 @@ def handle(inputs: Input):
 
     input_json = inputs.get_as_json()
     input_data = input_json.pop("inputs")
-    result = model.pipeline_generate(input_data)
+    if not use_triton:
+        result = model.pipeline_generate(input_data)
+    else:
+        result = model.pipeline_generate(input_data, [64] * len(input_data))
 
     return Output().add(result)
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index ab0e6cb2633..165b9c929fb 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -180,6 +180,12 @@
         "option.model_id": "s3://djl-llm/flan-t5-xxl/",
         "option.tensor_parallel_degree": 4,
         "option.dtype": "fp32"
+    },
+    "EleutherAI/pythia-2.8b": {
+        "option.model_id": "s3://djl-llm/pythia-2.8b/",
+        "option.tensor_parallel_degree": 2,
+        "option.dtype": "fp16",
+        "gpu.maxWorkers": 1
     }
 }
 
@@ -203,10 +209,11 @@
         "option.dtype": "fp16",
         "gpu.maxWorkers": 1,
     },
-    "flan-t5-xxl": {
-        "option.model_id": "s3://djl-llm/flan-t5-xxl/",
+    "nomic-ai/gpt4all-j": {
+        "option.model_id": "s3://djl-llm/gpt4all-j/",
         "option.tensor_parallel_degree": 4,
-        "option.dtype": "fp32"
+        "option.dtype": "fp32",
+        "option.use_triton": True
     }
 }
 
@@ -409,6 +416,8 @@ def build_ft_raw_model(model):
     )
     options = ft_model_list[model]
     options["engine"] = "FasterTransformer"
+    if "option.use_triton" in options and options["option.use_triton"]:
+        options["engine"] = "Python"
     write_properties(options)
     shutil.copyfile("llm/fastertransformer-model.py", "models/test/model.py")
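
A few notes on the change set. In prepare.py, setting "option.use_triton" makes build_ft_raw_model switch the engine from FasterTransformer to Python, so the bundled fastertransformer-model.py handler drives inference for the gpt4all-j case. Below is a minimal sketch of what the prepared serving.properties could end up containing, assuming write_properties emits plain key=value lines (that format is an assumption, not shown in this patch):

    import os

    # Options as declared for nomic-ai/gpt4all-j in prepare.py, plus the
    # engine-selection logic added to build_ft_raw_model in this patch.
    options = {
        "option.model_id": "s3://djl-llm/gpt4all-j/",
        "option.tensor_parallel_degree": 4,
        "option.dtype": "fp32",
        "option.use_triton": True,
    }
    options["engine"] = "FasterTransformer"
    # Triton-backed models are routed through the Python engine instead.
    if "option.use_triton" in options and options["option.use_triton"]:
        options["engine"] = "Python"

    # Hypothetical serialization: one key=value pair per line.
    os.makedirs("models/test", exist_ok=True)
    with open("models/test/serving.properties", "w") as f:
        for key, value in options.items():
            f.write(f"{key}={value}\n")

Note that under this scheme Python's True serializes as the string "True", which is what makes the string-versus-boolean caveat below relevant.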
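
Relatedly, a caveat in the handler change: if use_triton reaches load_model as a string rather than a boolean (values read back from serving.properties typically are strings), properties.get("use_triton", False) yields a truthy value for both "true" and "false". A defensive coercion, as a sketch; the parse_bool helper is ours, not part of the patch or the djl_python API:

    def parse_bool(value, default=False):
        """Coerce a properties value to bool.

        Values round-tripped through serving.properties are usually
        strings, so a literal "false" would otherwise be truthy.
        """
        if isinstance(value, bool):
            return value
        if value is None:
            return default
        return str(value).strip().lower() in ("true", "1", "yes")

    # Inside load_model this would replace the direct .get() call:
    # use_triton = parse_bool(properties.get("use_triton"), default=False)

The extra argument in the triton branch of handle, [64] * len(input_data), appears to supply one output length per batched input, though the patch itself does not document that parameter.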
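
On the client.py expectations: max_memory_per_gpu holds one limit per GPU the test inspects (hence the four 6.0 GB entries for EleutherAI/pythia-2.8b, which runs with tensor_parallel_degree 2), and the gpt2-xl limit moves from 7.0 to 8.0 GB, presumably to match observed consumption per the commit title. A sketch of the kind of per-GPU assertion the client presumably applies; the function name and the sample readings are illustrative, not taken from client.py:

    def check_memory_per_gpu(used_gb, limits_gb):
        """Assert that each GPU's observed memory stays under its limit."""
        for i, (used, limit) in enumerate(zip(used_gb, limits_gb)):
            assert used <= limit, (
                f"GPU {i}: {used:.1f} GB used exceeds {limit:.1f} GB limit")

    # e.g. pythia-2.8b with tensor_parallel_degree 2 checks two GPUs
    # against the 6.0 GB entries (the readings here are made up):
    check_memory_per_gpu([5.1, 5.3], [6.0, 6.0, 6.0, 6.0][:2])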