Add llama2 text completion example with streaming response support
Naman Nandan committed Jul 27, 2023
1 parent 71e1c2e commit b9f2654
Showing 8 changed files with 136 additions and 189 deletions.
167 changes: 0 additions & 167 deletions examples/large_models/inferentia2/llama/inf2_handler.py

This file was deleted.

5 changes: 0 additions & 5 deletions examples/large_models/inferentia2/llama/requirements.txt

This file was deleted.

1 change: 0 additions & 1 deletion examples/large_models/inferentia2/llama/sample_text.txt

This file was deleted.

@@ -1,12 +1,12 @@
# Large model inference on Inferentia2

This document briefs on serving large HuggingFace (HF) models on [AWS Inferentia2](https://aws.amazon.com/ec2/instance-types/inf2/) instances.
This document describes how to serve the [Llama 2](https://huggingface.co/meta-llama) model on [AWS Inferentia2](https://aws.amazon.com/ec2/instance-types/inf2/) with streaming response support.

Inferentia2 uses [Neuron SDK](https://aws.amazon.com/machine-learning/neuron/) which is build on top of PyTorch XLA stack. For large model inference [`transformers-neuronx`](https://github.com/aws-neuron/transformers-neuronx) package is used that takes care of model partitioning and running inference.
Inferentia2 uses the [Neuron SDK](https://aws.amazon.com/machine-learning/neuron/), which is built on top of the PyTorch XLA stack. For large model inference, the [`transformers-neuronx`](https://github.com/aws-neuron/transformers-neuronx) package is used, which takes care of model partitioning and running inference.

Let's take a look at the steps to prepare our model for inference on Inf2 instances.

**Note** To run the model on an Inf2 instance, the model gets compiled as a preprocessing step. As part of the compilation process, to generate the model graph, a specific batch size is used. Following this, when running inference, we need to pass the same batch size that was used during compilation. This example uses batch size of 2 but make sure to change it and register the model according to your batch size.
**Note** To run the model on an Inf2 instance, the model is compiled as a preprocessing step. The compilation generates the model graph for a specific batch size, so the same batch size must be used when running inference. This example uses a batch size of 1 to demonstrate real-time inference with a streaming response.
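
As a rough sketch of where this batch size gets fixed (the paths here are placeholders; `inf2_handler.py` below makes the equivalent calls, taking `tp_degree` from `model-config.yaml`):

```python
from transformers_neuronx.llama.model import LlamaForSampling

# The graph is compiled for this batch size; inference requests must use the same one.
model = LlamaForSampling.from_pretrained(
    "./llama-2-7b-split",  # split checkpoints produced in Step 2
    batch_size=1,
    tp_degree=2,
)
model.to_neuron()  # compilation happens here
```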

### Step 1: Inf2 instance

@@ -30,7 +30,7 @@ source /opt/aws_neuron_venv_pytorch/bin/activate
python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com

# Update Neuron Compiler and Framework
python -m pip install --upgrade neuronx-cc==2.* torch-neuronx torchvision
python -m pip install --upgrade neuronx-cc==2.* torch-neuronx

pip install git+https://github.com/aws-neuron/transformers-neuronx.git transformers -U

@@ -40,38 +40,42 @@ pip install git+https://github.com/aws-neuron/transformers-neuronx.git transform

### Step 2: Save the model split checkpoints compatible with `transformers-neuronx`

Navigate to `large_model/inferentia2/llama` directory.
Navigate to the `large_models/inferentia2/llama2` directory.

```bash
python ../util/inf2_save_split_checkpoints.py --model_name decapoda-research/llama-7b-hf --save_path './decapoda_llama_7b_split'
python ../util/inf2_save_split_checkpoints.py --model_name meta-llama/Llama-2-7b-hf --save_path './llama-2-7b-split'

```
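
As a rough sketch of what checkpoint splitting involves with `transformers-neuronx` (this is an assumption for illustration; the actual `inf2_save_split_checkpoints.py` script may differ):

```python
from transformers import LlamaForCausalLM
from transformers_neuronx.module import save_pretrained_split

# Download the HF checkpoint and re-save it in the split layout
# that transformers-neuronx expects when loading on Inferentia2.
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", low_cpu_mem_usage=True)
save_pretrained_split(model, "./llama-2-7b-split")
```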


### Step 3: Generate Tar/MAR file

```bash
torch-model-archiver --model-name decapoda_llama_7b --version 1.0 --handler inf2_handler.py --extra-files ./decapoda_llama_7b_split -r requirements.txt --config-file model-config.yaml --archive-format no-archive
torch-model-archiver --model-name llama-2-7b --version 1.0 --handler inf2_handler.py --extra-files ./llama-2-7b-split -r requirements.txt --config-file model-config.yaml --archive-format no-archive

```
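
Note that `--archive-format no-archive` leaves the archive as an unpacked directory named `llama-2-7b` rather than a single `.mar` file, which is why the next step moves a directory into the model store.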

### Step 4: Add the MAR file to the model store

```bash
mkdir model_store
mv decapoda_llama_7b model_store
mv llama-2-7b model_store
```

### Step 5: Start torchserve

Update config.properties (a minimal sketch is shown below) and start torchserve
```bash
torchserve --ncs --start --model-store model_store
```
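
A minimal `config.properties` might look like the following (these keys are standard TorchServe settings; `install_py_dep_per_model=true` is only needed if you want TorchServe to install the archive's `requirements.txt`):

```properties
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
install_py_dep_per_model=true
```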

### Step 6: Register model

```bash
torchserve --ncs --start --model-store model_store --models decapoda_llama_7b
curl -X POST "http://localhost:8081/models?url=llama-2-7b"
```
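
To confirm the model is registered and a worker has spun up, you can query the management API:

```bash
curl "http://localhost:8081/models/llama-2-7b"
```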

### Step 6: Run inference
### Step 7: Run inference

```bash
curl -v "http://localhost:8080/predictions/decapoda_llama_7b" -T sample_text.txt
python test_stream_response.py
```
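
If you prefer `curl`, a streaming request along the lines of the previous version of this example should also work when output buffering is disabled with `-N` (the prompt file is just an illustration):

```bash
echo "Today the weather is really nice and I am planning on " > sample_text.txt
curl -N "http://localhost:8080/predictions/llama-2-7b" -T sample_text.txt
```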
100 changes: 100 additions & 0 deletions examples/large_models/inferentia2/llama2/inf2_handler.py
@@ -0,0 +1,100 @@
import logging
import os
from abc import ABC
from threading import Thread

import torch_neuronx
from transformers import AutoConfig, LlamaTokenizer, TextIteratorStreamer
from transformers_neuronx.generation_utils import HuggingFaceGenerationModelAdapter
from transformers_neuronx.llama.model import LlamaForSampling

from ts.protocol.otf_message_handler import send_intermediate_predict_response
from ts.torch_handler.base_handler import BaseHandler

logger = logging.getLogger(__name__)


class LLMHandler(BaseHandler, ABC):
    """
    Transformers handler class for text completion streaming on Inferentia2
    """

    def __init__(self):
        super(LLMHandler, self).__init__()
        self.initialized = False

    def initialize(self, ctx):
        self.manifest = ctx.manifest
        properties = ctx.system_properties
        model_dir = properties.get("model_dir")

        # settings for model compilation and loading
        model_name = ctx.model_yaml_config["handler"]["model_name"]
        tp_degree = ctx.model_yaml_config["handler"]["tp_degree"]
        self.max_length = ctx.model_yaml_config["handler"]["max_length"]

        # allocate "tp_degree" number of neuron cores to the worker process
        os.environ["NEURON_RT_NUM_CORES"] = str(tp_degree)
        try:
            num_neuron_cores_available = (
                torch_neuronx.xla_impl.data_parallel.device_count()
            )
            assert num_neuron_cores_available >= int(tp_degree)
        except (RuntimeError, AssertionError) as error:
            raise RuntimeError(
                "Required number of neuron cores for tp_degree "
                + str(tp_degree)
                + " are not available: "
                + str(error)
            )

        os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer-inference"

        self.tokenizer = LlamaTokenizer.from_pretrained(model_name)
        self.model = LlamaForSampling.from_pretrained(
            model_dir, batch_size=1, tp_degree=tp_degree
        )
        logger.info("Starting to compile the model")
        self.model.to_neuron()
        logger.info("Model has been successfully compiled")
        model_config = AutoConfig.from_pretrained(model_dir)
        # wrap the compiled model so that the HuggingFace generate() API can drive it
        self.model = HuggingFaceGenerationModelAdapter(model_config, self.model)
        # streamer that yields decoded text chunks as generation produces tokens
        self.output_streamer = TextIteratorStreamer(self.tokenizer)

        self.initialized = True

    def preprocess(self, requests):
        input_texts = []
        for req in requests:
            data = req.get("data") or req.get("body")
            if isinstance(data, (bytes, bytearray)):
                data = data.decode("utf-8")
            input_texts.append(data)

        return self.tokenizer(input_texts, return_tensors="pt")

    def inference(self, tokenized_input):
        generation_kwargs = dict(
            tokenized_input,
            streamer=self.output_streamer,
            max_new_tokens=self.max_length,
        )
        self.model.reset_generation()
        # run generation in a background thread so this thread can consume the streamer
        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
        thread.start()

        # forward each decoded chunk to the client as an intermediate response
        for new_text in self.output_streamer:
            send_intermediate_predict_response(
                [new_text],
                self.context.request_ids,
                "Intermediate Prediction success",
                200,
                self.context,
            )

        thread.join()

        # the full text has already been streamed, so the final response is empty
        return [""]

    def postprocess(self, inference_output):
        return inference_output
@@ -5,8 +5,5 @@ responseTimeout: 600

handler:
    max_length: 50
    manual_seed: 40
    batch_size: 2
    tp_degree: 2
    amp: f16
    model_name: decapoda-research/llama-7b-hf
    model_name: meta-llama/Llama-2-7b-hf
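
Here `tp_degree` controls how many NeuronCores the model is sharded across (the handler exports it as `NEURON_RT_NUM_CORES`), and `max_length` is passed to `generate` as `max_new_tokens`, i.e. the number of new tokens produced per request.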
5 changes: 5 additions & 0 deletions examples/large_models/inferentia2/llama2/requirements.txt
@@ -0,0 +1,5 @@
torch-neuronx
transformers-neuronx
transformers
tokenizers
sentencepiece
14 changes: 14 additions & 0 deletions examples/large_models/inferentia2/llama2/test_stream_response.py
@@ -0,0 +1,14 @@
import requests

response = requests.post(
"http://localhost:8080/predictions/llama-2-7b",
data="Today the weather is really nice and I am planning on ",
stream=True,
)

for chunk in response.iter_content(chunk_size=None):
    if chunk:
        data = chunk.decode("utf-8")
        print(data, end="", flush=True)

print("")
