diff --git a/.github/workflows/benchmark_nightly_cpu.yml b/.github/workflows/benchmark_nightly.yml similarity index 60% rename from .github/workflows/benchmark_nightly_cpu.yml rename to .github/workflows/benchmark_nightly.yml index ac8487e416..a88128a091 100644 --- a/.github/workflows/benchmark_nightly_cpu.yml +++ b/.github/workflows/benchmark_nightly.yml @@ -1,4 +1,4 @@ -name: Benchmark torchserve cpu nightly +name: Benchmark torchserve nightly on: # run every day at 2:15am @@ -7,7 +7,13 @@ on: jobs: nightly: - runs-on: [self-hosted, cpu] + strategy: + fail-fast: false + matrix: + hardware: [cpu, gpu, inf1] + runs-on: + - self-hosted + - ${{ matrix.hardware }} timeout-minutes: 1320 steps: - name: Clean up previous run @@ -32,16 +38,26 @@ jobs: uses: actions/checkout@v3 - name: Install dependencies run: | - sudo apt-get update -y - sudo apt-get install -y apache2-utils - pip install -r benchmarks/requirements-ab.txt - export omp_num_threads=1 + sudo apt-get update -y + sudo apt-get install -y apache2-utils + pip install -r benchmarks/requirements-ab.txt - name: Benchmark cpu nightly + if: ${{ matrix.hardware == 'cpu' }} + env: + OMP_NUM_THREADS: 1 run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_cpu.yaml --skip false + - name: Benchmark gpu nightly + if: ${{ matrix.hardware == 'gpu' }} + run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_gpu.yaml --skip false + - name: Benchmark inf1 nightly + if: ${{ matrix.hardware == 'inf1' }} + env: + NEURON_RT_NUM_CORES: 4 + run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_neuron.yaml --skip false - name: Save benchmark artifacts uses: actions/upload-artifact@v2 with: - name: nightly cpu artifact + name: nightly ${{ matrix.hardware }} artifact path: /tmp/ts_benchmark - name: Download benchmark artifacts for auto validation uses: dawidd6/action-download-artifact@v2 @@ -50,19 +66,19 @@ jobs: workflow_conclusion: success if_no_artifact_found: ignore path: /tmp/ts_artifacts - name: cpu_benchmark_validation + name: ${{ matrix.hardware }}_benchmark_validation - name: Update benchmark artifacts for auto validation - run: python benchmarks/utils/update_artifacts.py --output /tmp/ts_artifacts/cpu_benchmark_validation + run: python benchmarks/utils/update_artifacts.py --output /tmp/ts_artifacts/${{ matrix.hardware }}_benchmark_validation - name: Upload the updated benchmark artifacts for auto validation uses: actions/upload-artifact@v2 with: - name: cpu_benchmark_validation + name: ${{ matrix.hardware }}_benchmark_validation path: /tmp/ts_artifacts - name: Open issue on failure - if: ${{ failure() && github.event_name == 'schedule' }} + if: ${{ failure() && github.event_name == 'schedule' && matrix.hardware == 'cpu' }} uses: dacbd/create-issue-action@v1 with: token: ${{ secrets.GITHUB_TOKEN }} - title: Nightly CPU benchmark failed + title: Nightly ${{ matrix.hardware }} benchmark failed body: Commit ${{ github.sha }} daily scheduled [CI run](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) failed, please check why assignees: '' diff --git a/.github/workflows/benchmark_nightly_gpu.yml b/.github/workflows/benchmark_nightly_gpu.yml deleted file mode 100644 index edf7de86b4..0000000000 --- a/.github/workflows/benchmark_nightly_gpu.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: Benchmark torchserve gpu nightly - - -on: - # run every day at 2:15am - schedule: - - cron: '15 02 * * *' - -jobs: - nightly: - runs-on: [self-hosted, gpu] - timeout-minutes: 1320 - 
steps: - - name: Clean up previous run - run: | - echo "Cleaning up previous run" - cd $RUNNER_WORKSPACE - pwd - cd .. - pwd - rm -rf _tool - - name: Setup Python 3.8 - uses: actions/setup-python@v4 - with: - python-version: 3.8 - architecture: x64 - - name: Setup Java 17 - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '17' - - name: Checkout TorchServe - uses: actions/checkout@v3 - - name: Install dependencies - run: | - sudo apt-get update -y - sudo apt-get install -y apache2-utils - pip install -r benchmarks/requirements-ab.txt - - name: Benchmark gpu nightly - run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_gpu.yaml --skip false - - name: Save benchmark artifacts - uses: actions/upload-artifact@v2 - with: - name: nightly gpu artifact - path: /tmp/ts_benchmark - - name: Download benchmark artifacts for auto validation - uses: dawidd6/action-download-artifact@v2 - with: - workflow: ${{ github.event.workflow_run.workflow_id }} - workflow_conclusion: success - if_no_artifact_found: ignore - path: /tmp/ts_artifacts - name: gpu_benchmark_validation - - name: Update benchmark artifacts for auto validation - run: python benchmarks/utils/update_artifacts.py --output /tmp/ts_artifacts/gpu_benchmark_validation - - name: Upload the updated benchmark artifacts for auto validation - uses: actions/upload-artifact@v2 - with: - name: gpu_benchmark_validation - path: /tmp/ts_artifacts diff --git a/benchmarks/README.md b/benchmarks/README.md index b935844de6..6feb3d7850 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -127,7 +127,7 @@ python benchmark-ab.py --url https://torchserve.pytorch.org/mar_files/mnist.mar * TORCHSERVE SERVING PREDICTIONS WITH DOCKER ``` -python benchmark-ab.py --url https://torchserve.pytorch.org/mar_files/mnist.mar --content_type application/png --config_properties config.properties --inference_model_url predictions/benchmark --input ../examples/image_classifier/mnist/test_data/0.png --exec_env docker +python benchmark-ab.py --url https://torchserve.pytorch.org/mar_files/mnist.mar --content_type application/png --config_properties config.properties --inference_model_url predictions/benchmark --input ../examples/image_classifier/mnist/test_data/0.png --exec_env docker ``` ### Test plans @@ -136,7 +136,7 @@ Benchmark supports pre-defined, pre-configured params that can be selected based 2. vgg11_1000r_10c: vgg11 model with requests =1000 and concurrency=10 3. vgg11_10000r_100c: vgg11 model with requests =10000 and concurrency=100 4. resnet152_batch: Resnet-152 model with batch size = 4, requests =1000 and concurrency=10 -5. resnet152_batch_docker: Resnet-152 model with batch size = 4, requests =1000, concurrency=10 and execution env = docker +5. resnet152_batch_docker: Resnet-152 model with batch size = 4, requests =1000, concurrency=10 and execution env = docker Note: These pre-defined parameters in test plan can be overwritten by cmd line args. @@ -209,11 +209,11 @@ python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_template ``` ## Github Actions benchmarking -If you need to run your benchmarks on a specific cloud or hardware infrastructure. We highly recommend you fork this repo and leverage the benchmarks in `.github/workflows/benchmark-nightly_cpu*.yml` which will run the benchmarks on a custom instance of your choice and save the results as a github artifact. 
To learn more about how to create your own custom runner by following instructions from Github here https://docs.github.com/en/actions/hosting-your-own-runners/adding-self-hosted-runners +If you need to run your benchmarks on a specific cloud or hardware infrastructure. We highly recommend you fork this repo and leverage the benchmarks in `.github/workflows/benchmark_nightly.yml` which will run the benchmarks on a custom instance of your choice and save the results as a github artifact. To learn more about how to create your own custom runner by following instructions from Github here https://docs.github.com/en/actions/hosting-your-own-runners/adding-self-hosted-runners The high level approach 1. Create a cloud instance in your favorite cloud provider 2. Configure it so it can talk to github actions by running some shell commands listed here https://docs.github.com/en/actions/hosting-your-own-runners/adding-self-hosted-runners 3. Tag your instances in the runners tab on Github 3. In the `.yml` make sure to use `runs-on [self-hosted, your_tag]` -4. Inspect the results in https://github.com/pytorch/serve/actions and download the artifacts for further analysis \ No newline at end of file +4. Inspect the results in https://github.com/pytorch/serve/actions and download the artifacts for further analysis diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index 7918cfdd0f..b728805909 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -95,9 +95,9 @@ def load_config(self): report_cmd = v self.bm_config["model_config_path"] = ( - "{}/cpu".format(MODEL_JSON_CONFIG_PATH) - if self.bm_config["hardware"] == "cpu" - else "{}/gpu".format(MODEL_JSON_CONFIG_PATH) + "{}/{}".format(MODEL_JSON_CONFIG_PATH, self.bm_config["hardware"]) + if self.bm_config["hardware"] in ["cpu", "gpu", "neuron"] + else "{}/cpu".format(MODEL_JSON_CONFIG_PATH) ) if self.skip_ts_install: diff --git a/benchmarks/benchmark_config_neuron.yaml b/benchmarks/benchmark_config_neuron.yaml new file mode 100644 index 0000000000..38fb76c78d --- /dev/null +++ b/benchmarks/benchmark_config_neuron.yaml @@ -0,0 +1,45 @@ +# Torchserve version is to be installed. It can be one of the options +# - branch : "master" +# - nightly: "2022.3.16" +# - release: "0.5.3" +# Nightly build will be installed if "ts_version" is not specifiged +#ts_version: +# branch: &ts_version "master" + +# a list of model configure yaml files defined in benchmarks/models_config +# or a list of model configure yaml files with full path +models: + - "bert_neuron.yaml" + +# benchmark on "cpu", "gpu" or "neuron". +# "cpu" is set if "hardware" is not specified +hardware: &hardware "neuron" + +# load prometheus metrics report to remote storage or local different path if "metrics_cmd" is set. +# the command line to load prometheus metrics report to remote system. +# Here is an example of AWS cloudwatch command: +# Note: +# - keep the values order as the same as the command definition. +# - set up the command before enabling `metrics_cmd`. +# For example, aws client and AWS credentials need to be setup before trying this example. +metrics_cmd: + - "cmd": "aws cloudwatch put-metric-data" + - "--namespace": ["torchserve_benchmark_nightly_", *hardware] + - "--region": "us-east-2" + - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json' + +# load report to remote storage or local different path if "report_cmd" is set. +# the command line to load report to remote storage. 
+# Here is an example of AWS cloudwatch command: +# Note: +# - keep the values order as the same as the command. +# - set up the command before enabling `report_cmd`. +# For example, aws client, AWS credentials and S3 bucket +# need to be setup before trying this example. +# - "today()" is a keyword to apply current date in the path +# For example, the dest path in the following example is +# s3://torchserve-model-serving/benchmark/2022-03-18/gpu +report_cmd: + - "cmd": "aws s3 cp --recursive" + - "source": '/tmp/ts_benchmark/' + - "dest": ['s3://torchserve-benchmark/nightly', "today()", *hardware] diff --git a/benchmarks/models_config/bert_neuron.yaml b/benchmarks/models_config/bert_neuron.yaml index 13d9004a22..be771fb8df 100644 --- a/benchmarks/models_config/bert_neuron.yaml +++ b/benchmarks/models_config/bert_neuron.yaml @@ -1,22 +1,68 @@ --- -bert_inf1: +bert_neuron_batch_1: scripted_mode: benchmark_engine: "ab" - compile_per_batch_size: True + url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuron_batch_1.mar workers: - 4 batch_delay: 100 batch_size: - 1 + input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt" + requests: 10000 + concurrency: 100 + backend_profiling: False + exec_env: "local" + processors: + - "neuron" + +bert_neuron_batch_2: + scripted_mode: + benchmark_engine: "ab" + url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuron_batch_2.mar + workers: + - 4 + batch_delay: 100 + batch_size: - 2 + input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt" + requests: 10000 + concurrency: 100 + backend_profiling: False + exec_env: "local" + processors: + - "neuron" + +bert_neuron_batch_4: + scripted_mode: + benchmark_engine: "ab" + url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuron_batch_4.mar + workers: + - 4 + batch_delay: 100 + batch_size: - 4 + input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt" + requests: 10000 + concurrency: 100 + backend_profiling: False + exec_env: "local" + processors: + - "neuron" + +bert_neuron_batch_8: + scripted_mode: + benchmark_engine: "ab" + url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuron_batch_8.mar + workers: + - 4 + batch_delay: 100 + batch_size: - 8 - input: "./benchmarks/automated/tests/resources/neuron-bert/input" + input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt" requests: 10000 concurrency: 100 backend_profiling: False - exec_env: "aws_neuron_pytorch_p36" + exec_env: "local" processors: - - "inferentia" -instance_types: - - "inf1.6xlarge" \ No newline at end of file + - "neuron" diff --git a/examples/Huggingface_Transformers/Download_Transformer_models.py b/examples/Huggingface_Transformers/Download_Transformer_models.py index 821e89cc98..ff5af3d5c5 100644 --- a/examples/Huggingface_Transformers/Download_Transformer_models.py +++ b/examples/Huggingface_Transformers/Download_Transformer_models.py @@ -20,7 +20,14 @@ def transformers_model_dowloader( - mode, pretrained_model_name, num_labels, do_lower_case, max_length, torchscript + mode, + pretrained_model_name, + num_labels, + do_lower_case, + max_length, + torchscript, + hardware, + batch_size, ): """This function, save the checkpoint, config file along with tokenizer config and vocab files of a transformer model of your choice. 
@@ -98,11 +105,27 @@ def transformers_model_dowloader( add_special_tokens=True, return_tensors="pt", ) - input_ids = inputs["input_ids"].to(device) - attention_mask = inputs["attention_mask"].to(device) model.to(device).eval() - traced_model = torch.jit.trace(model, (input_ids, attention_mask)) - torch.jit.save(traced_model, os.path.join(NEW_DIR, "traced_model.pt")) + if hardware == "neuron": + import torch_neuron + + input_ids = torch.cat([inputs["input_ids"]] * batch_size, 0).to(device) + attention_mask = torch.cat([inputs["attention_mask"]] * batch_size, 0).to( + device + ) + traced_model = torch_neuron.trace(model, (input_ids, attention_mask)) + torch.jit.save( + traced_model, + os.path.join( + NEW_DIR, + "traced_{}_model_neuron_batch_{}.pt".format(model_name, batch_size), + ), + ) + else: + input_ids = inputs["input_ids"].to(device) + attention_mask = inputs["attention_mask"].to(device) + traced_model = torch.jit.trace(model, (input_ids, attention_mask)) + torch.jit.save(traced_model, os.path.join(NEW_DIR, "traced_model.pt")) return @@ -124,7 +147,16 @@ def transformers_model_dowloader( torchscript = True else: torchscript = False + hardware = settings.get("hardware") + batch_size = int(settings.get("batch_size", "1")) transformers_model_dowloader( - mode, model_name, num_labels, do_lower_case, max_length, torchscript + mode, + model_name, + num_labels, + do_lower_case, + max_length, + torchscript, + hardware, + batch_size, ) diff --git a/examples/Huggingface_Transformers/README.md b/examples/Huggingface_Transformers/README.md index a11b7ec919..4a24f1fe09 100644 --- a/examples/Huggingface_Transformers/README.md +++ b/examples/Huggingface_Transformers/README.md @@ -1,6 +1,6 @@ ## Serving Huggingface Transformers using TorchServe -In this example, we show how to serve a fine tuned or off the shelf Transformer model from [huggingface](https://huggingface.co/transformers/index.html) using TorchServe. +In this example, we show how to serve a fine tuned or off the shelf Transformer model from [huggingface](https://huggingface.co/transformers/index.html) using TorchServe. We use a custom handler, [Transformer_handler.py](https://github.com/pytorch/serve/blob/master/examples/Huggingface_Transformers/Transformer_handler_generalized.py). @@ -51,21 +51,25 @@ In the setup_config.json : *embedding_name* : The name of embedding layer in the chosen model, this could be `bert` for `bert-base-uncased`, `roberta` for `roberta-base` or `roberta` for `xlm-roberta-large`, or `gpt2` for `gpt2` model +*hardware* : The target platform to trace the model for. Specify as `neuron` for [Inferentia1](https://aws.amazon.com/ec2/instance-types/inf1/). + +*batch_size* : Input batch size when tracing the model for `neuron` as target hardware. + Once, `setup_config.json` has been set properly, the next step is to run `python Download_Transformer_models.py` -This produces all the required files for packaging using a huggingface transformer model off-the-shelf without fine-tuning process. Using this option will create and saved the required files into Transformer_model directory. +This produces all the required files for packaging using a huggingface transformer model off-the-shelf without fine-tuning process. Using this option will create and saved the required files into Transformer_model directory. #### Setting the extra_files -There are few files that are used for model packaging and at the inference time. +There are few files that are used for model packaging and at the inference time. 
* `index_to_name.json`: maps predictions to labels * `sample_text.txt`: input text for inference * `vocab.txt`: by default will use the tokenizer from the pretrained model -For custom vocabs, it is required to pass all other tokenizer related files such `tokenizer_config.json`, `special_tokens_map.json`, `config.json` and if available `merges.txt`. +For custom vocabs, it is required to pass all other tokenizer related files such `tokenizer_config.json`, `special_tokens_map.json`, `config.json` and if available `merges.txt`. For examples of how to configure a model for a use case and what the input format should look like * Model configuration: `Transformer_model` directory after running `python Download_Transformer_models.py` @@ -278,7 +282,7 @@ For batch inference the main difference is that you need set the batch size whil mv BERTSeqClassification.mar model_store/ torchserve --start --model-store model_store --ts-config config.properties --models BERTSeqClassification= BERTSeqClassification.mar - ``` + ``` Now to run the batch inference following command can be used: ``` @@ -293,7 +297,7 @@ curl -X POST http://127.0.0.1:8080/predictions/BERTSeqClassification -T ./Seq_c The [Captum Explanations for Visual Insights Notebook](https://github.com/pytorch/serve/tree/master/examples/captum/Captum_visualization_for_bert.ipynb) provides a visual example for how model interpretations can help -Known issues: +Known issues: * Captum does't work well for batched inputs and may result in timeouts * No support for torchscripted models @@ -318,7 +322,7 @@ Note: make sure to install [HuggingFace Optimum] `pip install optimum` Main speed ups in the Better Transformer comes from kernel fusion in the [TransformerEncoder] (https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoder.html) and making use of sparsity with [nested tensors](https://pytorch.org/tutorials/prototype/nestedtensor.html) when input sequences are padded to avoid unnecessary computation on padded tensors. We have seen up to 4.5x speed up with distill_bert when used higher batch sizes with padding. Please read more about it in this [blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2). You get some speedups even with Batch size = 1 and no padding however, major speed ups will show up when running inference with higher batch sizes (8.16,32) with padding. -## Model Parallelism +## Model Parallelism [Parallelize] (https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Model.parallelize) is a an experimental feature that HuggingFace recently added to support large model inference for some very large models, GPT2 and T5. GPT2 model choices based on their size are gpt2-medium, gpt2-large, gpt2-xl. This feature only supports LMHeadModel that could be used for text generation, other application such as sequence, token classification and question answering are not supported. We have added parallelize support for GPT2 model in the custom handler in this example that will enable you to perform model parallel inference for GPT2 models used for text generation. The same logic in the handler can be extended to T5 and the applications it supports. Make sure that you register your model with one worker using this feature. To run this example, a machine with #gpus > 1 is required. The number of required gpus depends on the size of the model. This feature only supports single node, one machine with multi-gpus. 
@@ -356,7 +360,7 @@ To register the model on TorchServe using the above model archive file, we run t ``` mkdir model_store mv Textgeneration.mar model_store/ -torchserve --start --model-store model_store +torchserve --start --model-store model_store curl -X POST "localhost:8081/models?model_name=Textgeneration&url=Textgeneration.mar&batch_size=1&max_batch_delay=5000&initial_workers=1&synchronous=true" ``` diff --git a/examples/Huggingface_Transformers/Transformer_handler_generalized_neuron.py b/examples/Huggingface_Transformers/Transformer_handler_generalized_neuron.py new file mode 100644 index 0000000000..c74ac9e9b4 --- /dev/null +++ b/examples/Huggingface_Transformers/Transformer_handler_generalized_neuron.py @@ -0,0 +1,32 @@ +import os + +import torch +from Transformer_handler_generalized import TransformersSeqClassifierHandler + +if "NEURON_RT_NUM_CORES" not in os.environ: + os.environ["NEURON_RT_NUM_CORES"] = "1" + + +class TransformersSeqClassifierNeuronHandler(TransformersSeqClassifierHandler): + def inference(self, input_batch): + """Predict the class (or classes) of the received text using the + serialized transformers checkpoint. + Args: + input_batch (list): List of Text Tensors from the pre-process function is passed here + Returns: + list : It returns a list of the predicted value for the input text + """ + input_ids_batch, attention_mask_batch = input_batch + num_inferences = len(input_ids_batch) + batch_size = int(self.setup_config.get("batch_size", "1")) + + # insert padding if a partial batch was received + padding = batch_size - num_inferences + if padding > 0: + pad = torch.nn.ConstantPad1d((0, 0, 0, padding), value=0) + input_ids_batch = pad(input_ids_batch) + attention_mask_batch = pad(attention_mask_batch) + + inferences = super().inference((input_ids_batch, attention_mask_batch)) + + return inferences[:num_inferences]
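Reviewer note: the padding step in `Transformer_handler_generalized_neuron.py` above exists because a `torch_neuron`-traced model only accepts the fixed batch size it was traced with, so a partial batch has to be zero-padded and the extra outputs dropped. A minimal, self-contained sketch of that step, with illustrative tensor shapes (not taken from the PR):

```python
# Standalone sketch of the partial-batch padding in the Neuron handler above.
# Shapes are illustrative; the traced batch size comes from setup_config.json.
import torch

traced_batch_size = 4                                   # batch size the model was traced with
input_ids_batch = torch.ones(3, 128, dtype=torch.long)  # only 3 requests arrived this round
attention_mask_batch = torch.ones(3, 128, dtype=torch.long)

num_inferences = len(input_ids_batch)                   # size of the batch dimension
padding = traced_batch_size - num_inferences
if padding > 0:
    # (0, 0, 0, padding) pads nothing on the sequence dim and `padding` rows at the
    # end of the batch dim -- the same effect as the handler's ConstantPad1d call.
    input_ids_batch = torch.nn.functional.pad(input_ids_batch, (0, 0, 0, padding), value=0)
    attention_mask_batch = torch.nn.functional.pad(attention_mask_batch, (0, 0, 0, padding), value=0)

assert input_ids_batch.shape == (traced_batch_size, 128)
# After the padded batch goes through the traced model, only the first
# `num_inferences` results are returned to callers, exactly as in the handler.
```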
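Relatedly, the `hardware` and `batch_size` keys documented in the Huggingface_Transformers README changes above are read by `Download_Transformer_models.py` to decide whether to trace with `torch_neuron` and at what fixed batch size, and by the Neuron handler to know how far to pad. A sketch of a `setup_config.json` for the Neuron path follows; only `hardware` and `batch_size` are introduced by this PR, while the remaining keys mirror the existing sequence-classification example and their values are placeholders:

```json
{
  "model_name": "bert-base-uncased",
  "mode": "sequence_classification",
  "do_lower_case": true,
  "num_labels": "2",
  "save_mode": "torchscript",
  "max_length": "150",
  "captum_explanation": false,
  "embedding_name": "bert",
  "hardware": "neuron",
  "batch_size": "4"
}
```

Note that `batch_size` here should match the batch size baked into the traced model (and the corresponding `.mar`, e.g. `BERTSeqClassification_torchscript_neuron_batch_4.mar` from `bert_neuron.yaml`), since the traced graph only accepts that exact batch dimension.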