forked from NVIDIA/DeepLearningExamples
Commit
Revert changes to FastPitch and BERT on PyTorch
1 parent a860701 · commit 6a64283
Showing 16 changed files with 438 additions and 441 deletions.
202 changes: 202 additions & 0 deletions
PyTorch/SpeechSynthesis/FastPitch/triton/config_model_on_triton.py
@@ -0,0 +1,202 @@
#!/usr/bin/env python3

# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r""" | ||
To configure model on Triton, you can use `config_model_on_triton.py` script. | ||
This will prepare layout of Model Repository, including Model Configuration. | ||
```shell script | ||
python ./triton/config_model_on_triton.py \ | ||
--model-repository /model_repository \ | ||
--model-path /models/exported/model.onnx \ | ||
--model-format onnx \ | ||
--model-name ResNet50 \ | ||
--model-version 1 \ | ||
--max-batch-size 32 \ | ||
--precision fp16 \ | ||
--backend-accelerator trt \ | ||
--load-model explicit \ | ||
--timeout 120 \ | ||
--verbose | ||
``` | ||
If the Triton server for which you prepare the model repository runs in
**explicit model control mode**, use the `--load-model explicit` argument to send a
load_model request to the Triton Inference Server.
If the server listens on a non-default address or port, use the `--server-url`
argument to point at the server control endpoint.
To communicate with the Triton server over HTTP instead of gRPC, pass a URL with an
`http://` scheme in `--server-url` (the script expects `protocol://host[:port]`),
as in the sketch below.
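
For example, a minimal sketch that deploys over HTTP in explicit mode (the host
name is illustrative; model path and name reuse the example above):

```shell script
python ./triton/config_model_on_triton.py \
    --model-repository /model_repository \
    --model-path /models/exported/model.onnx \
    --model-format onnx \
    --model-name ResNet50 \
    --server-url http://triton-host:8000 \
    --load-model explicit
```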

To improve inference throughput you can enable
[dynamic batching](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md#dynamic-batcher)
for your model by providing the `--preferred-batch-sizes` and `--max-queue-delay-us`
parameters, as in the sketch below.
For models that do not support batching, set `--max-batch-size` to 0.
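
For instance, a sketch that enables the dynamic batcher with two preferred batch
sizes and a 100 us maximum queue delay (the values are illustrative, not tuned):

```shell script
python ./triton/config_model_on_triton.py \
    --model-repository /model_repository \
    --model-path /models/exported/model.onnx \
    --model-format onnx \
    --model-name ResNet50 \
    --max-batch-size 32 \
    --preferred-batch-sizes 16 32 \
    --max-queue-delay-us 100
```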

By default Triton will [automatically obtain input and output definitions](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md#auto-generated-model-configuration),
but for TorchScript and TF GraphDef models the script uses a file with I/O specs.
This file is generated automatically when the model is converted to a ScriptModule
(either traced or scripted).
If you need to pass a non-default path to the I/O spec file, use the `--io-spec`
CLI argument.
The I/O spec file is a YAML file with the following structure:
```yaml
- inputs:
  - name: input
    dtype: float32  # np.dtype name
    shape: [None, 224, 224, 3]
- outputs:
  - name: probabilities
    dtype: float32
    shape: [None, 1001]
  - name: classes
    dtype: int32
    shape: [None, 1]
```
"""

import argparse
import logging
import time

from model_navigator import Accelerator, Format, Precision
from model_navigator.args import str2bool
from model_navigator.log import set_logger, log_dict
from model_navigator.triton import ModelConfig, TritonClient, TritonModelStore

LOGGER = logging.getLogger("config_model")


def _available_enum_values(my_enum):
    return [item.value for item in my_enum]


def main():
    parser = argparse.ArgumentParser(
        description="Create Triton model repository and model configuration", allow_abbrev=False
    )
    parser.add_argument("--model-repository", required=True, help="Path to Triton model repository.")
    parser.add_argument("--model-path", required=True, help="Path to model to configure")

    # TODO: automation
    parser.add_argument(
        "--model-format",
        required=True,
        choices=_available_enum_values(Format),
        help="Format of model to deploy",
    )
    parser.add_argument("--model-name", required=True, help="Model name")
    parser.add_argument("--model-version", default="1", help="Version of model (default 1)")
    parser.add_argument(
        "--max-batch-size",
        type=int,
        default=32,
        help="Maximum batch size allowed for inference. "
        "A max_batch_size value of 0 indicates that batching is not allowed for the model",
    )
    # TODO: automation
    parser.add_argument(
        "--precision",
        type=str,
        default=Precision.FP16.value,
        choices=_available_enum_values(Precision),
        help="Model precision (parameter used only by Tensorflow backend with TensorRT optimization)",
    )

    # Triton Inference Server endpoint
    parser.add_argument(
        "--server-url",
        type=str,
        default="grpc://localhost:8001",
        help="Inference server URL in format protocol://host[:port] (default grpc://localhost:8001)",
    )
    parser.add_argument(
        "--load-model",
        choices=["none", "poll", "explicit"],
        help="Loading model while Triton Server is in given model control mode",
    )
    parser.add_argument(
        "--timeout", default=120, help="Timeout in seconds to wait till model load (default=120)", type=int
    )

    # optimization related
    parser.add_argument(
        "--backend-accelerator",
        type=str,
        choices=_available_enum_values(Accelerator),
        default=Accelerator.TRT.value,
        help="Select Backend Accelerator used to serve model",
    )
    parser.add_argument("--number-of-model-instances", type=int, default=1, help="Number of model instances per GPU")
    parser.add_argument(
        "--preferred-batch-sizes",
        type=int,
        nargs="*",
        help="Batch sizes that the dynamic batcher should attempt to create. "
        "In case --max-queue-delay-us is set and this parameter is not, default value will be --max-batch-size",
    )
    parser.add_argument(
        "--max-queue-delay-us",
        type=int,
        default=0,
        help="Max delay time which dynamic batcher shall wait to form a batch (default 0)",
    )
    parser.add_argument(
        "--capture-cuda-graph",
        type=int,
        default=0,
        help="Use CUDA graph capture (used only by the TensorRT platform)",
    )

    parser.add_argument("-v", "--verbose", help="Provide verbose logs", type=str2bool, default=False)
    args = parser.parse_args()

    set_logger(verbose=args.verbose)
    log_dict("args", vars(args))

    config = ModelConfig.create(
        model_path=args.model_path,
        # model definition
        model_name=args.model_name,
        model_version=args.model_version,
        model_format=args.model_format,
        precision=args.precision,
        max_batch_size=args.max_batch_size,
        # optimization
        accelerator=args.backend_accelerator,
        gpu_engine_count=args.number_of_model_instances,
        preferred_batch_sizes=args.preferred_batch_sizes or [],
        max_queue_delay_us=args.max_queue_delay_us,
        capture_cuda_graph=args.capture_cuda_graph,
    )

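    # Deploy into the standard Triton repository layout, expected to be
    # {model-repository}/{model-name}/{model-version}/ plus a generated config.pbtxt.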
    model_store = TritonModelStore(args.model_repository)
    model_store.deploy_model(model_config=config, model_path=args.model_path)

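    # Optionally ask the running server to pick up the model and block until it is ready.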
    if args.load_model != "none":
        client = TritonClient(server_url=args.server_url, verbose=args.verbose)
        client.wait_for_server_ready(timeout=args.timeout)

        if args.load_model == "explicit":
            client.load_model(model_name=args.model_name)

        if args.load_model == "poll":
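            # Assumption: in poll mode Triton rescans the model repository on its own
            # schedule, so sleep briefly to let the next scan pick up the new model.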
            time.sleep(15)

        client.wait_for_model(model_name=args.model_name, model_version=args.model_version, timeout_s=args.timeout)


if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion
PyTorch/SpeechSynthesis/FastPitch/triton/deployment_toolkit/.version
@@ -1 +1 @@
-0.6.13-4-g623fb7c3
+0.5.0-2-gd556907