Update the cloned project #2

Merged: 118 commits, Sep 20, 2020
Commits
c4f90be
Add QAT instructions for RN50
peri044 Jun 21, 2020
4972eb1
[FT] 1. Push the FasterTransformer v2.1 (#582)
byshiue Jun 30, 2020
e0f399d
Update frozen graph script and instructions
peri044 Jun 30, 2020
4f02f7a
Update Dockerfile
Jul 1, 2020
4d80805
Merge pull request #583 from NVIDIA/tts-taco2-convai-patch-1
Jul 1, 2020
6d2357a
Code refactor
peri044 Jul 1, 2020
da2e33a
Merge branch 'master' into rn50_qat_v2
peri044 Jul 1, 2020
ed7831d
Update instructions and output node name
peri044 Jul 1, 2020
00bde50
Remove folder for QAT
peri044 Jul 1, 2020
57d9cc0
Update instructions
peri044 Jul 1, 2020
ac4c539
fix file location
vinhngx Jul 3, 2020
24b8c9c
Merge pull request #586 from vinhngx/vinhn-jasper-file-location-fix
nvpstr Jul 3, 2020
96138d5
[BERT/TF] Updating for Ampere
nvpstr Jul 3, 2020
f0c8bc5
[Tacotron2/PyT] Updating for Ampere
nvpstr Jul 3, 2020
36f3b1b
[DLRM/PyT] Updates for Ampere
nvpstr Jul 3, 2020
76a056c
[VNet/TF] Updating for 20.06 container
nvpstr Jul 3, 2020
b27abeb
[UNet_medical/TF1&2] Updating for Ampere
nvpstr Jul 3, 2020
9f4678d
[UNet industrial/TF] Updating for Ampere
nvpstr Jul 4, 2020
77dad06
[FastPitch] Updating for Ampere
nvpstr Jul 4, 2020
2860d6f
[Transformer/PyT] Updating for 20.06 and Ampere
nvpstr Jul 4, 2020
6e12b5a
[Transformer-XL/TF] Updating for Ampere
nvpstr Jul 4, 2020
f8b3a63
[BERT/PyT] Updating for Ampere and 20.06 container
nvpstr Jul 4, 2020
79d4ced
Adding 3DUnet/TF
nvpstr Jul 4, 2020
8011a06
Update README.md
nv-kkudrynski Jul 6, 2020
2de2945
Merge pull request #587 from NVIDIA/nvpstr/79d4ced
nv-kkudrynski Jul 6, 2020
4cb58b6
Add licenses to new files
peri044 Jul 6, 2020
18db1c1
Update README.md
sharathts Jul 7, 2020
d9050d6
[MaskRCNN/PyT] Fix indentation (#588)
sharathts Jul 7, 2020
b2763ae
[BERT/PyT] Update ampere perf params (#589)
sharathts Jul 7, 2020
ae7fce1
[Jasper/PyT] Updating for Ampere
nv-kkudrynski Jul 7, 2020
b2b2d1c
Merge pull request #590 from NVIDIA/jasper-ampere
nv-kkudrynski Jul 7, 2020
31ca062
[SSD/PyT] Updating for Ampere
nv-kkudrynski Jul 7, 2020
1e35179
[SSD/TF] Updating for Ampere
nv-kkudrynski Jul 7, 2020
7f4ea44
Merge pull request #591 from NVIDIA/ssds-ampere
nv-kkudrynski Jul 7, 2020
c480fbf
[Tacotron2] fixed get_model in inference.py
Jul 8, 2020
33bdf65
readme fixes
nv-kkudrynski Jul 8, 2020
3d497a6
Merge pull request #593 from NVIDIA/readme_fixes
nv-kkudrynski Jul 8, 2020
2729732
Update README.md
nv-kkudrynski Jul 8, 2020
37672df
Merge pull request #584 from peri044/rn50_qat_v2
nv-kkudrynski Jul 8, 2020
25fe4d3
Merge pull request #592 from tonmoay/tonmoay-tt2patch
Jul 8, 2020
aa36bc0
[Jasper/Pyt] Fix images in readme
nv-kkudrynski Jul 8, 2020
038e7f1
Merge pull request #594 from NVIDIA/images-fix
nv-kkudrynski Jul 8, 2020
3337f72
[BERT/PyT] Update DataPrep (#595)
sharathts Jul 9, 2020
40c3be6
[Transformer-XL/TF] Updated perf table
nv-kkudrynski Jul 9, 2020
ced4afc
Merge pull request #596 from NVIDIA/gh/release
nv-kkudrynski Jul 9, 2020
878b004
[NCF/TF, WideAndDeep/TF] Updating for Ampere, [BERT/TF] MRPC support
nv-kkudrynski Jul 14, 2020
8a58081
Merge pull request #606 from NVIDIA/gh/release
nvpstr Jul 14, 2020
1803824
[Transformer/PyT] Removing unnecessary files
nvpstr Jul 16, 2020
b31d091
[Electra/TF2] Adding new model
nvpstr Jul 20, 2020
1182059
Merge pull request #10 from NVIDIA/master
swethmandava Jul 20, 2020
26e47a1
update triton for amp
Jul 20, 2020
c2051b4
pulling v1 tritonserver
Jul 20, 2020
cbe6746
Merge pull request #611 from NVIDIA/nvpstr/b31d09
nvpstr Jul 21, 2020
67370c8
Merge pull request #612 from swethmandava/master
nvpstr Jul 21, 2020
108a783
fix typo (#605)
372046933 Jul 21, 2020
0ef5568
[BERT/PyT] default train:test 9:1 (#616)
sharathts Jul 22, 2020
9ba22d1
[MaskRCNN/TF] Update extract_RN50_weights.py (#597)
amrragab8080 Jul 23, 2020
f3f1311
Enable async_io for squad.
trentlo Jul 23, 2020
388f1ac
Insert needed space as separator.
trentlo Jul 23, 2020
f4625a2
Update checkpoint MaskRCNN
pribalta Jul 23, 2020
cd7d870
Merge pull request #619 from NVIDIA/pribalta/maskrcnn_checkpoint_update
nvpstr Jul 23, 2020
63a2a2e
Update README.md
alvarognvidia Jul 27, 2020
a275790
Merge pull request #623 from alvarognvidia/master
nvpstr Jul 30, 2020
5a87078
[Electra/TF2] Fix container image and num_gpu arg (#628)
sharathts Jul 30, 2020
e0fcb62
Add FastSpeech in CUDA-Optimized
andabi Jul 31, 2020
a2a5e00
Merge pull request #629 from andabi/master
nvpstr Jul 31, 2020
21a77af
Merge pull request #617 from trentlo/async_io_for_squad
swethmandava Jul 31, 2020
5d2914e
[GNMT/PyT] Updating for Ampere
nv-kkudrynski Aug 1, 2020
360d1f4
[GNMT/TF] Updating for Ampere
nv-kkudrynski Aug 1, 2020
adddf66
Update README.md
mmarcinkiewicz Aug 2, 2020
d8cc4a9
remove pretrained aligns and update readme accordingly.
andabi Aug 3, 2020
409f03c
Merge pull request #631 from NVIDIA/unetmed-perf-fix
pribalta Aug 3, 2020
1f92262
[BERT/PyT] link gitlab -> DLE (#634)
sharathts Aug 4, 2020
9fa7581
Merge branch 'gh/master' into gh/release
nv-kkudrynski Aug 5, 2020
557f4d0
[Jasper/PyT] Triton update
nv-kkudrynski Aug 5, 2020
2356b89
[WideAndDeep/TF] scripts fix
nv-kkudrynski Aug 5, 2020
386dd8e
Merge pull request #630 from NVIDIA/gh/release
nv-kkudrynski Aug 5, 2020
22f354e
[VAE/TF] Updating for Ampere
nv-kkudrynski Aug 5, 2020
280e75c
Merge pull request #636 from NVIDIA/gh/release
nv-kkudrynski Aug 5, 2020
1aa6813
[FT] 1. Fix the bug of TensorRT plugin of FasterTransformer encoder. …
byshiue Aug 6, 2020
36ad5fe
Update .gitmodules
nv-kkudrynski Aug 6, 2020
769843e
Merge pull request #11 from NVIDIA/master
swethmandava Aug 10, 2020
b82c372
triton v2 api, download mrpc fix, update for mpi 4.2
Aug 10, 2020
efd6384
pointing to wikiextractor commit
Aug 10, 2020
e8f87ac
Keep wikiextractor version fixed
sharathts Aug 10, 2020
1069a73
converge to pyt
Aug 10, 2020
c8bbdb5
Merge pull request #644 from swethmandava/master
swethmandava Aug 11, 2020
41a0891
Merge pull request #645 from NVIDIA/sharathts-patch-4
nv-kkudrynski Aug 11, 2020
fb40734
Remove autobench scripts (#647)
pribalta Aug 12, 2020
9d4c9f3
tfrecords with correct name
Aug 13, 2020
7c0afee
Merge pull request #648 from swethmandava/master
swethmandava Aug 13, 2020
88864b9
[BERT/PyT] MRPC and SST-2 support
nv-kkudrynski Aug 14, 2020
ff7e38b
Merge pull request #650 from NVIDIA/bert_pyt_mrpc
nv-kkudrynski Aug 14, 2020
3745b49
[DLRM/PyT] Update
nv-kkudrynski Aug 17, 2020
d875531
Merge pull request #654 from NVIDIA/dlrm_update
nv-kkudrynski Aug 17, 2020
0d15a95
[DLRM/PyT] Readme fixes
nv-kkudrynski Aug 18, 2020
bbbc823
Merge pull request #655 from NVIDIA/gh/release
nv-kkudrynski Aug 18, 2020
446c878
[ELECTRA/TF2] Update inference latency (#657)
sharathts Aug 20, 2020
8bd6dd1
Document synthetic dataset options
hXl3s Aug 20, 2020
0e6cfbd
Merge pull request #659 from hXl3s/RN50/readme-update
nv-kkudrynski Aug 20, 2020
5cc03ca
[BERT/PyT] Update pretrained checkpoint links (#660)
sharathts Aug 21, 2020
8588e98
[BERT/PyT] specify GPU for triton (#666)
sharathts Sep 2, 2020
21fcdd6
[DLRM/PyT] Triton updates
nv-kkudrynski Sep 7, 2020
323005c
Merge pull request #676 from NVIDIA/gh/release
nv-kkudrynski Sep 8, 2020
5d36b4f
Fixing hyperlinks
nv-kkudrynski Sep 8, 2020
7a4c425
[BERT/PyT] Fix dataloader typo
Sep 9, 2020
cf54b78
fixed link
nv-kkudrynski Sep 10, 2020
1402e94
Update CUDA-Optimized/FastSpeech/README.md
nv-kkudrynski Sep 11, 2020
49e387c
Merge pull request #633 from andabi/master
nv-kkudrynski Sep 11, 2020
152d0c0
Merge pull request #684 from gpauloski/bert_pytorch_fix
nv-kkudrynski Sep 11, 2020
6b82d3a
[TXL/PyT] Minor update for PyTorch Transformer-XL (#688)
szmigacz Sep 14, 2020
437b950
Fixed links in readme
nv-kkudrynski Sep 14, 2020
482fe9a
[BERT/PyT] fix onnx export (#689)
sharathts Sep 15, 2020
aacbda6
Update Jasper sample to TensorRT 7.1.3.4 (#687)
rajeevsrao Sep 15, 2020
a74236a
[BERT/PyT] remove redundant section (#690)
sharathts Sep 17, 2020
72f40b8
Fixed distributed checkpoint loading
hXl3s Sep 18, 2020
94518be
Merge pull request #693 from hXl3s/RN50/ngc-checkpoint-update
nv-kkudrynski Sep 18, 2020
66d1891
Merge branch 'master' into master
byshiue Sep 20, 2020
11 changes: 4 additions & 7 deletions .gitmodules
@@ -1,7 +1,4 @@
[submodule "PyTorch/Translation/Transformer/cutlass"]
path = PyTorch/Translation/Transformer/cutlass
url = https://github.com/NVIDIA/cutlass.git
[submodule "PyTorch/SpeechRecognition/Jasper/external/tensorrt-inference-server"]
path = PyTorch/SpeechRecognition/Jasper/external/tensorrt-inference-server
url = https://github.com/NVIDIA/tensorrt-inference-server.git
branch = r19.06
[submodule "PyTorch/SpeechRecognition/Jasper/external/triton-inference-server"]
path = PyTorch/SpeechRecognition/Jasper/external/triton-inference-server
url = https://github.com/NVIDIA/triton-inference-server.git
branch = r19.12
5 changes: 5 additions & 0 deletions CUDA-Optimized/FastSpeech/.gitignore
@@ -0,0 +1,5 @@
.idea
__pycache__
.DS_Store
*.egg-info
.vscode
6 changes: 6 additions & 0 deletions CUDA-Optimized/FastSpeech/.gitmodules
@@ -0,0 +1,6 @@
[submodule "waveglow"]
path = waveglow
url = https://github.com/NVIDIA/waveglow.git
[submodule "cub"]
path = cub
url = https://github.com/NVlabs/cub.git
7 changes: 7 additions & 0 deletions CUDA-Optimized/FastSpeech/Dockerfile
@@ -0,0 +1,7 @@
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.03-py3
FROM ${FROM_IMAGE_NAME}

ADD . /workspace/fastspeech
WORKDIR /workspace/fastspeech

RUN sh ./scripts/install.sh
23 changes: 23 additions & 0 deletions CUDA-Optimized/FastSpeech/LICENSE
@@ -0,0 +1,23 @@
Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the NVIDIA CORPORATION nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2 changes: 2 additions & 0 deletions CUDA-Optimized/FastSpeech/MANIFEST.in
@@ -0,0 +1,2 @@
include fastspeech/hparams/*.yaml
recursive-include fastspeech/trt/plugins *.so
450 changes: 450 additions & 0 deletions CUDA-Optimized/FastSpeech/README.md

Large diffs are not rendered by default.

34 changes: 34 additions & 0 deletions CUDA-Optimized/FastSpeech/fastspeech/__init__.py
@@ -0,0 +1,34 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
from fastspeech.utils.hparam import Hparam
import torch

# hyperparameter
HP_ROOT_PATH = os.path.join(os.path.dirname(__file__), 'hparams')
hparam = Hparam(HP_ROOT_PATH)

# device
DEFAULT_DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
137 changes: 137 additions & 0 deletions CUDA-Optimized/FastSpeech/fastspeech/align_tacotron2.py
@@ -0,0 +1,137 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import pathlib

import fire
import torch
from tqdm import tqdm
from fastspeech.data_load import PadDataLoader
from fastspeech.dataset.ljspeech_dataset import LJSpeechDataset
import tacotron2.train
import tacotron2.hparams
from fastspeech import hparam as hp, DEFAULT_DEVICE
import os
import numpy as np

from fastspeech.utils.logging import tprint
from fastspeech.utils.pytorch import to_device_async, to_cpu_numpy


def get_tacotron2(device, is_training=False):
hparams = tacotron2.hparams.create_hparams()
model = tacotron2.train.load_model(hparams)
model.load_state_dict(torch.load(
hp.tacotron2_path, map_location=torch.device(device))["state_dict"])
if is_training:
model.train()
else:
model.eval()
return model


def get_duration(texts, text_lens, mels, mel_lens, tacotron2, device):
texts = to_device_async(texts, device)
text_lens = to_device_async(text_lens, device)
mels = to_device_async(mels, device)
mel_lens = to_device_async(mel_lens, device)

_, _, _, aligns = tacotron2.forward(
(texts, text_lens, mels, None, mel_lens))

aligns = to_cpu_numpy(aligns)
durs = torch.FloatTensor([compute_duration(align) for align in aligns])

return durs


def compute_duration(align):
"""
Warning. This code assumes the attention is monotonic.
"""
d_mel, d_text = align.shape
dur = np.array([0 for _ in range(d_text)])

for i in range(d_mel):
idx = np.argmax(align[i])
dur[idx] += 1

return dur


def preprocess_aligns(
hparam="base.yaml",
device=DEFAULT_DEVICE):
""" The script for preprocessing alignments.

By default, this script assumes to load parameters in the default config file, fastspeech/hparams/base.yaml.

--dataset_path=DATASET_PATH
Path to dataset directory.
--tacotron2_path=TACOTRON2_PATH
Path to tacotron2 checkpoint file.
--aligns_path=ALIGNS_PATH
Path to output preprocessed alignments directory.

Refer to fastspeech/hparams/base.yaml to see more parameters.

Args:
hparam (str, optional): Path to default config file. Defaults to "base.yaml".
device (str, optional): Device to use. Defaults to "cuda" if available, or "cpu".
"""

hp.set_hparam(hparam)

pathlib.Path(hp.aligns_path).mkdir(parents=True, exist_ok=True)

dataset = LJSpeechDataset(hp.dataset_path)
dataloader = PadDataLoader(
dataset, batch_size=1, shuffle=False, num_workers=32, drop_last=True)

tacotron2 = get_tacotron2(device, is_training=True)
to_device_async(tacotron2, device)

for batched in tqdm(dataloader):
names = batched['name']
texts = batched['text_encoded']
text_lens = batched['text_len']
mels = batched['mel']
mel_lens = batched['mel_len']

tprint("Processing {}.".format(', '.join(names)))
durs = get_duration(texts, text_lens, mels,
mel_lens, tacotron2, device)

for i, (name, dur) in enumerate(zip(names, durs)):
save_path = os.path.join(hp.aligns_path, name + ".align.npy")

if os.path.exists(save_path):
continue

np.save(save_path, dur)
# assert sum(duration) == len(align)


if __name__ == '__main__':
fire.Fire(preprocess_aligns)
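The duration extraction in `align_tacotron2.py` above counts, for each mel frame, which text token the attention weights point at. A minimal standalone sketch of that `compute_duration` logic follows; the toy alignment matrix is invented for illustration and is not from the repository:

```python
import numpy as np

def compute_duration(align):
    """Per-token durations from a (mel_frames x text_len) alignment.
    As the original docstring warns, this assumes monotonic attention."""
    d_mel, d_text = align.shape
    dur = np.zeros(d_text, dtype=np.int64)
    for i in range(d_mel):
        # Each mel frame is attributed to its most-attended text token.
        dur[np.argmax(align[i])] += 1
    return dur

# Toy alignment: 5 mel frames attending over 3 text tokens.
align = np.array([
    [0.9, 0.1, 0.0],
    [0.8, 0.2, 0.0],
    [0.1, 0.7, 0.2],
    [0.0, 0.6, 0.4],
    [0.0, 0.2, 0.8],
])
dur = compute_duration(align)
print(dur)  # [2 2 1]
```

Note the invariant the commented-out assertion in the diff hints at: the durations always sum to the number of mel frames, since each frame is assigned to exactly one token.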
43 changes: 43 additions & 0 deletions CUDA-Optimized/FastSpeech/fastspeech/audio.py
@@ -0,0 +1,43 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import numpy as np


def dynamic_range_compression(x, C=1, clip_val=1e-5):
"""
PARAMS
------
C: compression factor
"""
return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)


def dynamic_range_decompression(x, C=1):
"""
PARAMS
------
C: compression factor used to compress
"""
return np.exp(x) / C
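The two helpers in `audio.py` above are inverses of each other for inputs above `clip_val`: compression takes a log (after clipping away near-zero values), decompression exponentiates and undoes the factor `C`. A quick round-trip check, using the functions exactly as defined in the diff:

```python
import numpy as np

def dynamic_range_compression(x, C=1, clip_val=1e-5):
    # log-compress; clip protects against log(0)
    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)

def dynamic_range_decompression(x, C=1):
    # exact inverse of the compression for x >= clip_val
    return np.exp(x) / C

x = np.array([0.25, 1.0, 4.0])
y = dynamic_range_decompression(dynamic_range_compression(x))
print(np.allclose(x, y))  # True: round trip is lossless above clip_val
```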
89 changes: 89 additions & 0 deletions CUDA-Optimized/FastSpeech/fastspeech/data_load.py
@@ -0,0 +1,89 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import numpy as np
import torch
from torch.utils.data import DataLoader

class PadDataLoader(DataLoader):

@staticmethod
def pad_collate_fn(batch):
"""
Apply zero-padding.
"""
# TODO refactor
result = dict()
for key in batch[0].keys():
# apply padding on dataset
sub_batch = [elem[key] for elem in batch]
# check diff dims
if not isinstance(sub_batch[0], np.ndarray):
# if list of float or int
assert all([type(x) == type(sub_batch[0]) for x in sub_batch[1:]]), sub_batch
if isinstance(sub_batch[0], int):
sub_batch = torch.LongTensor(sub_batch)
elif isinstance(sub_batch[0], float):
sub_batch = torch.DoubleTensor(sub_batch)

elif any(list(map(lambda x: x.shape != sub_batch[0].shape, sub_batch[1:]))):
sub_batch = torch.from_numpy(__class__.pad_zero(sub_batch))
else:
sub_batch = torch.from_numpy(np.concatenate(np.expand_dims(sub_batch, axis=0)))
result[key] = sub_batch
return result

def __init__(self, dataset, batch_size, num_workers, shuffle=True, pin_memory=True, drop_last=True):
super().__init__(dataset,
batch_size=batch_size,
shuffle=shuffle,
num_workers=num_workers,
pin_memory=pin_memory,
collate_fn=self.pad_collate_fn,
drop_last=drop_last
)

@staticmethod
def pad_zero(sub_batch):
dims = [b.shape for b in sub_batch]

max_dims = list(dims[0])
for d_li in dims[1:]:
for d_idx in range(len(d_li)):
if max_dims[d_idx] < d_li[d_idx]:
max_dims[d_idx] = d_li[d_idx]

temp = np.zeros((len(sub_batch), *max_dims), dtype=sub_batch[0].dtype)
for i, b in enumerate(sub_batch):
if len(b.shape) == 1:
temp[i, :b.shape[0]] = b
elif len(b.shape) == 2:
temp[i, :b.shape[0], :b.shape[1]] = b
elif len(b.shape) == 3:
temp[i, :b.shape[0], :b.shape[1], :b.shape[2]] = b
else:
raise ValueError
return temp


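`PadDataLoader.pad_zero` above zero-pads a batch of variable-shape arrays up to the elementwise maximum shape. A compact sketch of the same idea follows; it uses a tuple-of-slices assignment instead of the original's per-rank branches, which is an equivalent but rank-generic formulation (the sample batch is invented for illustration):

```python
import numpy as np

def pad_zero(sub_batch):
    # Elementwise max shape across the batch (all arrays must share rank).
    dims = [b.shape for b in sub_batch]
    max_dims = [max(d[i] for d in dims) for i in range(len(dims[0]))]
    out = np.zeros((len(sub_batch), *max_dims), dtype=sub_batch[0].dtype)
    for i, b in enumerate(sub_batch):
        # Copy each array into the top-left corner of its padded slot.
        out[(i,) + tuple(slice(0, s) for s in b.shape)] = b
    return out

batch = [np.ones((2, 3)), np.ones((4, 2))]
padded = pad_zero(batch)
print(padded.shape)  # (2, 4, 3)
```

The collate function then hands these padded tensors to PyTorch's `DataLoader`, so every batch is rectangular regardless of per-sample sequence lengths.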
24 changes: 24 additions & 0 deletions CUDA-Optimized/FastSpeech/fastspeech/dataset/__init__.py
@@ -0,0 +1,24 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
