-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Open source export and deploy modules (#8743)
* export and deploy modules Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> * Add export tests Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Address PR reviews Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> * Add try except Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> * Moved query_llm to nlp folder Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removed lambada.json Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> * Reverting the Jenkinsfile Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> * Exclude deploy and export from the pip Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> * Address the CodeQL issues Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Addressing reviews Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> * remove deploy test for now Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> * Addressing CodeQL comments Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> * wrap imports with try except Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> * Add test data param and fix codeql issue Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> --------- Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper <complex451@gmail.com>
- Loading branch information
1 parent
20c6a18
commit 97d1abb
Showing
39 changed files
with
7,954 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
from nemo.deploy.deploy_base import DeployBase | ||
from nemo.deploy.deploy_pytriton import DeployPyTriton | ||
from nemo.deploy.triton_deployable import ITritonDeployable |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import importlib | ||
import logging | ||
from abc import ABC, abstractmethod | ||
|
||
# Optional dependency: pytorch_lightning is only required when restoring a
# .nemo checkpoint (a Trainer instance is passed to restore_from). Record
# availability instead of failing at import time.
try:
    from pytorch_lightning import Trainer

    use_pytorch_lightning = True
except Exception:
    use_pytorch_lightning = False
|
||
from nemo.deploy.triton_deployable import ITritonDeployable | ||
|
||
# Optional dependency: ModelPT is only needed when a checkpoint path is
# supplied and the model has to be restored from a .nemo file.
try:
    from nemo.core.classes.modelPT import ModelPT

    use_nemo = True
except Exception:
    use_nemo = False


# Shared module logger, using the "NeMo" logger name like the rest of the codebase.
LOGGER = logging.getLogger("NeMo")
|
||
|
||
class DeployBase(ABC):
    """
    Abstract base class for deploying a NeMo model to Triton Inference Server.

    Concrete subclasses (e.g. DeployPyTriton) implement the deploy/serve/run/stop
    lifecycle. Either a .nemo checkpoint path or an already-constructed model
    object implementing ITritonDeployable must be supplied.
    """

    def __init__(
        self,
        triton_model_name: str,
        triton_model_version: int = 1,
        checkpoint_path: str = None,
        model=None,
        max_batch_size: int = 128,
        port: int = 8000,
        address="0.0.0.0",
        allow_grpc=True,
        allow_http=True,
        streaming=False,
        pytriton_log_verbose=0,
    ):
        """
        Args:
            triton_model_name (str): name the model is served under.
            triton_model_version (int): version for the service.
            checkpoint_path (str): path of a .nemo checkpoint to restore.
            model (ITritonDeployable): model instance, alternative to checkpoint_path.
            max_batch_size (int): max inference batch size.
            port (int): HTTP port for the Triton server.
            address (str): address for the Triton server to bind.
            allow_grpc (bool): enable the gRPC endpoint.
            allow_http (bool): enable the HTTP endpoint.
            streaming (bool): serve with a streaming (decoupled) model config.
            pytriton_log_verbose (int): pytriton log verbosity level.

        Raises:
            Exception: if neither checkpoint_path nor model is provided.
        """
        self.checkpoint_path = checkpoint_path
        self.triton_model_name = triton_model_name
        self.triton_model_version = triton_model_version
        self.max_batch_size = max_batch_size
        self.model = model
        self.port = port
        self.address = address
        self.triton = None  # set by subclasses in deploy()
        self.allow_grpc = allow_grpc
        self.allow_http = allow_http
        self.streaming = streaming
        self.pytriton_log_verbose = pytriton_log_verbose

        if checkpoint_path is None and model is None:
            raise Exception("Either checkpoint_path or model should be provided.")

    @abstractmethod
    def deploy(self):
        """Set up the Triton server and bind the model to it."""
        pass

    @abstractmethod
    def serve(self):
        """Serve the model, blocking until shutdown."""
        pass

    @abstractmethod
    def run(self):
        """Start serving the model asynchronously."""
        pass

    @abstractmethod
    def stop(self):
        """Stop serving the model."""
        pass

    def _init_nemo_model(self):
        """
        Restore the model from self.checkpoint_path (when given), then verify
        that a deployable model is available.

        Raises:
            Exception: if no model is available or it is not ITritonDeployable.
        """
        if self.checkpoint_path is not None:
            # Read only the config first to discover the concrete model class,
            # then import and restore that class from the same checkpoint.
            model_config = ModelPT.restore_from(self.checkpoint_path, return_config=True)
            module_path, class_name = DeployBase.get_module_and_class(model_config.target)
            cls = getattr(importlib.import_module(module_path), class_name)
            self.model = cls.restore_from(restore_path=self.checkpoint_path, trainer=Trainer())
            self.model.freeze()

            # has to turn off activations_checkpoint_method for inference
            try:
                self.model.model.language_model.encoder.activations_checkpoint_method = None
            except AttributeError as e:
                # Not every architecture exposes this attribute path; safe to skip.
                LOGGER.warning(e)

        if self.model is None:
            raise Exception("There is no model to deploy.")

        self._is_model_deployable()

    def _is_model_deployable(self):
        """Return True if the model implements ITritonDeployable, else raise."""
        # isinstance is the idiomatic equivalent of issubclass(type(x), C).
        if not isinstance(self.model, ITritonDeployable):
            # BUGFIX: the two message fragments were previously concatenated
            # without a separating space ("Triton.nemo.deploy...").
            raise Exception(
                "This model is not deployable to Triton. "
                "nemo.deploy.ITritonDeployable class should be inherited"
            )
        else:
            return True

    @staticmethod
    def get_module_and_class(target: str):
        """
        Split a fully qualified class path at its last dot.

        Args:
            target (str): e.g. "nemo.collections.nlp.models.SomeModel".

        Returns:
            tuple[str, str]: (module path, class name).

        Raises:
            ValueError: if target contains no dot (behavior of str.rindex).
        """
        ln = target.rindex(".")
        return target[:ln], target[ln + 1 :]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
# pytriton is an optional dependency; record availability so importers can
# check the `use_pytriton` flag rather than hitting an ImportError here.
try:
    from pytriton.model_config import ModelConfig
    from pytriton.triton import Triton, TritonConfig

    use_pytriton = True
except Exception:
    use_pytriton = False
|
||
from nemo.deploy.deploy_base import DeployBase | ||
|
||
|
||
class DeployPyTriton(DeployBase):

    """
    Deploys any models to Triton Inference Server that implements ITritonDeployable interface in nemo.deploy.

    Example:
        from nemo.deploy import DeployPyTriton, NemoQueryLLM
        from nemo.export import TensorRTLLM

        trt_llm_exporter = TensorRTLLM(model_dir="/path/for/model/files")
        trt_llm_exporter.export(
            nemo_checkpoint_path="/path/for/nemo/checkpoint",
            model_type="llama",
            n_gpus=1,
        )

        nm = DeployPyTriton(model=trt_llm_exporter, triton_model_name="model_name", port=8000)
        nm.deploy()
        nm.run()
        nq = NemoQueryLLM(url="localhost", model_name="model_name")

        prompts = ["hello, testing GPT inference", "another GPT inference test?"]
        output = nq.query_llm(prompts=prompts, max_output_len=100)
        print("prompts: ", prompts)
        print("")
        print("output: ", output)
        print("")

        prompts = ["Give me some info about Paris", "Do you think London is a good city to visit?", "What do you think about Rome?"]
        output = nq.query_llm(prompts=prompts, max_output_len=250)
        print("prompts: ", prompts)
        print("")
        print("output: ", output)
        print("")
    """

    def __init__(
        self,
        triton_model_name: str,
        triton_model_version: int = 1,
        checkpoint_path: str = None,
        model=None,
        max_batch_size: int = 128,
        port: int = 8000,
        address="0.0.0.0",
        allow_grpc=True,
        allow_http=True,
        streaming=False,
        pytriton_log_verbose=0,
    ):
        """
        A nemo checkpoint or model is expected for serving on Triton Inference Server.

        Args:
            triton_model_name (str): Name for the service
            triton_model_version(int): Version for the service
            checkpoint_path (str): path of the nemo file
            model (ITritonDeployable): A model that implements the ITritonDeployable from nemo.deploy import ITritonDeployable
            max_batch_size (int): max batch size
            port (int) : port for the Triton server
            address (str): http address for Triton server to bind.
            allow_grpc (bool): enable the gRPC endpoint
            allow_http (bool): enable the HTTP endpoint
            streaming (bool): bind the model in decoupled (streaming) mode
            pytriton_log_verbose (int): pytriton log verbosity level
        """

        super().__init__(
            triton_model_name=triton_model_name,
            triton_model_version=triton_model_version,
            checkpoint_path=checkpoint_path,
            model=model,
            max_batch_size=max_batch_size,
            port=port,
            address=address,
            allow_grpc=allow_grpc,
            allow_http=allow_http,
            streaming=streaming,
            pytriton_log_verbose=pytriton_log_verbose,
        )

    def deploy(self):
        """
        Deploys any models to Triton Inference Server.

        Restores the model if needed, then creates a Triton instance and binds
        the model in either streaming (decoupled) or batched mode. On failure,
        self.triton stays None and the error is printed (best-effort).
        """

        self._init_nemo_model()

        try:
            if self.streaming:
                self._bind_streaming_model()
            else:
                self._bind_batched_model()
        except Exception as err:
            self.triton = None
            print(err)

    def _bind_streaming_model(self):
        """Create a Triton server and bind the model with a decoupled config."""
        # TODO: can't set allow_http=True due to a bug in pytriton, will fix in latest pytriton
        server_config = TritonConfig(
            log_verbose=self.pytriton_log_verbose,
            allow_grpc=self.allow_grpc,
            allow_http=self.allow_http,
            grpc_address=self.address,
        )
        self.triton = Triton(config=server_config)
        self.triton.bind(
            model_name=self.triton_model_name,
            model_version=self.triton_model_version,
            infer_func=self.model.triton_infer_fn_streaming,
            inputs=self.model.get_triton_input,
            outputs=self.model.get_triton_output,
            config=ModelConfig(decoupled=True),
        )

    def _bind_batched_model(self):
        """Create a Triton server and bind the model with a batched config."""
        server_config = TritonConfig(
            http_address=self.address,
            http_port=self.port,
            allow_grpc=self.allow_grpc,
            allow_http=self.allow_http,
        )
        self.triton = Triton(config=server_config)
        self.triton.bind(
            model_name=self.triton_model_name,
            model_version=self.triton_model_version,
            infer_func=self.model.triton_infer_fn,
            inputs=self.model.get_triton_input,
            outputs=self.model.get_triton_output,
            config=ModelConfig(max_batch_size=self.max_batch_size),
        )

    def _require_deployed(self):
        """Raise unless deploy() has successfully created a Triton instance."""
        if self.triton is None:
            raise Exception("deploy should be called first.")

    def serve(self):
        """
        Starts serving the model and waits for the requests
        """

        self._require_deployed()

        try:
            self.triton.serve()
        except Exception as err:
            self.triton = None
            print(err)

    def run(self):
        """
        Starts serving the model asynchronously.
        """

        self._require_deployed()
        self.triton.run()

    def stop(self):
        """
        Stops serving the model.
        """

        self._require_deployed()
        self.triton.stop()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
# NemoQueryLLM pulls in optional client dependencies; expose `use_query_llm`
# as an availability flag instead of letting the import error propagate.
try:
    from nemo.deploy.nlp.query_llm import NemoQueryLLM

    use_query_llm = True
except Exception:
    use_query_llm = False
Oops, something went wrong.