Open source export and deploy modules (#8743)
* export and deploy modules

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Add export tests

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Address PR reviews

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Add try except

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Moved query_llm to nlp folder

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* removed lambada.json

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Reverting the Jenkinsfile

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Exclude deploy and export from the pip

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Address the CodeQL issues

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Addressing reviews

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* remove deploy test for now

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Addressing CodeQL comments

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* wrap imports with try except

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Add test data param and fix codeql issue

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

---------

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Eric Harper <complex451@gmail.com>
3 people authored Apr 6, 2024
1 parent 20c6a18 commit 97d1abb
Showing 39 changed files with 7,954 additions and 5 deletions.
8 changes: 4 additions & 4 deletions Jenkinsfile
@@ -2201,7 +2201,7 @@ pipeline {
}
}
}

stage('Punctuation & Capitalization tarred dataset') {
when {
anyOf {
@@ -2261,7 +2261,7 @@
}
}
}

stage('Punctuation & Capitalization, Different ways of passing labels to model') {
when {
anyOf {
@@ -5585,7 +5585,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results"
}
}

stage('L2: Megatron Mock Data Generation') {
when {
anyOf {
@@ -5815,4 +5815,4 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
cleanWs()
}
}
}
}
18 changes: 18 additions & 0 deletions nemo/deploy/__init__.py
@@ -0,0 +1,18 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from nemo.deploy.deploy_base import DeployBase
from nemo.deploy.deploy_pytriton import DeployPyTriton
from nemo.deploy.triton_deployable import ITritonDeployable
114 changes: 114 additions & 0 deletions nemo/deploy/deploy_base.py
@@ -0,0 +1,114 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import logging
from abc import ABC, abstractmethod

use_pytorch_lightning = True
try:
    from pytorch_lightning import Trainer
except Exception:
    use_pytorch_lightning = False

from nemo.deploy.triton_deployable import ITritonDeployable

use_nemo = True
try:
    from nemo.core.classes.modelPT import ModelPT
except Exception:
    use_nemo = False


LOGGER = logging.getLogger("NeMo")


class DeployBase(ABC):
    def __init__(
        self,
        triton_model_name: str,
        triton_model_version: int = 1,
        checkpoint_path: str = None,
        model=None,
        max_batch_size: int = 128,
        port: int = 8000,
        address="0.0.0.0",
        allow_grpc=True,
        allow_http=True,
        streaming=False,
        pytriton_log_verbose=0,
    ):
        self.checkpoint_path = checkpoint_path
        self.triton_model_name = triton_model_name
        self.triton_model_version = triton_model_version
        self.max_batch_size = max_batch_size
        self.model = model
        self.port = port
        self.address = address
        self.triton = None
        self.allow_grpc = allow_grpc
        self.allow_http = allow_http
        self.streaming = streaming
        self.pytriton_log_verbose = pytriton_log_verbose

        if checkpoint_path is None and model is None:
            raise Exception("Either checkpoint_path or model should be provided.")

    @abstractmethod
    def deploy(self):
        pass

    @abstractmethod
    def serve(self):
        pass

    @abstractmethod
    def run(self):
        pass

    @abstractmethod
    def stop(self):
        pass

    def _init_nemo_model(self):
        if self.checkpoint_path is not None:
            model_config = ModelPT.restore_from(self.checkpoint_path, return_config=True)
            module_path, class_name = DeployBase.get_module_and_class(model_config.target)
            cls = getattr(importlib.import_module(module_path), class_name)
            self.model = cls.restore_from(restore_path=self.checkpoint_path, trainer=Trainer())
            self.model.freeze()

            # activations_checkpoint_method has to be turned off for inference
            try:
                self.model.model.language_model.encoder.activations_checkpoint_method = None
            except AttributeError as e:
                LOGGER.warning(e)

        if self.model is None:
            raise Exception("There is no model to deploy.")

        self._is_model_deployable()

    def _is_model_deployable(self):
        if not issubclass(type(self.model), ITritonDeployable):
            raise Exception(
                "This model is not deployable to Triton. "
                "The nemo.deploy.ITritonDeployable class should be inherited."
            )
        else:
            return True

    @staticmethod
    def get_module_and_class(target: str):
        ln = target.rindex(".")
        return target[0:ln], target[ln + 1 : len(target)]
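DeployBase.get_module_and_class splits a fully qualified class path at its last dot so that importlib can resolve the class named in model_config.target. A minimal sketch of the same resolution, shown on a stdlib class so the snippet runs anywhere (the target value here is illustrative, not a NeMo model):

import importlib

# Same split that DeployBase.get_module_and_class performs on model_config.target.
target = "collections.abc.Mapping"
ln = target.rindex(".")
module_path, class_name = target[:ln], target[ln + 1 :]
cls = getattr(importlib.import_module(module_path), class_name)
assert cls.__name__ == "Mapping"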
184 changes: 184 additions & 0 deletions nemo/deploy/deploy_pytriton.py
@@ -0,0 +1,184 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


use_pytriton = True
try:
    from pytriton.model_config import ModelConfig
    from pytriton.triton import Triton, TritonConfig
except Exception:
    use_pytriton = False

from nemo.deploy.deploy_base import DeployBase


class DeployPyTriton(DeployBase):

    """
    Deploys any model that implements the ITritonDeployable interface in nemo.deploy
    to Triton Inference Server.

    Example:
        from nemo.deploy import DeployPyTriton, NemoQueryLLM
        from nemo.export import TensorRTLLM

        trt_llm_exporter = TensorRTLLM(model_dir="/path/for/model/files")
        trt_llm_exporter.export(
            nemo_checkpoint_path="/path/for/nemo/checkpoint",
            model_type="llama",
            n_gpus=1,
        )

        nm = DeployPyTriton(model=trt_llm_exporter, triton_model_name="model_name", port=8000)
        nm.deploy()
        nm.run()
        nq = NemoQueryLLM(url="localhost", model_name="model_name")

        prompts = ["hello, testing GPT inference", "another GPT inference test?"]
        output = nq.query_llm(prompts=prompts, max_output_len=100)
        print("prompts: ", prompts)
        print("")
        print("output: ", output)
        print("")

        prompts = ["Give me some info about Paris", "Do you think London is a good city to visit?", "What do you think about Rome?"]
        output = nq.query_llm(prompts=prompts, max_output_len=250)
        print("prompts: ", prompts)
        print("")
        print("output: ", output)
        print("")
    """

    def __init__(
        self,
        triton_model_name: str,
        triton_model_version: int = 1,
        checkpoint_path: str = None,
        model=None,
        max_batch_size: int = 128,
        port: int = 8000,
        address="0.0.0.0",
        allow_grpc=True,
        allow_http=True,
        streaming=False,
        pytriton_log_verbose=0,
    ):
        """
        A NeMo checkpoint or model is expected for serving on Triton Inference Server.

        Args:
            triton_model_name (str): name for the service.
            triton_model_version (int): version for the service.
            checkpoint_path (str): path of the .nemo file.
            model (ITritonDeployable): a model that implements the ITritonDeployable
                interface from nemo.deploy.
            max_batch_size (int): max batch size.
            port (int): port for the Triton server.
            address (str): HTTP address for the Triton server to bind to.
        """

        super().__init__(
            triton_model_name=triton_model_name,
            triton_model_version=triton_model_version,
            checkpoint_path=checkpoint_path,
            model=model,
            max_batch_size=max_batch_size,
            port=port,
            address=address,
            allow_grpc=allow_grpc,
            allow_http=allow_http,
            streaming=streaming,
            pytriton_log_verbose=pytriton_log_verbose,
        )

    def deploy(self):

        """
        Deploys the model to Triton Inference Server.
        """

        self._init_nemo_model()

        try:
            if self.streaming:
                # TODO: can't set allow_http=True due to a bug in pytriton, will fix in latest pytriton
                triton_config = TritonConfig(
                    log_verbose=self.pytriton_log_verbose,
                    allow_grpc=self.allow_grpc,
                    allow_http=self.allow_http,
                    grpc_address=self.address,
                )
                self.triton = Triton(config=triton_config)
                self.triton.bind(
                    model_name=self.triton_model_name,
                    model_version=self.triton_model_version,
                    infer_func=self.model.triton_infer_fn_streaming,
                    inputs=self.model.get_triton_input,
                    outputs=self.model.get_triton_output,
                    config=ModelConfig(decoupled=True),
                )
            else:
                triton_config = TritonConfig(
                    http_address=self.address,
                    http_port=self.port,
                    allow_grpc=self.allow_grpc,
                    allow_http=self.allow_http,
                )
                self.triton = Triton(config=triton_config)
                self.triton.bind(
                    model_name=self.triton_model_name,
                    model_version=self.triton_model_version,
                    infer_func=self.model.triton_infer_fn,
                    inputs=self.model.get_triton_input,
                    outputs=self.model.get_triton_output,
                    config=ModelConfig(max_batch_size=self.max_batch_size),
                )
        except Exception as e:
            self.triton = None
            print(e)

    def serve(self):

        """
        Starts serving the model and blocks while waiting for requests.
        """

        if self.triton is None:
            raise Exception("deploy should be called first.")

        try:
            self.triton.serve()
        except Exception as e:
            self.triton = None
            print(e)

    def run(self):

        """
        Starts serving the model asynchronously.
        """

        if self.triton is None:
            raise Exception("deploy should be called first.")

        self.triton.run()

    def stop(self):
        """
        Stops serving the model.
        """

        if self.triton is None:
            raise Exception("deploy should be called first.")

        self.triton.stop()
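The deploy method above binds whatever the model object exposes as get_triton_input, get_triton_output, and triton_infer_fn (or triton_infer_fn_streaming); the first two are accessed as attributes, so they are naturally written as properties. A minimal sketch of a deployable class, assuming pytriton's Tensor and batch APIs, with a hypothetical echo model for illustration:

import numpy as np

from pytriton.decorators import batch
from pytriton.model_config import Tensor

from nemo.deploy import ITritonDeployable


class EchoDeployable(ITritonDeployable):
    """Hypothetical deployable that echoes its prompts back; for illustration only."""

    @property
    def get_triton_input(self):
        return (Tensor(name="prompts", shape=(-1,), dtype=bytes),)

    @property
    def get_triton_output(self):
        return (Tensor(name="outputs", shape=(-1,), dtype=bytes),)

    @batch
    def triton_infer_fn(self, **inputs: np.ndarray):
        return {"outputs": inputs["prompts"]}

An instance of this class could then be passed as the model argument to DeployPyTriton, just as the TensorRTLLM exporter is in the class docstring above.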
20 changes: 20 additions & 0 deletions nemo/deploy/nlp/__init__.py
@@ -0,0 +1,20 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


use_query_llm = True
try:
    from nemo.deploy.nlp.query_llm import NemoQueryLLM
except Exception:
    use_query_llm = False
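The guarded import mirrors the use_pytriton and use_nemo flags in the modules above, so the package still imports when optional dependencies are missing. A sketch of how calling code might consume the flag (the fallback message is illustrative):

from nemo.deploy import nlp as deploy_nlp

if deploy_nlp.use_query_llm:
    nq = deploy_nlp.NemoQueryLLM(url="localhost", model_name="model_name")
else:
    raise RuntimeError("NemoQueryLLM is unavailable; install the NLP deploy dependencies.")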