diff --git a/examples/distributed_inference/tensor_parallel_initialize_dist.py b/examples/distributed_inference/tensor_parallel_initialize_dist.py new file mode 100644 index 0000000000..a85f570e09 --- /dev/null +++ b/examples/distributed_inference/tensor_parallel_initialize_dist.py @@ -0,0 +1,69 @@ +import logging +import os +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union + +import numpy as np +import tensorrt as trt +import torch +import torch.distributed as dist +from torch.distributed._tensor.device_mesh import init_device_mesh + + +def find_repo_root(max_depth=10): + dir_path = os.path.dirname(os.path.realpath(__file__)) + for i in range(max_depth): + files = os.listdir(dir_path) + if "MODULE.bazel" in files: + return dir_path + else: + dir_path = os.path.dirname(dir_path) + + raise RuntimeError("Could not find repo root") + + +def initialize_logger(rank, logger_file_name): + logger = logging.getLogger() + logger.setLevel(logging.INFO) + fh = logging.FileHandler(logger_file_name + f"_{rank}.log", mode="w") + fh.setLevel(logging.INFO) + logger.addHandler(fh) + return logger + + +# This is required for env initialization since we use mpirun +def initialize_distributed_env(logger_file_name, rank=0, world_size=1, port=29500): + local_rank = int( + os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", rank % torch.cuda.device_count()) + ) + world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", world_size)) + + # Set up environment variable to run with mpirun + os.environ["RANK"] = str(local_rank) + os.environ["WORLD_SIZE"] = str(world_size) + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(port) + # Note this will not work in the initialization here + # You would need to set it externally as a user + os.environ["trtllm_env"] = ( + find_repo_root() + "/lib/libnvinfer_plugin_tensorrt_llm.so" + ) + + # Necessary to assign a device to each rank. + torch.cuda.set_device(local_rank) + + # We use nccl backend + dist.init_process_group("nccl") + + # set a manual seed for reproducibility + torch.manual_seed(1111) + + device_mesh = init_device_mesh(device_type="cuda", mesh_shape=(world_size,)) + rank = device_mesh.get_rank() + assert rank == local_rank + logger = initialize_logger(rank, logger_file_name) + device_id = ( + rank % torch.cuda.device_count() + ) # Ensure each rank gets a unique device + torch.cuda.set_device(device_id) + + return device_mesh, world_size, rank, logger diff --git a/examples/distributed_inference/tensor_parallel_llama3.py b/examples/distributed_inference/tensor_parallel_llama3.py index 0e4dd8b6bc..a0533b1488 100644 --- a/examples/distributed_inference/tensor_parallel_llama3.py +++ b/examples/distributed_inference/tensor_parallel_llama3.py @@ -7,7 +7,7 @@ import torch import torch_tensorrt from llama3_model import ModelArgs, ParallelTransformer -from tensor_parallel_nccl_ops import register_nccl_ops +from tensor_parallel_initialize_dist import initialize_distributed_env from torch.distributed._composable.fsdp import MixedPrecisionPolicy from torch.distributed._composable.fsdp.fully_shard import fully_shard from torch.distributed._tensor import Replicate, Shard @@ -15,7 +15,9 @@ checkpoint_wrapper, ) -device_mesh, _world_size, _rank, logger = register_nccl_ops("./tensor_parallel_llama3") +device_mesh, _world_size, _rank, logger = initialize_distributed_env( + "./tensor_parallel_llama3" +) logger.info(f"Starting PyTorch TP example on rank {_rank}.") assert ( diff --git a/examples/distributed_inference/tensor_parallel_nccl_ops.py b/examples/distributed_inference/tensor_parallel_nccl_ops.py deleted file mode 100644 index 26e03b70db..0000000000 --- a/examples/distributed_inference/tensor_parallel_nccl_ops.py +++ /dev/null @@ -1,197 +0,0 @@ -import ctypes -import logging -import os -import site -from enum import IntEnum, IntFlag, auto -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union - -import numpy as np -import tensorrt as trt -import tensorrt_llm -import torch -import torch.distributed as dist -import torch_tensorrt -from torch.distributed._tensor.device_mesh import init_device_mesh -from torch.fx import GraphModule, Node -from torch.fx.node import Argument, Target -from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.dynamo.conversion._ConverterRegistry import ( - dynamo_tensorrt_converter, -) -from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( - tensorrt_fused_nccl_all_gather_op, - tensorrt_fused_nccl_reduce_scatter_op, -) -from torch_tensorrt.dynamo.types import TRTTensor -from torch_tensorrt.fx.converters.converter_utils import set_layer_name - - -# class for AllReduce -class AllReduceStrategy(IntEnum): - """Warning: actual definition is in kernels/customAllReduceKernels.h. - - They must be kept in sync. - """ - - NCCL = 0 - ONESHOT = 1 - TWOSHOT = 2 - AUTO = 3 - - -class AllReduceConfig(IntFlag): - """Warning: actual definition is in kernels/customAllReduceKernels.h. - - They must be kept in sync - """ - - USE_MEMCPY = auto() - PUSH_MODE = auto() - - -def initialize_logger(rank, logger_file_name): - logger = logging.getLogger() - logger.setLevel(logging.INFO) - fh = logging.FileHandler(logger_file_name + f"_{rank}.log", mode="w") - fh.setLevel(logging.INFO) - logger.addHandler(fh) - return logger - - -# This is required for env initialization since we use mpirun -def initialize_distributed_env(rank=0, world_size=1, port=29500): - local_rank = int( - os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", rank % torch.cuda.device_count()) - ) - world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", world_size)) - - # Set up environment variable to run with mpirun - os.environ["RANK"] = str(local_rank) - os.environ["WORLD_SIZE"] = str(world_size) - os.environ["MASTER_ADDR"] = "127.0.0.1" - os.environ["MASTER_PORT"] = str(port) - - # Necessary to assign a device to each rank. - torch.cuda.set_device(local_rank) - - # We use nccl backend - dist.init_process_group("nccl") - - # set a manual seed for reproducibility - torch.manual_seed(1111) - - return local_rank, world_size - - -def register_nccl_ops(logger_file_name): - # Initialization - initialize_distributed_env() - # create a device mesh based on the given world_size. - _world_size = int(os.environ["WORLD_SIZE"]) - - device_mesh = init_device_mesh(device_type="cuda", mesh_shape=(_world_size,)) - _rank = device_mesh.get_rank() - logger = initialize_logger(_rank, logger_file_name) - device_id = ( - _rank % torch.cuda.device_count() - ) # Ensure each rank gets a unique device - torch.cuda.set_device(device_id) - - # TensorRT NCCL plugins - # Iterate over all registered plugin creators - plugin_registry = trt.get_plugin_registry() - for plugin_creator in plugin_registry.plugin_creator_list: - logger.info( - f"Plugin Name: {plugin_creator.name}, Namespace: {plugin_creator.plugin_namespace}, Version: {plugin_creator.plugin_version}" - ) - - @dynamo_tensorrt_converter(tensorrt_fused_nccl_all_gather_op) - def insert_nccl_gather_op( - ctx: ConversionContext, - target: Target, - args: Tuple[Argument, ...], - kwargs: Dict[str, Argument], - name: str, - ) -> Union[TRTTensor, Sequence[TRTTensor]]: - plug_inputs = [args[0]] - allgather_plg_creator = trt.get_plugin_registry().get_plugin_creator( - "AllGather", "1", "tensorrt_llm" - ) - assert allgather_plg_creator is not None - _world_size = os.environ.get("WORLD_SIZE") - if _world_size is not None: - _world_size = int(_world_size) - else: - raise RuntimeError( - f"The WORLD_SIZE env variable is not set in distributed environment" - ) - group = list(range(_world_size)) - group = trt.PluginField( - "group", np.array(group, dtype=np.int32), trt.PluginFieldType.INT32 - ) - p_dtype = trt.float32 - pf_type = trt.PluginField( - "type_id", np.array([int(p_dtype)], np.int32), trt.PluginFieldType.INT32 - ) - pfc = trt.PluginFieldCollection([group, pf_type]) - allgather = allgather_plg_creator.create_plugin("allgather", pfc) - layer = ctx.net.add_plugin_v2(plug_inputs, allgather) - set_layer_name(layer, target, name) - return layer.get_output(0) - - @dynamo_tensorrt_converter(tensorrt_fused_nccl_reduce_scatter_op) - def insert_nccl_reduce_scatter_plugin( - ctx: ConversionContext, - target: Target, - args: Tuple[Argument, ...], - kwargs: Dict[str, Argument], - name: str, - ) -> Union[TRTTensor, Sequence[TRTTensor]]: - plug_inputs = [args[0]] - allreduce_plg_creator = trt.get_plugin_registry().get_plugin_creator( - "ReduceScatter", "1", "tensorrt_llm" - ) - - assert allreduce_plg_creator is not None - - counter = 0 - strategy = AllReduceStrategy.NCCL - config = AllReduceConfig(0) - _world_size = os.environ.get("WORLD_SIZE") - if _world_size is not None: - _world_size = int(_world_size) - else: - raise RuntimeError( - f"The WORLD_SIZE env variable is not set in distributed environment" - ) - group = list(range(_world_size)) - group = trt.PluginField( - "group", np.array(group, dtype=np.int32), trt.PluginFieldType.INT32 - ) - - p_dtype = trt.float16 - pf_dtype = trt.PluginField( - "type_id", np.array([int(p_dtype)], np.int32), trt.PluginFieldType.INT32 - ) - pfc = [group, pf_dtype] - p_strategy = trt.PluginField( - "strategy", np.array([int(strategy)], np.int8), trt.PluginFieldType.INT8 - ) - pfc.append(p_strategy) - p_config = trt.PluginField( - "config", np.array([int(config)], np.int8), trt.PluginFieldType.INT8 - ) - pfc.append(p_config) - p_counter = trt.PluginField( - "counter", np.array([counter], np.int32), trt.PluginFieldType.INT32 - ) - pfc.append(p_counter) - - pfc = trt.PluginFieldCollection(pfc) - ar_plug = allreduce_plg_creator.create_plugin("allreduce", pfc) - - layer = ctx.net.add_plugin_v2(plug_inputs, ar_plug) - set_layer_name(layer, target, name) - return layer.get_output(0) - - return device_mesh, _world_size, _rank, logger diff --git a/examples/distributed_inference/tensor_parallel_simple_example.py b/examples/distributed_inference/tensor_parallel_simple_example.py index c8b93ac6ca..e397df8e5a 100755 --- a/examples/distributed_inference/tensor_parallel_simple_example.py +++ b/examples/distributed_inference/tensor_parallel_simple_example.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn import torch_tensorrt -from tensor_parallel_nccl_ops import register_nccl_ops +from tensor_parallel_initialize_dist import initialize_distributed_env from torch.distributed._tensor import Shard from torch.distributed.tensor.parallel import ( ColwiseParallel, @@ -13,7 +13,7 @@ parallelize_module, ) -device_mesh, _world_size, _rank, logger = register_nccl_ops( +device_mesh, _world_size, _rank, logger = initialize_distributed_env( "./tensor_parallel_simple_example" ) diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index ee7fb219aa..fb22cb7411 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -1,10 +1,13 @@ # mypy: disallow-untyped-decorators=False +import ctypes import logging import operator +import os from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union import numpy as np +import tensorrt as trt import torch from torch.fx.node import Argument, Node, Target from torch_tensorrt.dynamo._settings import CompilationSettings @@ -21,6 +24,10 @@ get_positive_dim, is_only_operator_on_placeholder, ) +from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( + tensorrt_fused_nccl_all_gather_op, + tensorrt_fused_nccl_reduce_scatter_op, +) from torch_tensorrt.dynamo.types import TRTTensor _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -3585,3 +3592,76 @@ def aten_ops_full( fill_value=args[1], dtype=kwargs.get("dtype", None), ) + + +try: + import tensorrt_llm as trt_llm +except (ImportError, AssertionError) as e_import_error: + _LOGGER.warning( + "TensorRT_LLM is not installed. Please install TensorRT_LLM or set trtllm_env", + e_import_error, + ) + # note this is for Linux only + plugin_lib_path = os.environ.get("trtllm_env") + if plugin_lib_path is None: + _LOGGER.warning( + "Please specify a valid path for trtllm_env libnvinfer_plugin_tensorrt_llm.so when using distributed examples in examples/distributed_inference" + ) + else: + _LOGGER.info(f"Plugin lib path found: {plugin_lib_path}") + try: + handle = ctypes.CDLL(plugin_lib_path) + _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}") + except OSError as e_os_error: + _LOGGER.error( + f"Failed to load the shared library at {plugin_lib_path}. " + f"Ensure the path is correct and the library is compatible.", + e_os_error, + ) + try: + handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + handle.initTrtLlmPlugins.restype = ctypes.c_bool + except AttributeError as e_plugin_unavailable: + _LOGGER.warning("TensorRT-LLM Plugin is unavailable") + try: + TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm" + assert handle.initTrtLlmPlugins( + None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8") + ) + except Exception as e_initialization_error: + _LOGGER.warning( + "Exception happened in initializing TensorRT-LLM plugins", e + ) + else: + + @dynamo_tensorrt_converter(tensorrt_fused_nccl_all_gather_op) + def insert_nccl_gather_op( + ctx: ConversionContext, + target: Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + name: str, + ) -> Union[TRTTensor, Sequence[TRTTensor]]: + return impl.nccl_ops.gather_op( + ctx, + target, + SourceIR.ATEN, + name, + [args[0]], + ) + + @dynamo_tensorrt_converter(tensorrt_fused_nccl_reduce_scatter_op) + def insert_nccl_reduce_scatter_plugin( + ctx: ConversionContext, + target: Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + name: str, + ) -> Union[TRTTensor, Sequence[TRTTensor]]: + return impl.nccl_ops.reduce_scatter_op( + ctx, + target, + SourceIR.ATEN, + name, + [args[0]], + ) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/__init__.py b/py/torch_tensorrt/dynamo/conversion/impl/__init__.py index c1187f0dd9..75f7492591 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/__init__.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/__init__.py @@ -13,6 +13,7 @@ full, grid, matmul, + nccl_ops, normalization, pad, permutation, diff --git a/py/torch_tensorrt/dynamo/conversion/impl/nccl_ops.py b/py/torch_tensorrt/dynamo/conversion/impl/nccl_ops.py new file mode 100644 index 0000000000..64b8cbfda5 --- /dev/null +++ b/py/torch_tensorrt/dynamo/conversion/impl/nccl_ops.py @@ -0,0 +1,120 @@ +import os +from enum import IntEnum, IntFlag, auto +from typing import Optional, Sequence, Union + +import numpy as np +import tensorrt as trt +import torch +from torch.fx.node import Target +from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext +from torch_tensorrt.fx.converters.converter_utils import SourceIR, set_layer_name + + +# class for AllReduce +class AllReduceStrategy(IntEnum): + """Warning: actual definition is in kernels/customAllReduceKernels.h. + + They must be kept in sync. + """ + + NCCL = 0 + ONESHOT = 1 + TWOSHOT = 2 + AUTO = 3 + + +class AllReduceConfig(IntFlag): + """Warning: actual definition is in kernels/customAllReduceKernels.h. + + They must be kept in sync + """ + + USE_MEMCPY = auto() + PUSH_MODE = auto() + + +def gather_op( + ctx: ConversionContext, + target: Union[Target, str], + source_ir: Optional[SourceIR], + name: str, + plug_inputs, +): + allgather_plg_creator = trt.get_plugin_registry().get_plugin_creator( + "AllGather", "1", "tensorrt_llm" + ) + assert allgather_plg_creator is not None + _world_size = os.environ.get("WORLD_SIZE") + if _world_size is not None: + _world_size = int(_world_size) + else: + raise RuntimeError( + f"The WORLD_SIZE env variable is not set in distributed environment" + ) + group = list(range(_world_size)) + group = trt.PluginField( + "group", np.array(group, dtype=np.int32), trt.PluginFieldType.INT32 + ) + p_dtype = trt.float32 + pf_type = trt.PluginField( + "type_id", np.array([int(p_dtype)], np.int32), trt.PluginFieldType.INT32 + ) + pfc = trt.PluginFieldCollection([group, pf_type]) + allgather = allgather_plg_creator.create_plugin("allgather", pfc) + layer = ctx.net.add_plugin_v2(plug_inputs, allgather) + set_layer_name(layer, target, name, source_ir) + return layer.get_output(0) + + +def reduce_scatter_op( + ctx: ConversionContext, + target: Union[Target, str], + source_ir: Optional[SourceIR], + name: str, + plug_inputs, +): + allreduce_plg_creator = trt.get_plugin_registry().get_plugin_creator( + "ReduceScatter", "1", "tensorrt_llm" + ) + + assert allreduce_plg_creator is not None + + counter = 0 + strategy = AllReduceStrategy.NCCL + config = AllReduceConfig(0) + _world_size = os.environ.get("WORLD_SIZE") + if _world_size is not None: + _world_size = int(_world_size) + else: + raise RuntimeError( + f"The WORLD_SIZE env variable is not set in distributed environment" + ) + group = list(range(_world_size)) + group = trt.PluginField( + "group", np.array(group, dtype=np.int32), trt.PluginFieldType.INT32 + ) + + p_dtype = trt.float16 + pf_dtype = trt.PluginField( + "type_id", np.array([int(p_dtype)], np.int32), trt.PluginFieldType.INT32 + ) + pfc = [group, pf_dtype] + p_strategy = trt.PluginField( + "strategy", np.array([int(strategy)], np.int8), trt.PluginFieldType.INT8 + ) + pfc.append(p_strategy) + p_config = trt.PluginField( + "config", np.array([int(config)], np.int8), trt.PluginFieldType.INT8 + ) + pfc.append(p_config) + p_counter = trt.PluginField( + "counter", np.array([counter], np.int32), trt.PluginFieldType.INT32 + ) + pfc.append(p_counter) + + pfc = trt.PluginFieldCollection(pfc) + ar_plug = allreduce_plg_creator.create_plugin("allreduce", pfc) + + layer = ctx.net.add_plugin_v2(plug_inputs, ar_plug) + set_layer_name(layer, target, name, source_ir) + return layer.get_output(0)