From 68ca2c89aa0718f3d1be4c8a0a95abbc6fc95fe4 Mon Sep 17 00:00:00 2001
From: cehongwang
Date: Wed, 25 Jun 2025 20:53:28 +0000
Subject: [PATCH 1/4] Removed debug flag usage

---
 py/torch_tensorrt/dynamo/_refit.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py
index 7cf19e870e..3a6410d38d 100644
--- a/py/torch_tensorrt/dynamo/_refit.py
+++ b/py/torch_tensorrt/dynamo/_refit.py
@@ -300,7 +300,7 @@ def refit_module_weights(

     # Check the number of supported operations in the graph
     num_supported_ops, total_ops = partitioning.get_graph_converter_support(
-        new_gm, settings.debug, settings.torch_executed_ops
+        new_gm, settings.torch_executed_ops
     )

     if num_supported_ops == 0 or (

From 647793c7413d70d37c01b714ee1ad1e4c50e07ef Mon Sep 17 00:00:00 2001
From: cehongwang
Date: Wed, 25 Jun 2025 23:43:46 +0000
Subject: [PATCH 2/4] Added warning and changed the test cases

---
 py/torch_tensorrt/dynamo/backend/backends.py |  6 +++++-
 tests/py/dynamo/models/test_models.py        | 22 +++++++++++---------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py
index dfdc9e1c69..16a061aea5 100644
--- a/py/torch_tensorrt/dynamo/backend/backends.py
+++ b/py/torch_tensorrt/dynamo/backend/backends.py
@@ -10,7 +10,6 @@
 from torch._dynamo.backends.common import aot_autograd
 from torch._dynamo.utils import detect_fake_mode
 from torch._functorch.aot_autograd import aot_export_joint_simple
-from torch.distributed.tensor import DTensor
 from torch_tensorrt.dynamo import CompilationSettings
 from torch_tensorrt.dynamo._compiler import compile_module
 from torch_tensorrt.dynamo.lowering import (
@@ -89,6 +88,11 @@ def aot_torch_tensorrt_aten_backend(
         logger.warning(
             "It is recommended to run the model with use_distributed_mode_trace = True since there are distributed tensors in the input which is not supported in aot_export_joint_simple"
         )
+
+    if settings.offload_module_to_cpu:
+        logger.warning(
+            "`offload_module_to_cpu` is not supported for `torch_compile` backend."
+        )

     return _pretraced_backend(gm, sample_inputs, settings, engine_cache)

diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py
index 359044a2b2..90d3cc637b 100644
--- a/tests/py/dynamo/models/test_models.py
+++ b/tests/py/dynamo/models/test_models.py
@@ -79,11 +79,12 @@ def test_resnet18_cpu_offload(ir):
     }

     trt_mod = torchtrt.compile(model, **compile_spec)
-    assertions.assertTrue(
-        get_model_device(model).type == "cpu",
-        msg="Model should be offloaded to CPU",
-    )
-    model.cuda()
+    if ir == "dynamo":
+        assertions.assertTrue(
+            get_model_device(model).type == "cpu",
+            msg="Model should be offloaded to CPU",
+        )
+        model.cuda()
     cos_sim = cosine_similarity(model(input), trt_mod(input))
     assertions.assertTrue(
         cos_sim > COSINE_THRESHOLD,
@@ -286,11 +287,12 @@ def test_bert_base_uncased_cpu_offload(ir):
         "offload_module_to_cpu": True,
     }
     trt_mod = torchtrt.compile(model, **compile_spec)
-    assertions.assertTrue(
-        get_model_device(model).type == "cpu",
-        msg="Model should be offloaded to CPU",
-    )
-    model.cuda()
+    if ir == "dynamo":
+        assertions.assertTrue(
+            get_model_device(model).type == "cpu",
+            msg="Model should be offloaded to CPU",
+        )
+        model.cuda()

     model_outputs = model(input, input2)
     trt_model_outputs = trt_mod(input, input2)

From 6bd6c805a754daffc1b30b0bc4ec86b11f9d1c81 Mon Sep 17 00:00:00 2001
From: cehongwang
Date: Thu, 26 Jun 2025 00:20:41 +0000
Subject: [PATCH 3/4] Changed the warning message and added another CPU offload
 test

---
 py/torch_tensorrt/dynamo/backend/backends.py |  2 +-
 tests/py/dynamo/models/test_export_serde.py  | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py
index 16a061aea5..c0d29c41f0 100644
--- a/py/torch_tensorrt/dynamo/backend/backends.py
+++ b/py/torch_tensorrt/dynamo/backend/backends.py
@@ -91,7 +91,7 @@ def aot_torch_tensorrt_aten_backend(

     if settings.offload_module_to_cpu:
         logger.warning(
-            "`offload_module_to_cpu` is not supported for `torch_compile` backend."
+            "The offload_module_to_cpu option is set, but it is being ignored since the torch_compile backend does not support this feature"
         )

     return _pretraced_backend(gm, sample_inputs, settings, engine_cache)
diff --git a/tests/py/dynamo/models/test_export_serde.py b/tests/py/dynamo/models/test_export_serde.py
index 6166e16949..5b8dd90d92 100644
--- a/tests/py/dynamo/models/test_export_serde.py
+++ b/tests/py/dynamo/models/test_export_serde.py
@@ -320,11 +320,12 @@ def test_resnet18_cpu_offload(ir):

     exp_program = torchtrt.dynamo.trace(model, **compile_spec)
     trt_module = torchtrt.dynamo.compile(exp_program, **compile_spec)
-    assertions.assertTrue(
-        get_model_device(model).type == "cpu",
-        msg="Model should be offloaded to CPU",
-    )
-    model.cuda()
+    if ir == "dynamo":
+        assertions.assertTrue(
+            get_model_device(model).type == "cpu",
+            msg="Model should be offloaded to CPU",
+        )
+        model.cuda()

     torchtrt.save(trt_module, trt_ep_path)
     deser_trt_module = torchtrt.load(trt_ep_path).module()

From 9fd8ddb2432e4129bb5521c6241fb7cec193abc0 Mon Sep 17 00:00:00 2001
From: cehongwang
Date: Fri, 27 Jun 2025 20:59:47 +0000
Subject: [PATCH 4/4] Delete CPU offload in refit

These calls are not needed.
---
 py/torch_tensorrt/dynamo/_refit.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py
index 3a6410d38d..4985788808 100644
--- a/py/torch_tensorrt/dynamo/_refit.py
+++ b/py/torch_tensorrt/dynamo/_refit.py
@@ -39,7 +39,6 @@
 from torch_tensorrt.dynamo.utils import (
     CPU_DEVICE,
     check_module_output,
-    deallocate_module,
     get_model_device,
     get_torch_inputs,
     to_torch_device,
@@ -363,7 +362,6 @@ def refit_module_weights(

     # Iterate over all components that can be accelerated
     # Generate the corresponding TRT Module for those
-    new_weight_module.module().to(CPU_DEVICE)
     for name, new_submodule in new_partitioned_module.named_children():
         # Refit each submodule
         # Extract engine from the submodule
@@ -466,7 +464,6 @@ def refit_module_weights(
                 settings=settings,
                 weight_name_map=None,
             )
-        deallocate_module(new_submodule)

         # clear EXCLUDE_WEIGHTS flag
         serialization_config = engine.create_serialization_config()
@@ -489,8 +486,6 @@ def refit_module_weights(
     gc.collect()
     torch.cuda.empty_cache()

-    deallocate_module(new_partitioned_module)
-
     if verify_output and arg_inputs is not None:
         new_gm.to(to_torch_device(settings.device))
         if check_module_output(
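
For reference, a minimal sketch of how the warning introduced in PATCH 2/4 (and reworded in PATCH 3/4) would surface on the torch_compile path. The toy model, input shape, and the "tensorrt" backend name are illustrative assumptions, with offload_module_to_cpu passed through torch.compile's options dict:

    import torch
    import torch_tensorrt  # noqa: F401  (registers the "tensorrt" dynamo backend)

    # Placeholder model and input; any CUDA-resident nn.Module works here.
    model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU()).cuda().eval()
    inputs = torch.randn(2, 16).cuda()

    # offload_module_to_cpu has no effect on this path, so the backend logs the
    # warning added in backends.py and compiles without offloading the module.
    compiled = torch.compile(
        model,
        backend="tensorrt",
        options={"offload_module_to_cpu": True},
    )
    out = compiled(inputs)

Under ir="dynamo" (the torchtrt.dynamo.compile path) the offload is honored, which is why the updated tests gate the CPU-device assertion on ir == "dynamo".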