From 3517483de852974570d6ca640c581c8299db067a Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 8 Mar 2025 19:57:37 -0500 Subject: [PATCH 01/13] detach and copy --- .../torch/base_fx_graph_translator.py | 3 + .../torch/exported_program_translator.py | 4 + .../relax/test_from_exported_to_cuda.py | 85 +++++++++++++++++++ .../test_frontend_from_exported_program.py | 45 ++++++++++ 4 files changed, 137 insertions(+) create mode 100644 tests/python/relax/test_from_exported_to_cuda.py diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py index 003ceebec6ff..bce6badb1e9f 100644 --- a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py +++ b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py @@ -958,6 +958,9 @@ def _transpose(self, node: fx.Node) -> relax.Var: ########## Creation ########## + def _detach(self, node: fx.Node) -> relax.Var: + return self.env[node.args[0]] + def _to_copy(self, node: fx.Node) -> relax.Var: import torch # type: ignore diff --git a/python/tvm/relax/frontend/torch/exported_program_translator.py b/python/tvm/relax/frontend/torch/exported_program_translator.py index c8d9d12505c6..af586fe5cea6 100644 --- a/python/tvm/relax/frontend/torch/exported_program_translator.py +++ b/python/tvm/relax/frontend/torch/exported_program_translator.py @@ -283,7 +283,11 @@ def create_convert_map( "view.default": self._reshape, "reshape.default": self._reshape, # tensor creation + "copy_.default": self._to_copy, "_to_copy.default": self._to_copy, + "lift_fresh_copy.default": self._to_copy, + "detach.default": self._detach, + "detach_.default": self._detach, "arange.start": self._arange, "clone.default": lambda node: self.env[node.args[0]], "empty.memory_format": self._empty, diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py new file mode 100644 index 000000000000..c5ba2f410e2a --- /dev/null +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# TODO remove +import sys +sys.path.append('/ssd1/htalendr/tvm/python') + +import tvm +from tvm import relax +import tvm.testing +import numpy as np +import torch +from torch import nn +from torch.export import export +from tvm.relax.frontend.torch import from_exported_program +from torch.nn import Softmax, Upsample + + +def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module): + """ + This util ensures that a torch module can successfully be exported to TVM + using torch.export and that the resuling IR program gives the same result + as PyTorch when ran on CUDA. 
+ """ + raw_data_for_tvm = raw_data.copy() # In case the data is modified + torch_data = torch.from_numpy(raw_data) + example_args = (torch_data,) + + with torch.no_grad(): + exported_program = export(torch_module, example_args) + mod_from_torch = from_exported_program( + exported_program, keep_params_as_input=True + ) + + tvm_mod, tvm_params = relax.frontend.detach_params(mod_from_torch) + target = tvm.target.Target.from_device(tvm.cuda()) + + ex = relax.build(tvm_mod, target=target, + relax_pipeline=relax.get_default_pipeline(target)) + dev = tvm.device("cuda", 0) + vm = relax.VirtualMachine(ex, dev) + + gpu_data = tvm.nd.array(raw_data_for_tvm, dev) + gpu_params = [tvm.nd.array(p, dev) for p in tvm_params["main"]] + gpu_out = vm["main"](gpu_data, *gpu_params) + + pytorch_out = torch_module(torch_data).detach().numpy() + actual = gpu_out[0].numpy() + desired = pytorch_out + np.testing.assert_allclose(actual=actual, desired=desired, rtol=1e-5, + atol=1e-5) + + +def test_detach(): + class DetachTester(nn.Module): + def forward(self, x): + detached = x.detach() + + # Test that detached shares same memory as x + x[0][0] = 42.0 + + return detached + + raw_data = np.ones((2,2)).astype(np.float32) + torch_module = DetachTester().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module) + +# TODO undo +test_detach() +# if __name__ == "__main__": +# tvm.testing.main() diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py index 8ca335c2fe7a..dc7f39d7755c 100644 --- a/tests/python/relax/test_frontend_from_exported_program.py +++ b/tests/python/relax/test_frontend_from_exported_program.py @@ -370,6 +370,51 @@ def main( # tril, triu test_tril_triu() +def test_detach(): + class DetachTester(nn.Module): + + def __init__(self): + super(DetachTester, self).__init__() + self.weight = nn.Parameter(torch.randn(3, 3)) + + def forward(self, x): + detached = x.detach() + + # Test that they share same memory + original_modified = x + 1.0 + + return detached + + class Softmax(Module): + def __init__(self): + super().__init__() + self.sm = torch.nn.Softmax(dim=1) + + def forward(self, input): + return self.sm(input) + + class Softmax2(Module): + def forward(self, input): + return torch.nn.functional.softmax(input, dim=1) + + @tvm.script.ir_module + class expected1: + @R.function + def main( + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): + # block 0 + with R.dataflow(): + lv: R.Tensor((1, 3, 10, 10), dtype="float32") = R.nn.softmax(input_1, axis=1) + gv: R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")) = (lv,) + R.output(gv) + return gv + + example_args = (torch.randn(1, 3, 10, 10, dtype=torch.float32),) + verify_model(Softmax(), example_args, {}, expected1) + verify_model(Softmax2(), example_args, {}, expected1) + + def test_hardtanh(): class Hardtanh(torch.nn.Module): From 57bc004967c15694c0ebce3483939b5a13ef34f2 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 8 Mar 2025 20:31:33 -0500 Subject: [PATCH 02/13] copy_ implemenation. 
Unit test passes --- .../torch/base_fx_graph_translator.py | 6 +++ .../torch/exported_program_translator.py | 2 +- .../relax/test_from_exported_to_cuda.py | 54 ++++++++++++++----- 3 files changed, 49 insertions(+), 13 deletions(-) diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py index bce6badb1e9f..2cc6a242145f 100644 --- a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py +++ b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py @@ -961,7 +961,13 @@ def _transpose(self, node: fx.Node) -> relax.Var: def _detach(self, node: fx.Node) -> relax.Var: return self.env[node.args[0]] + def _copy_(self, node: fx.Node) -> relax.Var: + # Copies the source tensor's to the destination tensor + # In TVM, that means simply returning the source tensor + return self.env[node.args[1]] + def _to_copy(self, node: fx.Node) -> relax.Var: + # Returns a copy of the input tensor import torch # type: ignore x = self.env[node.args[0]] diff --git a/python/tvm/relax/frontend/torch/exported_program_translator.py b/python/tvm/relax/frontend/torch/exported_program_translator.py index af586fe5cea6..7714aaecff2b 100644 --- a/python/tvm/relax/frontend/torch/exported_program_translator.py +++ b/python/tvm/relax/frontend/torch/exported_program_translator.py @@ -266,6 +266,7 @@ def create_convert_map( # tensor manipulation "cat.default": self._cat, "concat.default": self._cat, + "copy_.default": self._copy_, "cumsum.default": self._cumsum, "expand.default": self._expand, "permute.default": self._permute, @@ -283,7 +284,6 @@ def create_convert_map( "view.default": self._reshape, "reshape.default": self._reshape, # tensor creation - "copy_.default": self._to_copy, "_to_copy.default": self._to_copy, "lift_fresh_copy.default": self._to_copy, "detach.default": self._detach, diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index c5ba2f410e2a..b8856e9a3666 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -15,9 +15,8 @@ # specific language governing permissions and limitations # under the License. 
-# TODO remove import sys -sys.path.append('/ssd1/htalendr/tvm/python') +sys.path.append('/ssd1/htalendr/tvm/python') # Refer to local TVM build import tvm from tvm import relax @@ -64,22 +63,53 @@ def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module): np.testing.assert_allclose(actual=actual, desired=desired, rtol=1e-5, atol=1e-5) +def test_copy_(): + class CopyTester(nn.Module): + def __init__(self, size): + super().__init__() + # self.buffer = torch.zeros(size) + self.register_buffer("buffer", torch.zeros(size)) -def test_detach(): - class DetachTester(nn.Module): def forward(self, x): - detached = x.detach() + self.buffer.copy_(x) + + return x * 3 + self.buffer * 5 + + size = (2,2) + raw_data = np.random.rand(*size).astype(np.float32) + torch_module = CopyTester(size).eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module) - # Test that detached shares same memory as x - x[0][0] = 42.0 +def test_detach_no_change(): + """ Most of the time, in TVM, detach() should basically be identity""" + class DetachTester(nn.Module): + def forward(self, x): + detached = x.detach() return detached - + raw_data = np.ones((2,2)).astype(np.float32) torch_module = DetachTester().eval() assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module) -# TODO undo -test_detach() -# if __name__ == "__main__": -# tvm.testing.main() + +# TODO test below fails! Is there a way to implement detach such that the +# memory is shared with the input? +# def test_detach_with_change(): +# """ Testing that detach() shares memory with original tensor""" +# class DetachTester(nn.Module): +# def forward(self, x): +# detached = x.detach() + +# # Test that detached shares same memory as x +# x[0][0] = 42.0 + +# return detached + +# raw_data = np.ones((2,2)).astype(np.float32) +# torch_module = DetachTester().eval() +# assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module) + + +if __name__ == "__main__": + tvm.testing.main() From 6f184aa6d64ef05e76cc2d00479b937627f0ff70 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 8 Mar 2025 20:40:05 -0500 Subject: [PATCH 03/13] restore test_frontend --- .../test_frontend_from_exported_program.py | 45 ------------------- 1 file changed, 45 deletions(-) diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py index dc7f39d7755c..8ca335c2fe7a 100644 --- a/tests/python/relax/test_frontend_from_exported_program.py +++ b/tests/python/relax/test_frontend_from_exported_program.py @@ -370,51 +370,6 @@ def main( # tril, triu test_tril_triu() -def test_detach(): - class DetachTester(nn.Module): - - def __init__(self): - super(DetachTester, self).__init__() - self.weight = nn.Parameter(torch.randn(3, 3)) - - def forward(self, x): - detached = x.detach() - - # Test that they share same memory - original_modified = x + 1.0 - - return detached - - class Softmax(Module): - def __init__(self): - super().__init__() - self.sm = torch.nn.Softmax(dim=1) - - def forward(self, input): - return self.sm(input) - - class Softmax2(Module): - def forward(self, input): - return torch.nn.functional.softmax(input, dim=1) - - @tvm.script.ir_module - class expected1: - @R.function - def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") - ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): - # block 0 - with R.dataflow(): - lv: R.Tensor((1, 3, 10, 10), dtype="float32") = R.nn.softmax(input_1, axis=1) - gv: R.Tuple(R.Tensor((1, 3, 10, 10), 
dtype="float32")) = (lv,) - R.output(gv) - return gv - - example_args = (torch.randn(1, 3, 10, 10, dtype=torch.float32),) - verify_model(Softmax(), example_args, {}, expected1) - verify_model(Softmax2(), example_args, {}, expected1) - - def test_hardtanh(): class Hardtanh(torch.nn.Module): From 419285d74a3271ffc24a36d73b723e056544d0e1 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 8 Mar 2025 20:40:38 -0500 Subject: [PATCH 04/13] don't specify syspath --- tests/python/relax/test_from_exported_to_cuda.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index b8856e9a3666..682671b3b73b 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -15,9 +15,6 @@ # specific language governing permissions and limitations # under the License. -import sys -sys.path.append('/ssd1/htalendr/tvm/python') # Refer to local TVM build - import tvm from tvm import relax import tvm.testing From e51fe01b47de93262560f7a122f89eebac888896 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 8 Mar 2025 20:45:32 -0500 Subject: [PATCH 05/13] todo for _detach() --- python/tvm/relax/frontend/torch/base_fx_graph_translator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py index 2cc6a242145f..55bb120f0830 100644 --- a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py +++ b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py @@ -959,6 +959,10 @@ def _transpose(self, node: fx.Node) -> relax.Var: ########## Creation ########## def _detach(self, node: fx.Node) -> relax.Var: + # TODO found no way to correctly implement this. The output should + # share the same memory as the input. It is not the case right now. + # Ideally, this test would pass, but it doesn't : + # https://github.com/hugolatendresse/tvm/blob/456845811ba01c0bea07737a4f7a333a0b45ea92/tests/python/relax/test_from_exported_to_cuda.py#L98 return self.env[node.args[0]] def _copy_(self, node: fx.Node) -> relax.Var: From 6391d2486249c388a662c1ac89340c029b64bd99 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sun, 9 Mar 2025 17:38:47 -0400 Subject: [PATCH 06/13] Black formatter --- .../torch/base_fx_graph_translator.py | 10 +- .../relax/test_from_exported_to_cuda.py | 32 ++--- .../test_frontend_from_exported_program.py | 136 ++++++++---------- 3 files changed, 83 insertions(+), 95 deletions(-) diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py index 55bb120f0830..bda71f3cf877 100644 --- a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py +++ b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py @@ -37,9 +37,9 @@ def __init__(self) -> None: self.env: Dict[fx.Node, relax.Expr] = {} self.params: Dict[torch.Tensor, relax.Expr] = {} self.block_builder: relax.BlockBuilder = None - self.convert_map: Dict[ - Union[torch.nn.Module, str], Callable[[fx.Node], relax.Var] - ] = self.create_convert_map() + self.convert_map: Dict[Union[torch.nn.Module, str], Callable[[fx.Node], relax.Var]] = ( + self.create_convert_map() + ) ########## Utilities ########## @@ -959,8 +959,8 @@ def _transpose(self, node: fx.Node) -> relax.Var: ########## Creation ########## def _detach(self, node: fx.Node) -> relax.Var: - # TODO found no way to correctly implement this. 
The output should - # share the same memory as the input. It is not the case right now. + # TODO found no way to correctly implement this. The output should + # share the same memory as the input. It is not the case right now. # Ideally, this test would pass, but it doesn't : # https://github.com/hugolatendresse/tvm/blob/456845811ba01c0bea07737a4f7a333a0b45ea92/tests/python/relax/test_from_exported_to_cuda.py#L98 return self.env[node.args[0]] diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index 682671b3b73b..a0f3163d08a9 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -3,7 +3,7 @@ # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance +# "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 @@ -28,25 +28,22 @@ def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module): """ - This util ensures that a torch module can successfully be exported to TVM - using torch.export and that the resuling IR program gives the same result + This util ensures that a torch module can successfully be exported to TVM + using torch.export and that the resuling IR program gives the same result as PyTorch when ran on CUDA. """ - raw_data_for_tvm = raw_data.copy() # In case the data is modified + raw_data_for_tvm = raw_data.copy() # In case the data is modified torch_data = torch.from_numpy(raw_data) example_args = (torch_data,) with torch.no_grad(): exported_program = export(torch_module, example_args) - mod_from_torch = from_exported_program( - exported_program, keep_params_as_input=True - ) + mod_from_torch = from_exported_program(exported_program, keep_params_as_input=True) tvm_mod, tvm_params = relax.frontend.detach_params(mod_from_torch) target = tvm.target.Target.from_device(tvm.cuda()) - ex = relax.build(tvm_mod, target=target, - relax_pipeline=relax.get_default_pipeline(target)) + ex = relax.build(tvm_mod, target=target, relax_pipeline=relax.get_default_pipeline(target)) dev = tvm.device("cuda", 0) vm = relax.VirtualMachine(ex, dev) @@ -57,8 +54,8 @@ def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module): pytorch_out = torch_module(torch_data).detach().numpy() actual = gpu_out[0].numpy() desired = pytorch_out - np.testing.assert_allclose(actual=actual, desired=desired, rtol=1e-5, - atol=1e-5) + np.testing.assert_allclose(actual=actual, desired=desired, rtol=1e-5, atol=1e-5) + def test_copy_(): class CopyTester(nn.Module): @@ -69,28 +66,29 @@ def __init__(self, size): def forward(self, x): self.buffer.copy_(x) - + return x * 3 + self.buffer * 5 - size = (2,2) + size = (2, 2) raw_data = np.random.rand(*size).astype(np.float32) torch_module = CopyTester(size).eval() assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module) def test_detach_no_change(): - """ Most of the time, in TVM, detach() should basically be identity""" + """Most of the time, in TVM, detach() should basically be identity""" + class DetachTester(nn.Module): def forward(self, x): detached = x.detach() return detached - raw_data = np.ones((2,2)).astype(np.float32) + raw_data = np.ones((2, 2)).astype(np.float32) torch_module = DetachTester().eval() 
assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module) -# TODO test below fails! Is there a way to implement detach such that the +# TODO test below fails! Is there a way to implement detach such that the # memory is shared with the input? # def test_detach_with_change(): # """ Testing that detach() shares memory with original tensor""" @@ -99,7 +97,7 @@ def forward(self, x): # detached = x.detach() # # Test that detached shares same memory as x -# x[0][0] = 42.0 +# x[0][0] = 42.0 # return detached diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py index 8ca335c2fe7a..77aac527bc06 100644 --- a/tests/python/relax/test_frontend_from_exported_program.py +++ b/tests/python/relax/test_frontend_from_exported_program.py @@ -82,7 +82,7 @@ def forward(self, input): class expected: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): with R.dataflow(): lv: R.Tensor((1, 3, 10, 10), dtype="float32") = relax_op(input_1) @@ -112,7 +112,7 @@ def forward(self, input): class expected: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="bool")): with R.dataflow(): lv: R.Tensor((1, 3, 10, 10), dtype="bool") = relax_op(input_1) @@ -135,7 +135,7 @@ def forward(self, input): class expected_clamp: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -163,7 +163,7 @@ def forward(self, input): class expected_dropout: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -191,7 +191,7 @@ def forward(self, input): class expected_gelu: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -220,7 +220,7 @@ def forward(self, input): class expected_hardsigmoid: @R.function def main( - inp_0: R.Tensor((1, 3, 10, 10), dtype="float32") + inp_0: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): with R.dataflow(): lv: R.Tensor((1, 3, 10, 10), dtype="float32") = R.add(inp_0, R.const(3, "float32")) @@ -252,7 +252,7 @@ def forward(self, input): class expected1: @R.function def main( - inp_0: R.Tensor((1, 3, 10, 10), dtype="float32") + inp_0: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): with R.dataflow(): lv: R.Tensor((1, 3, 10, 10), dtype="float32") = R.add(inp_0, R.const(3, "float32")) @@ -294,7 +294,7 @@ def forward(self, input): class expected_relu: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -323,7 +323,7 @@ def forward(self, input): class expected_sigmoid: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), 
dtype="float32")): # block 0 with R.dataflow(): @@ -352,7 +352,7 @@ def forward(self, input): class expected_silu: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -388,7 +388,7 @@ def forward(self, input): class expected1: @R.function def main( - inp_0: R.Tensor((1, 3, 10, 10), dtype="float32") + inp_0: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): with R.dataflow(): lv: R.Tensor((1, 3, 10, 10), dtype="float32") = R.clip( @@ -425,7 +425,7 @@ def forward(self, input): class expected: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -456,7 +456,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -487,7 +487,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -512,7 +512,7 @@ def forward(self, input): class expected_tril: @R.function def main( - input_1: R.Tensor((10, 10), dtype="float32") + input_1: R.Tensor((10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -531,7 +531,7 @@ def forward(self, input): class expected_triu: @R.function def main( - input_1: R.Tensor((10, 10), dtype="float32") + input_1: R.Tensor((10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -795,7 +795,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -883,7 +883,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -1580,7 +1580,7 @@ def forward(self, x, y): class Expected1: @R.function def main( - inp_0: R.Tensor((4, 4), dtype="float32") + inp_0: R.Tensor((4, 4), dtype="float32"), ) -> R.Tuple(R.Tensor((), dtype="float32")): with R.dataflow(): lv: R.Tensor((), dtype="float32") = R.einsum((inp_0,), subscripts="ii") @@ -1827,7 +1827,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -1856,7 +1856,7 @@ def forward(self, input): class expected2: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 4, 4), dtype="float32")): # block 0 with R.dataflow(): @@ -1885,7 +1885,7 @@ def forward(self, input): class expected3: @R.function def main( - input_1: R.Tensor((1, 3, 10, 
10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 6, 6), dtype="float32")): # block 0 with R.dataflow(): @@ -2007,9 +2007,7 @@ def forward(self, data): @tvm.script.ir_module class expected1: @R.function - def main( - input_1: R.Tensor((3, 3, 10, 10), dtype="float32") - ) -> R.Tuple( + def main(input_1: R.Tensor((3, 3, 10, 10), dtype="float32")) -> R.Tuple( R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), @@ -2051,9 +2049,7 @@ def forward(self, data): @tvm.script.ir_module class expected2: @R.function - def main( - input_1: R.Tensor((3, 3, 10, 10), dtype="float32") - ) -> R.Tuple( + def main(input_1: R.Tensor((3, 3, 10, 10), dtype="float32")) -> R.Tuple( R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), @@ -2102,7 +2098,7 @@ def forward(self, input): class expected_bilinear: @R.function def main( - input: R.Tensor((1, 3, 112, 112), dtype="float32") + input: R.Tensor((1, 3, 112, 112), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 224, 224), dtype="float32")): # block 0 with R.dataflow(): @@ -2131,7 +2127,7 @@ def forward(self, input): class expected_nearest: @R.function def main( - input: R.Tensor((1, 3, 112, 112), dtype="float32") + input: R.Tensor((1, 3, 112, 112), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 224, 224), dtype="float32")): # block 0 with R.dataflow(): @@ -2170,7 +2166,7 @@ def forward(self, input: torch.Tensor): class Expected1: @R.function def main( - inp_0: R.Tensor((256, 256), dtype="float32") + inp_0: R.Tensor((256, 256), dtype="float32"), ) -> R.Tuple(R.Tensor((256,), dtype="float32")): with R.dataflow(): lv: R.Tensor((256,), dtype="float32") = R.mean(inp_0, axis=[-1], keepdims=False) @@ -2182,7 +2178,7 @@ def main( class Expected2: @R.function def main( - inp_0: R.Tensor((256, 256), dtype="float32") + inp_0: R.Tensor((256, 256), dtype="float32"), ) -> R.Tuple(R.Tensor((256, 1), dtype="float32")): with R.dataflow(): lv: R.Tensor((256, 1), dtype="float32") = R.mean(inp_0, axis=[-1], keepdims=True) @@ -2204,7 +2200,7 @@ def forward(self, x): class expected1: @R.function def main( - inp_0: R.Tensor((1, 2, 3, 4), dtype="float32") + inp_0: R.Tensor((1, 2, 3, 4), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 4), dtype="float32")): # block 0 with R.dataflow(): @@ -2238,7 +2234,7 @@ def forward(self, input): class expected_argmax1: @R.function def main( - inp_0: R.Tensor((256, 256), dtype="float32") + inp_0: R.Tensor((256, 256), dtype="float32"), ) -> R.Tuple(R.Tensor((256,), dtype="int64")): with R.dataflow(): lv: R.Tensor((256,), dtype="int64") = R.argmax(inp_0, axis=-1, keepdims=False) @@ -2250,7 +2246,7 @@ def main( class expected_argmax2: @R.function def main( - inp_0: R.Tensor((256, 256), dtype="float32") + inp_0: R.Tensor((256, 256), dtype="float32"), ) -> R.Tuple(R.Tensor((256, 1), dtype="int64")): with R.dataflow(): lv: R.Tensor((256, 1), dtype="int64") = R.argmax(inp_0, axis=-1, keepdims=True) @@ -2279,7 +2275,7 @@ def forward(self, input): class expected_argmin1: @R.function def main( - inp_0: R.Tensor((256, 256), dtype="float32") + inp_0: R.Tensor((256, 256), dtype="float32"), ) -> R.Tuple(R.Tensor((), dtype="int64")): with R.dataflow(): lv: R.Tensor((), dtype="int64") = R.argmin(inp_0, axis=None, keepdims=False) @@ -2291,7 +2287,7 @@ def main( class expected_argmin2: @R.function def main( - inp_0: R.Tensor((256, 256), dtype="float32") + inp_0: R.Tensor((256, 256), 
dtype="float32"), ) -> R.Tuple(R.Tensor((1, 1), dtype="int64")): with R.dataflow(): lv: R.Tensor((1, 1), dtype="int64") = R.argmin(inp_0, axis=None, keepdims=True) @@ -2362,7 +2358,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 2, 3, 4), dtype="float32") + input_1: R.Tensor((1, 2, 3, 4), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="int32")): # block 0 with R.dataflow(): @@ -2388,7 +2384,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32") + x: R.Tensor((1, 2, 3, 4), dtype="float32"), ) -> R.Tuple(R.Tensor((4, 2, 3, 4), dtype="float32")): # block 0 with R.dataflow(): @@ -2419,7 +2415,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 100), dtype="float32")): # block 0 with R.dataflow(): @@ -2445,7 +2441,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32") + x: R.Tensor((1, 2, 3, 4), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 4, 3, 2), dtype="float32")): # block 0 with R.dataflow(): @@ -2483,7 +2479,7 @@ def main(x: R.Tensor((3,), dtype="float32")) -> R.Tuple(R.Tensor((6,), dtype="fl class expected2: @R.function def main( - x: R.Tensor((1, 3), dtype="float32") + x: R.Tensor((1, 3), dtype="float32"), ) -> R.Tuple(R.Tensor((4, 6), dtype="float32")): # block 0 with R.dataflow(): @@ -2511,7 +2507,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32") + x: R.Tensor((1, 2, 3, 4), dtype="float32"), ) -> R.Tuple(R.Tensor((2, 12), dtype="float32")): # block 0 with R.dataflow(): @@ -2533,7 +2529,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 3, 10, 10), dtype="float32") + x: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 10, 3), dtype="float32")): # block 0 with R.dataflow(): @@ -2574,7 +2570,7 @@ def forward(self, x): class expected2: @R.function def main( - x: R.Tensor((8, 16), dtype="float32") + x: R.Tensor((8, 16), dtype="float32"), ) -> R.Tuple(R.Tensor((8, 1, 1, 16, 1), dtype="float32")): with R.dataflow(): lv: R.Tensor((8, 16), dtype="float32") = R.strided_slice( @@ -2619,9 +2615,7 @@ def forward(self, input): @tvm.script.ir_module class Expected: @R.function - def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") - ) -> R.Tuple( + def main(input_1: R.Tensor((1, 3, 10, 10), dtype="float32")) -> R.Tuple( R.Tensor((1, 1, 10, 10), dtype="float32"), R.Tensor((1, 1, 10, 10), dtype="float32"), R.Tensor((1, 1, 10, 10), dtype="float32"), @@ -2651,9 +2645,7 @@ def forward(self, data): @tvm.script.ir_module class expected1: @R.function - def main( - input_1: R.Tensor((3, 3, 10, 10), dtype="float32") - ) -> R.Tuple( + def main(input_1: R.Tensor((3, 3, 10, 10), dtype="float32")) -> R.Tuple( R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), @@ -2695,9 +2687,7 @@ def forward(self, data): @tvm.script.ir_module class expected2: @R.function - def main( - input_1: R.Tensor((3, 3, 10, 10), dtype="float32") - ) -> R.Tuple( + def main(input_1: R.Tensor((3, 3, 10, 10), dtype="float32")) -> R.Tuple( R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), @@ -2749,7 +2739,7 @@ def forward(self, input): class Expected1: @R.function def 
main( - inp_0: R.Tensor((3, 1, 4, 1), dtype="float32") + inp_0: R.Tensor((3, 1, 4, 1), dtype="float32"), ) -> R.Tuple(R.Tensor((3, 4, 1), dtype="float32")): with R.dataflow(): lv: R.Tensor((3, 4, 1), dtype="float32") = R.squeeze(inp_0, axis=[1]) @@ -2765,7 +2755,7 @@ def forward(self, input): class Expected2: @R.function def main( - inp_0: R.Tensor((3, 1, 4, 1), dtype="float32") + inp_0: R.Tensor((3, 1, 4, 1), dtype="float32"), ) -> R.Tuple(R.Tensor((3, 4), dtype="float32")): with R.dataflow(): lv: R.Tensor((3, 4), dtype="float32") = R.squeeze(inp_0, axis=None) @@ -2796,7 +2786,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 3), dtype="float32") + x: R.Tensor((1, 3), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 6), dtype="float32")): # block 0 with R.dataflow(): @@ -2809,7 +2799,7 @@ def main( class expected2: @R.function def main( - x: R.Tensor((1, 3), dtype="float32") + x: R.Tensor((1, 3), dtype="float32"), ) -> R.Tuple(R.Tensor((4, 6), dtype="float32")): # block 0 with R.dataflow(): @@ -2833,7 +2823,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32") + x: R.Tensor((1, 2, 3, 4), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 4, 3, 2), dtype="float32")): # block 0 with R.dataflow(): @@ -2855,7 +2845,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -2872,7 +2862,7 @@ def forward(self, input): class expected2: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10, 1), dtype="float32")): # block 0 with R.dataflow(): @@ -2896,7 +2886,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32") + x: R.Tensor((1, 2, 3, 4), dtype="float32"), ) -> R.Tuple(R.Tensor((2, 12), dtype="float32")): # block 0 with R.dataflow(): @@ -2918,7 +2908,7 @@ def forward(self, input): class Expected: @R.function def main( - input: R.Tensor((10, 10), dtype="float32") + input: R.Tensor((10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((20,), dtype="int32")): with R.dataflow(): lv: R.Tensor((20,), dtype="int32") = R.arange(0, 20, 1, dtype="int32") @@ -2939,7 +2929,7 @@ def forward(self, input): class Expected: @R.function def main( - input: R.Tensor((10, 10), dtype="float32") + input: R.Tensor((10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")): with R.dataflow(): gv: R.Tuple(R.Tensor((10, 10), dtype="float32")) = (input,) @@ -2959,7 +2949,7 @@ def forward(self, input): class Expected: @R.function def main( - inp_0: R.Tensor((10, 10), dtype="float32") + inp_0: R.Tensor((10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")): with R.dataflow(): lv: R.Tensor((10, 10), dtype="float32") = R.zeros( @@ -2982,7 +2972,7 @@ def forward(self, input: torch.Tensor): class Expected: @R.function def main( - inp_0: R.Tensor((10, 10), dtype="float32") + inp_0: R.Tensor((10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")): with R.dataflow(): lv: R.Tensor((10, 10), dtype="float32") = R.full( @@ -3005,7 +2995,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 2, 3), dtype="float32") + x: R.Tensor((1, 2, 3), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 2, 3), 
dtype="float32")): # block 0 with R.dataflow(): @@ -3034,7 +3024,7 @@ def forward(self, x): class expected_float: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32") + x: R.Tensor((1, 2, 3, 4), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float32")): # block 0 with R.dataflow(): @@ -3052,7 +3042,7 @@ def forward(self, x): class expected_half: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32") + x: R.Tensor((1, 2, 3, 4), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float16")): # block 0 with R.dataflow(): @@ -3070,7 +3060,7 @@ def forward(self, x): class expected_type: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32") + x: R.Tensor((1, 2, 3, 4), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float32")): # block 0 with R.dataflow(): @@ -3086,7 +3076,7 @@ def forward(self, input): class expected_to1: @R.function def main( - inp_0: R.Tensor((1, 2, 3, 4), dtype="float32") + inp_0: R.Tensor((1, 2, 3, 4), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float16")): with R.dataflow(): lv: R.Tensor((1, 2, 3, 4), dtype="float16") = R.astype(inp_0, dtype="float16") @@ -3102,7 +3092,7 @@ def forward(self, input): class expected_to2: @R.function def main( - inp_0: R.Tensor((1, 2, 3, 4), dtype="float32") + inp_0: R.Tensor((1, 2, 3, 4), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float32")): with R.dataflow(): lv: R.Tensor((1, 2, 3, 4), dtype="float32") = R.astype(inp_0, dtype="float32") @@ -3187,7 +3177,7 @@ def forward(self, x): class Expected: @R.function def main( - inp_0: R.Tensor((256, 256), dtype="float32") + inp_0: R.Tensor((256, 256), dtype="float32"), ) -> R.Tensor((256, 256), dtype="float32"): with R.dataflow(): gv: R.Tensor((256, 256), dtype="float32") = inp_0 From 32527bcba58848577dcdc074b8a4746ae5c4b78c Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 10 Mar 2025 03:26:37 -0400 Subject: [PATCH 07/13] black formatting with version 22.12.0 --- .../torch/base_fx_graph_translator.py | 6 +++--- .../test_frontend_from_exported_program.py | 20 ++++++++++++++----- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py index bda71f3cf877..585d7d2b7a38 100644 --- a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py +++ b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py @@ -37,9 +37,9 @@ def __init__(self) -> None: self.env: Dict[fx.Node, relax.Expr] = {} self.params: Dict[torch.Tensor, relax.Expr] = {} self.block_builder: relax.BlockBuilder = None - self.convert_map: Dict[Union[torch.nn.Module, str], Callable[[fx.Node], relax.Var]] = ( - self.create_convert_map() - ) + self.convert_map: Dict[ + Union[torch.nn.Module, str], Callable[[fx.Node], relax.Var] + ] = self.create_convert_map() ########## Utilities ########## diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py index 77aac527bc06..399739146359 100644 --- a/tests/python/relax/test_frontend_from_exported_program.py +++ b/tests/python/relax/test_frontend_from_exported_program.py @@ -2007,7 +2007,9 @@ def forward(self, data): @tvm.script.ir_module class expected1: @R.function - def main(input_1: R.Tensor((3, 3, 10, 10), dtype="float32")) -> R.Tuple( + def main( + input_1: R.Tensor((3, 3, 10, 10), dtype="float32") + ) -> R.Tuple( R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), 
dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), @@ -2049,7 +2051,9 @@ def forward(self, data): @tvm.script.ir_module class expected2: @R.function - def main(input_1: R.Tensor((3, 3, 10, 10), dtype="float32")) -> R.Tuple( + def main( + input_1: R.Tensor((3, 3, 10, 10), dtype="float32") + ) -> R.Tuple( R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), @@ -2615,7 +2619,9 @@ def forward(self, input): @tvm.script.ir_module class Expected: @R.function - def main(input_1: R.Tensor((1, 3, 10, 10), dtype="float32")) -> R.Tuple( + def main( + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + ) -> R.Tuple( R.Tensor((1, 1, 10, 10), dtype="float32"), R.Tensor((1, 1, 10, 10), dtype="float32"), R.Tensor((1, 1, 10, 10), dtype="float32"), @@ -2645,7 +2651,9 @@ def forward(self, data): @tvm.script.ir_module class expected1: @R.function - def main(input_1: R.Tensor((3, 3, 10, 10), dtype="float32")) -> R.Tuple( + def main( + input_1: R.Tensor((3, 3, 10, 10), dtype="float32") + ) -> R.Tuple( R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), @@ -2687,7 +2695,9 @@ def forward(self, data): @tvm.script.ir_module class expected2: @R.function - def main(input_1: R.Tensor((3, 3, 10, 10), dtype="float32")) -> R.Tuple( + def main( + input_1: R.Tensor((3, 3, 10, 10), dtype="float32") + ) -> R.Tuple( R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), R.Tensor((3, 10, 10), dtype="float32"), From e0e5620f45efdea6c7a6ce2d535f4f62e88fa566 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 10 Mar 2025 04:05:16 -0400 Subject: [PATCH 08/13] cleanup unit tests and ran Black Formatter with version 22 --- .../relax/test_from_exported_to_cuda.py | 36 +++++-------------- 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index a0f3163d08a9..93091fdcedb4 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -26,7 +26,7 @@ from torch.nn import Softmax, Upsample -def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module): +def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev): """ This util ensures that a torch module can successfully be exported to TVM using torch.export and that the resuling IR program gives the same result @@ -41,10 +41,9 @@ def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module): mod_from_torch = from_exported_program(exported_program, keep_params_as_input=True) tvm_mod, tvm_params = relax.frontend.detach_params(mod_from_torch) - target = tvm.target.Target.from_device(tvm.cuda()) - ex = relax.build(tvm_mod, target=target, relax_pipeline=relax.get_default_pipeline(target)) - dev = tvm.device("cuda", 0) + relax_pipeline = relax.get_default_pipeline(tvm.target.Target.from_device(tvm.cuda())) + ex = relax.build(tvm_mod, target=target, relax_pipeline=relax_pipeline) vm = relax.VirtualMachine(ex, dev) gpu_data = tvm.nd.array(raw_data_for_tvm, dev) @@ -57,11 +56,11 @@ def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module): np.testing.assert_allclose(actual=actual, desired=desired, rtol=1e-5, atol=1e-5) -def test_copy_(): +@tvm.testing.parametrize_targets("cuda") +def test_copy_(target, dev): class CopyTester(nn.Module): def __init__(self, size): super().__init__() - 
# self.buffer = torch.zeros(size) self.register_buffer("buffer", torch.zeros(size)) def forward(self, x): @@ -72,10 +71,11 @@ def forward(self, x): size = (2, 2) raw_data = np.random.rand(*size).astype(np.float32) torch_module = CopyTester(size).eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module) + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) -def test_detach_no_change(): +@tvm.testing.parametrize_targets("cuda") +def test_detach_no_change(target, dev): """Most of the time, in TVM, detach() should basically be identity""" class DetachTester(nn.Module): @@ -85,25 +85,7 @@ def forward(self, x): raw_data = np.ones((2, 2)).astype(np.float32) torch_module = DetachTester().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module) - - -# TODO test below fails! Is there a way to implement detach such that the -# memory is shared with the input? -# def test_detach_with_change(): -# """ Testing that detach() shares memory with original tensor""" -# class DetachTester(nn.Module): -# def forward(self, x): -# detached = x.detach() - -# # Test that detached shares same memory as x -# x[0][0] = 42.0 - -# return detached - -# raw_data = np.ones((2,2)).astype(np.float32) -# torch_module = DetachTester().eval() -# assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module) + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) if __name__ == "__main__": From 7c25af32b877e526bee714e88d24767025ada68f Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 10 Mar 2025 04:34:12 -0400 Subject: [PATCH 09/13] restore unmodified frontend test --- .../test_frontend_from_exported_program.py | 116 +++++++++--------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py index 399739146359..8ca335c2fe7a 100644 --- a/tests/python/relax/test_frontend_from_exported_program.py +++ b/tests/python/relax/test_frontend_from_exported_program.py @@ -82,7 +82,7 @@ def forward(self, input): class expected: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): with R.dataflow(): lv: R.Tensor((1, 3, 10, 10), dtype="float32") = relax_op(input_1) @@ -112,7 +112,7 @@ def forward(self, input): class expected: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="bool")): with R.dataflow(): lv: R.Tensor((1, 3, 10, 10), dtype="bool") = relax_op(input_1) @@ -135,7 +135,7 @@ def forward(self, input): class expected_clamp: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -163,7 +163,7 @@ def forward(self, input): class expected_dropout: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -191,7 +191,7 @@ def forward(self, input): class expected_gelu: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> 
R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -220,7 +220,7 @@ def forward(self, input): class expected_hardsigmoid: @R.function def main( - inp_0: R.Tensor((1, 3, 10, 10), dtype="float32"), + inp_0: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): with R.dataflow(): lv: R.Tensor((1, 3, 10, 10), dtype="float32") = R.add(inp_0, R.const(3, "float32")) @@ -252,7 +252,7 @@ def forward(self, input): class expected1: @R.function def main( - inp_0: R.Tensor((1, 3, 10, 10), dtype="float32"), + inp_0: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): with R.dataflow(): lv: R.Tensor((1, 3, 10, 10), dtype="float32") = R.add(inp_0, R.const(3, "float32")) @@ -294,7 +294,7 @@ def forward(self, input): class expected_relu: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -323,7 +323,7 @@ def forward(self, input): class expected_sigmoid: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -352,7 +352,7 @@ def forward(self, input): class expected_silu: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -388,7 +388,7 @@ def forward(self, input): class expected1: @R.function def main( - inp_0: R.Tensor((1, 3, 10, 10), dtype="float32"), + inp_0: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): with R.dataflow(): lv: R.Tensor((1, 3, 10, 10), dtype="float32") = R.clip( @@ -425,7 +425,7 @@ def forward(self, input): class expected: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -456,7 +456,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -487,7 +487,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -512,7 +512,7 @@ def forward(self, input): class expected_tril: @R.function def main( - input_1: R.Tensor((10, 10), dtype="float32"), + input_1: R.Tensor((10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -531,7 +531,7 @@ def forward(self, input): class expected_triu: @R.function def main( - input_1: R.Tensor((10, 10), dtype="float32"), + input_1: R.Tensor((10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -795,7 +795,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), 
dtype="float32")): # block 0 with R.dataflow(): @@ -883,7 +883,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -1580,7 +1580,7 @@ def forward(self, x, y): class Expected1: @R.function def main( - inp_0: R.Tensor((4, 4), dtype="float32"), + inp_0: R.Tensor((4, 4), dtype="float32") ) -> R.Tuple(R.Tensor((), dtype="float32")): with R.dataflow(): lv: R.Tensor((), dtype="float32") = R.einsum((inp_0,), subscripts="ii") @@ -1827,7 +1827,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -1856,7 +1856,7 @@ def forward(self, input): class expected2: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 4, 4), dtype="float32")): # block 0 with R.dataflow(): @@ -1885,7 +1885,7 @@ def forward(self, input): class expected3: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 6, 6), dtype="float32")): # block 0 with R.dataflow(): @@ -2102,7 +2102,7 @@ def forward(self, input): class expected_bilinear: @R.function def main( - input: R.Tensor((1, 3, 112, 112), dtype="float32"), + input: R.Tensor((1, 3, 112, 112), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 224, 224), dtype="float32")): # block 0 with R.dataflow(): @@ -2131,7 +2131,7 @@ def forward(self, input): class expected_nearest: @R.function def main( - input: R.Tensor((1, 3, 112, 112), dtype="float32"), + input: R.Tensor((1, 3, 112, 112), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 224, 224), dtype="float32")): # block 0 with R.dataflow(): @@ -2170,7 +2170,7 @@ def forward(self, input: torch.Tensor): class Expected1: @R.function def main( - inp_0: R.Tensor((256, 256), dtype="float32"), + inp_0: R.Tensor((256, 256), dtype="float32") ) -> R.Tuple(R.Tensor((256,), dtype="float32")): with R.dataflow(): lv: R.Tensor((256,), dtype="float32") = R.mean(inp_0, axis=[-1], keepdims=False) @@ -2182,7 +2182,7 @@ def main( class Expected2: @R.function def main( - inp_0: R.Tensor((256, 256), dtype="float32"), + inp_0: R.Tensor((256, 256), dtype="float32") ) -> R.Tuple(R.Tensor((256, 1), dtype="float32")): with R.dataflow(): lv: R.Tensor((256, 1), dtype="float32") = R.mean(inp_0, axis=[-1], keepdims=True) @@ -2204,7 +2204,7 @@ def forward(self, x): class expected1: @R.function def main( - inp_0: R.Tensor((1, 2, 3, 4), dtype="float32"), + inp_0: R.Tensor((1, 2, 3, 4), dtype="float32") ) -> R.Tuple(R.Tensor((1, 4), dtype="float32")): # block 0 with R.dataflow(): @@ -2238,7 +2238,7 @@ def forward(self, input): class expected_argmax1: @R.function def main( - inp_0: R.Tensor((256, 256), dtype="float32"), + inp_0: R.Tensor((256, 256), dtype="float32") ) -> R.Tuple(R.Tensor((256,), dtype="int64")): with R.dataflow(): lv: R.Tensor((256,), dtype="int64") = R.argmax(inp_0, axis=-1, keepdims=False) @@ -2250,7 +2250,7 @@ def main( class expected_argmax2: @R.function def main( - inp_0: R.Tensor((256, 256), dtype="float32"), + inp_0: R.Tensor((256, 256), dtype="float32") ) -> R.Tuple(R.Tensor((256, 1), dtype="int64")): with R.dataflow(): lv: 
R.Tensor((256, 1), dtype="int64") = R.argmax(inp_0, axis=-1, keepdims=True) @@ -2279,7 +2279,7 @@ def forward(self, input): class expected_argmin1: @R.function def main( - inp_0: R.Tensor((256, 256), dtype="float32"), + inp_0: R.Tensor((256, 256), dtype="float32") ) -> R.Tuple(R.Tensor((), dtype="int64")): with R.dataflow(): lv: R.Tensor((), dtype="int64") = R.argmin(inp_0, axis=None, keepdims=False) @@ -2291,7 +2291,7 @@ def main( class expected_argmin2: @R.function def main( - inp_0: R.Tensor((256, 256), dtype="float32"), + inp_0: R.Tensor((256, 256), dtype="float32") ) -> R.Tuple(R.Tensor((1, 1), dtype="int64")): with R.dataflow(): lv: R.Tensor((1, 1), dtype="int64") = R.argmin(inp_0, axis=None, keepdims=True) @@ -2362,7 +2362,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 2, 3, 4), dtype="float32"), + input_1: R.Tensor((1, 2, 3, 4), dtype="float32") ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="int32")): # block 0 with R.dataflow(): @@ -2388,7 +2388,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32"), + x: R.Tensor((1, 2, 3, 4), dtype="float32") ) -> R.Tuple(R.Tensor((4, 2, 3, 4), dtype="float32")): # block 0 with R.dataflow(): @@ -2419,7 +2419,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 100), dtype="float32")): # block 0 with R.dataflow(): @@ -2445,7 +2445,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32"), + x: R.Tensor((1, 2, 3, 4), dtype="float32") ) -> R.Tuple(R.Tensor((1, 4, 3, 2), dtype="float32")): # block 0 with R.dataflow(): @@ -2483,7 +2483,7 @@ def main(x: R.Tensor((3,), dtype="float32")) -> R.Tuple(R.Tensor((6,), dtype="fl class expected2: @R.function def main( - x: R.Tensor((1, 3), dtype="float32"), + x: R.Tensor((1, 3), dtype="float32") ) -> R.Tuple(R.Tensor((4, 6), dtype="float32")): # block 0 with R.dataflow(): @@ -2511,7 +2511,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32"), + x: R.Tensor((1, 2, 3, 4), dtype="float32") ) -> R.Tuple(R.Tensor((2, 12), dtype="float32")): # block 0 with R.dataflow(): @@ -2533,7 +2533,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 3, 10, 10), dtype="float32"), + x: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 10, 3), dtype="float32")): # block 0 with R.dataflow(): @@ -2574,7 +2574,7 @@ def forward(self, x): class expected2: @R.function def main( - x: R.Tensor((8, 16), dtype="float32"), + x: R.Tensor((8, 16), dtype="float32") ) -> R.Tuple(R.Tensor((8, 1, 1, 16, 1), dtype="float32")): with R.dataflow(): lv: R.Tensor((8, 16), dtype="float32") = R.strided_slice( @@ -2749,7 +2749,7 @@ def forward(self, input): class Expected1: @R.function def main( - inp_0: R.Tensor((3, 1, 4, 1), dtype="float32"), + inp_0: R.Tensor((3, 1, 4, 1), dtype="float32") ) -> R.Tuple(R.Tensor((3, 4, 1), dtype="float32")): with R.dataflow(): lv: R.Tensor((3, 4, 1), dtype="float32") = R.squeeze(inp_0, axis=[1]) @@ -2765,7 +2765,7 @@ def forward(self, input): class Expected2: @R.function def main( - inp_0: R.Tensor((3, 1, 4, 1), dtype="float32"), + inp_0: R.Tensor((3, 1, 4, 1), dtype="float32") ) -> R.Tuple(R.Tensor((3, 4), dtype="float32")): with R.dataflow(): lv: R.Tensor((3, 4), dtype="float32") = R.squeeze(inp_0, axis=None) @@ 
-2796,7 +2796,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 3), dtype="float32"), + x: R.Tensor((1, 3), dtype="float32") ) -> R.Tuple(R.Tensor((1, 6), dtype="float32")): # block 0 with R.dataflow(): @@ -2809,7 +2809,7 @@ def main( class expected2: @R.function def main( - x: R.Tensor((1, 3), dtype="float32"), + x: R.Tensor((1, 3), dtype="float32") ) -> R.Tuple(R.Tensor((4, 6), dtype="float32")): # block 0 with R.dataflow(): @@ -2833,7 +2833,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32"), + x: R.Tensor((1, 2, 3, 4), dtype="float32") ) -> R.Tuple(R.Tensor((1, 4, 3, 2), dtype="float32")): # block 0 with R.dataflow(): @@ -2855,7 +2855,7 @@ def forward(self, input): class expected1: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -2872,7 +2872,7 @@ def forward(self, input): class expected2: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), + input_1: R.Tensor((1, 3, 10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((1, 3, 10, 10, 1), dtype="float32")): # block 0 with R.dataflow(): @@ -2896,7 +2896,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32"), + x: R.Tensor((1, 2, 3, 4), dtype="float32") ) -> R.Tuple(R.Tensor((2, 12), dtype="float32")): # block 0 with R.dataflow(): @@ -2918,7 +2918,7 @@ def forward(self, input): class Expected: @R.function def main( - input: R.Tensor((10, 10), dtype="float32"), + input: R.Tensor((10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((20,), dtype="int32")): with R.dataflow(): lv: R.Tensor((20,), dtype="int32") = R.arange(0, 20, 1, dtype="int32") @@ -2939,7 +2939,7 @@ def forward(self, input): class Expected: @R.function def main( - input: R.Tensor((10, 10), dtype="float32"), + input: R.Tensor((10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")): with R.dataflow(): gv: R.Tuple(R.Tensor((10, 10), dtype="float32")) = (input,) @@ -2959,7 +2959,7 @@ def forward(self, input): class Expected: @R.function def main( - inp_0: R.Tensor((10, 10), dtype="float32"), + inp_0: R.Tensor((10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")): with R.dataflow(): lv: R.Tensor((10, 10), dtype="float32") = R.zeros( @@ -2982,7 +2982,7 @@ def forward(self, input: torch.Tensor): class Expected: @R.function def main( - inp_0: R.Tensor((10, 10), dtype="float32"), + inp_0: R.Tensor((10, 10), dtype="float32") ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")): with R.dataflow(): lv: R.Tensor((10, 10), dtype="float32") = R.full( @@ -3005,7 +3005,7 @@ def forward(self, x): class expected1: @R.function def main( - x: R.Tensor((1, 2, 3), dtype="float32"), + x: R.Tensor((1, 2, 3), dtype="float32") ) -> R.Tuple(R.Tensor((1, 2, 3), dtype="float32")): # block 0 with R.dataflow(): @@ -3034,7 +3034,7 @@ def forward(self, x): class expected_float: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32"), + x: R.Tensor((1, 2, 3, 4), dtype="float32") ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float32")): # block 0 with R.dataflow(): @@ -3052,7 +3052,7 @@ def forward(self, x): class expected_half: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32"), + x: R.Tensor((1, 2, 3, 4), dtype="float32") ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float16")): # block 0 with R.dataflow(): @@ 
-3070,7 +3070,7 @@ def forward(self, x): class expected_type: @R.function def main( - x: R.Tensor((1, 2, 3, 4), dtype="float32"), + x: R.Tensor((1, 2, 3, 4), dtype="float32") ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float32")): # block 0 with R.dataflow(): @@ -3086,7 +3086,7 @@ def forward(self, input): class expected_to1: @R.function def main( - inp_0: R.Tensor((1, 2, 3, 4), dtype="float32"), + inp_0: R.Tensor((1, 2, 3, 4), dtype="float32") ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float16")): with R.dataflow(): lv: R.Tensor((1, 2, 3, 4), dtype="float16") = R.astype(inp_0, dtype="float16") @@ -3102,7 +3102,7 @@ def forward(self, input): class expected_to2: @R.function def main( - inp_0: R.Tensor((1, 2, 3, 4), dtype="float32"), + inp_0: R.Tensor((1, 2, 3, 4), dtype="float32") ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float32")): with R.dataflow(): lv: R.Tensor((1, 2, 3, 4), dtype="float32") = R.astype(inp_0, dtype="float32") @@ -3187,7 +3187,7 @@ def forward(self, x): class Expected: @R.function def main( - inp_0: R.Tensor((256, 256), dtype="float32"), + inp_0: R.Tensor((256, 256), dtype="float32") ) -> R.Tensor((256, 256), dtype="float32"): with R.dataflow(): gv: R.Tensor((256, 256), dtype="float32") = inp_0 From ad101c84d5104fcb013e0f6ca52ca56d68d45aad Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 10 Mar 2025 13:25:00 -0400 Subject: [PATCH 10/13] fix vm in assert_torch_output_vs_tvm_from_exported_to_cuda --- tests/python/relax/test_from_exported_to_cuda.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index ba46fe1ace87..f10674ddbded 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -46,6 +46,11 @@ def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, tar tvm_mod, tvm_params = relax.frontend.detach_params(mod_from_torch) relax_pipeline = relax.get_default_pipeline(tvm.target.Target.from_device(tvm.cuda())) + ex = relax.build(tvm_mod, target=target, relax_pipeline=relax_pipeline) + vm = relax.VirtualMachine(ex, dev) + + gpu_data = tvm.nd.array(raw_data_for_tvm, dev) + gpu_params = [tvm.nd.array(p, dev) for p in tvm_params["main"]] gpu_out = vm["main"](gpu_data, *gpu_params) pytorch_out = torch_module(torch_data).detach().numpy() From 697dcd8f719bf5c5b98b47ca00eab5798dd2c913 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 10 Mar 2025 15:02:19 -0400 Subject: [PATCH 11/13] lint with Python Black formatter --- tests/python/relax/test_from_exported_to_cuda.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index 9b51025c0102..bd4bdcf61770 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -103,6 +103,7 @@ def forward(self, x): torch_module = DetachTester().eval() assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + @tvm.testing.parametrize_targets("cuda") def test_upsample_with_scale_factor(target, dev): """ From ec73d225a02d0808986737b9a8ce3289c27bf058 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 10 Mar 2025 15:22:23 -0400 Subject: [PATCH 12/13] update todo --- .../tvm/relax/frontend/torch/base_fx_graph_translator.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py 
diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
index 8ce0a9570d47..0f47d6bee6c2 100644
--- a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
+++ b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
@@ -995,10 +995,9 @@ def _transpose(self, node: fx.Node) -> relax.Var:
     ########## Creation ##########
 
     def _detach(self, node: fx.Node) -> relax.Var:
-        # TODO found no way to correctly implement this. The output should
-        # share the same memory as the input. It is not the case right now.
-        # Ideally, this test would pass, but it doesn't :
-        # https://github.com/hugolatendresse/tvm/blob/456845811ba01c0bea07737a4f7a333a0b45ea92/tests/python/relax/test_from_exported_to_cuda.py#L98
+        # There is no way to implement detach() such that the output shares 
+        # the same memory as the input. In-place operations are not supported 
+        # by the translator, and therefore we just return a copy of the input.
         return self.env[node.args[0]]
 
     def _copy_(self, node: fx.Node) -> relax.Var:

From d3561fff5ad2eaf88b4d67ff8a900dea46e2833a Mon Sep 17 00:00:00 2001
From: Hugo Latendresse
Date: Mon, 10 Mar 2025 15:22:55 -0400
Subject: [PATCH 13/13] update explanation for _detach

---
 python/tvm/relax/frontend/torch/base_fx_graph_translator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
index 0f47d6bee6c2..d5cad2381b49 100644
--- a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
+++ b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
@@ -995,8 +995,8 @@ def _transpose(self, node: fx.Node) -> relax.Var:
     ########## Creation ##########
 
     def _detach(self, node: fx.Node) -> relax.Var:
-        # There is no way to implement detach() such that the output shares 
-        # the same memory as the input. In-place operations are not supported 
+        # There is no way to implement detach() such that the output shares
+        # the same memory as the input. In-place operations are not supported
         # by the translator, and therefore we just return a copy of the input.
         return self.env[node.args[0]]
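
A minimal sketch of what the _detach mapping above means in practice, assuming a TVM build with the Relax PyTorch frontend and torch.export available; the DetachOnly module below is illustrative and not part of the patch series:

import torch
from torch import nn
from torch.export import export

from tvm.relax.frontend.torch import from_exported_program


class DetachOnly(nn.Module):
    # Illustrative module, not taken from the patches above.
    def forward(self, x):
        # Under this patch series, detach()/detach_() are mapped to an
        # identity: the translator rebinds the Relax variable of the input,
        # so no aliasing with PyTorch memory is expressed in the IRModule.
        return x.detach()


example_args = (torch.ones(2, 2, dtype=torch.float32),)
with torch.no_grad():
    exported_program = export(DetachOnly().eval(), example_args)

mod = from_exported_program(exported_program, keep_params_as_input=True)
# If a detach op survives in the exported graph, it lowers to returning the
# input tensor unchanged.
mod.show()

Because the translated Relax function is purely functional, the in-place aliasing behavior of detach() (and of copy_) cannot be reproduced, which is what the reworded comment in PATCH 12/13 documents.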