diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py
index a068e732b986..9b5c4c16153a 100644
--- a/python/tvm/dlight/gpu/general_reduction.py
+++ b/python/tvm/dlight/gpu/general_reduction.py
@@ -90,7 +90,10 @@ def f_layout_mapping(*iters):
             sch.transform_block_layout(block_infos[-1].block_rv, index_map)
 
         try:
-            # TODO: fix num_leading_s = 0 case
+            # Handle the case where num_leading_s = 0
+            if num_leading_s == 0:
+                num_leading_s = 1  # Use at least one spatial dimension for blockIdx.x
+
             assert num_trailing_r > 0
             for block in block_infos[1:-1]:
                 assert block.dom_kind() == dom_kind
@@ -100,7 +103,13 @@ def f_layout_mapping(*iters):
             return None
 
         loops = sch.get_loops(block_infos[-1].block_rv)
-        bx = sch.fuse(*loops[:num_leading_s])
+
+        # Ensure we have at least one spatial dimension for blockIdx.x
+        if num_leading_s > 0:
+            bx = sch.fuse(*loops[:num_leading_s])
+        else:
+            bx = loops[0]  # Use the first loop as blockIdx.x
+
         r_loop, tx = sch.split(loops[-1], [None, len_tx])
         sch.reorder(tx, r_loop)
         sch.bind(bx, "blockIdx.x")
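The guard above matters because a pure-reduction kernel has no leading spatial loops: `loops[:num_leading_s]` is then empty and there is nothing to fuse for `blockIdx.x`. A minimal sketch of the patched selection logic, with a plain list standing in for the loop RVs returned by `sch.get_loops()` (the helper name is hypothetical, not part of the patch):

```python
# Hedged sketch of the fallback added above. `loops` is a plain Python list
# here, standing in for TVM schedule loop RVs; this is not the real dlight code.
def pick_blockidx_loops(loops, num_leading_s):
    """Mirror the patched logic: always yield at least one loop for blockIdx.x."""
    if num_leading_s == 0:
        # Pure reduction: no spatial loop exists, so fall back to the first
        # loop instead of fusing an empty list of loops.
        num_leading_s = 1
    return loops[:num_leading_s]

print(pick_blockidx_loops(["k0", "k1"], 0))     # ['k0'] -- fallback path
print(pick_blockidx_loops(["i", "j", "k"], 2))  # ['i', 'j'] -- unchanged path
```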
diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
index 003ceebec6ff..2f9f53146d2d 100644
--- a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
+++ b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
@@ -227,9 +227,54 @@ def _round(self, node: fx.Node) -> relax.Expr:
         return self.block_builder.emit(relax.op.round(arg))
 
     def _softmax(self, node: fx.Node) -> relax.Var:
+        """
+        For large tensors with non-last-dimension softmax, we transpose to move
+        the softmax dimension to the end, apply softmax, and then transpose
+        back to the original shape.
+        """
         x = self.env[node.args[0]]
         dim = node.args[1] if len(node.args) > 1 else node.kwargs.get("dim", -1)
-        return self.block_builder.emit(relax.op.nn.softmax(x, dim))
+        input_shape = x.struct_info.shape
+        input_ndim = len(input_shape)
+
+        if dim < 0:
+            # Ensure dim is expressed as a positive index
+            dim = input_ndim + dim
+
+        # Check whether this is a non-last-dimension softmax on a large
+        # tensor (> 1024). The smallest power of 2 that fails on an NVIDIA
+        # GeForce RTX 4090 is 8192; 1024 is used here to be safe.
+        is_large_non_last_dim = False
+        large_size_threshold = 1024
+
+        if dim != input_ndim - 1:  # Not the last dimension
+            # Check if any dimension is large
+            for size in input_shape:
+                if hasattr(size, "value") and size.value > large_size_threshold:
+                    is_large_non_last_dim = True
+                    break
+
+        if is_large_non_last_dim:
+            # Special handling for large tensors with non-last-dimension softmax
+
+            # Get the dimension ordering for the transpose
+            dims = list(range(input_ndim))
+            dims.append(dims.pop(dim))
+
+            # Transpose
+            x_transposed = self.block_builder.emit(relax.op.permute_dims(x, dims))
+
+            # Apply softmax on the last dimension
+            softmax_result = self.block_builder.emit(relax.op.nn.softmax(x_transposed, -1))
+
+            # Transpose back to the original shape
+            inv_dims = [-1] * len(dims)
+            for i, d in enumerate(dims):
+                inv_dims[d] = i
+            return self.block_builder.emit(relax.op.permute_dims(softmax_result, inv_dims))
+        else:
+            # Regular softmax
+            return self.block_builder.emit(relax.op.nn.softmax(x, dim))
 
     def _selu(self, node: fx.Node) -> relax.Var:
         x = self.env[node.args[0]]
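The rewrite relies on `inv_dims` being the inverse of the `dims` permutation. A quick equivalence check using PyTorch only, independent of the translator (the shapes and `dim` here are illustrative):

```python
# Numerically verify that permute -> softmax over the last axis -> inverse
# permute matches softmax over the original axis, mirroring the code above.
import torch

x = torch.randn(2, 8, 3, 5)
dim, ndim = 1, x.ndim

dims = list(range(ndim))
dims.append(dims.pop(dim))      # move `dim` to the end: [0, 2, 3, 1]

inv_dims = [-1] * ndim
for i, d in enumerate(dims):    # invert the permutation
    inv_dims[d] = i

rewritten = torch.softmax(x.permute(dims), -1).permute(inv_dims)
assert torch.allclose(rewritten, torch.softmax(x, dim))
```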
+ """ + raw_data_for_tvm = raw_data.copy() # In case the data is modified + torch_data = torch.from_numpy(raw_data) + example_args = (torch_data,) + + with torch.no_grad(): + exported_program = export(torch_module, example_args) + mod_from_torch = from_exported_program(exported_program, keep_params_as_input=True) + + tvm_mod, tvm_params = relax.frontend.detach_params(mod_from_torch) + + relax_pipeline = relax.get_default_pipeline(tvm.target.Target.from_device(tvm.cuda())) + ex = relax.build(tvm_mod, target=target, relax_pipeline=relax_pipeline) + vm = relax.VirtualMachine(ex, dev) + + gpu_data = tvm.nd.array(raw_data_for_tvm, dev) + gpu_params = [tvm.nd.array(p, dev) for p in tvm_params["main"]] + gpu_out = vm["main"](gpu_data, *gpu_params) + + pytorch_out = torch_module(torch_data).detach().numpy() + actual = gpu_out[0].numpy() + desired = pytorch_out + np.testing.assert_allclose(actual=actual, desired=desired, rtol=1e-5, atol=1e-5) + + +@tvm.testing.parametrize_targets("cuda") +def test_softmax_non_last_dim_large_tensor(target, dev): + """ + Tests ingesting a PyTorch exported model that uses softmax on a large + tensor, with the softmax dimension not being that last dimension, and + running it with CUDA. + """ + torch_module = Softmax(dim=2).eval() + raw_data = np.random.rand(10, 4, 32, 16384).astype("float32") + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py index 8ca335c2fe7a..399739146359 100644 --- a/tests/python/relax/test_frontend_from_exported_program.py +++ b/tests/python/relax/test_frontend_from_exported_program.py @@ -82,7 +82,7 @@ def forward(self, input): class expected: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): with R.dataflow(): lv: R.Tensor((1, 3, 10, 10), dtype="float32") = relax_op(input_1) @@ -112,7 +112,7 @@ def forward(self, input): class expected: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="bool")): with R.dataflow(): lv: R.Tensor((1, 3, 10, 10), dtype="bool") = relax_op(input_1) @@ -135,7 +135,7 @@ def forward(self, input): class expected_clamp: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -163,7 +163,7 @@ def forward(self, input): class expected_dropout: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -191,7 +191,7 @@ def forward(self, input): class expected_gelu: @R.function def main( - input_1: R.Tensor((1, 3, 10, 10), dtype="float32") + input_1: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): # block 0 with R.dataflow(): @@ -220,7 +220,7 @@ def forward(self, input): class expected_hardsigmoid: @R.function def main( - inp_0: R.Tensor((1, 3, 10, 10), dtype="float32") + inp_0: R.Tensor((1, 3, 10, 10), dtype="float32"), ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")): with 
diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py
index 8ca335c2fe7a..399739146359 100644
--- a/tests/python/relax/test_frontend_from_exported_program.py
+++ b/tests/python/relax/test_frontend_from_exported_program.py
@@ -82,7 +82,7 @@ def forward(self, input):
     class expected:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             with R.dataflow():
                 lv: R.Tensor((1, 3, 10, 10), dtype="float32") = relax_op(input_1)
@@ -112,7 +112,7 @@ def forward(self, input):
     class expected:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="bool")):
             with R.dataflow():
                 lv: R.Tensor((1, 3, 10, 10), dtype="bool") = relax_op(input_1)
@@ -135,7 +135,7 @@ def forward(self, input):
     class expected_clamp:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -163,7 +163,7 @@ def forward(self, input):
     class expected_dropout:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -191,7 +191,7 @@ def forward(self, input):
     class expected_gelu:
         @R.function
        def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -220,7 +220,7 @@ def forward(self, input):
     class expected_hardsigmoid:
         @R.function
         def main(
-            inp_0: R.Tensor((1, 3, 10, 10), dtype="float32")
+            inp_0: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             with R.dataflow():
                 lv: R.Tensor((1, 3, 10, 10), dtype="float32") = R.add(inp_0, R.const(3, "float32"))
@@ -252,7 +252,7 @@ def forward(self, input):
     class expected1:
         @R.function
         def main(
-            inp_0: R.Tensor((1, 3, 10, 10), dtype="float32")
+            inp_0: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             with R.dataflow():
                 lv: R.Tensor((1, 3, 10, 10), dtype="float32") = R.add(inp_0, R.const(3, "float32"))
@@ -294,7 +294,7 @@ def forward(self, input):
     class expected_relu:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -323,7 +323,7 @@ def forward(self, input):
     class expected_sigmoid:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -352,7 +352,7 @@ def forward(self, input):
     class expected_silu:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -388,7 +388,7 @@ def forward(self, input):
     class expected1:
         @R.function
         def main(
-            inp_0: R.Tensor((1, 3, 10, 10), dtype="float32")
+            inp_0: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             with R.dataflow():
                 lv: R.Tensor((1, 3, 10, 10), dtype="float32") = R.clip(
@@ -425,7 +425,7 @@ def forward(self, input):
     class expected:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -456,7 +456,7 @@ def forward(self, input):
     class expected1:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -487,7 +487,7 @@ def forward(self, input):
     class expected1:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -512,7 +512,7 @@ def forward(self, input):
     class expected_tril:
         @R.function
         def main(
-            input_1: R.Tensor((10, 10), dtype="float32")
+            input_1: R.Tensor((10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -531,7 +531,7 @@ def forward(self, input):
     class expected_triu:
         @R.function
         def main(
-            input_1: R.Tensor((10, 10), dtype="float32")
+            input_1: R.Tensor((10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -795,7 +795,7 @@ def forward(self, input):
     class expected1:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -883,7 +883,7 @@ def forward(self, input):
     class expected1:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -1580,7 +1580,7 @@ def forward(self, x, y):
     class Expected1:
         @R.function
         def main(
-            inp_0: R.Tensor((4, 4), dtype="float32")
+            inp_0: R.Tensor((4, 4), dtype="float32"),
         ) -> R.Tuple(R.Tensor((), dtype="float32")):
             with R.dataflow():
                 lv: R.Tensor((), dtype="float32") = R.einsum((inp_0,), subscripts="ii")
@@ -1827,7 +1827,7 @@ def forward(self, input):
     class expected1:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -1856,7 +1856,7 @@ def forward(self, input):
     class expected2:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 4, 4), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -1885,7 +1885,7 @@ def forward(self, input):
     class expected3:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 6, 6), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2102,7 +2102,7 @@ def forward(self, input):
     class expected_bilinear:
         @R.function
         def main(
-            input: R.Tensor((1, 3, 112, 112), dtype="float32")
+            input: R.Tensor((1, 3, 112, 112), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 224, 224), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2131,7 +2131,7 @@ def forward(self, input):
     class expected_nearest:
         @R.function
         def main(
-            input: R.Tensor((1, 3, 112, 112), dtype="float32")
+            input: R.Tensor((1, 3, 112, 112), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 224, 224), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2170,7 +2170,7 @@ def forward(self, input: torch.Tensor):
     class Expected1:
         @R.function
         def main(
-            inp_0: R.Tensor((256, 256), dtype="float32")
+            inp_0: R.Tensor((256, 256), dtype="float32"),
         ) -> R.Tuple(R.Tensor((256,), dtype="float32")):
             with R.dataflow():
                 lv: R.Tensor((256,), dtype="float32") = R.mean(inp_0, axis=[-1], keepdims=False)
@@ -2182,7 +2182,7 @@ def main(
     class Expected2:
         @R.function
         def main(
-            inp_0: R.Tensor((256, 256), dtype="float32")
+            inp_0: R.Tensor((256, 256), dtype="float32"),
         ) -> R.Tuple(R.Tensor((256, 1), dtype="float32")):
             with R.dataflow():
                 lv: R.Tensor((256, 1), dtype="float32") = R.mean(inp_0, axis=[-1], keepdims=True)
@@ -2204,7 +2204,7 @@ def forward(self, x):
     class expected1:
         @R.function
         def main(
-            inp_0: R.Tensor((1, 2, 3, 4), dtype="float32")
+            inp_0: R.Tensor((1, 2, 3, 4), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 4), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2238,7 +2238,7 @@ def forward(self, input):
     class expected_argmax1:
         @R.function
         def main(
-            inp_0: R.Tensor((256, 256), dtype="float32")
+            inp_0: R.Tensor((256, 256), dtype="float32"),
         ) -> R.Tuple(R.Tensor((256,), dtype="int64")):
             with R.dataflow():
                 lv: R.Tensor((256,), dtype="int64") = R.argmax(inp_0, axis=-1, keepdims=False)
@@ -2250,7 +2250,7 @@ def main(
     class expected_argmax2:
         @R.function
         def main(
-            inp_0: R.Tensor((256, 256), dtype="float32")
+            inp_0: R.Tensor((256, 256), dtype="float32"),
         ) -> R.Tuple(R.Tensor((256, 1), dtype="int64")):
             with R.dataflow():
                 lv: R.Tensor((256, 1), dtype="int64") = R.argmax(inp_0, axis=-1, keepdims=True)
@@ -2279,7 +2279,7 @@ def forward(self, input):
     class expected_argmin1:
         @R.function
         def main(
-            inp_0: R.Tensor((256, 256), dtype="float32")
+            inp_0: R.Tensor((256, 256), dtype="float32"),
         ) -> R.Tuple(R.Tensor((), dtype="int64")):
             with R.dataflow():
                 lv: R.Tensor((), dtype="int64") = R.argmin(inp_0, axis=None, keepdims=False)
@@ -2291,7 +2291,7 @@ def main(
     class expected_argmin2:
         @R.function
         def main(
-            inp_0: R.Tensor((256, 256), dtype="float32")
+            inp_0: R.Tensor((256, 256), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 1), dtype="int64")):
             with R.dataflow():
                 lv: R.Tensor((1, 1), dtype="int64") = R.argmin(inp_0, axis=None, keepdims=True)
@@ -2362,7 +2362,7 @@ def forward(self, input):
     class expected1:
         @R.function
         def main(
-            input_1: R.Tensor((1, 2, 3, 4), dtype="float32")
+            input_1: R.Tensor((1, 2, 3, 4), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="int32")):
             # block 0
             with R.dataflow():
@@ -2388,7 +2388,7 @@ def forward(self, x):
     class expected1:
         @R.function
         def main(
-            x: R.Tensor((1, 2, 3, 4), dtype="float32")
+            x: R.Tensor((1, 2, 3, 4), dtype="float32"),
         ) -> R.Tuple(R.Tensor((4, 2, 3, 4), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2419,7 +2419,7 @@ def forward(self, input):
     class expected1:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 100), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2445,7 +2445,7 @@ def forward(self, x):
     class expected1:
         @R.function
         def main(
-            x: R.Tensor((1, 2, 3, 4), dtype="float32")
+            x: R.Tensor((1, 2, 3, 4), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 4, 3, 2), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2483,7 +2483,7 @@ def main(x: R.Tensor((3,), dtype="float32")) -> R.Tuple(R.Tensor((6,), dtype="fl
     class expected2:
         @R.function
         def main(
-            x: R.Tensor((1, 3), dtype="float32")
+            x: R.Tensor((1, 3), dtype="float32"),
         ) -> R.Tuple(R.Tensor((4, 6), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2511,7 +2511,7 @@ def forward(self, x):
     class expected1:
         @R.function
         def main(
-            x: R.Tensor((1, 2, 3, 4), dtype="float32")
+            x: R.Tensor((1, 2, 3, 4), dtype="float32"),
         ) -> R.Tuple(R.Tensor((2, 12), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2533,7 +2533,7 @@ def forward(self, x):
     class expected1:
         @R.function
         def main(
-            x: R.Tensor((1, 3, 10, 10), dtype="float32")
+            x: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 10, 3), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2574,7 +2574,7 @@ def forward(self, x):
     class expected2:
         @R.function
         def main(
-            x: R.Tensor((8, 16), dtype="float32")
+            x: R.Tensor((8, 16), dtype="float32"),
         ) -> R.Tuple(R.Tensor((8, 1, 1, 16, 1), dtype="float32")):
             with R.dataflow():
                 lv: R.Tensor((8, 16), dtype="float32") = R.strided_slice(
@@ -2749,7 +2749,7 @@ def forward(self, input):
     class Expected1:
         @R.function
         def main(
-            inp_0: R.Tensor((3, 1, 4, 1), dtype="float32")
+            inp_0: R.Tensor((3, 1, 4, 1), dtype="float32"),
         ) -> R.Tuple(R.Tensor((3, 4, 1), dtype="float32")):
             with R.dataflow():
                 lv: R.Tensor((3, 4, 1), dtype="float32") = R.squeeze(inp_0, axis=[1])
@@ -2765,7 +2765,7 @@ def forward(self, input):
     class Expected2:
         @R.function
         def main(
-            inp_0: R.Tensor((3, 1, 4, 1), dtype="float32")
+            inp_0: R.Tensor((3, 1, 4, 1), dtype="float32"),
         ) -> R.Tuple(R.Tensor((3, 4), dtype="float32")):
             with R.dataflow():
                 lv: R.Tensor((3, 4), dtype="float32") = R.squeeze(inp_0, axis=None)
@@ -2796,7 +2796,7 @@ def forward(self, x):
     class expected1:
         @R.function
         def main(
-            x: R.Tensor((1, 3), dtype="float32")
+            x: R.Tensor((1, 3), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 6), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2809,7 +2809,7 @@ def main(
     class expected2:
         @R.function
         def main(
-            x: R.Tensor((1, 3), dtype="float32")
+            x: R.Tensor((1, 3), dtype="float32"),
         ) -> R.Tuple(R.Tensor((4, 6), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2833,7 +2833,7 @@ def forward(self, x):
     class expected1:
         @R.function
         def main(
-            x: R.Tensor((1, 2, 3, 4), dtype="float32")
+            x: R.Tensor((1, 2, 3, 4), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 4, 3, 2), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2855,7 +2855,7 @@ def forward(self, input):
     class expected1:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 1, 3, 10, 10), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2872,7 +2872,7 @@ def forward(self, input):
     class expected2:
         @R.function
         def main(
-            input_1: R.Tensor((1, 3, 10, 10), dtype="float32")
+            input_1: R.Tensor((1, 3, 10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 3, 10, 10, 1), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2896,7 +2896,7 @@ def forward(self, x):
     class expected1:
         @R.function
         def main(
-            x: R.Tensor((1, 2, 3, 4), dtype="float32")
+            x: R.Tensor((1, 2, 3, 4), dtype="float32"),
         ) -> R.Tuple(R.Tensor((2, 12), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -2918,7 +2918,7 @@ def forward(self, input):
     class Expected:
         @R.function
         def main(
-            input: R.Tensor((10, 10), dtype="float32")
+            input: R.Tensor((10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((20,), dtype="int32")):
             with R.dataflow():
                 lv: R.Tensor((20,), dtype="int32") = R.arange(0, 20, 1, dtype="int32")
@@ -2939,7 +2939,7 @@ def forward(self, input):
     class Expected:
         @R.function
         def main(
-            input: R.Tensor((10, 10), dtype="float32")
+            input: R.Tensor((10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")):
             with R.dataflow():
                 gv: R.Tuple(R.Tensor((10, 10), dtype="float32")) = (input,)
@@ -2959,7 +2959,7 @@ def forward(self, input):
     class Expected:
         @R.function
         def main(
-            inp_0: R.Tensor((10, 10), dtype="float32")
+            inp_0: R.Tensor((10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")):
             with R.dataflow():
                 lv: R.Tensor((10, 10), dtype="float32") = R.zeros(
@@ -2982,7 +2982,7 @@ def forward(self, input: torch.Tensor):
     class Expected:
         @R.function
         def main(
-            inp_0: R.Tensor((10, 10), dtype="float32")
+            inp_0: R.Tensor((10, 10), dtype="float32"),
         ) -> R.Tuple(R.Tensor((10, 10), dtype="float32")):
             with R.dataflow():
                 lv: R.Tensor((10, 10), dtype="float32") = R.full(
@@ -3005,7 +3005,7 @@ def forward(self, x):
     class expected1:
         @R.function
         def main(
-            x: R.Tensor((1, 2, 3), dtype="float32")
+            x: R.Tensor((1, 2, 3), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 2, 3), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -3034,7 +3034,7 @@ def forward(self, x):
     class expected_float:
         @R.function
         def main(
-            x: R.Tensor((1, 2, 3, 4), dtype="float32")
+            x: R.Tensor((1, 2, 3, 4), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -3052,7 +3052,7 @@ def forward(self, x):
     class expected_half:
         @R.function
         def main(
-            x: R.Tensor((1, 2, 3, 4), dtype="float32")
+            x: R.Tensor((1, 2, 3, 4), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float16")):
             # block 0
             with R.dataflow():
@@ -3070,7 +3070,7 @@ def forward(self, x):
     class expected_type:
         @R.function
         def main(
-            x: R.Tensor((1, 2, 3, 4), dtype="float32")
+            x: R.Tensor((1, 2, 3, 4), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float32")):
             # block 0
             with R.dataflow():
@@ -3086,7 +3086,7 @@ def forward(self, input):
     class expected_to1:
         @R.function
         def main(
-            inp_0: R.Tensor((1, 2, 3, 4), dtype="float32")
+            inp_0: R.Tensor((1, 2, 3, 4), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float16")):
             with R.dataflow():
                 lv: R.Tensor((1, 2, 3, 4), dtype="float16") = R.astype(inp_0, dtype="float16")
@@ -3102,7 +3102,7 @@ def forward(self, input):
     class expected_to2:
         @R.function
         def main(
-            inp_0: R.Tensor((1, 2, 3, 4), dtype="float32")
+            inp_0: R.Tensor((1, 2, 3, 4), dtype="float32"),
         ) -> R.Tuple(R.Tensor((1, 2, 3, 4), dtype="float32")):
             with R.dataflow():
                 lv: R.Tensor((1, 2, 3, 4), dtype="float32") = R.astype(inp_0, dtype="float32")
@@ -3187,7 +3187,7 @@ def forward(self, x):
     class Expected:
         @R.function
         def main(
-            inp_0: R.Tensor((256, 256), dtype="float32")
+            inp_0: R.Tensor((256, 256), dtype="float32"),
         ) -> R.Tensor((256, 256), dtype="float32"):
             with R.dataflow():
                 gv: R.Tensor((256, 256), dtype="float32") = inp_0