@@ -231,15 +231,15 @@ def create_column_parallel_packed_layer():
231231 if repeats == 2 :
232232 # In e2e, MergedColumnParallelLinear is created when we load the model. The base_layer weights are sharded and moved to TPU in VllmUnquantizedLinearMethod.process_weights_after_loading.
233233 linear = MergedColumnParallelLinear (
234- 256 , # input_size
235- [256 ] * repeats , # output_size
234+ 64 , # input_size
235+ [64 ] * repeats , # output_size
236236 bias = False ,
237237 params_dtype = torch .float16 )
238238 linear .weight .data = torch .rand_like (linear .weight .data )
239239
240240 base_linear = MergedColumnParallelLinear (
241- 256 , # input_size
242- [256 ] * repeats , # output_size
241+ 64 , # input_size
242+ [64 ] * repeats , # output_size
243243 bias = False ,
244244 params_dtype = torch .float16 )
245245 base_linear .weight .data = linear .weight .data
@@ -303,13 +303,13 @@ def create_column_parallel_packed_layer():
303303 repeats = repeats ,
304304 )
305305
306- # inputs: list[torch.Tensor] of size num_inputs. inputs[i] corresponds to a request which has several token of shape=[num_tokens, 256 ].
306+ # inputs: list[torch.Tensor] of size num_inputs. inputs[i] corresponds to a request which has several tokens of shape=[num_tokens, 64 ].
307307 # index_mapping: list[int]
308308 # prompt_mapping: list[int]
309309 inputs , index_mapping , prompt_mapping = create_random_inputs (
310310 active_lora_ids = list (lora_dict .keys ()),
311311 num_inputs = 32 ,
312- input_size = (1 , 256 ),
312+ input_size = (1 , 64 ),
313313 input_range = (0 , 1 ),
314314 input_type = torch .float16 ,
315315 device = 'cpu' )
@@ -372,7 +372,7 @@ def create_column_parallel_packed_layer():
372372 inputs , index_mapping , prompt_mapping = create_random_inputs (
373373 active_lora_ids = [0 ], # different from the above create_random_inputs
374374 num_inputs = 32 ,
375- input_size = (1 , 256 ),
375+ input_size = (1 , 64 ),
376376 input_range = (0 , 1 ),
377377 input_type = torch .float16 ,
378378 device = 'cpu' )
0 commit comments