Commit a73721a

updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
1 parent 31d807e commit a73721a

File tree

3 files changed: +10 -28 lines changed
Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 rm -rf local_storage/
 rm output.txt
 
-VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=6 python3 prefill_example.py
-VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=6 python3 decode_example.py
+VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=5 python3 prefill_example.py
+VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=5 python3 decode_example.py

vllm/attention/layer.py

Lines changed: 2 additions & 1 deletion

@@ -353,6 +353,7 @@ def maybe_save_kv_layer_to_connector(
 ):
     if not has_kv_transfer_group() or not is_v1_kv_transfer_group():
         return
+
     connector = get_kv_transfer_group()
 
     forward_context: ForwardContext = get_forward_context()
@@ -370,7 +371,7 @@ def unified_attention(
     value: torch.Tensor,
     layer_name: str,
 ) -> torch.Tensor:
-    # wait_for_kv_layer_from_connector(layer_name)
+    wait_for_kv_layer_from_connector(layer_name)
 
     forward_context: ForwardContext = get_forward_context()
     attn_metadata = forward_context.attn_metadata
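The second hunk re-enables the per-layer wait before attention runs, so decode blocks until the connector has finished loading that layer's KV. As a rough sketch (not the vLLM source), assuming the load-side helper mirrors the save-side guard shown above and that the v1 connector exposes a wait_for_layer_load(layer_name) method:

    # Sketch only: block until the connector has finished loading this layer's KV.
    def wait_for_kv_layer_from_connector(layer_name: str):
        # Same guard as the save-side helper above.
        if not has_kv_transfer_group() or not is_v1_kv_transfer_group():
            return
        connector = get_kv_transfer_group()
        # Assumed v1 connector API: waits for the async load of `layer_name`
        # to land in the paged KV cache before attention reads it.
        connector.wait_for_layer_load(layer_name)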

vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py

Lines changed: 6 additions & 25 deletions

@@ -197,12 +197,11 @@ def extract_kv_from_layer(
             Assume the shape of the layer is (2, num_pages, page_size, xxx).
             """
             num_pages, page_size = layer.shape[1], layer.shape[2]
-            reshaped = layer.reshape(2, num_pages * page_size, -1)
             print(f"{layer.shape=}")
-            print(f"{reshaped.shape=}")
-            print(f"{slot_mapping}")
-
-            return reshaped[:, slot_mapping, ...]
+            print(f"{layer.reshape(2, num_pages * page_size, -1)=}")
+            print(f"{slot_mapping.shape=}")
+            return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping,
+                                                               ...]
 
         connector_metadata = self._get_connector_metadata()
         assert isinstance(connector_metadata, SharedStorageConnectorMetadata)
@@ -212,8 +211,8 @@ def extract_kv_from_layer(
                 layer_name, request.token_ids)
             kv_cache = extract_kv_from_layer(kv_layer,
                                              request.slot_mapping)
-            assert False
-            # torch.ops.save_lib.save_safetensors(kv_cache, filename)
+            tensors = {"kv_cache": kv_cache.detach().cpu()}
+            safetensors.torch.save_file(tensors, filename)
 
     def wait_for_save(self):
         return
@@ -366,21 +365,3 @@ def align_to_block_size(num_tokens: int, block_size) -> int:
     """Align the number of tokens to the block size.
     """
     return (num_tokens - 1) // block_size * block_size
-
-
-# Register a custom library and print operator
-import torch
-from torch.library import Library, impl
-
-lib = Library("save_lib", "DEF")
-lib.define("save_safetensors(Tensor kv_cache, str filename) -> ()")
-
-
-@impl(lib, "save_safetensors", "CompositeExplicitAutograd")
-def save_safetensors(kv_cache, filename):
-    # tensors = {"kv_cache": kv_cache.detach().cpu()}
-    # kv_cache = kv_cache.cpu()
-    # tensors = {"kv_cache": kv_cache}
-    # safetensors.torch.save_file(tensors, filename)
-    a = torch.empty(10)
-    return
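The reworked extract_kv_from_layer flattens the paged cache so slot_mapping can gather one KV entry per token. A toy illustration of that gather, with made-up shapes:

    import torch

    # Toy paged KV layer: (K/V, num_pages, page_size, head_dim). Shapes are
    # invented for illustration; the real layer is vLLM's paged KV cache.
    num_pages, page_size, head_dim = 4, 16, 8
    layer = torch.randn(2, num_pages, page_size, head_dim)

    # Flatten pages so every token slot is addressable by one flat index.
    flat = layer.reshape(2, num_pages * page_size, -1)

    # slot_mapping holds one flat slot index per scheduled token.
    slot_mapping = torch.tensor([3, 17, 18, 40])
    kv_cache = flat[:, slot_mapping, ...]
    print(kv_cache.shape)  # torch.Size([2, 4, 8])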

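The save path now writes the extracted KV straight to disk with safetensors instead of routing through the custom op that the last hunk deletes. A round-trip sketch of how the decode side can read such a file back; the filename here is hypothetical:

    import safetensors.torch

    # Hypothetical path; the connector derives the real one from the layer
    # name and the request's token ids.
    filename = "local_storage/layer_0.safetensors"

    tensors = safetensors.torch.load_file(filename)
    kv_cache = tensors["kv_cache"]  # same (2, num_tokens, ...) layout saved above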