[Core] Support Zero-Copy Serialization for Read-Only Tensors (#57639)

KaisennHu · israbbani · web-flow · commit 9f19049da237 · 2025-11-25T20:47:31.000-06:00
Enable zero-copy serialization for all PyTorch tensors by setting `RAY_ENABLE_ZERO_COPY_TORCH_TENSORS=1` to accelerate serialization.

Example test script:

```python
import os

# Must be set before `import ray` to ensure that the zero-copy tensor pickle reducer
# is properly registered in driver.
os.environ["RAY_ENABLE_ZERO_COPY_TORCH_TENSORS"] = "1"

import ray
import torch
from datetime import datetime

ray.init(runtime_env={"env_vars": {"RAY_ENABLE_ZERO_COPY_TORCH_TENSORS": "1"}})


@ray.remote
def process(tensor):
    return tensor.sum()


x = torch.ones(1024, 1024, 256)
start_time = datetime.now()
x_ref = process.remote(x)
result = ray.get(x_ref)
time_diff = datetime.now() - start_time
print(f"result : {result}")
print(f"between time: {time_diff.total_seconds()}s")
print(f"result type : {type(result)}")
```

Below are the performance gains and validation results:

<img width="1977" height="965" alt="zuizhongxiaoguo" src="https://github.com/user-attachments/assets/e3d5210c-142d-4ec3-908c-fe590514cfc8" />

Closes #56740 #26229

---------

Signed-off-by: Haichuan Hu <kaisennhu@gmail.com>
Co-authored-by: Ibrahim Rabbani <irabbani@anyscale.com>
diff --git a/python/ray/_common/test_utils.py b/python/ray/_common/test_utils.py
@@ -247,3 +247,14 @@ def f():
         assert all(
             [extra_usage_tags[k] == v for k, v in expected_extra_usage_tags.items()]
         ), extra_usage_tags
+
+
+def is_named_tuple(cls):
+    """Return True if cls is a namedtuple and False otherwise."""
+    b = cls.__bases__
+    if len(b) != 1 or b[0] is not tuple:
+        return False
+    f = getattr(cls, "_fields", None)
+    if not isinstance(f, tuple):
+        return False
+    return all(type(n) is str for n in f)
diff --git a/python/ray/_private/ray_constants.py b/python/ray/_private/ray_constants.py
@@ -590,3 +590,28 @@ def gcs_actor_scheduling_enabled():
 RDT_FETCH_FAIL_TIMEOUT_SECONDS = (
     env_integer("RAY_rdt_fetch_fail_timeout_milliseconds", 60000) / 1000
 )
+
+# Whether to enable zero-copy serialization for PyTorch tensors.
+# When enabled, Ray serializes PyTorch tensors by converting them to NumPy arrays
+# and leveraging pickle5's zero-copy buffer sharing. This avoids copying the
+# underlying tensor data, which can improve performance when passing large tensors
+# across tasks or actors. Note that this is experimental and should be used with caution:
+# the tensor data is not copied, and writes to the shared memory are not prevented, so
+# one process changing a tensor after ray.get could be reflected in another process.
+#
+# This feature is experimental and works best under the following conditions:
+# - The tensor has `requires_grad=False` (i.e., is detached from the autograd graph).
+# - The tensor is contiguous in memory.
+# - The tensor resides in CPU memory (the performance benefits are larger there).
+# - You are not using Ray Direct Transport
+#
+# Tensors on GPU or non-contiguous tensors are still supported: Ray will
+# automatically move them to CPU and/or make them contiguous as needed.
+# While this incurs an initial copy, subsequent serialization may still benefit
+# from reduced overhead compared to the default path.
+#
+# Use with caution and ensure tensors meet the above criteria before enabling.
+# Default: False.
+RAY_ENABLE_ZERO_COPY_TORCH_TENSORS = env_bool(
+    "RAY_ENABLE_ZERO_COPY_TORCH_TENSORS", False
+)
diff --git a/python/ray/_private/serialization.py b/python/ray/_private/serialization.py
@@ -1,6 +1,7 @@
 import logging
 import threading
 import traceback
+import warnings
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 if TYPE_CHECKING:
@@ -11,7 +12,10 @@
 import ray._private.utils
 import ray.cloudpickle as pickle
 import ray.exceptions
-from ray._private import ray_constants
+from ray._private import (
+    ray_constants,
+    tensor_serialization_utils,
+)
 from ray._raylet import (
     DynamicObjectRefGenerator,
     MessagePackSerializedObject,
@@ -159,6 +163,28 @@ def __init__(self, worker):
         # instead of the normal serialize -> object store -> deserialize codepath.
         self._torch_custom_serializer_registered = False
 
+        # Enable zero-copy serialization of tensors if the environment variable is set.
+        self._zero_copy_tensors_enabled = (
+            ray_constants.RAY_ENABLE_ZERO_COPY_TORCH_TENSORS
+        )
+        if self._zero_copy_tensors_enabled:
+            try:
+                import torch
+
+                self._register_cloudpickle_reducer(
+                    torch.Tensor, tensor_serialization_utils.zero_copy_tensors_reducer
+                )
+            except ImportError:
+                # Warn and disable zero-copy tensor serialization when PyTorch is missing,
+                # even if RAY_ENABLE_ZERO_COPY_TORCH_TENSORS is set.
+                warnings.warn(
+                    "PyTorch is not installed. Disabling zero-copy tensor serialization "
+                    "even though RAY_ENABLE_ZERO_COPY_TORCH_TENSORS is set.",
+                    tensor_serialization_utils.ZeroCopyTensorsWarning,
+                    stacklevel=3,
+                )
+                self._zero_copy_tensors_enabled = False
+
         def actor_handle_reducer(obj):
             ray._private.worker.global_worker.check_connected()
             serialized, actor_handle_id, weak_ref = obj._serialization_helper()
diff --git a/python/ray/_private/tensor_serialization_utils.py b/python/ray/_private/tensor_serialization_utils.py
@@ -0,0 +1,145 @@
+import warnings
+from typing import TYPE_CHECKING, Any, Tuple
+
+if TYPE_CHECKING:
+    import numpy as np
+    import torch
+
+
+class ZeroCopyTensorsWarning(UserWarning):
+    """
+    Warning for unsafe or failed zero-copy tensor serialization/deserialization.
+    """
+
+    pass
+
+
+warnings.filterwarnings("once", category=ZeroCopyTensorsWarning)
+
+
+def _zero_copy_tensors_deserializer(
+    np_array: "np.ndarray", dtype_str: str, shape: Tuple[int, ...], device_str: str
+) -> "torch.Tensor":
+    """
+    Reconstructs a torch.Tensor from a zero-copy NumPy byte array.
+
+    Args:
+        np_array: 1D uint8 NumPy array of the original tensor's raw bytes.
+        dtype_str: Full string representation of the original tensor's dtype (e.g., 'torch.float32').
+        shape: The original shape of the tensor before serialization.
+        device_str: String representation of the original device (e.g., 'cpu', 'cuda:0').
+
+    Returns:
+        Reconstructed torch.Tensor on the specified device.
+
+    Raises:
+        ImportError: If PyTorch is not installed.
+        DeserializationError: If reconstruction fails for any other reason (e.g.,
+            dtype mismatch, shape inconsistency, device error, etc.).
+    """
+    try:
+        import torch
+    except ImportError as e:
+        raise ImportError(
+            "Zero-copy tensor deserialization failed: PyTorch is not installed."
+        ) from e
+
+    try:
+        # Step 1: Convert uint8 numpy array back to torch tensor
+        uint8_tensor = torch.from_numpy(np_array)
+
+        # Step 2: Restore original dtype
+        dtype_name = dtype_str.split(".")[-1]
+        if not hasattr(torch, dtype_name):
+            raise ValueError(f"Invalid or unsupported dtype string: {dtype_str}")
+        original_dtype = getattr(torch, dtype_name)
+
+        # Compute number of bytes per element
+        dtype_size = torch.tensor([], dtype=original_dtype).element_size()
+        if np_array.size % dtype_size != 0:
+            raise ValueError(
+                f"Byte array size ({np_array.size}) is not divisible by "
+                f"dtype size ({dtype_size}) for dtype {dtype_str}"
+            )
+
+        # Step 3: Reshape and reinterpret bytes as target dtype
+        restored_tensor = uint8_tensor.view(original_dtype).reshape(shape)
+
+        # Step 4: Move to target device
+        return restored_tensor.to(device=device_str)
+
+    except Exception as e:
+        from ray._private.serialization import DeserializationError
+
+        raise DeserializationError(
+            f"Failed to deserialize zero-copy tensor from byte array. "
+            f"Input dtype={dtype_str}, shape={shape}, device={device_str}. "
+            f"Underlying error: {type(e).__name__}: {e}"
+        ) from e
+
+
+def zero_copy_tensors_reducer(tensor: "torch.Tensor") -> Tuple[Any, Tuple[Any, ...]]:
+    """Pickle serializer for zero-copy serialization of read-only torch.Tensor.
+
+    This serializer aims to avoid copying tensor data by using a NumPy uint8 view,
+    which enables pickle5's out-of-band buffer transmission. However, true zero-copy
+    is only possible when the input tensor is already:
+
+    - On CPU,
+    - Detached from the computation graph (no gradients),
+    - Contiguous in memory.
+
+    If the input tensor does **not** meet these conditions, this function will:
+
+    - Call `.detach()` to remove gradient information,
+    - Move the tensor to CPU (copying data if it's on GPU or another device),
+    - Make the tensor contiguous (copying data if it's non-contiguous).
+
+    These operations may incur one or two full copies of the tensor data, negating
+    zero-copy benefits. Only the non-contiguous case triggers a dedicated warning.
+
+    Args:
+        tensor: The input torch.Tensor to serialize. Can be on any device,
+                with or without gradients, contiguous or not — but zero-copy
+                is only achieved if it is already CPU, detached, and contiguous.
+
+    Returns:
+        A tuple (deserializer_callable, args_tuple) suitable for pickle.
+    """
+    warnings.warn(
+        "Zero-copy tensor serialization is enabled, but it only works safely for read-only tensors "
+        "(detached, no gradients, contiguous). Modifiable or non-contiguous tensors may cause data corruption.",
+        ZeroCopyTensorsWarning,
+        stacklevel=3,
+    )
+
+    import torch
+
+    # Detach the tensor from gradients and computation graph.
+    # Move it to CPU (this is a no-op if the tensor is already in main memory, but will create a copy if
+    # the tensor is on an accelerator).
+    # Ensure that the tensor is contiguous. If the tensor is not contiguous, this will create a contiguous
+    # copy.
+    cpu_tensor = tensor.detach().cpu()
+    if not cpu_tensor.is_contiguous():
+        warnings.warn(
+            "The input tensor is non-contiguous. A copy will be made to ensure contiguity. "
+            "For zero-copy serialization, please ensure the tensor is contiguous before passing it "
+            "(e.g., by calling `.contiguous()`).",
+            ZeroCopyTensorsWarning,
+            stacklevel=3,
+        )
+        cpu_tensor = cpu_tensor.contiguous()
+
+    # Flatten to 1D for safe uint8 view (handles scalars)
+    flat_tensor = cpu_tensor.reshape(-1)
+    # View as uint8 bytes
+    uint8_view = flat_tensor.view(torch.uint8)
+    np_array = uint8_view.numpy()
+
+    return _zero_copy_tensors_deserializer, (
+        np_array,
+        str(tensor.dtype),
+        tuple(tensor.shape),
+        str(tensor.device),
+    )
diff --git a/python/ray/tests/BUILD.bazel b/python/ray/tests/BUILD.bazel
@@ -640,6 +640,7 @@ py_test_module_list(
     files = [
         "gpu_objects/test_gpu_objects_nccl.py",
         "gpu_objects/test_gpu_objects_nixl.py",
+        "test_tensor_zero_copy_serialization.py",
     ],
     tags = [
         "custom_setup",
diff --git a/python/ray/tests/test_serialization.py b/python/ray/tests/test_serialization.py
@@ -16,21 +16,11 @@
 import ray.cluster_utils
 import ray.exceptions
 from ray import cloudpickle
+from ray._common.test_utils import is_named_tuple
 
 logger = logging.getLogger(__name__)
 
 
-def is_named_tuple(cls):
-    """Return True if cls is a namedtuple and False otherwise."""
-    b = cls.__bases__
-    if len(b) != 1 or b[0] is not tuple:
-        return False
-    f = getattr(cls, "_fields", None)
-    if not isinstance(f, tuple):
-        return False
-    return all(type(n) is str for n in f)
-
-
 @pytest.mark.parametrize(
     "ray_start_regular", [{"local_mode": True}, {"local_mode": False}], indirect=True
 )
diff --git a/python/ray/tests/test_tensor_zero_copy_serialization.py b/python/ray/tests/test_tensor_zero_copy_serialization.py