
Commit 01b21f4

xinyuangui2 and SheldonTsen authored and committed
Cache PyArrow schema operations (ray-project#58583)
## Description

This PR adds caching for PyArrow schema operations to improve performance during batching, especially for tables with a large number of columns.

### Main Changes

- **Caching for tensor type serialization/deserialization**: Added a cache for tensor type serialization and deserialization operations. This significantly reduces overhead for frequently accessed tensor types during schema operations.

### Performance Impact

This optimization is particularly beneficial when batching tables with a large number of columns. In one of our tests with 200 columns, the batching time per batch decreased from **0.30s to 0.11s** (a ~63% improvement).

#### Without cache

![Profile without cache](https://github.com/user-attachments/assets/46122634-dd09-40ed-a2a8-725d14f85728)

`__arrow_ext_deserialize__` and `__arrow_ext_serialize__` show up in several places in the profile. Each call to `__arrow_ext_deserialize__` creates a new object, and each call to `__arrow_ext_serialize__` performs an expensive pickle.

#### With cache

![Profile with cache](https://github.com/user-attachments/assets/50e77253-d69d-40d9-9e1f-56e9341bc131)

The time spent in `__arrow_ext_deserialize__` and `__arrow_ext_serialize__` is no longer a bottleneck.

---------

Signed-off-by: xgui <xgui@anyscale.com>
Signed-off-by: Xinyuan <43737116+xinyuangui2@users.noreply.github.com>
1 parent c25a789 commit 01b21f4
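To make the mechanism concrete, here is a minimal, self-contained sketch of the caching pattern this commit applies. The `CachedSerializeDeserialize` and `ShapeType` names and the plain-pickle payload are illustrative only, not code from the diff; the real classes below mix into `pa.ExtensionType` subclasses:

```python
import functools
import pickle
import threading


class CachedSerializeDeserialize:
    """Toy mixin: memoize serialization per instance, deserialization per (class, args)."""

    def __init__(self):
        self._serialize_cache = None          # bytes, computed at most once
        self._cache_lock = threading.RLock()  # guards the first computation

    def serialize(self) -> bytes:
        with self._cache_lock:
            if self._serialize_cache is None:
                self._serialize_cache = self._serialize_compute()
            return self._serialize_cache

    def _serialize_compute(self) -> bytes:
        raise NotImplementedError

    @classmethod
    @functools.lru_cache(maxsize=100_000)
    def _deserialize_cached(cls, serialized: bytes):
        # lru_cache keys on (cls, serialized), so different subclasses
        # sharing this single cache never collide.
        return cls._deserialize_compute(serialized)


class ShapeType(CachedSerializeDeserialize):
    def __init__(self, shape):
        super().__init__()
        self.shape = shape

    def _serialize_compute(self) -> bytes:
        return pickle.dumps(self.shape)  # the expensive step, now run once per instance

    @classmethod
    def _deserialize_compute(cls, serialized: bytes):
        return cls(pickle.loads(serialized))


t = ShapeType((2, 3))
blob = t.serialize()
assert t.serialize() is blob  # second call hits the instance-level cache
assert ShapeType._deserialize_cached(blob) is ShapeType._deserialize_cached(blob)
```

The same two ideas appear in the diff: a per-instance memo behind `__arrow_ext_serialize__`, and a single class-keyed `functools.lru_cache` behind `__arrow_ext_deserialize__`.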

File tree

4 files changed: +386 −21 lines changed


python/ray/air/util/tensor_extensions/arrow.py

Lines changed: 118 additions & 13 deletions
```diff
@@ -1,8 +1,11 @@
 import abc
+import functools
 import itertools
 import json
 import logging
 import sys
+import threading
+from abc import abstractmethod
 from datetime import datetime
 from enum import Enum
 from typing import Any, Collection, Dict, Iterable, List, Optional, Tuple, Union
@@ -62,6 +65,11 @@ class _SerializationFormat(Enum):
     else _SerializationFormat.CLOUDPICKLE  # default
 )
 
+# 100,000 entries, about 10MB in memory.
+# Most user tables should have fewer than 100K columns.
+ARROW_EXTENSION_SERIALIZATION_CACHE_MAXSIZE = env_integer(
+    "RAY_EXTENSION_SERIALIZATION_CACHE_MAXSIZE", 10**5
+)
 
 logger = logging.getLogger(__name__)
 
@@ -85,6 +93,88 @@ def _deserialize_with_fallback(serialized: bytes, field_name: str = "data"):
     )
 
 
+@DeveloperAPI(stability="beta")
+class ArrowExtensionSerializeDeserializeCache(abc.ABC):
+    """Base class for caching Arrow extension type serialization and deserialization.
+
+    Serialization and deserialization of Arrow extension types happen frequently,
+    so we cache the results here to improve performance.
+
+    The deserialization cache uses functools.lru_cache on a classmethod. There is
+    a single cache instance shared across all subclasses, but the cache key includes
+    the class (the cls parameter) as the first argument, so different subclasses get
+    different cache entries even when called with the same parameters. The cache is
+    thread-safe and has a maximum size limit to control memory usage. The cache key
+    is (cls, *args), where args are the parameters returned by
+    _get_deserialize_parameter().
+
+    Attributes:
+        _serialize_cache: Instance-level cache for serialization results.
+            This is a simple cached value (bytes) that is computed once per
+            instance and reused.
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """Initialize the extension type with caching support.
+
+        Args:
+            *args: Positional arguments passed to the parent class.
+            **kwargs: Keyword arguments passed to the parent class.
+        """
+        # Instance-level cache for serialization results, no TTL
+        self._serialize_cache = None
+        self._cache_lock = threading.RLock()
+        super().__init__(*args, **kwargs)
+
+    def __arrow_ext_serialize__(self) -> bytes:
+        """Serialize the extension type, returning the cached value when available."""
+        with self._cache_lock:
+            if self._serialize_cache is None:
+                self._serialize_cache = self._arrow_ext_serialize_compute()
+            return self._serialize_cache
+
+    @abstractmethod
+    def _arrow_ext_serialize_compute(self) -> bytes:
+        """Subclasses must implement this method to compute the serialization."""
+        ...
+
+    @classmethod
+    @functools.lru_cache(maxsize=ARROW_EXTENSION_SERIALIZATION_CACHE_MAXSIZE)
+    def _arrow_ext_deserialize_cache(cls: type, *args: Any, **kwargs: Any) -> Any:
+        """Deserialize the extension type using the class-level cache.
+
+        This method is cached with functools.lru_cache to improve performance
+        when deserializing extension types. The cache key includes the class (cls)
+        as the first argument, ensuring different subclasses get separate cache
+        entries.
+
+        Args:
+            *args: Positional arguments passed to _arrow_ext_deserialize_compute.
+            **kwargs: Keyword arguments passed to _arrow_ext_deserialize_compute.
+
+        Returns:
+            The deserialized extension type instance.
+        """
+        return cls._arrow_ext_deserialize_compute(*args, **kwargs)
+
+    @classmethod
+    @abstractmethod
+    def _arrow_ext_deserialize_compute(cls, *args: Any, **kwargs: Any) -> Any:
+        """Subclasses must implement this method to compute the deserialization."""
+        ...
+
+    @classmethod
+    @abstractmethod
+    def _get_deserialize_parameter(cls, storage_type, serialized) -> Tuple:
+        """Subclasses must implement this method to return the parameters used as
+        the deserialization cache key."""
+        ...
+
+    @classmethod
+    def __arrow_ext_deserialize__(cls, storage_type, serialized) -> Any:
+        """Deserialize the extension type, consulting the class-level cache."""
+        return cls._arrow_ext_deserialize_cache(
+            *cls._get_deserialize_parameter(storage_type, serialized)
+        )
+
+
 @DeveloperAPI
 class ArrowConversionError(Exception):
     """Error raised when there is an issue converting data to Arrow."""
@@ -431,7 +521,10 @@ def get_arrow_extension_variable_shape_tensor_types():
     return (ArrowVariableShapedTensorType,)
 
 
-class _BaseFixedShapeArrowTensorType(pa.ExtensionType, abc.ABC):
+# ArrowExtensionSerializeDeserializeCache needs to be first in the MRO to ensure the cache is used
+class _BaseFixedShapeArrowTensorType(
+    ArrowExtensionSerializeDeserializeCache, pa.ExtensionType
+):
     """
     Arrow ExtensionType for an array of fixed-shaped, homogeneous-typed
     tensors.
@@ -446,7 +539,6 @@ def __init__(
         self, shape: Tuple[int, ...], tensor_dtype: pa.DataType, ext_type_id: str
     ):
         self._shape = shape
-
         super().__init__(tensor_dtype, ext_type_id)
 
     @property
@@ -478,7 +570,7 @@ def __reduce__(self):
             self.__arrow_ext_serialize__(),
         )
 
-    def __arrow_ext_serialize__(self):
+    def _arrow_ext_serialize_compute(self):
         if ARROW_EXTENSION_SERIALIZATION_FORMAT == _SerializationFormat.CLOUDPICKLE:
             return cloudpickle.dumps(self._shape)
         elif ARROW_EXTENSION_SERIALIZATION_FORMAT == _SerializationFormat.JSON:
@@ -563,9 +655,13 @@ def __init__(self, shape: Tuple[int, ...], dtype: pa.DataType):
         super().__init__(shape, pa.list_(dtype), "ray.data.arrow_tensor")
 
     @classmethod
-    def __arrow_ext_deserialize__(cls, storage_type, serialized):
+    def _get_deserialize_parameter(cls, storage_type, serialized):
+        return (serialized, storage_type.value_type)
+
+    @classmethod
+    def _arrow_ext_deserialize_compute(cls, serialized, value_type):
         shape = tuple(_deserialize_with_fallback(serialized, "shape"))
-        return cls(shape, storage_type.value_type)
+        return cls(shape, value_type)
 
 
 @PublicAPI(stability="alpha")
@@ -586,9 +682,13 @@ def __init__(self, shape: Tuple[int, ...], dtype: pa.DataType):
         super().__init__(shape, pa.large_list(dtype), "ray.data.arrow_tensor_v2")
 
     @classmethod
-    def __arrow_ext_deserialize__(cls, storage_type, serialized):
+    def _get_deserialize_parameter(cls, storage_type, serialized):
+        return (serialized, storage_type.value_type)
+
+    @classmethod
+    def _arrow_ext_deserialize_compute(cls, serialized, value_type):
         shape = tuple(_deserialize_with_fallback(serialized, "shape"))
-        return cls(shape, storage_type.value_type)
+        return cls(shape, value_type)
 
 
 @PublicAPI(stability="beta")
@@ -878,8 +978,11 @@ def to_var_shaped_tensor_array(
         return target_type.wrap_array(storage)
 
 
+# ArrowExtensionSerializeDeserializeCache needs to be first in the MRO to ensure the cache is used
 @PublicAPI(stability="alpha")
-class ArrowVariableShapedTensorType(pa.ExtensionType):
+class ArrowVariableShapedTensorType(
+    ArrowExtensionSerializeDeserializeCache, pa.ExtensionType
+):
     """
     Arrow ExtensionType for an array of heterogeneous-shaped, homogeneous-typed
     tensors.
@@ -906,7 +1009,6 @@ def __init__(self, dtype: pa.DataType, ndim: int):
             ndim: The number of dimensions in the tensor elements.
         """
         self._ndim = ndim
-
         super().__init__(
             pa.struct(
                 [("data", pa.large_list(dtype)), ("shape", pa.list_(self.OFFSET_DTYPE))]
@@ -949,7 +1051,7 @@ def __reduce__(self):
             self.__arrow_ext_serialize__(),
         )
 
-    def __arrow_ext_serialize__(self):
+    def _arrow_ext_serialize_compute(self):
         if ARROW_EXTENSION_SERIALIZATION_FORMAT == _SerializationFormat.CLOUDPICKLE:
             return cloudpickle.dumps(self._ndim)
         elif ARROW_EXTENSION_SERIALIZATION_FORMAT == _SerializationFormat.JSON:
@@ -960,10 +1062,13 @@ def __arrow_ext_serialize__(self):
         )
 
     @classmethod
-    def __arrow_ext_deserialize__(cls, storage_type, serialized):
+    def _get_deserialize_parameter(cls, storage_type, serialized):
+        return (serialized, storage_type["data"].type.value_type)
+
+    @classmethod
+    def _arrow_ext_deserialize_compute(cls, serialized, value_type):
         ndim = _deserialize_with_fallback(serialized, "ndim")
-        dtype = storage_type["data"].type.value_type
-        return cls(dtype, ndim)
+        return cls(value_type, ndim)
 
     def __arrow_ext_class__(self):
         """
```

python/ray/air/util/tensor_extensions/utils.py

Lines changed: 8 additions & 1 deletion
```diff
@@ -1,5 +1,12 @@
 import warnings
-from typing import TYPE_CHECKING, Any, List, Protocol, Sequence, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    List,
+    Protocol,
+    Sequence,
+    Union,
+)
 
 import numpy as np
```

python/ray/data/_internal/arrow_ops/transform_pyarrow.py

Lines changed: 4 additions & 6 deletions
```diff
@@ -306,11 +306,10 @@ def unify_schemas(
     # Deduplicate schemas. Calling this before PyArrow's unify_schemas is more efficient (100x faster).
 
     # Remove metadata for hashability
-    schemas[0].remove_metadata()
+    schema_to_compare = schemas[0].remove_metadata()
     schemas_to_unify = [schemas[0]]
     for schema in schemas[1:]:
-        schema.remove_metadata()
-        if not schema.equals(schemas[0]):
+        if not schema.remove_metadata().equals(schema_to_compare):
             schemas_to_unify.append(schema)
 
     pyarrow_exception = None
@@ -670,9 +669,8 @@ def _concat_cols_with_native_pyarrow_types(
     # NOTE: Type promotions aren't available in Arrow < 14.0
     subset_blocks = []
     for block in blocks:
-        cols_to_select = [
-            col_name for col_name in col_names if col_name in block.schema.names
-        ]
+        block_cols = set(block.schema.names)
+        cols_to_select = [col_name for col_name in col_names if col_name in block_cols]
         subset_blocks.append(block.select(cols_to_select))
     if get_pyarrow_version() < parse_version("14.0.0"):
         table = pa.concat_tables(subset_blocks, promote=True)
```
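The `unify_schemas` hunk above fixes a subtle bug: `pa.Schema.remove_metadata()` returns a new schema rather than mutating in place, so the old code discarded the result and kept comparing schemas with their metadata still attached. A small demonstration with a made-up schema:

```python
import pyarrow as pa

schema = pa.schema([pa.field("x", pa.int64())]).with_metadata({b"k": b"v"})

schema.remove_metadata()                # the old bug: the returned schema is discarded,
assert schema.metadata == {b"k": b"v"}  # so `schema` still carries its metadata

stripped = schema.remove_metadata()     # the fix: bind the new, metadata-free schema
assert stripped.metadata is None
```

The `_concat_cols_with_native_pyarrow_types` hunk is a standard micro-optimization: membership tests against `set(block.schema.names)` are O(1) per column, versus an O(n) list scan inside the comprehension.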
