From 8ef918a390a521b4455d72c7b81b3785c066938b Mon Sep 17 00:00:00 2001 From: Alexey Kudinkin Date: Tue, 12 Nov 2024 17:03:29 -0800 Subject: [PATCH] [Data] Fixing `DelegatingBlockBuilder` to avoid re-serializing objects multiple times (#48509) Currently, we're serializing the first row in every block twice when adding it through `DelegatingBlockBuilder`, carrying tangible overhead and impact on latency for large enough rows. Provided that `ArrowBlockBuilder` is now able to handle arbitrary Python objects, we can just deprecate `DelegatingBlockBuilder` altogether. --------- Signed-off-by: Alexey Kudinkin --- .../ray/data/_internal/delegating_block_builder.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/python/ray/data/_internal/delegating_block_builder.py b/python/ray/data/_internal/delegating_block_builder.py index ab33c1aa4b3a..4655a8e24148 100644 --- a/python/ray/data/_internal/delegating_block_builder.py +++ b/python/ray/data/_internal/delegating_block_builder.py @@ -1,10 +1,8 @@ import collections from typing import Any, Mapping, Optional -from ray.air.util.tensor_extensions.arrow import ArrowConversionError from ray.data._internal.arrow_block import ArrowBlockBuilder from ray.data._internal.block_builder import BlockBuilder -from ray.data._internal.pandas_block import PandasBlockBuilder from ray.data.block import Block, BlockAccessor, BlockType, DataBatch @@ -23,17 +21,8 @@ def _inferred_block_type(self) -> Optional[BlockType]: def add(self, item: Mapping[str, Any]) -> None: assert isinstance(item, collections.abc.Mapping), item - import pyarrow - if self._builder is None: - try: - check = ArrowBlockBuilder() - check.add(item) - check.build() - self._builder = ArrowBlockBuilder() - except (TypeError, pyarrow.lib.ArrowInvalid, ArrowConversionError): - # Can also handle nested Python objects, which Arrow cannot. - self._builder = PandasBlockBuilder() + self._builder = ArrowBlockBuilder() self._builder.add(item)