
Commit 6dd3776

[data] Fix reading from zipped json (#58214)
## Description

### Status Quo

PR #54667 addressed OOM issues by sampling a few lines of the file. However, that code assumes the input file is seekable (i.e., not compressed), so zipped files break, as reported in #55356.

### Potential Workaround

- Refactor the code shared between JsonDatasource and FileDatasource.
- Default the chunk size to 10000 when a zipped (non-seekable) file is found.

## Related issues

#55356

---------
Signed-off-by: iamjustinhsu <jhsu@anyscale.com>
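For context, a minimal sketch of the scenario this change fixes; the file path below is hypothetical, and the exact internal datasource exercised depends on the Ray Data configuration.

import gzip

import ray

# Write a small gzipped, newline-delimited JSON file (hypothetical path).
with gzip.open("/tmp/example.json.gz", "wt") as f:
    f.write('{"a": 1}\n{"a": 2}\n')

# Before this change, chunk-size estimation assumed a seekable (uncompressed)
# input, so reading a compressed file like this could fail (#55356). With the
# change, a default chunk size of 10000 is used for non-seekable inputs.
ds = ray.data.read_json("/tmp/example.json.gz")
print(ds.take_all())  # [{'a': 1}, {'a': 2}]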
1 parent 92d8471 commit 6dd3776

File tree: 4 files changed (+98 -46 lines)


python/ray/data/_internal/datasource/json_datasource.py

Lines changed: 14 additions & 4 deletions
@@ -171,6 +171,9 @@ class PandasJSONDatasource(FileBasedDatasource):
     # reads bigger blocks at once.
     _BUFFER_SIZE = 1024**2
 
+    # In the case of zipped json files, we cannot infer the chunk_size.
+    _DEFAULT_CHUNK_SIZE = 10000
+
     def __init__(
         self,
         paths: Union[str, List[str]],
@@ -200,6 +203,9 @@ def _estimate_chunksize(self, f: "pyarrow.NativeFile") -> Optional[int]:
 
         This is necessary to avoid OOMs while reading the file.
         """
+
+        if not f.seekable():
+            return self._DEFAULT_CHUNK_SIZE
         assert f.tell() == 0, "File pointer must be at the beginning"
 
         if self._target_output_size_bytes is None:
@@ -230,10 +236,14 @@ def _open_input_source(
         path: str,
         **open_args,
     ) -> "pyarrow.NativeFile":
-        # Use seekable file so we can reset the file after sampling the first row.
-        file = filesystem.open_input_file(path, **open_args)
-        assert file.seekable(), "File must be seekable"
-        return file
+
+        compression = self.resolve_compression(path, open_args)
+
+        if compression is None:
+            # We use a seekable file to estimate chunksize.
+            return filesystem.open_input_file(path)
+
+        return super()._open_input_source(filesystem, path, **open_args)
 
 
 def _cast_range_index_to_string(df: pd.DataFrame):
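As a standalone sketch (not the datasource code itself) of why the seekability check above is needed: chunk-size estimation samples the head of the file and then rewinds, which only works on seekable handles. Assuming a PyArrow compressed input stream reports itself as non-seekable, the new code falls back to the fixed default.

import gzip

import pyarrow as pa

DEFAULT_CHUNK_SIZE = 10000  # mirrors _DEFAULT_CHUNK_SIZE above

payload = gzip.compress(b'{"a": 1}\n{"a": 2}\n')
stream = pa.CompressedInputStream(pa.BufferReader(payload), "gzip")

# Sampling the head and then calling seek(0) only works on seekable handles.
if stream.seekable():
    chunksize = 1000  # placeholder for the sampled estimate
else:
    chunksize = DEFAULT_CHUNK_SIZE

print(chunksize)  # 10000 for the compressed stream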

python/ray/data/_internal/util.py

Lines changed: 19 additions & 0 deletions
@@ -36,6 +36,7 @@
 import ray
 from ray._common.retry import call_with_retry
 from ray.data.context import DEFAULT_READ_OP_MIN_NUM_BLOCKS, WARN_PREFIX, DataContext
+from ray.util.annotations import DeveloperAPI
 
 import psutil
 
@@ -1712,3 +1713,21 @@ def merge_resources_to_ray_remote_args(
     if memory is not None:
         ray_remote_args["memory"] = memory
     return ray_remote_args
+
+
+@DeveloperAPI
+def infer_compression(path: str) -> Optional[str]:
+    import pyarrow as pa
+
+    compression = None
+    try:
+        # Try to detect compression codec from path.
+        compression = pa.Codec.detect(path).name
+    except (ValueError, TypeError):
+        # Arrow's compression inference on the file path doesn't work for
+        # Snappy, so we double-check ourselves.
+        import pathlib
+
+        suffix = pathlib.Path(path).suffix
+        if suffix and suffix[1:] == "snappy":
+            compression = "snappy"
+    return compression
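A rough usage sketch of the new helper; the paths are hypothetical, the non-Snappy codec names come from pyarrow.Codec.detect, and the commented return values are illustrative.

from ray.data._internal.util import infer_compression

print(infer_compression("s3://bucket/data.json.gz"))  # e.g. "gzip" (via pyarrow.Codec.detect)
print(infer_compression("data.json.snappy"))          # "snappy" (via the suffix fallback)
print(infer_compression("data.json"))                 # None (no compression detected)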

python/ray/data/datasource/file_based_datasource.py

Lines changed: 53 additions & 39 deletions
@@ -22,6 +22,7 @@
     RetryingPyFileSystem,
     _check_pyarrow_version,
     _is_local_scheme,
+    infer_compression,
     iterate_with_retry,
     make_async_gen,
 )
@@ -321,6 +322,50 @@ def read_task_fn():
 
         return read_tasks
 
+    def resolve_compression(
+        self, path: str, open_args: Dict[str, Any]
+    ) -> Optional[str]:
+        """Resolves the compression format for a stream.
+
+        Args:
+            path: The file path to resolve compression for.
+            open_args: kwargs passed to
+                `pyarrow.fs.FileSystem.open_input_stream <https://arrow.apache.org/docs/python/generated/pyarrow.fs.FileSystem.html#pyarrow.fs.FileSystem.open_input_stream>`_
+                when opening input files to read.
+
+        Returns:
+            The compression format (e.g., "gzip", "snappy", "bz2") or None if
+            no compression is detected or specified.
+        """
+        compression = open_args.get("compression", None)
+        if compression is None:
+            compression = infer_compression(path)
+        return compression
+
+    def _resolve_buffer_size(self, open_args: Dict[str, Any]) -> Optional[int]:
+        buffer_size = open_args.pop("buffer_size", None)
+        if buffer_size is None:
+            buffer_size = self._data_context.streaming_read_buffer_size
+        return buffer_size
+
+    def _file_to_snappy_stream(
+        self,
+        file: "pyarrow.NativeFile",
+        filesystem: "RetryingPyFileSystem",
+    ) -> "pyarrow.PythonFile":
+        import pyarrow as pa
+        import snappy
+        from pyarrow.fs import HadoopFileSystem
+
+        stream = io.BytesIO()
+        if isinstance(filesystem.unwrap(), HadoopFileSystem):
+            snappy.hadoop_snappy.stream_decompress(src=file, dst=stream)
+        else:
+            snappy.stream_decompress(src=file, dst=stream)
+        stream.seek(0)
+
+        return pa.PythonFile(stream, mode="r")
+
     def _open_input_source(
         self,
         filesystem: "RetryingPyFileSystem",
@@ -336,53 +381,22 @@ def _open_input_source(
         Implementations that do not support streaming reads (e.g. that require random
         access) should override this method.
         """
-        import pyarrow as pa
-        from pyarrow.fs import HadoopFileSystem
 
-        compression = open_args.get("compression", None)
-        if compression is None:
-            try:
-                # If no compression manually given, try to detect
-                # compression codec from path.
-                compression = pa.Codec.detect(path).name
-            except (ValueError, TypeError):
-                # Arrow's compression inference on the file path
-                # doesn't work for Snappy, so we double-check ourselves.
-                import pathlib
-
-                suffix = pathlib.Path(path).suffix
-                if suffix and suffix[1:] == "snappy":
-                    compression = "snappy"
-                else:
-                    compression = None
-
-        buffer_size = open_args.pop("buffer_size", None)
-        if buffer_size is None:
-            buffer_size = self._data_context.streaming_read_buffer_size
+        compression = self.resolve_compression(path, open_args)
+        buffer_size = self._resolve_buffer_size(open_args)
 
         if compression == "snappy":
             # Arrow doesn't support streaming Snappy decompression since the canonical
             # C++ Snappy library doesn't natively support streaming decompression. We
             # works around this by manually decompressing the file with python-snappy.
             open_args["compression"] = None
-        else:
-            open_args["compression"] = compression
-
-        file = filesystem.open_input_stream(path, buffer_size=buffer_size, **open_args)
-
-        if compression == "snappy":
-            import snappy
-
-            stream = io.BytesIO()
-            if isinstance(filesystem.unwrap(), HadoopFileSystem):
-                snappy.hadoop_snappy.stream_decompress(src=file, dst=stream)
-            else:
-                snappy.stream_decompress(src=file, dst=stream)
-            stream.seek(0)
-
-            file = pa.PythonFile(stream, mode="r")
+            file = filesystem.open_input_stream(
+                path, buffer_size=buffer_size, **open_args
+            )
+            return self._file_to_snappy_stream(file, filesystem)
 
-        return file
+        open_args["compression"] = compression
+        return filesystem.open_input_stream(path, buffer_size=buffer_size, **open_args)
 
     def _rows_per_file(self):
         """Returns the number of rows per file, or None if unknown."""

python/ray/data/tests/test_json.py

Lines changed: 12 additions & 3 deletions
@@ -528,13 +528,22 @@ class TestPandasJSONDatasource:
         [{"a": []}, {"a": [1]}, {"a": [1, 2, 3]}],
         ids=["empty", "single", "multiple"],
     )
+    @pytest.mark.parametrize(
+        "compression,filename",
+        [("gzip", "test.json.gz"), ("infer", "test.json")],  # infer = default
+    )
     def test_read_stream(
-        self, data, tmp_path, target_max_block_size_infinite_or_default
+        self,
+        data,
+        tmp_path,
+        compression,
+        filename,
+        target_max_block_size_infinite_or_default,
     ):
         # Setup test file.
         df = pd.DataFrame(data)
-        path = os.path.join(tmp_path, "test.json")
-        df.to_json(path, orient="records", lines=True)
+        path = os.path.join(tmp_path, filename)
+        df.to_json(path, orient="records", lines=True, compression=compression)
 
         # Setup datasource.
         local_filesystem = fs.LocalFileSystem()