71 changes: 60 additions & 11 deletions python/ray/data/_internal/datasource/json_datasource.py
@@ -1,5 +1,5 @@
+import io
 import logging
-from io import BytesIO
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
 import pandas as pd
@@ -82,7 +82,7 @@ def _read_with_pyarrow_read_json(self, buffer: "pyarrow.lib.Buffer"):
         while True:
             try:
                 yield pajson.read_json(
-                    BytesIO(buffer),
+                    io.BytesIO(buffer),
                     read_options=self.read_options,
                     **self.arrow_json_args,
                 )
@@ -124,7 +124,7 @@ def _read_with_python_json(self, buffer: "pyarrow.lib.Buffer"):
         if buffer.size == 0:
             return
 
-        parsed_json = json.load(BytesIO(buffer))
+        parsed_json = json.load(io.BytesIO(buffer))
         try:
             yield pa.Table.from_pylist(parsed_json)
         except AttributeError as e:
@@ -159,6 +159,15 @@ def _read_stream(self, f: "pyarrow.NativeFile", path: str):
 
 
 class PandasJSONDatasource(FileBasedDatasource):
+
+    # Buffer size in bytes for reading files. Default is 1MB.
+    #
+    # pandas reads data in small chunks (~8 KiB), which leads to many costly
+    # small read requests when accessing cloud storage. To reduce overhead and
+    # improve performance, we wrap the file in a larger buffered reader that
+    # reads bigger blocks at once.
+    _BUFFER_SIZE = 1024**2
+
     def __init__(
         self,
         paths: Union[str, List[str]],
@@ -171,22 +180,31 @@ def __init__(
 
     def _read_stream(self, f: "pyarrow.NativeFile", path: str):
         chunksize = self._estimate_chunksize(f)
-        with pd.read_json(f, chunksize=chunksize, lines=True) as reader:
+        stream = StrictBufferedReader(f, buffer_size=self._BUFFER_SIZE)
+        with pd.read_json(stream, chunksize=chunksize, lines=True) as reader:
             for df in reader:
                 yield _cast_range_index_to_string(df)
 
     def _estimate_chunksize(self, f: "pyarrow.NativeFile") -> int:
         """Estimate the chunksize by sampling the first row.
 
         This is necessary to avoid OOMs while reading the file.
         """
         assert f.tell() == 0, "File pointer must be at the beginning"
 
-        with pd.read_json(f, chunksize=1, lines=True) as reader:
-            df = _cast_range_index_to_string(next(reader))
+        stream = StrictBufferedReader(f, buffer_size=self._BUFFER_SIZE)
+        with pd.read_json(stream, chunksize=1, lines=True) as reader:
+            try:
+                df = _cast_range_index_to_string(next(reader))
+            except StopIteration:
+                return 1
 
         block_accessor = PandasBlockAccessor.for_block(df)
         if block_accessor.num_rows() == 0:
-            return 1
-
-        bytes_per_row = block_accessor.size_bytes() / block_accessor.num_rows()
-        chunksize = max(round(self._target_output_size_bytes / bytes_per_row), 1)
+            chunksize = 1
+        else:
+            bytes_per_row = block_accessor.size_bytes() / block_accessor.num_rows()
+            chunksize = max(round(self._target_output_size_bytes / bytes_per_row), 1)
 
         # Reset file pointer to the beginning.
         f.seek(0)
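
For intuition, the chunk-size estimate above works out as follows. The numbers are made up for illustration only; the real _target_output_size_bytes comes from the datasource's configuration and is not shown in this diff:

    # Hypothetical values, purely to illustrate the formula used above.
    target_output_size_bytes = 128 * 1024**2  # suppose the target output block is 128 MiB
    bytes_per_row = 512                       # suppose the sampled row occupies ~512 bytes

    chunksize = max(round(target_output_size_bytes / bytes_per_row), 1)
    print(chunksize)  # 262144 rows per pandas chunk
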
@@ -199,7 +217,7 @@ def _open_input_source(
         path: str,
         **open_args,
     ) -> "pyarrow.NativeFile":
-        # Use seekable file to ensure we can correctly sample the first row.
+        # Use seekable file so we can reset the file after sampling the first row.
         file = filesystem.open_input_file(path, **open_args)
         assert file.seekable(), "File must be seekable"
         return file
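
The seekable requirement is what makes the f.seek(0) reset after sampling possible. A minimal sketch of that sample-then-rewind pattern, using a hypothetical local path in place of the cloud storage path the datasource normally receives:

    import pyarrow.fs as fs

    filesystem = fs.LocalFileSystem()

    # open_input_file returns a random-access handle, so the caller can peek at
    # the beginning of the file and then rewind to read the whole thing.
    f = filesystem.open_input_file("/tmp/rows.jsonl")  # hypothetical path
    assert f.seekable(), "File must be seekable"

    sample = f.read(8 * 1024)  # sample the start of the file
    f.seek(0)                  # reset, as the datasource does after sampling
    everything = f.read()
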
@@ -211,3 +229,34 @@ def _cast_range_index_to_string(df: pd.DataFrame):
     if isinstance(df.columns, pd.RangeIndex):
         df.columns = df.columns.astype(str)
     return df
+
+
+class StrictBufferedReader(io.RawIOBase):
+    """Wrapper that prevents premature file closure and ensures full-buffered reads.
+
+    This is necessary for two reasons:
+    1. The datasource reads the file twice -- first to sample and determine the chunk
+       size, and again to load the actual data. Since pandas assumes ownership of the
+       file and may close it, we prevent that by explicitly detaching the underlying
+       file before closing the buffer.
+
+    2. pandas wraps the file in a TextIOWrapper to decode bytes into text. TextIOWrapper
+       prefers calling read1(), which doesn't prefetch for random-access files
+       (e.g., from PyArrow). This wrapper forces all reads through the full buffer to
+       avoid inefficient small-range S3 GETs.
+    """
+
+    def __init__(self, file: io.RawIOBase, buffer_size: int):
+        self._file = io.BufferedReader(file, buffer_size=buffer_size)
+
+    def read(self, size=-1, /):
+        return self._file.read(size)
+
+    def readable(self) -> bool:
+        return True
+
+    def close(self):
+        if not self.closed:
+            self._file.detach()
+            self._file.close()
+        super().close()

Review thread on the StrictBufferedReader class definition:

Contributor: curious, why call it Strict?

Member (author): I observed that the built-in io.BufferedReader implementation doesn't do a good job of actually buffering the data. I think it's because pandas calls BufferedReader.read1, and read1 doesn't prefill the 1 MiB buffer. The name "Strict" is used to denote that it always performs the buffering.

Review thread on the docstring line "prefers calling read1(), which doesn't prefetch for random-access files":

Contributor: is read1 supposed to be read()?

Member (author): No, TextIOWrapper calls read1 I think.
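
To make the read1() point above concrete, here is a small self-contained sketch (not part of the PR) that records raw read sizes. CountingRaw is a made-up helper; StrictBufferedReader is the class added in this diff. With a plain io.BufferedReader, repeated read1() calls hit the raw stream in ~8 KiB requests, while StrictBufferedReader's read() refills its buffer in full buffer_size blocks:

    import io

    class CountingRaw(io.RawIOBase):
        # Hypothetical raw stream that records the size of every raw read request.
        def __init__(self, data: bytes):
            self._data = io.BytesIO(data)
            self.raw_read_sizes = []

        def readable(self) -> bool:
            return True

        def readinto(self, b) -> int:
            n = self._data.readinto(b)
            self.raw_read_sizes.append(n)
            return n

    data = b"x" * (4 * 1024**2)  # 4 MiB of dummy bytes

    # Plain BufferedReader + read1(): each call issues one small raw read,
    # roughly what pandas' TextIOWrapper does with an unwrapped file.
    plain_raw = CountingRaw(data)
    plain = io.BufferedReader(plain_raw, buffer_size=1024**2)
    while plain.read1(8 * 1024):
        pass
    print(max(plain_raw.raw_read_sizes))  # ~8 KiB per raw read

    # StrictBufferedReader + read(): small reads are served from a buffer that
    # is refilled in large blocks, so the raw stream sees ~1 MiB requests.
    strict_raw = CountingRaw(data)
    strict = StrictBufferedReader(strict_raw, buffer_size=1024**2)
    while strict.read(8 * 1024):
        pass
    print(max(strict_raw.raw_read_sizes))  # ~1 MiB per raw read

This also explains the choice of base class: io.RawIOBase doesn't define read1(), so a TextIOWrapper built on top of StrictBufferedReader falls back to read(), and every read is routed through the full buffer as the docstring describes.
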