Skip to content

Commit e34c0b8

Browse files
committed
[data] Fix HTTP streaming file download by using open_input_stream
Signed-off-by: xyuzh <xinyzng@gmail.com>
1 parent f2a7a94 commit e34c0b8

File tree

1 file changed

+9
-4
lines changed

1 file changed

+9
-4
lines changed

python/ray/data/_internal/planner/plan_download_op.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -189,14 +189,19 @@ def download_bytes_threaded(
189189
def load_uri_bytes(uri_path_iterator):
190190
"""Function that takes an iterator of URI paths and yields downloaded bytes for each."""
191191
for uri_path in uri_path_iterator:
192+
# Use open_input_stream so that both regular files and streaming sources (e.g. HTTP) can be downloaded:
192193
try:
193-
with fs.open_input_file(uri_path) as f:
194+
with fs.open_input_stream(uri_path) as f:
194195
yield f.read()
195196
except OSError as e:
196-
logger.debug(
197-
f"Failed to download URI '{uri_path}' from column '{uri_column_name}' with error: {e}"
198-
)
197+
logger.debug(f"OSError: '{uri_path}' from column '{uri_column_name}' with error: {e}")
199198
yield None
199+
except Exception as e:
200+
# Catch unexpected errors such as pyarrow.lib.ArrowInvalid, raised for an invalid URI like `foo://bar`.
201+
# This makes sure we don't fail the entire dataset download because of one invalid URI.
202+
logger.error(f"Unexpected error in load_uri_bytes: {e}")
203+
yield None
204+
200205

201206
# Use make_async_gen to download URI bytes concurrently
202207
# This preserves the order of results to match the input URIs

0 commit comments

Comments
 (0)