Skip to content

Commit 8fdb894

Browse files
committed
Add comprehensive edge case tests for URI download failures
- Add test_download_expression_with_invalid_uris: Tests exception handling for URIs that fail to download (OSError), verifying None is returned instead of crashing (covers commit c9d9108) - Add test_download_expression_all_size_estimations_fail: Tests divide-by-zero fix when all URI size estimations fail during partitioning, ensuring fallback to row count works correctly (covers commit 0959734) - Add test_download_expression_mixed_valid_and_invalid_size_estimation: Tests partial failures during size estimation with mix of valid/invalid URIs These tests properly cover the code paths that were previously untested: - load_uri_bytes exception handling (lines 192-196) - avg_nbytes_per_row == 0 check (lines 285-291) - Failed size estimation handling (lines 326-332) Signed-off-by: xyuzh <xinyzng@gmail.com>
1 parent 0959734 commit 8fdb894

File tree

1 file changed

+109
-0
lines changed

1 file changed

+109
-0
lines changed

python/ray/data/tests/test_download_expression.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,115 @@ def test_download_expression_with_null_uris(self):
294294
# If it fails, should be a reasonable error (not a crash)
295295
assert isinstance(e, (ValueError, KeyError, RuntimeError))
296296

297+
def test_download_expression_with_invalid_uris(self, tmp_path):
298+
"""Test download expression with URIs that fail to download.
299+
300+
This tests the exception handling in load_uri_bytes (commit c9d91080fb)
301+
where OSError is caught and None is returned for failed downloads.
302+
"""
303+
# Create one valid file
304+
valid_file = tmp_path / "valid.txt"
305+
valid_file.write_bytes(b"valid content")
306+
307+
# Create URIs: one valid, one non-existent file, one invalid path
308+
table = pa.Table.from_arrays(
309+
[
310+
pa.array([
311+
f"local://{valid_file}",
312+
f"local://{tmp_path}/nonexistent.txt", # File doesn't exist
313+
"local:///this/path/does/not/exist/file.txt", # Invalid path
314+
]),
315+
],
316+
names=["uri"],
317+
)
318+
319+
ds = ray.data.from_arrow(table)
320+
ds_with_downloads = ds.with_column("bytes", download("uri"))
321+
322+
# Should not crash - failed downloads return None
323+
results = ds_with_downloads.take_all()
324+
assert len(results) == 3
325+
326+
# First URI should succeed
327+
assert results[0]["bytes"] == b"valid content"
328+
329+
# Second and third URIs should fail gracefully (return None)
330+
assert results[1]["bytes"] is None
331+
assert results[2]["bytes"] is None
332+
333+
def test_download_expression_all_size_estimations_fail(self):
334+
"""Test download expression when all URI size estimations fail.
335+
336+
This tests the divide-by-zero fix (commit 095973428f) where failed
337+
size estimations append 0 instead of being skipped, and avg_nbytes_per_row == 0
338+
is checked to prevent division by zero.
339+
"""
340+
# Create URIs that will fail size estimation (non-existent files)
341+
# Using enough URIs to trigger size estimation sampling
342+
invalid_uris = [
343+
f"local:///nonexistent/path/file_{i}.txt"
344+
for i in range(30) # More than INIT_SAMPLE_BATCH_SIZE (25)
345+
]
346+
347+
table = pa.Table.from_arrays(
348+
[pa.array(invalid_uris)],
349+
names=["uri"],
350+
)
351+
352+
ds = ray.data.from_arrow(table)
353+
ds_with_downloads = ds.with_column("bytes", download("uri"))
354+
355+
# Should not crash with divide-by-zero error
356+
# The PartitionActor should handle all failed size estimations gracefully
357+
# and fall back to using the number of rows in the block as partition size
358+
results = ds_with_downloads.take_all()
359+
360+
# All downloads should fail gracefully (return None)
361+
assert len(results) == 30
362+
for result in results:
363+
assert result["bytes"] is None
364+
365+
def test_download_expression_mixed_valid_and_invalid_size_estimation(self, tmp_path):
366+
"""Test download expression with mix of valid and invalid URIs for size estimation.
367+
368+
This tests that size estimation handles partial failures correctly.
369+
"""
370+
# Create some valid files
371+
valid_files = []
372+
for i in range(10):
373+
file_path = tmp_path / f"valid_{i}.txt"
374+
file_path.write_bytes(b"x" * 100) # 100 bytes each
375+
valid_files.append(str(file_path))
376+
377+
# Mix valid and invalid URIs
378+
mixed_uris = []
379+
for i in range(30):
380+
if i % 3 == 0 and i // 3 < len(valid_files):
381+
# Every 3rd URI is valid (for first 10)
382+
mixed_uris.append(f"local://{valid_files[i // 3]}")
383+
else:
384+
# Others are invalid
385+
mixed_uris.append(f"local:///nonexistent/file_{i}.txt")
386+
387+
table = pa.Table.from_arrays(
388+
[pa.array(mixed_uris)],
389+
names=["uri"],
390+
)
391+
392+
ds = ray.data.from_arrow(table)
393+
ds_with_downloads = ds.with_column("bytes", download("uri"))
394+
395+
# Should not crash - should handle mixed valid/invalid gracefully
396+
results = ds_with_downloads.take_all()
397+
assert len(results) == 30
398+
399+
# Verify valid URIs downloaded successfully
400+
for i, result in enumerate(results):
401+
if i % 3 == 0 and i // 3 < len(valid_files):
402+
assert result["bytes"] == b"x" * 100
403+
else:
404+
assert result["bytes"] is None
405+
297406

298407
class TestDownloadExpressionIntegration:
299408
"""Integration tests combining download expressions with other Ray Data operations."""

0 commit comments

Comments
 (0)