-
Notifications
You must be signed in to change notification settings - Fork 282
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix type of the batch returned by make_batch_reader when TransformSpec's function returns a column with all values being None #750
base: master
Are you sure you want to change the base?
Changes from all commits
80815cb
02bc27c
bb3dbec
9b2bb69
1fcf22f
6ef18b3
ad9defb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ | |
|
||
import numpy as np | ||
import pandas as pd | ||
import pyarrow as pa | ||
import pytest | ||
from pyarrow import parquet as pq | ||
|
||
|
@@ -209,6 +210,65 @@ def preproc_fn1(x): | |
(3, 4, 5) == sample['tensor_col_2'].shape[1:] | ||
|
||
|
||
# np.str_ is used instead of np.unicode_: the np.unicode_ alias was removed in NumPy 2.0,
# while np.str_ names the same scalar type on older NumPy releases as well.
@pytest.mark.parametrize('null_column_dtype', [np.float64, np.str_])
@pytest.mark.parametrize('reader_factory', _D)
def test_transform_spec_returns_all_none_values(scalar_dataset, null_column_dtype, reader_factory):
    """Check the dtype of an 'id' column that the transform function fills entirely with ``None``.

    The transform replaces every batch with a frame whose only column, 'id', contains nothing but
    ``None``. The first sample read back must expose 'id' with the dtype declared in ``edit_fields``.
    """
    def fill_id_with_nones(x):
        # Same number of rows as the input, but every 'id' value is None.
        return pd.DataFrame({'id': [None] * len(x)})

    # presumably (name, numpy dtype, shape, is_nullable) - confirm against TransformSpec's documentation
    edit_fields = [('id', null_column_dtype, (), True)]

    with reader_factory(scalar_dataset.url, schema_fields=["id"],
                        transform_spec=TransformSpec(fill_id_with_nones, edit_fields=edit_fields)) as reader:
        sample = next(reader)
        assert sample.id.dtype.type == null_column_dtype
|
||
|
||
@pytest.mark.parametrize('np_dtype, pa_dtype, null_value',
                         ((np.float32, pa.float32(), np.nan), (np.object_, pa.string(), None)))
@pytest.mark.parametrize('reader_factory', _D)
def test_entire_column_of_typed_nulls(reader_factory, np_dtype, pa_dtype, null_value, tmp_path):
    """Read back a parquet file whose single typed column holds only null values.

    A ten-row table with one column, 'all_nulls', of arrow type ``pa_dtype`` is written under
    ``tmp_path``; every cell is ``null_value``. The first sample produced by the reader must expose
    that column with dtype ``np_dtype`` and with every element still null (NaN for float32,
    ``None`` for object).
    """
    path = tmp_path / "dataset"
    schema = pa.schema([pa.field('all_nulls', pa_dtype)])
    pq.write_table(pa.Table.from_pydict({"all_nulls": [null_value] * 10}, schema=schema), path)

    with reader_factory("file:///" + str(path)) as reader:
        sample = next(reader)
        assert sample.all_nulls.dtype == np_dtype
        if np_dtype == np.float32:
            assert np.all(np.isnan(sample.all_nulls))
        elif np_dtype == np.object_:
            assert all(v is None for v in sample.all_nulls)
        else:
            # Raise explicitly rather than `assert False`: a bare assert is stripped under
            # `python -O`, which would let an unexpected parametrization pass silently.
            raise AssertionError("Unexpected np_dtype")
|
||
|
||
@pytest.mark.parametrize('reader_factory', _D)
def test_column_with_list_of_strings_some_are_null(reader_factory, tmp_path):
    """Read back a list-of-strings column in which some list elements are null.

    A three-row table with one column, 'some_nulls', of arrow type list<string> is written under
    ``tmp_path``. The rows contain no nulls, one null and only nulls respectively. The first sample
    produced by the reader must expose the column with object dtype and unchanged contents.
    """
    rows = [['a0', 'a1'], ['b0', None], [None, None]]

    path = tmp_path / "dataset"
    schema = pa.schema([pa.field('some_nulls', pa.list_(pa.string(), -1))])
    pq.write_table(pa.Table.from_pydict({"some_nulls": rows}, schema=schema), path)

    with reader_factory("file:///" + str(path)) as reader:
        sample = next(reader)
        # np.object_ rather than np.object: the np.object alias was deprecated in NumPy 1.20
        # and removed in NumPy 1.24.
        assert sample.some_nulls.dtype == np.object_
        np.testing.assert_equal(sample.some_nulls, rows)
|
||
|
||
@pytest.mark.parametrize('reader_factory', _D)
def test_transform_spec_returns_all_none_values_in_a_list_field(scalar_dataset, reader_factory):
    """Check the dtype of a list field that the transform function fills with all-``None`` lists.

    The transform replaces every batch with a frame whose only column, 'int_fixed_size_list',
    holds a three-element list of ``None`` in each row. No ``edit_fields`` are passed.
    """
    # NOTE(review): reviewer feedback left on this test in the pull request: "We also ran into the
    # NoneType issue with lists of strings. Consider adding string types to the test as well. The
    # NoneType problem also occurs when only some of the values in the list are None."
    def fill_id_with_nones(x):
        all_none_row = [None] * 3
        return pd.DataFrame({'int_fixed_size_list': [all_none_row] * len(x)})

    transform_spec = TransformSpec(fill_id_with_nones)
    with reader_factory(scalar_dataset.url, schema_fields=["int_fixed_size_list"],
                        transform_spec=transform_spec) as reader:
        batch = next(reader)
        # The expected type is float: numpy converts an integer array that contains nulls
        # into a float64 array, with nan standing in for the nulls.
        assert batch.int_fixed_size_list.dtype.type == np.float64
|
||
|
||
@pytest.mark.parametrize('reader_factory', _D) | ||
@pytest.mark.parametrize('partition_by', [['string'], ['id'], ['string', 'id']]) | ||
def test_string_partition(reader_factory, tmpdir, partition_by): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These test cases assume the transform_spec function is creating the null values. In the more common case, there are missing values in fields that the transform_spec does not edit. I believe this solution already addresses both cases, but it would be good to demonstrate this in the tests, either with an additional test case or with additional non-edited fields in this test case.