GH-36537: [Python] Ensure dataset writer follows default Parquet version of 2.6
jorisvandenbossche committed Jul 7, 2023
1 parent 8509052 · commit ed02603
Showing 2 changed files with 23 additions and 3 deletions.
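Behavioral note (a sketch, not part of the commit): with this change the dataset writer's default Parquet version matches the pyarrow.parquet.write_table default of "2.6" instead of "1.0". A minimal illustration, assuming a pyarrow build that contains this commit; the output directory is illustrative and "part-0.parquet" is the writer's default basename:

import os
import tempfile

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

# uint32 only gets a proper Parquet logical type from version 2.4 onward
table = pa.table({"f1": pa.array(range(20), type="uint32")})

out_dir = tempfile.mkdtemp()
ds.write_dataset(table, out_dir, format="parquet")

meta = pq.read_metadata(os.path.join(out_dir, "part-0.parquet"))
print(meta.format_version)  # "2.6" after this commit; "1.0" before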
python/pyarrow/_dataset_parquet.pyx (7 additions, 1 deletion)
@@ -531,6 +531,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
             "use_deprecated_int96_timestamps",
             "coerce_timestamps",
             "allow_truncated_timestamps",
+            "use_compliant_nested_type",
         }
 
         setters = set()
@@ -586,7 +587,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
         self._properties = dict(
             use_dictionary=True,
             compression="snappy",
-            version="1.0",
+            version="2.6",
             write_statistics=None,
             data_page_size=None,
             compression_level=None,
@@ -601,6 +602,11 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
         self._set_properties()
         self._set_arrow_properties()
 
+    def __repr__(self):
+        return "<pyarrow.dataset.ParquetFileWriteOptions {0}>".format(
+            " ".join([f"{key}={value}" for key, value in self._properties.items()])
+        )
+
 
 cdef set _PARQUET_READ_OPTIONS = {
     'dictionary_columns', 'coerce_int96_timestamp_unit'
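The new __repr__ simply joins the _properties dict, which the test below relies on. A quick sketch of how it reads (output abbreviated and illustrative):

import pyarrow.dataset as ds

opts = ds.ParquetFileFormat().make_write_options(version="2.4")
print(repr(opts))
# <pyarrow.dataset.ParquetFileWriteOptions use_dictionary=True
# compression=snappy version=2.4 ...>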
python/pyarrow/tests/test_dataset.py (16 additions, 2 deletions)
@@ -4539,7 +4539,9 @@ def test_write_table_partitioned_dict(tempdir):
 @pytest.mark.parquet
 def test_write_dataset_parquet(tempdir):
     table = pa.table([
-        pa.array(range(20)), pa.array(np.random.randn(20)),
+        pa.array(range(20), type="uint32"),
+        pa.array(np.arange("2012-01-01", 20, dtype="datetime64[D]").astype(
+            "datetime64[ns]")),
         pa.array(np.repeat(['a', 'b'], 10))
     ], names=["f1", "f2", "part"])
 

@@ -4551,20 +4553,32 @@ def test_write_dataset_parquet(tempdir):
     file_paths = list(base_dir.rglob("*"))
     expected_paths = [base_dir / "part-0.parquet"]
     assert set(file_paths) == set(expected_paths)
-    # check Table roundtrip
+    # check Table roundtrip with default version
     result = ds.dataset(base_dir, format="parquet").to_table()
     assert result.equals(table)
 
     # using custom options
     for version in ["1.0", "2.4", "2.6"]:
         format = ds.ParquetFileFormat()
         opts = format.make_write_options(version=version)
+        assert "<pyarrow.dataset.ParquetFileWriteOptions" in repr(opts)
         base_dir = tempdir / 'parquet_dataset_version{0}'.format(version)
         ds.write_dataset(table, base_dir, format=format, file_options=opts)
         meta = pq.read_metadata(base_dir / "part-0.parquet")
         expected_version = "1.0" if version == "1.0" else "2.6"
         assert meta.format_version == expected_version
+
+        # ensure version is actually honored based on supported datatypes
+        result = ds.dataset(base_dir, format="parquet").to_table()
+        schema = table.schema
+        if version == "1.0":
+            # uint32 is written as int64
+            schema = schema.set(0, schema.field(0).with_type(pa.int64()))
+        if version in ("1.0", "2.4"):
+            schema = schema.set(1, schema.field(1).with_type(pa.timestamp("us")))
+        expected = table.cast(schema)
+        assert result.equals(expected)
 
 
 def test_write_dataset_csv(tempdir):
     table = pa.table([
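The type expectations the test encodes, shown as a standalone sketch (assumes this commit; tempfile.mkdtemp() stands in for the test's tempdir fixture): Parquet "1.0" has no uint32 logical type, so that column is widened to int64, and both "1.0" and "2.4" store nanosecond timestamps as microseconds; only "2.6" round-trips the table unchanged.

import tempfile

import numpy as np
import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table([
    pa.array(range(20), type="uint32"),
    pa.array(np.arange("2012-01-01", 20, dtype="datetime64[D]")
             .astype("datetime64[ns]")),
], names=["f1", "f2"])

fmt = ds.ParquetFileFormat()
opts = fmt.make_write_options(version="1.0")
out_dir = tempfile.mkdtemp()
ds.write_dataset(table, out_dir, format=fmt, file_options=opts)

result = ds.dataset(out_dir, format="parquet").to_table()
print(result.schema.field("f1").type)  # int64: uint32 not supported by "1.0"
print(result.schema.field("f2").type)  # timestamp[us]: ns requires "2.6"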
