diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000000..b57f642b441 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.parquet filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/check-ci.yml b/.github/workflows/check-ci.yml index 14678aa511f..a80f9153b61 100644 --- a/.github/workflows/check-ci.yml +++ b/.github/workflows/check-ci.yml @@ -16,6 +16,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 + with: + lfs: true - name: Setup JDK 11 id: setup-java-11 diff --git a/.github/workflows/nightly-check-ci.yml b/.github/workflows/nightly-check-ci.yml index 12184ffc72c..b8402c0a409 100644 --- a/.github/workflows/nightly-check-ci.yml +++ b/.github/workflows/nightly-check-ci.yml @@ -24,6 +24,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 + with: + lfs: true - name: Setup JDK 11 id: setup-java-11 diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java index 3e14d8b2d33..db33cf9c3bc 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java @@ -236,8 +236,17 @@ private static void buildChildren(Types.GroupBuilder builder, Iterator> visit(final LogicalTypeAnnotation.TimeLogicalTypeAnnot @Override public Optional> visit( final LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { - // TODO(deephaven-core#3588): Unable to read parquet TimestampLogicalTypeAnnotation that is not adjusted + // TODO(deephaven-core#976): Unable to read parquet TimestampLogicalTypeAnnotation that is not adjusted // to UTC if (timestampLogicalType.isAdjustedToUTC()) { switch (timestampLogicalType.getUnit()) { diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnLocation.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnLocation.java index 3ac34501aeb..f9adbc44b95 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnLocation.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnLocation.java @@ -700,6 +700,8 @@ private static class LogicalTypeVisitor @Override public Optional> visit( LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + // TODO(deephaven-core#976): Unable to read parquet TimestampLogicalTypeAnnotation that is not adjusted + // to UTC if (timestampLogicalType.isAdjustedToUTC()) { return Optional .of(ToInstantPage.create(componentType, timestampLogicalType.getUnit())); diff --git a/extensions/parquet/table/src/test/e0.py b/extensions/parquet/table/src/test/e0.py index 09416337baa..f6cc32c323d 100644 --- a/extensions/parquet/table/src/test/e0.py +++ b/extensions/parquet/table/src/test/e0.py @@ -8,7 +8,8 @@ "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], - "f": pd.date_range("20130101", periods=3), + # TODO(deephaven-core#976): Unable to read parquet TimestampLogicalTypeAnnotation that is not adjusted to UTC + # "f": pd.date_range("20130101", periods=3), "g": pd.date_range("20130101", periods=3, tz="US/Eastern"), "h": pd.Categorical(list("abc")), "i": pd.Categorical(list("abc"), ordered=True), diff --git a/extensions/parquet/table/src/test/e0.requirements.txt b/extensions/parquet/table/src/test/e0.requirements.txt index ba37038bfb6..bce4a36fa96 100644 --- a/extensions/parquet/table/src/test/e0.requirements.txt +++ b/extensions/parquet/table/src/test/e0.requirements.txt @@ -1,6 +1,6 @@ numpy==1.24.2 pandas==1.5.3 -pyarrow==4.0.1 +pyarrow==5.0.0 python-dateutil==2.8.2 pytz==2022.7.1 six==1.16.0 diff --git a/extensions/parquet/table/src/test/e1.py b/extensions/parquet/table/src/test/e1.py index 408c327f3a8..450179b49e1 100644 --- a/extensions/parquet/table/src/test/e1.py +++ b/extensions/parquet/table/src/test/e1.py @@ -8,7 +8,8 @@ "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], - "f": pd.date_range("20130101", periods=3), + # TODO(deephaven-core#976): Unable to read parquet TimestampLogicalTypeAnnotation that is not adjusted to UTC + # "f": pd.date_range("20130101", periods=3), "g": pd.date_range("20130101", periods=3, tz="US/Eastern"), "h": pd.Categorical(list("abc")), "i": pd.Categorical(list("abc"), ordered=True), diff --git a/extensions/parquet/table/src/test/e2.py b/extensions/parquet/table/src/test/e2.py index 446fb28519a..9fa3560a1e0 100644 --- a/extensions/parquet/table/src/test/e2.py +++ b/extensions/parquet/table/src/test/e2.py @@ -8,7 +8,8 @@ "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], - "f": pd.date_range("20130101", periods=3), + # TODO(deephaven-core#976): Unable to read parquet TimestampLogicalTypeAnnotation that is not adjusted to UTC + # "f": pd.date_range("20130101", periods=3), "g": pd.date_range("20130101", periods=3, tz="US/Eastern"), "h": pd.Categorical(list("abc")), "i": pd.Categorical(list("abc"), ordered=True), diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/TestParquetTools.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/TestParquetTools.java index 031c108a544..1add72076ce 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/TestParquetTools.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/TestParquetTools.java @@ -458,24 +458,23 @@ public void e1() { assertTableEquals(uncompressed, zstd); } - // TODO(deephaven-core#3588): Unable to read parquet TimestampLogicalTypeAnnotation that is not adjusted to UTC - // @Test - // public void e2() { - // final Table uncompressed = - // ParquetTools.readTable(TestParquetTools.class.getResource("/e2/uncompressed.parquet").getFile()); - // - // final Table gzip = ParquetTools.readTable(TestParquetTools.class.getResource("/e2/gzip.parquet").getFile()); - // assertTableEquals(uncompressed, gzip); - // - // final Table lz4 = ParquetTools.readTable(TestParquetTools.class.getResource("/e2/lz4.parquet").getFile()); - // assertTableEquals(uncompressed, lz4); - // - // final Table snappy = ParquetTools.readTable(TestParquetTools.class.getResource("/e2/snappy.parquet").getFile()); - // assertTableEquals(uncompressed, snappy); - // - // final Table zstd = ParquetTools.readTable(TestParquetTools.class.getResource("/e2/zstd.parquet").getFile()); - // assertTableEquals(uncompressed, zstd); - // } + @Test + public void e2() { + final Table uncompressed = + ParquetTools.readTable(TestParquetTools.class.getResource("/e2/uncompressed.parquet").getFile()); + + final Table gzip = ParquetTools.readTable(TestParquetTools.class.getResource("/e2/gzip.parquet").getFile()); + assertTableEquals(uncompressed, gzip); + + final Table lz4 = ParquetTools.readTable(TestParquetTools.class.getResource("/e2/lz4.parquet").getFile()); + assertTableEquals(uncompressed, lz4); + + final Table snappy = ParquetTools.readTable(TestParquetTools.class.getResource("/e2/snappy.parquet").getFile()); + assertTableEquals(uncompressed, snappy); + + final Table zstd = ParquetTools.readTable(TestParquetTools.class.getResource("/e2/zstd.parquet").getFile()); + assertTableEquals(uncompressed, zstd); + } private void testWriteRead(Table source, Function transform) { final File f2w = new File(testRoot, "testWriteRead.parquet"); diff --git a/extensions/parquet/table/src/test/resources/e0/brotli.parquet b/extensions/parquet/table/src/test/resources/e0/brotli.parquet index 01e7d994f64..e26a64f8c61 100644 Binary files a/extensions/parquet/table/src/test/resources/e0/brotli.parquet and b/extensions/parquet/table/src/test/resources/e0/brotli.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e0/gzip.parquet b/extensions/parquet/table/src/test/resources/e0/gzip.parquet index b68912d7cdb..39cd91a4ee3 100644 Binary files a/extensions/parquet/table/src/test/resources/e0/gzip.parquet and b/extensions/parquet/table/src/test/resources/e0/gzip.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e0/lz4.parquet b/extensions/parquet/table/src/test/resources/e0/lz4.parquet index 7762b171369..3dd0120a98c 100644 Binary files a/extensions/parquet/table/src/test/resources/e0/lz4.parquet and b/extensions/parquet/table/src/test/resources/e0/lz4.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e0/pyarrow_stats.parquet b/extensions/parquet/table/src/test/resources/e0/pyarrow_stats.parquet index 503301ca035..74a01eff3cd 100644 Binary files a/extensions/parquet/table/src/test/resources/e0/pyarrow_stats.parquet and b/extensions/parquet/table/src/test/resources/e0/pyarrow_stats.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e0/snappy.parquet b/extensions/parquet/table/src/test/resources/e0/snappy.parquet index 70610211695..dd0ef9114a8 100644 Binary files a/extensions/parquet/table/src/test/resources/e0/snappy.parquet and b/extensions/parquet/table/src/test/resources/e0/snappy.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e0/uncompressed.parquet b/extensions/parquet/table/src/test/resources/e0/uncompressed.parquet index a10b74a8ab6..cc6d359aa9e 100644 Binary files a/extensions/parquet/table/src/test/resources/e0/uncompressed.parquet and b/extensions/parquet/table/src/test/resources/e0/uncompressed.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e0/zstd.parquet b/extensions/parquet/table/src/test/resources/e0/zstd.parquet index 39047596648..20e8f23b9a2 100644 Binary files a/extensions/parquet/table/src/test/resources/e0/zstd.parquet and b/extensions/parquet/table/src/test/resources/e0/zstd.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e1/brotli.parquet b/extensions/parquet/table/src/test/resources/e1/brotli.parquet index bbda7e06fb9..a91acfd4788 100644 Binary files a/extensions/parquet/table/src/test/resources/e1/brotli.parquet and b/extensions/parquet/table/src/test/resources/e1/brotli.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e1/gzip.parquet b/extensions/parquet/table/src/test/resources/e1/gzip.parquet index fcd8ba59e75..c38f1487f95 100644 Binary files a/extensions/parquet/table/src/test/resources/e1/gzip.parquet and b/extensions/parquet/table/src/test/resources/e1/gzip.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e1/lz4.parquet b/extensions/parquet/table/src/test/resources/e1/lz4.parquet index 2b474d731ac..6baaaf61cd0 100644 Binary files a/extensions/parquet/table/src/test/resources/e1/lz4.parquet and b/extensions/parquet/table/src/test/resources/e1/lz4.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e1/snappy.parquet b/extensions/parquet/table/src/test/resources/e1/snappy.parquet index 36a55927719..e914e31503c 100644 Binary files a/extensions/parquet/table/src/test/resources/e1/snappy.parquet and b/extensions/parquet/table/src/test/resources/e1/snappy.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e1/uncompressed.parquet b/extensions/parquet/table/src/test/resources/e1/uncompressed.parquet index 79700941900..94ee35905cd 100644 Binary files a/extensions/parquet/table/src/test/resources/e1/uncompressed.parquet and b/extensions/parquet/table/src/test/resources/e1/uncompressed.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e1/zstd.parquet b/extensions/parquet/table/src/test/resources/e1/zstd.parquet index 52317b0f204..b654640e0f6 100644 Binary files a/extensions/parquet/table/src/test/resources/e1/zstd.parquet and b/extensions/parquet/table/src/test/resources/e1/zstd.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e2/brotli.parquet b/extensions/parquet/table/src/test/resources/e2/brotli.parquet index f0def9676f0..2fde3adac4d 100644 Binary files a/extensions/parquet/table/src/test/resources/e2/brotli.parquet and b/extensions/parquet/table/src/test/resources/e2/brotli.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e2/gzip.parquet b/extensions/parquet/table/src/test/resources/e2/gzip.parquet index 30d83fc0cbd..16c359740dd 100644 Binary files a/extensions/parquet/table/src/test/resources/e2/gzip.parquet and b/extensions/parquet/table/src/test/resources/e2/gzip.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e2/lz4.parquet b/extensions/parquet/table/src/test/resources/e2/lz4.parquet index c7416517a78..ac80cd17b63 100644 Binary files a/extensions/parquet/table/src/test/resources/e2/lz4.parquet and b/extensions/parquet/table/src/test/resources/e2/lz4.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e2/snappy.parquet b/extensions/parquet/table/src/test/resources/e2/snappy.parquet index f06e8ff2cf5..2b0b2a54217 100644 Binary files a/extensions/parquet/table/src/test/resources/e2/snappy.parquet and b/extensions/parquet/table/src/test/resources/e2/snappy.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e2/uncompressed.parquet b/extensions/parquet/table/src/test/resources/e2/uncompressed.parquet index 50855c72df8..1b18f5ec570 100644 Binary files a/extensions/parquet/table/src/test/resources/e2/uncompressed.parquet and b/extensions/parquet/table/src/test/resources/e2/uncompressed.parquet differ diff --git a/extensions/parquet/table/src/test/resources/e2/zstd.parquet b/extensions/parquet/table/src/test/resources/e2/zstd.parquet index 29d4295166d..aa9a74dc97f 100644 Binary files a/extensions/parquet/table/src/test/resources/e2/zstd.parquet and b/extensions/parquet/table/src/test/resources/e2/zstd.parquet differ diff --git a/extensions/parquet/table/src/test/resources/sample_lz4_compressed.parquet b/extensions/parquet/table/src/test/resources/sample_lz4_compressed.parquet index cfbdc7ef2db..ce1a7ba020b 100644 Binary files a/extensions/parquet/table/src/test/resources/sample_lz4_compressed.parquet and b/extensions/parquet/table/src/test/resources/sample_lz4_compressed.parquet differ diff --git a/py/server/tests/data/crypto_trades.parquet b/py/server/tests/data/crypto_trades.parquet index 25e13e8a08f..426f50c4142 100644 Binary files a/py/server/tests/data/crypto_trades.parquet and b/py/server/tests/data/crypto_trades.parquet differ diff --git a/py/server/tests/test_parquet.py b/py/server/tests/test_parquet.py index d1a1f1338bf..57ce42594e1 100644 --- a/py/server/tests/test_parquet.py +++ b/py/server/tests/test_parquet.py @@ -10,7 +10,7 @@ import pandas import pyarrow.parquet -from deephaven import empty_table, dtypes, new_table +from deephaven import DHError, empty_table, dtypes, new_table from deephaven import arrow as dharrow from deephaven.column import InputColumn from deephaven.pandas import to_pandas, to_table @@ -304,7 +304,7 @@ def round_trip_with_compression(self, compression_codec_name, dh_table, vector_c # result_table = read('data_from_pandas.parquet') # self.assert_table_equals(dh_table, result_table) - def test_writing_via_pyarrow(self): + def test_writing_lists_via_pyarrow(self): # This function tests that we can write tables with list types to parquet files via pyarrow and read them back # through deephaven's parquet reader code with no exceptions pa_table = pyarrow.table({'numList': [[2, 2, 4]], @@ -314,6 +314,25 @@ def test_writing_via_pyarrow(self): pa_table_from_disk = dharrow.to_arrow(from_disk) self.assertTrue(pa_table.equals(pa_table_from_disk)) + def test_writing_time_via_pyarrow(self): + def _test_writing_time_helper(filename): + metadata = pyarrow.parquet.read_metadata(filename) + if "isAdjustedToUTC=false" in str(metadata.row_group(0).column(0)): + # TODO(deephaven-core#976): Unable to read non UTC adjusted timestamps + with self.assertRaises(DHError) as e: + read(filename) + self.assertIn("ParquetFileReaderException", e.exception.root_cause) + + df = pandas.DataFrame({ + "f": pandas.date_range("20130101", periods=3), + }) + df.to_parquet("pyarrow_26.parquet", engine='pyarrow', compression=None, version='2.6') + _test_writing_time_helper("pyarrow_26.parquet") + df.to_parquet("pyarrow_24.parquet", engine='pyarrow', compression=None, version='2.4') + _test_writing_time_helper("pyarrow_24.parquet") + df.to_parquet("pyarrow_10.parquet", engine='pyarrow', compression=None, version='1.0') + _test_writing_time_helper("pyarrow_10.parquet") + def test_dictionary_encoding(self): dh_table = empty_table(10).update(formulas=[ "shortStringColumn = `Row ` + i",