diff --git a/velox/dwio/common/DirectDecoder.h b/velox/dwio/common/DirectDecoder.h
index 1c243899327ae..54c46f8032300 100644
--- a/velox/dwio/common/DirectDecoder.h
+++ b/velox/dwio/common/DirectDecoder.h
@@ -93,7 +93,17 @@ class DirectDecoder : public IntDecoder<isSigned> {
       } else if constexpr (std::is_same_v<
                                typename Visitor::DataType,
                                int128_t>) {
-        toSkip = visitor.process(super::template readInt<int128_t>(), atEnd);
+        if (super::numBytes != 12) {
+          toSkip = visitor.process(super::template readInt<int128_t>(), atEnd);
+        } else {
+          // Reads INT96 timestamp as int128_t type and extracts the days and
+          // nanos.
+          const int128_t encoded = super::template readInt<int128_t>();
+          const int32_t days = encoded & ((1ULL << 32) - 1);
+          const uint64_t nanos = static_cast<uint64_t>(encoded >> 32);
+          auto ts = Timestamp::fromDaysAndNanos(days, nanos);
+          toSkip = visitor.process(reinterpret_cast<int128_t&>(ts), atEnd);
+        }
       } else {
         toSkip = visitor.process(super::template readInt<int64_t>(), atEnd);
       }
diff --git a/velox/dwio/common/IntDecoder.h b/velox/dwio/common/IntDecoder.h
index 016e8b1f8244d..045da8d4fa100 100644
--- a/velox/dwio/common/IntDecoder.h
+++ b/velox/dwio/common/IntDecoder.h
@@ -150,6 +150,11 @@ class IntDecoder {
   template <typename T>
   T readInt();
 
+  // Reads a 12-byte little-endian Int96 timestamp and packs it into an
+  // int128_t: the first 8 bytes (nanos) occupy bits 32..95 and the last 4
+  // bytes (days) occupy bits 0..31.
+  int128_t readInt96();
+
   template <typename T>
   T readVInt();
 
@@ -438,12 +443,41 @@ inline T IntDecoder<isSigned>::readInt() {
     return readLittleEndianFromBigEndian<T>();
   } else {
     if constexpr (std::is_same_v<T, int128_t>) {
-      VELOX_NYI();
+      if (numBytes == 12) {
+        VELOX_DCHECK(!useVInts, "Int96 should not be VInt encoded.");
+        return readInt96();
+      } else {
+        VELOX_NYI();
+      }
     }
     return readLongLE();
   }
 }
 
+// Reads 12 little-endian bytes and packs them into an int128_t. The first 8
+// bytes end up in bits 32..95 and the last 4 bytes in bits 0..31; the upper
+// 32 bits of the result are zero.
+template <bool isSigned>
+inline int128_t IntDecoder<isSigned>::readInt96() {
+  // Read the first 8 bytes as an unsigned little-endian value.
+  uint64_t part1 = 0;
+  for (uint32_t i = 0; i < 8; ++i) {
+    const unsigned char ch = readByte();
+    part1 |= static_cast<uint64_t>(ch & BASE_256_MASK) << (8 * i);
+  }
+
+  // Read the last 4 bytes. They are accumulated in an unsigned integer so
+  // that a set top bit is not sign-extended when widened to int128_t below,
+  // which would overwrite the bits that hold part1.
+  uint32_t part2 = 0;
+  for (uint32_t i = 0; i < 4; ++i) {
+    const unsigned char ch = readByte();
+    part2 |= static_cast<uint32_t>(ch & BASE_256_MASK) << (8 * i);
+  }
+
+  return (static_cast<int128_t>(part1) << 32) | part2;
+}
+
 template <bool isSigned>
 template <typename T>
 inline T IntDecoder<isSigned>::readVInt() {
diff --git a/velox/dwio/parquet/tests/examples/timestamp_dict_int96.parquet b/velox/dwio/parquet/tests/examples/timestamp_dict_int96.parquet
new file mode 100644
index 0000000000000..661cb7a285227
Binary files /dev/null and b/velox/dwio/parquet/tests/examples/timestamp_dict_int96.parquet differ
diff --git a/velox/dwio/parquet/tests/examples/timestamp_plain_int96.parquet b/velox/dwio/parquet/tests/examples/timestamp_plain_int96.parquet
new file mode 100644
index 0000000000000..f2aa666b7d710
Binary files /dev/null and b/velox/dwio/parquet/tests/examples/timestamp_plain_int96.parquet differ
diff --git a/velox/dwio/parquet/tests/reader/E2EFilterTest.cpp b/velox/dwio/parquet/tests/reader/E2EFilterTest.cpp
index 0b27395f9a00d..1634da0556c2f 100644
--- a/velox/dwio/parquet/tests/reader/E2EFilterTest.cpp
+++ b/velox/dwio/parquet/tests/reader/E2EFilterTest.cpp
@@ -256,6 +256,20 @@ TEST_F(E2EFilterTest, integerDictionary) {
       20);
 }
 
+TEST_F(E2EFilterTest, timestampDirect) {
+  options_.enableDictionary = false;
+  options_.dataPageSize = 4 * 1024;
+  options_.writeInt96AsTimestamp = true;
+
+  testWithTypes(
+      "timestamp_val_0:timestamp,"
+      "timestamp_val_1:timestamp",
+      [&]() {},
+      true,
+      {"timestamp_val_0", "timestamp_val_1"},
+      20);
+}
+
 TEST_F(E2EFilterTest, timestampDictionary) {
   options_.dataPageSize = 4 * 1024;
   options_.writeInt96AsTimestamp = true;
diff --git a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp
index 4261ee7022490..0a96227f19867 100644
--- a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp
+++ b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp
@@ -836,6 +836,34 @@ TEST_F(ParquetTableScanTest, timestampPrecisionMicrosecond) {
 
   assertEqualResults({expected}, result.second);
 }
+
+TEST_F(ParquetTableScanTest, timestampINT96) {
+  auto a = makeFlatVector<Timestamp>({Timestamp(1, 0), Timestamp(2, 0)});
+  auto expected = makeRowVector({"time"}, {a});
+  createDuckDbTable("expected", {expected});
+
+  auto vector = makeArrayVector<Timestamp>({{}});
+  loadData(
+      getExampleFilePath("timestamp_dict_int96.parquet"),
+      ROW({"time"}, {TIMESTAMP()}),
+      makeRowVector(
+          {"time"},
+          {
+              vector,
+          }));
+  assertSelect({"time"}, "SELECT time from expected");
+
+  loadData(
+      getExampleFilePath("timestamp_plain_int96.parquet"),
+      ROW({"time"}, {TIMESTAMP()}),
+      makeRowVector(
+          {"time"},
+          {
+              vector,
+          }));
+  assertSelect({"time"}, "SELECT time from expected");
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   folly::Init init{&argc, &argv, false};