Skip to content

Commit

Permalink
Support reading 'p' type deletion vectors in Delta Lake
Browse files Browse the repository at this point in the history
  • Loading branch information
chenjian2664 committed Feb 7, 2025
1 parent b1d2302 commit 5286b97
Show file tree
Hide file tree
Showing 17 changed files with 86 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,21 @@ private DeletionVectors() {}
/**
 * Reads the deletion vector described by {@code deletionVector} and deserializes it into a
 * {@link RoaringBitmapArray}.
 *
 * <p>The source of the bytes is chosen by {@code deletionVector.storageType()}:
 * <ul>
 * <li>{@code UUID_MARKER}: the file name is derived with {@code toFileName(pathOrInlineDv())} and
 * appended to {@code location}.</li>
 * <li>{@code PATH_MARKER}: {@code pathOrInlineDv()} is used as-is as the file location;
 * {@code location} is not consulted in this branch.</li>
 * <li>{@code INLINE_MARKER}: rejected as not supported.</li>
 * </ul>
 *
 * @param fileSystem file system used to open the deletion vector file
 * @param location base location against which UUID-type deletion vector file names are resolved
 * @param deletionVector entry supplying the storage type, path, offset and size of the vector
 * @return the deserialized deletion vector
 * @throws IOException declared by this method; presumably raised by the underlying file read
 * @throws TrinoException with {@code NOT_SUPPORTED} for the inline storage type
 * @throws IllegalArgumentException if a path-type file does not exist, or the storage type is not
 * one of the three handled markers
 */
public static RoaringBitmapArray readDeletionVectors(TrinoFileSystem fileSystem, Location location, DeletionVectorEntry deletionVector)
throws IOException
{
// NOTE(review): this listing is a diff rendered without +/- markers. The two if-statements below
// appear to be the pre-change lines (where PATH_MARKER was still rejected); the switch that
// follows is their replacement. Confirm against the actual file before relying on this span.
if (deletionVector.storageType().equals(UUID_MARKER)) {
TrinoInputFile inputFile = fileSystem.newInputFile(location.appendPath(toFileName(deletionVector.pathOrInlineDv())));
ByteBuffer buffer = readDeletionVector(inputFile, deletionVector.offset().orElseThrow(), deletionVector.sizeInBytes());
return deserializeDeletionVectors(buffer);
}
if (deletionVector.storageType().equals(INLINE_MARKER) || deletionVector.storageType().equals(PATH_MARKER)) {
throw new TrinoException(NOT_SUPPORTED, "Unsupported storage type for deletion vector: " + deletionVector.storageType());
switch (deletionVector.storageType()) {
case UUID_MARKER -> {
// File name is derived from pathOrInlineDv() and resolved relative to the supplied location.
TrinoInputFile inputFile = fileSystem.newInputFile(location.appendPath(toFileName(deletionVector.pathOrInlineDv())));
// orElseThrow(): a file-backed deletion vector without an offset cannot be read.
ByteBuffer buffer = readDeletionVector(inputFile, deletionVector.offset().orElseThrow(), deletionVector.sizeInBytes());
return deserializeDeletionVectors(buffer);
}
case PATH_MARKER -> {
// pathOrInlineDv() is treated as a complete location; the `location` argument is ignored here.
TrinoInputFile inputFile = fileSystem.newInputFile(Location.of(deletionVector.pathOrInlineDv()));
// Check existence up front so a missing file is reported with the offending path in the message.
if (!inputFile.exists()) {
throw new IllegalArgumentException("Unable to find 'p' type deletion vector by path: " + deletionVector.pathOrInlineDv());
}
ByteBuffer buffer = readDeletionVector(inputFile, deletionVector.offset().orElseThrow(), deletionVector.sizeInBytes());
return deserializeDeletionVectors(buffer);
}
// Inline deletion vectors are not implemented by this reader.
case INLINE_MARKER -> throw new TrinoException(NOT_SUPPORTED, "Unsupported storage type for deletion vector: " + deletionVector.storageType());
}
// Reached only when the storage type matches none of the handled markers.
throw new IllegalArgumentException("Unexpected storage type: " + deletionVector.storageType());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,29 @@ public void testUuidStorageType()
assertThat(bitmaps.contains(2)).isFalse();
}

/**
 * Checks that deletion vectors with storage type {@code "p"} are read from the path given in the
 * entry itself.
 *
 * <p>Two entries point at the same {@code .bin} resource file at different offsets (1 and 43,
 * 34 bytes each), and each is expected to mark a different row as deleted.
 *
 * @see deltalake.p_type_deletion_vectors
 */
@Test
// NOTE(review): this listing is a diff rendered without +/- markers; the next line appears to be
// the pre-change method name and the line after it the new one.
public void testUnsupportedPathStorageType()
public void testPathStorageType()
throws Exception
{
// Local directory of the cloned-table test resource, passed as the table location.
Path path = new File(Resources.getResource("deltalake/p_type_deletion_vectors/p_deletion_vector_cloned").toURI()).toPath();
Location clonedTableLocation = Location.of(path.toString());
TrinoFileSystem fileSystem = HDFS_FILE_SYSTEM_FACTORY.create(SESSION);
// NOTE(review): the next three lines appear to be the pre-change assertion, which expected the
// 'p' storage type to be rejected as unsupported.
DeletionVectorEntry deletionVector = new DeletionVectorEntry("p", "s3://bucket/table/deletion_vector.bin", OptionalInt.empty(), 40, 1);
assertThatThrownBy(() -> readDeletionVectors(fileSystem, Location.of("s3://bucket/table"), deletionVector))
.hasMessageContaining("Unsupported storage type for deletion vector: p");

// Absolute local path of the deletion vector file that lives under the source-table resource.
String deletionVectorPath = new File(Resources.getResource("deltalake/p_type_deletion_vectors/p_deletion_vector_source/deletion_vector_ace02373-6e93-4445-bfc5-a5f3af97725e.bin").toURI())
.getAbsolutePath();

// Vector stored at offset 1: row 1 is expected to be marked, row 0 is not.
DeletionVectorEntry deletionVector1 = new DeletionVectorEntry("p", deletionVectorPath, OptionalInt.of(1), 34, 1);
RoaringBitmapArray bitmaps1 = readDeletionVectors(fileSystem, clonedTableLocation, deletionVector1);
assertThat(bitmaps1.contains(0)).isFalse();
assertThat(bitmaps1.contains(1)).isTrue();

// Vector stored at offset 43 of the same file: row 0 is expected to be marked, row 1 is not.
DeletionVectorEntry deletionVector2 = new DeletionVectorEntry("p", deletionVectorPath, OptionalInt.of(43), 34, 1);
RoaringBitmapArray bitmaps2 = readDeletionVectors(fileSystem, clonedTableLocation, deletionVector2);
assertThat(bitmaps2.contains(0)).isTrue();
assertThat(bitmaps2.contains(1)).isFalse();
}

@Test
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Data generated using OSS Delta Lake 3.3.0.


```sql
CREATE TABLE p_deletion_vector_cloned SHALLOW CLONE p_deletion_vector_source;
```
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"commitInfo":{"timestamp":1738897446671,"operation":"CLONE","operationParameters":{"source":"spark_catalog.tiny.p_deletion_vector_source","sourceVersion":3},"readVersion":-1,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"removedFilesSize":"0","numRemovedFiles":"0","sourceTableSize":"872","numCopiedFiles":"0","copiedFilesSize":"0","sourceNumOfFiles":"2"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.3.0","txnId":"50d6ccd3-3852-49d5-89f3-124194aa6a3b"}}
{"metaData":{"id":"e593184d-67ce-4987-8c34-4a6ff4603e64","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"v\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"part\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["part"],"configuration":{"delta.enableDeletionVectors":"true"},"createdTime":1738897033368}}
{"add":{"path":"s3://test-bucket/tiny/p_deletion_vector_source/part=2025-01-02/part-00000-088e57c5-0719-4c6e-9f74-2200eefbf737.c000.snappy.parquet","partitionValues":{"part":"2025-01-02"},"size":436,"modificationTime":1738897298000,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"v\":\"3\"},\"maxValues\":{\"v\":\"4\"},\"nullCount\":{\"v\":0},\"tightBounds\":false}","deletionVector":{"storageType":"p","pathOrInlineDv":"s3://test-bucket/tiny/p_deletion_vector_source/deletion_vector_ace02373-6e93-4445-bfc5-a5f3af97725e.bin","offset":1,"sizeInBytes":34,"cardinality":1}}}
{"add":{"path":"s3://test-bucket/tiny/p_deletion_vector_source/part=2025-01-01/part-00000-b402fe5d-d2d9-461e-845c-67519cc5e5b7.c000.snappy.parquet","partitionValues":{"part":"2025-01-01"},"size":436,"modificationTime":1738897298000,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"v\":\"1\"},\"maxValues\":{\"v\":\"2\"},\"nullCount\":{\"v\":0},\"tightBounds\":false}","deletionVector":{"storageType":"p","pathOrInlineDv":"s3://test-bucket/tiny/p_deletion_vector_source/deletion_vector_ace02373-6e93-4445-bfc5-a5f3af97725e.bin","offset":43,"sizeInBytes":34,"cardinality":1}}}
{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
Data generated using OSS Delta Lake 3.3.0.


```sql
CREATE TABLE p_deletion_vector_source
(v string, part date)
USING DELTA
PARTITIONED BY (part)
TBLPROPERTIES
('delta.enableDeletionVectors' = 'true');

INSERT INTO p_deletion_vector_source VALUES (1, '2025-01-01'),
(2, '2025-01-01'),
(3, '2025-01-02'),
(4, '2025-01-02');

OPTIMIZE p_deletion_vector_source;

DELETE FROM p_deletion_vector_source WHERE v IN (2, 3);
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"commitInfo":{"timestamp":1738897033643,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[\"part\"]","clusterBy":"[]","description":null,"isManaged":"true","properties":"{\"delta.enableDeletionVectors\":\"true\"}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.3.0","txnId":"3b379603-3838-438b-a186-f3fce1d3894b"}}
{"metaData":{"id":"5429a887-7b5b-4a03-a7ee-3b7bd8e7f522","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"v\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"part\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["part"],"configuration":{"delta.enableDeletionVectors":"true"},"createdTime":1738897033368}}
{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"commitInfo":{"timestamp":1738897245923,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"4","numOutputRows":"4","numOutputBytes":"1748"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.3.0","txnId":"8f6dbdc8-5ded-4b92-9b20-f4e401411068"}}
{"add":{"path":"part=2025-01-01/part-00000-8e4bc0e0-50bb-4d9b-b938-87a1f3db8fdc.c000.snappy.parquet","partitionValues":{"part":"2025-01-01"},"size":437,"modificationTime":1738897245000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"v\":\"1\"},\"maxValues\":{\"v\":\"1\"},\"nullCount\":{\"v\":0},\"tightBounds\":true}"}}
{"add":{"path":"part=2025-01-01/part-00001-ebe34a24-f74d-4361-862f-c0f893baa5ab.c000.snappy.parquet","partitionValues":{"part":"2025-01-01"},"size":437,"modificationTime":1738897245000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"v\":\"2\"},\"maxValues\":{\"v\":\"2\"},\"nullCount\":{\"v\":0},\"tightBounds\":true}"}}
{"add":{"path":"part=2025-01-02/part-00002-a6d9f5d1-4c7e-41f4-b5b8-4d7b7c59d283.c000.snappy.parquet","partitionValues":{"part":"2025-01-02"},"size":437,"modificationTime":1738897245000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"v\":\"3\"},\"maxValues\":{\"v\":\"3\"},\"nullCount\":{\"v\":0},\"tightBounds\":true}"}}
{"add":{"path":"part=2025-01-02/part-00003-f36390a2-b4e4-4d4b-b2a0-3d453ce0ae7d.c000.snappy.parquet","partitionValues":{"part":"2025-01-02"},"size":437,"modificationTime":1738897245000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"v\":\"4\"},\"maxValues\":{\"v\":\"4\"},\"nullCount\":{\"v\":0},\"tightBounds\":true}"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{"commitInfo":{"timestamp":1738897298789,"operation":"OPTIMIZE","operationParameters":{"predicate":"[]","zOrderBy":"[]","clusterBy":"[]","auto":false},"readVersion":1,"isolationLevel":"SnapshotIsolation","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"4","numRemovedBytes":"1748","p25FileSize":"436","numDeletionVectorsRemoved":"0","minFileSize":"436","numAddedFiles":"2","maxFileSize":"436","p75FileSize":"436","p50FileSize":"436","numAddedBytes":"872"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.3.0","txnId":"1c7572ad-54e8-4577-ad73-9823b056e964"}}
{"add":{"path":"part=2025-01-01/part-00000-b402fe5d-d2d9-461e-845c-67519cc5e5b7.c000.snappy.parquet","partitionValues":{"part":"2025-01-01"},"size":436,"modificationTime":1738897298000,"dataChange":false,"stats":"{\"numRecords\":2,\"minValues\":{\"v\":\"1\"},\"maxValues\":{\"v\":\"2\"},\"nullCount\":{\"v\":0},\"tightBounds\":true}"}}
{"remove":{"path":"part=2025-01-01/part-00001-ebe34a24-f74d-4361-862f-c0f893baa5ab.c000.snappy.parquet","deletionTimestamp":1738897297138,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{"part":"2025-01-01"},"size":437,"stats":"{\"numRecords\":1}"}}
{"remove":{"path":"part=2025-01-01/part-00000-8e4bc0e0-50bb-4d9b-b938-87a1f3db8fdc.c000.snappy.parquet","deletionTimestamp":1738897297138,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{"part":"2025-01-01"},"size":437,"stats":"{\"numRecords\":1}"}}
{"add":{"path":"part=2025-01-02/part-00000-088e57c5-0719-4c6e-9f74-2200eefbf737.c000.snappy.parquet","partitionValues":{"part":"2025-01-02"},"size":436,"modificationTime":1738897298000,"dataChange":false,"stats":"{\"numRecords\":2,\"minValues\":{\"v\":\"3\"},\"maxValues\":{\"v\":\"4\"},\"nullCount\":{\"v\":0},\"tightBounds\":true}"}}
{"remove":{"path":"part=2025-01-02/part-00003-f36390a2-b4e4-4d4b-b2a0-3d453ce0ae7d.c000.snappy.parquet","deletionTimestamp":1738897297138,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{"part":"2025-01-02"},"size":437,"stats":"{\"numRecords\":1}"}}
{"remove":{"path":"part=2025-01-02/part-00002-a6d9f5d1-4c7e-41f4-b5b8-4d7b7c59d283.c000.snappy.parquet","deletionTimestamp":1738897297138,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{"part":"2025-01-02"},"size":437,"stats":"{\"numRecords\":1}"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"commitInfo":{"timestamp":1738897338914,"operation":"DELETE","operationParameters":{"predicate":"[\"v#1771 IN (2,3)\"]"},"readVersion":2,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"0","numRemovedBytes":"0","numCopiedRows":"0","numDeletionVectorsAdded":"2","numDeletionVectorsRemoved":"0","numAddedChangeFiles":"0","executionTimeMs":"2799","numDeletionVectorsUpdated":"0","numDeletedRows":"2","scanTimeMs":"0","numAddedFiles":"0","numAddedBytes":"0","rewriteTimeMs":"0"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.3.0","txnId":"0814ad98-ba9f-4619-bc03-3585c4a2e0f3"}}
{"add":{"path":"part=2025-01-02/part-00000-088e57c5-0719-4c6e-9f74-2200eefbf737.c000.snappy.parquet","partitionValues":{"part":"2025-01-02"},"size":436,"modificationTime":1738897298000,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"v\":\"3\"},\"maxValues\":{\"v\":\"4\"},\"nullCount\":{\"v\":0},\"tightBounds\":false}","deletionVector":{"storageType":"u","pathOrInlineDv":"TL+sAzJ^ESZS05vUA%oo","offset":1,"sizeInBytes":34,"cardinality":1}}}
{"add":{"path":"part=2025-01-01/part-00000-b402fe5d-d2d9-461e-845c-67519cc5e5b7.c000.snappy.parquet","partitionValues":{"part":"2025-01-01"},"size":436,"modificationTime":1738897298000,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"v\":\"1\"},\"maxValues\":{\"v\":\"2\"},\"nullCount\":{\"v\":0},\"tightBounds\":false}","deletionVector":{"storageType":"u","pathOrInlineDv":"TL+sAzJ^ESZS05vUA%oo","offset":43,"sizeInBytes":34,"cardinality":1}}}
{"remove":{"path":"part=2025-01-02/part-00000-088e57c5-0719-4c6e-9f74-2200eefbf737.c000.snappy.parquet","deletionTimestamp":1738897338470,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"part":"2025-01-02"},"size":436,"stats":"{\"numRecords\":2}"}}
{"remove":{"path":"part=2025-01-01/part-00000-b402fe5d-d2d9-461e-845c-67519cc5e5b7.c000.snappy.parquet","deletionTimestamp":1738897338470,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"part":"2025-01-01"},"size":436,"stats":"{\"numRecords\":2}"}}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 5286b97

Please sign in to comment.