Skip to content

Commit

Permalink
Fix incorrect result when reading deletion vectors in Delta Lake
Browse files Browse the repository at this point in the history
This issue happens when Parquet file contains several pages and
predicates filter page when parquet_use_column_index is set to true.
  • Loading branch information
ebyhr committed Jan 10, 2025
1 parent a50629f commit e208752
Show file tree
Hide file tree
Showing 10 changed files with 54 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ public ConnectorPageSource createPageSource(
ParquetReaderOptions options = parquetReaderOptions.withMaxReadBlockSize(getParquetMaxReadBlockSize(session))
.withMaxReadBlockRowCount(getParquetMaxReadBlockRowCount(session))
.withSmallFileThreshold(getParquetSmallFileThreshold(session))
.withUseColumnIndex(isParquetUseColumnIndex(session))
.withUseColumnIndex(split.getDeletionVector().isEmpty() && isParquetUseColumnIndex(session))
.withIgnoreStatistics(isParquetIgnoreStatistics(session))
.withVectorizedDecodingEnabled(isParquetVectorizedDecodingEnabled(session));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1402,6 +1402,35 @@ void testDeletionVectorsRepeat()
}
}

@Test
void testDeletionVectorsPages()
throws Exception
{
testDeletionVectorsPages(true);
testDeletionVectorsPages(false);
}

private void testDeletionVectorsPages(boolean parquetUseColumnIndex)
throws Exception
{
String tableName = "deletion_vectors_pages" + randomNameSuffix();
Path tableLocation = catalogDir.resolve(tableName);
copyDirectoryContents(new File(Resources.getResource("deltalake/deletion_vector_pages").toURI()).toPath(), tableLocation);
assertUpdate("CALL system.register_table('%s', '%s', '%s')".formatted(getSession().getSchema().orElseThrow(), tableName, tableLocation.toUri()));

Session session = Session.builder(getSession())
.setCatalogSessionProperty("delta", "parquet_use_column_index", Boolean.toString(parquetUseColumnIndex))
.build();

assertQueryReturnsEmptyResult(session, "SELECT * FROM " + tableName + " WHERE id = 20001");
assertThat(query(session, "SELECT * FROM " + tableName + " WHERE id = 99999")).matches("VALUES 99999");

assertThat(query(session, "SELECT id, _change_type, _commit_version FROM TABLE(system.table_changes('tpch', '" +tableName+ "')) WHERE id = 20001"))
.matches("VALUES (20001, VARCHAR 'insert', BIGINT '1'), (20001, VARCHAR 'update_preimage', BIGINT '2')");

assertUpdate("DROP TABLE " + tableName);
}

@Test
public void testUnsupportedVacuumDeletionVectors()
throws Exception
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Data generated using Delta Lake 3.2.0:

```sql
CREATE TABLE test_deletion_vector_pages
(id INT)
USING delta
LOCATION 's3://test-bucket/test_deletion_vector_pages3'
TBLPROPERTIES ('delta.enableDeletionVectors' = 'true', 'delta.enableChangeDataFeed' = 'true');

-- Write 2 pages and update a row in the second page
-- 20000 is the default size of DEFAULT_PAGE_ROW_COUNT_LIMIT
INSERT INTO test_deletion_vector_pages SELECT explode(sequence(1, 20001));
UPDATE test_deletion_vector_pages SET id = 99999 WHERE id = 20001;
```
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"commitInfo":{"timestamp":1736468279775,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"false","properties":"{\"delta.enableChangeDataFeed\":\"true\",\"delta.enableDeletionVectors\":\"true\"}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.2.0","txnId":"0a1b0291-0ee3-4283-a3b9-17c6ebf506cf"}}
{"metaData":{"id":"ec0e9b1d-2340-402e-a5a3-9cee9f4fbfb4","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableChangeDataFeed":"true","delta.enableDeletionVectors":"true"},"createdTime":1736468279750}}
{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","changeDataFeed"]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"commitInfo":{"timestamp":1736468280380,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"20001","numOutputBytes":"80540"},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.2.0","txnId":"72c9962f-8a24-4877-bee3-5259ac43b66b"}}
{"add":{"path":"part-00000-8e78760d-6337-4ce4-a7b8-3ce3e7c5be44-c000.snappy.parquet","partitionValues":{},"size":80540,"modificationTime":1736468280000,"dataChange":true,"stats":"{\"numRecords\":20001,\"minValues\":{\"id\":1},\"maxValues\":{\"id\":20001},\"nullCount\":{\"id\":0},\"tightBounds\":true}"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"commitInfo":{"timestamp":1736468282105,"operation":"UPDATE","operationParameters":{"predicate":"[\"(id#3023 = 20001)\"]"},"readVersion":1,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"1","numRemovedBytes":"773","numCopiedRows":"0","numDeletionVectorsAdded":"1","numDeletionVectorsRemoved":"0","numAddedChangeFiles":"1","executionTimeMs":"1544","numDeletionVectorsUpdated":"0","scanTimeMs":"1023","numAddedFiles":"1","numUpdatedRows":"1","numAddedBytes":"669","rewriteTimeMs":"243"},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.2.0","txnId":"bc1f9b9d-3d04-4cf2-a8d1-b6b75e922f6c"}}
{"add":{"path":"part-00000-8e78760d-6337-4ce4-a7b8-3ce3e7c5be44-c000.snappy.parquet","partitionValues":{},"size":80540,"modificationTime":1736468280000,"dataChange":true,"stats":"{\"numRecords\":20001,\"minValues\":{\"id\":1},\"maxValues\":{\"id\":20001},\"nullCount\":{\"id\":0},\"tightBounds\":false}","deletionVector":{"storageType":"u","pathOrInlineDv":"hR{.8.y4^dHj2[P[Sd0J","offset":1,"sizeInBytes":34,"cardinality":1}}}
{"remove":{"path":"part-00000-8e78760d-6337-4ce4-a7b8-3ce3e7c5be44-c000.snappy.parquet","deletionTimestamp":1736468281584,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":80540,"stats":"{\"numRecords\":20001}"}}
{"add":{"path":"part-00000-2007aafe-e829-46a6-b6cf-97f6ec686718.c000.snappy.parquet","partitionValues":{},"size":669,"modificationTime":1736468282000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":99999},\"maxValues\":{\"id\":99999},\"nullCount\":{\"id\":0},\"tightBounds\":true}"}}
{"cdc":{"path":"_change_data/cdc-00000-8e47b370-7c53-4609-9432-1bbd8d71463a.c000.snappy.parquet","partitionValues":{},"size":773,"dataChange":false}}
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit e208752

Please sign in to comment.