Add parquet_ignore_statistics session property to iceberg and delta #20228

Merged — 4 commits, Jan 2, 2024
5 changes: 5 additions & 0 deletions docs/src/main/sphinx/connector/object-storage-file-formats.md
@@ -91,6 +91,11 @@ with Parquet files performed by supported object storage connectors:
catalog session property is `parquet_use_column_index`. Only supported by
the Delta Lake and Hive connectors.
- `true`
* - `parquet.ignore-statistics`
- Ignore statistics from Parquet to allow querying files with corrupted or
incorrect statistics. The equivalent catalog session property is
`parquet_ignore_statistics`.
- `false`
* - `parquet.max-read-block-row-count`
- Sets the maximum number of rows read in a batch. The equivalent catalog
session property is named `parquet_max_read_block_row_count` and supported
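For orientation, a minimal usage sketch of the new option, assuming the test-framework context used elsewhere in this PR (getSession() and getQueryRunner() from AbstractTestQueryFramework); the catalog name "delta" and the table/predicate are illustrative assumptions only:

// Hypothetical sketch: enable the new reader option for a single session.
Session ignoreStats = Session.builder(getSession())
        .setCatalogSessionProperty("delta", "parquet_ignore_statistics", "true")
        .build();
// With statistics ignored, min/max-based row-group pruning is skipped, so files
// whose Parquet footers carry corrupted statistics still return correct results.
getQueryRunner().execute(ignoreStats, "SELECT * FROM customer WHERE custkey = 1450");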
@@ -86,6 +86,7 @@
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetMaxReadBlockRowCount;
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetMaxReadBlockSize;
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetSmallFileThreshold;
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.isParquetIgnoreStatistics;
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.isParquetUseColumnIndex;
import static io.trino.plugin.deltalake.delete.DeletionVectors.readDeletionVectors;
import static io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.extractSchema;
@@ -204,7 +205,8 @@ public ConnectorPageSource createPageSource(
ParquetReaderOptions options = parquetReaderOptions.withMaxReadBlockSize(getParquetMaxReadBlockSize(session))
.withMaxReadBlockRowCount(getParquetMaxReadBlockRowCount(session))
.withSmallFileThreshold(getParquetSmallFileThreshold(session))
.withUseColumnIndex(isParquetUseColumnIndex(session));
.withUseColumnIndex(isParquetUseColumnIndex(session))
.withIgnoreStatistics(isParquetIgnoreStatistics(session));

Map<Integer, String> parquetFieldIdToName = columnMappingMode == ColumnMappingMode.ID ? loadParquetIdAndNameMapping(inputFile, options) : ImmutableMap.of();

@@ -56,6 +56,7 @@ public final class DeltaLakeSessionProperties
private static final String PARQUET_MAX_READ_BLOCK_ROW_COUNT = "parquet_max_read_block_row_count";
private static final String PARQUET_SMALL_FILE_THRESHOLD = "parquet_small_file_threshold";
private static final String PARQUET_USE_COLUMN_INDEX = "parquet_use_column_index";
private static final String PARQUET_IGNORE_STATISTICS = "parquet_ignore_statistics";
private static final String PARQUET_WRITER_BLOCK_SIZE = "parquet_writer_block_size";
private static final String PARQUET_WRITER_PAGE_SIZE = "parquet_writer_page_size";
private static final String TARGET_MAX_FILE_SIZE = "target_max_file_size";
@@ -131,6 +132,11 @@ public DeltaLakeSessionProperties(
"Use Parquet column index",
parquetReaderConfig.isUseColumnIndex(),
false),
booleanProperty(
PARQUET_IGNORE_STATISTICS,
"Ignore statistics from Parquet to allow querying files with corrupted or incorrect statistics",
parquetReaderConfig.isIgnoreStatistics(),
false),
dataSizeProperty(
PARQUET_WRITER_BLOCK_SIZE,
"Parquet: Writer block size",
@@ -257,6 +263,11 @@ public static boolean isParquetUseColumnIndex(ConnectorSession session)
return session.getProperty(PARQUET_USE_COLUMN_INDEX, Boolean.class);
}

public static boolean isParquetIgnoreStatistics(ConnectorSession session)
{
return session.getProperty(PARQUET_IGNORE_STATISTICS, Boolean.class);
}

public static DataSize getParquetWriterBlockSize(ConnectorSession session)
{
return session.getProperty(PARQUET_WRITER_BLOCK_SIZE, DataSize.class);
@@ -47,6 +47,7 @@
import static io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR;
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetMaxReadBlockRowCount;
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetMaxReadBlockSize;
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.isParquetIgnoreStatistics;
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.isParquetUseColumnIndex;
import static io.trino.plugin.deltalake.functions.tablechanges.TableChangesFileType.CDF_FILE;
import static io.trino.spi.function.table.TableFunctionProcessorState.Finished.FINISHED;
@@ -175,7 +176,8 @@ private static DeltaLakePageSource createDeltaLakePageSource(
parquetReaderOptions = parquetReaderOptions
.withMaxReadBlockSize(getParquetMaxReadBlockSize(session))
.withMaxReadBlockRowCount(getParquetMaxReadBlockRowCount(session))
.withUseColumnIndex(isParquetUseColumnIndex(session));
.withUseColumnIndex(isParquetUseColumnIndex(session))
.withIgnoreStatistics(isParquetIgnoreStatistics(session));

List<DeltaLakeColumnHandle> splitColumns = switch (split.fileType()) {
case CDF_FILE -> ImmutableList.<DeltaLakeColumnHandle>builder().addAll(handle.columns())
@@ -15,21 +15,27 @@

import com.google.common.collect.ContiguousSet;
import com.google.common.collect.ImmutableMap;
import io.trino.Session;
import io.trino.operator.OperatorStats;
import io.trino.plugin.hive.containers.HiveMinioDataLake;
import io.trino.spi.QueryId;
import io.trino.testing.AbstractTestQueryFramework;
import io.trino.testing.DistributedQueryRunner;
import io.trino.testing.MaterializedResult;
import io.trino.testing.MaterializedResultWithQueryId;
import io.trino.testing.MaterializedRow;
import io.trino.testing.QueryRunner;
import org.intellij.lang.annotations.Language;
import org.junit.jupiter.api.Test;

import java.nio.file.Path;
import java.util.OptionalLong;
import java.util.Set;

import static com.google.common.collect.MoreCollectors.onlyElement;
import static io.trino.plugin.deltalake.DeltaLakeQueryRunner.DELTA_CATALOG;
import static io.trino.plugin.deltalake.DeltaLakeQueryRunner.createS3DeltaLakeQueryRunner;
import static io.trino.testing.QueryAssertions.assertEqualsIgnoreOrder;
import static io.trino.testing.TestingNames.randomNameSuffix;
import static java.lang.String.format;
import static org.assertj.core.api.Assertions.assertThat;
@@ -130,6 +136,42 @@ public void testUpdatePushdown()
table));
}

@Test
public void testIgnoreParquetStatistics()
{
String table = testTable.register("ignore_parquet_statistics");
@Language("SQL") String query = "SELECT * FROM " + table + " WHERE custkey = 1450";

DistributedQueryRunner queryRunner = getDistributedQueryRunner();
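// Run the same point lookup twice: first with parquet_ignore_statistics=true (no
// row-group pruning from Parquet statistics), then with the default settings.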
MaterializedResultWithQueryId resultWithoutParquetStatistics = queryRunner.executeWithQueryId(
Session.builder(getSession())
.setCatalogSessionProperty(getSession().getCatalog().orElseThrow(), "parquet_ignore_statistics", "true")
.build(),
query);
OperatorStats queryStatsWithoutParquetStatistics = getOperatorStats(resultWithoutParquetStatistics.getQueryId());
assertThat(queryStatsWithoutParquetStatistics.getPhysicalInputPositions()).isGreaterThan(0);

MaterializedResultWithQueryId resultWithParquetStatistics = queryRunner.executeWithQueryId(getSession(), query);
OperatorStats queryStatsWithParquetStatistics = getOperatorStats(resultWithParquetStatistics.getQueryId());
assertThat(queryStatsWithParquetStatistics.getPhysicalInputPositions()).isGreaterThan(0);
assertThat(queryStatsWithParquetStatistics.getPhysicalInputPositions())
.isLessThan(queryStatsWithoutParquetStatistics.getPhysicalInputPositions());

assertEqualsIgnoreOrder(resultWithParquetStatistics.getResult(), resultWithoutParquetStatistics.getResult());
}

private OperatorStats getOperatorStats(QueryId queryId)
{
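// Physical input positions on the scan operator count the rows actually read
// from the data files, which is what the pruning assertions above compare.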
return getDistributedQueryRunner().getCoordinator()
.getQueryManager()
.getFullQueryInfo(queryId)
.getQueryStats()
.getOperatorSummaries()
.stream()
.filter(summary -> summary.getOperatorType().startsWith("TableScan") || summary.getOperatorType().startsWith("Scan"))
.collect(onlyElement());
}

/**
* Assert on the number of rows read and updated by a read operation
*
@@ -37,13 +37,11 @@ public class ParquetReaderConfig

private ParquetReaderOptions options = new ParquetReaderOptions();

@Deprecated
Review comment (Contributor): Was it intentional that the session property parquet_ignore_statistics was introduced only for Hive, but not for Iceberg (the Delta Lake connector didn't exist at the time), in 1981cef? cc @findepi

Reply (Member): It was originally added to Trino due to a bug that caused us to ingest corrupted statistics.

public boolean isIgnoreStatistics()
{
return options.isIgnoreStatistics();
}

@Deprecated
@Config("parquet.ignore-statistics")
@ConfigDescription("Ignore statistics from Parquet to allow querying files with corrupted or incorrect statistics")
public ParquetReaderConfig setIgnoreStatistics(boolean ignoreStatistics)
@@ -167,6 +167,7 @@
import static io.trino.plugin.iceberg.IcebergSessionProperties.getParquetSmallFileThreshold;
import static io.trino.plugin.iceberg.IcebergSessionProperties.isOrcBloomFiltersEnabled;
import static io.trino.plugin.iceberg.IcebergSessionProperties.isOrcNestedLazy;
import static io.trino.plugin.iceberg.IcebergSessionProperties.isParquetIgnoreStatistics;
import static io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata;
import static io.trino.plugin.iceberg.IcebergSessionProperties.useParquetBloomFilter;
import static io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD;
@@ -611,6 +612,7 @@ public ReaderPageSourceWithRowPositions createDataPageSource(
.withMaxReadBlockSize(getParquetMaxReadBlockSize(session))
.withMaxReadBlockRowCount(getParquetMaxReadBlockRowCount(session))
.withSmallFileThreshold(getParquetSmallFileThreshold(session))
.withIgnoreStatistics(isParquetIgnoreStatistics(session))
.withBloomFilter(useParquetBloomFilter(session))
// TODO https://github.com/trinodb/trino/issues/11000
.withUseColumnIndex(false),
@@ -984,7 +986,7 @@ private static ReaderPageSourceWithRowPositions createParquetPageSource(

MessageType requestedSchema = getMessageType(regularColumns, fileSchema.getName(), parquetIdToField);
Map<List<String>, ColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
TupleDomain<ColumnDescriptor> parquetTupleDomain = options.isIgnoreStatistics() ? TupleDomain.all() : getParquetTupleDomain(descriptorsByPath, effectivePredicate);
TupleDomainParquetPredicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, UTC);

List<RowGroupInfo> rowGroups = getFilteredRowGroups(
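The hunk above is the behavioral core of the Iceberg change: when statistics are ignored, the connector-side predicate is widened to TupleDomain.all() before row-group filtering. A simplified sketch of the effect, reusing the names from that hunk (not the exact Trino call sites):

// Sketch only: an unconstrained domain keeps every row group, so correctness no
// longer depends on the min/max values stored in the Parquet footer.
TupleDomain<ColumnDescriptor> parquetTupleDomain = options.isIgnoreStatistics()
        ? TupleDomain.all()                                             // constrains nothing -> nothing is pruned
        : getParquetTupleDomain(descriptorsByPath, effectivePredicate); // prune row groups via footer statistics
// getFilteredRowGroups(...) then drops a row group only when the domain proves its
// statistics cannot match; with TupleDomain.all() every row group is read and the
// WHERE clause is evaluated on the decoded rows instead.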
@@ -76,6 +76,7 @@ public final class IcebergSessionProperties
private static final String PARQUET_USE_BLOOM_FILTER = "parquet_use_bloom_filter";
private static final String PARQUET_MAX_READ_BLOCK_ROW_COUNT = "parquet_max_read_block_row_count";
private static final String PARQUET_SMALL_FILE_THRESHOLD = "parquet_small_file_threshold";
private static final String PARQUET_IGNORE_STATISTICS = "parquet_ignore_statistics";
private static final String PARQUET_WRITER_BLOCK_SIZE = "parquet_writer_block_size";
private static final String PARQUET_WRITER_PAGE_SIZE = "parquet_writer_page_size";
private static final String PARQUET_WRITER_BATCH_SIZE = "parquet_writer_batch_size";
@@ -235,6 +236,11 @@ public IcebergSessionProperties(
parquetReaderConfig.getSmallFileThreshold(),
value -> validateMaxDataSize(PARQUET_SMALL_FILE_THRESHOLD, value, DataSize.valueOf(PARQUET_READER_MAX_SMALL_FILE_THRESHOLD)),
false))
.add(booleanProperty(
PARQUET_IGNORE_STATISTICS,
"Ignore statistics from Parquet to allow querying files with corrupted or incorrect statistics",
parquetReaderConfig.isIgnoreStatistics(),
false))
.add(dataSizeProperty(
PARQUET_WRITER_BLOCK_SIZE,
"Parquet: Writer block size",
@@ -448,6 +454,11 @@ public static DataSize getParquetSmallFileThreshold(ConnectorSession session)
return session.getProperty(PARQUET_SMALL_FILE_THRESHOLD, DataSize.class);
}

public static boolean isParquetIgnoreStatistics(ConnectorSession session)
{
return session.getProperty(PARQUET_IGNORE_STATISTICS, Boolean.class);
}

public static DataSize getParquetWriterPageSize(ConnectorSession session)
{
return session.getProperty(PARQUET_WRITER_PAGE_SIZE, DataSize.class);
@@ -4776,7 +4776,7 @@ private void verifySplitCount(QueryId queryId, long expectedSplitCount)
}
}

private OperatorStats getOperatorStats(QueryId queryId)
protected OperatorStats getOperatorStats(QueryId queryId)
{
try {
return getDistributedQueryRunner().getCoordinator()
@@ -13,9 +13,14 @@
*/
package io.trino.plugin.iceberg;

import io.trino.Session;
import io.trino.filesystem.Location;
import io.trino.operator.OperatorStats;
import io.trino.testing.DistributedQueryRunner;
import io.trino.testing.MaterializedResult;
import io.trino.testing.MaterializedResultWithQueryId;
import io.trino.testing.sql.TestTable;
import org.intellij.lang.annotations.Language;
import org.junit.jupiter.api.Test;

import java.util.Optional;
@@ -25,6 +30,7 @@
import static io.trino.plugin.iceberg.IcebergFileFormat.PARQUET;
import static io.trino.plugin.iceberg.IcebergTestUtils.checkParquetFileSorting;
import static io.trino.plugin.iceberg.IcebergTestUtils.withSmallRowGroups;
import static io.trino.testing.QueryAssertions.assertEqualsIgnoreOrder;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;

@@ -91,6 +97,39 @@ public void testDropAmbiguousRowFieldCaseSensitivity()
.hasStackTraceContaining("Multiple entries with same key");
}

@Test
public void testIgnoreParquetStatistics()
{
try (TestTable table = new TestTable(
getQueryRunner()::execute,
"test_ignore_parquet_statistics",
"WITH (sorted_by = ARRAY['custkey']) AS TABLE tpch.tiny.customer WITH NO DATA")) {
assertUpdate(
withSmallRowGroups(getSession()),
"INSERT INTO " + table.getName() + " TABLE tpch.tiny.customer",
"VALUES 1500");

@Language("SQL") String query = "SELECT * FROM " + table.getName() + " WHERE custkey = 100";

DistributedQueryRunner queryRunner = getDistributedQueryRunner();
MaterializedResultWithQueryId resultWithoutParquetStatistics = queryRunner.executeWithQueryId(
Session.builder(getSession())
.setCatalogSessionProperty(getSession().getCatalog().orElseThrow(), "parquet_ignore_statistics", "true")
.build(),
query);
OperatorStats queryStatsWithoutParquetStatistics = getOperatorStats(resultWithoutParquetStatistics.getQueryId());
assertThat(queryStatsWithoutParquetStatistics.getPhysicalInputPositions()).isGreaterThan(0);

MaterializedResultWithQueryId resultWithParquetStatistics = queryRunner.executeWithQueryId(getSession(), query);
OperatorStats queryStatsWithParquetStatistics = getOperatorStats(resultWithParquetStatistics.getQueryId());
assertThat(queryStatsWithParquetStatistics.getPhysicalInputPositions()).isGreaterThan(0);
assertThat(queryStatsWithParquetStatistics.getPhysicalInputPositions())
.isLessThan(queryStatsWithoutParquetStatistics.getPhysicalInputPositions());

assertEqualsIgnoreOrder(resultWithParquetStatistics.getResult(), resultWithoutParquetStatistics.getResult());
}
}

@Override
protected boolean isFileSorted(String path, String sortColumnName)
{