Skip to content

Commit

Permalink
Add min, max, and allowed_values column properties in Faker connector
Browse files Browse the repository at this point in the history
Allow constraining generated values by setting the min, max, or
allowed_values column properties.
  • Loading branch information
nineinchnick committed Dec 26, 2024
1 parent 6ff4cad commit 71e855e
Show file tree
Hide file tree
Showing 9 changed files with 932 additions and 49 deletions.
58 changes: 39 additions & 19 deletions docs/src/main/sphinx/connector/faker.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,15 @@ The following table details all supported column properties.
sentence from the
[Lorem](https://javadoc.io/doc/net.datafaker/datafaker/latest/net/datafaker/providers/base/Lorem.html)
provider.
* - `min`
- Minimum generated value (inclusive). Cannot be set for character-based type
columns.
* - `max`
- Maximum generated value (inclusive). Cannot be set for character-based type
columns.
* - `allowed_values`
- List of allowed values. Cannot be set together with the `min`, `max`, or
`generator` properties.
:::

### Character types
Expand Down Expand Up @@ -166,7 +175,7 @@ Faker supports the following non-character types:
- `UUID`

You cannot use generator expressions for non-character-based columns. To limit
their data range, specify constraints in the `WHERE` clause - see
their data range, set the `min` and/or `max` column properties - see
[](faker-usage).

### Unsupported types
Expand All @@ -184,12 +193,11 @@ can be combined, like in the following example:
```sql
CREATE TABLE faker.default.prices (
currency VARCHAR NOT NULL WITH (generator = '#{Currency.code}'),
price DECIMAL(8,2) NOT NULL
price DECIMAL(8,2) NOT NULL WITH (min = '0')
);

SELECT JSON_OBJECT(KEY currency VALUE price) AS complex
FROM faker.default.prices
WHERE price > 0
LIMIT 3;
```

Expand Down Expand Up @@ -260,37 +268,49 @@ CREATE TABLE generator.default.customer (LIKE production.public.customer EXCLUDI

Insert random data into the original table, by selecting it from the
`generator` catalog. Data generated by the Faker connector for columns of
non-character types cover the whole range of that data type. Add constraints to
adjust the data as desired. The following example ensures that date of birth
and age in years are related and realistic values.
non-character types cover the whole range of that data type. Set the `min` and
`max` column properties to adjust the generated data as desired. The following
example ensures that date of birth and age in years are related and realistic
values.

Start with getting the complete definition of
a table:

```sql
SHOW CREATE TABLE production.public.customers;
```

Modify the output of the previous query and add some column properties.

```sql
CREATE TABLE generator.default.customer (
id UUID NOT NULL,
name VARCHAR NOT NULL,
address VARCHAR NOT NULL,
born_at DATE WITH (min = '1900-01-01', max = '2025-01-01'),
age_years INTEGER WITH (min = '0', max = '150'),
group_id INTEGER WITH (allowed_values = ARRAY['10', '32', '81'])
);
```

```sql
INSERT INTO production.public.customers
SELECT *
FROM generator.default.customer
WHERE
born_at BETWEEN CURRENT_DATE - INTERVAL '150' YEAR AND CURRENT_DATE
AND age_years BETWEEN 0 AND 150
LIMIT 100;
```

To generate even more realistic data, choose specific generators by setting the
`generator` property on columns. Start with getting the complete definition of
a table:

```sql
SHOW CREATE TABLE production.public.customers;
```

Modify the output of the previous query and add some column properties.
`generator` property on columns.

```sql
CREATE TABLE generator.default.customer (
id UUID NOT NULL,
name VARCHAR NOT NULL WITH (generator = '#{Name.first_name} #{Name.last_name}'),
address VARCHAR NOT NULL WITH (generator = '#{Address.fullAddress}'),
born_at DATE,
age_years INTEGER
born_at DATE WITH (min = '1900-01-01', max = '2025-01-01'),
age_years INTEGER WITH (min = '0', max = '150'),
group_id INTEGER WITH (allowed_values = ARRAY['10', '32', '81'])
);
```

Expand Down
5 changes: 5 additions & 0 deletions plugin/trino-faker/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@
<artifactId>trino-main</artifactId>
</dependency>

<dependency>
<groupId>io.trino</groupId>
<artifactId>trino-parser</artifactId>
</dependency>

<dependency>
<groupId>io.trino</groupId>
<artifactId>trino-plugin-toolkit</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ public record ColumnInfo(FakerColumnHandle handle, ColumnMetadata metadata)
{
public static final String NULL_PROBABILITY_PROPERTY = "null_probability";
public static final String GENERATOR_PROPERTY = "generator";
public static final String MIN_PROPERTY = "min";
public static final String MAX_PROPERTY = "max";
public static final String ALLOWED_VALUES_PROPERTY = "allowed_values";

public ColumnInfo
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,127 @@

package io.trino.plugin.faker;

import com.google.common.collect.ImmutableList;
import io.trino.spi.TrinoException;
import io.trino.spi.connector.ColumnHandle;
import io.trino.spi.connector.ColumnMetadata;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.Range;
import io.trino.spi.predicate.ValueSet;
import io.trino.spi.type.CharType;
import io.trino.spi.type.Type;
import io.trino.spi.type.VarbinaryType;
import io.trino.spi.type.VarcharType;

import java.util.Collection;
import java.util.List;

import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.trino.plugin.faker.ColumnInfo.ALLOWED_VALUES_PROPERTY;
import static io.trino.plugin.faker.ColumnInfo.GENERATOR_PROPERTY;
import static io.trino.plugin.faker.ColumnInfo.MAX_PROPERTY;
import static io.trino.plugin.faker.ColumnInfo.MIN_PROPERTY;
import static io.trino.plugin.faker.ColumnInfo.NULL_PROBABILITY_PROPERTY;
import static io.trino.spi.StandardErrorCode.INVALID_COLUMN_PROPERTY;
import static java.util.Objects.requireNonNull;

public record FakerColumnHandle(
int columnIndex,
String name,
Type type,
double nullProbability,
String generator)
String generator,
Domain domain)
implements ColumnHandle
{
/**
 * Validates the mandatory components of the handle.
 * <p>
 * {@code generator} is deliberately not checked: {@link #of} passes the raw
 * result of the column property lookup, which is {@code null} when the
 * property is not set.
 */
public FakerColumnHandle
{
    requireNonNull(name, "name is null");
    requireNonNull(type, "type is null");
    // Every visible construction site supplies a domain (Domain.all(...) when unconstrained),
    // so a null here is a programming error and should fail at construction time.
    requireNonNull(domain, "domain is null");
}

/**
 * Builds a column handle from column metadata, validating the Faker column properties.
 * <p>
 * The {@code min}, {@code max}, and {@code allowed_values} properties are turned into the
 * handle's {@link Domain}; when none of them is set the domain covers the whole type.
 *
 * @param columnId index of the column, stored as {@code columnIndex}
 * @param column column metadata whose type and properties are read
 * @param defaultNullProbability null probability used for nullable columns that do not set
 *         the {@code null_probability} property; non-nullable columns always get 0
 * @return the validated column handle
 * @throws TrinoException with {@code INVALID_COLUMN_PROPERTY} when a property is not applicable
 *         to the column type, conflicts with another property, or is not a valid literal
 */
public static FakerColumnHandle of(int columnId, ColumnMetadata column, double defaultNullProbability)
{
    Type type = column.getType();
    boolean characterColumn = isCharacterColumn(column);

    double nullProbability = 0;
    if (column.isNullable()) {
        nullProbability = (double) column.getProperties().getOrDefault(NULL_PROBABILITY_PROPERTY, defaultNullProbability);
    }

    String generator = (String) column.getProperties().get(GENERATOR_PROPERTY);
    if (generator != null && !characterColumn) {
        throw new TrinoException(INVALID_COLUMN_PROPERTY, "The `%s` property can only be set for CHAR, VARCHAR or VARBINARY columns".formatted(GENERATOR_PROPERTY));
    }

    String min = (String) column.getProperties().get(MIN_PROPERTY);
    String max = (String) column.getProperties().get(MAX_PROPERTY);
    boolean bounded = min != null || max != null;
    // Reject character columns before looking at the literals, so the user is told the
    // properties are not applicable rather than that a literal is malformed.
    if (bounded && characterColumn) {
        throw new TrinoException(INVALID_COLUMN_PROPERTY, "The `%s` and `%s` properties cannot be set for CHAR, VARCHAR or VARBINARY columns".formatted(MIN_PROPERTY, MAX_PROPERTY));
    }
    // min and max are kept as strings and only parsed here to validate them - FakerColumnHandle
    // needs to be serializable, and some internal Trino types are not
    // (Int128, LongTimestamp, LongTimestampWithTimeZone)
    validateLiteral(MIN_PROPERTY, min, type);
    validateLiteral(MAX_PROPERTY, max, type);

    Domain domain = Domain.all(type);
    if (bounded) {
        domain = boundedDomain(type, min, max);
    }

    // A null value is treated the same as the property not being set
    List<?> allowedValues = (List<?>) column.getProperties().get(ALLOWED_VALUES_PROPERTY);
    if (allowedValues != null) {
        if (bounded || generator != null) {
            throw new TrinoException(INVALID_COLUMN_PROPERTY, "The `%s` property cannot be set together with `%s`, `%s`, and `%s` properties".formatted(ALLOWED_VALUES_PROPERTY, MIN_PROPERTY, MAX_PROPERTY, GENERATOR_PROPERTY));
        }
        ImmutableList.Builder<Object> builder = ImmutableList.builder();
        for (String value : strings(allowedValues)) {
            try {
                builder.add(Literal.parse(value, type));
            }
            catch (IllegalArgumentException | ClassCastException e) {
                throw new TrinoException(INVALID_COLUMN_PROPERTY, "The `%s` property must only contain valid %s literals, failed to parse `%s`".formatted(ALLOWED_VALUES_PROPERTY, type.getDisplayName(), value), e);
            }
        }
        domain = Domain.create(ValueSet.copyOf(type, builder.build()), false);
    }

    return new FakerColumnHandle(
            columnId,
            column.getName(),
            type,
            nullProbability,
            generator,
            domain);
}

// Checks that a set min/max property parses as a literal of the column type; unset properties are skipped.
private static void validateLiteral(String property, String value, Type type)
{
    if (value == null) {
        return;
    }
    try {
        Literal.parse(value, type);
    }
    catch (IllegalArgumentException e) {
        throw new TrinoException(INVALID_COLUMN_PROPERTY, "The `%s` property must be a valid %s literal".formatted(property, type.getDisplayName()), e);
    }
}

// Builds the non-nullable domain for the inclusive [min, max] range, reporting rejected bounds as a property error.
private static Domain boundedDomain(Type type, String min, String max)
{
    try {
        return Domain.create(ValueSet.ofRanges(range(type, min, max)), false);
    }
    catch (IllegalArgumentException e) {
        // NOTE(review): presumably raised when min is greater than max, or when the type
        // cannot be used in a range - confirm against Range/ValueSet
        throw new TrinoException(INVALID_COLUMN_PROPERTY, "The `%s` and `%s` properties must define a valid %s range".formatted(MIN_PROPERTY, MAX_PROPERTY, type.getDisplayName()), e);
    }
}

// Tells whether the column's type is one of CHAR, VARCHAR or VARBINARY.
private static boolean isCharacterColumn(ColumnMetadata column)
{
    Type columnType = column.getType();
    if (columnType instanceof CharType) {
        return true;
    }
    if (columnType instanceof VarcharType) {
        return true;
    }
    return columnType instanceof VarbinaryType;
}

// Builds a range over the given type with inclusive bounds; a null bound leaves that side open,
// and two null bounds yield the unbounded range.
private static Range range(Type type, String min, String max)
{
    requireNonNull(type, "type is null");
    boolean hasLowerBound = min != null;
    boolean hasUpperBound = max != null;

    if (hasLowerBound && hasUpperBound) {
        return Range.range(type, Literal.parse(min, type), true, Literal.parse(max, type), true);
    }
    if (hasLowerBound) {
        return Range.greaterThanOrEqual(type, Literal.parse(min, type));
    }
    if (hasUpperBound) {
        return Range.lessThanOrEqual(type, Literal.parse(max, type));
    }
    return Range.all(type);
}

// Copies the values into an immutable list, casting every element to String.
private static List<String> strings(Collection<?> values)
{
    ImmutableList.Builder<String> result = ImmutableList.builder();
    for (Object value : values) {
        result.add(String.class.cast(value));
    }
    return result.build();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,25 @@
import io.trino.spi.function.FunctionProvider;
import io.trino.spi.session.PropertyMetadata;
import io.trino.spi.transaction.IsolationLevel;
import io.trino.spi.type.ArrayType;
import jakarta.inject.Inject;

import java.util.List;
import java.util.Optional;
import java.util.Set;

import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.trino.plugin.faker.ColumnInfo.ALLOWED_VALUES_PROPERTY;
import static io.trino.plugin.faker.ColumnInfo.MAX_PROPERTY;
import static io.trino.plugin.faker.ColumnInfo.MIN_PROPERTY;
import static io.trino.spi.StandardErrorCode.INVALID_COLUMN_PROPERTY;
import static io.trino.spi.StandardErrorCode.INVALID_SCHEMA_PROPERTY;
import static io.trino.spi.StandardErrorCode.INVALID_TABLE_PROPERTY;
import static io.trino.spi.connector.ConnectorCapabilities.NOT_NULL_COLUMN_CONSTRAINT;
import static io.trino.spi.session.PropertyMetadata.doubleProperty;
import static io.trino.spi.session.PropertyMetadata.longProperty;
import static io.trino.spi.session.PropertyMetadata.stringProperty;
import static io.trino.spi.type.VarcharType.VARCHAR;
import static java.util.Objects.requireNonNull;

public class FakerConnector
Expand Down Expand Up @@ -161,7 +167,28 @@ public List<PropertyMetadata<?>> getColumnProperties()
throw new TrinoException(INVALID_COLUMN_PROPERTY, "generator must be a valid Faker expression", e);
}
},
false));
false),
stringProperty(
MIN_PROPERTY,
"Minimum generated value (inclusive)",
null,
false),
stringProperty(
MAX_PROPERTY,
"Maximum generated value (inclusive)",
null,
false),
new PropertyMetadata<>(
ALLOWED_VALUES_PROPERTY,
"List of allowed values",
new ArrayType(VARCHAR),
List.class,
null,
false,
value -> ((List<?>) value).stream()
.map(String.class::cast)
.collect(toImmutableList()),
value -> value));
}

private static void checkProperty(boolean expression, ErrorCodeSupplier errorCode, String errorMessage)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,6 @@
import io.trino.spi.security.TrinoPrincipal;
import io.trino.spi.statistics.ComputedStatistics;
import io.trino.spi.type.BigintType;
import io.trino.spi.type.CharType;
import io.trino.spi.type.VarbinaryType;
import io.trino.spi.type.VarcharType;
import jakarta.inject.Inject;

import java.util.ArrayList;
Expand All @@ -72,7 +69,6 @@
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static com.google.common.collect.Maps.filterKeys;
import static io.trino.spi.StandardErrorCode.ALREADY_EXISTS;
import static io.trino.spi.StandardErrorCode.INVALID_COLUMN_PROPERTY;
import static io.trino.spi.StandardErrorCode.INVALID_COLUMN_REFERENCE;
import static io.trino.spi.StandardErrorCode.NOT_FOUND;
import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED;
Expand Down Expand Up @@ -331,21 +327,8 @@ public synchronized FakerOutputTableHandle beginCreateTable(ConnectorSession ses
int columnId = 0;
for (; columnId < tableMetadata.getColumns().size(); columnId++) {
ColumnMetadata column = tableMetadata.getColumns().get(columnId);
double nullProbability = 0;
if (column.isNullable()) {
nullProbability = (double) column.getProperties().getOrDefault(ColumnInfo.NULL_PROBABILITY_PROPERTY, tableNullProbability);
}
String generator = (String) column.getProperties().get(ColumnInfo.GENERATOR_PROPERTY);
if (generator != null && !isCharacterColumn(column)) {
throw new TrinoException(INVALID_COLUMN_PROPERTY, "The `generator` property can only be set for CHAR, VARCHAR or VARBINARY columns");
}
columns.add(new ColumnInfo(
new FakerColumnHandle(
columnId,
column.getName(),
column.getType(),
nullProbability,
generator),
FakerColumnHandle.of(columnId, column, tableNullProbability),
column));
}

Expand All @@ -355,7 +338,8 @@ public synchronized FakerOutputTableHandle beginCreateTable(ConnectorSession ses
ROW_ID_COLUMN_NAME,
BigintType.BIGINT,
0,
""),
"",
Domain.all(BigintType.BIGINT)),
ColumnMetadata.builder()
.setName(ROW_ID_COLUMN_NAME)
.setType(BigintType.BIGINT)
Expand All @@ -371,11 +355,6 @@ public synchronized FakerOutputTableHandle beginCreateTable(ConnectorSession ses
return new FakerOutputTableHandle(tableName);
}

private boolean isCharacterColumn(ColumnMetadata column)
{
return column.getType() instanceof CharType || column.getType() instanceof VarcharType || column.getType() instanceof VarbinaryType;
}

private synchronized void checkSchemaExists(String schemaName)
{
if (schemas.stream().noneMatch(schema -> schema.name().equals(schemaName))) {
Expand Down
Loading

0 comments on commit 71e855e

Please sign in to comment.