From 9c9a92179292222cc5052e91029dd19c65db6650 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Wa=C5=9B?= Date: Sat, 1 Feb 2025 10:17:31 +0100 Subject: [PATCH] Document using statistics in the Faker connector --- docs/src/main/sphinx/connector/faker.md | 64 ++++++++++++++++++- docs/src/main/sphinx/sql/select.md | 1 + .../io/trino/plugin/faker/FakerConfig.java | 4 +- .../io/trino/plugin/faker/FakerConnector.java | 8 +-- 4 files changed, 70 insertions(+), 7 deletions(-) diff --git a/docs/src/main/sphinx/connector/faker.md b/docs/src/main/sphinx/connector/faker.md index 2a3f01cd9c09..fa09a3a0b875 100644 --- a/docs/src/main/sphinx/connector/faker.md +++ b/docs/src/main/sphinx/connector/faker.md @@ -50,6 +50,13 @@ The following table details all general configuration properties: * - `faker.locale` - Default locale for generating character-based data, specified as a IETF BCP 47 language tag string. Defaults to `en`. +* - `faker.sequence-detection-enabled` + - If true, when creating a table using existing data, columns with the number + of distinct values close to the number of rows are treated as sequences. +* - `faker.dictionary-detection-enabled` + - If true, when creating a table using existing data, columns with a low + number of distinct values are treated as dictionaries, and get + the `allowed_values` column property populated with random values. ::: The following table details all supported schema properties. If they're not @@ -66,6 +73,13 @@ set, values from corresponding configuration properties are used. them, in any table of this schema. * - `default_limit` - Default number of rows in a table. +* - `sequence_detection_enabled` + - If true, when creating a table using existing data, columns with the number + of distinct values close to the number of rows are treated as sequences. 
+* - `dictionary_detection_enabled` + - If true, when creating a table using existing data, columns with a low + number of distinct values are treated as dictionaries, and get + the `allowed_values` column property populated with random values. ::: The following table details all supported table properties. If they're not set, @@ -82,6 +96,13 @@ values from corresponding schema properties are used. `null` in the table. * - `default_limit` - Default number of rows in the table. +* - `sequence_detection_enabled` + - If true, when creating a table using existing data, columns with the number + of distinct values close to the number of rows are treated as sequences. +* - `dictionary_detection_enabled` + - If true, when creating a table using existing data, columns with a low + number of distinct values are treated as dictionaries, and get + the `allowed_values` column property populated with random values. ::: The following table details all supported column properties. @@ -245,7 +266,7 @@ operation](sql-read-operations) statements to generate data. To define the schema for generating data, it supports the following features: - [](/sql/create-table) -- [](/sql/create-table-as) +- [](/sql/create-table-as), see also [](faker-statistics) - [](/sql/drop-table) - [](/sql/create-schema) - [](/sql/drop-schema) @@ -317,3 +338,44 @@ CREATE TABLE generator.default.customer ( group_id INTEGER WITH (allowed_values = ARRAY['10', '32', '81']) ); ``` + +(faker-statistics)= +### Using existing data statistics + +The Faker connector automatically sets the `default_limit` table property, +and the `min`, `max`, and `null_probability` column properties, based on statistics +collected by scanning existing data, as in the following example: + +```sql +CREATE TABLE generator.default.customer AS +SELECT * +FROM production.public.customer +WHERE created_at > CURRENT_DATE - INTERVAL '1' YEAR; +``` + +Instead of using range or other predicates, tables can be sampled; +see [](tablesample). 
+ +When the `SELECT` statement doesn't contain a `WHERE` clause, a shorter notation +can be used: + +```sql +CREATE TABLE generator.default.customer AS TABLE production.public.customer; +``` + +The Faker connector detects sequence columns, which are integer columns with the +number of distinct values almost equal to the number of rows in the table. For +such columns, Faker sets the `step` column property to 1. + +Sequence detection can be turned off using the `sequence_detection_enabled` +table or schema property, or in the connector configuration file, using the +`faker.sequence-detection-enabled` property. + +The Faker connector detects dictionary columns, which are columns of +non-character types with the number of distinct values lower than or equal to 1000. +For such columns, Faker generates a list of random values to choose from, and +saves it in the `allowed_values` column property. + +Dictionary detection can be turned off using the `dictionary_detection_enabled` +table or schema property, or in the connector configuration file, using +the `faker.dictionary-detection-enabled` property. 
diff --git a/docs/src/main/sphinx/sql/select.md b/docs/src/main/sphinx/sql/select.md index 59d96267e0f0..4318eee01b68 100644 --- a/docs/src/main/sphinx/sql/select.md +++ b/docs/src/main/sphinx/sql/select.md @@ -1038,6 +1038,7 @@ ORDER BY regionkey FETCH FIRST ROW WITH TIES; (5 rows) ``` +(tablesample)= ## TABLESAMPLE There are multiple sample methods: diff --git a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConfig.java b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConfig.java index 070a1e5753d7..06a651870ebb 100644 --- a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConfig.java +++ b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConfig.java @@ -80,7 +80,7 @@ public boolean isSequenceDetectionEnabled() @ConfigDescription( """ If true, when creating a table using existing data, columns with the number of distinct values close to - the number of rows will be treated as sequences""") + the number of rows are treated as sequences""") public FakerConfig setSequenceDetectionEnabled(boolean value) { this.sequenceDetectionEnabled = value; @@ -96,7 +96,7 @@ public boolean isDictionaryDetectionEnabled() @ConfigDescription( """ If true, when creating a table using existing data, columns with a low number of distinct values - will have the allowed_values column property populated with random values""") + are treated as dictionaries, and get the allowed_values column property populated with random values""") public FakerConfig setDictionaryDetectionEnabled(boolean value) { this.dictionaryDetectionEnabled = value; diff --git a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConnector.java b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConnector.java index 322fbfa006ea..06039bcb5d03 100644 --- a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConnector.java +++ b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConnector.java @@ -131,14 +131,14 @@ public List> 
getSchemaProperties() SchemaInfo.SEQUENCE_DETECTION_ENABLED, """ If true, when creating a table using existing data, columns with the number of distinct values close to - the number of rows will be treated as sequences""", + the number of rows are treated as sequences""", null, false), booleanProperty( SchemaInfo.DICTIONARY_DETECTION_ENABLED, """ If true, when creating a table using existing data, columns with a low number of distinct values - will have the allowed_values column property populated with random values""", + are treated as dictionaries, and get the allowed_values column property populated with random values""", null, false)); } @@ -163,14 +163,14 @@ public List> getTableProperties() TableInfo.SEQUENCE_DETECTION_ENABLED, """ If true, when creating a table using existing data, columns with the number of distinct values close to - the number of rows will be treated as sequences""", + the number of rows are treated as sequences""", null, false), booleanProperty( TableInfo.DICTIONARY_DETECTION_ENABLED, """ If true, when creating a table using existing data, columns with a low number of distinct values - will have the allowed_values column property populated with random values""", + are treated as dictionaries, and get the allowed_values column property populated with random values""", null, false)); }