From bd63938b62511ab2a957f56c69dbb2d03bb24ec5 Mon Sep 17 00:00:00 2001 From: Uzair Ahmad Date: Fri, 28 Jun 2024 10:34:22 -0400 Subject: [PATCH] sql: ignore outer buckets when getting partial statistic extreme bounds Full statistic collections sometimes invoke `addOuterBuckets()`, which adds buckets with column-type max and min upper bounds to the histogram. Previously, we used the first (non-null) and last bucket as the "less than" and "greater than" bounds for partial statistics collections using extremes. This resulted in an incorrect predicate when outer buckets existed in the most recent full statistic. This commit fixes the issue by ignoring outer buckets when determining bounds. Fixes: #93094 See also: #125950 Release note (bug fix): Fixed a bug when creating partial statistics using extremes (which is disabled by default) where it would occasionally use incorrect extreme values and collect no stats. This occurred when outer buckets were added to the previous histogram to account for extra distinct count. 
--- .../testdata/logic_test/distsql_stats | 174 +++++++++++++++++- pkg/sql/stats/bounds/extremes.go | 16 +- 2 files changed, 186 insertions(+), 4 deletions(-) diff --git a/pkg/sql/logictest/testdata/logic_test/distsql_stats b/pkg/sql/logictest/testdata/logic_test/distsql_stats index ea5e4aac7ad2..9eb5a3b44f5f 100644 --- a/pkg/sql/logictest/testdata/logic_test/distsql_stats +++ b/pkg/sql/logictest/testdata/logic_test/distsql_stats @@ -2559,7 +2559,7 @@ SHOW STATISTICS USING JSON FOR TABLE only_null; statement ok ALTER TABLE only_null INJECT STATISTICS '$only_null_stat'; -statement error pq: only NULL values exist in the index, so partial stats cannot be collected +statement error pq: only outer or NULL bounded buckets exist in the index, so partial stats cannot be collected CREATE STATISTICS only_null_partial ON a FROM only_null USING EXTREMES; statement ok @@ -2890,6 +2890,178 @@ SHOW HISTOGRAM $hist_crdb_internal_idx_expr upper_bound range_rows distinct_range_rows equal_rows '{"bar": {"baz": 5}}' 0 0 1 +# Verify that the correct partial predicate is used for partial stats using +# extremes when outer buckets exist (int column type). +statement ok +CREATE TABLE int_outer_buckets (a PRIMARY KEY) AS SELECT generate_series(0, 9999); + +statement ok +CREATE STATISTICS int_outer_buckets_full ON a FROM int_outer_buckets; + +let $hist_id_int_outer_buckets_full +SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE int_outer_buckets] WHERE statistics_name = 'int_outer_buckets_full' + +# The full stats collection should have added 2 outer buckets for a total of 202 +# with upper bounds of MaxInt64 and MinInt64. 
+query I +SELECT count(*) FROM [SHOW HISTOGRAM $hist_id_int_outer_buckets_full] +---- +202 + +statement ok +INSERT INTO int_outer_buckets SELECT generate_series(-10, -1) UNION ALL SELECT generate_series(10000, 10009); + +statement ok +CREATE STATISTICS int_outer_buckets_partial ON a FROM int_outer_buckets USING EXTREMES; + +# The partial stat predicate should not include MaxInt64 and MinInt64 from the +# outer buckets and should count 20 rows beyond the extremes. +query TTII colnames +SELECT "statistics_name", "partial_predicate", "row_count", "null_count" +FROM [SHOW STATISTICS FOR TABLE int_outer_buckets] +WHERE statistics_name = 'int_outer_buckets_partial' +---- +statistics_name partial_predicate row_count null_count +int_outer_buckets_partial (a IS NULL) OR ((a < 0:::INT8) OR (a > 9999:::INT8)) 20 0 + +# Verify that we don't ignore buckets with actual max and min values when +# creating partial stats using extremes. +statement ok +INSERT INTO int_outer_buckets VALUES (-9223372036854775808), (9223372036854775807); + +statement ok +SET CLUSTER SETTING sql.stats.histogram_samples.count = 10050; + +statement ok +CREATE STATISTICS int_outer_buckets_full ON a FROM int_outer_buckets; + +statement ok +CREATE STATISTICS int_outer_buckets_partial ON a FROM int_outer_buckets USING EXTREMES; + +# The partial stat predicate should include MaxInt64 and MinInt64 and should +# count no rows beyond the extremes. +query TTII colnames +SELECT "statistics_name", "partial_predicate", "row_count", "null_count" +FROM [SHOW STATISTICS FOR TABLE int_outer_buckets] +WHERE statistics_name = 'int_outer_buckets_partial' +---- +statistics_name partial_predicate row_count null_count +int_outer_buckets_partial (a IS NULL) OR ((a < (-9223372036854775808):::INT8) OR (a > 9223372036854775807:::INT8)) 0 0 + +# Verify that the correct partial predicate is used for partial stats using +# extremes when outer buckets exist (timestamp column type). 
+statement ok +CREATE TABLE timestamp_outer_buckets (a TIMESTAMP PRIMARY KEY); + +statement ok +INSERT INTO timestamp_outer_buckets VALUES + ('2024-06-26 01:00:00'), + ('2024-06-26 02:00:00'), + ('2024-06-27 01:30:00'), + ('2024-06-27 02:30:00'); + +statement ok +CREATE STATISTICS timestamp_outer_buckets_full ON a FROM timestamp_outer_buckets; + +let $hist_id_timestamp_outer_buckets_full +SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE timestamp_outer_buckets] WHERE statistics_name = 'timestamp_outer_buckets_full' + +# The full stats collection should not have added outer buckets. +query I +SELECT count(*) FROM [SHOW HISTOGRAM $hist_id_timestamp_outer_buckets_full] +---- +4 + +statement ok +INSERT INTO timestamp_outer_buckets VALUES + ('2024-06-26 00:00:00'), + ('2024-06-27 03:30:00'); + +statement ok +CREATE STATISTICS timestamp_outer_buckets_partial ON a FROM timestamp_outer_buckets USING EXTREMES; + +# The partial stat should not ignore any buckets and have the correct predicate. +query TTII colnames +SELECT "statistics_name", "partial_predicate", "row_count", "null_count" +FROM [SHOW STATISTICS FOR TABLE timestamp_outer_buckets] +WHERE statistics_name = 'timestamp_outer_buckets_partial' +---- +statistics_name partial_predicate row_count null_count +timestamp_outer_buckets_partial (a IS NULL) OR ((a < '2024-06-26 01:00:00':::TIMESTAMP) OR (a > '2024-06-27 02:30:00':::TIMESTAMP)) 2 0 + +# Inject a full statistic with outer buckets, overriding the previous stats. 
+statement ok +ALTER TABLE timestamp_outer_buckets INJECT STATISTICS '[ + { + "avg_size": 7, + "columns": [ + "a" + ], + "created_at": "2024-06-27 19:00:16.450303", + "distinct_count": 4, + "histo_buckets": [ + { + "distinct_range": 0.000001, + "num_eq": 0, + "num_range": 0, + "upper_bound": "4714-11-24 00:00:00 BC" + }, + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "2024-06-26 01:00:00" + }, + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "2024-06-26 02:00:00" + }, + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "2024-06-27 01:30:00" + }, + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "2024-06-27 02:30:00" + }, + { + "distinct_range": 0.000001, + "num_eq": 0, + "num_range": 0, + "upper_bound": "294276-12-31 23:59:59.999999" + } + ], + "histo_col_type": "TIMESTAMP", + "histo_version": 3, + "name": "timestamp_outer_buckets_full", + "null_count": 0, + "row_count": 4 + } +]' + +statement ok +INSERT INTO timestamp_outer_buckets VALUES ('2024-06-28 01:00:00'); + +statement ok +CREATE STATISTICS timestamp_outer_buckets_partial ON a FROM timestamp_outer_buckets USING EXTREMES; + +# The partial stat predicate should not include MaxSupportedTime and +# MinSupportedTime from the outer buckets and should count 3 rows beyond the +# extremes. 
+query TTII colnames +SELECT "statistics_name", "partial_predicate", "row_count", "null_count" +FROM [SHOW STATISTICS FOR TABLE timestamp_outer_buckets] +WHERE statistics_name = 'timestamp_outer_buckets_partial' +---- +statistics_name partial_predicate row_count null_count +timestamp_outer_buckets_partial (a IS NULL) OR ((a < '2024-06-26 01:00:00':::TIMESTAMP) OR (a > '2024-06-27 02:30:00':::TIMESTAMP)) 3 0 + statement ok RESET enable_create_stats_using_extremes diff --git a/pkg/sql/stats/bounds/extremes.go b/pkg/sql/stats/bounds/extremes.go index 2b5cd12021de..4d3421b477a3 100644 --- a/pkg/sql/stats/bounds/extremes.go +++ b/pkg/sql/stats/bounds/extremes.go @@ -88,15 +88,25 @@ func ConstructUsingExtremesSpans( func GetUsingExtremesBounds( ctx context.Context, evalCtx *eval.Context, histogram []cat.HistogramBucket, ) (lowerBound tree.Datum, upperBound tree.Datum, _ error) { + // Full stats collections sometimes add buckets with column type max/min upper + // bounds above and below the observed max and min values to account for extra + // distinct count (see addOuterBuckets()) and should be ignored. 
+ isOuterBucket := func(bucket *cat.HistogramBucket) bool { + return (bucket.UpperBound.IsMin(ctx, evalCtx) || bucket.UpperBound.IsMax(ctx, evalCtx)) && bucket.NumEq == 0 + } upperBound = histogram[len(histogram)-1].UpperBound - // Pick the earliest lowerBound that is not null, + if len(histogram) > 1 && isOuterBucket(&histogram[len(histogram)-1]) { + upperBound = histogram[len(histogram)-2].UpperBound + } + + // Pick the earliest lowerBound that is not null and isn't an outer bucket, // but if none exist, return error for i := range histogram { hist := &histogram[i] if cmp, err := hist.UpperBound.Compare(ctx, evalCtx, tree.DNull); err != nil { return lowerBound, nil, err - } else if cmp != 0 { + } else if cmp != 0 && !isOuterBucket(hist) { lowerBound = hist.UpperBound break } @@ -105,7 +115,7 @@ func GetUsingExtremesBounds( return lowerBound, nil, pgerror.Newf( pgcode.ObjectNotInPrerequisiteState, - "only NULL values exist in the index, so partial stats cannot be collected") + "only outer or NULL bounded buckets exist in the index, so partial stats cannot be collected") } return lowerBound, upperBound, nil }