diff --git a/pkg/executor/show_stats_test.go b/pkg/executor/show_stats_test.go
index 97b2946ba5991..2869d45f74310 100644
--- a/pkg/executor/show_stats_test.go
+++ b/pkg/executor/show_stats_test.go
@@ -407,7 +407,7 @@ func TestShowAnalyzeStatus(t *testing.T) {
 	require.Equal(t, "test", rows[0][0])
 	require.Equal(t, "t", rows[0][1])
 	require.Equal(t, "", rows[0][2])
-	require.Equal(t, "analyze table all columns with 256 buckets, 500 topn, 1 samplerate", rows[0][3])
+	require.Equal(t, "analyze table all columns with 256 buckets, 100 topn, 1 samplerate", rows[0][3])
 	require.Equal(t, "2", rows[0][4])
 	checkTime := func(val any) {
 		str, ok := val.(string)
diff --git a/pkg/executor/test/analyzetest/analyze_test.go b/pkg/executor/test/analyzetest/analyze_test.go
index d44ad171abddf..207ffa379b1a5 100644
--- a/pkg/executor/test/analyzetest/analyze_test.go
+++ b/pkg/executor/test/analyzetest/analyze_test.go
@@ -1916,7 +1916,7 @@ func testKillAutoAnalyze(t *testing.T, ver int) {
 	if ver == 1 {
 		jobInfo += "columns"
 	} else {
-		jobInfo += "table all columns with 256 buckets, 500 topn, 1 samplerate"
+		jobInfo += "table all columns with 256 buckets, 100 topn, 1 samplerate"
 	}
 	// kill auto analyze when it is pending/running/finished
 	for _, status := range []string{
@@ -2041,7 +2041,7 @@ func TestAnalyzeJob(t *testing.T) {
 		DBName:        "test",
 		TableName:     "t",
 		PartitionName: "",
-		JobInfo:       "table all columns with 256 buckets, 500 topn, 1 samplerate",
+		JobInfo:       "table all columns with 256 buckets, 100 topn, 1 samplerate",
 	}
 	executor.AddNewAnalyzeJob(se, job)
 	require.NotNil(t, job.ID)
@@ -2133,7 +2133,7 @@ func TestInsertAnalyzeJobWithLongInstance(t *testing.T) {
 		DBName:        "test",
 		TableName:     "t",
 		PartitionName: "",
-		JobInfo:       "table all columns with 256 buckets, 500 topn, 1 samplerate",
+		JobInfo:       "table all columns with 256 buckets, 100 topn, 1 samplerate",
 	}
 	h := dom.StatsHandle()
 	instance := "xxxtidb-tidb-0.xxxtidb-tidb-peer.xxxx-xx-1234-xxx-123456-1-321.xyz:4000"
@@ -2785,7 +2785,7 @@ func TestAnalyzeColumnsSkipMVIndexJsonCol(t *testing.T) {
 	tk.MustQuery("select job_info from mysql.analyze_jobs where table_schema = 'test' and table_name = 't'").Sort().Check(
 		testkit.Rows(
 			"analyze index idx_c",
-			"analyze table columns a, b with 256 buckets, 500 topn, 1 samplerate",
+			"analyze table columns a, b with 256 buckets, 100 topn, 1 samplerate",
 		))
 
 	is := dom.InfoSchema()
diff --git a/pkg/planner/cardinality/testdata/cardinality_suite_out.json b/pkg/planner/cardinality/testdata/cardinality_suite_out.json
index f4b64ebd7b1cd..f484751ae69b9 100644
--- a/pkg/planner/cardinality/testdata/cardinality_suite_out.json
+++ b/pkg/planner/cardinality/testdata/cardinality_suite_out.json
@@ -24,7 +24,7 @@
         {
           "Start": 800,
           "End": 900,
-          "Count": 771.504166655054
+          "Count": 755.754166655054
         },
         {
           "Start": 900,
@@ -79,7 +79,7 @@
         {
           "Start": 800,
           "End": 1000,
-          "Count": 1229.696869573942
+          "Count": 1213.946869573942
         },
         {
           "Start": 900,
@@ -104,7 +104,7 @@
         {
           "Start": 200,
           "End": 400,
-          "Count": 1226.2788209899081
+          "Count": 1215.0288209899081
         },
         {
           "Start": 200,
diff --git a/pkg/planner/core/planbuilder.go b/pkg/planner/core/planbuilder.go
index c7129d02a3351..6773481f15156 100644
--- a/pkg/planner/core/planbuilder.go
+++ b/pkg/planner/core/planbuilder.go
@@ -2696,9 +2696,11 @@ var analyzeOptionDefault = map[ast.AnalyzeOptionType]uint64{
 	ast.AnalyzeOptSampleRate:    math.Float64bits(0),
 }
 
+// TopN is reduced from 500 to 100 because of concerns over the large number of TopN values collected for customers with many tables.
+// 100 is more in line with other databases. 100-256 is also common for NumBuckets in other databases.
 var analyzeOptionDefaultV2 = map[ast.AnalyzeOptionType]uint64{
 	ast.AnalyzeOptNumBuckets:    256,
-	ast.AnalyzeOptNumTopN:       500,
+	ast.AnalyzeOptNumTopN:       100,
 	ast.AnalyzeOptCMSketchWidth: 2048,
 	ast.AnalyzeOptCMSketchDepth: 5,
 	ast.AnalyzeOptNumSamples:    0,
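
The change above only lowers the V2 default; an explicit WITH ... TOPN clause still takes precedence. The following is a minimal standalone sketch of that precedence, with hypothetical names (analyzeOpt, resolveOpts) that are not TiDB's actual API:

package main

import "fmt"

type analyzeOpt int

const (
	optNumBuckets analyzeOpt = iota
	optNumTopN
)

// defaultsV2 mirrors analyzeOptionDefaultV2 after this change.
var defaultsV2 = map[analyzeOpt]uint64{
	optNumBuckets: 256,
	optNumTopN:    100, // was 500
}

// resolveOpts fills in defaults, then lets explicit WITH ... options win.
func resolveOpts(explicit map[analyzeOpt]uint64) map[analyzeOpt]uint64 {
	out := make(map[analyzeOpt]uint64, len(defaultsV2))
	for k, v := range defaultsV2 {
		out[k] = v
	}
	for k, v := range explicit {
		out[k] = v
	}
	return out
}

func main() {
	// ANALYZE TABLE t -> 100 topn under the new default.
	fmt.Println(resolveOpts(nil)[optNumTopN]) // 100
	// ANALYZE TABLE t WITH 500 TOPN -> the user's value is honored.
	fmt.Println(resolveOpts(map[analyzeOpt]uint64{optNumTopN: 500})[optNumTopN]) // 500
}
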
diff --git a/pkg/statistics/builder.go b/pkg/statistics/builder.go
index c71247f9bb078..6899169eb40e9 100644
--- a/pkg/statistics/builder.go
+++ b/pkg/statistics/builder.go
@@ -296,6 +296,11 @@ func BuildHistAndTopN(
 	sampleNum := int64(len(samples))
 	// As we use samples to build the histogram, the bucket number and repeat should multiply a factor.
 	sampleFactor := float64(count) / float64(len(samples))
+	// If a numTopN value other than 100 is passed in, we assume it's a value that the user wants us to honor.
+	allowPruning := true
+	if numTopN != 100 {
+		allowPruning = false
+	}
 
 	// Step1: collect topn from samples
 
@@ -326,18 +331,23 @@ func BuildHistAndTopN(
 			continue
 		}
 		// case 2, meet a different value: counting for the "current" is complete
-		// case 2-1, now topn is empty: append the "current" count directly
+		// case 2-1, do not add a count of 1 if we're sampling
+		if curCnt == 1 && sampleFactor > 1 && allowPruning {
+			cur, curCnt = sampleBytes, 1
+			continue
+		}
+		// case 2-2, now topn is empty: append the "current" count directly
 		if len(topNList) == 0 {
 			topNList = append(topNList, TopNMeta{Encoded: cur, Count: uint64(curCnt)})
 			cur, curCnt = sampleBytes, 1
 			continue
 		}
-		// case 2-2, now topn is full, and the "current" count is less than the least count in the topn: no need to insert the "current"
+		// case 2-3, now topn is full, and the "current" count is less than the least count in the topn: no need to insert the "current"
 		if len(topNList) >= numTopN && uint64(curCnt) <= topNList[len(topNList)-1].Count {
 			cur, curCnt = sampleBytes, 1
 			continue
 		}
-		// case 2-3, now topn is not full, or the "current" count is larger than the least count in the topn: need to find a slot to insert the "current"
+		// case 2-4, now topn is not full, or the "current" count is larger than the least count in the topn: need to find a slot to insert the "current"
 		j := len(topNList)
 		for ; j > 0; j-- {
 			if uint64(curCnt) < topNList[j-1].Count {
@@ -358,9 +368,10 @@ func BuildHistAndTopN(
 		hg.Correlation = calcCorrelation(sampleNum, corrXYSum)
 	}
 
-	// Handle the counting for the last value. Basically equal to the case 2 above.
-	// now topn is empty: append the "current" count directly
-	if numTopN != 0 {
+	// Handle the counting for the last value. Basically equal to case 2 above, including
+	// not adding a value with a count of 1 when sampling (since it would be pruned anyway).
+	if numTopN != 0 && (!allowPruning || (allowPruning && (sampleFactor <= 1 || curCnt > 1))) {
+		// now topn is empty: append the "current" count directly
 		if len(topNList) == 0 {
 			topNList = append(topNList, TopNMeta{Encoded: cur, Count: uint64(curCnt)})
 		} else if len(topNList) < numTopN || uint64(curCnt) > topNList[len(topNList)-1].Count {
@@ -380,7 +391,9 @@ func BuildHistAndTopN(
 		}
 	}
 
-	topNList = pruneTopNItem(topNList, ndv, nullCount, sampleNum, count)
+	if allowPruning {
+		topNList = pruneTopNItem(topNList, ndv, nullCount, sampleNum, count)
+	}
 
 	// Step2: exclude topn from samples
 	if numTopN != 0 {
@@ -435,7 +448,7 @@ func BuildHistAndTopN(
 	topn.Scale(sampleFactor)
 
 	if uint64(count) <= topn.TotalCount() || int(hg.NDV) <= len(topn.TopN) {
-		// TopN includes all sample data
+		// If we've collected everything, don't create any buckets
 		return hg, topn, nil
 	}
 
@@ -454,8 +467,7 @@ func BuildHistAndTopN(
 //
 // We assume that the ones not in the top-n list's selectivity is 1/remained_ndv which is the internal implementation of EqualRowCount
 func pruneTopNItem(topns []TopNMeta, ndv, nullCount, sampleRows, totalRows int64) []TopNMeta {
-	// If the sampleRows holds all rows, or NDV of samples equals to actual NDV, we just return the TopN directly.
-	if sampleRows == totalRows || totalRows <= 1 || int64(len(topns)) >= ndv || len(topns) == 0 {
+	if totalRows <= 1 || int64(len(topns)) >= ndv || len(topns) <= 1 {
 		return topns
 	}
 	// Sum the occurrence except the least common one from the top-n list. To check whether the lest common one is worth
diff --git a/pkg/statistics/handle/autoanalyze/autoanalyze_test.go b/pkg/statistics/handle/autoanalyze/autoanalyze_test.go
index 2acfe70d5de11..86e24739416d6 100644
--- a/pkg/statistics/handle/autoanalyze/autoanalyze_test.go
+++ b/pkg/statistics/handle/autoanalyze/autoanalyze_test.go
@@ -315,7 +315,7 @@ func TestAutoAnalyzeSkipColumnTypes(t *testing.T) {
 		exec.AutoAnalyzeMinCnt = originalVal
 	}()
 	require.True(t, h.HandleAutoAnalyze())
-	tk.MustQuery("select job_info from mysql.analyze_jobs where job_info like '%auto analyze table%'").Check(testkit.Rows("auto analyze table columns a, b, d with 256 buckets, 500 topn, 1 samplerate"))
+	tk.MustQuery("select job_info from mysql.analyze_jobs where job_info like '%auto analyze table%'").Check(testkit.Rows("auto analyze table columns a, b, d with 256 buckets, 100 topn, 1 samplerate"))
 }
 
 func TestAutoAnalyzeOnEmptyTable(t *testing.T) {
diff --git a/pkg/statistics/handle/globalstats/global_stats_internal_test.go b/pkg/statistics/handle/globalstats/global_stats_internal_test.go
index fb675c9d2b602..ba9775fd51678 100644
--- a/pkg/statistics/handle/globalstats/global_stats_internal_test.go
+++ b/pkg/statistics/handle/globalstats/global_stats_internal_test.go
@@ -351,6 +351,7 @@ func testIssues24349(testKit *testkit.TestKit) {
 	testKit.MustExec("create table t (a int, b int) partition by hash(a) partitions 3")
 	testKit.MustExec("insert into t values (0, 3), (0, 3), (0, 3), (0, 2), (1, 1), (1, 2), (1, 2), (1, 2), (1, 3), (1, 4), (2, 1), (2, 1)")
 	testKit.MustExec("analyze table t with 1 topn, 3 buckets")
+	testKit.MustExec("explain select * from t where a > 0 and b > 0")
 	testKit.MustQuery("show stats_buckets where partition_name='global'").Check(testkit.Rows(
 		"test t global a 0 0 2 2 0 2 0",
 		"test t global b 0 0 3 1 1 2 0",
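
The builder changes above boil down to two gates: pruning only runs when numTopN is the 100 default (any other value is taken as a user choice), and while sampling, a value seen exactly once is never promoted into the TopN because pruneTopNItem would discard it anyway. A minimal sketch of that decision logic, with illustrative helper names that do not appear in the patch:

package main

import "fmt"

// allowPruning mirrors the check at the top of BuildHistAndTopN: any numTopN
// other than the 100 default is treated as a value the user wants honored.
func allowPruning(numTopN int) bool {
	return numTopN == 100
}

// skipSingleton mirrors case 2-1: when rows were sampled (sampleFactor > 1)
// and pruning is allowed, a value with a sampled count of 1 is dropped early.
func skipSingleton(curCnt int64, sampleFactor float64, pruning bool) bool {
	return pruning && curCnt == 1 && sampleFactor > 1
}

func main() {
	p := allowPruning(100)
	fmt.Println(skipSingleton(1, 2.5, p))                 // true: sampled, count 1
	fmt.Println(skipSingleton(1, 1.0, p))                 // false: full scan keeps singletons
	fmt.Println(skipSingleton(5, 2.5, p))                 // false: count > 1 is kept
	fmt.Println(skipSingleton(1, 2.5, allowPruning(500))) // false: explicit topn disables pruning
}
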
diff --git a/pkg/statistics/statistics_test.go b/pkg/statistics/statistics_test.go
index 7655bf38131a6..0452d01e027c6 100644
--- a/pkg/statistics/statistics_test.go
+++ b/pkg/statistics/statistics_test.go
@@ -501,7 +501,7 @@ func SubTestBuild() func(*testing.T) {
 	return func(t *testing.T) {
 		s := createTestStatisticsSamples(t)
 		bucketCount := int64(256)
-		topNCount := 20
+		topNCount := 100
 		ctx := mock.NewContext()
 		sc := ctx.GetSessionVars().StmtCtx
 		sketch, _, err := buildFMSketch(sc, s.rc.(*recordSet).data, 1000)
@@ -650,7 +650,7 @@ func TestPruneTopN(t *testing.T) {
 	var totalNDV, nullCnt, sampleRows, totalRows int64
 
 	// case 1
-	topnIn = []TopNMeta{{[]byte{1}, 100_000}, {[]byte{2}, 10}}
+	topnIn = []TopNMeta{{[]byte{1}, 100_000}}
 	totalNDV = 2
 	nullCnt = 0
 	sampleRows = 100_010
@@ -674,8 +674,8 @@ func TestPruneTopN(t *testing.T) {
 
 	// case 3
 	topnIn = nil
-	for i := 0; i < 100; i++ {
-		topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 1_000})
+	for i := 0; i < 10; i++ {
+		topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 10_000})
 	}
 	totalNDV = 100
 	nullCnt = 0
@@ -683,4 +683,32 @@ func TestPruneTopN(t *testing.T) {
 	totalRows = 10_000_000
 	topnOut = pruneTopNItem(topnIn, totalNDV, nullCnt, sampleRows, totalRows)
 	require.Equal(t, topnIn, topnOut)
+
+	// case 4 - test TopN pruning for small table
+	topnIn = []TopNMeta{
+		{[]byte{1}, 3_000},
+		{[]byte{2}, 3_000},
+	}
+	totalNDV = 4002
+	nullCnt = 0
+	sampleRows = 10_000
+	totalRows = 10_000
+	topnOut = pruneTopNItem(topnIn, totalNDV, nullCnt, sampleRows, totalRows)
+	require.Equal(t, topnIn, topnOut)
+
+	// case 5 - test pruning of items with count=1
+	topnIn = nil
+	for i := 0; i < 10; i++ {
+		topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 90})
+	}
+	topnPruned := topnIn
+	for i := 90; i < 150; i++ {
+		topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 1})
+	}
+	totalNDV = 150
+	nullCnt = 0
+	sampleRows = 1500
+	totalRows = 1500
+	topnOut = pruneTopNItem(topnIn, totalNDV, nullCnt, sampleRows, totalRows)
+	require.Equal(t, topnPruned, topnOut)
 }
diff --git a/tests/integrationtest/r/executor/analyze.result b/tests/integrationtest/r/executor/analyze.result
index 3762068660eb4..3b0e599983b59 100644
--- a/tests/integrationtest/r/executor/analyze.result
+++ b/tests/integrationtest/r/executor/analyze.result
@@ -824,12 +824,12 @@ delete from mysql.analyze_jobs;
 analyze table t;
 select job_info from mysql.analyze_jobs where job_info like '%analyze table%';
 job_info
-analyze table columns a, b, d with 256 buckets, 500 topn, 1 samplerate
+analyze table columns a, b, d with 256 buckets, 100 topn, 1 samplerate
 delete from mysql.analyze_jobs;
 analyze table t columns a, e;
 select job_info from mysql.analyze_jobs where job_info like '%analyze table%';
 job_info
-analyze table columns a, d with 256 buckets, 500 topn, 1 samplerate
+analyze table columns a, d with 256 buckets, 100 topn, 1 samplerate
 set @@session.tidb_analyze_skip_column_types = default;
 DROP TABLE IF EXISTS Issue34228;
 CREATE TABLE Issue34228 (id bigint NOT NULL, dt datetime NOT NULL) PARTITION BY RANGE COLUMNS(dt) (PARTITION p202201 VALUES LESS THAN ("2022-02-01"), PARTITION p202202 VALUES LESS THAN ("2022-03-01"));
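
End to end, the patch's expected behavior is what the updated tests assert: a plain ANALYZE now records 100 topn in mysql.analyze_jobs. A testkit-style sketch of that check (illustrative, not part of the patch; the exact job_info string depends on the analyze version and session defaults):

package analyzetest

import (
	"testing"

	"github.com/pingcap/tidb/pkg/testkit"
)

// TestDefaultTopNInJobInfo sketches how the new default surfaces to users.
func TestDefaultTopNInJobInfo(t *testing.T) {
	store := testkit.CreateMockStore(t)
	tk := testkit.NewTestKit(t, store)
	tk.MustExec("use test")
	tk.MustExec("create table t (a int, b int)")
	tk.MustExec("analyze table t")
	tk.MustQuery("select job_info from mysql.analyze_jobs where table_name = 't'").Check(
		testkit.Rows("analyze table all columns with 256 buckets, 100 topn, 1 samplerate"))
}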