planner: reduce topn count to exclude non-skewed values | tidb-test=pr/2333 #53035

Merged: 64 commits, Jun 19, 2024
Commits
90bbf9e
review testing 23
terry1purcell May 22, 2024
4de3125
minor reset
terry1purcell May 17, 2024
26ffc4a
testcase1
terry1purcell May 18, 2024
ecbf8cf
testcase2
terry1purcell May 18, 2024
69691df
testcase3
terry1purcell May 18, 2024
92dfddb
testcase4
terry1purcell May 18, 2024
92a2796
testcase5
terry1purcell May 19, 2024
413e63a
testcase6
terry1purcell May 19, 2024
1b343ce
testcase7
terry1purcell May 19, 2024
abdf128
testcase8
terry1purcell May 19, 2024
91ea0cd
docs: update README.md (#53373)
DanRoscigno May 20, 2024
668b5e0
testcase9
terry1purcell May 20, 2024
557eee0
testcase10
terry1purcell May 20, 2024
6b273a8
testcase11
terry1purcell May 20, 2024
84b10bd
testcase12
terry1purcell May 20, 2024
1e338cb
testcase13
terry1purcell May 20, 2024
5961402
testcase14
terry1purcell May 20, 2024
79782f2
testcase15
terry1purcell May 21, 2024
016ff85
testcase16
terry1purcell May 21, 2024
c2a0d67
testcase17
terry1purcell May 21, 2024
44a00e1
testcase18
terry1purcell May 21, 2024
3475281
testcase19
terry1purcell May 21, 2024
fdf2414
review testing 20
terry1purcell May 21, 2024
43e3384
review testing 21
terry1purcell May 21, 2024
978f9c2
review testing 22
terry1purcell May 21, 2024
917ac20
review testing 25
terry1purcell May 23, 2024
a52e75c
Delete pkg/executor/testkit.go
terry1purcell May 23, 2024
e23d61c
review testing 27
terry1purcell May 23, 2024
26f3c42
review testing 28
terry1purcell May 23, 2024
28132ce
review testing 30
terry1purcell May 23, 2024
b6d0aea
review reset
terry1purcell May 23, 2024
046b892
review new1
terry1purcell May 23, 2024
c3984d9
Merge branch 'pingcap:master' into statstopn
terry1purcell May 28, 2024
69f429f
Merge branch 'pingcap:master' into statstopn
terry1purcell May 28, 2024
97b276f
Merge branch 'pingcap:master' into statstopn
terry1purcell May 29, 2024
8c21e79
Merge branch 'pingcap:master' into statstopn
terry1purcell May 30, 2024
1fa7890
Delete README.md
terry1purcell May 30, 2024
2562d26
review may 30
terry1purcell May 30, 2024
74b7d4c
review may 30 test2
terry1purcell May 30, 2024
132878c
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 1, 2024
6bde95a
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 2, 2024
03f2a0a
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 3, 2024
e1ac079
review comments jun3
terry1purcell Jun 3, 2024
d4715c2
review comments jun3 test1
terry1purcell Jun 3, 2024
f8fa5fd
review comments jun3 add file back
terry1purcell Jun 3, 2024
8cdfe7e
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 4, 2024
48e21c5
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 5, 2024
fdde2fb
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 6, 2024
eebb721
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 7, 2024
7a9961e
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 8, 2024
c54b9e1
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 11, 2024
0ffba90
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 11, 2024
22073bd
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 12, 2024
f1f55f9
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 13, 2024
80c2f41
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 13, 2024
dbff992
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 13, 2024
3bc1ee8
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 14, 2024
0d1597b
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 14, 2024
166ad73
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 14, 2024
09def84
june 17 merge with other updates
terry1purcell Jun 17, 2024
bc7615d
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 18, 2024
00a62ce
june 18 review comments
terry1purcell Jun 18, 2024
2cbcab8
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 18, 2024
bf7918d
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 19, 2024
2 changes: 1 addition & 1 deletion pkg/executor/show_stats_test.go
@@ -407,7 +407,7 @@ func TestShowAnalyzeStatus(t *testing.T) {
require.Equal(t, "test", rows[0][0])
require.Equal(t, "t", rows[0][1])
require.Equal(t, "", rows[0][2])
require.Equal(t, "analyze table all columns with 256 buckets, 500 topn, 1 samplerate", rows[0][3])
require.Equal(t, "analyze table all columns with 256 buckets, 100 topn, 1 samplerate", rows[0][3])
require.Equal(t, "2", rows[0][4])
checkTime := func(val any) {
str, ok := val.(string)
8 changes: 4 additions & 4 deletions pkg/executor/test/analyzetest/analyze_test.go
@@ -1916,7 +1916,7 @@ func testKillAutoAnalyze(t *testing.T, ver int) {
if ver == 1 {
jobInfo += "columns"
} else {
jobInfo += "table all columns with 256 buckets, 500 topn, 1 samplerate"
jobInfo += "table all columns with 256 buckets, 100 topn, 1 samplerate"
}
// kill auto analyze when it is pending/running/finished
for _, status := range []string{
@@ -2041,7 +2041,7 @@ func TestAnalyzeJob(t *testing.T) {
DBName: "test",
TableName: "t",
PartitionName: "",
JobInfo: "table all columns with 256 buckets, 500 topn, 1 samplerate",
JobInfo: "table all columns with 256 buckets, 100 topn, 1 samplerate",
}
executor.AddNewAnalyzeJob(se, job)
require.NotNil(t, job.ID)
@@ -2133,7 +2133,7 @@ func TestInsertAnalyzeJobWithLongInstance(t *testing.T) {
DBName: "test",
TableName: "t",
PartitionName: "",
JobInfo: "table all columns with 256 buckets, 500 topn, 1 samplerate",
JobInfo: "table all columns with 256 buckets, 100 topn, 1 samplerate",
}
h := dom.StatsHandle()
instance := "xxxtidb-tidb-0.xxxtidb-tidb-peer.xxxx-xx-1234-xxx-123456-1-321.xyz:4000"
@@ -2785,7 +2785,7 @@ func TestAnalyzeColumnsSkipMVIndexJsonCol(t *testing.T) {
tk.MustQuery("select job_info from mysql.analyze_jobs where table_schema = 'test' and table_name = 't'").Sort().Check(
testkit.Rows(
"analyze index idx_c",
"analyze table columns a, b with 256 buckets, 500 topn, 1 samplerate",
"analyze table columns a, b with 256 buckets, 100 topn, 1 samplerate",
))

is := dom.InfoSchema()
6 changes: 3 additions & 3 deletions pkg/planner/cardinality/testdata/cardinality_suite_out.json
@@ -24,7 +24,7 @@
{
"Start": 800,
"End": 900,
"Count": 771.504166655054
"Count": 755.754166655054
},
{
"Start": 900,
@@ -79,7 +79,7 @@
{
"Start": 800,
"End": 1000,
"Count": 1229.696869573942
"Count": 1213.946869573942
},
{
"Start": 900,
@@ -104,7 +104,7 @@
{
"Start": 200,
"End": 400,
"Count": 1226.2788209899081
"Count": 1215.0288209899081
},
{
"Start": 200,
4 changes: 3 additions & 1 deletion pkg/planner/core/planbuilder.go
@@ -2696,9 +2696,11 @@ var analyzeOptionDefault = map[ast.AnalyzeOptionType]uint64{
ast.AnalyzeOptSampleRate: math.Float64bits(0),
}

// TopN reduced from 500 to 100 due to concerns over the large number of TopN values collected for customers with many tables.
// 100 is more in line with other databases. 100-256 is also common for NumBuckets in other databases.
var analyzeOptionDefaultV2 = map[ast.AnalyzeOptionType]uint64{
ast.AnalyzeOptNumBuckets: 256,
ast.AnalyzeOptNumTopN: 500,
ast.AnalyzeOptNumTopN: 100,
ast.AnalyzeOptCMSketchWidth: 2048,
ast.AnalyzeOptCMSketchDepth: 5,
ast.AnalyzeOptNumSamples: 0,
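For quick reference, the sketch below restates the V2 analyze defaults as they stand after this change. It is illustrative only: the keys are plain strings rather than ast.AnalyzeOptionType constants, and options not shown in the hunk above are omitted. A per-statement option such as `ANALYZE TABLE t WITH 500 TOPN` still overrides the default, which is what the builder change below relies on.

```go
// Illustrative restatement of the V2 analyze-option defaults after this PR.
// Keys are simplified strings; this is not the TiDB source map.
package sketch

var analyzeOptionDefaultV2Sketch = map[string]uint64{
	"NumBuckets":    256,
	"NumTopN":       100, // lowered from 500 in this PR
	"CMSketchWidth": 2048,
	"CMSketchDepth": 5,
	"NumSamples":    0,
	// sample-rate default omitted; see planbuilder.go for the full map
}
```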
32 changes: 22 additions & 10 deletions pkg/statistics/builder.go
@@ -296,6 +296,11 @@ func BuildHistAndTopN(
sampleNum := int64(len(samples))
// As we use samples to build the histogram, the bucket number and repeat should multiply a factor.
sampleFactor := float64(count) / float64(len(samples))
// If a numTopN value other than 100 is passed in, we assume it's a value that the user wants us to honor.
allowPruning := true
if numTopN != 100 {
allowPruning = false
}

// Step1: collect topn from samples

@@ -326,18 +331,23 @@
continue
}
// case 2, meet a different value: counting for the "current" is complete
// case 2-1, now topn is empty: append the "current" count directly
// case 2-1, do not add a count of 1 if we're sampling
if curCnt == 1 && sampleFactor > 1 && allowPruning {
cur, curCnt = sampleBytes, 1
continue
}
// case 2-2, now topn is empty: append the "current" count directly
if len(topNList) == 0 {
topNList = append(topNList, TopNMeta{Encoded: cur, Count: uint64(curCnt)})
cur, curCnt = sampleBytes, 1
continue
}
// case 2-2, now topn is full, and the "current" count is less than the least count in the topn: no need to insert the "current"
// case 2-3, now topn is full, and the "current" count is less than the least count in the topn: no need to insert the "current"
if len(topNList) >= numTopN && uint64(curCnt) <= topNList[len(topNList)-1].Count {
cur, curCnt = sampleBytes, 1
continue
}
// case 2-3, now topn is not full, or the "current" count is larger than the least count in the topn: need to find a slot to insert the "current"
// case 2-4, now topn is not full, or the "current" count is larger than the least count in the topn: need to find a slot to insert the "current"
j := len(topNList)
for ; j > 0; j-- {
if uint64(curCnt) < topNList[j-1].Count {
@@ -358,9 +368,10 @@
hg.Correlation = calcCorrelation(sampleNum, corrXYSum)
}

// Handle the counting for the last value. Basically equal to the case 2 above.
// now topn is empty: append the "current" count directly
if numTopN != 0 {
// Handle the counting for the last value. Basically equal to the case 2 above - including
// limiting addition of a value with a count of 1 (since it will be pruned anyway).
if numTopN != 0 && (!allowPruning || (allowPruning && (sampleFactor <= 1 || curCnt > 1))) {
// now topn is empty: append the "current" count directly
if len(topNList) == 0 {
topNList = append(topNList, TopNMeta{Encoded: cur, Count: uint64(curCnt)})
} else if len(topNList) < numTopN || uint64(curCnt) > topNList[len(topNList)-1].Count {
@@ -380,7 +391,9 @@
}
}

topNList = pruneTopNItem(topNList, ndv, nullCount, sampleNum, count)
if allowPruning {
topNList = pruneTopNItem(topNList, ndv, nullCount, sampleNum, count)
}

// Step2: exclude topn from samples
if numTopN != 0 {
@@ -435,7 +448,7 @@
topn.Scale(sampleFactor)

if uint64(count) <= topn.TotalCount() || int(hg.NDV) <= len(topn.TopN) {
// TopN includes all sample data
// If we've collected everything - don't create any buckets
return hg, topn, nil
}

@@ -454,8 +467,7 @@
//
// We assume that the ones not in the top-n list's selectivity is 1/remained_ndv which is the internal implementation of EqualRowCount
func pruneTopNItem(topns []TopNMeta, ndv, nullCount, sampleRows, totalRows int64) []TopNMeta {
// If the sampleRows holds all rows, or NDV of samples equals to actual NDV, we just return the TopN directly.
if sampleRows == totalRows || totalRows <= 1 || int64(len(topns)) >= ndv || len(topns) == 0 {
if totalRows <= 1 || int64(len(topns)) >= ndv || len(topns) <= 1 {
return topns
}
// Sum the occurrence except the least common one from the top-n list. To check whether the lest common one is worth
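The heart of the builder change is that, when the default TopN of 100 is in effect (allowPruning) and the data is sampled (sampleFactor > 1), a value seen exactly once in the sample is never promoted into the TopN, and the resulting list is still passed through pruneTopNItem. The sketch below shows only that skip-count-1 idea in a self-contained form; topNMeta and collectTopN are simplified illustrative names, not TiDB APIs, and none of the histogram bookkeeping is included.

```go
// Illustrative sketch only: skip single-occurrence sampled values when the
// default TopN (100) is in effect, mirroring the allowPruning logic above.
package main

import "fmt"

type topNMeta struct {
	encoded string
	count   uint64
}

// collectTopN walks sorted sample values, counts runs of equal values, and
// keeps at most numTopN entries. When pruning is allowed (default TopN) and
// the data is sampled, a run of length 1 carries no evidence of skew and is
// skipped instead of being added to the list.
func collectTopN(sorted []string, numTopN int, sampleFactor float64) []topNMeta {
	allowPruning := numTopN == 100 // any other value is treated as user-specified
	var out []topNMeta
	for i := 0; i < len(sorted); {
		j := i
		for j < len(sorted) && sorted[j] == sorted[i] {
			j++
		}
		cnt := uint64(j - i)
		skip := allowPruning && sampleFactor > 1 && cnt == 1
		if !skip && len(out) < numTopN {
			out = append(out, topNMeta{encoded: sorted[i], count: cnt})
		}
		i = j
	}
	return out
}

func main() {
	samples := []string{"a", "a", "a", "b", "c"}
	// With sampling in effect, only "a" is kept; "b" and "c" occur once and are skipped.
	fmt.Println(collectTopN(samples, 100, 10.0))
	// With a user-specified TopN, nothing is skipped.
	fmt.Println(collectTopN(samples, 500, 10.0))
}
```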
2 changes: 1 addition & 1 deletion pkg/statistics/handle/autoanalyze/autoanalyze_test.go
@@ -315,7 +315,7 @@ func TestAutoAnalyzeSkipColumnTypes(t *testing.T) {
exec.AutoAnalyzeMinCnt = originalVal
}()
require.True(t, h.HandleAutoAnalyze())
tk.MustQuery("select job_info from mysql.analyze_jobs where job_info like '%auto analyze table%'").Check(testkit.Rows("auto analyze table columns a, b, d with 256 buckets, 500 topn, 1 samplerate"))
tk.MustQuery("select job_info from mysql.analyze_jobs where job_info like '%auto analyze table%'").Check(testkit.Rows("auto analyze table columns a, b, d with 256 buckets, 100 topn, 1 samplerate"))
}

func TestAutoAnalyzeOnEmptyTable(t *testing.T) {
@@ -351,6 +351,7 @@ func testIssues24349(testKit *testkit.TestKit) {
testKit.MustExec("create table t (a int, b int) partition by hash(a) partitions 3")
testKit.MustExec("insert into t values (0, 3), (0, 3), (0, 3), (0, 2), (1, 1), (1, 2), (1, 2), (1, 2), (1, 3), (1, 4), (2, 1), (2, 1)")
testKit.MustExec("analyze table t with 1 topn, 3 buckets")
testKit.MustExec("explain select * from t where a > 0 and b > 0")
testKit.MustQuery("show stats_buckets where partition_name='global'").Check(testkit.Rows(
"test t global a 0 0 2 2 0 2 0",
"test t global b 0 0 3 1 1 2 0",
36 changes: 32 additions & 4 deletions pkg/statistics/statistics_test.go
@@ -501,7 +501,7 @@ func SubTestBuild() func(*testing.T) {
return func(t *testing.T) {
s := createTestStatisticsSamples(t)
bucketCount := int64(256)
topNCount := 20
topNCount := 100
ctx := mock.NewContext()
sc := ctx.GetSessionVars().StmtCtx
sketch, _, err := buildFMSketch(sc, s.rc.(*recordSet).data, 1000)
@@ -650,7 +650,7 @@ func TestPruneTopN(t *testing.T) {
var totalNDV, nullCnt, sampleRows, totalRows int64

// case 1
topnIn = []TopNMeta{{[]byte{1}, 100_000}, {[]byte{2}, 10}}
topnIn = []TopNMeta{{[]byte{1}, 100_000}}
totalNDV = 2
nullCnt = 0
sampleRows = 100_010
@@ -674,13 +674,41 @@

// case 3
topnIn = nil
for i := 0; i < 100; i++ {
topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 1_000})
for i := 0; i < 10; i++ {
topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 10_000})
}
totalNDV = 100
nullCnt = 0
sampleRows = 100_000
totalRows = 10_000_000
topnOut = pruneTopNItem(topnIn, totalNDV, nullCnt, sampleRows, totalRows)
require.Equal(t, topnIn, topnOut)

// case 4 - test TopN pruning for small table
topnIn = []TopNMeta{
{[]byte{1}, 3_000},
{[]byte{2}, 3_000},
}
totalNDV = 4002
nullCnt = 0
sampleRows = 10_000
totalRows = 10_000
topnOut = pruneTopNItem(topnIn, totalNDV, nullCnt, sampleRows, totalRows)
require.Equal(t, topnIn, topnOut)

// case 5 - test pruning of value=1
topnIn = nil
for i := 0; i < 10; i++ {
topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 90})
}
topnPruned := topnIn
for i := 90; i < 150; i++ {
topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 1})
}
totalNDV = 150
nullCnt = 0
sampleRows = 1500
totalRows = 1500
topnOut = pruneTopNItem(topnIn, totalNDV, nullCnt, sampleRows, totalRows)
require.Equal(t, topnPruned, topnOut)
}
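Cases 4 and 5 above exercise pruneTopNItem, whose guiding idea (per the builder.go comment) is that a value outside the TopN is assumed to have selectivity 1/remained_ndv, so a trailing TopN entry that is no more frequent than that average is not really skewed and can be dropped. The sketch below illustrates that criterion with simplified arithmetic; the function name, struct, and thresholds are assumptions for illustration, not TiDB's actual formula.

```go
// Simplified illustration of the pruning criterion behind pruneTopNItem: keep a
// TopN entry only if it is clearly more frequent than an "average" value outside
// the list. The exact arithmetic here is an assumption for illustration.
package sketch

type topNItem struct {
	encoded string
	count   uint64 // occurrences observed in the sample
}

// pruneIfNotSkewed drops trailing (least common) entries whose estimated row
// count does not exceed the average row count of a value outside the TopN.
// The input is assumed to be sorted by count in descending order.
func pruneIfNotSkewed(topns []topNItem, ndv, sampleRows, totalRows int64) []topNItem {
	if totalRows <= 1 || sampleRows <= 0 || int64(len(topns)) >= ndv || len(topns) <= 1 {
		return topns
	}
	sampleFactor := float64(totalRows) / float64(sampleRows)
	for len(topns) > 1 {
		last := topns[len(topns)-1]
		var keptSampleCnt uint64
		for _, m := range topns[:len(topns)-1] {
			keptSampleCnt += m.count
		}
		// values outside the list, counting the candidate entry itself
		remainedNDV := ndv - int64(len(topns)) + 1
		avgOutside := (float64(totalRows) - float64(keptSampleCnt)*sampleFactor) / float64(remainedNDV)
		if float64(last.count)*sampleFactor > avgOutside {
			break // the least common entry is still skewed, so keep the whole list
		}
		topns = topns[:len(topns)-1] // not skewed: prune it and re-check the next one
	}
	return topns
}
```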
4 changes: 2 additions & 2 deletions tests/integrationtest/r/executor/analyze.result
@@ -824,12 +824,12 @@ delete from mysql.analyze_jobs;
analyze table t;
select job_info from mysql.analyze_jobs where job_info like '%analyze table%';
job_info
analyze table columns a, b, d with 256 buckets, 500 topn, 1 samplerate
analyze table columns a, b, d with 256 buckets, 100 topn, 1 samplerate
delete from mysql.analyze_jobs;
analyze table t columns a, e;
select job_info from mysql.analyze_jobs where job_info like '%analyze table%';
job_info
analyze table columns a, d with 256 buckets, 500 topn, 1 samplerate
analyze table columns a, d with 256 buckets, 100 topn, 1 samplerate
set @@session.tidb_analyze_skip_column_types = default;
DROP TABLE IF EXISTS Issue34228;
CREATE TABLE Issue34228 (id bigint NOT NULL, dt datetime NOT NULL) PARTITION BY RANGE COLUMNS(dt) (PARTITION p202201 VALUES LESS THAN ("2022-02-01"), PARTITION p202202 VALUES LESS THAN ("2022-03-01"));