planner: reduce topn count to exclude non-skewed values | tidb-test=pr/2333 #53035

Merged: 64 commits, Jun 19, 2024
Commits
90bbf9e
review testing 23
terry1purcell May 22, 2024
4de3125
minor reset
terry1purcell May 17, 2024
26ffc4a
testcase1
terry1purcell May 18, 2024
ecbf8cf
testcase2
terry1purcell May 18, 2024
69691df
testcase3
terry1purcell May 18, 2024
92dfddb
testcase4
terry1purcell May 18, 2024
92a2796
testcase5
terry1purcell May 19, 2024
413e63a
testcase6
terry1purcell May 19, 2024
1b343ce
testcase7
terry1purcell May 19, 2024
abdf128
testcase8
terry1purcell May 19, 2024
91ea0cd
docs: update README.md (#53373)
DanRoscigno May 20, 2024
668b5e0
testcase9
terry1purcell May 20, 2024
557eee0
testcase10
terry1purcell May 20, 2024
6b273a8
testcase11
terry1purcell May 20, 2024
84b10bd
testcase12
terry1purcell May 20, 2024
1e338cb
testcase13
terry1purcell May 20, 2024
5961402
testcase14
terry1purcell May 20, 2024
79782f2
testcase15
terry1purcell May 21, 2024
016ff85
testcase16
terry1purcell May 21, 2024
c2a0d67
testcase17
terry1purcell May 21, 2024
44a00e1
testcase18
terry1purcell May 21, 2024
3475281
testcase19
terry1purcell May 21, 2024
fdf2414
review testing 20
terry1purcell May 21, 2024
43e3384
review testing 21
terry1purcell May 21, 2024
978f9c2
review testing 22
terry1purcell May 21, 2024
917ac20
review testing 25
terry1purcell May 23, 2024
a52e75c
Delete pkg/executor/testkit.go
terry1purcell May 23, 2024
e23d61c
review testing 27
terry1purcell May 23, 2024
26f3c42
review testing 28
terry1purcell May 23, 2024
28132ce
review testing 30
terry1purcell May 23, 2024
b6d0aea
review reset
terry1purcell May 23, 2024
046b892
review new1
terry1purcell May 23, 2024
c3984d9
Merge branch 'pingcap:master' into statstopn
terry1purcell May 28, 2024
69f429f
Merge branch 'pingcap:master' into statstopn
terry1purcell May 28, 2024
97b276f
Merge branch 'pingcap:master' into statstopn
terry1purcell May 29, 2024
8c21e79
Merge branch 'pingcap:master' into statstopn
terry1purcell May 30, 2024
1fa7890
Delete README.md
terry1purcell May 30, 2024
2562d26
review may 30
terry1purcell May 30, 2024
74b7d4c
review may 30 test2
terry1purcell May 30, 2024
132878c
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 1, 2024
6bde95a
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 2, 2024
03f2a0a
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 3, 2024
e1ac079
review comments jun3
terry1purcell Jun 3, 2024
d4715c2
review comments jun3 test1
terry1purcell Jun 3, 2024
f8fa5fd
review comments jun3 add file back
terry1purcell Jun 3, 2024
8cdfe7e
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 4, 2024
48e21c5
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 5, 2024
fdde2fb
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 6, 2024
eebb721
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 7, 2024
7a9961e
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 8, 2024
c54b9e1
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 11, 2024
0ffba90
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 11, 2024
22073bd
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 12, 2024
f1f55f9
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 13, 2024
80c2f41
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 13, 2024
dbff992
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 13, 2024
3bc1ee8
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 14, 2024
0d1597b
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 14, 2024
166ad73
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 14, 2024
09def84
june 17 merge with other updates
terry1purcell Jun 17, 2024
bc7615d
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 18, 2024
00a62ce
june 18 review comments
terry1purcell Jun 18, 2024
2cbcab8
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 18, 2024
bf7918d
Merge branch 'pingcap:master' into statstopn
terry1purcell Jun 19, 2024
2 changes: 1 addition & 1 deletion pkg/executor/show_stats_test.go
@@ -407,7 +407,7 @@ func TestShowAnalyzeStatus(t *testing.T) {
require.Equal(t, "test", rows[0][0])
require.Equal(t, "t", rows[0][1])
require.Equal(t, "", rows[0][2])
require.Equal(t, "analyze table all columns with 256 buckets, 500 topn, 1 samplerate", rows[0][3])
require.Equal(t, "analyze table all columns with 256 buckets, 100 topn, 1 samplerate", rows[0][3])
require.Equal(t, "2", rows[0][4])
checkTime := func(val any) {
str, ok := val.(string)
8 changes: 4 additions & 4 deletions pkg/executor/test/analyzetest/analyze_test.go
@@ -1916,7 +1916,7 @@ func testKillAutoAnalyze(t *testing.T, ver int) {
if ver == 1 {
jobInfo += "columns"
} else {
jobInfo += "table all columns with 256 buckets, 500 topn, 1 samplerate"
jobInfo += "table all columns with 256 buckets, 100 topn, 1 samplerate"
}
// kill auto analyze when it is pending/running/finished
for _, status := range []string{
@@ -2041,7 +2041,7 @@ func TestAnalyzeJob(t *testing.T) {
DBName: "test",
TableName: "t",
PartitionName: "",
JobInfo: "table all columns with 256 buckets, 500 topn, 1 samplerate",
JobInfo: "table all columns with 256 buckets, 100 topn, 1 samplerate",
}
executor.AddNewAnalyzeJob(se, job)
require.NotNil(t, job.ID)
@@ -2133,7 +2133,7 @@ func TestInsertAnalyzeJobWithLongInstance(t *testing.T) {
DBName: "test",
TableName: "t",
PartitionName: "",
JobInfo: "table all columns with 256 buckets, 500 topn, 1 samplerate",
JobInfo: "table all columns with 256 buckets, 100 topn, 1 samplerate",
}
h := dom.StatsHandle()
instance := "xxxtidb-tidb-0.xxxtidb-tidb-peer.xxxx-xx-1234-xxx-123456-1-321.xyz:4000"
@@ -2785,7 +2785,7 @@ func TestAnalyzeColumnsSkipMVIndexJsonCol(t *testing.T) {
tk.MustQuery("select job_info from mysql.analyze_jobs where table_schema = 'test' and table_name = 't'").Sort().Check(
testkit.Rows(
"analyze index idx_c",
"analyze table columns a, b with 256 buckets, 500 topn, 1 samplerate",
"analyze table columns a, b with 256 buckets, 100 topn, 1 samplerate",
))

is := dom.InfoSchema()
6 changes: 3 additions & 3 deletions pkg/planner/cardinality/testdata/cardinality_suite_out.json
@@ -24,7 +24,7 @@
{
"Start": 800,
"End": 900,
"Count": 771.504166655054
"Count": 755.754166655054
},
{
"Start": 900,
@@ -79,7 +79,7 @@
{
"Start": 800,
"End": 1000,
"Count": 1229.696869573942
"Count": 1213.946869573942
},
{
"Start": 900,
@@ -104,7 +104,7 @@
{
"Start": 200,
"End": 400,
"Count": 1226.2788209899081
"Count": 1215.0288209899081
},
{
"Start": 200,
4 changes: 3 additions & 1 deletion pkg/planner/core/planbuilder.go
@@ -2696,9 +2696,11 @@ var analyzeOptionDefault = map[ast.AnalyzeOptionType]uint64{
ast.AnalyzeOptSampleRate: math.Float64bits(0),
}

// TopN reduced from 500 to 100 due to concerns over the large number of TopN values collected for customers with many tables.
// 100 is more in line with other databases. 100-256 is also common for NumBuckets in other databases.
var analyzeOptionDefaultV2 = map[ast.AnalyzeOptionType]uint64{
ast.AnalyzeOptNumBuckets: 256,
ast.AnalyzeOptNumTopN: 500,
ast.AnalyzeOptNumTopN: 100,
ast.AnalyzeOptCMSketchWidth: 2048,
ast.AnalyzeOptCMSketchDepth: 5,
ast.AnalyzeOptNumSamples: 0,
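For quick reference, the sketch below restates the V2 analyze defaults as they stand after this change. It is illustrative only: the keys are plain strings rather than ast.AnalyzeOptionType constants, and options not shown in the hunk above are omitted. A per-statement option such as `ANALYZE TABLE t WITH 500 TOPN` still overrides the default, which is what the builder change below relies on.

```go
// Illustrative restatement of the V2 analyze-option defaults after this PR.
// Keys are simplified strings; this is not the TiDB source map.
package sketch

var analyzeOptionDefaultV2Sketch = map[string]uint64{
	"NumBuckets":    256,
	"NumTopN":       100, // lowered from 500 in this PR
	"CMSketchWidth": 2048,
	"CMSketchDepth": 5,
	"NumSamples":    0,
	// sample-rate default omitted; see planbuilder.go for the full map
}
```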
32 changes: 22 additions & 10 deletions pkg/statistics/builder.go
@@ -296,6 +296,11 @@ func BuildHistAndTopN(
sampleNum := int64(len(samples))
// As we use samples to build the histogram, the bucket number and repeat should multiply a factor.
sampleFactor := float64(count) / float64(len(samples))
// If a numTopN value other than 100 is passed in, we assume it's a value that the user wants us to honor.
allowPruning := true
if numTopN != 100 {
allowPruning = false
}

// Step1: collect topn from samples

@@ -326,18 +331,23 @@
continue
}
// case 2, meet a different value: counting for the "current" is complete
// case 2-1, now topn is empty: append the "current" count directly
// case 2-1, do not add a count of 1 if we're sampling
if curCnt == 1 && sampleFactor > 1 && allowPruning {
cur, curCnt = sampleBytes, 1
continue
}
// case 2-2, now topn is empty: append the "current" count directly
if len(topNList) == 0 {
topNList = append(topNList, TopNMeta{Encoded: cur, Count: uint64(curCnt)})
cur, curCnt = sampleBytes, 1
continue
}
// case 2-2, now topn is full, and the "current" count is less than the least count in the topn: no need to insert the "current"
// case 2-3, now topn is full, and the "current" count is less than the least count in the topn: no need to insert the "current"
if len(topNList) >= numTopN && uint64(curCnt) <= topNList[len(topNList)-1].Count {
cur, curCnt = sampleBytes, 1
continue
}
// case 2-3, now topn is not full, or the "current" count is larger than the least count in the topn: need to find a slot to insert the "current"
// case 2-4, now topn is not full, or the "current" count is larger than the least count in the topn: need to find a slot to insert the "current"
j := len(topNList)
for ; j > 0; j-- {
if uint64(curCnt) < topNList[j-1].Count {
@@ -358,9 +368,10 @@
hg.Correlation = calcCorrelation(sampleNum, corrXYSum)
}

// Handle the counting for the last value. Basically equal to the case 2 above.
// now topn is empty: append the "current" count directly
if numTopN != 0 {
// Handle the counting for the last value. Basically equal to the case 2 above - including
// limiting addition of a value with a count of 1 (since it will be pruned anyway).
if numTopN != 0 && (!allowPruning || (allowPruning && (sampleFactor <= 1 || curCnt > 1))) {
// now topn is empty: append the "current" count directly
if len(topNList) == 0 {
topNList = append(topNList, TopNMeta{Encoded: cur, Count: uint64(curCnt)})
} else if len(topNList) < numTopN || uint64(curCnt) > topNList[len(topNList)-1].Count {
@@ -380,7 +391,9 @@
}
}

topNList = pruneTopNItem(topNList, ndv, nullCount, sampleNum, count)
if allowPruning {
topNList = pruneTopNItem(topNList, ndv, nullCount, sampleNum, count)
}

// Step2: exclude topn from samples
if numTopN != 0 {
@@ -435,7 +448,7 @@
topn.Scale(sampleFactor)

if uint64(count) <= topn.TotalCount() || int(hg.NDV) <= len(topn.TopN) {
// TopN includes all sample data
// If we've collected everything - don't create any buckets
return hg, topn, nil
}

@@ -454,8 +467,7 @@
//
// We assume that the ones not in the top-n list's selectivity is 1/remained_ndv which is the internal implementation of EqualRowCount
func pruneTopNItem(topns []TopNMeta, ndv, nullCount, sampleRows, totalRows int64) []TopNMeta {
// If the sampleRows holds all rows, or NDV of samples equals to actual NDV, we just return the TopN directly.
if sampleRows == totalRows || totalRows <= 1 || int64(len(topns)) >= ndv || len(topns) == 0 {
if totalRows <= 1 || int64(len(topns)) >= ndv || len(topns) <= 1 {
return topns
}
// Sum the occurrence except the least common one from the top-n list. To check whether the lest common one is worth
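The heart of the builder change is that, when the default TopN of 100 is in effect (allowPruning) and the data is sampled (sampleFactor > 1), a value seen exactly once in the sample is never promoted into the TopN, and the resulting list is still passed through pruneTopNItem. The sketch below shows only that skip-count-1 idea in a self-contained form; topNMeta and collectTopN are simplified illustrative names, not TiDB APIs, and none of the histogram bookkeeping is included.

```go
// Illustrative sketch only: skip single-occurrence sampled values when the
// default TopN (100) is in effect, mirroring the allowPruning logic above.
package main

import "fmt"

type topNMeta struct {
	encoded string
	count   uint64
}

// collectTopN walks sorted sample values, counts runs of equal values, and
// keeps at most numTopN entries. When pruning is allowed (default TopN) and
// the data is sampled, a run of length 1 carries no evidence of skew and is
// skipped instead of being added to the list.
func collectTopN(sorted []string, numTopN int, sampleFactor float64) []topNMeta {
	allowPruning := numTopN == 100 // any other value is treated as user-specified
	var out []topNMeta
	for i := 0; i < len(sorted); {
		j := i
		for j < len(sorted) && sorted[j] == sorted[i] {
			j++
		}
		cnt := uint64(j - i)
		skip := allowPruning && sampleFactor > 1 && cnt == 1
		if !skip && len(out) < numTopN {
			out = append(out, topNMeta{encoded: sorted[i], count: cnt})
		}
		i = j
	}
	return out
}

func main() {
	samples := []string{"a", "a", "a", "b", "c"}
	// With sampling in effect, only "a" is kept; "b" and "c" occur once and are skipped.
	fmt.Println(collectTopN(samples, 100, 10.0))
	// With a user-specified TopN, nothing is skipped.
	fmt.Println(collectTopN(samples, 500, 10.0))
}
```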
2 changes: 1 addition & 1 deletion pkg/statistics/handle/autoanalyze/autoanalyze_test.go
@@ -315,7 +315,7 @@ func TestAutoAnalyzeSkipColumnTypes(t *testing.T) {
exec.AutoAnalyzeMinCnt = originalVal
}()
require.True(t, h.HandleAutoAnalyze())
tk.MustQuery("select job_info from mysql.analyze_jobs where job_info like '%auto analyze table%'").Check(testkit.Rows("auto analyze table columns a, b, d with 256 buckets, 500 topn, 1 samplerate"))
tk.MustQuery("select job_info from mysql.analyze_jobs where job_info like '%auto analyze table%'").Check(testkit.Rows("auto analyze table columns a, b, d with 256 buckets, 100 topn, 1 samplerate"))
}

func TestAutoAnalyzeOnEmptyTable(t *testing.T) {
@@ -351,6 +351,7 @@ func testIssues24349(testKit *testkit.TestKit) {
testKit.MustExec("create table t (a int, b int) partition by hash(a) partitions 3")
testKit.MustExec("insert into t values (0, 3), (0, 3), (0, 3), (0, 2), (1, 1), (1, 2), (1, 2), (1, 2), (1, 3), (1, 4), (2, 1), (2, 1)")
testKit.MustExec("analyze table t with 1 topn, 3 buckets")
testKit.MustExec("explain select * from t where a > 0 and b > 0")
testKit.MustQuery("show stats_buckets where partition_name='global'").Check(testkit.Rows(
"test t global a 0 0 2 2 0 2 0",
"test t global b 0 0 3 1 1 2 0",
36 changes: 32 additions & 4 deletions pkg/statistics/statistics_test.go
@@ -501,7 +501,7 @@ func SubTestBuild() func(*testing.T) {
return func(t *testing.T) {
s := createTestStatisticsSamples(t)
bucketCount := int64(256)
topNCount := 20
topNCount := 100
ctx := mock.NewContext()
sc := ctx.GetSessionVars().StmtCtx
sketch, _, err := buildFMSketch(sc, s.rc.(*recordSet).data, 1000)
@@ -650,7 +650,7 @@ func TestPruneTopN(t *testing.T) {
var totalNDV, nullCnt, sampleRows, totalRows int64

// case 1
topnIn = []TopNMeta{{[]byte{1}, 100_000}, {[]byte{2}, 10}}
topnIn = []TopNMeta{{[]byte{1}, 100_000}}
totalNDV = 2
nullCnt = 0
sampleRows = 100_010
@@ -674,13 +674,41 @@

// case 3
topnIn = nil
for i := 0; i < 100; i++ {
topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 1_000})
for i := 0; i < 10; i++ {
topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 10_000})
}
totalNDV = 100
nullCnt = 0
sampleRows = 100_000
totalRows = 10_000_000
topnOut = pruneTopNItem(topnIn, totalNDV, nullCnt, sampleRows, totalRows)
require.Equal(t, topnIn, topnOut)

// case 4 - test TopN pruning for small table
topnIn = []TopNMeta{
{[]byte{1}, 3_000},
{[]byte{2}, 3_000},
}
totalNDV = 4002
nullCnt = 0
sampleRows = 10_000
totalRows = 10_000
topnOut = pruneTopNItem(topnIn, totalNDV, nullCnt, sampleRows, totalRows)
require.Equal(t, topnIn, topnOut)

// case 5 - test pruning of value=1
topnIn = nil
for i := 0; i < 10; i++ {
topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 90})
}
topnPruned := topnIn
for i := 90; i < 150; i++ {
topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 1})
}
totalNDV = 150
nullCnt = 0
sampleRows = 1500
totalRows = 1500
topnOut = pruneTopNItem(topnIn, totalNDV, nullCnt, sampleRows, totalRows)
require.Equal(t, topnPruned, topnOut)
}
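Cases 4 and 5 above exercise pruneTopNItem, whose guiding idea (per the builder.go comment) is that a value outside the TopN is assumed to have selectivity 1/remained_ndv, so a trailing TopN entry that is no more frequent than that average is not really skewed and can be dropped. The sketch below illustrates that criterion with simplified arithmetic; the function name, struct, and thresholds are assumptions for illustration, not TiDB's actual formula.

```go
// Simplified illustration of the pruning criterion behind pruneTopNItem: keep a
// TopN entry only if it is clearly more frequent than an "average" value outside
// the list. The exact arithmetic here is an assumption for illustration.
package sketch

type topNItem struct {
	encoded string
	count   uint64 // occurrences observed in the sample
}

// pruneIfNotSkewed drops trailing (least common) entries whose estimated row
// count does not exceed the average row count of a value outside the TopN.
// The input is assumed to be sorted by count in descending order.
func pruneIfNotSkewed(topns []topNItem, ndv, sampleRows, totalRows int64) []topNItem {
	if totalRows <= 1 || sampleRows <= 0 || int64(len(topns)) >= ndv || len(topns) <= 1 {
		return topns
	}
	sampleFactor := float64(totalRows) / float64(sampleRows)
	for len(topns) > 1 {
		last := topns[len(topns)-1]
		var keptSampleCnt uint64
		for _, m := range topns[:len(topns)-1] {
			keptSampleCnt += m.count
		}
		// values outside the list, counting the candidate entry itself
		remainedNDV := ndv - int64(len(topns)) + 1
		avgOutside := (float64(totalRows) - float64(keptSampleCnt)*sampleFactor) / float64(remainedNDV)
		if float64(last.count)*sampleFactor > avgOutside {
			break // the least common entry is still skewed, so keep the whole list
		}
		topns = topns[:len(topns)-1] // not skewed: prune it and re-check the next one
	}
	return topns
}
```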
4 changes: 2 additions & 2 deletions tests/integrationtest/r/executor/analyze.result
@@ -824,12 +824,12 @@ delete from mysql.analyze_jobs;
analyze table t;
select job_info from mysql.analyze_jobs where job_info like '%analyze table%';
job_info
analyze table columns a, b, d with 256 buckets, 500 topn, 1 samplerate
analyze table columns a, b, d with 256 buckets, 100 topn, 1 samplerate
delete from mysql.analyze_jobs;
analyze table t columns a, e;
select job_info from mysql.analyze_jobs where job_info like '%analyze table%';
job_info
analyze table columns a, d with 256 buckets, 500 topn, 1 samplerate
analyze table columns a, d with 256 buckets, 100 topn, 1 samplerate
set @@session.tidb_analyze_skip_column_types = default;
DROP TABLE IF EXISTS Issue34228;
CREATE TABLE Issue34228 (id bigint NOT NULL, dt datetime NOT NULL) PARTITION BY RANGE COLUMNS(dt) (PARTITION p202201 VALUES LESS THAN ("2022-02-01"), PARTITION p202202 VALUES LESS THAN ("2022-03-01"));