Skip to content

Commit efe47c9

Browse files
MM-61311: Version-controlled config samples (#839)
* Add sample files in TOML for comparison and coord
* Add release testing config set
* Add performance comparison config set
* Add some documentation to surface the new samples
1 parent 4e7633c commit efe47c9

File tree

12 files changed

+849
-0
lines changed

12 files changed

+849
-0
lines changed

config/comparison.sample.toml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
[BaseBuild]
2+
Label = 'master'
3+
URL = 'file://master.tar.gz'
4+
5+
[NewBuild]
6+
Label = 'release'
7+
URL = 'file://release.tar.gz'
8+
9+
[[LoadTests]]
10+
Type = 'unbounded'
11+
DBEngine = 'mysql'
12+
13+
[[LoadTests]]
14+
Type = 'bounded'
15+
DBEngine = 'mysql'
16+
NumUsers = 1000
17+
Duration = '1h'
18+
19+
[[LoadTests]]
20+
Type = 'unbounded'
21+
DBEngine = 'postgresql'
22+
23+
[[LoadTests]]
24+
Type = 'bounded'
25+
DBEngine = 'postgresql'
26+
NumUsers = 1000
27+
Duration = '1h'
28+
29+
[Output]
30+
UploadDashboard = true
31+
GenerateGraphs = false
32+
GenerateReport = true

config/coordinator.sample.toml

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
NumUsersInc = 8
2+
NumUsersDec = 8
3+
RestTimeSec = 2
4+
5+
[ClusterConfig]
6+
MaxActiveUsers = 2000
7+
8+
[[ClusterConfig.Agents]]
9+
Id = 'lt0'
10+
ApiURL = 'http://localhost:4000'
11+
12+
[MonitorConfig]
13+
PrometheusURL = 'http://localhost:9090'
14+
UpdateIntervalMs = 2000
15+
16+
[[MonitorConfig.Queries]]
17+
Description = 'Percentage of HTTP 5xx server errors'
18+
Legend = 'Percent'
19+
Query = '(sum(rate(mattermost_api_time_count{status_code=~"5.."}[1m]))/sum(rate(mattermost_api_time_count[1m])))*100'
20+
Threshold = 0.025
21+
MinIntervalSec = 60
22+
Alert = true
23+
24+
[[MonitorConfig.Queries]]
25+
Description = 'Average client request duration'
26+
Legend = 'Avg duration (s)'
27+
Query = 'sum(rate(loadtest_http_request_time_sum[1m]))/sum(rate(loadtest_http_request_time_count[1m]))'
28+
Threshold = 0.1
29+
MinIntervalSec = 60
30+
Alert = true
31+
32+
[[MonitorConfig.Queries]]
33+
Description = '99th percentile of client request duration'
34+
Legend = 'P99 duration (s)'
35+
Query = 'histogram_quantile(0.99, sum(rate(loadtest_http_request_time_bucket[1m])) by (le))'
36+
Threshold = 2
37+
MinIntervalSec = 60
38+
Alert = true
39+
40+
[[MonitorConfig.Queries]]
41+
Description = 'Percentage of HTTP 5xx client errors'
42+
Legend = 'Percent'
43+
Query = '(sum(rate(loadtest_http_errors_total{status_code=~"5.."}[1m]))/sum(rate(loadtest_http_request_time_count[1m])))*100'
44+
Threshold = 0.025
45+
MinIntervalSec = 60
46+
Alert = true
47+
48+
[[MonitorConfig.Queries]]
49+
Description = 'Percentage of client timeouts'
50+
Legend = 'Percent'
51+
Query = '(sum(rate(loadtest_http_timeouts_total[1m]))/sum(rate(loadtest_http_request_time_count[1m]))) * 100'
52+
Threshold = 0.025
53+
MinIntervalSec = 60
54+
Alert = true
55+
56+
[[MonitorConfig.Queries]]
57+
Description = 'CPU utilization - Average of app nodes'
58+
Legend = 'Percent'
59+
Query = '100 - 100 * (avg(irate(node_cpu_seconds_total{instance=~"app.*",mode="idle"}[5m])))'
60+
Threshold = 85
61+
MinIntervalSec = 60
62+
Alert = true
63+
64+
[[MonitorConfig.Queries]]
65+
Description = 'Memory utilization - Average of app nodes'
66+
Legend = 'Percent'
67+
Query = '100 - 100 * avg(node_memory_MemAvailable_bytes{instance=~"app.*"} / node_memory_MemTotal_bytes{instance=~"app.*"})'
68+
Threshold = 85
69+
MinIntervalSec = 60
70+
Alert = true
71+
72+
[[MonitorConfig.Queries]]
73+
Description = 'Percentage of TCP retransmissions in the app nodes'
74+
Legend = 'Percent'
75+
Query = '(avg(rate(node_netstat_Tcp_RetransSegs{instance=~"app.*"}[1m])) / avg(rate(node_netstat_Tcp_OutSegs{instance=~"app.*"}[1m]))) * 100'
76+
Threshold = 0.5
77+
MinIntervalSec = 60
78+
Alert = true
79+
80+
[[MonitorConfig.Queries]]
81+
Description = 'Percentage of TCP retransmissions in the proxy node'
82+
Legend = 'Percent'
83+
Query = '(avg(rate(node_netstat_Tcp_RetransSegs{instance=~"proxy:9100"}[1m])) / avg(rate(node_netstat_Tcp_OutSegs{instance=~"proxy:9100"}[1m]))) * 100'
84+
Threshold = 0.5
85+
MinIntervalSec = 60
86+
Alert = true
87+
88+
[LogSettings]
89+
EnableConsole = true
90+
ConsoleLevel = 'INFO'
91+
ConsoleJson = false
92+
EnableFile = true
93+
FileLevel = 'INFO'
94+
FileJson = true
95+
FileLocation = 'ltcoordinator.log'
96+
EnableColor = false

docs/readme.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,7 @@ Once you have familiarized yourself with the tool, and after you have successful
4747
- [Running an automated load-test comparison](comparison.md): a workflow specifically designed for when you need to compare two different versions of Mattermost while maintaining the rest of the variables fixed. This is what the Server team at Mattermost uses for the monthly release performance comparisons.
4848
- [Generating data](generating-data.md): for larger load-tests, you'll need larger datasets. This guide describes how you can use the gencontroller to create an arbitrary number of teams, channels, posts, reactions... to use as the starting point for future tests.
4949

50+
51+
## Configuration samples
52+
53+
We know that the configuration of the load-test tool can be overwhelming, especially for newcomers. We actively use, and keep up to date, several sets of config templates in the [`examples/config` directory](../examples/config). Take a look at the files there to learn from real-world config files.

examples/config/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Configuration samples
2+
3+
This directory contains sets of configuration templates that we use in different scenarios. Some fields are hard-coded to the values we use in our day-to-day processes (e.g. the path to the SSH keys), and others are marked as `#TBD` because they may change from run to run (e.g. the URLs to download Mattermost from). In any case, these sets can serve as starter packs for other, different workflows. For now, we have:
4+
- [Release testing](./release): configuration used when testing a new release of the load-test tool.
5+
- [Performance comparison](./perfcomp): configuration used for regression testing of new Mattermost releases. The results of these runs can be found in the [`performance-reports` repository](https://github.com/mattermost/performance-reports/tree/main/performance-comparisons).
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
[BaseBuild]
2+
Label = 'release-X.Y.Z' #TBD
3+
URL = 'https://releases.mattermost.com/X.Y.Z/mattermost-enterprise-X.Y.Z-linux-amd64.tar.gz' #TBD
4+
5+
[NewBuild]
6+
Label = 'release-A.B.C-rcN' #TBD
7+
URL = 'https://releases.mattermost.com/A.B.C-rcN/mattermost-enterprise-A.B.C-rcN-linux-amd64.tar.gz' #TBD
8+
9+
[[LoadTests]]
10+
Type = 'unbounded'
11+
DBEngine = 'postgresql'
12+
DBDumpURL = 'https://lt-public-data.s3.amazonaws.com/12M_610_psql.sql.gz'
13+
14+
[[LoadTests]]
15+
Type = 'bounded'
16+
DBEngine = 'postgresql'
17+
DBDumpURL = 'https://lt-public-data.s3.amazonaws.com/12M_610_psql.sql.gz'
18+
NumUsers = 7500
19+
Duration = '90m'
20+
21+
[[LoadTests]]
22+
Type = 'unbounded'
23+
DBEngine = 'mysql'
24+
DBDumpURL = 'https://lt-public-data.s3.amazonaws.com/12M_610_mysql.sql.gz'
25+
26+
[[LoadTests]]
27+
Type = 'bounded'
28+
DBEngine = 'mysql'
29+
DBDumpURL = 'https://lt-public-data.s3.amazonaws.com/12M_610_mysql.sql.gz'
30+
NumUsers = 5000
31+
Duration = '90m'
32+
33+
[Output]
34+
UploadDashboard = true
35+
GenerateGraphs = true
36+
GenerateReport = true

examples/config/perfcomp/config.toml

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
[ConnectionConfiguration]
2+
ServerURL = 'http://localhost:8065'
3+
WebSocketURL = 'ws://localhost:8065'
4+
AdminEmail = 'sysadmin@sample.mattermost.com'
5+
AdminPassword = 'Sys@dmin-sample1'
6+
7+
[UserControllerConfiguration]
8+
Type = 'simulative'
9+
10+
[[UserControllerConfiguration.RatesDistribution]]
11+
Rate = 1.0
12+
Percentage = 0.05
13+
14+
[[UserControllerConfiguration.RatesDistribution]]
15+
Rate = 2.0
16+
Percentage = 0.1
17+
18+
[[UserControllerConfiguration.RatesDistribution]]
19+
Rate = 3.0
20+
Percentage = 0.15
21+
22+
[[UserControllerConfiguration.RatesDistribution]]
23+
Rate = 6.0
24+
Percentage = 0.4
25+
26+
[[UserControllerConfiguration.RatesDistribution]]
27+
Rate = 30.0
28+
Percentage = 0.3
29+
30+
[InstanceConfiguration]
31+
NumTeams = 2
32+
NumChannels = 0
33+
NumPosts = 0
34+
NumReactions = 0
35+
NumAdmins = 0
36+
PercentReplies = 0.5
37+
PercentRepliesInLongThreads = 0.05
38+
PercentPublicChannels = 1
39+
PercentPrivateChannels = 0
40+
PercentDirectChannels = 0
41+
PercentGroupChannels = 0
42+
PercentUrgentPosts = 0.001
43+
44+
[UsersConfiguration]
45+
InitialActiveUsers = 0
46+
UsersFilePath = ''
47+
MaxActiveUsers = 2000
48+
AvgSessionsPerUser = 1
49+
PercentOfUsersAreAdmin = 0.0005
50+
51+
[LogSettings]
52+
EnableConsole = true
53+
ConsoleLevel = 'DEBUG'
54+
ConsoleJson = false
55+
EnableFile = true
56+
FileLevel = 'DEBUG'
57+
FileJson = true
58+
FileLocation = 'ltagent.log'
59+
EnableColor = true
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
NumUsersInc = 8
2+
NumUsersDec = 8
3+
RestTimeSec = 2
4+
5+
[ClusterConfig]
6+
MaxActiveUsers = 20000
7+
8+
[[ClusterConfig.Agents]]
9+
Id = 'lt0'
10+
ApiURL = 'http://localhost:4000'
11+
12+
[MonitorConfig]
13+
PrometheusURL = 'http://localhost:9090'
14+
UpdateIntervalMs = 2000
15+
16+
[[MonitorConfig.Queries]]
17+
Description = 'Percentage of HTTP 5xx server errors'
18+
Legend = 'Percent'
19+
Query = '(sum(rate(mattermost_api_time_count{status_code=~"5.."}[1m]))/sum(rate(mattermost_api_time_count[1m])))*100'
20+
Threshold = 0.025
21+
MinIntervalSec = 60
22+
Alert = true
23+
24+
[[MonitorConfig.Queries]]
25+
Description = 'Average client request duration'
26+
Legend = 'Avg duration (s)'
27+
Query = 'sum(rate(loadtest_http_request_time_sum[1m]))/sum(rate(loadtest_http_request_time_count[1m]))'
28+
Threshold = 0.1
29+
MinIntervalSec = 60
30+
Alert = true
31+
32+
[[MonitorConfig.Queries]]
33+
Description = '99th percentile of client request duration'
34+
Legend = 'P99 duration (s)'
35+
Query = 'histogram_quantile(0.99, sum(rate(loadtest_http_request_time_bucket[1m])) by (le))'
36+
Threshold = 2
37+
MinIntervalSec = 60
38+
Alert = true
39+
40+
[[MonitorConfig.Queries]]
41+
Description = 'Percentage of HTTP 5xx client errors'
42+
Legend = 'Percent'
43+
Query = '(sum(rate(loadtest_http_errors_total{status_code=~"5.."}[1m]))/sum(rate(loadtest_http_request_time_count[1m])))*100'
44+
Threshold = 0.025
45+
MinIntervalSec = 60
46+
Alert = true
47+
48+
[[MonitorConfig.Queries]]
49+
Description = 'Percentage of client timeouts'
50+
Legend = 'Percent'
51+
Query = '(sum(rate(loadtest_http_timeouts_total[1m]))/sum(rate(loadtest_http_request_time_count[1m]))) * 100'
52+
Threshold = 0.025
53+
MinIntervalSec = 60
54+
Alert = true
55+
56+
[[MonitorConfig.Queries]]
57+
Description = 'CPU utilization - Average of app nodes'
58+
Legend = 'Percent'
59+
Query = '100 - 100 * (avg(irate(node_cpu_seconds_total{instance=~"app.*",mode="idle"}[5m])))'
60+
Threshold = 85
61+
MinIntervalSec = 60
62+
Alert = true
63+
64+
[[MonitorConfig.Queries]]
65+
Description = 'Memory utilization - Average of app nodes'
66+
Legend = 'Percent'
67+
Query = '100 - 100 * avg(node_memory_MemAvailable_bytes{instance=~"app.*"} / node_memory_MemTotal_bytes{instance=~"app.*"})'
68+
Threshold = 85
69+
MinIntervalSec = 60
70+
Alert = true
71+
72+
[[MonitorConfig.Queries]]
73+
Description = 'Percentage of TCP retransmissions in the app nodes'
74+
Legend = 'Percent'
75+
Query = '(avg(rate(node_netstat_Tcp_RetransSegs{instance=~"app.*"}[1m])) / avg(rate(node_netstat_Tcp_OutSegs{instance=~"app.*"}[1m]))) * 100'
76+
Threshold = 0.5
77+
MinIntervalSec = 60
78+
Alert = true
79+
80+
[[MonitorConfig.Queries]]
81+
Description = 'Percentage of TCP retransmissions in the proxy node'
82+
Legend = 'Percent'
83+
Query = '(avg(rate(node_netstat_Tcp_RetransSegs{instance=~"proxy:9100"}[1m])) / avg(rate(node_netstat_Tcp_OutSegs{instance=~"proxy:9100"}[1m]))) * 100'
84+
Threshold = 0.5
85+
MinIntervalSec = 60
86+
Alert = true
87+
88+
[LogSettings]
89+
EnableConsole = true
90+
ConsoleLevel = 'INFO'
91+
ConsoleJson = false
92+
EnableFile = true
93+
FileLevel = 'INFO'
94+
FileJson = true
95+
FileLocation = 'ltcoordinator.log'
96+
EnableColor = false

0 commit comments

Comments
 (0)