Skip to content

Commit 46d0be8

Browse files
MM-52240: Stop/Start DB Cluster after load test (#690)
We add the ability to stop/start the DB cluster after a load test is complete. There is a dedicated command that users can run at any time. Additionally, there is an automated way to stop the DB after a load test finishes, which is exposed via a flag. Using the flag makes the load test run synchronously, so it is recommended only for advanced users, preferably running the command inside an EC2 instance in a screen session. https://mattermost.atlassian.net/browse/MM-52240 Co-authored-by: Alejandro García Montoro <[email protected]>
1 parent bcdd67f commit 46d0be8

File tree

8 files changed

+276
-13
lines changed

8 files changed

+276
-13
lines changed

cmd/ltctl/loadtest.go

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@ import (
1515
"github.com/spf13/cobra"
1616
)
1717

18+
// DB cluster status strings that the load-test start command compares
// against the value returned by Terraform.DBStatus.
const (
	// dbStopped is the status for which the start command first starts the
	// DB cluster before launching the test.
	dbStopped = "stopped"
	// dbAvailable is the status the start command waits for before it
	// launches the coordinator.
	dbAvailable = "available"
)
22+
1823
func getUsersCount(helper *prometheus.Helper) (int, error) {
1924
query := "sum(mattermost_http_websockets_total)"
2025
value, err := helper.VectorFirst(query)
@@ -68,7 +73,75 @@ func RunLoadTestStartCmdF(cmd *cobra.Command, args []string) error {
6873
if err != nil {
6974
return fmt.Errorf("failed to create terraform engine: %w", err)
7075
}
71-
return t.StartCoordinator(nil)
76+
77+
status, err := t.DBStatus()
78+
if err != nil {
79+
return fmt.Errorf("failed to get DB status: %w", err)
80+
}
81+
82+
if status == dbStopped {
83+
if err := t.StartDB(); err != nil {
84+
return err
85+
}
86+
87+
fmt.Println("=====================")
88+
fmt.Println("Looping until the DB is fully available. You can cancel the command and start the test after some time, or don't do anything and it will automatically start the test after the DB is ready")
89+
fmt.Println("=====================")
90+
// Now we loop until the DB is available.
91+
92+
for {
93+
status, err := t.DBStatus()
94+
if err != nil {
95+
return fmt.Errorf("failed to get DB status: %w", err)
96+
}
97+
if status == dbAvailable {
98+
break
99+
}
100+
fmt.Println("Sleeping... ")
101+
time.Sleep(30 * time.Second)
102+
}
103+
} else if status != dbAvailable {
104+
fmt.Printf("The database isn't available at the moment. Its status is %q. Please wait until it has finished, and then try again. \n", status)
105+
return nil
106+
}
107+
108+
isSync, err := cmd.Flags().GetBool("sync")
109+
if err != nil {
110+
return fmt.Errorf("unable to check -sync flag: %w", err)
111+
}
112+
113+
// We simply return in async mode, which is the default.
114+
if !isSync {
115+
return t.StartCoordinator(nil)
116+
}
117+
118+
err = t.StartCoordinator(nil)
119+
if err != nil {
120+
return fmt.Errorf("error in starting coordinator: %w", err)
121+
}
122+
123+
// Now we keep checking the status of the coordinator until it's done.
124+
var coordStatus coordinator.Status
125+
for {
126+
coordStatus, err = t.GetCoordinatorStatus()
127+
if err != nil {
128+
return err
129+
}
130+
131+
if coordStatus.State == coordinator.Done {
132+
fmt.Println("load-test has completed")
133+
break
134+
}
135+
136+
fmt.Println("Sleeping ...")
137+
// Sleeping for 5 minutes gives 12 lines an hour.
138+
// For an avg unbounded test of 4-5 hours, it gives around 50 lines,
139+
// which should be acceptable.
140+
time.Sleep(5 * time.Minute)
141+
}
142+
143+
// Now we stop the DB.
144+
return t.StopDB()
72145
}
73146

74147
func RunLoadTestStopCmdF(cmd *cobra.Command, args []string) error {

cmd/ltctl/main.go

Lines changed: 72 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,55 @@ func RunSyncCmdF(cmd *cobra.Command, args []string) error {
8585
return t.Sync()
8686
}
8787

88+
func RunStopDBCmdF(cmd *cobra.Command, args []string) error {
89+
config, err := getConfig(cmd)
90+
if err != nil {
91+
return err
92+
}
93+
94+
t, err := terraform.New("", config)
95+
if err != nil {
96+
return fmt.Errorf("failed to create terraform engine: %w", err)
97+
}
98+
99+
return t.StopDB()
100+
}
101+
102+
func RunStartDBCmdF(cmd *cobra.Command, args []string) error {
103+
config, err := getConfig(cmd)
104+
if err != nil {
105+
return err
106+
}
107+
108+
t, err := terraform.New("", config)
109+
if err != nil {
110+
return fmt.Errorf("failed to create terraform engine: %w", err)
111+
}
112+
113+
return t.StartDB()
114+
}
115+
116+
func RunDBStatusCmdF(cmd *cobra.Command, args []string) error {
117+
config, err := getConfig(cmd)
118+
if err != nil {
119+
return err
120+
}
121+
122+
t, err := terraform.New("", config)
123+
if err != nil {
124+
return fmt.Errorf("failed to create terraform engine: %w", err)
125+
}
126+
127+
status, err := t.DBStatus()
128+
if err != nil {
129+
return fmt.Errorf("failed to get DB status: %w", err)
130+
}
131+
132+
fmt.Println("Status: ", status)
133+
134+
return nil
135+
}
136+
88137
func RunSSHListCmdF(cmd *cobra.Command, args []string) error {
89138
config, err := getConfig(cmd)
90139
if err != nil {
@@ -166,6 +215,21 @@ func main() {
166215
Short: "Syncs the local .tfstate file with any changes made remotely",
167216
RunE: RunSyncCmdF,
168217
},
218+
{
219+
Use: "stop-db",
220+
Short: "Stops the DB cluster and syncs the changes.",
221+
RunE: RunStopDBCmdF,
222+
},
223+
{
224+
Use: "start-db",
225+
Short: "Starts the DB cluster and syncs the changes.",
226+
RunE: RunStartDBCmdF,
227+
},
228+
{
229+
Use: "db-info",
230+
Short: "Display info about the DB cluster.",
231+
RunE: RunDBStatusCmdF,
232+
},
169233
}
170234

171235
deploymentCmd.AddCommand(deploymentCommands...)
@@ -185,12 +249,15 @@ func main() {
185249
}
186250
resetCmd.Flags().Bool("confirm", false, "Confirm you really want to reset the database and re-initialize it.")
187251

252+
ltStartCmd := &cobra.Command{
253+
Use: "start",
254+
Short: "Start the coordinator in the current load-test deployment",
255+
RunE: RunLoadTestStartCmdF,
256+
}
257+
ltStartCmd.Flags().Bool("sync", false, "Changes the command to not return until the test has finished, and then stops the DB after that")
258+
188259
loadtestComands := []*cobra.Command{
189-
{
190-
Use: "start",
191-
Short: "Start the coordinator in the current load-test deployment",
192-
RunE: RunLoadTestStartCmdF,
193-
},
260+
ltStartCmd,
194261
{
195262
Use: "stop",
196263
Short: "Stop the coordinator in the current load-test deployment",

deployment/terraform/create.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ func (t *Terraform) Create(initData bool) error {
174174
"--vpc-security-group-ids=" + sgID,
175175
"--region=" + t.config.AWSRegion,
176176
}
177-
if err := t.runAWSCommand(nil, args); err != nil {
177+
if err := t.runAWSCommand(nil, args, nil); err != nil {
178178
return err
179179
}
180180
}

deployment/terraform/db_operations.go

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// Copyright (c) 2019-present Mattermost, Inc. All Rights Reserved.
2+
// See LICENSE.txt for license information.
3+
4+
package terraform
5+
6+
import (
7+
"bytes"
8+
"context"
9+
"encoding/json"
10+
"errors"
11+
"fmt"
12+
"time"
13+
)
14+
15+
// StopDB stops the DB cluster and syncs the changes.
16+
func (t *Terraform) StopDB() error {
17+
if err := t.preFlightCheck(); err != nil {
18+
return err
19+
}
20+
21+
output, err := t.Output()
22+
if err != nil {
23+
return err
24+
}
25+
26+
args := []string{
27+
"--profile=" + t.config.AWSProfile,
28+
"rds",
29+
"stop-db-cluster",
30+
"--db-cluster-identifier=" + output.DBCluster.ClusterIdentifier,
31+
"--region=" + t.config.AWSRegion,
32+
}
33+
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
34+
defer cancel()
35+
if err := t.runAWSCommand(ctx, args, nil); err != nil {
36+
return err
37+
}
38+
39+
return t.Sync()
40+
}
41+
42+
// StartDB starts the DB cluster and syncs the changes.
43+
func (t *Terraform) StartDB() error {
44+
if err := t.preFlightCheck(); err != nil {
45+
return err
46+
}
47+
48+
output, err := t.Output()
49+
if err != nil {
50+
return err
51+
}
52+
53+
args := []string{
54+
"--profile=" + t.config.AWSProfile,
55+
"rds",
56+
"start-db-cluster",
57+
"--db-cluster-identifier=" + output.DBCluster.ClusterIdentifier,
58+
"--region=" + t.config.AWSRegion,
59+
}
60+
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
61+
defer cancel()
62+
if err := t.runAWSCommand(ctx, args, nil); err != nil {
63+
return err
64+
}
65+
66+
return t.Sync()
67+
}
68+
69+
// rdsClusterInfo holds the fields decoded for a single entry of the
// "DBClusters" JSON array.
type rdsClusterInfo struct {
	DatabaseName        string `json:"DatabaseName"`
	DBClusterIdentifier string `json:"DBClusterIdentifier"`
	Status              string `json:"Status"`
	Engine              string `json:"Engine"`
	EngineVersion       string `json:"EngineVersion"`
}

// rdsOutput is the decoding target for the JSON printed by the
// "aws rds describe-db-clusters" command run in DBStatus.
type rdsOutput struct {
	DBCluster []rdsClusterInfo `json:"DBClusters"`
}
78+
79+
// DBStatus returns the status of the DB cluster.
80+
func (t *Terraform) DBStatus() (string, error) {
81+
if err := t.preFlightCheck(); err != nil {
82+
return "", err
83+
}
84+
85+
output, err := t.Output()
86+
if err != nil {
87+
return "", err
88+
}
89+
90+
if output.DBCluster.ClusterIdentifier == "" {
91+
return "", errors.New("DB cluster identifier not found")
92+
}
93+
94+
var buf bytes.Buffer
95+
args := []string{
96+
"--profile=" + t.config.AWSProfile,
97+
"rds",
98+
"describe-db-clusters",
99+
"--db-cluster-identifier=" + output.DBCluster.ClusterIdentifier,
100+
"--region=" + t.config.AWSRegion,
101+
}
102+
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
103+
defer cancel()
104+
if err := t.runAWSCommand(ctx, args, &buf); err != nil {
105+
return "", err
106+
}
107+
108+
var out rdsOutput
109+
err = json.Unmarshal(buf.Bytes(), &out)
110+
if err != nil {
111+
return "", err
112+
}
113+
114+
if len(out.DBCluster) == 0 {
115+
return "", fmt.Errorf("No DB Clusters found for cluster identifier: %s", output.DBCluster.ClusterIdentifier)
116+
}
117+
118+
return out.DBCluster[0].Status, nil
119+
}

deployment/terraform/destroy.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ func (t *Terraform) Destroy() error {
3838
"s3://" + t.output.S3Bucket.Id,
3939
"--recursive",
4040
}
41-
if err := t.runAWSCommand(emptyBucketCtx, emptyS3BucketArgs); err != nil {
41+
if err := t.runAWSCommand(emptyBucketCtx, emptyS3BucketArgs, nil); err != nil {
4242
emptyBucketErrCh <- fmt.Errorf("failed to run local cmd \"aws %s\": %w", strings.Join(emptyS3BucketArgs, " "), err)
4343
return
4444
}
@@ -69,7 +69,7 @@ func (t *Terraform) Destroy() error {
6969
"--skip-final-snapshot",
7070
}
7171
// We have to ignore if the cluster was already deleted to make the command idempotent.
72-
if err := t.runAWSCommand(nil, args); err != nil && !strings.Contains(err.Error(), "DBClusterNotFoundFault") {
72+
if err := t.runAWSCommand(nil, args, nil); err != nil && !strings.Contains(err.Error(), "DBClusterNotFoundFault") {
7373
return err
7474
}
7575
}

deployment/terraform/engine.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ func (t *Terraform) runCommand(dst io.Writer, args ...string) error {
4343
return _runCommand(cmd, dst)
4444
}
4545

46-
func (t *Terraform) runAWSCommand(ctx context.Context, args []string) error {
46+
func (t *Terraform) runAWSCommand(ctx context.Context, args []string, dst io.Writer) error {
4747
awsBin := "aws"
4848
if _, err := exec.LookPath(awsBin); err != nil {
4949
return fmt.Errorf("aws not installed. Please install aws. (https://aws.amazon.com/cli): %w", err)
@@ -58,7 +58,7 @@ func (t *Terraform) runAWSCommand(ctx context.Context, args []string) error {
5858
mlog.Debug("Running aws command", mlog.String("args", fmt.Sprintf("%v", args)))
5959
cmd := exec.CommandContext(ctx, awsBin, args...)
6060

61-
return _runCommand(cmd, nil)
61+
return _runCommand(cmd, dst)
6262
}
6363

6464
func _runCommand(cmd *exec.Cmd, dst io.Writer) error {

deployment/terraform/info.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ func displayInfo(output *Output) {
5757
fmt.Println("Pyroscope URL: http://" + output.MetricsServer.PublicIP + ":4040")
5858
}
5959
if output.HasDB() {
60+
fmt.Println("DB Cluster Identifier: ", output.DBCluster.ClusterIdentifier)
6061
fmt.Println("DB writer endpoint: " + output.DBWriter())
6162
for _, rd := range output.DBReaders() {
6263
fmt.Println("DB reader endpoint: " + rd)

deployment/terraform/output.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ type output struct {
1818
} `json:"instances"`
1919
DBCluster struct {
2020
Value []struct {
21-
Endpoint string `json:"endpoint"`
21+
Endpoint string `json:"endpoint"`
22+
ClusterIdentifier string `json:"cluster_identifier"`
2223
} `json:"value"`
2324
} `json:"dbCluster"`
2425
Agents struct {
@@ -82,7 +83,8 @@ type Tags struct {
8283

8384
// DBCluster defines an RDS cluster instance resource.
8485
type DBCluster struct {
85-
Endpoints []string `json:"endpoint"`
86+
Endpoints []string `json:"endpoint"`
87+
ClusterIdentifier string `json:"cluster_identifier"`
8688
}
8789

8890
// IAMAccess is a set of credentials that allow API requests to be made as an IAM user.
@@ -132,6 +134,7 @@ func (t *Terraform) loadOutput() error {
132134
for _, ep := range o.DBCluster.Value {
133135
outputv2.DBCluster.Endpoints = append(outputv2.DBCluster.Endpoints, ep.Endpoint)
134136
}
137+
outputv2.DBCluster.ClusterIdentifier = o.DBCluster.Value[0].ClusterIdentifier
135138
}
136139
if len(o.MetricsServer.Value) > 0 {
137140
outputv2.MetricsServer = o.MetricsServer.Value[0]

0 commit comments

Comments
 (0)