Skip to content

Commit

Permalink
Add Mattermost debug data gathering (#1071)
Browse files Browse the repository at this point in the history
This is initial work to allow for obtaining pprof information from
a given Mattermost deployment. The heap and goroutine performance
profiles can now be obtained in a zip file for all pods in a
cluster installation. The zip is processed on the server and then
sent to the cloud client to be reviewed.
  • Loading branch information
gabrieljackson authored Sep 9, 2024
1 parent c613cb5 commit 4cd8481
Show file tree
Hide file tree
Showing 10 changed files with 386 additions and 2 deletions.
41 changes: 41 additions & 0 deletions cmd/cloud/cluster_installation.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ package main

import (
"fmt"
"os"
"strings"
"time"

"github.com/mattermost/mattermost-cloud/model"
"github.com/pkg/errors"
Expand All @@ -25,6 +27,7 @@ func newCmdClusterInstallation() *cobra.Command {
cmd.AddCommand(newCmdClusterInstallationStatus())
cmd.AddCommand(newCmdClusterInstallationMMCTL())
cmd.AddCommand(newCmdClusterInstallationMattermostCLI())
cmd.AddCommand(newCmdClusterInstallationPPROF())
cmd.AddCommand(newCmdClusterInstallationMigration())

return cmd
Expand Down Expand Up @@ -293,6 +296,44 @@ func newCmdClusterInstallationMattermostCLI() *cobra.Command {
return cmd
}

// newCmdClusterInstallationPPROF builds the "pprof" subcommand. It requests
// pprof debug data for the cluster installation given by the
// --cluster-installation flag and saves the returned zip archive to a
// timestamped file in the current working directory.
func newCmdClusterInstallationPPROF() *cobra.Command {
	var flags clusterInstallationPPROFFlags

	cmd := &cobra.Command{
		Use:   "pprof",
		Short: "Gather pprof data from a cluster installation",
		RunE: func(command *cobra.Command, args []string) error {
			command.SilenceUsage = true

			client := createClient(flags.clusterFlags)

			output, err := client.ExecClusterInstallationPPROF(flags.clusterInstallationID)
			if err != nil {
				return errors.Wrap(err, "failed to gather pprof data")
			}
			// err is nil at this point and errors.Wrap(nil, ...) returns nil,
			// so a fresh error is required to actually report the failure.
			// An empty payload is rejected as well, since it cannot be a
			// valid zip archive.
			if len(output) == 0 {
				return errors.New("no debug data returned")
			}

			// The file name combines the cluster installation ID with the
			// current local time so repeated runs do not overwrite each other.
			filename := fmt.Sprintf("%s.%s.prof.zip", flags.clusterInstallationID, time.Now().Format("2006-01-02.15-04-05.MST"))
			err = os.WriteFile(filename, output, 0644)
			if err != nil {
				return errors.Wrap(err, "failed to save debug zip")
			}

			fmt.Printf("Debug data saved to %s\n", filename)

			return nil
		},
		PreRun: func(cmd *cobra.Command, args []string) {
			flags.clusterFlags.addFlags(cmd)
		},
	}
	flags.addFlags(cmd)

	return cmd
}

func newCmdClusterInstallationMigration() *cobra.Command {
var flags clusterInstallationMigrationFlags

Expand Down
11 changes: 11 additions & 0 deletions cmd/cloud/cluster_installation_flag.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,17 @@ func (flags *clusterInstallationMattermostCLIFlags) addFlags(command *cobra.Comm
_ = command.MarkFlagRequired("command")
}

// clusterInstallationPPROFFlags holds the command line flags used by the
// cluster installation "pprof" command.
type clusterInstallationPPROFFlags struct {
// clusterFlags is embedded so the shared flags are available alongside the pprof-specific ones.
clusterFlags
// clusterInstallationID is bound to the required --cluster-installation flag.
clusterInstallationID string
}

// addFlags registers the pprof-specific flags on the given command and marks
// the cluster installation ID flag as required.
func (flags *clusterInstallationPPROFFlags) addFlags(command *cobra.Command) {
	const flagName = "cluster-installation"

	flagSet := command.Flags()
	flagSet.StringVar(&flags.clusterInstallationID, flagName, "", "The id of the cluster installation.")

	// The returned error is deliberately discarded, matching the other
	// addFlags helpers in this file.
	_ = command.MarkFlagRequired(flagName)
}

type clusterInstallationMigrationFlags struct {
clusterFlags
installation string
Expand Down
95 changes: 95 additions & 0 deletions internal/api/cluster_installation.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@
package api

import (
"archive/zip"
"fmt"
"net/http"
"os"
"path"

"github.com/gorilla/mux"
"github.com/mattermost/mattermost-cloud/internal/common"
Expand Down Expand Up @@ -35,6 +38,7 @@ func initClusterInstallation(apiRouter *mux.Router, context *Context) {
clusterInstallationRouter.Handle("/config", addContext(handleSetClusterInstallationConfig)).Methods("PUT")
clusterInstallationRouter.Handle("/exec/{command}", addContext(handleRunClusterInstallationExecCommand)).Methods("POST")
clusterInstallationRouter.Handle("/mattermost_cli", addContext(handleRunClusterInstallationMattermostCLI)).Methods("POST")
clusterInstallationRouter.Handle("/pprof", addContext(handleRunClusterInstallationGetPPROF)).Methods("GET")
clusterInstallationRouter.Handle("/status", addContext(handleGetClusterInstallationStatus)).Methods("GET")
}

Expand Down Expand Up @@ -315,6 +319,97 @@ func handleRunClusterInstallationExecCommand(c *Context, w http.ResponseWriter,
w.Write(output)
}

// handleRunClusterInstallationGetPPROF responds to GET /api/cluster_installation/{cluster_installation}/pprof,
// running pprof commands on all pods and returning the output as a debug zip file.
//
// Status codes written by this handler:
//   - 200 with the zip archive as the body on success
//   - 403 when the cluster installation is API-security-locked
//   - 404 when the cluster installation does not exist
//   - 409 when the provisioner reports an execution error
//   - 410 when the cluster installation is deleted
//   - 500 on store failures, a missing cluster, or failures building the zip
func handleRunClusterInstallationGetPPROF(c *Context, w http.ResponseWriter, r *http.Request) {
	vars := mux.Vars(r)
	clusterInstallationID := vars["cluster_installation"]
	c.Logger = c.Logger.WithField("cluster_installation", clusterInstallationID)

	clusterInstallation, err := c.Store.GetClusterInstallation(clusterInstallationID)
	if err != nil {
		c.Logger.WithError(err).Error("failed to query cluster installation")
		w.WriteHeader(http.StatusInternalServerError)
		return
	}
	if clusterInstallation == nil {
		c.Logger.Error("cluster installation not found")
		w.WriteHeader(http.StatusNotFound)
		return
	}
	if clusterInstallation.IsDeleted() {
		c.Logger.Error("cluster installation is deleted")
		w.WriteHeader(http.StatusGone)
		return
	}

	if clusterInstallation.APISecurityLock {
		logSecurityLockConflict("cluster-installation", c.Logger)
		w.WriteHeader(http.StatusForbidden)
		return
	}

	cluster, err := c.Store.GetCluster(clusterInstallation.ClusterID)
	if err != nil {
		c.Logger.WithError(err).Error("failed to query cluster")
		w.WriteHeader(http.StatusInternalServerError)
		return
	}
	if cluster == nil {
		c.Logger.Errorf("failed to find cluster %s associated with cluster installation", clusterInstallation.ClusterID)
		w.WriteHeader(http.StatusInternalServerError)
		return
	}

	c.Logger = c.Logger.WithField("cluster_id", cluster.ID)
	debugData, execErr, err := c.Provisioner.ExecClusterInstallationPPROF(cluster, clusterInstallation)
	if err != nil {
		c.Logger.WithError(err).Error("failed to prepare command execution")
		w.WriteHeader(http.StatusInternalServerError)
		return
	}
	if execErr != nil {
		c.Logger.WithError(execErr).Error("failed to execute command")
		w.WriteHeader(http.StatusConflict)
		return
	}

	// Create a temporary zipfile which will be cleaned up after being sent.
	tempDir, err := os.MkdirTemp("", "pprof-")
	if err != nil {
		c.Logger.WithError(err).Error("failed to create temporary pprof directory")
		w.WriteHeader(http.StatusInternalServerError)
		return
	}
	defer os.RemoveAll(tempDir)

	tempZipPath := path.Join(tempDir, fmt.Sprintf("%s.tempprof.zip", clusterInstallationID))
	tempZipFile, err := os.Create(tempZipPath)
	if err != nil {
		c.Logger.WithError(err).Error("failed to create temporary pprof zip file")
		w.WriteHeader(http.StatusInternalServerError)
		return
	}

	zipFileWriter := zip.NewWriter(tempZipFile)
	err = populateZipfile(zipFileWriter, debugData.ToFileData())
	// Close the file handle before inspecting the populate error so the
	// descriptor is released on every path; removing the directory does not
	// close open handles.
	closeErr := tempZipFile.Close()
	if err != nil {
		c.Logger.WithError(err).Error("failed to populate temporary pprof zip file")
		w.WriteHeader(http.StatusInternalServerError)
		return
	}
	if closeErr != nil {
		c.Logger.WithError(closeErr).Error("failed to close temporary pprof zip file")
		w.WriteHeader(http.StatusInternalServerError)
		return
	}

	debugBytes, err := os.ReadFile(tempZipPath)
	if err != nil {
		c.Logger.WithError(err).Error("failed to read temporary pprof zip file")
		w.WriteHeader(http.StatusInternalServerError)
		return
	}

	w.WriteHeader(http.StatusOK)
	// The status line is already sent, so a failed write can only be logged.
	if _, err = w.Write(debugBytes); err != nil {
		c.Logger.WithError(err).Error("failed to write pprof zip file to response")
	}
}

// handleRunClusterInstallationMattermostCLI responds to POST /api/cluster_installation/{cluster_installation}/mattermost_cli, running a Mattermost CLI command and returning any output.
// TODO: deprecate or refactor into /exec/command endpoint
func handleRunClusterInstallationMattermostCLI(c *Context, w http.ResponseWriter, r *http.Request) {
Expand Down
82 changes: 81 additions & 1 deletion internal/api/cluster_installation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,86 @@ func TestRunClusterInstallationMattermostCLI(t *testing.T) {
})
}

// TestRunClusterInstallationGetPPROF exercises the pprof endpoint through the
// API client against a mock provisioner that returns canned debug data for
// three pods. The subtests share one cluster installation and run in order;
// the final subtest deletes it.
func TestRunClusterInstallationGetPPROF(t *testing.T) {
	logger := testlib.MakeLogger(t)
	sqlStore := store.MakeTestSQLStore(t, logger)

	// Canned per-pod profiles handed back by the mock provisioner.
	provisioner := &mockProvisioner{
		DebugData: model.ClusterInstallationDebugData{
			{
				Name:          "pod1",
				HeapProf:      []byte(model.NewID()),
				GoroutineProf: []byte(model.NewID()),
			},
			{
				Name:          "pod2",
				HeapProf:      []byte(model.NewID()),
				GoroutineProf: []byte(model.NewID()),
			},
			{
				Name:          "pod3",
				HeapProf:      []byte(model.NewID()),
				GoroutineProf: []byte(model.NewID()),
			},
		},
	}

	apiContext := &api.Context{
		Store:       sqlStore,
		Supervisor:  &mockSupervisor{},
		Provisioner: provisioner,
		Metrics:     &mockMetrics{},
		Logger:      logger,
	}
	router := mux.NewRouter()
	api.Register(router, apiContext)

	server := httptest.NewServer(router)
	defer server.Close()

	client := model.NewClient(server.URL)

	cluster := &model.Cluster{}
	require.NoError(t, sqlStore.CreateCluster(cluster, nil))

	clusterInstallation := &model.ClusterInstallation{
		ClusterID:      cluster.ID,
		InstallationID: model.NewID(),
	}
	require.NoError(t, sqlStore.CreateClusterInstallation(clusterInstallation))

	t.Run("success", func(t *testing.T) {
		debugZip, execErr := client.ExecClusterInstallationPPROF(clusterInstallation.ID)
		require.NoError(t, execErr)
		require.NotEmpty(t, debugZip)
	})

	t.Run("unknown cluster installation", func(t *testing.T) {
		debugZip, execErr := client.ExecClusterInstallationPPROF(model.NewID())
		require.EqualError(t, execErr, "failed with status code 404")
		require.Empty(t, debugZip)
	})

	t.Run("while api-security-locked", func(t *testing.T) {
		lockErr := sqlStore.LockClusterInstallationAPI(clusterInstallation.ID)
		require.NoError(t, lockErr)

		debugZip, execErr := client.ExecClusterInstallationPPROF(clusterInstallation.ID)
		require.EqualError(t, execErr, "failed with status code 403")
		require.Empty(t, debugZip)

		// Unlock again so the following subtest sees an unlocked installation.
		unlockErr := sqlStore.UnlockClusterInstallationAPI(clusterInstallation.ID)
		require.NoError(t, unlockErr)
	})

	t.Run("cluster installation deleted", func(t *testing.T) {
		deleteErr := sqlStore.DeleteClusterInstallation(clusterInstallation.ID)
		require.NoError(t, deleteErr)

		debugZip, execErr := client.ExecClusterInstallationPPROF(clusterInstallation.ID)
		require.Error(t, execErr)
		require.Empty(t, debugZip)
	})
}

func TestMigrateClusterInstallations(t *testing.T) {
logger := testlib.MakeLogger(t)
sqlStore := store.MakeTestSQLStore(t, logger)
Expand Down Expand Up @@ -1189,8 +1269,8 @@ func TestMigrateDNSForNonHibernatingInstallation(t *testing.T) {
_, err = client.MigrateDNS(&model.MigrateClusterInstallationRequest{InstallationID: "", SourceClusterID: sourceCluster.ID, TargetClusterID: targetCluster.ID, DNSSwitch: true, LockInstallation: true})
require.EqualError(t, err, "failed with status code 404")
})

}

func TestDeleteInActiveClusterInstallationsByCluster(t *testing.T) {
logger := testlib.MakeLogger(t)
sqlStore := store.MakeTestSQLStore(t, logger)
Expand Down
5 changes: 5 additions & 0 deletions internal/api/common_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ func (m *mockMetrics) ObserveAPIEndpointDuration(handler, method string, statusC

// mockProvisioner is a test double for the API provisioner whose exec-style
// methods return the canned values stored in its fields.
type mockProvisioner struct {
// Output is the raw command output returned by methods such as ExecMattermostCLI.
Output []byte
// DebugData is the value returned by ExecClusterInstallationPPROF.
DebugData model.ClusterInstallationDebugData
// ExecError is returned as the second result of ExecClusterInstallationPPROF.
ExecError error
// CommandError is returned as the final error result of the exec-style methods.
CommandError error
}
Expand Down Expand Up @@ -67,6 +68,10 @@ func (s *mockProvisioner) ExecMattermostCLI(*model.Cluster, *model.ClusterInstal
return s.Output, s.CommandError
}

// ExecClusterInstallationPPROF ignores its arguments and returns the mock's
// configured DebugData, ExecError and CommandError, in that order.
func (s *mockProvisioner) ExecClusterInstallationPPROF(_ *model.Cluster, _ *model.ClusterInstallation) (model.ClusterInstallationDebugData, error, error) {
	debugData, execErr, commandErr := s.DebugData, s.ExecError, s.CommandError
	return debugData, execErr, commandErr
}

func (s *mockProvisioner) GetClusterResources(*model.Cluster, bool, log.FieldLogger) (*k8s.ClusterResources, error) {
return nil, nil
}
1 change: 1 addition & 0 deletions internal/api/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ type Provisioner interface {
ExecClusterInstallationCLI(cluster *model.Cluster, clusterInstallation *model.ClusterInstallation, args ...string) ([]byte, error, error)
ExecMMCTL(cluster *model.Cluster, clusterInstallation *model.ClusterInstallation, args ...string) ([]byte, error)
ExecMattermostCLI(cluster *model.Cluster, clusterInstallation *model.ClusterInstallation, args ...string) ([]byte, error)
ExecClusterInstallationPPROF(cluster *model.Cluster, clusterInstallation *model.ClusterInstallation) (model.ClusterInstallationDebugData, error, error)
GetClusterInstallationStatus(cluster *model.Cluster, clusterInstallation *model.ClusterInstallation) (*model.ClusterInstallationStatus, error)
}

Expand Down
23 changes: 23 additions & 0 deletions internal/api/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
package api

import (
"archive/zip"
"net/url"
"strconv"
"time"

"github.com/mattermost/mattermost-cloud/model"

Expand Down Expand Up @@ -105,3 +107,24 @@ func parseDeletionLocked(u *url.URL) (*bool, error) {

return &locked, nil
}

// populateZipfile writes each entry of fileDatas into w as a deflate-compressed
// file stamped with the current time, then closes w.
//
// w is always closed before returning, even when an entry fails to be written.
// Closing the zip writer finalises the archive, so a Close failure is returned
// to the caller unless an earlier error already occurred; otherwise a
// truncated archive would be reported as a success.
func populateZipfile(w *zip.Writer, fileDatas []model.FileData) (retErr error) {
	defer func() {
		if closeErr := w.Close(); closeErr != nil && retErr == nil {
			retErr = closeErr
		}
	}()

	for _, fd := range fileDatas {
		f, err := w.CreateHeader(&zip.FileHeader{
			Name:     fd.Filename,
			Method:   zip.Deflate,
			Modified: time.Now(),
		})
		if err != nil {
			return err
		}

		if _, err = f.Write(fd.Body); err != nil {
			return err
		}
	}

	return nil
}
Loading

0 comments on commit 4cd8481

Please sign in to comment.