diff --git a/Tiltfile b/Tiltfile
index 1a4a186a4..5a7d5c23f 100644
--- a/Tiltfile
+++ b/Tiltfile
@@ -254,6 +254,9 @@ for x in range(localnet_config["relayminers"]["count"]):
             # Run `curl localhost:PORT` to see the current snapshot of relayminer metrics.
             str(9069 + actor_number)
             + ":9090",  # Relayminer metrics port. relayminer1 - exposes 9070, relayminer2 exposes 9071, etc.
+            # Use with pprof like this: `go tool pprof -http=:3333 http://localhost:6070/debug/pprof/goroutine`
+            str(6069 + actor_number)
+            + ":6060",  # Relayminer pprof port. relayminer1 - exposes 6070, relayminer2 exposes 6071, etc.
         ],
     )
 
@@ -295,6 +298,9 @@ for x in range(localnet_config["appgateservers"]["count"]):
             # Run `curl localhost:PORT` to see the current snapshot of appgateserver metrics.
             str(9079 + actor_number)
             + ":9090",  # appgateserver metrics port. appgateserver1 - exposes 9080, appgateserver2 exposes 9081, etc.
+            # Use with pprof like this: `go tool pprof -http=:3333 http://localhost:6080/debug/pprof/goroutine`
+            str(6079 + actor_number)
+            + ":6060",  # appgateserver pprof port (container listens on 6060). appgateserver1 - exposes 6080, appgateserver2 exposes 6081, etc.
         ],
     )
 
@@ -336,13 +342,22 @@ for x in range(localnet_config["gateways"]["count"]):
             # Run `curl localhost:PORT` to see the current snapshot of gateway metrics.
             str(9089 + actor_number)
             + ":9090",  # gateway metrics port. gateway1 - exposes 9090, gateway2 exposes 9091, etc.
+            # Use with pprof like this: `go tool pprof -http=:3333 http://localhost:6090/debug/pprof/goroutine`
+            str(6089 + actor_number)
+            + ":6060",  # gateway pprof port. gateway1 - exposes 6090, gateway2 exposes 6091, etc.
], ) k8s_resource( "validator", labels=["pocket_network"], - port_forwards=["36657", "36658", "40004"], + port_forwards=[ + "36657", + "36658", + "40004", + # Use with pprof like this: `go tool pprof -http=:3333 http://localhost:6061/debug/pprof/goroutine` + "6061:6060", + ], links=[ link( "http://localhost:3003/d/cosmoscometbft/protocol-cometbft-dashboard?orgId=1&from=now-1h&to=now", diff --git a/docusaurus/docs/develop/developer_guide/performance_troubleshooting.md b/docusaurus/docs/develop/developer_guide/performance_troubleshooting.md new file mode 100644 index 000000000..f27da1145 --- /dev/null +++ b/docusaurus/docs/develop/developer_guide/performance_troubleshooting.md @@ -0,0 +1,155 @@ +--- +sidebar_position: 4 +title: Performance troubleshooting +--- + +# Performance troubleshooting + +- [What is pprof](#what-is-pprof) +- [`pprof` and Dependencies - Installation](#pprof-and-dependencies---installation) +- [How to Use `pprof`](#how-to-use-pprof) + - [Available `pprof` Endpoints](#available-pprof-endpoints) + - [Configure Software to Expose `pprof` Endpoints](#configure-software-to-expose-pprof-endpoints) + - [Full Nodes and Validator Configuration](#full-nodes-and-validator-configuration) + - [AppGate Server and RelayMiner](#appgate-server-and-relayminer) + - [Save the Profiling Data](#save-the-profiling-data) + - [Explore the Profiling Data](#explore-the-profiling-data) + - [Explore without saving data](#explore-without-saving-data) + - [Report Issues](#report-issues) + +If you believe you've encountered an issue related to memory, goroutine leaks, +or some sort of synchronization blocking scenario, `pprof` is a good tool to +help identify & investigate the problem. + +It is open-source and maintained by Google: [google/pprof](https://github.com/google/pprof) + +## What is pprof + +`pprof` is a tool for profiling and visualizing profiling data. 
In modern Go versions, +it is included with the compiler (`go tool pprof`), but it can also be installed as a +standalone binary from [github.com/google/pprof](https://github.com/google/pprof). + +```bash +go install github.com/google/pprof@latest +``` + +More information can be found in the [pprof README](https://github.com/google/pprof/blob/main/doc/README.md). + +## `pprof` and Dependencies - Installation + +1. [Required] `pprof` - Go compiler or standalone pprof binary: + + 1. pprof that comes with Golang is available via `go tool pprof` + 2. A standalone binary can be installed with: + + ```bash + go install github.com/google/pprof@latest + ``` + +2. [Optional] `graphviz` - Recommended for visualization. It can be skipped if you're not planning to use visualizations. + + - [Installation guide](https://graphviz.readthedocs.io/en/stable/#installation) + - On macOS, it can be installed with: + + ```bash + brew install graphviz + ``` + +## How to Use `pprof` + +`pprof` operates by connecting to an exposed endpoint in the software you want to profile. + +It can create snapshots for later examination, or can show information in a browser +for an already running process. + +We're going to use `go tool pprof` in the examples below, but if you installed a +standalone binary, just replace `go tool pprof` with `pprof`. + +### Available `pprof` Endpoints + +Before running `pprof`, you need to decide what kind of profiling you need to do. + +The `pprof` package provides several endpoints that are useful for profiling and +debugging. Here are the most commonly used ones: + +- `/debug/pprof/heap`: Snapshot of the memory allocation of the heap. +- `/debug/pprof/allocs`: Similar to `/debug/pprof/heap`, but includes all past memory allocations, not just the ones currently in the heap. +- `/debug/pprof/goroutine`: All current goroutines. +- `/debug/pprof/threadcreate`: Records stack traces that led to the creation of new OS threads. 
+- `/debug/pprof/block`: Displays stack traces that led to blocking on synchronization primitives. +- `/debug/pprof/profile`: Collects 30 seconds of CPU profiling data - configurable via the `seconds` parameter. +- `/debug/pprof/symbol`: Looks up the program counters provided in the request, returning function names. +- `/debug/pprof/trace`: Provides a trace of the program execution. + +### Configure Software to Expose `pprof` Endpoints + +:::warning Exposing pprof + +It is recommended to never expose `pprof` to the internet, as this feature allows +operational control of the software. A malicious actor could potentially disrupt +or DoS your services if these endpoints are exposed to the internet. + +::: + +#### Full Nodes and Validator Configuration + +In `config.toml`, you can configure `pprof_laddr` to expose a `pprof` endpoint +on a particular network interface and port. By default, `pprof` listens on `localhost:6060`. + +If the value has been modified, you must restart the process. + +#### AppGate Server and RelayMiner + +Both `AppGate Server` and `RelayMiner` can be configured to expose a `pprof` +endpoint using a configuration file like this: + +```yaml +pprof: + enabled: true + addr: localhost:6060 +``` + +If any of these values have been modified, you must restart the process. + +### Save the Profiling Data + +You can save profiling data to a file by running: + +```bash +curl -o <output_file> http://<host>:<port>/<pprof_endpoint> +``` + +For example, a command to save a heap profile looks like this: + +```bash +curl -o heap_profile.pprof http://localhost:6061/debug/pprof/heap +``` + +That file can be shared with other people. + +### Explore the Profiling Data + +Now, you can use the file to get insights into the profiling data, including visualizations. 
+A command like this will start an HTTP server and open a browser: + +```bash +go tool pprof -http=:PORT <profile_file> +``` + +For example, to open a `heap_profile.pprof` from the example above, you can run: + +```bash +go tool pprof -http=:3333 heap_profile.pprof +``` + +### Explore without saving data + +It is also possible to visualize `pprof` data without saving it to a file. For example: + +```bash +go tool pprof -http=:3333 http://localhost:6061/debug/pprof/goroutine +``` + +### Report Issues + +If you believe you've found a performance problem, please [open a GitHub Issue](https://github.com/pokt-network/poktroll/issues). Make sure to attach the profiling data. diff --git a/docusaurus/docs/operate/configs/appgate_server_config.md b/docusaurus/docs/operate/configs/appgate_server_config.md index a4e1f7f32..f98ff8233 100644 --- a/docusaurus/docs/operate/configs/appgate_server_config.md +++ b/docusaurus/docs/operate/configs/appgate_server_config.md @@ -22,6 +22,7 @@ It is responsible for multiple things: - [`signing_key`](#signing_key) - [`listening_endpoint`](#listening_endpoint) - [`metrics`](#metrics) + - [`pprof`](#pprof) ## Usage @@ -135,3 +136,20 @@ metrics: When `enabled` is set to `true`, the exporter is active. The addr `value` of `:9090` implies the exporter is bound to port 9090 on all available network interfaces. + +### `pprof` + +_`Optional`_ + +Configures a [pprof](https://github.com/google/pprof/blob/main/doc/README.md) +endpoint for troubleshooting and debugging performance issues. + +Example configuration: + +```yaml +pprof: + enabled: true + addr: localhost:6060 +``` + +You can learn how to use that endpoint on the [Performance Troubleshooting](../../develop/developer_guide/performance_troubleshooting.md) page. 
diff --git a/docusaurus/docs/operate/configs/relayminer_config.md b/docusaurus/docs/operate/configs/relayminer_config.md index 5128edb82..0adc1ec92 100644 --- a/docusaurus/docs/operate/configs/relayminer_config.md +++ b/docusaurus/docs/operate/configs/relayminer_config.md @@ -18,6 +18,7 @@ and which domains to accept queries from._ - [`signing_key_name`](#signing_key_name) - [`smt_store_path`](#smt_store_path) - [`metrics`](#metrics) + - [`pprof`](#pprof) - [Pocket node connectivity](#pocket-node-connectivity) - [`query_node_rpc_url`](#query_node_rpc_url) - [`query_node_grpc_url`](#query_node_grpc_url) @@ -144,6 +145,23 @@ When `enabled` is set to `true`, the exporter is active. The addr `value` of `:9090` implies the exporter is bound to port 9090 on all available network interfaces. +### `pprof` + +_`Optional`_ + +Configures a [pprof](https://github.com/google/pprof/blob/main/doc/README.md) +endpoint for troubleshooting and debugging performance issues. + +Example configuration: + +```yaml +pprof: + enabled: true + addr: localhost:6060 +``` + +You can learn how to use that endpoint on the [Performance Troubleshooting](../../develop/developer_guide/performance_troubleshooting.md) page. 
+ ## Pocket node connectivity ```yaml diff --git a/localnet/kubernetes/values-appgateserver.yaml b/localnet/kubernetes/values-appgateserver.yaml index d6e949c0d..dd6400ae1 100644 --- a/localnet/kubernetes/values-appgateserver.yaml +++ b/localnet/kubernetes/values-appgateserver.yaml @@ -4,3 +4,6 @@ config: metrics: enabled: true addr: :9090 + pprof: + enabled: true + addr: localhost:6060 diff --git a/localnet/kubernetes/values-gateway.yaml b/localnet/kubernetes/values-gateway.yaml index 1cae6cb5e..3560905dc 100644 --- a/localnet/kubernetes/values-gateway.yaml +++ b/localnet/kubernetes/values-gateway.yaml @@ -5,3 +5,6 @@ config: metrics: enabled: true addr: :9090 + pprof: + enabled: true + addr: localhost:6060 diff --git a/localnet/kubernetes/values-relayminer-common.yaml b/localnet/kubernetes/values-relayminer-common.yaml index f222f7a39..a8070be01 100644 --- a/localnet/kubernetes/values-relayminer-common.yaml +++ b/localnet/kubernetes/values-relayminer-common.yaml @@ -8,3 +8,6 @@ config: query_node_grpc_url: tcp://validator-poktroll-validator:36658 tx_node_rpc_url: tcp://validator-poktroll-validator:36657 suppliers: [] + pprof: + enabled: true + addr: localhost:6060 diff --git a/localnet/poktrolld/config/appgate_server_config.yaml b/localnet/poktrolld/config/appgate_server_config.yaml index bba48c7e5..146528551 100644 --- a/localnet/poktrolld/config/appgate_server_config.yaml +++ b/localnet/poktrolld/config/appgate_server_config.yaml @@ -6,3 +6,6 @@ listening_endpoint: http://localhost:42069 metrics: enabled: true addr: :9090 +pprof: + enabled: true + addr: localhost:6060 diff --git a/localnet/poktrolld/config/appgate_server_config_example.yaml b/localnet/poktrolld/config/appgate_server_config_example.yaml index 9ae9b5b72..c14a15b83 100644 --- a/localnet/poktrolld/config/appgate_server_config_example.yaml +++ b/localnet/poktrolld/config/appgate_server_config_example.yaml @@ -15,3 +15,6 @@ metrics: enabled: true # The address that the metrics exporter will listen 
on. Can be just a port, or host:port addr: :9090 +pprof: + enabled: true + addr: localhost:6060 diff --git a/localnet/poktrolld/config/appgate_server_config_localnet_vscode.yaml b/localnet/poktrolld/config/appgate_server_config_localnet_vscode.yaml index c79a07377..cc3aa6bb0 100644 --- a/localnet/poktrolld/config/appgate_server_config_localnet_vscode.yaml +++ b/localnet/poktrolld/config/appgate_server_config_localnet_vscode.yaml @@ -6,3 +6,6 @@ listening_endpoint: http://0.0.0.0:42069 metrics: enabled: true addr: :9090 +pprof: + enabled: true + addr: localhost:6060 diff --git a/localnet/poktrolld/config/relayminer_config.yaml b/localnet/poktrolld/config/relayminer_config.yaml index a44d59e6a..031999030 100644 --- a/localnet/poktrolld/config/relayminer_config.yaml +++ b/localnet/poktrolld/config/relayminer_config.yaml @@ -14,3 +14,6 @@ suppliers: backend_url: http://anvil:8547/ publicly_exposed_endpoints: - relayminers +pprof: + enabled: false + addr: localhost:6060 diff --git a/localnet/poktrolld/config/relayminer_config_full_example.yaml b/localnet/poktrolld/config/relayminer_config_full_example.yaml index 4dab186d1..0769acfc2 100644 --- a/localnet/poktrolld/config/relayminer_config_full_example.yaml +++ b/localnet/poktrolld/config/relayminer_config_full_example.yaml @@ -11,6 +11,12 @@ metrics: # The address (host:port or just port) for the metrics exporter to listen on. addr: :9090 +# Pprof endpoint configuration. More information: +# https://pkg.go.dev/github.com/google/pprof#section-readme +pprof: + enabled: false + addr: localhost:6060 + pocket_node: # Pocket node URL exposing the CometBFT JSON-RPC API. # Used by the Cosmos client SDK, event subscriptions, etc. 
diff --git a/pkg/appgateserver/cmd/cmd.go b/pkg/appgateserver/cmd/cmd.go index 3ea75d24e..e6501e86c 100644 --- a/pkg/appgateserver/cmd/cmd.go +++ b/pkg/appgateserver/cmd/cmd.go @@ -144,6 +144,13 @@ func runAppGateServer(cmd *cobra.Command, _ []string) error { } } + if appGateConfigs.Pprof.Enabled { + err = appGateServer.ServePprof(appGateConfigs.Pprof.Addr) + if err != nil { + return fmt.Errorf("failed to start pprof endpoint: %w", err) + } + } + // Start the AppGate server. if err := appGateServer.Start(ctx); err != nil && !errors.Is(err, http.ErrServerClosed) { return fmt.Errorf("failed to start app gate server: %w", err) diff --git a/pkg/appgateserver/config/appgate_configs_reader.go b/pkg/appgateserver/config/appgate_configs_reader.go index 5d59c9e52..d0b42cd67 100644 --- a/pkg/appgateserver/config/appgate_configs_reader.go +++ b/pkg/appgateserver/config/appgate_configs_reader.go @@ -15,6 +15,7 @@ type YAMLAppGateServerConfig struct { QueryNodeRPCUrl string `yaml:"query_node_rpc_url"` SelfSigning bool `yaml:"self_signing"` SigningKey string `yaml:"signing_key"` + Pprof YAMLAppGateServerPprofConfig `yaml:"pprof"` } // YAMLAppGateServerMetricsConfig is the structure used to unmarshal the metrics @@ -24,6 +25,13 @@ type YAMLAppGateServerMetricsConfig struct { Addr string `yaml:"addr"` } +// YAMLAppGateServerPprofConfig is the structure used to unmarshal the config +// for `pprof`. +type YAMLAppGateServerPprofConfig struct { + Enabled bool `yaml:"enabled,omitempty"` + Addr string `yaml:"addr,omitempty"` +} + // AppGateServerConfig is the structure describing the AppGateServer config type AppGateServerConfig struct { ListeningEndpoint *url.URL @@ -32,15 +40,23 @@ type AppGateServerConfig struct { QueryNodeRPCUrl *url.URL SelfSigning bool SigningKey string + Pprof *AppGateServerPprofConfig } // AppGateServerMetricsConfig is the structure resulting from parsing the metrics -// section of the AppGateServer config file +// section of the AppGateServer config file. 
type AppGateServerMetricsConfig struct { Enabled bool Addr string } +// AppGateServerPprofConfig is the structure resulting from parsing the pprof +// section of the AppGateServer config file. +type AppGateServerPprofConfig struct { + Enabled bool + Addr string +} + // ParseAppGateServerConfigs parses the stake config file into a AppGateConfig // NOTE: If SelfSigning is not defined in the config file, it will default to false func ParseAppGateServerConfigs(configContent []byte) (*AppGateServerConfig, error) { @@ -102,5 +118,10 @@ func ParseAppGateServerConfigs(configContent []byte) (*AppGateServerConfig, erro Addr: yamlAppGateServerConfig.Metrics.Addr, } + appGateServerConfig.Pprof = &AppGateServerPprofConfig{ + Enabled: yamlAppGateServerConfig.Pprof.Enabled, + Addr: yamlAppGateServerConfig.Pprof.Addr, + } + return appGateServerConfig, nil } diff --git a/pkg/appgateserver/server.go b/pkg/appgateserver/server.go index 761ff0b18..2b2e37637 100644 --- a/pkg/appgateserver/server.go +++ b/pkg/appgateserver/server.go @@ -7,6 +7,7 @@ import ( "io" "net" "net/http" + "net/http/pprof" "net/url" "strings" "sync" @@ -272,4 +273,22 @@ func (app *appGateServer) ServeMetrics(addr string) error { return nil } +// Starts a pprof server on the given address. 
+func (app *appGateServer) ServePprof(addr string) error {
+	pprofMux := http.NewServeMux()
+	pprofMux.HandleFunc("/debug/pprof/", pprof.Index)
+	pprofMux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
+	pprofMux.HandleFunc("/debug/pprof/profile", pprof.Profile)
+	pprofMux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
+	pprofMux.HandleFunc("/debug/pprof/trace", pprof.Trace)
+	// Bind here so a bad address is returned to the caller; serve in the background so startup is not blocked.
+	ln, err := net.Listen("tcp", addr)
+	if err != nil {
+		return err
+	}
+	app.logger.Info().Str("endpoint", addr).Msg("starting a pprof endpoint")
+	go func() { _ = http.Serve(ln, pprofMux) }() // best-effort debug endpoint; Serve's error is deliberately dropped
+	return nil
+}
+
 type appGateServerOption func(*appGateServer)
diff --git a/pkg/relayer/cmd/cmd.go b/pkg/relayer/cmd/cmd.go
index 3641c09ad..69de7828f 100644
--- a/pkg/relayer/cmd/cmd.go
+++ b/pkg/relayer/cmd/cmd.go
@@ -129,6 +129,13 @@ func runRelayer(cmd *cobra.Command, _ []string) error {
 		}
 	}
 
+	if relayMinerConfig.Pprof.Enabled {
+		err = relayMiner.ServePprof(relayMinerConfig.Pprof.Addr)
+		if err != nil {
+			return fmt.Errorf("failed to start pprof endpoint: %w", err)
+		}
+	}
+
 	// Start the relay miner
 	logger.Info().Msg("Starting relay miner...")
 	if err := relayMiner.Start(ctx); err != nil && !errors.Is(err, http.ErrServerClosed) {
diff --git a/pkg/relayer/config/relayminer_configs_reader.go b/pkg/relayer/config/relayminer_configs_reader.go
index e3ae3a52b..353b52d9c 100644
--- a/pkg/relayer/config/relayminer_configs_reader.go
+++ b/pkg/relayer/config/relayminer_configs_reader.go
@@ -1,6 +1,8 @@
 package config
 
-import yaml "gopkg.in/yaml.v2"
+import (
+	yaml "gopkg.in/yaml.v2"
+)
 
 // ParseRelayMinerConfigs parses the relay miner config file into a RelayMinerConfig
 func ParseRelayMinerConfigs(configContent []byte) (*RelayMinerConfig, error) {
@@ -39,6 +41,11 @@ func ParseRelayMinerConfigs(configContent []byte) (*RelayMinerConfig, error) {
 		Addr:    yamlRelayMinerConfig.Metrics.Addr,
 	}
 
+	relayMinerConfig.Pprof = &RelayMinerPprofConfig{
+		Enabled: yamlRelayMinerConfig.Pprof.Enabled,
+		Addr:    yamlRelayMinerConfig.Pprof.Addr,
+	}
+
 	// 
Hydrate the pocket node urls if err := relayMinerConfig.HydratePocketNodeUrls(&yamlRelayMinerConfig.PocketNode); err != nil { return nil, err diff --git a/pkg/relayer/config/types.go b/pkg/relayer/config/types.go index 7b273292e..6aa2e4167 100644 --- a/pkg/relayer/config/types.go +++ b/pkg/relayer/config/types.go @@ -23,6 +23,7 @@ type YAMLRelayMinerConfig struct { SmtStorePath string `yaml:"smt_store_path"` Metrics YAMLRelayMinerMetricsConfig `yaml:"metrics"` Suppliers []YAMLRelayMinerSupplierConfig `yaml:"suppliers"` + Pprof YAMLRelayMinerPprofConfig `yaml:"pprof"` } // YAMLRelayMinerPocketNodeConfig is the structure used to unmarshal the pocket @@ -66,6 +67,13 @@ type YAMLRelayMinerSupplierServiceAuthentication struct { Password string `yaml:"password,omitempty"` } +// YAMLRelayMinerPprofConfig is the structure used to unmarshal the config +// for `pprof`. +type YAMLRelayMinerPprofConfig struct { + Enabled bool `yaml:"enabled,omitempty"` + Addr string `yaml:"addr,omitempty"` +} + // RelayMinerConfig is the structure describing the RelayMiner config type RelayMinerConfig struct { PocketNode *RelayMinerPocketNodeConfig @@ -73,6 +81,7 @@ type RelayMinerConfig struct { Metrics *RelayMinerMetricsConfig SigningKeyName string SmtStorePath string + Pprof *RelayMinerPprofConfig } // RelayMinerPocketNodeConfig is the structure resulting from parsing the pocket @@ -148,8 +157,15 @@ type RelayMinerSupplierServiceConfig struct { // RelayMinerSupplierServiceAuthentication is the structure resulting from parsing // the supplier service basic auth of the RelayMiner config file when the -// supplier is of type "http" +// supplier is of type "http". type RelayMinerSupplierServiceAuthentication struct { Username string Password string } + +// RelayMinerPprofConfig is the structure resulting from parsing the pprof config +// section of a RelayMiner config. 
+type RelayMinerPprofConfig struct {
+	Enabled bool
+	Addr    string
+}
diff --git a/pkg/relayer/relayminer.go b/pkg/relayer/relayminer.go
index f4ef30ecf..f21f895eb 100644
--- a/pkg/relayer/relayminer.go
+++ b/pkg/relayer/relayminer.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"net"
 	"net/http"
+	"net/http/pprof"
 
 	"cosmossdk.io/depinject"
 	"github.com/prometheus/client_golang/prometheus/promhttp"
@@ -103,3 +104,21 @@ func (rel *relayMiner) ServeMetrics(addr string) error {
 
 	return nil
 }
+
+// ServePprof starts a pprof server on the given address and returns without blocking the caller.
+func (rel *relayMiner) ServePprof(addr string) error {
+	pprofMux := http.NewServeMux()
+	pprofMux.HandleFunc("/debug/pprof/", pprof.Index)
+	pprofMux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
+	pprofMux.HandleFunc("/debug/pprof/profile", pprof.Profile)
+	pprofMux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
+	pprofMux.HandleFunc("/debug/pprof/trace", pprof.Trace)
+	// Bind here so a bad address is returned to the caller; serve in the background so startup is not blocked.
+	ln, err := net.Listen("tcp", addr)
+	if err != nil {
+		return err
+	}
+	rel.logger.Info().Str("endpoint", addr).Msg("starting a pprof endpoint")
+	go func() { _ = http.Serve(ln, pprofMux) }() // best-effort debug endpoint; Serve's error is deliberately dropped
+	return nil
+}