Skip to content

Commit 4be53ef

Browse files
authored
[QoS][CometBFT] Add CometBFT QoS metrics and observation infrastructure (#349)
## Summary

Add CometBFT QoS metrics collection and observation infrastructure with request tracking capabilities.

### Primary Changes:

- Add CometBFTRequestObservations protobuf with chain_id, service_id, request_origin, and request_error fields for comprehensive request tracking
- Implement CometBFTObservationInterpreter to extract metrics-relevant data from observations for Prometheus reporting
- Create metrics/qos/cometbft/metrics.go with requestsTotal counter tracking requests by chain, service, method, success status, and error types
- Update CometBFT QoS context to generate complete observations with proper error handling and JSONRPC response mapping

### Secondary Changes:

- Separate logger contexts in cmd/qos.go - use hydratedLogger for function-level logging and qosLogger for QoS instances
- Refactor CometBFT response interface to return GetJSONRPCResponse() instead of separate payload/status methods
- Add serviceID field to ServiceState for observation generation consistency

## Issue

There are no CometBFT metrics, and log entries are missing fields required for error analysis.

- Issue or PR: #{ISSUE_OR_PR_NUMBER}

## Type of change

Select one or more from the following:

- [x] New feature, functionality or library
- [ ] Bug fix
- [ ] Code health or cleanup
- [ ] Documentation
- [ ] Other (specify)

## QoS Checklist

### E2E Validation & Tests

- [ ] `make path_up`
- [ ] `make test_e2e_evm_shannon`
- [ ] `make test_e2e_evm_morse`

### Observability

- [ ] 1. `make path_up`
- [ ] 2. Run one of the following:
  - For `Shannon` with `anvil`: `make test_request__shannon_relay_util_100`
  - For `Morse` with `F00C`: `make test_request__morse_relay_util_100`
- [ ] 3. Visit [PATH Relay Grafana Dashboard](http://localhost:3003/d/relays/path-service-requests) to view results

## Sanity Checklist

- [ ] I have updated the GitHub Issue `assignees`, `reviewers`, `labels`, `project`, `iteration` and `milestone`
- [ ] For docs, I have run `make docusaurus_start`
- [ ] For code, I have run `make test_all`
- [ ] For configurations, I have updated the documentation
- [ ] I added `TODO`s where applicable
1 parent 2cf808f commit 4be53ef

File tree

14 files changed

+402
-117
lines changed

14 files changed

+402
-117
lines changed

cmd/qos.go

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,41 +24,45 @@ func getServiceQoSInstances(
2424
// need to manually add entries for every new QoS implementation.
2525
qosServices := make(map[protocol.ServiceID]gateway.QoSService)
2626

27-
logger = logger.With("module", "qos").With("method", "getServiceQoSInstances").With("protocol", protocolInstance.Name())
27+
// Create a logger for this function's own messages with method-specific context
28+
hydratedLogger := logger.With("module", "qos").With("method", "getServiceQoSInstances").With("protocol", protocolInstance.Name())
29+
30+
// Create a separate logger for QoS instances without method-specific context
31+
qosLogger := logger.With("module", "qos").With("protocol", protocolInstance.Name())
2832

2933
// Wait for the protocol to become healthy BEFORE configuring and starting the hydrator.
3034
// - Ensures the protocol instance's configured service IDs are available before hydrator startup.
31-
err := waitForProtocolHealth(logger, protocolInstance, defaultProtocolHealthTimeout)
35+
err := waitForProtocolHealth(hydratedLogger, protocolInstance, defaultProtocolHealthTimeout)
3236
if err != nil {
3337
return nil, err
3438
}
3539

3640
// Get configured service IDs from the protocol instance.
3741
// - Used to run hydrator checks on all configured service IDs (except those manually disabled by the user).
3842
gatewayServiceIDs := protocolInstance.ConfiguredServiceIDs()
39-
logGatewayServiceIDs(logger, gatewayServiceIDs)
43+
logGatewayServiceIDs(hydratedLogger, gatewayServiceIDs)
4044

4145
// Remove any service IDs that are manually disabled by the user.
4246
for _, disabledQoSServiceIDForGateway := range gatewayConfig.HydratorConfig.QoSDisabledServiceIDs {
4347
// Throw error if any manually disabled service IDs are not found in the protocol's configured service IDs.
4448
if _, found := gatewayServiceIDs[disabledQoSServiceIDForGateway]; !found {
4549
return nil, fmt.Errorf("[INVALID CONFIGURATION] QoS manually disabled for service ID: %s BUT NOT not found in protocol's configured service IDs", disabledQoSServiceIDForGateway)
4650
}
47-
logger.Info().Msgf("Gateway manually disabled QoS for service ID: %s", disabledQoSServiceIDForGateway)
51+
hydratedLogger.Info().Msgf("Gateway manually disabled QoS for service ID: %s", disabledQoSServiceIDForGateway)
4852
delete(gatewayServiceIDs, disabledQoSServiceIDForGateway)
4953
}
5054

5155
// Get the service configs for the current protocol
5256
qosServiceConfigs := config.QoSServiceConfigs.GetServiceConfigs(gatewayConfig)
53-
logQoSServiceConfigs(logger, qosServiceConfigs)
57+
logQoSServiceConfigs(hydratedLogger, qosServiceConfigs)
5458

5559
// Initialize QoS services for all service IDs with a corresponding QoS
5660
// implementation, as defined in the `config/service_qos.go` file.
5761
for _, qosServiceConfig := range qosServiceConfigs {
5862
serviceID := qosServiceConfig.GetServiceID()
5963
// Skip service IDs that are not configured for the PATH instance.
6064
if _, found := gatewayServiceIDs[serviceID]; !found {
61-
logger.Warn().Msgf("Service ID %s has an available QoS configuration but is not configured for the gateway. Skipping...", serviceID)
65+
hydratedLogger.Warn().Msgf("Service ID %s has an available QoS configuration but is not configured for the gateway. Skipping...", serviceID)
6266
continue
6367
}
6468

@@ -69,18 +73,18 @@ func getServiceQoSInstances(
6973
return nil, fmt.Errorf("SHOULD NEVER HAPPEN: error building QoS instances: service ID %q is not an EVM service", serviceID)
7074
}
7175

72-
evmQoS := evm.NewQoSInstance(logger, evmServiceQoSConfig)
76+
evmQoS := evm.NewQoSInstance(qosLogger, evmServiceQoSConfig)
7377
qosServices[serviceID] = evmQoS
7478

75-
logger.With("service_id", serviceID).Debug().Msg("Added EVM QoS instance for the service ID.")
79+
hydratedLogger.With("service_id", serviceID).Debug().Msg("Added EVM QoS instance for the service ID.")
7680

7781
case cometbft.QoSType:
7882
cometBFTServiceQoSConfig, ok := qosServiceConfig.(cometbft.CometBFTServiceQoSConfig)
7983
if !ok {
8084
return nil, fmt.Errorf("SHOULD NEVER HAPPEN: error building QoS instances: service ID %q is not a CometBFT service", serviceID)
8185
}
8286

83-
cometBFTQoS := cometbft.NewQoSInstance(logger, cometBFTServiceQoSConfig)
87+
cometBFTQoS := cometbft.NewQoSInstance(qosLogger, cometBFTServiceQoSConfig)
8488
qosServices[serviceID] = cometBFTQoS
8589

8690
case solana.QoSType:
@@ -89,10 +93,10 @@ func getServiceQoSInstances(
8993
return nil, fmt.Errorf("SHOULD NEVER HAPPEN: error building QoS instances: service ID %q is not a Solana service", serviceID)
9094
}
9195

92-
solanaQoS := solana.NewQoSInstance(logger, solanaServiceQoSConfig)
96+
solanaQoS := solana.NewQoSInstance(qosLogger, solanaServiceQoSConfig)
9397
qosServices[serviceID] = solanaQoS
9498

95-
logger.With("service_id", serviceID).Debug().Msg("Added Solana QoS instance for the service ID.")
99+
hydratedLogger.With("service_id", serviceID).Debug().Msg("Added Solana QoS instance for the service ID.")
96100
default:
97101
return nil, fmt.Errorf("SHOULD NEVER HAPPEN: error building QoS instances: service ID %q not supported by PATH", serviceID)
98102
}

metrics/qos/cometbft/metrics.go

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
package cometbft
2+
3+
import (
4+
"fmt"
5+
6+
"github.com/pokt-network/poktroll/pkg/polylog"
7+
"github.com/prometheus/client_golang/prometheus"
8+
9+
"github.com/buildwithgrove/path/observation/qos"
10+
)
11+
12+
const (
13+
// pathProcess names the POSIX process that emits metrics; it is used as the Subsystem of the requestsTotal counter.
14+
pathProcess = "path"
15+
16+
// requestsTotalMetric is the Name of the requestsTotal counter, the metric tracked for CometBFT QoS.
17+
requestsTotalMetric = "cometbft_requests_total"
18+
)
19+
20+
func init() { // Runs at package initialization to make the CometBFT request counter available to Prometheus.
21+
prometheus.MustRegister(requestsTotal) // MustRegister panics if registration fails (e.g., a duplicate collector).
22+
}
23+
24+
var (
25+
// TODO_MVP(@adshmh):
26+
// - Add 'errorSubType' label for more granular error categorization
27+
// - Use 'errorType' for broad error categories (e.g., request validation, protocol error)
28+
// - Use 'errorSubType' for specifics (e.g., endpoint maxed out, timed out)
29+
// - Remove 'success' label (success = absence of errorType)
30+
// - Update EVM observations proto files and add interpreter support (NOTE(review): "EVM" in a CometBFT file; confirm whether CometBFT is meant)
31+
//
32+
// TODO_MVP(@adshmh):
33+
// - Track endpoint responses separately from requests if/when retries are implemented
34+
// (A single request may generate multiple responses due to retries)
35+
//
36+
// requestsTotal is a Prometheus counter vector counting the CometBFT requests processed, partitioned by the labels below.
37+
//
38+
// - Labels:
39+
// - chain_id: Target CometBFT chain identifier
40+
// - service_id: Service ID of the CometBFT QoS instance
41+
// - request_origin: Origin of the request: User or Hydrator
42+
// - request_method: CometBFT RPC method name (e.g., health, status)
43+
// - success: Whether a valid response was received
44+
// - error_type: Type of error if request failed (empty for success)
45+
// - http_status_code: HTTP status code returned to user
46+
//
47+
// - Use cases:
48+
// - Analyze request volume by chain and method
49+
// - Track success rates across PATH deployment regions (NOTE(review): no region label is defined here; presumably attached at scrape time, confirm)
50+
// - Identify method usage patterns per chain
51+
// - Measure end-to-end request success rates
52+
// - Review error types by method and chain
53+
// - Examine HTTP status code distribution
54+
requestsTotal = prometheus.NewCounterVec(
55+
prometheus.CounterOpts{
56+
Subsystem: pathProcess,
57+
Name: requestsTotalMetric,
58+
Help: "Total number of requests processed by CometBFT QoS instance(s)",
59+
},
60+
[]string{"chain_id", "service_id", "request_origin", "request_method", "success", "error_type", "http_status_code"},
61+
)
62+
)
63+
64+
// PublishMetrics records one CometBFT request in the requestsTotal Prometheus counter:
65+
// - Derives the label values from the supplied observations (mostly through a CometBFTObservationInterpreter) and increments the counter once
66+
// - If observations is nil, emits a log entry via logger.ProbabilisticDebugInfo and returns without incrementing anything
67+
func PublishMetrics(logger polylog.Logger, observations *qos.CometBFTRequestObservations) {
68+
logger = logger.With("method", "PublishMetricsCometBFT")
69+
70+
// Skip if observations is nil.
71+
// Not expected in practice: PublishQoSMetrics reportedly uses nil checks to identify which QoS service produced the observations (NOTE(review): the log text below says "RARELY", not "NEVER"; confirm which is intended).
72+
if observations == nil {
73+
logger.ProbabilisticDebugInfo(polylog.ProbabilisticDebugInfoProb).Msg("SHOULD RARELY HAPPEN: Unable to publish CometBFT metrics: received nil observations.")
74+
return
75+
}
76+
77+
// Wrap the observations in an interpreter that exposes the metrics-relevant values used as labels below
78+
interpreter := &qos.CometBFTObservationInterpreter{
79+
Logger: logger,
80+
Observations: observations,
81+
}
82+
83+
// Increment the request counter for this label combination; request_origin is read directly from the observations, all other labels come from the interpreter
84+
requestsTotal.With(
85+
prometheus.Labels{
86+
"chain_id": interpreter.GetChainID(),
87+
"service_id": interpreter.GetServiceID(),
88+
"request_origin": observations.GetRequestOrigin().String(),
89+
"request_method": interpreter.GetRequestMethod(),
90+
"success": fmt.Sprintf("%t", interpreter.IsRequestSuccessful()),
91+
"error_type": interpreter.GetRequestErrorType(),
92+
"http_status_code": fmt.Sprintf("%d", interpreter.GetRequestHTTPStatus()),
93+
},
94+
).Inc()
95+
}

observation/qos/cometbft.pb.go

Lines changed: 67 additions & 15 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)