Skip to content

Commit 8595e93

Browse files
red-0ne and claude authored
[Metrics] Improve relay metrics reliability with early initialization (#1780)
## Summary

Improve relay metrics reliability by initializing variables early and using deferred metrics recording to ensure consistent data collection.

### Primary Changes:

- Initialize `supplierOperatorAddress` and `serviceId` with default values before relay request unmarshalling
- Move metrics recording to deferred function to guarantee execution regardless of function exit path
- Ensure `RelaysTotal` and relay duration metrics are always captured, even on early error returns

### Secondary changes:

- Updated error reply handling to include default `ServiceId` for consistent metric labeling
- Improved variable scope and reuse for better code organization

## Issue:

Problem: Relay metrics could be inconsistently recorded when requests failed early in processing, leading to gaps in monitoring data and unreliable dashboards.

## Type of change

Select one or more from the following:

- [ ] Bug fix
- [ ] New feature, functionality or library
- [x] Code health or cleanup
- [ ] Documentation
- [ ] Other (specify)

## Sanity Checklist

- [ ] I have updated the GitHub Issue Metadata: `assignees`, `reviewers`, `labels`, `project`, `iteration` and `milestone`
- [ ] For docs: `make docusaurus_start`
- [ ] For small changes: `make go_develop_and_test` and `make test_e2e`
- [ ] For major changes: `devnet-test-e2e` label to run E2E tests in CI
- [ ] For migration changes: `make test_e2e_oneshot`
- [ ] 'TODO's, configurations and other docs

---------

Co-authored-by: Claude <[email protected]>
1 parent 91d56e4 commit 8595e93

File tree

3 files changed

+41
-16
lines changed

3 files changed

+41
-16
lines changed

pkg/relayer/proxy/error_reply.go

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,9 @@ func (sync *relayMinerHTTPServer) replyWithError(
3939
if relayRequest == nil {
4040
relayRequest = &types.RelayRequest{
4141
Meta: types.RelayRequestMetadata{
42-
SessionHeader: &sessiontypes.SessionHeader{},
42+
SessionHeader: &sessiontypes.SessionHeader{
43+
ServiceId: UnknownServiceID,
44+
},
4345
},
4446
}
4547
}

pkg/relayer/proxy/proxy.go

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,15 @@ import (
1717

1818
var _ relayer.RelayerProxy = (*relayerProxy)(nil)
1919

20+
// UnknownServiceID and UnknownSupplierOperatorAddress are the default values used
21+
// as a fallback when the actual service ID or supplier operator cannot be determined.
22+
// This occurs during error scenarios before relay request validation, ensuring
23+
// metrics labels (and the service ID in error responses) are always populated.
24+
const (
25+
UnknownServiceID = "unknown_service_id"
26+
UnknownSupplierOperatorAddress = "unknown_supplier_operator_address"
27+
)
28+
2029
// relayerProxy is the main relayer proxy that takes relay requests of supported
2130
// services from the client and proxies them to the supported backend services.
2231
// It is responsible for notifying the miner about the relays that have been

pkg/relayer/proxy/sync.go

Lines changed: 29 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,29 @@ func (server *relayMinerHTTPServer) serveSyncRequest(
5050
startBlock := server.blockClient.LastBlock(ctx)
5151
startHeight := startBlock.Height()
5252

53+
// Initialize with default values for metrics:
54+
// - We don't know the actual supplierOperatorAddress and serviceId until the relay request is unmarshalled.
55+
// - If we fail before unmarshalling, these defaults ensure:
56+
// - Metric labels are always populated (never empty)
57+
// - Downstream monitoring and dashboards remain consistent
58+
supplierOperatorAddress := UnknownSupplierOperatorAddress
59+
serviceId := UnknownServiceID
60+
61+
// Defer metrics to guarantee they are always recorded:
62+
// - Ensures RelaysTotal and relay duration are captured regardless of how/when the function returns
63+
// - Even on early error returns, metrics are updated with the best-known values
64+
// - Prevents accidental metric omission due to premature exit
65+
defer func(startTime time.Time, statusCode *int) {
66+
// Increment the relays counter.
67+
relayer.RelaysTotal.With(
68+
"service_id", serviceId,
69+
"supplier_operator_address", supplierOperatorAddress,
70+
).Add(1)
71+
72+
// Capture the relay request duration metric.
73+
relayer.CaptureRelayDuration(serviceId, startTime, *statusCode)
74+
}(requestStartTime, &statusCode)
75+
5376
logger.ProbabilisticDebugInfo(polylog.ProbabilisticDebugInfoProb).Msgf(
5477
"📊 Chain head at height %d (block hash: %X) at relay request start",
5578
startHeight,
@@ -72,7 +95,8 @@ func (server *relayMinerHTTPServer) serveSyncRequest(
7295
}
7396

7497
meta := relayRequest.Meta
75-
serviceId := meta.SessionHeader.ServiceId
98+
supplierOperatorAddress = meta.SupplierOperatorAddress
99+
serviceId = meta.SessionHeader.ServiceId
76100

77101
blockHeight := server.blockClient.LastBlock(ctx).Height()
78102

@@ -83,19 +107,19 @@ func (server *relayMinerHTTPServer) serveSyncRequest(
83107
"session_end_height", meta.SessionHeader.SessionEndBlockHeight,
84108
"service_id", serviceId,
85109
"application_address", meta.SessionHeader.ApplicationAddress,
86-
"supplier_operator_address", meta.SupplierOperatorAddress,
110+
"supplier_operator_address", supplierOperatorAddress,
87111
"request_start_time", requestStartTime.String(),
88112
)
89113

90114
// Check if the request's selected supplier is available for relaying.
91115
availableSuppliers := server.relayAuthenticator.GetSupplierOperatorAddresses()
92116

93-
if !slices.Contains(availableSuppliers, meta.SupplierOperatorAddress) {
117+
if !slices.Contains(availableSuppliers, supplierOperatorAddress) {
94118
logger.Warn().
95119
Msgf(
96120
"❌ The request's selected supplier with operator_address (%q) is not available for relaying! "+
97121
"This could be a network or configuration issue. Available suppliers: [%s] 🚦",
98-
meta.SupplierOperatorAddress,
122+
supplierOperatorAddress,
99123
strings.Join(availableSuppliers, ", "),
100124
)
101125
return relayRequest, ErrRelayerProxySupplierNotReachable
@@ -218,16 +242,6 @@ func (server *relayMinerHTTPServer) serveSyncRequest(
218242
"service_config_type", serviceConfigTypeLog,
219243
)
220244

221-
// Increment the relays counter.
222-
relayer.RelaysTotal.With(
223-
"service_id", serviceId,
224-
"supplier_operator_address", meta.SupplierOperatorAddress,
225-
).Add(1)
226-
defer func(startTime time.Time, statusCode *int) {
227-
// Capture the relay request duration metric.
228-
relayer.CaptureRelayDuration(serviceId, startTime, *statusCode)
229-
}(requestStartTime, &statusCode)
230-
231245
relayer.RelayRequestSizeBytes.With("service_id", serviceId).
232246
Observe(float64(relayRequest.Size()))
233247

@@ -360,7 +374,7 @@ func (server *relayMinerHTTPServer) serveSyncRequest(
360374
// Build the relay response using the original service's response.
361375
// Use relayRequest.Meta.SessionHeader on the relayResponse session header since it
362376
// was verified to be valid and has to be the same as the relayResponse session header.
363-
relayResponse, err := server.newRelayResponse(responseBz, meta.SessionHeader, meta.SupplierOperatorAddress)
377+
relayResponse, err := server.newRelayResponse(responseBz, meta.SessionHeader, supplierOperatorAddress)
364378
if err != nil {
365379
logger.Error().Err(err).Msg("❌ Failed building the relay response")
366380
// The client should not have knowledge about the RelayMiner's issues with

0 commit comments

Comments
 (0)