Skip to content

Commit 5635598

Browse files
authored
[Protocol][Observability] Track RelayMiner non-2xx HTTP status codes (#436)
## Summary Track non-2xx HTTP status codes from RelayMiner via metrics.
1 parent 49a4a73 commit 5635598

File tree

5 files changed

+102
-4
lines changed

5 files changed

+102
-4
lines changed

observation/protocol/shannon.pb.go

Lines changed: 12 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

proto/path/protocol/shannon.proto

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,12 @@ enum ShannonEndpointErrorType {
124124

125125
// The relay request sent to the endpoint via WebSocket failed to validate the relay response.
126126
SHANNON_ENDPOINT_ERROR_WEBSOCKET_RELAY_RESPONSE_VALIDATION_FAILED = 41;
127+
128+
// RelayMiner returned a 4XX HTTP status code
129+
SHANNON_ENDPOINT_ERROR_RELAY_MINER_HTTP_4XX = 42;
130+
131+
// RelayMiner returned a 5XX HTTP status code
132+
SHANNON_ENDPOINT_ERROR_RELAY_MINER_HTTP_5XX = 43;
127133
}
128134

129135
// ShannonSanctionType specifies the duration type for endpoint sanctions

protocol/shannon/context.go

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"fmt"
77
"maps"
88
"math/rand"
9+
"net/http"
910
"strconv"
1011
"sync"
1112
"time"
@@ -337,7 +338,9 @@ func (rc *requestContext) executeRelayRequestStrategy(payload protocol.Payload)
337338
// Session rollover periods
338339
// - Protocol relay with fallback protection during session rollover periods
339340
// - Sends requests in parallel to ensure reliability during network transitions
340-
case rc.fullNode.IsInSessionRollover():
341+
//
342+
// TODO_DELETE(@adshmh): No session rollover fallback for hey service.
343+
case rc.fullNode.IsInSessionRollover() && rc.serviceID != "hey":
341344
rc.logger.Debug().Msg("Executing protocol relay with fallback protection during session rollover periods")
342345
// TODO_TECHDEBT(@adshmh): Separate error handling for fallback and Shannon endpoints.
343346
return rc.sendRelayWithFallback(payload)
@@ -509,12 +512,23 @@ func (rc *requestContext) sendProtocolRelay(payload protocol.Payload) (protocol.
509512
return defaultResponse, fmt.Errorf("SHOULD NEVER HAPPEN: failed to marshal relay request: %w", err)
510513
}
511514

515+
// TODO_TECHDEBT(@adshmh): Add a new struct to track details about the HTTP call.
516+
// It should contain at least:
517+
// - endpoint payload
518+
// - HTTP status code
519+
// Use the new struct to pass data around for logging/metrics/etc.
520+
//
512521
// Send the HTTP request to the protocol endpoint.
513-
httpRelayResponseBz, _, err := rc.sendHTTPRequest(payload, selectedEndpoint.PublicURL(), relayRequestBz)
522+
httpRelayResponseBz, httpStatusCode, err := rc.sendHTTPRequest(payload, selectedEndpoint.PublicURL(), relayRequestBz)
514523
if err != nil {
515524
return defaultResponse, err
516525
}
517526

527+
// Non-2xx HTTP status code received from the endpoint: build and return an error
528+
if httpStatusCode != http.StatusOK {
529+
return defaultResponse, fmt.Errorf("%w %w: %d", errSendHTTPRelay, errEndpointNon2XXHTTPStatusCode, httpStatusCode)
530+
}
531+
518532
// Validate and process the response
519533
response, err := rc.validateAndProcessResponse(httpRelayResponseBz)
520534
if err != nil {
@@ -709,12 +723,23 @@ func (rc *requestContext) sendFallbackRelay(
709723
fallbackURL,
710724
[]byte(payload.Data),
711725
)
726+
712727
if err != nil {
713728
return protocol.Response{
714729
EndpointAddr: fallbackEndpoint.Addr(),
715730
}, err
716731
}
717732

733+
// TODO_CONSIDERATION(@adshmh): Are there any scenarios where a fallback endpoint should return a non-2xx HTTP status code?
734+
// Examples: a fallback endpoint for a RESTful service.
735+
//
736+
// Non-2xx HTTP status code: build and return an error.
737+
if httpStatusCode != http.StatusOK {
738+
return protocol.Response{
739+
EndpointAddr: fallbackEndpoint.Addr(),
740+
}, fmt.Errorf("%w %w: %d", errSendHTTPRelay, errEndpointNon2XXHTTPStatusCode, httpStatusCode)
741+
}
742+
718743
// Build and return the fallback response
719744
return protocol.Response{
720745
Bytes: httpResponseBz,

protocol/shannon/errors.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ var (
6767
// Helps track more fine-grained metrics on endpoint errors.
6868
errMalformedEndpointPayload = errors.New("endpoint returned malformed payload")
6969

70+
// The endpoint returned a non-2XX response.
71+
errEndpointNon2XXHTTPStatusCode = errors.New("endpoint returned non-2xx HTTP status code")
72+
7073
// ** WebSocket errors **
7174

7275
// Error creating a WebSocket connection.

protocol/shannon/sanctions.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package shannon
33
import (
44
"errors"
55
"regexp"
6+
"strconv"
67
"strings"
78

89
"github.com/pokt-network/poktroll/pkg/polylog"
@@ -149,6 +150,13 @@ func classifyHttpError(logger polylog.Logger, err error) (protocolobservations.S
149150
protocolobservations.ShannonSanctionType_SHANNON_SANCTION_SESSION
150151
}
151152

153+
// RelayMiner returned non-2xx HTTP status code.
154+
if errors.Is(err, errEndpointNon2XXHTTPStatusCode) {
155+
return getNon2XXHTTPStatusCodeObservation(err),
156+
// TODO_UPNEXT(@adshmh): Make this a sanction that lasts a few blocks.
157+
protocolobservations.ShannonSanctionType_SHANNON_SANCTION_DO_NOT_SANCTION
158+
}
159+
152160
errStr := err.Error()
153161

154162
// Connection establishment failures
@@ -294,3 +302,49 @@ func classifyMalformedEndpointPayload(logger polylog.Logger, payloadContent stri
294302
//
295303
return protocolobservations.ShannonEndpointErrorType_SHANNON_ENDPOINT_ERROR_RAW_PAYLOAD_UNKNOWN, protocolobservations.ShannonSanctionType_SHANNON_SANCTION_DO_NOT_SANCTION
296304
}
305+
306+
// getNon2XXHTTPStatusCodeObservation returns ShannonEndpointErrorType based on HTTP status code:
307+
// - 4xx: SHANNON_ENDPOINT_ERROR_RELAY_MINER_HTTP_4XX
308+
// - 5xx: SHANNON_ENDPOINT_ERROR_RELAY_MINER_HTTP_5XX
309+
// - other/parse error: SHANNON_ENDPOINT_ERROR_HTTP_UNKNOWN
310+
func getNon2XXHTTPStatusCodeObservation(non2XXHTTPStatusCodeErr error) protocolobservations.ShannonEndpointErrorType {
311+
statusCode, ok := extractHTTPStatusCode(non2XXHTTPStatusCodeErr)
312+
if !ok {
313+
return protocolobservations.ShannonEndpointErrorType_SHANNON_ENDPOINT_ERROR_HTTP_UNKNOWN
314+
}
315+
316+
switch {
317+
case statusCode >= 400 && statusCode < 500:
318+
return protocolobservations.ShannonEndpointErrorType_SHANNON_ENDPOINT_ERROR_RELAY_MINER_HTTP_4XX
319+
case statusCode >= 500 && statusCode < 600:
320+
return protocolobservations.ShannonEndpointErrorType_SHANNON_ENDPOINT_ERROR_RELAY_MINER_HTTP_5XX
321+
default:
322+
return protocolobservations.ShannonEndpointErrorType_SHANNON_ENDPOINT_ERROR_HTTP_UNKNOWN
323+
}
324+
}
325+
326+
// httpStatusCodeSuffixRegex matches ": " followed by exactly three digits at
// the very end of a string, capturing the digits.
// Compiled once at package scope so it is not recompiled on every call.
var httpStatusCodeSuffixRegex = regexp.MustCompile(`: (\d{3})$`)

// extractHTTPStatusCode extracts the HTTP status code from the error message.
// Expects the status code to be at the end of the error string after ": ".
//
// Returns:
//   - (statusCode, true) when the message ends in ": NNN" and NNN is within
//     the range [100, 599].
//   - (0, false) when err is nil, the message does not end in ": NNN", or
//     NNN falls outside [100, 599].
func extractHTTPStatusCode(err error) (int, bool) {
	// A nil error carries no message to inspect.
	if err == nil {
		return 0, false
	}

	// Look for ": " followed by 3 digits at the end of the string.
	matches := httpStatusCodeSuffixRegex.FindStringSubmatch(err.Error())
	if len(matches) < 2 {
		return 0, false
	}

	statusCode, parseErr := strconv.Atoi(matches[1])
	if parseErr != nil {
		return 0, false
	}

	// Basic validation that it's a valid HTTP status code.
	// Three digits can still be e.g. "099" or "600", which are rejected here.
	if statusCode < 100 || statusCode > 599 {
		return 0, false
	}

	return statusCode, true
}

0 commit comments

Comments
 (0)