Skip to content

Commit a041247

Browse files
committed
feat: enhanced retry system with endpoint rotation and latency budget
Implemented comprehensive retry system improvements: - Retry endpoint rotation: never reuse same endpoint on retry, select new endpoint following QoS/reputation rules - Latency budget: configurable max_retry_latency (default 500ms) to prevent retrying slow requests - Concurrency configuration: made max_parallel_endpoints, max_concurrent_relays, and max_batch_payloads configurable via YAML - Endpoint exhaustion handling: exponential backoff (100ms, 200ms, 400ms) when all endpoints tried Bug fixes: - Fixed empty response handling in retry loops - Fixed attempt number in retry metrics (removed incorrect +1) - Fixed break/return in select statements - Fixed empty endpoint domain in metrics recording - Fixed Redis test to use testcontainer address - Added missing error storage before loop breaks - Enhanced context cancellation logging Metrics improvements: - Added endpoint_domain label to retry_latency metric - Added retry_reason label to retry_budget_skipped metric - New endpoint rotation metrics: shannon_retry_endpoint_switches_total, shannon_retry_endpoint_exhaustion_total - All metrics follow lowercase_underscore naming convention Configuration: - Updated config schema with max_retry_latency and concurrency_config - Added comprehensive examples in config.shannon_example.yaml - Per-service override support for all retry and concurrency settings Testing: - Fixed Redis test conflicts by using testcontainer addresses - Updated test mocks with GetConcurrencyConfig() - E2E tests passing at 99.33% success rate (298/300 requests)
1 parent 0d51476 commit a041247

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+2391
-353
lines changed

cmd/extractor_factory.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ func buildExtractorRegistry(unifiedConfig *gateway.UnifiedServicesConfig) *qosty
3939
registry.Register(serviceID, cosmosExtractor)
4040
case gateway.ServiceTypeSolana:
4141
registry.Register(serviceID, solanaExtractor)
42-
// Default: falls back to NoOpDataExtractor via registry.Get()
42+
// Default: falls back to NoOpDataExtractor via registry.Get()
4343
}
4444
}
4545

cmd/qos.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,3 @@ func logGatewayServiceIDs(logger polylog.Logger, serviceConfigs map[protocol.Ser
110110
}
111111
logger.Info().Msgf("Service IDs configured by the gateway: %s.", strings.Join(serviceIDs, ", "))
112112
}
113-

config/config.schema.yaml

Lines changed: 63 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,11 @@ properties:
312312
description: "Retry on connection errors."
313313
type: boolean
314314
default: true
315+
max_retry_latency:
316+
description: "Maximum latency budget for retries. Only retry if failed request took less than this duration. Prevents retrying requests that already consumed significant time."
317+
type: string
318+
pattern: "^[0-9]+m?s$"
319+
default: "500ms"
315320

316321
# Observation Pipeline Configuration
317322
observation_pipeline:
@@ -497,45 +502,18 @@ properties:
497502
minimum: 0.0
498503
maximum: 1.0
499504

500-
# Service Defaults - Settings inherited by all services
501-
defaults:
502-
description: "Default settings inherited by all services. Per-service overrides only need to specify differences."
503-
type: object
504-
additionalProperties: false
505-
properties:
506-
type:
507-
description: "Default QoS type. Options: evm, solana, cosmos, generic, passthrough"
508-
type: string
509-
enum: ["evm", "solana", "cosmos", "generic", "passthrough"]
510-
default: "passthrough"
511-
rpc_types:
512-
description: "Default supported RPC types."
513-
type: array
514-
items:
515-
type: string
516-
enum: ["json_rpc", "rest", "websocket", "comet_bft", "grpc"]
517-
latency_profile:
518-
description: "Default latency profile name (references latency_profiles or built-in)."
519-
type: string
520-
default: "standard"
521-
reputation_config:
522-
$ref: "#/definitions/service_reputation_config"
523-
latency:
524-
$ref: "#/definitions/service_latency_config"
525-
tiered_selection:
526-
$ref: "#/definitions/service_tiered_selection_config"
527-
probation:
528-
$ref: "#/definitions/service_probation_config"
529-
retry_config:
530-
$ref: "#/definitions/service_retry_config"
531-
observation_pipeline:
532-
$ref: "#/definitions/service_observation_config"
533-
active_health_checks:
534-
$ref: "#/definitions/service_health_check_override"
505+
# Note: Service defaults are inherited from gateway_config top-level settings:
506+
# - reputation_config.tiered_selection -> default tiered selection
507+
# - reputation_config.tiered_selection.probation -> default probation
508+
# - retry_config -> default retry settings
509+
# - observation_pipeline -> default observation settings
510+
# - active_health_checks -> default health check settings
511+
#
512+
# Services only need to specify overrides in the services[] array.
535513

536514
# Services - Array of per-service configurations
537515
services:
538-
description: "List of configured services. Each service inherits from defaults unless explicitly overridden."
516+
description: "List of configured services. Each service inherits from gateway_config settings unless explicitly overridden."
539517
type: array
540518
uniqueItems: true
541519
items:
@@ -572,6 +550,8 @@ properties:
572550
$ref: "#/definitions/service_retry_config"
573551
observation_pipeline:
574552
$ref: "#/definitions/service_observation_config"
553+
concurrency_config:
554+
$ref: "#/definitions/service_concurrency_config"
575555
fallback:
576556
description: "Fallback endpoint configuration (no defaults - must be explicitly set per-service)."
577557
type: object
@@ -635,6 +615,31 @@ properties:
635615
description: "Buffer size for websocket messages."
636616
type: integer
637617

618+
# Concurrency Configuration (optional)
619+
concurrency_config:
620+
description: "Optional configuration for controlling concurrency limits in request processing. These limits protect against resource exhaustion from batch requests and parallel relays."
621+
type: object
622+
additionalProperties: false
623+
properties:
624+
max_parallel_endpoints:
625+
description: "Maximum number of endpoints to query in parallel per request. Higher values reduce latency but increase load. Range: 1-10."
626+
type: integer
627+
minimum: 1
628+
maximum: 10
629+
default: 1
630+
max_concurrent_relays:
631+
description: "Global limit on concurrent relay goroutines across all requests. Prevents resource exhaustion from too many simultaneous relays. Range: 100-10000."
632+
type: integer
633+
minimum: 100
634+
maximum: 10000
635+
default: 5500
636+
max_batch_payloads:
637+
description: "Maximum number of payloads allowed in a batch request. Must be less than or equal to max_concurrent_relays. Range: 1-10000."
638+
type: integer
639+
minimum: 1
640+
maximum: 10000
641+
default: 5500
642+
638643
# Hydrator Configuration (optional)
639644
hydrator_config:
640645
description: "Configuration for the hydrator, which is used to run QoS checks against endpoints of a service."
@@ -832,6 +837,10 @@ definitions:
832837
retry_on_connection:
833838
description: "Retry on connection errors."
834839
type: boolean
840+
max_retry_latency:
841+
description: "Maximum latency budget for retries."
842+
type: string
843+
pattern: "^[0-9]+m?s$"
835844

836845
# Per-service observation pipeline configuration
837846
# Note: worker_count and queue_size are GLOBAL only (gateway_config.observation_pipeline)
@@ -849,6 +858,24 @@ definitions:
849858
minimum: 0.0
850859
maximum: 1.0
851860

861+
# Per-service concurrency configuration
862+
# Note: max_concurrent_relays is GLOBAL only (cannot be overridden per-service)
863+
service_concurrency_config:
864+
description: "Per-service concurrency configuration. Allows fine-tuning parallel execution and batch limits per service."
865+
type: object
866+
additionalProperties: false
867+
properties:
868+
max_parallel_endpoints:
869+
description: "Maximum endpoints to query in parallel for this service. Use >1 for unreliable services to reduce latency. Range: 1-10."
870+
type: integer
871+
minimum: 1
872+
maximum: 10
873+
max_batch_payloads:
874+
description: "Maximum payloads in a batch request for this service. Lower for heavy services, higher for light services. Range: 1-10000."
875+
type: integer
876+
minimum: 1
877+
maximum: 10000
878+
852879
# Per-service health check configuration
853880
service_health_check_override:
854881
description: "Per-service health check configuration."

config/config_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ func Test_LoadGatewayConfigFromYAML(t *testing.T) {
3939
name: "should load valid config from example file",
4040
filePath: "./examples/config.shannon_example.yaml",
4141
skipCompare: true, // Example config is a reference doc, not a test fixture
42-
want: GatewayConfig{
42+
want: GatewayConfig{
4343
FullNodeConfig: shannonprotocol.FullNodeConfig{
4444
RpcURL: "https://shannon-grove-rpc.mainnet.poktroll.com",
4545
SessionRolloverBlocks: 10,

config/examples/config.shannon_example.yaml

Lines changed: 54 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
# 2. Full node connection settings
1010
# 3. Gateway settings with unified service configuration
1111
#
12-
# Services inherit from `defaults` and can override any setting.
12+
# Services inherit from gateway_config settings and can override any setting.
1313
# Only specify what differs from defaults to keep configs clean.
1414

1515
# =============================================================================
@@ -47,6 +47,13 @@ data_reporter_config:
4747
logger_config:
4848
level: "info"
4949

50+
# Concurrency Configuration (optional)
51+
# Controls parallel request processing and batch limits
52+
concurrency_config:
53+
max_parallel_endpoints: 1 # How many endpoints to query in parallel per request (1-10)
54+
max_concurrent_relays: 5500 # Global limit on concurrent relay goroutines (100-10000)
55+
max_batch_payloads: 5500 # Max payloads in batch request (1-10000, ≤ max_concurrent_relays)
56+
5057
# =============================================================================
5158
# FULL NODE CONFIGURATION
5259
# =============================================================================
@@ -96,7 +103,8 @@ gateway_config:
96103
# ===========================================================================
97104
# GLOBAL REPUTATION CONFIGURATION
98105
# ===========================================================================
99-
# These settings apply globally and CANNOT be overridden per-service
106+
# These settings apply globally and serve as defaults for all services.
107+
# Per-service overrides can be specified in the services[] array.
100108

101109
reputation_config:
102110
# Enable/disable the entire reputation system
@@ -109,7 +117,7 @@ gateway_config:
109117
# NOTE: This is GLOBAL ONLY - cannot be overridden per-service
110118
storage_type: "memory"
111119

112-
# Global initial score (can be overridden per-service in `defaults` or `services`)
120+
# Global initial score (can be overridden per-service)
113121
initial_score: 80
114122

115123
# Global minimum threshold (can be overridden per-service)
@@ -118,14 +126,30 @@ gateway_config:
118126
# Time before inactive low-scoring endpoints can recover
119127
recovery_timeout: 5m
120128

129+
# Tiered endpoint selection
130+
# Cascade: tier1 first, then tier2, then tier3
131+
tiered_selection:
132+
enabled: true
133+
tier1_threshold: 70 # Premium tier (highest priority)
134+
tier2_threshold: 50 # Good tier
135+
136+
# Probation system for recovering endpoints
137+
# Low-scoring endpoints get limited traffic to prove reliability
138+
probation:
139+
enabled: true
140+
threshold: 10 # Score below which endpoint enters probation
141+
traffic_percent: 10 # % of traffic routed to probation endpoints
142+
recovery_multiplier: 2.0 # Boost for successful probation requests
143+
121144
# ===========================================================================
122145
# GLOBAL RETRY CONFIGURATION (optional)
123146
# ===========================================================================
124-
# Global retry settings - use `defaults` or `services` for per-service control
147+
# Global retry settings - per-service overrides in services[] array
125148

126149
retry_config:
127150
enabled: true
128151
max_retries: 1
152+
max_retry_latency: 500ms # Only retry if failed request took < 500ms
129153
retry_on_5xx: true
130154
retry_on_timeout: true
131155
retry_on_connection: true
@@ -145,7 +169,7 @@ gateway_config:
145169
# GLOBAL ACTIVE HEALTH CHECKS
146170
# ===========================================================================
147171
# Proactive endpoint monitoring - runs health checks on all endpoints
148-
# Use `defaults` or `services` for per-service health check rules
172+
# Per-service health check rules can be defined in services[].health_checks
149173

150174
active_health_checks:
151175
enabled: true
@@ -254,87 +278,22 @@ gateway_config:
254278
slow_penalty: 0.7
255279
very_slow_penalty: 0.3
256280

257-
# ===========================================================================
258-
# SERVICE DEFAULTS
259-
# ===========================================================================
260-
# Settings inherited by ALL services unless overridden per-service
261-
# Only specify what you want as default behavior
262-
263-
defaults:
264-
# QoS type determines how requests are validated and processed
265-
# Options: evm, solana, cosmos, generic, passthrough
266-
type: passthrough
267-
268-
# Supported RPC types for endpoints
269-
# Options: json_rpc, rest, websocket, comet_bft, grpc
270-
rpc_types:
271-
- json_rpc
272-
273-
# Reference to a latency profile (from latency_profiles or built-in)
274-
latency_profile: "standard"
275-
276-
# Per-service reputation overrides
277-
# NOTE: storage_type is GLOBAL ONLY (set in gateway_config.reputation_config)
278-
reputation_config:
279-
enabled: true
280-
initial_score: 70 # Default starting score
281-
min_threshold: 40 # Default minimum for selection
282-
recovery_timeout: 5m # Time before recovery attempt
283-
# key_granularity: "per-endpoint" # per-endpoint, per-domain, per-supplier
284-
285-
# Inline latency config (alternative to latency_profile)
286-
# If target_ms > 0, this OVERRIDES the latency_profile
287-
latency:
288-
enabled: true
289-
target_ms: 0 # 0 means use latency_profile instead
290-
penalty_weight: 0.3 # How much latency affects scoring (0.0-1.0)
291-
292-
# Tiered endpoint selection
293-
# Cascade: tier1 first, then tier2, then tier3
294-
tiered_selection:
295-
enabled: true
296-
tier1_threshold: 70 # Premium tier (highest priority)
297-
tier2_threshold: 50 # Good tier
298-
299-
# Probation system for recovering endpoints
300-
# Low-scoring endpoints get limited traffic to prove reliability
301-
probation:
302-
enabled: true
303-
threshold: 10 # Score below which endpoint enters probation
304-
traffic_percent: 10 # % of traffic routed to probation endpoints
305-
recovery_multiplier: 2.0 # Boost for successful probation requests
306-
307-
# Retry configuration
308-
retry_config:
309-
enabled: true
310-
max_retries: 1
311-
retry_on_5xx: true
312-
retry_on_timeout: true
313-
retry_on_connection: true
314-
315-
# Observation pipeline per-service settings
316-
# NOTE: worker_count and queue_size are GLOBAL ONLY
317-
observation_pipeline:
318-
enabled: true
319-
sample_rate: 0.1 # Per-service sample rate (this WORKS)
320-
# worker_count: 4 # GLOBAL ONLY - per-service value ignored
321-
# queue_size: 1000 # GLOBAL ONLY - per-service value ignored
322-
323-
# Health check defaults
324-
active_health_checks:
325-
enabled: true
326-
interval: 30s
327-
sync_allowance: 5 # Blocks behind latest before considered out-of-sync
328-
external: # Per-service external URL (overrides global)
329-
url: ""
330-
refresh_interval: "1h"
331-
timeout: "30s"
332-
local: [] # Per-service local rules (override external by name)
333-
334281
# ===========================================================================
335282
# SERVICE CONFIGURATIONS
336283
# ===========================================================================
337-
# Define services and their overrides. Only specify what differs from defaults.
284+
# Define services with their per-service overrides.
285+
# Each service inherits from gateway_config settings and can override:
286+
# - type: QoS type (evm, solana, cosmos, generic, passthrough)
287+
# - rpc_types: Supported RPC types
288+
# - latency_profile: Reference to a named profile
289+
# - reputation_config: Per-service reputation overrides
290+
# - tiered_selection: Per-service tier thresholds
291+
# - probation: Per-service probation settings
292+
# - retry_config: Per-service retry settings
293+
# - concurrency_config: Per-service concurrency overrides (max_parallel_endpoints, max_batch_payloads)
294+
# - observation_pipeline: Per-service sample rate
295+
# - fallback: Fallback endpoints (no defaults - must be explicitly configured)
296+
# - health_checks: Per-service health check rules
338297

339298
services:
340299
# -------------------------------------------------------------------------
@@ -505,3 +464,15 @@ gateway_config:
505464
latency_profile: "slow"
506465
retry_config:
507466
max_retries: 2
467+
468+
# Per-service concurrency overrides (optional)
469+
# Use these to override global concurrency_config for specific services
470+
# concurrency_config:
471+
# # max_parallel_endpoints: Race multiple endpoints in parallel (1-10)
472+
# # Use >1 for unreliable services to get faster responses
473+
# # ⚠️ WARNING: Values >1 multiply token burn (e.g., 3 endpoints = 3x cost)
474+
# max_parallel_endpoints: 3
475+
#
476+
# # max_batch_payloads: Limit batch size for this service (1-10000)
477+
# # Useful for heavy services that process large batches
478+
# max_batch_payloads: 100

0 commit comments

Comments
 (0)