Skip to content

Commit 53fd03f

Browse files
jorgecuestaoten91
andauthored
feat: implement tiered endpoint selection with reputation signals (#495)
## Summary - Implement tiered endpoint selection based on reputation scores: - Tier 1 (score >= 70): Best endpoints, used first if any exist - Tier 2 (score >= 50 and < 70): Used when Tier 1 is empty - Tier 3 (score >= 30 and < 50): Last resort fallback - Add comprehensive reputation signal recording across all relay dispatch points: - Success signals for successful relays with latency tracking - Minor error signals for JSON-RPC errors in responses - Major error signals for timeouts and HTTP errors - Critical error signals for websocket connection failures - Record signals for fallback endpoints during Shannon session rollover - Add websocket message success/error reputation tracking - Add tier distribution Prometheus metrics (`path_reputation_tier_endpoints_total`) - Add `make path_dev` target for local development testing ## Configuration Options New configuration options added to `shannon_config.gateway_config.reputation_config`: ```yaml reputation_config: # Enable/disable the reputation system (default: false) enabled: true # Storage backend: "memory" or "redis" (default: "memory") storage_type: "memory" # Starting score for new endpoints (default: 80) initial_score: 80 # Minimum score for endpoint selection (default: 30) min_threshold: 30 # Time for inactive endpoint score recovery (default: 5m) recovery_timeout: 5m # Tiered selection configuration tiered_selection: # Enable tiered selection (default: true when reputation enabled) enabled: true # Tier 1 threshold - Premium tier (default: 70) tier1_threshold: 70 # Tier 2 threshold - Good tier (default: 50) tier2_threshold: 50 # Tier 3 uses min_threshold (30) as its minimum ``` ## Reputation Signal Types | Signal Type | Score Impact | Triggers | |-------------|-------------|----------| | Success | +1 | Successful relay with valid response | | MinorError | -3 | JSON-RPC errors in response | | MajorError | -10 | Timeouts, HTTP errors, websocket validation failures | | CriticalError | -25 | HTTP 5xx, service errors, 
validation/signature errors | | FatalError | -50 | Permanent sanctions (service misconfiguration) | ## Test plan - [x] Unit tests pass (`make test_unit`) - [x] E2E tests pass for eth service (`make e2e_test eth` - 99% success rate) - [x] Lint passes (`make go_lint` - 0 issues) - [x] Format applied (`make go_fmt`) - [x] Manual verification: tier distribution metrics showing correct distribution - [x] Manual verification: JSON-RPC errors affecting reputation and causing tier movement --------- Co-authored-by: Otto V <[email protected]>
1 parent d516bfb commit 53fd03f

File tree

13 files changed

+1192
-10
lines changed

13 files changed

+1192
-10
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,4 +185,4 @@ include ./makefiles/helpers.mk
185185
fi; \
186186
echo ""; \
187187
exit 1; \
188-
fi
188+
fi

config/config_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,11 @@ func Test_LoadGatewayConfigFromYAML(t *testing.T) {
9494
MinThreshold: 30,
9595
RecoveryTimeout: 5 * time.Minute,
9696
KeyGranularity: reputation.KeyGranularityEndpoint,
97+
TieredSelection: reputation.TieredSelectionConfig{
98+
Enabled: true,
99+
Tier1Threshold: 70,
100+
Tier2Threshold: 50,
101+
},
97102
},
98103
},
99104
},

config/examples/config.shannon_example.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,24 @@ shannon_config:
121121
# sol:
122122
# # Use per-supplier for sol to group by supplier
123123
# key_granularity: "per-supplier"
124+
# Tiered endpoint selection configuration
125+
# When enabled, endpoints are grouped into tiers based on reputation score
126+
# and selection cascades down: a lower tier is used only when every higher tier is empty
127+
tiered_selection:
128+
# Enable/disable tiered selection
129+
# Default: true (when reputation is enabled)
130+
enabled: true
131+
# Minimum score for Tier 1 (Premium tier)
132+
# Endpoints with scores >= tier1_threshold are selected first
133+
# Default: 70
134+
tier1_threshold: 70
135+
# Minimum score for Tier 2 (Good tier)
136+
# Endpoints with scores >= tier2_threshold but < tier1_threshold
137+
# are selected only if no Tier 1 endpoints are available
138+
# Default: 50
139+
tier2_threshold: 50
140+
# Tier 3 (Fair tier) uses min_threshold as its minimum score
141+
# Endpoints in Tier 3 are selected only if Tier 1 and 2 are empty
124142
# Redis configuration (only used when storage_type is "redis")
125143
# redis:
126144
# address: "localhost:6379"

metrics/reputation/metrics.go

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,21 @@ const (
2020

2121
// Reputation service health metrics
2222
reputationErrorsTotalMetric = "shannon_reputation_errors_total"
23+
24+
// Tiered selection metrics
25+
reputationTierSelectionMetric = "shannon_reputation_tier_selection_total"
26+
27+
// Tier distribution metrics (gauge showing endpoints per tier)
28+
reputationTierDistributionMetric = "shannon_reputation_tier_distribution"
2329
)
2430

2531
func init() {
2632
prometheus.MustRegister(reputationSignalsTotal)
2733
prometheus.MustRegister(reputationEndpointsFiltered)
2834
prometheus.MustRegister(reputationScoreDistribution)
2935
prometheus.MustRegister(reputationErrorsTotal)
36+
prometheus.MustRegister(reputationTierSelection)
37+
prometheus.MustRegister(reputationTierDistribution)
3038
}
3139

3240
var (
@@ -36,6 +44,10 @@ var (
3644
// - signal_type: Type of signal (success, minor_error, major_error, critical_error, fatal_error)
3745
// - endpoint_domain: Effective TLD+1 domain extracted from endpoint URL
3846
//
47+
// CARDINALITY WARNING: The endpoint_domain label can have high cardinality
48+
// in deployments with many unique supplier domains. Monitor Prometheus memory
49+
// usage and consider aggregating metrics if cardinality exceeds ~1000 unique domains.
50+
//
3951
// Use to analyze:
4052
// - Signal distribution by type and service
4153
// - Endpoint reliability patterns
@@ -109,6 +121,42 @@ var (
109121
},
110122
[]string{"operation", "error_type"},
111123
)
124+
125+
// reputationTierSelection tracks endpoint selections by tier.
126+
// Labels:
127+
// - service_id: Target service identifier
128+
// - tier: Selected tier (1=Premium, 2=Good, 3=Fair, 0=Random/disabled)
129+
//
130+
// Use to analyze:
131+
// - How selections are distributed across tiers for each service
132+
// - How often cascade-down occurs (tier 2/3 selections)
133+
// - Effectiveness of tiered selection
134+
reputationTierSelection = prometheus.NewCounterVec(
135+
prometheus.CounterOpts{
136+
Subsystem: pathProcess,
137+
Name: reputationTierSelectionMetric,
138+
Help: "Total endpoint selections by tier",
139+
},
140+
[]string{"service_id", "tier"},
141+
)
142+
143+
// reputationTierDistribution tracks the current distribution of endpoints across tiers.
144+
// Labels:
145+
// - service_id: Target service identifier
146+
// - tier: Tier number (1=Premium, 2=Good, 3=Fair)
147+
//
148+
// Use to analyze:
149+
// - Real-time health of endpoint pool
150+
// - How endpoints are distributed across reputation tiers
151+
// - Identify services with poor endpoint quality
152+
reputationTierDistribution = prometheus.NewGaugeVec(
153+
prometheus.GaugeOpts{
154+
Subsystem: pathProcess,
155+
Name: reputationTierDistributionMetric,
156+
Help: "Current number of endpoints in each tier",
157+
},
158+
[]string{"service_id", "tier"},
159+
)
112160
)
113161

114162
// RecordSignal records a reputation signal metric.
@@ -152,3 +200,44 @@ func RecordError(operation, errorType string) {
152200
"error_type": errorType,
153201
}).Inc()
154202
}
203+
204+
// RecordTierSelection records which tier an endpoint was selected from.
205+
func RecordTierSelection(serviceID string, tier int) {
206+
reputationTierSelection.With(prometheus.Labels{
207+
"service_id": serviceID,
208+
"tier": tierToString(tier),
209+
}).Inc()
210+
}
211+
212+
// RecordTierDistribution records the current distribution of endpoints across tiers.
213+
// Call it whenever tiered selection is performed to show real-time tier health; each call overwrites the three per-tier gauge values for serviceID.
214+
func RecordTierDistribution(serviceID string, tier1Count, tier2Count, tier3Count int) {
215+
reputationTierDistribution.With(prometheus.Labels{
216+
"service_id": serviceID,
217+
"tier": "1",
218+
}).Set(float64(tier1Count))
219+
220+
reputationTierDistribution.With(prometheus.Labels{
221+
"service_id": serviceID,
222+
"tier": "2",
223+
}).Set(float64(tier2Count))
224+
225+
reputationTierDistribution.With(prometheus.Labels{
226+
"service_id": serviceID,
227+
"tier": "3",
228+
}).Set(float64(tier3Count))
229+
}
230+
231+
// tierToString converts a tier number to its metric label; any value other than 1, 2, or 3 maps to "0".
232+
func tierToString(tier int) string {
233+
switch tier {
234+
case 1:
235+
return "1"
236+
case 2:
237+
return "2"
238+
case 3:
239+
return "3"
240+
default:
241+
return "0"
242+
}
243+
}

0 commit comments

Comments
 (0)