Skip to content

Commit 68cce2a

Browse files
RomanZavodskikhRoman Zavodskikh
andauthored
Add -min-drop-probability cmdline option for the PHC (#3053)
Signed-off-by: Roman Zavodskikh <[email protected]> Co-authored-by: Roman Zavodskikh <[email protected]>
1 parent 678fc5a commit 68cce2a

File tree

8 files changed

+91
-12
lines changed

8 files changed

+91
-12
lines changed

docs/operation/operation.md

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -901,15 +901,15 @@ Passive Health Check(PHC).
901901
PHC works the following way: the entire uptime is divided in chunks of `period`, per every period Skipper calculates
902902
the total amount of requests and amount of requests failed per every endpoint. While next period is going on,
903903
the Skipper takes a look at previous period and if the amount of requests in the previous period is more than `min-requests`
904-
for the given endpoints then Skipper will send reduced (the more `max-drop-probability`
905-
and failed requests ratio in previous period are, the stronger reduction is)
906-
amount of requests compared to amount sent without PHC.
904+
and failed requests ratio is more than `min-drop-probability` for the given endpoints
905+
then Skipper will send reduced (the more `max-drop-probability` and failed requests ratio
906+
in previous period are, the stronger reduction is) amount of requests compared to amount sent without PHC.
907907

908908
Having this, we expect less requests to fail because a lot of them would be sent to endpoints that seem to be healthy instead.
909909

910910
To enable this feature, you need to provide `-passive-health-check` option having all forementioned parameters
911-
(`period`, `min-requests`, `max-drop-probability`) defined,
912-
for instance: `-passive-health-check=period=1s,min-requests=10,max-drop-probability=0.9`.
911+
(`period`, `min-requests`, `min-drop-probability`, `max-drop-probability`) defined,
912+
for instance: `-passive-health-check=period=1s,min-requests=10,min-drop-probability=0.05,max-drop-probability=0.9`.
913913

914914
You need to define all parameters on your side, there are no defaults, and skipper will not run if PHC params are passed only partially.
915915

@@ -918,7 +918,8 @@ However, Skipper will run without this feature, if no `-passive-health-check` is
918918
The parameters of `-passive-health-check` option are:
919919
+ `period=<duration>` - the duration of stats reset period
920920
+ `min-requests=<int>` - the minimum number of requests per `period` per backend endpoint required to activate PHC for this endpoint
921-
+ `max-drop-probabilty=<float more than/equal to 0 and less than/equal to 1>` - the maximum possible probability of unhealthy endpoint being not considered
921+
+ `min-drop-probabilty=<0.0 <= p <= 1.0>` - the minimum possible probability of unhealthy endpoint being not considered while choosing the endpoint for the given request. The same value is in fact used as minimal failed requests ratio for PHC to be enabled for this endpoint
922+
+ `max-drop-probabilty=<0.0 <= p <= 1.0>` - the maximum possible probability of unhealthy endpoint being not considered
922923
while choosing the endpoint for the given request
923924

924925
### Metrics

metricsinit_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ func TestInitOrderAndDefault(t *testing.T) {
5757
"period": "1m",
5858
"min-requests": "10",
5959
"max-drop-probability": "0.9",
60+
"min-drop-probability": "0.05",
6061
},
6162
}
6263

proxy/healthy_endpoints.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ func (h *healthyEndpoints) filterHealthyEndpoints(ctx *context, endpoints []rout
2222
filtered := make([]routing.LBEndpoint, 0, len(endpoints))
2323
for _, e := range endpoints {
2424
dropProbability := e.Metrics.HealthCheckDropProbability()
25-
if dropProbability > 0.05 && p < dropProbability {
25+
if p < dropProbability {
2626
ctx.Logger().Infof("Dropping endpoint %q due to passive health check: p=%0.2f, dropProbability=%0.2f",
2727
e.Host, p, dropProbability)
2828
metrics.IncCounter("passive-health-check.endpoints.dropped")

proxy/healthy_endpoints_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ func defaultEndpointRegistry() *routing.EndpointRegistry {
2424
StatsResetPeriod: period,
2525
MinRequests: 10,
2626
MaxHealthCheckDropProbability: 1.0,
27+
MinHealthCheckDropProbability: 0.01,
2728
})
2829
}
2930

proxy/proxy.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,11 @@ type PassiveHealthCheck struct {
154154
// potentially opt out the endpoint from the list of healthy endpoints
155155
MinRequests int64
156156

157+
// The minimal ratio of failed requests in a single period to potentially opt out the endpoint
158+
// from the list of healthy endpoints. This ratio is equal to the minimal non-zero probability of
159+
// dropping endpoint out from load balancing for every specific request
160+
MinDropProbability float64
161+
157162
// The maximum probability of unhealthy endpoint to be dropped out from load balancing for every specific request
158163
MaxDropProbability float64
159164
}
@@ -186,6 +191,15 @@ func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, e
186191
return false, nil, fmt.Errorf("passive health check: invalid minRequests value: %s", value)
187192
}
188193
result.MinRequests = int64(minRequests)
194+
case "min-drop-probability":
195+
minDropProbability, err := strconv.ParseFloat(value, 64)
196+
if err != nil {
197+
return false, nil, fmt.Errorf("passive health check: invalid minDropProbability value: %s", value)
198+
}
199+
if minDropProbability < 0 || minDropProbability > 1 {
200+
return false, nil, fmt.Errorf("passive health check: invalid minDropProbability value: %s", value)
201+
}
202+
result.MinDropProbability = minDropProbability
189203
case "max-drop-probability":
190204
maxDropProbability, err := strconv.ParseFloat(value, 64)
191205
if err != nil {
@@ -202,9 +216,12 @@ func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, e
202216
keysInitialized[key] = struct{}{}
203217
}
204218

205-
if len(keysInitialized) != 3 {
219+
if len(keysInitialized) != 4 {
206220
return false, nil, fmt.Errorf("passive health check: missing required parameters")
207221
}
222+
if result.MinDropProbability >= result.MaxDropProbability {
223+
return false, nil, fmt.Errorf("passive health check: minDropProbability should be less than maxDropProbability")
224+
}
208225
return true, result, nil
209226
}
210227

proxy/proxy_test.go

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2299,6 +2299,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
22992299
"period": "somethingInvalid",
23002300
"min-requests": "10",
23012301
"max-drop-probability": "0.9",
2302+
"min-drop-probability": "0.05",
23022303
},
23032304
expectedEnabled: false,
23042305
expectedParams: nil,
@@ -2309,12 +2310,14 @@ func TestInitPassiveHealthChecker(t *testing.T) {
23092310
"period": "1m",
23102311
"min-requests": "10",
23112312
"max-drop-probability": "0.9",
2313+
"min-drop-probability": "0.05",
23122314
},
23132315
expectedEnabled: true,
23142316
expectedParams: &PassiveHealthCheck{
23152317
Period: 1 * time.Minute,
23162318
MinRequests: 10,
23172319
MaxDropProbability: 0.9,
2320+
MinDropProbability: 0.05,
23182321
},
23192322
expectedError: nil,
23202323
},
@@ -2323,6 +2326,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
23232326
"period": "-1m",
23242327
"min-requests": "10",
23252328
"max-drop-probability": "0.9",
2329+
"min-drop-probability": "0.05",
23262330
},
23272331
expectedEnabled: false,
23282332
expectedParams: nil,
@@ -2333,6 +2337,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
23332337
"period": "1m",
23342338
"min-requests": "somethingInvalid",
23352339
"max-drop-probability": "0.9",
2340+
"min-drop-probability": "0.05",
23362341
},
23372342
expectedEnabled: false,
23382343
expectedParams: nil,
@@ -2343,6 +2348,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
23432348
"period": "1m",
23442349
"min-requests": "-10",
23452350
"max-drop-probability": "0.9",
2351+
"min-drop-probability": "0.05",
23462352
},
23472353
expectedEnabled: false,
23482354
expectedParams: nil,
@@ -2353,6 +2359,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
23532359
"period": "1m",
23542360
"min-requests": "10",
23552361
"max-drop-probability": "somethingInvalid",
2362+
"min-drop-probability": "0.05",
23562363
},
23572364
expectedEnabled: false,
23582365
expectedParams: nil,
@@ -2363,6 +2370,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
23632370
"period": "1m",
23642371
"min-requests": "10",
23652372
"max-drop-probability": "-0.1",
2373+
"min-drop-probability": "0.05",
23662374
},
23672375
expectedEnabled: false,
23682376
expectedParams: nil,
@@ -2373,6 +2381,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
23732381
"period": "1m",
23742382
"min-requests": "10",
23752383
"max-drop-probability": "3.1415",
2384+
"min-drop-probability": "0.05",
23762385
},
23772386
expectedEnabled: false,
23782387
expectedParams: nil,
@@ -2383,6 +2392,51 @@ func TestInitPassiveHealthChecker(t *testing.T) {
23832392
"period": "1m",
23842393
"min-requests": "10",
23852394
"max-drop-probability": "0.9",
2395+
"min-drop-probability": "somethingInvalid",
2396+
},
2397+
expectedEnabled: false,
2398+
expectedParams: nil,
2399+
expectedError: fmt.Errorf("passive health check: invalid minDropProbability value: somethingInvalid"),
2400+
},
2401+
{
2402+
inputArg: map[string]string{
2403+
"period": "1m",
2404+
"min-requests": "10",
2405+
"max-drop-probability": "0.9",
2406+
"min-drop-probability": "-0.1",
2407+
},
2408+
expectedEnabled: false,
2409+
expectedParams: nil,
2410+
expectedError: fmt.Errorf("passive health check: invalid minDropProbability value: -0.1"),
2411+
},
2412+
{
2413+
inputArg: map[string]string{
2414+
"period": "1m",
2415+
"min-requests": "10",
2416+
"max-drop-probability": "0.9",
2417+
"min-drop-probability": "3.1415",
2418+
},
2419+
expectedEnabled: false,
2420+
expectedParams: nil,
2421+
expectedError: fmt.Errorf("passive health check: invalid minDropProbability value: 3.1415"),
2422+
},
2423+
{
2424+
inputArg: map[string]string{
2425+
"period": "1m",
2426+
"min-requests": "10",
2427+
"max-drop-probability": "0.05",
2428+
"min-drop-probability": "0.9",
2429+
},
2430+
expectedEnabled: false,
2431+
expectedParams: nil,
2432+
expectedError: fmt.Errorf("passive health check: minDropProbability should be less than maxDropProbability"),
2433+
},
2434+
{
2435+
inputArg: map[string]string{
2436+
"period": "1m",
2437+
"min-requests": "10",
2438+
"max-drop-probability": "0.9",
2439+
"min-drop-probability": "0.05",
23862440
"non-existing": "non-existing",
23872441
},
23882442
expectedEnabled: false,
@@ -2394,6 +2448,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
23942448
"period": "1m",
23952449
"min-requests": "10",
23962450
/* forgot max-drop-probability */
2451+
"min-drop-probability": "0.05",
23972452
},
23982453
expectedEnabled: false,
23992454
expectedParams: nil,

routing/endpointregistry.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ type EndpointRegistry struct {
9898
lastSeenTimeout time.Duration
9999
statsResetPeriod time.Duration
100100
minRequests int64
101+
minHealthCheckDropProbability float64
101102
maxHealthCheckDropProbability float64
102103

103104
quit chan struct{}
@@ -113,6 +114,7 @@ type RegistryOptions struct {
113114
PassiveHealthCheckEnabled bool
114115
StatsResetPeriod time.Duration
115116
MinRequests int64
117+
MinHealthCheckDropProbability float64
116118
MaxHealthCheckDropProbability float64
117119
}
118120

@@ -164,17 +166,17 @@ func (r *EndpointRegistry) updateStats() {
164166
e.totalFailedRoundTrips[nextSlot].Store(0)
165167
e.totalRequests[nextSlot].Store(0)
166168

169+
newDropProbability := 0.0
167170
failed := e.totalFailedRoundTrips[curSlot].Load()
168171
requests := e.totalRequests[curSlot].Load()
169172
if requests > r.minRequests {
170173
failedRoundTripsRatio := float64(failed) / float64(requests)
171-
if failedRoundTripsRatio > 0.0 {
174+
if failedRoundTripsRatio > r.minHealthCheckDropProbability {
172175
log.Infof("Passive health check: marking %q as unhealthy due to failed round trips ratio: %0.2f", key, failedRoundTripsRatio)
176+
newDropProbability = min(failedRoundTripsRatio, r.maxHealthCheckDropProbability)
173177
}
174-
e.healthCheckDropProbability.Store(min(failedRoundTripsRatio, r.maxHealthCheckDropProbability))
175-
} else {
176-
e.healthCheckDropProbability.Store(0.0)
177178
}
179+
e.healthCheckDropProbability.Store(newDropProbability)
178180
e.curSlot.Store(nextSlot)
179181

180182
return true
@@ -201,6 +203,7 @@ func NewEndpointRegistry(o RegistryOptions) *EndpointRegistry {
201203
lastSeenTimeout: o.LastSeenTimeout,
202204
statsResetPeriod: o.StatsResetPeriod,
203205
minRequests: o.MinRequests,
206+
minHealthCheckDropProbability: o.MinHealthCheckDropProbability,
204207
maxHealthCheckDropProbability: o.MaxHealthCheckDropProbability,
205208

206209
quit: make(chan struct{}),

skipper.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1945,6 +1945,7 @@ func run(o Options, sig chan os.Signal, idleConnsCH chan struct{}) error {
19451945
PassiveHealthCheckEnabled: passiveHealthCheckEnabled,
19461946
StatsResetPeriod: passiveHealthCheck.Period,
19471947
MinRequests: passiveHealthCheck.MinRequests,
1948+
MinHealthCheckDropProbability: passiveHealthCheck.MinDropProbability,
19481949
MaxHealthCheckDropProbability: passiveHealthCheck.MaxDropProbability,
19491950
})
19501951
ro := routing.Options{

0 commit comments

Comments
 (0)