Skip to content

Commit b62948d

Browse files
authored
feat: Detect bridge ICE failure based on restart requests from endpoints (#1210)
* ref: Simplify code, order alphabetically. * feat: Keep track of number of endpoints on each Bridge. * feat: Add per-bridge metrics for endpoint and restart requests. * ref: Cleanup comment formatting. * ref: Rename Bridge.stress to correctedStress. * feat: Penalize stress when a bridge enters the failing ICE state * feat: Add metric for failing ICE.
1 parent 9369e97 commit b62948d

File tree

13 files changed

+225
-104
lines changed

13 files changed

+225
-104
lines changed

jicofo-selector/src/main/kotlin/org/jitsi/jicofo/bridge/Bridge.kt

Lines changed: 116 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ import org.jxmpp.jid.Jid
2727
import java.time.Clock
2828
import java.time.Duration
2929
import java.time.Instant
30+
import java.util.concurrent.atomic.AtomicBoolean
31+
import java.util.concurrent.atomic.AtomicInteger
32+
import kotlin.math.max
3033
import org.jitsi.jicofo.bridge.BridgeConfig.Companion.config as config
3134

3235
/**
@@ -48,36 +51,34 @@ class Bridge @JvmOverloads internal constructor(
4851
private val clock: Clock = Clock.systemUTC()
4952
) : Comparable<Bridge> {
5053

51-
/**
52-
* Keep track of the recently added endpoints.
53-
*/
54+
/** Keep track of the recently added endpoints. */
5455
private val newEndpointsRate = RateTracker(
5556
config.participantRampupInterval,
5657
Duration.ofMillis(100),
5758
clock
5859
)
5960

60-
/**
61-
* The last report stress level
62-
*/
61+
private val endpointRestartRequestRate = RateTracker(
62+
config.iceFailureDetection.interval,
63+
Duration.ofSeconds(1),
64+
clock
65+
)
66+
67+
/** Number of endpoints currently allocated on this bridge by this jicofo instance. */
68+
val endpoints = AtomicInteger(0)
69+
70+
/** The last report stress level */
6371
var lastReportedStressLevel = 0.0
6472
private set
6573

66-
/**
67-
* Holds bridge version (if known - not all bridge version are capable of
68-
* reporting it).
69-
*/
74+
/** Holds bridge version (if known - not all bridge version are capable of reporting it). */
7075
private var version: String? = null
7176

72-
/**
73-
* Whether the last received presence indicated the bridge is healthy.
74-
*/
77+
/** Whether the last received presence indicated the bridge is healthy. */
7578
var isHealthy = true
7679
private set
7780

78-
/**
79-
* Holds bridge release ID, or null if not known.
80-
*/
81+
/** Holds bridge release ID, or null if not known. */
8182
private var releaseId: String? = null
8283

8384
/**
@@ -108,25 +109,16 @@ class Bridge @JvmOverloads internal constructor(
108109
}
109110
}
110111

111-
/**
112-
* Start out with the configured value, update if the bridge reports a value.
113-
*/
112+
/** Start out with the configured value, update if the bridge reports a value. */
114113
private var averageParticipantStress = config.averageParticipantStress
115114

116-
/**
117-
* Stores a boolean that indicates whether the bridge is in graceful shutdown mode.
118-
*/
115+
/** Stores a boolean that indicates whether the bridge is in graceful shutdown mode. */
119116
var isInGracefulShutdown = false // we assume it is not shutting down
120117

121-
/**
122-
* Whether the bridge is in SHUTTING_DOWN mode.
123-
*/
118+
/** Whether the bridge is in SHUTTING_DOWN mode. */
124119
var isShuttingDown = false
125120
private set
126121

127-
/**
128-
* @return true if the bridge is currently in drain mode
129-
*/
130122
/**
131123
* Stores a boolean that indicates whether the bridge is in drain mode.
132124
*/
@@ -140,19 +132,27 @@ class Bridge @JvmOverloads internal constructor(
140132
*/
141133
private var failureInstant: Instant? = null
142134

143-
/**
144-
* @return the region of this [Bridge].
145-
*/
135+
/** @return the region of this [Bridge]. */
146136
var region: String? = null
147137
private set
148138

149-
/**
150-
* @return the relay ID advertised by the bridge, or `null` if
151-
* none was advertised.
152-
*/
139+
/** @return the relay ID advertised by the bridge, or `null` if none was advertised. */
153140
var relayId: String? = null
154141
private set
155142

143+
/**
144+
* If this [Bridge] has been removed from the list of bridges. Once removed, the metrics specific to this instance
145+
* are cleared and no longer emitted. If the bridge re-connects, a new [Bridge] instance will be created.
146+
*/
147+
val removed = AtomicBoolean(false)
148+
149+
/**
150+
* The last instant at which we detected, based on restart requests from endpoints, that this bridge is failing ICE
151+
*/
152+
private var lastIceFailed = Instant.MIN
153+
private val failingIce: Boolean
154+
get() = Duration.between(lastIceFailed, clock.instant()) < config.iceFailureDetection.timeout
155+
156156
private val logger: Logger = LoggerImpl(Bridge::class.java.name)
157157

158158
init {
@@ -237,37 +237,75 @@ class Bridge @JvmOverloads internal constructor(
237237
return compare(this, other)
238238
}
239239

240-
/**
241-
* Notifies this [Bridge] that it was used for a new endpoint.
242-
*/
240+
/** Notifies this [Bridge] that it was used for a new endpoint. */
243241
fun endpointAdded() {
244242
newEndpointsRate.update(1)
243+
endpoints.incrementAndGet()
244+
if (!removed.get()) {
245+
BridgeMetrics.endpoints.set(endpoints.get().toLong(), listOf(jid.resourceOrEmpty.toString()))
246+
}
245247
}
246248

247-
/**
248-
* Returns the net number of video channels recently allocated or removed
249-
* from this bridge.
250-
*/
249+
fun endpointRemoved() = endpointsRemoved(1)
250+
fun endpointsRemoved(count: Int) {
251+
endpoints.addAndGet(-count)
252+
if (!removed.get()) {
253+
BridgeMetrics.endpoints.set(endpoints.get().toLong(), listOf(jid.resourceOrEmpty.toString()))
254+
}
255+
if (endpoints.get() < 0) {
256+
logger.error("Removed more endpoints than were allocated. Resetting to 0.", Throwable())
257+
endpoints.set(0)
258+
}
259+
}
260+
internal fun markRemoved() {
261+
if (removed.compareAndSet(false, true)) {
262+
BridgeMetrics.restartRequestsMetric.remove(listOf(jid.resourceOrEmpty.toString()))
263+
BridgeMetrics.endpoints.remove(listOf(jid.resourceOrEmpty.toString()))
264+
}
265+
}
266+
internal fun updateMetrics() {
267+
if (!removed.get()) {
268+
BridgeMetrics.failingIce.set(failingIce, listOf(jid.resourceOrEmpty.toString()))
269+
}
270+
}
271+
272+
fun endpointRequestedRestart() {
273+
endpointRestartRequestRate.update(1)
274+
if (!removed.get()) {
275+
BridgeMetrics.restartRequestsMetric.inc(listOf(jid.resourceOrEmpty.toString()))
276+
}
277+
278+
if (config.iceFailureDetection.enabled) {
279+
val restartCount = endpointRestartRequestRate.getAccumulatedCount()
280+
val endpoints = endpoints.get()
281+
if (endpoints >= config.iceFailureDetection.minEndpoints &&
282+
restartCount > endpoints * config.iceFailureDetection.threshold
283+
) {
284+
// Reset the timeout regardless of the previous state, but only log if the state changed.
285+
if (!failingIce) {
286+
logger.info("Detected an ICE failing state.")
287+
}
288+
lastIceFailed = clock.instant()
289+
}
290+
}
291+
}
292+
293+
/** Returns the net number of video channels recently allocated or removed from this bridge. */
251294
private val recentlyAddedEndpointCount: Long
252295
get() = newEndpointsRate.getAccumulatedCount()
253296

254-
/**
255-
* The version of this bridge (with embedded release ID, if available).
256-
*/
297+
/** The version of this bridge (with embedded release ID, if available). */
257298
val fullVersion: String?
258299
get() = if (version != null && releaseId != null) "$version-$releaseId" else version
259300

260-
/**
261-
* {@inheritDoc}
262-
*/
263301
override fun toString(): String {
264302
return String.format(
265-
"Bridge[jid=%s, version=%s, relayId=%s, region=%s, stress=%.2f]",
303+
"Bridge[jid=%s, version=%s, relayId=%s, region=%s, correctedStress=%.2f]",
266304
jid.toString(),
267305
fullVersion,
268306
relayId,
269307
region,
270-
stress
308+
correctedStress
271309
)
272310
}
273311

@@ -276,34 +314,37 @@ class Bridge @JvmOverloads internal constructor(
276314
* can exceed 1).
277315
* @return this bridge's stress level
278316
*/
279-
val stress: Double
280-
get() =
281-
// While a stress of 1 indicates a bridge is fully loaded, we allow
282-
// larger values to keep sorting correctly.
283-
lastReportedStressLevel +
284-
recentlyAddedEndpointCount.coerceAtLeast(0) * averageParticipantStress
317+
val correctedStress: Double
318+
get() {
319+
// Correct for recently added endpoints.
320+
// While a stress of 1 indicates a bridge is fully loaded, we allow larger values to keep sorting correctly.
321+
val s = lastReportedStressLevel + recentlyAddedEndpointCount.coerceAtLeast(0) * averageParticipantStress
285322

286-
/**
287-
* @return true if the stress of the bridge is greater-than-or-equal to the threshold.
288-
*/
323+
// Correct for failing ICE.
324+
return if (failingIce) max(s, config.stressThreshold + 0.01) else s
325+
}
326+
327+
/** @return true if the stress of the bridge is greater-than-or-equal to the threshold. */
289328
val isOverloaded: Boolean
290-
get() = stress >= config.stressThreshold
329+
get() = correctedStress >= config.stressThreshold
291330

292331
val debugState: OrderedJsonObject
293-
get() {
294-
val o = OrderedJsonObject()
295-
o["version"] = version.toString()
296-
o["release"] = releaseId.toString()
297-
o["stress"] = stress
298-
o["operational"] = isOperational
299-
o["region"] = region.toString()
300-
o["drain"] = isDraining
301-
o["graceful-shutdown"] = isInGracefulShutdown
302-
o["shutting-down"] = isShuttingDown
303-
o["overloaded"] = isOverloaded
304-
o["relay-id"] = relayId.toString()
305-
o["healthy"] = isHealthy
306-
return o
332+
get() = OrderedJsonObject().apply {
333+
this["corrected-stress"] = correctedStress
334+
this["drain"] = isDraining
335+
this["endpoints"] = endpoints.get()
336+
this["endpoint-restart-requests"] = endpointRestartRequestRate.getAccumulatedCount()
337+
this["failing-ice"] = failingIce
338+
this["graceful-shutdown"] = isInGracefulShutdown
339+
this["healthy"] = isHealthy
340+
this["operational"] = isOperational
341+
this["overloaded"] = isOverloaded
342+
this["region"] = region.toString()
343+
this["relay-id"] = relayId.toString()
344+
this["release"] = releaseId.toString()
345+
this["shutting-down"] = isShuttingDown
346+
this["stress"] = lastReportedStressLevel
347+
this["version"] = version.toString()
307348
}
308349

309350
companion object {
@@ -327,7 +368,7 @@ class Bridge @JvmOverloads internal constructor(
327368
return if (myPriority != otherPriority) {
328369
myPriority - otherPriority
329370
} else {
330-
b1.stress.compareTo(b2.stress)
371+
b1.correctedStress.compareTo(b2.correctedStress)
331372
}
332373
}
333374

jicofo-selector/src/main/kotlin/org/jitsi/jicofo/bridge/BridgeConfig.kt

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,10 +156,34 @@ class BridgeConfig private constructor() {
156156
fun getRegionGroup(region: String?): Set<String> =
157157
if (region == null) emptySet() else regionGroups[region] ?: setOf(region)
158158

159+
val iceFailureDetection = IceFailureDetectionConfig()
160+
159161
companion object {
160162
const val BASE = "jicofo.bridge"
161163

162164
@JvmField
163165
val config = BridgeConfig()
164166
}
165167
}
168+
169+
class IceFailureDetectionConfig {
170+
val enabled: Boolean by config {
171+
"$BASE.enabled".from(JitsiConfig.newConfig)
172+
}
173+
val interval: Duration by config {
174+
"$BASE.interval".from(JitsiConfig.newConfig)
175+
}
176+
val minEndpoints: Int by config {
177+
"$BASE.min-endpoints".from(JitsiConfig.newConfig)
178+
}
179+
val threshold: Double by config {
180+
"$BASE.threshold".from(JitsiConfig.newConfig)
181+
}
182+
val timeout: Duration by config {
183+
"$BASE.timeout".from(JitsiConfig.newConfig)
184+
}
185+
186+
companion object {
187+
const val BASE = "jicofo.bridge.ice-failure-detection"
188+
}
189+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package org.jitsi.jicofo.bridge
2+
3+
import org.jitsi.jicofo.metrics.JicofoMetricsContainer.Companion.instance as metricsContainer
4+
class BridgeMetrics {
5+
companion object {
6+
/** Total number of participants that requested a restart for a specific bridge. */
7+
val restartRequestsMetric = metricsContainer.registerCounter(
8+
"bridge_restart_requests_total",
9+
"Total number of requests to restart a bridge",
10+
labelNames = listOf("jvb")
11+
)
12+
val endpoints = metricsContainer.registerLongGauge(
13+
"bridge_endpoints",
14+
"The number of endpoints on a bridge.",
15+
labelNames = listOf("jvb")
16+
)
17+
val failingIce = metricsContainer.registerBooleanMetric(
18+
"bridge_failing_ice",
19+
"Whether a bridge is currently in the failing ICE state.",
20+
labelNames = listOf("jvb")
21+
)
22+
}
23+
}

jicofo-selector/src/main/kotlin/org/jitsi/jicofo/bridge/BridgeSelector.kt

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ class BridgeSelector @JvmOverloads constructor(
112112
logger.warn("Lost a bridge: $bridgeJid")
113113
lostBridges.inc()
114114
}
115+
it.markRemoved()
115116
bridgeCount.dec()
116117
eventEmitter.fireEvent { bridgeRemoved(it) }
117118
}
@@ -214,10 +215,7 @@ class BridgeSelector @JvmOverloads constructor(
214215
conferenceBridges,
215216
participantProperties,
216217
OctoConfig.config.enabled
217-
).also {
218-
// The bridge was selected for an endpoint, increment its counter.
219-
it?.endpointAdded()
220-
}
218+
)
221219
}
222220

223221
val stats: JSONObject
@@ -245,6 +243,7 @@ class BridgeSelector @JvmOverloads constructor(
245243
inShutdownBridgeCountMetric.set(bridges.values.count { it.isInGracefulShutdown }.toLong())
246244
operationalBridgeCountMetric.set(bridges.values.count { it.isOperational }.toLong())
247245
bridgeVersionCount.set(bridges.values.map { it.fullVersion }.toSet().size.toLong())
246+
bridges.values.forEach { it.updateMetrics() }
248247
}
249248

250249
companion object {

jicofo-selector/src/main/kotlin/org/jitsi/jicofo/bridge/VisitorTopologyStrategy.kt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,9 @@ class VisitorTopologyStrategy : TopologySelectionStrategy() {
3535
cascade.getDistanceFrom(it) { node -> !node.visitor }
3636
}
3737

38-
val sortedNodes = nodesWithDistance.entries.sortedWith(compareBy({ it.value }, { it.key.bridge.stress }))
39-
.map { it.key }
38+
val sortedNodes = nodesWithDistance.entries.sortedWith(
39+
compareBy({ it.value }, { it.key.bridge.correctedStress })
40+
).map { it.key }
4041

4142
/* TODO: this logic looks a lot like bridge selection. Do we want to try to share logic with that code? */
4243
val nonOverloadedInRegion = sortedNodes.filter {

jicofo-selector/src/main/kotlin/org/jitsi/jicofo/bridge/colibri/ColibriSessionManager.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ interface ColibriSessionManager {
5757
suppressLocalBridgeUpdate: Boolean = false
5858
)
5959

60-
fun getBridgeSessionId(participantId: String): String?
60+
fun getBridgeSessionId(participantId: String): Pair<Bridge?, String?>
6161

6262
/**
6363
* Stop using [bridge], expiring all endpoints on it (e.g. because it was detected to have failed).

0 commit comments

Comments
 (0)