Skip to content

Commit a4c77d3

Browse files
committed
feat: add check for kubeapiserver service being down
updated logic for disruption interval skip on SNO to look at the default namespace for the kubernetes service being down. Signed-off-by: ehila <[email protected]>
1 parent 221cc88 commit a4c77d3

File tree

1 file changed

+41
-4
lines changed

1 file changed

+41
-4
lines changed

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@ import (
44
"context"
55
"fmt"
66
"regexp"
7+
"sort"
78
"strings"
9+
"time"
810

911
v1 "github.com/openshift/api/config/v1"
1012
configclient "github.com/openshift/client-go/config/clientset/versioned"
@@ -898,13 +900,48 @@ func newSingleNodeConnectionRefusedEventMatcher(finalIntervals monitorapi.Interv
898900
const (
899901
ocpAPINamespace = "openshift-apiserver"
900902
ocpOAuthAPINamespace = "openshift-oauth-apiserver"
903+
defaultNamespace = "default"
904+
905+
bufferTime = time.Second * 45
906+
bufferSourceID = "GeneratedSNOBufferInterval"
901907
)
902908
snoTopology := v1.SingleReplicaTopologyMode
909+
910+
// Intervals are collected as they come to the monitorapi and the `from` and `to` are recorded at that point,
911+
// this works fine for most runs; however, for single node the events might be sent at irregular intervals.
912+
// This makes it hard to determine if connection refused errors are false positives,
913+
// here we collect intervals we know are acceptable for connection refused errors to occur for single node.
914+
bufferInterval := []monitorapi.Interval{}
915+
903916
ocpAPISeverTargetDownIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
904-
return eventInterval.Source == monitorapi.SourceAlert &&
905-
eventInterval.Locator.Keys[monitorapi.LocatorAlertKey] == "TargetDown" &&
906-
(eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == ocpAPINamespace ||
907-
eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == ocpOAuthAPINamespace)
917+
918+
// If we find a graceful shutdown event, we create a buffer interval after shutdown to account
919+
// for the API Server coming back up, as well as a 5-second buffer before `from` to account for a
920+
// situation where the `event.from` falls exactly on the `interval.from`, thus causing the time.Before() logic to return false.
921+
if eventInterval.Source == monitorapi.APIServerGracefulShutdown && eventInterval.Message.Reason == monitorapi.GracefulAPIServerShutdown {
922+
temp := eventInterval
923+
temp.Locator = monitorapi.Locator{Type: bufferSourceID, Keys: temp.Locator.Keys}
924+
temp.Source = bufferSourceID
925+
temp.From = eventInterval.From.Add(time.Second * -5)
926+
temp.To = eventInterval.To.Add(bufferTime)
927+
bufferInterval = append(bufferInterval, temp)
928+
}
929+
930+
isTargetDownAlert := eventInterval.Source == monitorapi.SourceAlert && eventInterval.Locator.Keys[monitorapi.LocatorAlertKey] == "TargetDown"
931+
identifiedSkipInterval := false
932+
933+
switch eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] {
934+
case ocpAPINamespace, ocpOAuthAPINamespace:
935+
identifiedSkipInterval = true
936+
case defaultNamespace:
937+
identifiedSkipInterval = strings.Contains(eventInterval.Message.HumanMessage, "apiserver")
938+
}
939+
940+
return isTargetDownAlert && identifiedSkipInterval
941+
})
942+
ocpAPISeverTargetDownIntervals = append(ocpAPISeverTargetDownIntervals, bufferInterval...)
943+
sort.SliceStable(ocpAPISeverTargetDownIntervals, func(i, j int) bool {
944+
return ocpAPISeverTargetDownIntervals[i].To.Before(ocpAPISeverTargetDownIntervals[j].To)
908945
})
909946
if len(ocpAPISeverTargetDownIntervals) > 0 {
910947
logrus.Infof("found %d OCP APIServer TargetDown intervals", len(ocpAPISeverTargetDownIntervals))

0 commit comments

Comments
 (0)