|
4 | 4 | "context"
|
5 | 5 | "fmt"
|
6 | 6 | "regexp"
|
| 7 | + "sort" |
7 | 8 | "strings"
|
| 9 | + "time" |
8 | 10 |
|
9 | 11 | v1 "github.com/openshift/api/config/v1"
|
10 | 12 | configclient "github.com/openshift/client-go/config/clientset/versioned"
|
@@ -898,13 +900,48 @@ func newSingleNodeConnectionRefusedEventMatcher(finalIntervals monitorapi.Interv
|
898 | 900 | const (
|
899 | 901 | ocpAPINamespace = "openshift-apiserver"
|
900 | 902 | ocpOAuthAPINamespace = "openshift-oauth-apiserver"
|
| 903 | + defaultNamespace = "default" |
| 904 | + |
| 905 | + bufferTime = time.Second * 45 |
| 906 | + bufferSourceID = "GeneratedSNOBufferInterval" |
901 | 907 | )
|
902 | 908 | snoTopology := v1.SingleReplicaTopologyMode
|
| 909 | + |
| 910 | + // Intervals are collected as they arrive at the monitorapi, and their `from` and `to` are recorded at that point. |
| 911 | + // This works fine for most runs; however, for single node the events might be sent at irregular intervals. |
| 912 | + // This makes it hard to determine whether connection refused errors are false positives, |
| 913 | + // so here we collect the intervals in which we know connection refused errors are acceptable for single node. |
| 914 | + bufferInterval := []monitorapi.Interval{} |
| 915 | + |
903 | 916 | ocpAPISeverTargetDownIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
|
904 |
| - return eventInterval.Source == monitorapi.SourceAlert && |
905 |
| - eventInterval.Locator.Keys[monitorapi.LocatorAlertKey] == "TargetDown" && |
906 |
| - (eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == ocpAPINamespace || |
907 |
| - eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == ocpOAuthAPINamespace) |
| 917 | + |
| 918 | + // If we find a graceful shutdown event, we create a buffer interval that extends past the shutdown to account |
| 919 | + // for the API server coming back up, and that starts 5 seconds before `from` to account for a |
| 920 | + // situation where `event.from` falls exactly on `interval.from`, causing the time.Before() logic to return false. |
| 921 | + if eventInterval.Source == monitorapi.APIServerGracefulShutdown && eventInterval.Message.Reason == monitorapi.GracefulAPIServerShutdown { |
| 922 | + temp := eventInterval |
| 923 | + temp.Locator = monitorapi.Locator{Type: bufferSourceID, Keys: temp.Locator.Keys} |
| 924 | + temp.Source = bufferSourceID |
| 925 | + temp.From = eventInterval.From.Add(time.Second * -5) |
| 926 | + temp.To = eventInterval.To.Add(bufferTime) |
| 927 | + bufferInterval = append(bufferInterval, temp) |
| 928 | + } |
| 929 | + |
| 930 | + isTargetDownAlert := eventInterval.Source == monitorapi.SourceAlert && eventInterval.Locator.Keys[monitorapi.LocatorAlertKey] == "TargetDown" |
| 931 | + identifiedSkipInterval := false |
| 932 | + |
| 933 | + switch eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] { |
| 934 | + case ocpAPINamespace, ocpOAuthAPINamespace: |
| 935 | + identifiedSkipInterval = true |
| 936 | + case defaultNamespace: |
| 937 | + identifiedSkipInterval = strings.Contains(eventInterval.Message.HumanMessage, "apiserver") |
| 938 | + } |
| 939 | + |
| 940 | + return isTargetDownAlert && identifiedSkipInterval |
| 941 | + }) |
| 942 | + ocpAPISeverTargetDownIntervals = append(ocpAPISeverTargetDownIntervals, bufferInterval...) |
| 943 | + sort.SliceStable(ocpAPISeverTargetDownIntervals, func(i, j int) bool { |
| 944 | + return ocpAPISeverTargetDownIntervals[i].To.Before(ocpAPISeverTargetDownIntervals[j].To) |
908 | 945 | })
|
909 | 946 | if len(ocpAPISeverTargetDownIntervals) > 0 {
|
910 | 947 | logrus.Infof("found %d OCP APIServer TargetDown intervals", len(ocpAPISeverTargetDownIntervals))
|
|
0 commit comments