diff --git a/scripts/measure-testgrid-flakiness.sh b/scripts/measure-testgrid-flakiness.sh index 6cde5418727..1ea6ed721f2 100755 --- a/scripts/measure-testgrid-flakiness.sh +++ b/scripts/measure-testgrid-flakiness.sh @@ -11,11 +11,12 @@ fi pushd ./tools/testgrid-analysis # ci-etcd-e2e-amd64 and ci-etcd-unit-test-amd64 runs 6 times a day. Keeping a rolling window of 14 days. -go run main.go flaky --create-issue --dashboard=sig-etcd-periodics --tab=ci-etcd-e2e-amd64 --max-days=14 -go run main.go flaky --create-issue --dashboard=sig-etcd-periodics --tab=ci-etcd-unit-test-amd64 --max-days=14 +go run main.go flaky --auto-create-issues --dashboard=sig-etcd-periodics --tab=ci-etcd-e2e-amd64 --max-days=14 +go run main.go flaky --auto-create-issues --dashboard=sig-etcd-periodics --tab=ci-etcd-unit-test-amd64 --max-days=14 -# do not create issues for presubmit tests -go run main.go flaky --dashboard=sig-etcd-presubmits --tab=pull-etcd-e2e-amd64 -go run main.go flaky --dashboard=sig-etcd-presubmits --tab=pull-etcd-unit-test +go run main.go flaky --auto-create-issues --dashboard=sig-etcd-presubmits --tab=pull-etcd-e2e-amd64 --max-days=14 +go run main.go flaky --auto-create-issues --dashboard=sig-etcd-presubmits --tab=pull-etcd-unit-test --max-days=14 + +go run main.go auto-close-stale-issues --days-before-auto-close=14 popd diff --git a/tools/testgrid-analysis/cmd/data.go b/tools/testgrid-analysis/cmd/data.go index 899523aa275..eeba7454105 100644 --- a/tools/testgrid-analysis/cmd/data.go +++ b/tools/testgrid-analysis/cmd/data.go @@ -19,6 +19,7 @@ import ( "io" "net/http" "os" + "sort" "strings" "time" @@ -36,6 +37,15 @@ var ( skippedTestStatuses = make(map[int32]struct{}) ) +type TabResultSummary struct { + DashboardName, TabName string + TestsWithFailures []*TestResultSummary + FailureRate float32 + IssueBody string + allBuilds map[string]struct{} + failedBuilds map[string]struct{} +} + type TestResultSummary struct { Name string FullName string @@ -43,13 +53,25 @@ type TestResultSummary struct { FailureRate float32 FailureLogs []string IssueBody string + allBuilds map[string]struct{} + failedBuilds map[string]struct{} } -func fetchTestResultSummaries(dashboard, tab string) []*TestResultSummary { - // Fetch test data - rowsURL := fmt.Sprintf("http://testgrid-data.k8s.io/api/v1/dashboards/%s/tabs/%s/rows", dashboard, tab) - headersURL := fmt.Sprintf("http://testgrid-data.k8s.io/api/v1/dashboards/%s/tabs/%s/headers", dashboard, tab) +func FetchTabResultSummary(dashboard, tab string) *TabResultSummary { + summary := TabResultSummary{DashboardName: dashboard, TabName: tab} + summary.analyzeTestResults() + return &summary +} +func (tab *TabResultSummary) dataURLs() (rowsURL, headersURL string) { + rowsURL = fmt.Sprintf("http://testgrid-data.k8s.io/api/v1/dashboards/%s/tabs/%s/rows", tab.DashboardName, tab.TabName) + headersURL = fmt.Sprintf("http://testgrid-data.k8s.io/api/v1/dashboards/%s/tabs/%s/headers", tab.DashboardName, tab.TabName) + return +} + +func (tab *TabResultSummary) analyzeTestResults() { + // Fetch test data + rowsURL, headersURL := tab.dataURLs() var testData apipb.ListRowsResponse var headerData apipb.ListHeadersResponse protojson.Unmarshal(fetchJSON(rowsURL), &testData) @@ -60,13 +82,48 @@ func fetchTestResultSummaries(dashboard, tab string) []*TestResultSummary { allTests = append(allTests, row.Name) } - summaries := []*TestResultSummary{} + tab.allBuilds = map[string]struct{}{} + tab.failedBuilds = map[string]struct{}{} + // Process rows for _, row := range testData.Rows { - t := processRow(dashboard, tab, row, allTests, headerData.Headers) - summaries = append(summaries, t) + t := processRow(tab.DashboardName, tab.TabName, row, allTests, headerData.Headers) + mergeMaps(t.allBuilds, tab.allBuilds) + mergeMaps(t.failedBuilds, tab.failedBuilds) + if t.FailedRuns > 0 { + tab.TestsWithFailures = append(tab.TestsWithFailures, t) + } + } + sort.Slice(tab.TestsWithFailures, func(i, j int) bool { + ti := tab.TestsWithFailures[i] + tj := tab.TestsWithFailures[j] + if ti.FailureRate == tj.FailureRate { + if ti.FailedRuns == tj.FailedRuns { + return ti.FullName < tj.FullName + } + return ti.FailedRuns > tj.FailedRuns + } + return ti.FailureRate > tj.FailureRate + }) + if len(tab.allBuilds) > 0 { + tab.FailureRate = float32(len(tab.failedBuilds)) / float32(len(tab.allBuilds)) + } + tab.IssueBody += fmt.Sprintf("%s#%s failed %.1f%% (%d/%d) of the time\n", tab.DashboardName, tab.TabName, + 100*tab.FailureRate, len(tab.failedBuilds), len(tab.allBuilds)) + if len(tab.failedBuilds) > 0 { + tab.IssueBody += "
\nRecent failed test logs\n" + for _, header := range headerData.Headers { + if _, found := tab.failedBuilds[header.Build]; found { + tab.IssueBody += fmt.Sprintf("\n* %s", buildLogURL(tab.TabName, header)) + } + } + tab.IssueBody += "\n
\n
\nFailed tests\n" + for _, t := range tab.TestsWithFailures { + tab.IssueBody += fmt.Sprintf("\n* %s failed %.1f%% (%d/%d) of the time", t.FullName, t.FailureRate*100, t.FailedRuns, t.TotalRuns) + } + tab.IssueBody += "\n
\n" } - return summaries + fmt.Println(tab.IssueBody) } func processRow(dashboard, tab string, row *apipb.ListRowsResponse_Row, allTests []string, headers []*apipb.ListHeadersResponse_Header) *TestResultSummary { @@ -81,6 +138,8 @@ func processRow(dashboard, tab string, row *apipb.ListRowsResponse_Row, allTests earliestTimeToConsider := time.Now().AddDate(0, 0, -1*maxDays) total := 0 failed := 0 + allBuilds := map[string]struct{}{} + failedBuilds := map[string]struct{}{} logs := []string{} for i, cell := range row.Cells { // ignore tests with status not in the validTestStatuses @@ -96,10 +155,12 @@ func processRow(dashboard, tab string, row *apipb.ListRowsResponse_Row, allTests continue } total++ + allBuilds[header.Build] = struct{}{} if _, ok := failureTestStatusesInt[cell.Result]; ok { failed++ + failedBuilds[header.Build] = struct{}{} // markdown table format of | commit | log | - logs = append(logs, fmt.Sprintf("| %s | %s | https://prow.k8s.io/view/gs/kubernetes-jenkins/logs/%s/%s |", strings.Join(header.Extra, ","), header.Started.AsTime().String(), tab, header.Build)) + logs = append(logs, fmt.Sprintf("| %s | %s | %s |", strings.Join(header.Extra, ","), header.Started.AsTime().String(), buildLogURL(tab, header))) } if maxRuns > 0 && total >= maxRuns { break @@ -109,12 +170,12 @@ func processRow(dashboard, tab string, row *apipb.ListRowsResponse_Row, allTests t.TotalRuns = total t.FailureLogs = logs t.FailureRate = float32(failed) / float32(total) + t.failedBuilds = failedBuilds + t.allBuilds = allBuilds if t.FailedRuns > 0 { - dashboardUrl := fmt.Sprintf("[%s](https://testgrid.k8s.io/%s#%s)", tab, dashboard, tab) - t.IssueBody = fmt.Sprintf("## %s Test: %s \nTest failed %.1f%% (%d/%d) of the time\n\nfailure logs are:\n| commit | started | log |\n| --- | --- | --- |\n%s\n", - dashboardUrl, t.FullName, t.FailureRate*100, t.FailedRuns, t.TotalRuns, strings.Join(t.FailureLogs, "\n")) - t.IssueBody += "\nPlease follow the [instructions in the contributing guide](https://github.com/etcd-io/etcd/blob/main/CONTRIBUTING.md#check-for-flaky-tests) to reproduce the issue.\n" - fmt.Printf("%s failed %.1f%% (%d/%d) of the time\n", t.FullName, t.FailureRate*100, t.FailedRuns, t.TotalRuns) + t.IssueBody = fmt.Sprintf("## %s Test: %s \nTest failed %.1f%% (%d/%d) of the time\n\n
\nfailure logs:\n\n| commit | started | log |\n| --- | --- | --- |\n%s\n", + dashboardTabURL(dashboard, tab), t.FullName, t.FailureRate*100, t.FailedRuns, t.TotalRuns, strings.Join(t.FailureLogs, "\n")) + t.IssueBody += "\n
\n\nPlease follow the [instructions in the contributing guide](https://github.com/etcd-io/etcd/blob/main/CONTRIBUTING.md#check-for-flaky-tests) to reproduce the issue.\n" } return &t } @@ -150,6 +211,29 @@ func intStatusSet(statuses []statuspb.TestStatus) map[int32]struct{} { } func shortenTestName(fullname string) string { - parts := strings.Split(fullname, ".") - return parts[len(parts)-1] + parts := strings.Split(fullname, "/") + keepParts := []string{} + // keep the package name of the test. + for i := len(parts) - 1; i >= 0; i-- { + part := parts[i] + keepParts = append([]string{part}, keepParts...) + if strings.Contains(part, ".") { + break + } + } + return strings.Join(keepParts, "/") +} + +func mergeMaps(from, to map[string]struct{}) { + for k, v := range from { + to[k] = v + } +} + +func dashboardTabURL(dashboard, tab string) string { + return fmt.Sprintf("[%s](https://testgrid.k8s.io/%s#%s)", tab, dashboard, tab) +} + +func buildLogURL(tab string, header *apipb.ListHeadersResponse_Header) string { + return fmt.Sprintf("https://prow.k8s.io/view/gs/kubernetes-jenkins/logs/%s/%s", tab, header.Build) } diff --git a/tools/testgrid-analysis/cmd/data_test.go b/tools/testgrid-analysis/cmd/data_test.go new file mode 100644 index 00000000000..1dc8114a152 --- /dev/null +++ b/tools/testgrid-analysis/cmd/data_test.go @@ -0,0 +1,51 @@ +// Copyright 2024 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "testing" +) + +func TestShortenTestName(t *testing.T) { + tests := []struct { + testName string + shortName string + }{ + { + testName: "go.etcd.io/etcd/tests/v3/common.TestKVGet/ClientTLS", + shortName: "common.TestKVGet/ClientTLS", + }, + { + testName: "go.etcd.io/etcd/tests/v3/common.TestKVDelete/ClientTLS", + shortName: "common.TestKVDelete/ClientTLS", + }, + { + testName: "go.etcd.io/etcd/tests/v3/common.TestLeaseGrantAndList/ClientAutoTLS/many_leases", + shortName: "common.TestLeaseGrantAndList/ClientAutoTLS/many_leases", + }, + { + testName: "go.etcd.io/etcd/tests/v3/common.TestMoveLeaderWithInvalidAuth", + shortName: "common.TestMoveLeaderWithInvalidAuth", + }, + } + for _, tt := range tests { + t.Run(tt.testName, func(t *testing.T) { + shortName := shortenTestName(tt.testName) + if shortName != tt.shortName { + t.Errorf("Want %s, got %s", tt.shortName, shortName) + } + }) + } +} diff --git a/tools/testgrid-analysis/cmd/flaky.go b/tools/testgrid-analysis/cmd/flaky.go index 2752b81a873..38dddaa7f64 100644 --- a/tools/testgrid-analysis/cmd/flaky.go +++ b/tools/testgrid-analysis/cmd/flaky.go @@ -28,52 +28,59 @@ var flakyCmd = &cobra.Command{ Run: flakyFunc, } +var closeStaleIssuesCmd = &cobra.Command{ + Use: "auto-close-stale-issues", + Short: "auto close stale flaky test issues", + Long: `automatically close stale Github issues for flaky test.`, + Run: closeStaleIssuesFunc, +} + var ( - flakyThreshold float32 - minRuns int - maxRuns int - maxDays int - createGithubIssue bool - githubOwner string - githubRepo string + flakyThreshold float32 + maxSubIssuesForTestSet int + minRuns int + maxRuns int + maxDays int + autoCreateIssues bool + daysBeforeAutoClose int lineSep = "-------------------------------------------------------------" ) func init() { rootCmd.AddCommand(flakyCmd) + rootCmd.AddCommand(closeStaleIssuesCmd) - flakyCmd.Flags().BoolVar(&createGithubIssue, "create-issue", false, "create Github issue for each flaky test") flakyCmd.Flags().Float32Var(&flakyThreshold, "flaky-threshold", 0.1, "fraction threshold of test failures for a test to be considered flaky") - flakyCmd.Flags().IntVar(&minRuns, "min-runs", 20, "minimum test runs for a test to be included in flaky analysis") + flakyCmd.Flags().IntVar(&minRuns, "min-runs", 20, "minimum test runs for a test to be created an issue for") flakyCmd.Flags().IntVar(&maxRuns, "max-runs", 0, "maximum test runs for a test to be included in flaky analysis, 0 to include all") - flakyCmd.Flags().IntVar(&maxDays, "max-days", 0, "maximum days of results before today to be included in flaky analysis, 0 to include all") - flakyCmd.Flags().StringVar(&githubOwner, "github-owner", "etcd-io", "the github organization to create the issue for") - flakyCmd.Flags().StringVar(&githubRepo, "github-repo", "etcd", "the github repo to create the issue for") + flakyCmd.Flags().IntVar(&maxDays, "max-days", 30, "maximum days of results before today to be included in flaky analysis, 0 to include all") + flakyCmd.Flags().BoolVar(&autoCreateIssues, "auto-create-issues", false, "automatically create Github issue for flaky test") + flakyCmd.Flags().IntVar(&maxSubIssuesForTestSet, "max-sub-issues", 3, "maximum number of sub-issues to create for a test set") + + closeStaleIssuesCmd.Flags().IntVar(&daysBeforeAutoClose, "days-before-auto-close", 30, "maximum days of no updates before an issue is automatically closed") } func flakyFunc(cmd *cobra.Command, args []string) { - fmt.Printf("flaky called, for %s#%s, createGithubIssue=%v, githubRepo=%s/%s, flakyThreshold=%f, minRuns=%d\n", dashboard, tab, createGithubIssue, githubOwner, githubRepo, flakyThreshold, minRuns) - - allTests := fetchTestResultSummaries(dashboard, tab) - flakyTests := []*TestResultSummary{} - for _, t := range allTests { - if t.TotalRuns >= minRuns && t.FailureRate >= flakyThreshold { - flakyTests = append(flakyTests, t) - } - } fmt.Println(lineSep) - fmt.Printf("Detected total %d flaky tests above the %.0f%% threshold for %s#%s\n", len(flakyTests), flakyThreshold*100, dashboard, tab) + fmt.Printf("flaky called, for %s#%s, createGithubIssue=%v, githubRepo=%s/%s, flakyThreshold=%f, minRuns=%d\n", dashboard, tab, autoCreateIssues, githubOwner, githubRepo, flakyThreshold, minRuns) + + tabSummary := FetchTabResultSummary(dashboard, tab) fmt.Println(lineSep) - if len(flakyTests) == 0 { + if tabSummary.FailureRate < flakyThreshold { + fmt.Printf("Failure rate for test set %s#%s is %.1f%%, below the flaky threshold %.0f%%\n", dashboard, tab, tabSummary.FailureRate*100, flakyThreshold*100) return } - for _, t := range flakyTests { - fmt.Println(lineSep) - fmt.Println(t.IssueBody) - fmt.Println(lineSep) - } - if createGithubIssue { - createIssues(flakyTests, []string{"type/flake"}) + fmt.Printf("Failure rate for test set %s#%s is %.1f%%, above the flaky threshold %.0f%%\n", dashboard, tab, tabSummary.FailureRate*100, flakyThreshold*100) + if autoCreateIssues { + createIssues(tabSummary, minRuns, maxSubIssuesForTestSet, []string{"type/flake"}) } + fmt.Println(lineSep) +} + +func closeStaleIssuesFunc(cmd *cobra.Command, args []string) { + fmt.Println(lineSep) + fmt.Printf("auto close stale issues with no updates for %d days in githubRepo=%s/%s\n", daysBeforeAutoClose, githubOwner, githubRepo) + closeStaleIssues(daysBeforeAutoClose, []string{"type/flake"}) + fmt.Println(lineSep) } diff --git a/tools/testgrid-analysis/cmd/github.go b/tools/testgrid-analysis/cmd/github.go index 47445da4a8e..ed80afb27c3 100644 --- a/tools/testgrid-analysis/cmd/github.go +++ b/tools/testgrid-analysis/cmd/github.go @@ -19,15 +19,25 @@ import ( "fmt" "os" "strings" + "time" "github.com/google/go-github/v60/github" ) -func createIssues(tests []*TestResultSummary, labels []string) { - openIssues := getOpenIssues(labels) - for _, t := range tests { - createIssueIfNonExist(t, openIssues, append(labels, "help wanted")) +func createIssues(summary *TabResultSummary, minRuns, maxSubIssues int, labels []string) { + subIssues := []string{} + for _, t := range summary.TestsWithFailures { + if t.TotalRuns < minRuns { + continue + } + subIssue := createOrUpdateIssue(fmt.Sprintf("Flaky Test: %s", t.Name), t.IssueBody, labels) + subIssues = append(subIssues, subIssue) + if len(subIssues) >= maxSubIssues { + break + } } + body := summary.IssueBody + fmt.Sprintf("\n## Sub-issues\nauto created sub-issues for the top %d failed tests with at least %d runs:\n", maxSubIssues, minRuns) + strings.Join(subIssues, "\n") + createOrUpdateIssue(fmt.Sprintf("Flaky Test Set: %s", summary.TabName), body, labels) } func getOpenIssues(labels []string) []*github.Issue { @@ -54,25 +64,84 @@ func getOpenIssues(labels []string) []*github.Issue { return allIssues } -func createIssueIfNonExist(t *TestResultSummary, issues []*github.Issue, labels []string) { +func createOrUpdateIssue(title, body string, labels []string) string { + issues := getOpenIssues(labels) + newLabels := append(labels, "help wanted") + + var currentIssue *github.Issue + title = strings.TrimSpace(title) // check if there is already an open issue regarding this test for _, issue := range issues { - if strings.Contains(*issue.Title, t.Name) { - fmt.Printf("%s is already open for test %s\n\n", issue.GetHTMLURL(), t.Name) - return + if strings.Contains(*issue.Title, title) { + fmt.Printf("%s is already open for %s\n", issue.GetHTMLURL(), title) + currentIssue = issue + break } } - fmt.Printf("Opening new issue for %s\n", t.Name) client := github.NewClient(nil).WithAuthToken(os.Getenv("GITHUB_TOKEN")) ctx := context.Background() - req := &github.IssueRequest{ - Title: github.String(fmt.Sprintf("Flaky test %s", t.Name)), - Body: &t.IssueBody, - Labels: &labels, + if currentIssue == nil { + fmt.Printf("Opening new issue for %s\n", title) + req := &github.IssueRequest{ + Title: github.String(title), + Body: &body, + Labels: &newLabels, + } + issue, _, err := client.Issues.Create(ctx, githubOwner, githubRepo, req) + if err != nil { + panic(err) + } + fmt.Printf("New issue %s created for %s\n\n", issue.GetHTMLURL(), title) + return issue.GetHTMLURL() + } + // if the issue already exists, append comments + comment := &github.IssueComment{ + Body: &body, } - issue, _, err := client.Issues.Create(ctx, githubOwner, githubRepo, req) + issueComment, _, err := client.Issues.CreateComment(ctx, githubOwner, githubRepo, *currentIssue.Number, comment) if err != nil { panic(err) } - fmt.Printf("New issue %s created for %s\n\n", issue.GetHTMLURL(), t.Name) + fmt.Printf("New comment %s created for %s\n\n", issueComment.GetHTMLURL(), title) + return currentIssue.GetHTMLURL() +} + +func closeStaleIssues(daysBeforeAutoClose int, labels []string) { + earliestTimeToConsider := time.Now().AddDate(0, 0, -1*daysBeforeAutoClose) + issues := getOpenIssues(labels) + client := github.NewClient(nil).WithAuthToken(os.Getenv("GITHUB_TOKEN")) + ctx := context.Background() + newLabels := append(labels, "stale") + warning := fmt.Sprintf("auto close due to no updates for %d days", daysBeforeAutoClose) + state := "closed" + cnt := 0 + + for _, issue := range issues { + if issue.UpdatedAt.Before(earliestTimeToConsider) { + fmt.Printf("closing stale issue %s last updated at %s\n", *issue.HTMLURL, issue.UpdatedAt.String()) + comment := &github.IssueComment{ + Body: &warning, + } + _, _, err := client.Issues.CreateComment(ctx, githubOwner, githubRepo, *issue.Number, comment) + if err != nil { + panic(err) + } + + req := &github.IssueRequest{ + Labels: &newLabels, + State: &state, + } + respIssue, _, err := client.Issues.Edit(ctx, githubOwner, githubRepo, *issue.Number, req) + if err != nil { + panic(err) + } + if *respIssue.State == "closed" { + fmt.Printf("closed stale issue %s\n", *issue.HTMLURL) + cnt++ + } else { + fmt.Printf("failed to close stale issue %s\n", *issue.HTMLURL) + } + } + } + fmt.Printf("closed %d/%d stale issues\n", cnt, len(issues)) } diff --git a/tools/testgrid-analysis/cmd/root.go b/tools/testgrid-analysis/cmd/root.go index 046bdec4fad..c01d0db6374 100644 --- a/tools/testgrid-analysis/cmd/root.go +++ b/tools/testgrid-analysis/cmd/root.go @@ -21,8 +21,10 @@ import ( ) var ( - dashboard string - tab string + dashboard string + tab string + githubOwner string + githubRepo string ) var rootCmd = &cobra.Command{ @@ -41,4 +43,6 @@ func Execute() { func init() { rootCmd.PersistentFlags().StringVar(&dashboard, "dashboard", "sig-etcd-periodics", "testgrid dashboard to retrieve data from") rootCmd.PersistentFlags().StringVar(&tab, "tab", "ci-etcd-e2e-amd64", "testgrid tab within the dashboard to retrieve data from") + rootCmd.PersistentFlags().StringVar(&githubOwner, "github-owner", "etcd-io", "the github organization to create the issue for") + rootCmd.PersistentFlags().StringVar(&githubRepo, "github-repo", "etcd", "the github repo to create the issue for") }