From ac1898d0b737f8a402e4e7939ccfe8878dc69f6d Mon Sep 17 00:00:00 2001 From: Kieron Browne Date: Mon, 11 Jul 2022 12:33:26 +0100 Subject: [PATCH] Include exit code in task failure message When a task's job fails, copy the reason from the task's container status (which will generally be 'Error'), and construct the message including the exit code - i.e. "Failed with exit code: N" Issue: https://github.com/cloudfoundry/korifi/issues/1048 Co-authored-by: Georgi Sabev --- cmd/wiring/task_reconciler.go | 2 +- k8s/jobs/status.go | 60 ++++++-- k8s/jobs/status_test.go | 135 ++++++++++++++++-- .../fake_task_status_getter.go | 23 +-- k8s/reconciler/task.go | 9 +- k8s/reconciler/task_test.go | 17 ++- tests/eats/tasks_crd_test.go | 5 +- 7 files changed, 218 insertions(+), 33 deletions(-) diff --git a/cmd/wiring/task_reconciler.go b/cmd/wiring/task_reconciler.go index 3d848b115..f44448b5e 100644 --- a/cmd/wiring/task_reconciler.go +++ b/cmd/wiring/task_reconciler.go @@ -41,7 +41,7 @@ func createTaskReconciler( ) desirer := jobs.NewDesirer(logger, taskToJobConverter, controllerClient, scheme) - statusGetter := jobs.NewStatusGetter(logger) + statusGetter := jobs.NewStatusGetter(logger, controllerClient) return reconciler.NewTask(logger, controllerClient, desirer, statusGetter, cfg.TaskTTLSeconds) } diff --git a/k8s/jobs/status.go b/k8s/jobs/status.go index 212993927..10b08c5a2 100644 --- a/k8s/jobs/status.go +++ b/k8s/jobs/status.go @@ -2,24 +2,30 @@ package jobs import ( "context" + "fmt" eiriniv1 "code.cloudfoundry.org/eirini-controller/pkg/apis/eirini/v1" "code.cloudfoundry.org/lager" batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" ) type StatusGetter struct { - logger lager.Logger + logger lager.Logger + k8sClient client.Client } -func NewStatusGetter(logger lager.Logger) *StatusGetter { +func NewStatusGetter(logger lager.Logger, k8sClient client.Client) *StatusGetter { return &StatusGetter{ - logger: logger, + logger: logger, + k8sClient: k8sClient, } } -func (s *StatusGetter) GetStatusConditions(ctx context.Context, job *batchv1.Job) []metav1.Condition { +func (s *StatusGetter) GetStatusConditions(ctx context.Context, job *batchv1.Job) ([]metav1.Condition, error) { + logger := s.logger.Session("get status condtions", lager.Data{"name": job.Name, "namespace": job.Namespace}) conditions := []metav1.Condition{ { Type: eiriniv1.TaskInitializedConditionType, @@ -30,7 +36,7 @@ func (s *StatusGetter) GetStatusConditions(ctx context.Context, job *batchv1.Job } if job.Status.StartTime == nil { - return conditions + return conditions, nil } conditions = append(conditions, metav1.Condition{ @@ -53,16 +59,54 @@ func (s *StatusGetter) GetStatusConditions(ctx context.Context, job *batchv1.Job lastFailureTimestamp := getLastFailureTimestamp(job.Status) if job.Status.Failed > 0 && lastFailureTimestamp != nil { + terminationState, err := s.getFailedContainerStatus(ctx, job) + if err != nil { + logger.Error("failed to get container status", err) + + return nil, fmt.Errorf("failed to get container status: %w", err) + } + conditions = append(conditions, metav1.Condition{ Type: eiriniv1.TaskFailedConditionType, Status: metav1.ConditionTrue, LastTransitionTime: *lastFailureTimestamp, - Reason: "job_failed", - Message: "Job failed", + Reason: terminationState.Reason, + Message: fmt.Sprintf("Failed with exit code: %d", terminationState.ExitCode), }) } - return conditions + return conditions, nil +} + +func (s *StatusGetter) getFailedContainerStatus(ctx context.Context, job *batchv1.Job) (*corev1.ContainerStateTerminated, error) { + var jobPods corev1.PodList + if err := s.k8sClient.List(ctx, &jobPods, client.InNamespace(job.Namespace), client.MatchingLabels{"job-name": job.Name}); err != nil { + return nil, err + } + + if len(jobPods.Items) > 1 { + return nil, fmt.Errorf("found more than one pod for job %s:%s", job.Namespace, job.Name) + } + + if len(jobPods.Items) == 0 { + return nil, fmt.Errorf("no pods found for job %s:%s", job.Namespace, job.Name) + } + + jobPod := jobPods.Items[0] + + for _, containerStatus := range jobPod.Status.ContainerStatuses { + if containerStatus.Name != jobPod.Annotations[AnnotationTaskContainerName] { + continue + } + + if containerStatus.State.Terminated == nil { + return nil, fmt.Errorf("no terminated state found for job %s:%s", job.Namespace, job.Name) + } + + return containerStatus.State.Terminated, nil + } + + return nil, fmt.Errorf("no task container found for job %s:%s", job.Namespace, job.Name) } func getLastFailureTimestamp(jobStatus batchv1.JobStatus) *metav1.Time { diff --git a/k8s/jobs/status_test.go b/k8s/jobs/status_test.go index 18ba3c3a7..ce0f42ce9 100644 --- a/k8s/jobs/status_test.go +++ b/k8s/jobs/status_test.go @@ -5,20 +5,26 @@ import ( "time" "code.cloudfoundry.org/eirini-controller/k8s/jobs" + "code.cloudfoundry.org/eirini-controller/k8s/k8sfakes" eiriniv1 "code.cloudfoundry.org/eirini-controller/pkg/apis/eirini/v1" "code.cloudfoundry.org/eirini-controller/tests" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "github.com/pkg/errors" batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" ) var _ = Describe("StatusGetter", func() { var ( - statusGetter *jobs.StatusGetter - job *batchv1.Job - conditions []metav1.Condition + statusGetter *jobs.StatusGetter + job *batchv1.Job + conditions []metav1.Condition + conditionsErr error + k8sClient *k8sfakes.FakeClient ) BeforeEach(func() { @@ -26,11 +32,16 @@ var _ = Describe("StatusGetter", func() { Status: batchv1.JobStatus{}, } - statusGetter = jobs.NewStatusGetter(tests.NewTestLogger("status_getter_test")) + k8sClient = new(k8sfakes.FakeClient) + statusGetter = jobs.NewStatusGetter(tests.NewTestLogger("status_getter_test"), k8sClient) }) JustBeforeEach(func() { - conditions = statusGetter.GetStatusConditions(context.Background(), job) + conditions, conditionsErr = statusGetter.GetStatusConditions(context.Background(), job) + }) + + It("succeeds", func() { + Expect(conditionsErr).NotTo(HaveOccurred()) }) It("returns an initialized condition", func() { @@ -81,14 +92,19 @@ var _ = Describe("StatusGetter", func() { When("the job has failed", func() { var ( - now metav1.Time - later metav1.Time + now metav1.Time + later metav1.Time + podList corev1.PodList ) BeforeEach(func() { now = metav1.Now() later = metav1.NewTime(now.Add(time.Hour)) job = &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-job", + Namespace: "my-ns", + }, Status: batchv1.JobStatus{ Conditions: []batchv1.JobCondition{ { @@ -108,11 +124,112 @@ var _ = Describe("StatusGetter", func() { Failed: 1, }, } + + podList = corev1.PodList{ + Items: []corev1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + jobs.AnnotationTaskContainerName: "simon", + }, + }, + Status: corev1.PodStatus{ + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "henry", + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + Reason: "Error", + }, + }, + }, + { + Name: "simon", + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 42, + Reason: "Error", + }, + }, + }, + }, + }, + }, + }, + } + + k8sClient.ListStub = func(ctx context.Context, objList client.ObjectList, opts ...client.ListOption) error { + list, ok := objList.(*corev1.PodList) + Expect(ok).To(BeTrue()) + *list = podList + + return nil + } }) - It("returns a failed status", func() { + It("returns a failed status with values from the failed container", func() { Expect(meta.IsStatusConditionTrue(conditions, eiriniv1.TaskFailedConditionType)).To(BeTrue()) - Expect(meta.FindStatusCondition(conditions, eiriniv1.TaskFailedConditionType).LastTransitionTime).To(Equal(later)) + failedCondition := meta.FindStatusCondition(conditions, eiriniv1.TaskFailedConditionType) + Expect(failedCondition.LastTransitionTime).To(Equal(later)) + + Expect(k8sClient.ListCallCount()).To(Equal(1)) + _, listObj, opts := k8sClient.ListArgsForCall(0) + Expect(listObj).To(BeAssignableToTypeOf(&corev1.PodList{})) + Expect(opts).To(ContainElement(client.InNamespace("my-ns"))) + Expect(opts).To(ContainElement(client.MatchingLabels{"job-name": "my-job"})) + + Expect(failedCondition.Message).To(Equal("Failed with exit code: 42")) + }) + + When("listing the job pods fails", func() { + BeforeEach(func() { + k8sClient.ListReturns(errors.New("boom")) + }) + + It("returns the error", func() { + Expect(conditionsErr).To(MatchError(ContainSubstring("boom"))) + }) + }) + + When("we get more than one pod for a job", func() { + BeforeEach(func() { + podList.Items = append(podList.Items, podList.Items[0]) + }) + + It("returns an error", func() { + Expect(conditionsErr).To(MatchError(ContainSubstring("found more than one pod for job"))) + }) + }) + + When("there are no pods for the job", func() { + BeforeEach(func() { + podList.Items = nil + }) + + It("returns an error", func() { + Expect(conditionsErr).To(MatchError(ContainSubstring("no pods found for job"))) + }) + }) + + When("task container does not have termination status", func() { + BeforeEach(func() { + podList.Items[0].Status.ContainerStatuses[1].State.Terminated = nil + }) + + It("returns an error", func() { + Expect(conditionsErr).To(MatchError(ContainSubstring("no terminated state found"))) + }) + }) + + When("task container is not found", func() { + BeforeEach(func() { + podList.Items[0].Annotations[jobs.AnnotationTaskContainerName] = "foo" + }) + + It("returns an error", func() { + Expect(conditionsErr).To(MatchError(ContainSubstring("no task container found"))) + }) }) }) }) diff --git a/k8s/reconciler/reconcilerfakes/fake_task_status_getter.go b/k8s/reconciler/reconcilerfakes/fake_task_status_getter.go index 7e70aa267..c7f2bf0c3 100644 --- a/k8s/reconciler/reconcilerfakes/fake_task_status_getter.go +++ b/k8s/reconciler/reconcilerfakes/fake_task_status_getter.go @@ -11,7 +11,7 @@ import ( ) type FakeTaskStatusGetter struct { - GetStatusConditionsStub func(context.Context, *v1a.Job) []v1.Condition + GetStatusConditionsStub func(context.Context, *v1a.Job) ([]v1.Condition, error) getStatusConditionsMutex sync.RWMutex getStatusConditionsArgsForCall []struct { arg1 context.Context @@ -19,15 +19,17 @@ type FakeTaskStatusGetter struct { } getStatusConditionsReturns struct { result1 []v1.Condition + result2 error } getStatusConditionsReturnsOnCall map[int]struct { result1 []v1.Condition + result2 error } invocations map[string][][]interface{} invocationsMutex sync.RWMutex } -func (fake *FakeTaskStatusGetter) GetStatusConditions(arg1 context.Context, arg2 *v1a.Job) []v1.Condition { +func (fake *FakeTaskStatusGetter) GetStatusConditions(arg1 context.Context, arg2 *v1a.Job) ([]v1.Condition, error) { fake.getStatusConditionsMutex.Lock() ret, specificReturn := fake.getStatusConditionsReturnsOnCall[len(fake.getStatusConditionsArgsForCall)] fake.getStatusConditionsArgsForCall = append(fake.getStatusConditionsArgsForCall, struct { @@ -42,9 +44,9 @@ func (fake *FakeTaskStatusGetter) GetStatusConditions(arg1 context.Context, arg2 return stub(arg1, arg2) } if specificReturn { - return ret.result1 + return ret.result1, ret.result2 } - return fakeReturns.result1 + return fakeReturns.result1, fakeReturns.result2 } func (fake *FakeTaskStatusGetter) GetStatusConditionsCallCount() int { @@ -53,7 +55,7 @@ func (fake *FakeTaskStatusGetter) GetStatusConditionsCallCount() int { return len(fake.getStatusConditionsArgsForCall) } -func (fake *FakeTaskStatusGetter) GetStatusConditionsCalls(stub func(context.Context, *v1a.Job) []v1.Condition) { +func (fake *FakeTaskStatusGetter) GetStatusConditionsCalls(stub func(context.Context, *v1a.Job) ([]v1.Condition, error)) { fake.getStatusConditionsMutex.Lock() defer fake.getStatusConditionsMutex.Unlock() fake.GetStatusConditionsStub = stub @@ -66,27 +68,30 @@ func (fake *FakeTaskStatusGetter) GetStatusConditionsArgsForCall(i int) (context return argsForCall.arg1, argsForCall.arg2 } -func (fake *FakeTaskStatusGetter) GetStatusConditionsReturns(result1 []v1.Condition) { +func (fake *FakeTaskStatusGetter) GetStatusConditionsReturns(result1 []v1.Condition, result2 error) { fake.getStatusConditionsMutex.Lock() defer fake.getStatusConditionsMutex.Unlock() fake.GetStatusConditionsStub = nil fake.getStatusConditionsReturns = struct { result1 []v1.Condition - }{result1} + result2 error + }{result1, result2} } -func (fake *FakeTaskStatusGetter) GetStatusConditionsReturnsOnCall(i int, result1 []v1.Condition) { +func (fake *FakeTaskStatusGetter) GetStatusConditionsReturnsOnCall(i int, result1 []v1.Condition, result2 error) { fake.getStatusConditionsMutex.Lock() defer fake.getStatusConditionsMutex.Unlock() fake.GetStatusConditionsStub = nil if fake.getStatusConditionsReturnsOnCall == nil { fake.getStatusConditionsReturnsOnCall = make(map[int]struct { result1 []v1.Condition + result2 error }) } fake.getStatusConditionsReturnsOnCall[i] = struct { result1 []v1.Condition - }{result1} + result2 error + }{result1, result2} } func (fake *FakeTaskStatusGetter) Invocations() map[string][][]interface{} { diff --git a/k8s/reconciler/task.go b/k8s/reconciler/task.go index db52d77c1..de77569b8 100644 --- a/k8s/reconciler/task.go +++ b/k8s/reconciler/task.go @@ -34,7 +34,7 @@ type TaskDesirer interface { //counterfeiter:generate . TaskStatusGetter type TaskStatusGetter interface { - GetStatusConditions(ctx context.Context, job *batchv1.Job) []metav1.Condition + GetStatusConditions(ctx context.Context, job *batchv1.Job) ([]metav1.Condition, error) } func NewTask(logger lager.Logger, @@ -127,7 +127,12 @@ func (t *Task) desireTask(ctx context.Context, logger lager.Logger, task *eirini func (t *Task) updateTaskStatus(ctx context.Context, task *eiriniv1.Task, job *batchv1.Job) error { originalTask := task.DeepCopy() - for _, condition := range t.statusGetter.GetStatusConditions(ctx, job) { + conditions, err := t.statusGetter.GetStatusConditions(ctx, job) + if err != nil { + return fmt.Errorf("failed to get status conditions for job %s:%s: %w", job.Namespace, job.Name, err) + } + + for _, condition := range conditions { meta.SetStatusCondition(&task.Status.Conditions, condition) } diff --git a/k8s/reconciler/task_test.go b/k8s/reconciler/task_test.go index b49686cee..f4367e3d2 100644 --- a/k8s/reconciler/task_test.go +++ b/k8s/reconciler/task_test.go @@ -108,7 +108,7 @@ var _ = Describe("Task", func() { Type: eiriniv1.TaskInitializedConditionType, Status: metav1.ConditionTrue, }, - }) + }, nil) }) JustBeforeEach(func() { @@ -132,6 +132,17 @@ var _ = Describe("Task", func() { Expect(namespacedName.Name).To(Equal("my-name")) }) + When("getting status condition fails", func() { + BeforeEach(func() { + statusGetter.GetStatusConditionsReturns(nil, errors.New("boom")) + getJobErr = nil + }) + + It("returns an error", func() { + Expect(reconcileErr).To(MatchError(ContainSubstring("boom"))) + }) + }) + When("the task cannot be found", func() { BeforeEach(func() { getTaskErr = k8serrors.NewNotFound(schema.GroupResource{}, "foo") @@ -333,7 +344,7 @@ var _ = Describe("Task", func() { Type: eiriniv1.TaskFailedConditionType, Status: metav1.ConditionTrue, }, - }) + }, nil) }) It("requeues the event after the ttl", func() { @@ -348,7 +359,7 @@ var _ = Describe("Task", func() { Type: eiriniv1.TaskSucceededConditionType, Status: metav1.ConditionTrue, }, - }) + }, nil) }) It("requeues the event after the ttl", func() { diff --git a/tests/eats/tasks_crd_test.go b/tests/eats/tasks_crd_test.go index f2cadd528..7fbb9392e 100644 --- a/tests/eats/tasks_crd_test.go +++ b/tests/eats/tasks_crd_test.go @@ -171,7 +171,10 @@ var _ = Describe("Tasks CRD [needs-logs-for: eirini-controller]", func() { g.Expect(err).NotTo(HaveOccurred()) g.Expect(meta.IsStatusConditionTrue(status.Conditions, eiriniv1.TaskFailedConditionType)).Should(BeTrue()) - g.Expect(meta.FindStatusCondition(status.Conditions, eiriniv1.TaskFailedConditionType).LastTransitionTime).NotTo(BeZero()) + cond := meta.FindStatusCondition(status.Conditions, eiriniv1.TaskFailedConditionType) + g.Expect(cond.LastTransitionTime).NotTo(BeZero()) + g.Expect(cond.Reason).To(Equal("Error")) + g.Expect(cond.Message).To(Equal("Failed with exit code: 1")) }).Should(Succeed()) }) })