This repository has been archived by the owner on Jul 11, 2023. It is now read-only.

Commit

Include exit code in task failure message
When a task's job fails, copy the reason from the task's container
status (which will generally be 'Error') and construct the failure
message from the exit code, i.e. "Failed with exit code: N".

Issue: cloudfoundry/korifi#1048
Co-authored-by: Georgi Sabev <[email protected]>
Kieron Browne and georgethebeatle committed Jul 11, 2022
1 parent bc26c2f commit ac1898d
Showing 7 changed files with 218 additions and 33 deletions.
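Editorial note: before the diff itself, here is a minimal, self-contained sketch (not part of this commit) of the idea: the failed condition's Reason is copied from the task container's terminated state and its Message is built from the exit code. The condition type string "Failed" and the helper name failureCondition are illustrative stand-ins; the real implementation is in k8s/jobs/status.go below.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// failureCondition mirrors, in simplified form, how the new code derives a
// task's failed condition from its container's terminated state.
func failureCondition(term *corev1.ContainerStateTerminated, at metav1.Time) metav1.Condition {
	return metav1.Condition{
		Type:               "Failed", // stand-in for eiriniv1.TaskFailedConditionType
		Status:             metav1.ConditionTrue,
		LastTransitionTime: at,
		Reason:             term.Reason, // copied from the container status, generally "Error"
		Message:            fmt.Sprintf("Failed with exit code: %d", term.ExitCode),
	}
}

func main() {
	term := &corev1.ContainerStateTerminated{ExitCode: 42, Reason: "Error"}
	fmt.Println(failureCondition(term, metav1.Now()).Message) // prints: Failed with exit code: 42
}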
2 changes: 1 addition & 1 deletion cmd/wiring/task_reconciler.go
@@ -41,7 +41,7 @@ func createTaskReconciler(
)

desirer := jobs.NewDesirer(logger, taskToJobConverter, controllerClient, scheme)
- statusGetter := jobs.NewStatusGetter(logger)
+ statusGetter := jobs.NewStatusGetter(logger, controllerClient)

return reconciler.NewTask(logger, controllerClient, desirer, statusGetter, cfg.TaskTTLSeconds)
}
60 changes: 52 additions & 8 deletions k8s/jobs/status.go
@@ -2,24 +2,30 @@ package jobs

import (
"context"
"fmt"

eiriniv1 "code.cloudfoundry.org/eirini-controller/pkg/apis/eirini/v1"
"code.cloudfoundry.org/lager"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

type StatusGetter struct {
- logger lager.Logger
+ logger    lager.Logger
+ k8sClient client.Client
}

- func NewStatusGetter(logger lager.Logger) *StatusGetter {
+ func NewStatusGetter(logger lager.Logger, k8sClient client.Client) *StatusGetter {
return &StatusGetter{
- logger: logger,
+ logger:    logger,
+ k8sClient: k8sClient,
}
}

- func (s *StatusGetter) GetStatusConditions(ctx context.Context, job *batchv1.Job) []metav1.Condition {
+ func (s *StatusGetter) GetStatusConditions(ctx context.Context, job *batchv1.Job) ([]metav1.Condition, error) {
logger := s.logger.Session("get status condtions", lager.Data{"name": job.Name, "namespace": job.Namespace})
conditions := []metav1.Condition{
{
Type: eiriniv1.TaskInitializedConditionType,
@@ -30,7 +36,7 @@ func (s *StatusGetter) GetStatusConditions(ctx context.Context, job *batchv1.Job
}

if job.Status.StartTime == nil {
- return conditions
+ return conditions, nil
}

conditions = append(conditions, metav1.Condition{
@@ -53,16 +59,54 @@ func (s *StatusGetter) GetStatusConditions(ctx context.Context, job *batchv1.Job

lastFailureTimestamp := getLastFailureTimestamp(job.Status)
if job.Status.Failed > 0 && lastFailureTimestamp != nil {
terminationState, err := s.getFailedContainerStatus(ctx, job)
if err != nil {
logger.Error("failed to get container status", err)

return nil, fmt.Errorf("failed to get container status: %w", err)
}

conditions = append(conditions, metav1.Condition{
Type: eiriniv1.TaskFailedConditionType,
Status: metav1.ConditionTrue,
LastTransitionTime: *lastFailureTimestamp,
- Reason: "job_failed",
- Message: "Job failed",
+ Reason: terminationState.Reason,
+ Message: fmt.Sprintf("Failed with exit code: %d", terminationState.ExitCode),
})
}

- return conditions
+ return conditions, nil
}

func (s *StatusGetter) getFailedContainerStatus(ctx context.Context, job *batchv1.Job) (*corev1.ContainerStateTerminated, error) {
var jobPods corev1.PodList
if err := s.k8sClient.List(ctx, &jobPods, client.InNamespace(job.Namespace), client.MatchingLabels{"job-name": job.Name}); err != nil {
return nil, err
}

if len(jobPods.Items) > 1 {
return nil, fmt.Errorf("found more than one pod for job %s:%s", job.Namespace, job.Name)
}

if len(jobPods.Items) == 0 {
return nil, fmt.Errorf("no pods found for job %s:%s", job.Namespace, job.Name)
}

jobPod := jobPods.Items[0]

for _, containerStatus := range jobPod.Status.ContainerStatuses {
if containerStatus.Name != jobPod.Annotations[AnnotationTaskContainerName] {
continue
}

if containerStatus.State.Terminated == nil {
return nil, fmt.Errorf("no terminated state found for job %s:%s", job.Namespace, job.Name)
}

return containerStatus.State.Terminated, nil
}

return nil, fmt.Errorf("no task container found for job %s:%s", job.Namespace, job.Name)
}

func getLastFailureTimestamp(jobStatus batchv1.JobStatus) *metav1.Time {
135 changes: 126 additions & 9 deletions k8s/jobs/status_test.go
@@ -5,32 +5,43 @@ import (
"time"

"code.cloudfoundry.org/eirini-controller/k8s/jobs"
"code.cloudfoundry.org/eirini-controller/k8s/k8sfakes"
eiriniv1 "code.cloudfoundry.org/eirini-controller/pkg/apis/eirini/v1"
"code.cloudfoundry.org/eirini-controller/tests"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/pkg/errors"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

var _ = Describe("StatusGetter", func() {
var (
- statusGetter *jobs.StatusGetter
- job *batchv1.Job
- conditions []metav1.Condition
+ statusGetter  *jobs.StatusGetter
+ job           *batchv1.Job
+ conditions    []metav1.Condition
+ conditionsErr error
+ k8sClient     *k8sfakes.FakeClient
)

BeforeEach(func() {
job = &batchv1.Job{
Status: batchv1.JobStatus{},
}

- statusGetter = jobs.NewStatusGetter(tests.NewTestLogger("status_getter_test"))
+ k8sClient = new(k8sfakes.FakeClient)
+ statusGetter = jobs.NewStatusGetter(tests.NewTestLogger("status_getter_test"), k8sClient)
})

JustBeforeEach(func() {
- conditions = statusGetter.GetStatusConditions(context.Background(), job)
+ conditions, conditionsErr = statusGetter.GetStatusConditions(context.Background(), job)
})

It("succeeds", func() {
Expect(conditionsErr).NotTo(HaveOccurred())
})

It("returns an initialized condition", func() {
@@ -81,14 +92,19 @@ var _ = Describe("StatusGetter", func() {

When("the job has failed", func() {
var (
- now metav1.Time
- later metav1.Time
+ now     metav1.Time
+ later   metav1.Time
+ podList corev1.PodList
)

BeforeEach(func() {
now = metav1.Now()
later = metav1.NewTime(now.Add(time.Hour))
job = &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: "my-job",
Namespace: "my-ns",
},
Status: batchv1.JobStatus{
Conditions: []batchv1.JobCondition{
{
@@ -108,11 +124,112 @@ var _ = Describe("StatusGetter", func() {
Failed: 1,
},
}

podList = corev1.PodList{
Items: []corev1.Pod{
{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
jobs.AnnotationTaskContainerName: "simon",
},
},
Status: corev1.PodStatus{
ContainerStatuses: []corev1.ContainerStatus{
{
Name: "henry",
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 1,
Reason: "Error",
},
},
},
{
Name: "simon",
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 42,
Reason: "Error",
},
},
},
},
},
},
},
}

k8sClient.ListStub = func(ctx context.Context, objList client.ObjectList, opts ...client.ListOption) error {
list, ok := objList.(*corev1.PodList)
Expect(ok).To(BeTrue())
*list = podList

return nil
}
})

- It("returns a failed status", func() {
+ It("returns a failed status with values from the failed container", func() {
Expect(meta.IsStatusConditionTrue(conditions, eiriniv1.TaskFailedConditionType)).To(BeTrue())
- Expect(meta.FindStatusCondition(conditions, eiriniv1.TaskFailedConditionType).LastTransitionTime).To(Equal(later))
+ failedCondition := meta.FindStatusCondition(conditions, eiriniv1.TaskFailedConditionType)
+ Expect(failedCondition.LastTransitionTime).To(Equal(later))

Expect(k8sClient.ListCallCount()).To(Equal(1))
_, listObj, opts := k8sClient.ListArgsForCall(0)
Expect(listObj).To(BeAssignableToTypeOf(&corev1.PodList{}))
Expect(opts).To(ContainElement(client.InNamespace("my-ns")))
Expect(opts).To(ContainElement(client.MatchingLabels{"job-name": "my-job"}))

Expect(failedCondition.Message).To(Equal("Failed with exit code: 42"))
})

When("listing the job pods fails", func() {
BeforeEach(func() {
k8sClient.ListReturns(errors.New("boom"))
})

It("returns the error", func() {
Expect(conditionsErr).To(MatchError(ContainSubstring("boom")))
})
})

When("we get more than one pod for a job", func() {
BeforeEach(func() {
podList.Items = append(podList.Items, podList.Items[0])
})

It("returns an error", func() {
Expect(conditionsErr).To(MatchError(ContainSubstring("found more than one pod for job")))
})
})

When("there are no pods for the job", func() {
BeforeEach(func() {
podList.Items = nil
})

It("returns an error", func() {
Expect(conditionsErr).To(MatchError(ContainSubstring("no pods found for job")))
})
})

When("task container does not have termination status", func() {
BeforeEach(func() {
podList.Items[0].Status.ContainerStatuses[1].State.Terminated = nil
})

It("returns an error", func() {
Expect(conditionsErr).To(MatchError(ContainSubstring("no terminated state found")))
})
})

When("task container is not found", func() {
BeforeEach(func() {
podList.Items[0].Annotations[jobs.AnnotationTaskContainerName] = "foo"
})

It("returns an error", func() {
Expect(conditionsErr).To(MatchError(ContainSubstring("no task container found")))
})
})
})
})
23 changes: 14 additions & 9 deletions k8s/reconciler/reconcilerfakes/fake_task_status_getter.go

Some generated files are not rendered by default.

