Skip to content
This repository was archived by the owner on Jul 19, 2023. It is now read-only.

Commit cf54388

Browse files
authored
Patch release for v1.0.1 (#55)
* Fixing README (#45) * Updated License Badge Colors/Logo (#46) * Allow custom subnets in canaries (#47) * Allow custom subnets in canaries * Renamed canary EKS cluster * Build integration test container programmatically (#48) * Added script to deploy new integration container * Add AWSCLI to alpine container * Fixed incorrect script path * Modified AWSCLI installation * Start docker daemon * Removed sudo * Added docker daemon nohup * Move into tests to build * Added comments and documentation references * Float release semver up to major and minor tags (#50) * Adding non-ephemeral canary support (#51) * Fixing HPO/BT deletion resource leak when SageMaker throttles Describe (#52) * Fixing HPOJob leak when SageMaker throttles DescribeHPO requests * Fixing BatchTransformJob leak when SageMaker throttles DescribeBatchTransformJob requests * Do not delete non-ephemeral cluster (#54) * Push smlogs binaries with tags (#53) * Added tagged prefix binaries * Added full variables path * Proper printf format * Move import before logging * Renamed deployment_constants
1 parent 5a1a1ab commit cf54388

15 files changed

+230
-55
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# Amazon SageMaker Operators for Kubernetes
2-
![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/aws/amazon-sagemaker-operator-for-k8s?sort=semver)
3-
[![License](https://img.shields.io/badge/license-Apache--2.0-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0)
4-
![GitHub go.mod Go version](https://img.shields.io/github/go-mod/go-version/aws/amazon-sagemaker-operator-for-k8s)
2+
![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/aws/amazon-sagemaker-operator-for-k8s?sort=semver&logo=amazon-aws&color=232F3E)
3+
[![License](https://img.shields.io/badge/license-Apache--2.0-blue.svg?color=success)](http://www.apache.org/licenses/LICENSE-2.0)
4+
![GitHub go.mod Go version](https://img.shields.io/github/go-mod/go-version/aws/amazon-sagemaker-operator-for-k8s?color=69D7E5)
55

66
## Introduction
7-
Amazon SageMaker Operators for Kubernetes are operators that can be used to train machine learning models, optimize hyperparameters for a given model, run batch transform jobs over existing models, and set up inference endpoints. With these operators, users can manage their jobs in Amazon SageMaker from their Kubernetes cluster.
7+
Amazon SageMaker Operators for Kubernetes are operators that can be used to train machine learning models, optimize hyperparameters for a given model, run batch transform jobs over existing models, and set up inference endpoints. With these operators, users can manage their jobs in Amazon SageMaker from their Kubernetes cluster in Amazon Elastic Kubernetes Service [EKS](http://aws.amazon.com/eks).
88

99
## Usage
1010

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# This CodeBuild project is run using the docker:stable-dind container
2+
# Docker daemon start-up script was taken from the following URL:
3+
# https://docs.aws.amazon.com/codebuild/latest/userguide/sample-docker-custom-image.html
4+
5+
version: 0.2
6+
phases:
7+
install:
8+
commands:
9+
- nohup /usr/local/bin/dockerd --host=unix:///var/run/docker.sock --host=tcp://127.0.0.1:2375 --storage-driver=overlay2&
10+
- timeout 15 sh -c "until docker info; do echo .; sleep 1; done"
11+
pre_build:
12+
commands:
13+
# Add AWSCLI and bash
14+
- (apk add --update python python-dev py-pip build-base bash && pip install awscli --upgrade)
15+
build:
16+
commands:
17+
# Build new integration test container
18+
- (IMG=$INTEGRATION_CONTAINER_REPOSITORY bash codebuild/scripts/build_deploy_integration_container.sh)
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash
2+
3+
# This script will build the integration test container. This container contains
4+
# all the tools necessary for running the build and test steps for each of the
5+
# CodeBuild projects. The script will also tag the container with the latest
6+
# commit SHA, and with the "latest" tag, then push to an ECR repository.
7+
8+
set -x
9+
10+
# Build new integration test container
11+
pushd tests
12+
IMG=$INTEGRATION_CONTAINER_REPOSITORY bash build_integration.sh
13+
popd
14+
15+
# Log into ECR
16+
$(aws ecr get-login --no-include-email --region $REGION --registry-ids $AWS_ACCOUNT_ID)
17+
18+
# Tag the container with SHA and latest
19+
docker tag $INTEGRATION_CONTAINER_REPOSITORY $AWS_ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$INTEGRATION_CONTAINER_REPOSITORY:$CODEBUILD_RESOLVED_SOURCE_VERSION
20+
docker tag $INTEGRATION_CONTAINER_REPOSITORY $AWS_ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$INTEGRATION_CONTAINER_REPOSITORY:latest
21+
22+
# Push the newly tagged containers
23+
docker push $AWS_ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$INTEGRATION_CONTAINER_REPOSITORY:$CODEBUILD_RESOLVED_SOURCE_VERSION
24+
docker push $AWS_ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$INTEGRATION_CONTAINER_REPOSITORY:latest
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
RELEASE_BUCKET_NAME_FMT="%s-%s"
2+
3+
RELEASE_BINARY_PREFIX_FMT="s3://%s/kubectl-smlogs-plugin"
4+
ALPHA_BINARY_PREFIX_FMT="s3://%s/%s"
5+
6+
ALPHA_LINUX_BINARY_PATH_FMT="%s/kubectl-smlogs-plugin.linux.amd64.tar.gz"
7+
ALPHA_DARWIN_BINARY_PATH_FMT="%s/kubectl-smlogs-plugin.darwin.amd64.tar.gz"
8+
9+
RELEASE_LINUX_BINARY_PATH_FMT="%s/%s/linux.amd64.tar.gz"
10+
RELEASE_DARWIN_BINARY_PATH_FMT="%s/%s/darwin.amd64.tar.gz"
11+
12+
PUBLIC_CP_ARGS="--acl public-read"

codebuild/scripts/package_alpha_operators.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#!/bin/bash
22

3-
set -x
4-
53
source codebuild/scripts/package_operators.sh
64

5+
set -x
6+
77
# Login to alpha ECR
88
$(aws ecr get-login --no-include-email --region $ALPHA_REPOSITORY_REGION --registry-ids $ALPHA_ACCOUNT_ID)
99

codebuild/scripts/package_operators.sh

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
11
#!/bin/bash
22

3+
source codebuild/scripts/deployment_constants.sh
4+
35
set -e
46

7+
# Define alpha artifact locations
8+
printf -v ALPHA_BUCKET_PREFIX $ALPHA_BINARY_PREFIX_FMT $ALPHA_TARBALL_BUCKET $CODEBUILD_RESOLVED_SOURCE_VERSION
9+
10+
printf -v ALPHA_LINUX_BINARY_PATH $ALPHA_LINUX_BINARY_PATH_FMT $ALPHA_BUCKET_PREFIX
11+
printf -v ALPHA_DARWIN_BINARY_PATH $ALPHA_DARWIN_BINARY_PATH_FMT $ALPHA_BUCKET_PREFIX
12+
513
# This function deploys a region-specific operator to an ECR prod repo from the existing
614
# image in the alpha repository. The function also copies across the smlogs binaries
715
# from the alpha tarball bucket into the production buckets.
@@ -34,18 +42,15 @@ function deploy_from_alpha()
3442
docker push ${dest_ecr_image}:$CODEBUILD_RESOLVED_SOURCE_VERSION
3543
docker push ${dest_ecr_image}:latest
3644

37-
local bucket_name="${RELEASE_TARBALL_BUCKET_PREFIX}-${account_region}"
38-
local binary_prefix="s3://${bucket_name}/kubectl-smlogs-plugin"
39-
local alpha_prefix="s3://$ALPHA_TARBALL_BUCKET/${CODEBUILD_RESOLVED_SOURCE_VERSION}"
40-
41-
local cp_args="--acl public-read"
45+
printf -v bucket_name $RELEASE_BUCKET_NAME_FMT $RELEASE_TARBALL_BUCKET_PREFIX $account_region
46+
printf -v binary_prefix $RELEASE_BINARY_PREFIX_FMT $bucket_name
4247

4348
# Copy across the binaries and set as latest
44-
aws s3 cp "${alpha_prefix}/kubectl-smlogs-plugin.linux.amd64.tar.gz" "${binary_prefix}/${CODEBUILD_RESOLVED_SOURCE_VERSION}/linux.amd64.tar.gz" ${cp_args}
45-
aws s3 cp "${alpha_prefix}/kubectl-smlogs-plugin.linux.amd64.tar.gz" "${binary_prefix}/latest/linux.amd64.tar.gz" ${cp_args}
49+
aws s3 cp "$ALPHA_LINUX_BINARY_PATH" "$(printf $RELEASE_LINUX_BINARY_PATH_FMT $binary_prefix $CODEBUILD_RESOLVED_SOURCE_VERSION)" $PUBLIC_CP_ARGS
50+
aws s3 cp "$ALPHA_LINUX_BINARY_PATH" "$(printf $RELEASE_LINUX_BINARY_PATH_FMT $binary_prefix latest)" $PUBLIC_CP_ARGS
4651

47-
aws s3 cp "${alpha_prefix}/kubectl-smlogs-plugin.darwin.amd64.tar.gz" "${binary_prefix}/${CODEBUILD_RESOLVED_SOURCE_VERSION}/darwin.amd64.tar.gz" ${cp_args}
48-
aws s3 cp "${alpha_prefix}/kubectl-smlogs-plugin.darwin.amd64.tar.gz" "${binary_prefix}/latest/darwin.amd64.tar.gz" ${cp_args}
52+
aws s3 cp "$ALPHA_DARWIN_BINARY_PATH" "$(printf $RELEASE_DARWIN_BINARY_PATH_FMT $binary_prefix $CODEBUILD_RESOLVED_SOURCE_VERSION)" $PUBLIC_CP_ARGS
53+
aws s3 cp "$ALPHA_DARWIN_BINARY_PATH" "$(printf $RELEASE_DARWIN_BINARY_PATH_FMT $binary_prefix latest)" $PUBLIC_CP_ARGS
4954
}
5055

5156
# This function builds, packages and deploys a region-specific operator to an ECR repo and output bucket.
@@ -109,8 +114,8 @@ function package_operator()
109114
tar cvzf kubectl-smlogs-plugin.linux.amd64.tar.gz kubectl-smlogs.linux.amd64
110115
tar cvzf kubectl-smlogs-plugin.darwin.amd64.tar.gz kubectl-smlogs.darwin.amd64
111116

112-
aws s3 cp kubectl-smlogs-plugin.linux.amd64.tar.gz "s3://$ALPHA_TARBALL_BUCKET/${CODEBUILD_RESOLVED_SOURCE_VERSION}/kubectl-smlogs-plugin.linux.amd64.tar.gz"
113-
aws s3 cp kubectl-smlogs-plugin.darwin.amd64.tar.gz "s3://$ALPHA_TARBALL_BUCKET/${CODEBUILD_RESOLVED_SOURCE_VERSION}/kubectl-smlogs-plugin.darwin.amd64.tar.gz"
117+
aws s3 cp kubectl-smlogs-plugin.linux.amd64.tar.gz "$ALPHA_LINUX_BINARY_PATH"
118+
aws s3 cp kubectl-smlogs-plugin.darwin.amd64.tar.gz "$ALPHA_DARWIN_BINARY_PATH"
114119
popd
115120
fi
116121

@@ -119,6 +124,6 @@ function package_operator()
119124
tar cvzf sagemaker-k8s-operator.tar.gz sagemaker-k8s-operator
120125

121126
# Upload the final tar ball to s3 with standard name and git SHA
122-
aws s3 cp sagemaker-k8s-operator.tar.gz "s3://$ALPHA_TARBALL_BUCKET/${CODEBUILD_RESOLVED_SOURCE_VERSION}/sagemaker-k8s-operator-${account_region}${tarball_suffix}.tar.gz"
127+
aws s3 cp sagemaker-k8s-operator.tar.gz "$ALPHA_BUCKET_PREFIX/sagemaker-k8s-operator-${account_region}${tarball_suffix}.tar.gz"
123128
popd
124129
}

codebuild/scripts/release_tag.sh

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
11
#!/bin/bash
22

3+
source codebuild/scripts/deployment_constants.sh
4+
35
set -e
46

7+
# Define alpha artifact locations
8+
printf -v ALPHA_BUCKET_PREFIX $ALPHA_BINARY_PREFIX_FMT $ALPHA_TARBALL_BUCKET $CODEBUILD_RESOLVED_SOURCE_VERSION
9+
10+
printf -v ALPHA_LINUX_BINARY_PATH $ALPHA_LINUX_BINARY_PATH_FMT $ALPHA_BUCKET_PREFIX
11+
printf -v ALPHA_DARWIN_BINARY_PATH $ALPHA_DARWIN_BINARY_PATH_FMT $ALPHA_BUCKET_PREFIX
12+
513
# This function will pull an existing image + tag and push it with a new tag.
614
# Parameter:
715
# $1: The repository and image to pull from.
@@ -18,10 +26,26 @@ function retag_image()
1826
docker push $image:$new_tag
1927
}
2028

21-
CODEBUILD_GIT_TAG="$(git describe --tags --exact-match 2>/dev/null)"
29+
# This function will push artifacts to their own folder with a given tag.
30+
# Parameter:
31+
# $1: The new tag to push for the artifacts.
32+
# $2: The region of the new artifacts.
33+
function retag_binaries()
34+
{
35+
local new_tag="$1"
36+
local region="$2"
37+
38+
printf -v release_bucket $RELEASE_BUCKET_NAME_FMT $RELEASE_TARBALL_BUCKET_PREFIX $region
39+
printf -v binary_prefix $RELEASE_BINARY_PREFIX_FMT $release_bucket
40+
41+
aws s3 cp "$ALPHA_LINUX_BINARY_PATH" "$(printf $RELEASE_LINUX_BINARY_PATH_FMT $binary_prefix $new_tag)" $PUBLIC_CP_ARGS
42+
aws s3 cp "$ALPHA_DARWIN_BINARY_PATH" "$(printf $RELEASE_DARWIN_BINARY_PATH_FMT $binary_prefix $new_tag)" $PUBLIC_CP_ARGS
43+
}
44+
45+
GIT_TAG="$(git describe --tags --exact-match 2>/dev/null)"
2246

2347
# Only run the release process for tagged commits
24-
if [ "$CODEBUILD_GIT_TAG" == "" ]; then
48+
if [ "$GIT_TAG" == "" ]; then
2549
exit 0
2650
fi
2751

@@ -45,9 +69,21 @@ for row in $(echo ${ACCOUNTS_ESCAPED} | jq -r '.[] | @base64'); do
4569

4670
image=${repository_account}.dkr.ecr.${region}.amazonaws.com/${image_repository}
4771
old_tag="${CODEBUILD_RESOLVED_SOURCE_VERSION}"
48-
new_tag="${CODEBUILD_GIT_TAG}"
72+
full_tag="${GIT_TAG}"
73+
74+
# Get minor and major version tags
75+
[[ $GIT_TAG =~ ^v[0-9]+\.[0-9]+ ]] && minor_tag="${BASH_REMATCH[0]}"
76+
[[ $GIT_TAG =~ ^v[0-9]+ ]] && major_tag="${BASH_REMATCH[0]}"
77+
78+
echo "Tagging $region with $full_tag"
79+
80+
retag_image "$image" "$old_tag" "$full_tag"
81+
retag_image "$image" "$old_tag" "$minor_tag"
82+
retag_image "$image" "$old_tag" "$major_tag"
4983

50-
echo "Tagging $image:$old_tag to $image:$new_tag"
84+
retag_binaries "$full_tag" "$region"
85+
retag_binaries "$minor_tag" "$region"
86+
retag_binaries "$major_tag" "$region"
5187

52-
retag_image "$image" "$old_tag" "$new_tag"
88+
echo "Finished tagging $region with $full_tag"
5389
done

controllers/batchtransformjob/batchtransformjob_controller.go

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ func (r *BatchTransformJobReconciler) reconcileJob(ctx reconcileRequestContext)
155155
} else {
156156

157157
ctx.Log.Info("Error getting batchtransformjob state in SageMaker", "requestErr", requestErr)
158-
return r.handleSageMakerApiFailure(ctx, requestErr)
158+
return r.handleSageMakerApiFailure(ctx, requestErr, false)
159159
}
160160

161161
}
@@ -180,12 +180,9 @@ func (r *BatchTransformJobReconciler) reconcileJobDeletion(ctx reconcileRequestC
180180
} else {
181181
// Case 2
182182
log.Info("Sagemaker returns 4xx or 5xx or unrecoverable API Error")
183-
if requestErr.StatusCode() == 400 {
184-
// handleSageMakerAPIFailure does not remove the finalizer
185-
r.removeFinalizerAndUpdate(ctx)
186-
}
183+
187184
// Handle the 500 or unrecoverable API Error
188-
return r.handleSageMakerApiFailure(ctx, requestErr)
185+
return r.handleSageMakerApiFailure(ctx, requestErr, true)
189186
}
190187
} else {
191188
log.Info("Job exists in Sagemaker, lets delete it")
@@ -227,7 +224,7 @@ func (r *BatchTransformJobReconciler) deleteBatchTransformJobIfFinalizerExists(c
227224
_, err := req.Send(ctx)
228225
if err != nil {
229226
log.Error(err, "Unable to stop the job in sagemaker", "context", ctx)
230-
return r.handleSageMakerApiFailure(ctx, err)
227+
return r.handleSageMakerApiFailure(ctx, err, false)
231228
}
232229

233230
return RequeueImmediately()
@@ -301,7 +298,7 @@ func (r *BatchTransformJobReconciler) reconcileSpecWithDescription(ctx reconcile
301298
return NoRequeue()
302299
}
303300

304-
func (r *BatchTransformJobReconciler) handleSageMakerApiFailure(ctx reconcileRequestContext, apiErr error) (ctrl.Result, error) {
301+
func (r *BatchTransformJobReconciler) handleSageMakerApiFailure(ctx reconcileRequestContext, apiErr error, allowRemoveFinalizer bool) (ctrl.Result, error) {
305302
if err := r.updateJobStatus(ctx, batchtransformjobv1.BatchTransformJobStatus{
306303
Additional: apiErr.Error(),
307304
LastCheckTime: Now(),
@@ -316,6 +313,11 @@ func (r *BatchTransformJobReconciler) handleSageMakerApiFailure(ctx reconcileReq
316313
ctx.Log.Info("SageMaker rate limit exceeded, will retry", "err", awsErr)
317314
return RequeueAfterInterval(r.PollInterval, nil)
318315
} else if awsErr.StatusCode() == 400 {
316+
317+
if allowRemoveFinalizer {
318+
return r.removeFinalizerAndUpdate(ctx)
319+
}
320+
319321
return NoRequeue()
320322
} else {
321323
return RequeueAfterInterval(r.PollInterval, nil)
@@ -357,7 +359,7 @@ func (r *BatchTransformJobReconciler) createBatchTransformJob(ctx reconcileReque
357359
return RequeueImmediately()
358360
}
359361
ctx.Log.Info("Unable to create Transform job", "createError", createError)
360-
return r.handleSageMakerApiFailure(ctx, createError)
362+
return r.handleSageMakerApiFailure(ctx, createError, false)
361363
}
362364

363365
func (r *BatchTransformJobReconciler) getSageMakerDescription(ctx reconcileRequestContext) (*sagemaker.DescribeTransformJobOutput, awserr.RequestFailure) {

controllers/batchtransformjob/batchtransformjob_controller_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,34 @@ var _ = Describe("Reconciling a job with finalizer that is being deleted", func(
444444
Expect(job.Status.TransformJobStatus).To(ContainSubstring(string(sagemaker.TransformJobStatusStopping)))
445445
})
446446

447+
It("should update the status and retry if SageMaker throttles", func() {
448+
rateExceededMessage := "Rate exceeded"
449+
// Setup mock responses.
450+
sageMakerClient := builder.
451+
AddDescribeTransformJobErrorResponse("ThrottlingException", 400, "request id", rateExceededMessage).
452+
Build()
453+
454+
// Instantiate controller and reconciliation request.
455+
controller := createTransformJobReconcilerForSageMakerClient(k8sClient, sageMakerClient, 1)
456+
request := CreateReconciliationRequest(job.ObjectMeta.Name, job.ObjectMeta.Namespace)
457+
458+
// Run test and verify expectations.
459+
reconciliationResult, err := controller.Reconcile(request)
460+
461+
Expect(receivedRequests.Len()).To(Equal(1))
462+
Expect(err).ToNot(HaveOccurred())
463+
Expect(reconciliationResult.Requeue).To(Equal(false))
464+
Expect(reconciliationResult.RequeueAfter).To(Equal(controller.PollInterval))
465+
466+
// Verify status is updated.
467+
err = k8sClient.Get(context.Background(), types.NamespacedName{
468+
Namespace: job.ObjectMeta.Namespace,
469+
Name: job.ObjectMeta.Name,
470+
}, job)
471+
472+
Expect(job.Status.Additional).To(ContainSubstring(rateExceededMessage))
473+
})
474+
447475
It("should remove the finalizer and not requeue if the job is stopped", func() {
448476
description.TransformJobStatus = sagemaker.TransformJobStatusStopped
449477
// Setup mock responses.

controllers/controllertest/mock_sagemaker_client.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,9 +274,9 @@ func (m *MockSageMakerClientBuilder) AddDescribeEndpointErrorResponse(code strin
274274
}
275275

276276
// Add a DescribeTrainingJob error response to the client.
277-
func (m *MockSageMakerClientBuilder) AddDescribeTransformJobErrorResponse(code string, statusCode int, reqId string) *MockSageMakerClientBuilder {
277+
func (m *MockSageMakerClientBuilder) AddDescribeTransformJobErrorResponse(code string, statusCode int, reqId, message string) *MockSageMakerClientBuilder {
278278
m.responses.PushBack(describeTransformJobResponse{
279-
err: awserr.NewRequestFailure(awserr.New(code, "mock error message", fmt.Errorf(code)), statusCode, reqId),
279+
err: awserr.NewRequestFailure(awserr.New(code, message, fmt.Errorf(code)), statusCode, reqId),
280280
data: nil,
281281
})
282282
return m

controllers/hyperparametertuningjob/hyperparametertuningjob_controller.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ func (r *HyperparameterTuningJobReconciler) reconcileJob(ctx reconcileRequestCon
161161
} else {
162162

163163
ctx.Log.Info("Error getting HPO state in SageMaker", "requestErr", requestErr)
164-
return r.handleSageMakerApiFailure(ctx, requestErr)
164+
return r.handleSageMakerApiFailure(ctx, requestErr, false)
165165
}
166166
}
167167

@@ -184,12 +184,8 @@ func (r *HyperparameterTuningJobReconciler) reconcileJobDeletion(ctx reconcileRe
184184
} else {
185185
// Case 2
186186
log.Info("Sagemaker returns 4xx or 5xx or unrecoverable API Error")
187-
if requestErr.StatusCode() == 400 {
188-
// handleSageMakerAPIFailure does not remove the finalizer
189-
r.removeFinalizerAndUpdate(ctx)
190-
}
191187
// Handle the 500 or unrecoverable API Error
192-
return r.handleSageMakerApiFailure(ctx, requestErr)
188+
return r.handleSageMakerApiFailure(ctx, requestErr, true)
193189
}
194190
} else {
195191
log.Info("Job exists in Sagemaker, lets delete it")
@@ -231,7 +227,7 @@ func (r *HyperparameterTuningJobReconciler) deleteHyperparameterTuningJobIfFinal
231227
_, err := req.Send(ctx)
232228
if err != nil {
233229
log.Error(err, "Unable to stop the job in sagemaker", "context", ctx)
234-
return r.handleSageMakerApiFailure(ctx, err)
230+
return r.handleSageMakerApiFailure(ctx, err, false)
235231
}
236232

237233
return RequeueImmediately()
@@ -321,13 +317,13 @@ func (r *HyperparameterTuningJobReconciler) createHyperParameterTuningJob(ctx re
321317
return RequeueImmediately()
322318
} else {
323319
ctx.Log.Info("Unable to create HPO job", "createError", createError)
324-
return r.handleSageMakerApiFailure(ctx, createError)
320+
return r.handleSageMakerApiFailure(ctx, createError, false)
325321

326322
}
327323
}
328324

329325
// Update job status with error. If error had a 400 HTTP error code then do not requeue, otherwise requeue after interval.
330-
func (r *HyperparameterTuningJobReconciler) handleSageMakerApiFailure(ctx reconcileRequestContext, apiErr error) (ctrl.Result, error) {
326+
func (r *HyperparameterTuningJobReconciler) handleSageMakerApiFailure(ctx reconcileRequestContext, apiErr error, allowRemoveFinalizer bool) (ctrl.Result, error) {
331327

332328
if err := r.updateJobStatus(ctx, hpojobv1.HyperparameterTuningJobStatus{
333329
Additional: apiErr.Error(),
@@ -344,6 +340,11 @@ func (r *HyperparameterTuningJobReconciler) handleSageMakerApiFailure(ctx reconc
344340
ctx.Log.Info("SageMaker rate limit exceeded, will retry", "err", awsErr)
345341
return RequeueAfterInterval(r.PollInterval, nil)
346342
} else if awsErr.StatusCode() == 400 {
343+
344+
if allowRemoveFinalizer {
345+
return r.removeFinalizerAndUpdate(ctx)
346+
}
347+
347348
return NoRequeue()
348349
} else {
349350
return RequeueAfterInterval(r.PollInterval, nil)

0 commit comments

Comments
 (0)