Skip to content
This repository has been archived by the owner on Jul 19, 2023. It is now read-only.

Commit

Permalink
Patch release for v1.0.1 (#55)
Browse files Browse the repository at this point in the history
* Fixing README (#45)

* Updated License Badge Colors/Logo (#46)

* Allow custom subnets in canaries (#47)

* Allow custom subnets in canaries

* Renamed canary EKS cluster

* Build integration test container programmatically (#48)

* Added script to deploy new integration container

* Add AWSCLI to alpine container

* Fixed incorrect script path

* Modified AWSCLI installation

* Start docker daemon

* Removed sudo

* Added docker daemon nohup

* Move into tests to build

* Added comments and documentation references

* Float release semver up to major and minor tags (#50)

* Adding non-ephemeral canary support (#51)

* Fixing HPO/BT deletion resource leak when SageMaker throttles Describe (#52)

* Fixing HPOJob leak when SageMaker throttles DescribeHPO requests

* Fixing BatchTransformJob leak when SageMaker throttles DescribeBatchTransformJob requests

* Do not delete non-ephemeral cluster (#54)

* Push smlogs binaries with tags (#53)

* Added tagged prefix binaries

* Added full variables path

* Proper printf format

* Move import before logging

* Renamed deployment_constants
  • Loading branch information
goswamig committed Dec 9, 2019
1 parent 5a1a1ab commit cf54388
Show file tree
Hide file tree
Showing 15 changed files with 230 additions and 55 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# Amazon SageMaker Operators for Kubernetes
![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/aws/amazon-sagemaker-operator-for-k8s?sort=semver)
[![License](https://img.shields.io/badge/license-Apache--2.0-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0)
![GitHub go.mod Go version](https://img.shields.io/github/go-mod/go-version/aws/amazon-sagemaker-operator-for-k8s)
![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/aws/amazon-sagemaker-operator-for-k8s?sort=semver&logo=amazon-aws&color=232F3E)
[![License](https://img.shields.io/badge/license-Apache--2.0-blue.svg?color=success)](http://www.apache.org/licenses/LICENSE-2.0)
![GitHub go.mod Go version](https://img.shields.io/github/go-mod/go-version/aws/amazon-sagemaker-operator-for-k8s?color=69D7E5)

## Introduction
Amazon SageMaker Operators for Kubernetes are operators that can be used to train machine learning models, optimize hyperparameters for a given model, run batch transform jobs over existing models, and set up inference endpoints. With these operators, users can manage their jobs in Amazon SageMaker from their Kubernetes cluster.
Amazon SageMaker Operators for Kubernetes are operators that can be used to train machine learning models, optimize hyperparameters for a given model, run batch transform jobs over existing models, and set up inference endpoints. With these operators, users can manage their jobs in Amazon SageMaker from their Kubernetes cluster in Amazon Elastic Kubernetes Service [EKS](http://aws.amazon.com/eks).

## Usage

Expand Down
18 changes: 18 additions & 0 deletions codebuild/build_integration_container.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# This CodeBuild project is run using the docker:stable-dind container
# Docker daemon start-up script was taken from the following URL:
# https://docs.aws.amazon.com/codebuild/latest/userguide/sample-docker-custom-image.html

version: 0.2
phases:
  install:
    commands:
      # Start the Docker daemon in the background (docker-in-docker image),
      # then poll `docker info` for up to 15s until the daemon is responsive.
      - nohup /usr/local/bin/dockerd --host=unix:///var/run/docker.sock --host=tcp://127.0.0.1:2375 --storage-driver=overlay2&
      - timeout 15 sh -c "until docker info; do echo .; sleep 1; done"
  pre_build:
    commands:
      # Add AWSCLI and bash (alpine base lacks both)
      - (apk add --update python python-dev py-pip build-base bash && pip install awscli --upgrade)
  build:
    commands:
      # Build new integration test container and push it to ECR
      - (IMG=$INTEGRATION_CONTAINER_REPOSITORY bash codebuild/scripts/build_deploy_integration_container.sh)
24 changes: 24 additions & 0 deletions codebuild/scripts/build_deploy_integration_container.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash

# This script will build the integration test container. This container contains
# all the tools necessary for running the build and test steps for each of the
# CodeBuild projects. The script will also tag the container with the latest
# commit SHA, and with the "latest" tag, then push to an ECR repository.
#
# Required environment variables:
#   INTEGRATION_CONTAINER_REPOSITORY - local image name / ECR repository name
#   AWS_ACCOUNT_ID                   - account owning the ECR repository
#   REGION                           - region of the ECR repository
#   CODEBUILD_RESOLVED_SOURCE_VERSION - commit SHA used as the image tag

# -e: abort on the first failure so we never tag or push a stale/broken image.
# -x: echo each command into the CodeBuild log for debugging.
set -ex

# Build new integration test container
pushd tests
IMG="$INTEGRATION_CONTAINER_REPOSITORY" bash build_integration.sh
popd

# Log into ECR so the pushes below are authorized.
# NOTE(review): `aws ecr get-login` is AWS CLI v1 only; CLI v2 replaces it
# with `aws ecr get-login-password | docker login` — confirm the CLI version
# installed in the build image before upgrading.
$(aws ecr get-login --no-include-email --region "$REGION" --registry-ids "$AWS_ACCOUNT_ID")

# Fully-qualified ECR image name, computed once instead of repeated inline.
ECR_IMAGE="$AWS_ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$INTEGRATION_CONTAINER_REPOSITORY"

# Tag the container with SHA and latest
docker tag "$INTEGRATION_CONTAINER_REPOSITORY" "$ECR_IMAGE:$CODEBUILD_RESOLVED_SOURCE_VERSION"
docker tag "$INTEGRATION_CONTAINER_REPOSITORY" "$ECR_IMAGE:latest"

# Push the newly tagged containers
docker push "$ECR_IMAGE:$CODEBUILD_RESOLVED_SOURCE_VERSION"
docker push "$ECR_IMAGE:latest"
12 changes: 12 additions & 0 deletions codebuild/scripts/deployment_constants.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Shared printf format strings for the deployment scripts. This file is
# sourced (not executed) by package_operators.sh and release_tag.sh, which
# expand the placeholders via `printf -v`.

# Release bucket name: "<bucket prefix>-<region>".
RELEASE_BUCKET_NAME_FMT="%s-%s"

# Prefix for released smlogs binaries: "s3://<bucket>/kubectl-smlogs-plugin".
RELEASE_BINARY_PREFIX_FMT="s3://%s/kubectl-smlogs-plugin"
# Prefix for alpha artifacts: "s3://<alpha bucket>/<commit SHA>".
ALPHA_BINARY_PREFIX_FMT="s3://%s/%s"

# Alpha tarball locations, rooted at the alpha prefix above.
ALPHA_LINUX_BINARY_PATH_FMT="%s/kubectl-smlogs-plugin.linux.amd64.tar.gz"
ALPHA_DARWIN_BINARY_PATH_FMT="%s/kubectl-smlogs-plugin.darwin.amd64.tar.gz"

# Release tarball locations: "<release prefix>/<version tag>/<os>.amd64.tar.gz".
RELEASE_LINUX_BINARY_PATH_FMT="%s/%s/linux.amd64.tar.gz"
RELEASE_DARWIN_BINARY_PATH_FMT="%s/%s/darwin.amd64.tar.gz"

# Extra args for `aws s3 cp` so released binaries are world-readable.
PUBLIC_CP_ARGS="--acl public-read"
4 changes: 2 additions & 2 deletions codebuild/scripts/package_alpha_operators.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#!/bin/bash

set -x

source codebuild/scripts/package_operators.sh

set -x

# Login to alpha ECR
$(aws ecr get-login --no-include-email --region $ALPHA_REPOSITORY_REGION --registry-ids $ALPHA_ACCOUNT_ID)

Expand Down
29 changes: 17 additions & 12 deletions codebuild/scripts/package_operators.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
#!/bin/bash

source codebuild/scripts/deployment_constants.sh

set -e

# Define alpha artifact locations
printf -v ALPHA_BUCKET_PREFIX $ALPHA_BINARY_PREFIX_FMT $ALPHA_TARBALL_BUCKET $CODEBUILD_RESOLVED_SOURCE_VERSION

printf -v ALPHA_LINUX_BINARY_PATH $ALPHA_LINUX_BINARY_PATH_FMT $ALPHA_BUCKET_PREFIX
printf -v ALPHA_DARWIN_BINARY_PATH $ALPHA_DARWIN_BINARY_PATH_FMT $ALPHA_BUCKET_PREFIX

# This function deploys a region-specific operator to an ECR prod repo from the existing
# image in the alpha repository. The function also copies across the smlogs binaries
# from the alpha tarball bucket into the production buckets.
Expand Down Expand Up @@ -34,18 +42,15 @@ function deploy_from_alpha()
docker push ${dest_ecr_image}:$CODEBUILD_RESOLVED_SOURCE_VERSION
docker push ${dest_ecr_image}:latest

local bucket_name="${RELEASE_TARBALL_BUCKET_PREFIX}-${account_region}"
local binary_prefix="s3://${bucket_name}/kubectl-smlogs-plugin"
local alpha_prefix="s3://$ALPHA_TARBALL_BUCKET/${CODEBUILD_RESOLVED_SOURCE_VERSION}"

local cp_args="--acl public-read"
printf -v bucket_name $RELEASE_BUCKET_NAME_FMT $RELEASE_TARBALL_BUCKET_PREFIX $account_region
printf -v binary_prefix $RELEASE_BINARY_PREFIX_FMT $bucket_name

# Copy across the binaries and set as latest
aws s3 cp "${alpha_prefix}/kubectl-smlogs-plugin.linux.amd64.tar.gz" "${binary_prefix}/${CODEBUILD_RESOLVED_SOURCE_VERSION}/linux.amd64.tar.gz" ${cp_args}
aws s3 cp "${alpha_prefix}/kubectl-smlogs-plugin.linux.amd64.tar.gz" "${binary_prefix}/latest/linux.amd64.tar.gz" ${cp_args}
aws s3 cp "$ALPHA_LINUX_BINARY_PATH" "$(printf $RELEASE_LINUX_BINARY_PATH_FMT $binary_prefix $CODEBUILD_RESOLVED_SOURCE_VERSION)" $PUBLIC_CP_ARGS
aws s3 cp "$ALPHA_LINUX_BINARY_PATH" "$(printf $RELEASE_LINUX_BINARY_PATH_FMT $binary_prefix latest)" $PUBLIC_CP_ARGS

aws s3 cp "${alpha_prefix}/kubectl-smlogs-plugin.darwin.amd64.tar.gz" "${binary_prefix}/${CODEBUILD_RESOLVED_SOURCE_VERSION}/darwin.amd64.tar.gz" ${cp_args}
aws s3 cp "${alpha_prefix}/kubectl-smlogs-plugin.darwin.amd64.tar.gz" "${binary_prefix}/latest/darwin.amd64.tar.gz" ${cp_args}
aws s3 cp "$ALPHA_DARWIN_BINARY_PATH" "$(printf $RELEASE_DARWIN_BINARY_PATH_FMT $binary_prefix $CODEBUILD_RESOLVED_SOURCE_VERSION)" $PUBLIC_CP_ARGS
aws s3 cp "$ALPHA_DARWIN_BINARY_PATH" "$(printf $RELEASE_DARWIN_BINARY_PATH_FMT $binary_prefix latest)" $PUBLIC_CP_ARGS
}

# This function builds, packages and deploys a region-specific operator to an ECR repo and output bucket.
Expand Down Expand Up @@ -109,8 +114,8 @@ function package_operator()
tar cvzf kubectl-smlogs-plugin.linux.amd64.tar.gz kubectl-smlogs.linux.amd64
tar cvzf kubectl-smlogs-plugin.darwin.amd64.tar.gz kubectl-smlogs.darwin.amd64

aws s3 cp kubectl-smlogs-plugin.linux.amd64.tar.gz "s3://$ALPHA_TARBALL_BUCKET/${CODEBUILD_RESOLVED_SOURCE_VERSION}/kubectl-smlogs-plugin.linux.amd64.tar.gz"
aws s3 cp kubectl-smlogs-plugin.darwin.amd64.tar.gz "s3://$ALPHA_TARBALL_BUCKET/${CODEBUILD_RESOLVED_SOURCE_VERSION}/kubectl-smlogs-plugin.darwin.amd64.tar.gz"
aws s3 cp kubectl-smlogs-plugin.linux.amd64.tar.gz "$ALPHA_LINUX_BINARY_PATH"
aws s3 cp kubectl-smlogs-plugin.darwin.amd64.tar.gz "$ALPHA_DARWIN_BINARY_PATH"
popd
fi

Expand All @@ -119,6 +124,6 @@ function package_operator()
tar cvzf sagemaker-k8s-operator.tar.gz sagemaker-k8s-operator

# Upload the final tar ball to s3 with standard name and git SHA
aws s3 cp sagemaker-k8s-operator.tar.gz "s3://$ALPHA_TARBALL_BUCKET/${CODEBUILD_RESOLVED_SOURCE_VERSION}/sagemaker-k8s-operator-${account_region}${tarball_suffix}.tar.gz"
aws s3 cp sagemaker-k8s-operator.tar.gz "$ALPHA_BUCKET_PREFIX/sagemaker-k8s-operator-${account_region}${tarball_suffix}.tar.gz"
popd
}
46 changes: 41 additions & 5 deletions codebuild/scripts/release_tag.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
#!/bin/bash

source codebuild/scripts/deployment_constants.sh

set -e

# Define alpha artifact locations
printf -v ALPHA_BUCKET_PREFIX $ALPHA_BINARY_PREFIX_FMT $ALPHA_TARBALL_BUCKET $CODEBUILD_RESOLVED_SOURCE_VERSION

printf -v ALPHA_LINUX_BINARY_PATH $ALPHA_LINUX_BINARY_PATH_FMT $ALPHA_BUCKET_PREFIX
printf -v ALPHA_DARWIN_BINARY_PATH $ALPHA_DARWIN_BINARY_PATH_FMT $ALPHA_BUCKET_PREFIX

# This function will pull an existing image + tag and push it with a new tag.
# Parameter:
# $1: The repository and image to pull from.
Expand All @@ -18,10 +26,26 @@ function retag_image()
docker push $image:$new_tag
}

CODEBUILD_GIT_TAG="$(git describe --tags --exact-match 2>/dev/null)"
# This function will push artifacts to their own folder with a given tag.
# Parameter:
# $1: The new tag to push for the artifacts.
# $2: The region of the new artifacts.
function retag_binaries()
{
local new_tag="$1"
local region="$2"

printf -v release_bucket $RELEASE_BUCKET_NAME_FMT $RELEASE_TARBALL_BUCKET_PREFIX $region
printf -v binary_prefix $RELEASE_BINARY_PREFIX_FMT $release_bucket

aws s3 cp "$ALPHA_LINUX_BINARY_PATH" "$(printf $RELEASE_LINUX_BINARY_PATH_FMT $binary_prefix $new_tag)" $PUBLIC_CP_ARGS
aws s3 cp "$ALPHA_DARWIN_BINARY_PATH" "$(printf $RELEASE_DARWIN_BINARY_PATH_FMT $binary_prefix $new_tag)" $PUBLIC_CP_ARGS
}

GIT_TAG="$(git describe --tags --exact-match 2>/dev/null)"

# Only run the release process for tagged commits
if [ "$CODEBUILD_GIT_TAG" == "" ]; then
if [ "$GIT_TAG" == "" ]; then
exit 0
fi

Expand All @@ -45,9 +69,21 @@ for row in $(echo ${ACCOUNTS_ESCAPED} | jq -r '.[] | @base64'); do

image=${repository_account}.dkr.ecr.${region}.amazonaws.com/${image_repository}
old_tag="${CODEBUILD_RESOLVED_SOURCE_VERSION}"
new_tag="${CODEBUILD_GIT_TAG}"
full_tag="${GIT_TAG}"

# Get minor and major version tags
[[ $GIT_TAG =~ ^v[0-9]+\.[0-9]+ ]] && minor_tag="${BASH_REMATCH[0]}"
[[ $GIT_TAG =~ ^v[0-9]+ ]] && major_tag="${BASH_REMATCH[0]}"

echo "Tagging $region with $full_tag"

retag_image "$image" "$old_tag" "$full_tag"
retag_image "$image" "$old_tag" "$minor_tag"
retag_image "$image" "$old_tag" "$major_tag"

echo "Tagging $image:$old_tag to $image:$new_tag"
retag_binaries "$full_tag" "$region"
retag_binaries "$minor_tag" "$region"
retag_binaries "$major_tag" "$region"

retag_image "$image" "$old_tag" "$new_tag"
echo "Finished tagging $region with $full_tag"
done
20 changes: 11 additions & 9 deletions controllers/batchtransformjob/batchtransformjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ func (r *BatchTransformJobReconciler) reconcileJob(ctx reconcileRequestContext)
} else {

ctx.Log.Info("Error getting batchtransformjob state in SageMaker", "requestErr", requestErr)
return r.handleSageMakerApiFailure(ctx, requestErr)
return r.handleSageMakerApiFailure(ctx, requestErr, false)
}

}
Expand All @@ -180,12 +180,9 @@ func (r *BatchTransformJobReconciler) reconcileJobDeletion(ctx reconcileRequestC
} else {
// Case 2
log.Info("Sagemaker returns 4xx or 5xx or unrecoverable API Error")
if requestErr.StatusCode() == 400 {
// handleSageMakerAPIFailure does not removes the finalizer
r.removeFinalizerAndUpdate(ctx)
}

// Handle the 500 or unrecoverable API Error
return r.handleSageMakerApiFailure(ctx, requestErr)
return r.handleSageMakerApiFailure(ctx, requestErr, true)
}
} else {
log.Info("Job exists in Sagemaker, lets delete it")
Expand Down Expand Up @@ -227,7 +224,7 @@ func (r *BatchTransformJobReconciler) deleteBatchTransformJobIfFinalizerExists(c
_, err := req.Send(ctx)
if err != nil {
log.Error(err, "Unable to stop the job in sagemaker", "context", ctx)
return r.handleSageMakerApiFailure(ctx, err)
return r.handleSageMakerApiFailure(ctx, err, false)
}

return RequeueImmediately()
Expand Down Expand Up @@ -301,7 +298,7 @@ func (r *BatchTransformJobReconciler) reconcileSpecWithDescription(ctx reconcile
return NoRequeue()
}

func (r *BatchTransformJobReconciler) handleSageMakerApiFailure(ctx reconcileRequestContext, apiErr error) (ctrl.Result, error) {
func (r *BatchTransformJobReconciler) handleSageMakerApiFailure(ctx reconcileRequestContext, apiErr error, allowRemoveFinalizer bool) (ctrl.Result, error) {
if err := r.updateJobStatus(ctx, batchtransformjobv1.BatchTransformJobStatus{
Additional: apiErr.Error(),
LastCheckTime: Now(),
Expand All @@ -316,6 +313,11 @@ func (r *BatchTransformJobReconciler) handleSageMakerApiFailure(ctx reconcileReq
ctx.Log.Info("SageMaker rate limit exceeded, will retry", "err", awsErr)
return RequeueAfterInterval(r.PollInterval, nil)
} else if awsErr.StatusCode() == 400 {

if allowRemoveFinalizer {
return r.removeFinalizerAndUpdate(ctx)
}

return NoRequeue()
} else {
return RequeueAfterInterval(r.PollInterval, nil)
Expand Down Expand Up @@ -357,7 +359,7 @@ func (r *BatchTransformJobReconciler) createBatchTransformJob(ctx reconcileReque
return RequeueImmediately()
}
ctx.Log.Info("Unable to create Transform job", "createError", createError)
return r.handleSageMakerApiFailure(ctx, createError)
return r.handleSageMakerApiFailure(ctx, createError, false)
}

func (r *BatchTransformJobReconciler) getSageMakerDescription(ctx reconcileRequestContext) (*sagemaker.DescribeTransformJobOutput, awserr.RequestFailure) {
Expand Down
28 changes: 28 additions & 0 deletions controllers/batchtransformjob/batchtransformjob_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,34 @@ var _ = Describe("Reconciling a job with finalizer that is being deleted", func(
Expect(job.Status.TransformJobStatus).To(ContainSubstring(string(sagemaker.TransformJobStatusStopping)))
})

It("should update the status and retry if SageMaker throttles", func() {
rateExceededMessage := "Rate exceeded"
// Setup mock responses.
sageMakerClient := builder.
AddDescribeTransformJobErrorResponse("ThrottlingException", 400, "request id", rateExceededMessage).
Build()

// Instantiate controller and reconciliation request.
controller := createTransformJobReconcilerForSageMakerClient(k8sClient, sageMakerClient, 1)
request := CreateReconciliationRequest(job.ObjectMeta.Name, job.ObjectMeta.Namespace)

// Run test and verify expectations.
reconciliationResult, err := controller.Reconcile(request)

Expect(receivedRequests.Len()).To(Equal(1))
Expect(err).ToNot(HaveOccurred())
Expect(reconciliationResult.Requeue).To(Equal(false))
Expect(reconciliationResult.RequeueAfter).To(Equal(controller.PollInterval))

// Verify status is updated.
err = k8sClient.Get(context.Background(), types.NamespacedName{
Namespace: job.ObjectMeta.Namespace,
Name: job.ObjectMeta.Name,
}, job)

Expect(job.Status.Additional).To(ContainSubstring(rateExceededMessage))
})

It("should remove the finalizer and not requeue if the job is stopped", func() {
description.TransformJobStatus = sagemaker.TransformJobStatusStopped
// Setup mock responses.
Expand Down
4 changes: 2 additions & 2 deletions controllers/controllertest/mock_sagemaker_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -274,9 +274,9 @@ func (m *MockSageMakerClientBuilder) AddDescribeEndpointErrorResponse(code strin
}

// Add a DescribeTrainingJob error response to the client.
func (m *MockSageMakerClientBuilder) AddDescribeTransformJobErrorResponse(code string, statusCode int, reqId string) *MockSageMakerClientBuilder {
func (m *MockSageMakerClientBuilder) AddDescribeTransformJobErrorResponse(code string, statusCode int, reqId, message string) *MockSageMakerClientBuilder {
m.responses.PushBack(describeTransformJobResponse{
err: awserr.NewRequestFailure(awserr.New(code, "mock error message", fmt.Errorf(code)), statusCode, reqId),
err: awserr.NewRequestFailure(awserr.New(code, message, fmt.Errorf(code)), statusCode, reqId),
data: nil,
})
return m
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ func (r *HyperparameterTuningJobReconciler) reconcileJob(ctx reconcileRequestCon
} else {

ctx.Log.Info("Error getting HPO state in SageMaker", "requestErr", requestErr)
return r.handleSageMakerApiFailure(ctx, requestErr)
return r.handleSageMakerApiFailure(ctx, requestErr, false)
}
}

Expand All @@ -184,12 +184,8 @@ func (r *HyperparameterTuningJobReconciler) reconcileJobDeletion(ctx reconcileRe
} else {
// Case 2
log.Info("Sagemaker returns 4xx or 5xx or unrecoverable API Error")
if requestErr.StatusCode() == 400 {
// handleSageMakerAPIFailure does not removes the finalizer
r.removeFinalizerAndUpdate(ctx)
}
// Handle the 500 or unrecoverable API Error
return r.handleSageMakerApiFailure(ctx, requestErr)
return r.handleSageMakerApiFailure(ctx, requestErr, true)
}
} else {
log.Info("Job exists in Sagemaker, lets delete it")
Expand Down Expand Up @@ -231,7 +227,7 @@ func (r *HyperparameterTuningJobReconciler) deleteHyperparameterTuningJobIfFinal
_, err := req.Send(ctx)
if err != nil {
log.Error(err, "Unable to stop the job in sagemaker", "context", ctx)
return r.handleSageMakerApiFailure(ctx, err)
return r.handleSageMakerApiFailure(ctx, err, false)
}

return RequeueImmediately()
Expand Down Expand Up @@ -321,13 +317,13 @@ func (r *HyperparameterTuningJobReconciler) createHyperParameterTuningJob(ctx re
return RequeueImmediately()
} else {
ctx.Log.Info("Unable to create HPO job", "createError", createError)
return r.handleSageMakerApiFailure(ctx, createError)
return r.handleSageMakerApiFailure(ctx, createError, false)

}
}

// Update job status with error. If error had a 400 HTTP error code then do not requeue, otherwise requeue after interval.
func (r *HyperparameterTuningJobReconciler) handleSageMakerApiFailure(ctx reconcileRequestContext, apiErr error) (ctrl.Result, error) {
func (r *HyperparameterTuningJobReconciler) handleSageMakerApiFailure(ctx reconcileRequestContext, apiErr error, allowRemoveFinalizer bool) (ctrl.Result, error) {

if err := r.updateJobStatus(ctx, hpojobv1.HyperparameterTuningJobStatus{
Additional: apiErr.Error(),
Expand All @@ -344,6 +340,11 @@ func (r *HyperparameterTuningJobReconciler) handleSageMakerApiFailure(ctx reconc
ctx.Log.Info("SageMaker rate limit exceeded, will retry", "err", awsErr)
return RequeueAfterInterval(r.PollInterval, nil)
} else if awsErr.StatusCode() == 400 {

if allowRemoveFinalizer {
return r.removeFinalizerAndUpdate(ctx)
}

return NoRequeue()
} else {
return RequeueAfterInterval(r.PollInterval, nil)
Expand Down
Loading

0 comments on commit cf54388

Please sign in to comment.