From cf5438867ed6e4ef46b48b6251615f24126cb871 Mon Sep 17 00:00:00 2001 From: Gautam Kumar Date: Mon, 9 Dec 2019 15:13:20 -0800 Subject: [PATCH] Patch release for v1.0.1 (#55) * Fixing README (#45) * Updated License Badge Colors/Logo (#46) * Allow custom subnets in canaries (#47) * Allow custom subnets in canaries * Renamed canary EKS cluster * Build integration test container programmatically (#48) * Added script to deploy new integration container * Add AWSCLI to alpine container * Fixed incorrect script path * Modified AWSCLI installation * Start docker daemon * Removed sudo * Added docker daemon nohup * Move into tests to build * Added comments and documentation references * Float release semver up to major and minor tags (#50) * Adding non-ephemeral canary support (#51) * Fixing HPO/BT deletion resource leak when SageMaker throttles Describe (#52) * Fixing HPOJob leak when SageMaker throttles DescribeHPO requests * Fixing BatchTransformJob leak when SageMaker throttles DescribeBatchTransformJob requests * Do not delete non-ephemeral cluster (#54) * Push smlogs binaries with tags (#53) * Added tagged prefix binaries * Added full variables path * Proper printf format * Move import before logging * Renamed deployment_constants --- README.md | 8 ++-- codebuild/build_integration_container.yaml | 18 ++++++++ .../build_deploy_integration_container.sh | 24 ++++++++++ codebuild/scripts/deployment_constants.sh | 12 +++++ codebuild/scripts/package_alpha_operators.sh | 4 +- codebuild/scripts/package_operators.sh | 29 +++++++----- codebuild/scripts/release_tag.sh | 46 +++++++++++++++++-- .../batchtransformjob_controller.go | 20 ++++---- .../batchtransformjob_controller_test.go | 28 +++++++++++ .../controllertest/mock_sagemaker_client.go | 4 +- .../hyperparametertuningjob_controller.go | 19 ++++---- ...hyperparametertuningjob_controller_test.go | 28 +++++++++++ tests/build_canary.sh | 4 +- tests/build_integration.sh | 4 +- tests/codebuild/run_canarytest.sh | 37 +++++++++++---- 15 files changed, 230 insertions(+), 55 deletions(-) create mode 100644 codebuild/build_integration_container.yaml create mode 100644 codebuild/scripts/build_deploy_integration_container.sh create mode 100644 codebuild/scripts/deployment_constants.sh diff --git a/README.md b/README.md index a6b21dc7..4d7243d5 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # Amazon SageMaker Operators for Kubernetes -![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/aws/amazon-sagemaker-operator-for-k8s?sort=semver) -[![License](https://img.shields.io/badge/license-Apache--2.0-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0) -![GitHub go.mod Go version](https://img.shields.io/github/go-mod/go-version/aws/amazon-sagemaker-operator-for-k8s) +![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/aws/amazon-sagemaker-operator-for-k8s?sort=semver&logo=amazon-aws&color=232F3E) +[![License](https://img.shields.io/badge/license-Apache--2.0-blue.svg?color=success)](http://www.apache.org/licenses/LICENSE-2.0) +![GitHub go.mod Go version](https://img.shields.io/github/go-mod/go-version/aws/amazon-sagemaker-operator-for-k8s?color=69D7E5) ## Introduction -Amazon SageMaker Operators for Kubernetes are operators that can be used to train machine learning models, optimize hyperparameters for a given model, run batch transform jobs over existing models, and set up inference endpoints. With these operators, users can manage their jobs in Amazon SageMaker from their Kubernetes cluster. 
+Amazon SageMaker Operators for Kubernetes are operators that can be used to train machine learning models, optimize hyperparameters for a given model, run batch transform jobs over existing models, and set up inference endpoints. With these operators, users can manage their jobs in Amazon SageMaker from their Kubernetes cluster in Amazon Elastic Kubernetes Service [EKS](http://aws.amazon.com/eks). ## Usage diff --git a/codebuild/build_integration_container.yaml b/codebuild/build_integration_container.yaml new file mode 100644 index 00000000..b4ce48c0 --- /dev/null +++ b/codebuild/build_integration_container.yaml @@ -0,0 +1,18 @@ +# This CodeBuild project is run using the docker:stable-dind container +# Docker daemon start-up script was taken from the following URL: +# https://docs.aws.amazon.com/codebuild/latest/userguide/sample-docker-custom-image.html + +version: 0.2 +phases: + install: + commands: + - nohup /usr/local/bin/dockerd --host=unix:///var/run/docker.sock --host=tcp://127.0.0.1:2375 --storage-driver=overlay2& + - timeout 15 sh -c "until docker info; do echo .; sleep 1; done" + pre_build: + commands: + # Add AWSCLI and bash + - (apk add --update python python-dev py-pip build-base bash && pip install awscli --upgrade) + build: + commands: + # Build new integration test container + - (IMG=$INTEGRATION_CONTAINER_REPOSITORY bash codebuild/scripts/build_deploy_integration_container.sh) \ No newline at end of file diff --git a/codebuild/scripts/build_deploy_integration_container.sh b/codebuild/scripts/build_deploy_integration_container.sh new file mode 100644 index 00000000..6609248d --- /dev/null +++ b/codebuild/scripts/build_deploy_integration_container.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# This script will build the integration test container. This container contains +# all the tools necessary for running the build and test steps for each of the +# CodeBuild projects. The script will also tag the container with the latest +# commit SHA, and with the "latest" tag, then push to an ECR repository. 
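+#
+# A hypothetical invocation, assuming the CodeBuild environment supplies the
+# variables this script reads (the values below are placeholders, not real
+# account details):
+#
+#   export INTEGRATION_CONTAINER_REPOSITORY=integration-test-container
+#   export AWS_ACCOUNT_ID=123456789012
+#   export REGION=us-east-1
+#   export CODEBUILD_RESOLVED_SOURCE_VERSION=$(git rev-parse HEAD)
+#   bash codebuild/scripts/build_deploy_integration_container.sh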
+ +set -x + +# Build new integration test container +pushd tests +IMG=$INTEGRATION_CONTAINER_REPOSITORY bash build_integration.sh +popd + +# Log into ECR +$(aws ecr get-login --no-include-email --region $REGION --registry-ids $AWS_ACCOUNT_ID) + +# Tag the container with SHA and latest +docker tag $INTEGRATION_CONTAINER_REPOSITORY $AWS_ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$INTEGRATION_CONTAINER_REPOSITORY:$CODEBUILD_RESOLVED_SOURCE_VERSION +docker tag $INTEGRATION_CONTAINER_REPOSITORY $AWS_ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$INTEGRATION_CONTAINER_REPOSITORY:latest + +# Push the newly tagged containers +docker push $AWS_ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$INTEGRATION_CONTAINER_REPOSITORY:$CODEBUILD_RESOLVED_SOURCE_VERSION +docker push $AWS_ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$INTEGRATION_CONTAINER_REPOSITORY:latest \ No newline at end of file diff --git a/codebuild/scripts/deployment_constants.sh b/codebuild/scripts/deployment_constants.sh new file mode 100644 index 00000000..797fb931 --- /dev/null +++ b/codebuild/scripts/deployment_constants.sh @@ -0,0 +1,12 @@ +RELEASE_BUCKET_NAME_FMT="%s-%s" + +RELEASE_BINARY_PREFIX_FMT="s3://%s/kubectl-smlogs-plugin" +ALPHA_BINARY_PREFIX_FMT="s3://%s/%s" + +ALPHA_LINUX_BINARY_PATH_FMT="%s/kubectl-smlogs-plugin.linux.amd64.tar.gz" +ALPHA_DARWIN_BINARY_PATH_FMT="%s/kubectl-smlogs-plugin.darwin.amd64.tar.gz" + +RELEASE_LINUX_BINARY_PATH_FMT="%s/%s/linux.amd64.tar.gz" +RELEASE_DARWIN_BINARY_PATH_FMT="%s/%s/darwin.amd64.tar.gz" + +PUBLIC_CP_ARGS="--acl public-read" \ No newline at end of file diff --git a/codebuild/scripts/package_alpha_operators.sh b/codebuild/scripts/package_alpha_operators.sh index ffec5b38..c854dc42 100644 --- a/codebuild/scripts/package_alpha_operators.sh +++ b/codebuild/scripts/package_alpha_operators.sh @@ -1,9 +1,9 @@ #!/bin/bash -set -x - source codebuild/scripts/package_operators.sh +set -x + # Login to alpha ECR $(aws ecr get-login --no-include-email --region $ALPHA_REPOSITORY_REGION --registry-ids $ALPHA_ACCOUNT_ID) diff --git a/codebuild/scripts/package_operators.sh b/codebuild/scripts/package_operators.sh index 8d4ee02a..062fdedd 100644 --- a/codebuild/scripts/package_operators.sh +++ b/codebuild/scripts/package_operators.sh @@ -1,7 +1,15 @@ #!/bin/bash +source codebuild/scripts/deployment_constants.sh + set -e +# Define alpha artifact locations +printf -v ALPHA_BUCKET_PREFIX $ALPHA_BINARY_PREFIX_FMT $ALPHA_TARBALL_BUCKET $CODEBUILD_RESOLVED_SOURCE_VERSION + +printf -v ALPHA_LINUX_BINARY_PATH $ALPHA_LINUX_BINARY_PATH_FMT $ALPHA_BUCKET_PREFIX +printf -v ALPHA_DARWIN_BINARY_PATH $ALPHA_DARWIN_BINARY_PATH_FMT $ALPHA_BUCKET_PREFIX + # This function deploys a region-specific operator to an ECR prod repo from the existing # image in the alpha repository. The function also copies across the smlogs binaries # from the alpha tarball bucket into the production buckets. 
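# A minimal sketch of how the printf -v pattern above expands the format
# strings from deployment_constants.sh (bucket name and SHA are placeholders):
#
#   ALPHA_BINARY_PREFIX_FMT="s3://%s/%s"
#   printf -v ALPHA_BUCKET_PREFIX $ALPHA_BINARY_PREFIX_FMT alpha-bucket 0a1b2c3
#   echo "$ALPHA_BUCKET_PREFIX"    # prints s3://alpha-bucket/0a1b2c3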
@@ -34,18 +42,15 @@ function deploy_from_alpha() docker push ${dest_ecr_image}:$CODEBUILD_RESOLVED_SOURCE_VERSION docker push ${dest_ecr_image}:latest - local bucket_name="${RELEASE_TARBALL_BUCKET_PREFIX}-${account_region}" - local binary_prefix="s3://${bucket_name}/kubectl-smlogs-plugin" - local alpha_prefix="s3://$ALPHA_TARBALL_BUCKET/${CODEBUILD_RESOLVED_SOURCE_VERSION}" - - local cp_args="--acl public-read" + printf -v bucket_name $RELEASE_BUCKET_NAME_FMT $RELEASE_TARBALL_BUCKET_PREFIX $account_region + printf -v binary_prefix $RELEASE_BINARY_PREFIX_FMT $bucket_name # Copy across the binaries and set as latest - aws s3 cp "${alpha_prefix}/kubectl-smlogs-plugin.linux.amd64.tar.gz" "${binary_prefix}/${CODEBUILD_RESOLVED_SOURCE_VERSION}/linux.amd64.tar.gz" ${cp_args} - aws s3 cp "${alpha_prefix}/kubectl-smlogs-plugin.linux.amd64.tar.gz" "${binary_prefix}/latest/linux.amd64.tar.gz" ${cp_args} + aws s3 cp "$ALPHA_LINUX_BINARY_PATH" "$(printf $RELEASE_LINUX_BINARY_PATH_FMT $binary_prefix $CODEBUILD_RESOLVED_SOURCE_VERSION)" $PUBLIC_CP_ARGS + aws s3 cp "$ALPHA_LINUX_BINARY_PATH" "$(printf $RELEASE_LINUX_BINARY_PATH_FMT $binary_prefix latest)" $PUBLIC_CP_ARGS - aws s3 cp "${alpha_prefix}/kubectl-smlogs-plugin.darwin.amd64.tar.gz" "${binary_prefix}/${CODEBUILD_RESOLVED_SOURCE_VERSION}/darwin.amd64.tar.gz" ${cp_args} - aws s3 cp "${alpha_prefix}/kubectl-smlogs-plugin.darwin.amd64.tar.gz" "${binary_prefix}/latest/darwin.amd64.tar.gz" ${cp_args} + aws s3 cp "$ALPHA_DARWIN_BINARY_PATH" "$(printf $RELEASE_DARWIN_BINARY_PATH_FMT $binary_prefix $CODEBUILD_RESOLVED_SOURCE_VERSION)" $PUBLIC_CP_ARGS + aws s3 cp "$ALPHA_DARWIN_BINARY_PATH" "$(printf $RELEASE_DARWIN_BINARY_PATH_FMT $binary_prefix latest)" $PUBLIC_CP_ARGS } # This function builds, packages and deploys a region-specific operator to an ECR repo and output bucket. 
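# For reference, with hypothetical values RELEASE_TARBALL_BUCKET_PREFIX=release-bucket
# and account_region=us-east-1, the printf calls in deploy_from_alpha above resolve
# to destinations such as:
#
#   s3://release-bucket-us-east-1/kubectl-smlogs-plugin/<git-sha>/linux.amd64.tar.gz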
@@ -109,8 +114,8 @@ function package_operator() tar cvzf kubectl-smlogs-plugin.linux.amd64.tar.gz kubectl-smlogs.linux.amd64 tar cvzf kubectl-smlogs-plugin.darwin.amd64.tar.gz kubectl-smlogs.darwin.amd64 - aws s3 cp kubectl-smlogs-plugin.linux.amd64.tar.gz "s3://$ALPHA_TARBALL_BUCKET/${CODEBUILD_RESOLVED_SOURCE_VERSION}/kubectl-smlogs-plugin.linux.amd64.tar.gz" - aws s3 cp kubectl-smlogs-plugin.darwin.amd64.tar.gz "s3://$ALPHA_TARBALL_BUCKET/${CODEBUILD_RESOLVED_SOURCE_VERSION}/kubectl-smlogs-plugin.darwin.amd64.tar.gz" + aws s3 cp kubectl-smlogs-plugin.linux.amd64.tar.gz "$ALPHA_LINUX_BINARY_PATH" + aws s3 cp kubectl-smlogs-plugin.darwin.amd64.tar.gz "$ALPHA_DARWIN_BINARY_PATH" popd fi @@ -119,6 +124,6 @@ function package_operator() tar cvzf sagemaker-k8s-operator.tar.gz sagemaker-k8s-operator # Upload the final tar ball to s3 with standard name and git SHA - aws s3 cp sagemaker-k8s-operator.tar.gz "s3://$ALPHA_TARBALL_BUCKET/${CODEBUILD_RESOLVED_SOURCE_VERSION}/sagemaker-k8s-operator-${account_region}${tarball_suffix}.tar.gz" + aws s3 cp sagemaker-k8s-operator.tar.gz "$ALPHA_BUCKET_PREFIX/sagemaker-k8s-operator-${account_region}${tarball_suffix}.tar.gz" popd } \ No newline at end of file diff --git a/codebuild/scripts/release_tag.sh b/codebuild/scripts/release_tag.sh index 48251142..33d71b06 100644 --- a/codebuild/scripts/release_tag.sh +++ b/codebuild/scripts/release_tag.sh @@ -1,7 +1,15 @@ #!/bin/bash +source codebuild/scripts/deployment_constants.sh + set -e +# Define alpha artifact locations +printf -v ALPHA_BUCKET_PREFIX $ALPHA_BINARY_PREFIX_FMT $ALPHA_TARBALL_BUCKET $CODEBUILD_RESOLVED_SOURCE_VERSION + +printf -v ALPHA_LINUX_BINARY_PATH $ALPHA_LINUX_BINARY_PATH_FMT $ALPHA_BUCKET_PREFIX +printf -v ALPHA_DARWIN_BINARY_PATH $ALPHA_DARWIN_BINARY_PATH_FMT $ALPHA_BUCKET_PREFIX + # This function will pull an existing image + tag and push it with a new tag. # Parameter: # $1: The repository and image to pull from. @@ -18,10 +26,26 @@ function retag_image() docker push $image:$new_tag } -CODEBUILD_GIT_TAG="$(git describe --tags --exact-match 2>/dev/null)" +# This function will push artifacts to their own folder with a given tag. +# Parameter: +# $1: The new tag to push for the artifacts. +# $2: The region of the new artifacts. 
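+#
+# Hypothetical example: retag_binaries "v1.0.1" "us-east-1" copies the alpha smlogs
+# tarballs to s3://${RELEASE_TARBALL_BUCKET_PREFIX}-us-east-1/kubectl-smlogs-plugin/v1.0.1/.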
+function retag_binaries()
+{
+    local new_tag="$1"
+    local region="$2"
+
+    printf -v release_bucket $RELEASE_BUCKET_NAME_FMT $RELEASE_TARBALL_BUCKET_PREFIX $region
+    printf -v binary_prefix $RELEASE_BINARY_PREFIX_FMT $release_bucket
+
+    aws s3 cp "$ALPHA_LINUX_BINARY_PATH" "$(printf $RELEASE_LINUX_BINARY_PATH_FMT $binary_prefix $new_tag)" $PUBLIC_CP_ARGS
+    aws s3 cp "$ALPHA_DARWIN_BINARY_PATH" "$(printf $RELEASE_DARWIN_BINARY_PATH_FMT $binary_prefix $new_tag)" $PUBLIC_CP_ARGS
+}
+
+GIT_TAG="$(git describe --tags --exact-match 2>/dev/null)"

# Only run the release process for tagged commits
-if [ "$CODEBUILD_GIT_TAG" == "" ]; then
+if [ "$GIT_TAG" == "" ]; then
    exit 0
fi

@@ -45,9 +69,21 @@ for row in $(echo ${ACCOUNTS_ESCAPED} | jq -r '.[] | @base64'); do
    image=${repository_account}.dkr.ecr.${region}.amazonaws.com/${image_repository}

    old_tag="${CODEBUILD_RESOLVED_SOURCE_VERSION}"
-    new_tag="${CODEBUILD_GIT_TAG}"
+    full_tag="${GIT_TAG}"
+
+    # Get minor and major version tags
+    [[ $GIT_TAG =~ ^v[0-9]+\.[0-9]+ ]] && minor_tag="${BASH_REMATCH[0]}"
+    [[ $GIT_TAG =~ ^v[0-9]+ ]] && major_tag="${BASH_REMATCH[0]}"
+
+    echo "Tagging $region with $full_tag"
+
+    retag_image "$image" "$old_tag" "$full_tag"
+    retag_image "$image" "$old_tag" "$minor_tag"
+    retag_image "$image" "$old_tag" "$major_tag"

-    echo "Tagging $image:$old_tag to $image:$new_tag"
+    retag_binaries "$full_tag" "$region"
+    retag_binaries "$minor_tag" "$region"
+    retag_binaries "$major_tag" "$region"

-    retag_image "$image" "$old_tag" "$new_tag"
+    echo "Finished tagging $region with $full_tag"
done
\ No newline at end of file
diff --git a/controllers/batchtransformjob/batchtransformjob_controller.go b/controllers/batchtransformjob/batchtransformjob_controller.go
index 454d773b..bde3f538 100644
--- a/controllers/batchtransformjob/batchtransformjob_controller.go
+++ b/controllers/batchtransformjob/batchtransformjob_controller.go
@@ -155,7 +155,7 @@ func (r *BatchTransformJobReconciler) reconcileJob(ctx reconcileRequestContext)
    } else {
        ctx.Log.Info("Error getting batchtransformjob state in SageMaker", "requestErr", requestErr)

-        return r.handleSageMakerApiFailure(ctx, requestErr)
+        return r.handleSageMakerApiFailure(ctx, requestErr, false)
    }
}

@@ -180,12 +180,9 @@ func (r *BatchTransformJobReconciler) reconcileJobDeletion(ctx reconcileRequestC
    } else {
        // Case 2
        log.Info("Sagemaker returns 4xx or 5xx or unrecoverable API Error")
-        if requestErr.StatusCode() == 400 {
-            // handleSageMakerAPIFailure does not removes the finalizer
-            r.removeFinalizerAndUpdate(ctx)
-        }
+
        // Handle the 500 or unrecoverable API Error
-        return r.handleSageMakerApiFailure(ctx, requestErr)
+        return r.handleSageMakerApiFailure(ctx, requestErr, true)
    }
} else {
    log.Info("Job exists in Sagemaker, lets delete it")
@@ -227,7 +224,7 @@ func (r *BatchTransformJobReconciler) deleteBatchTransformJobIfFinalizerExists(c
    _, err := req.Send(ctx)
    if err != nil {
        log.Error(err, "Unable to stop the job in sagemaker", "context", ctx)
-        return r.handleSageMakerApiFailure(ctx, err)
+        return r.handleSageMakerApiFailure(ctx, err, false)
    }

    return RequeueImmediately()
@@ -301,7 +298,7 @@ func (r *BatchTransformJobReconciler) reconcileSpecWithDescription(ctx reconcile
    return NoRequeue()
}

-func (r *BatchTransformJobReconciler) handleSageMakerApiFailure(ctx reconcileRequestContext, apiErr error) (ctrl.Result, error) {
+func (r *BatchTransformJobReconciler) handleSageMakerApiFailure(ctx reconcileRequestContext, apiErr error, allowRemoveFinalizer bool) (ctrl.Result, error) {
    if err := r.updateJobStatus(ctx, batchtransformjobv1.BatchTransformJobStatus{
        Additional:    apiErr.Error(),
        LastCheckTime: Now(),
@@ -316,6 +313,11 @@ func (r *BatchTransformJobReconciler) handleSageMakerApiFailure(ctx reconcileReq
        ctx.Log.Info("SageMaker rate limit exceeded, will retry", "err", awsErr)
        return RequeueAfterInterval(r.PollInterval, nil)
    } else if awsErr.StatusCode() == 400 {
+
+        if allowRemoveFinalizer {
+            return r.removeFinalizerAndUpdate(ctx)
+        }
+
        return NoRequeue()
    } else {
        return RequeueAfterInterval(r.PollInterval, nil)
@@ -357,7 +359,7 @@ func (r *BatchTransformJobReconciler) createBatchTransformJob(ctx reconcileReque
        return RequeueImmediately()
    }
    ctx.Log.Info("Unable to create Transform job", "createError", createError)
-    return r.handleSageMakerApiFailure(ctx, createError)
+    return r.handleSageMakerApiFailure(ctx, createError, false)
}

func (r *BatchTransformJobReconciler) getSageMakerDescription(ctx reconcileRequestContext) (*sagemaker.DescribeTransformJobOutput, awserr.RequestFailure) {
diff --git a/controllers/batchtransformjob/batchtransformjob_controller_test.go b/controllers/batchtransformjob/batchtransformjob_controller_test.go
index a118c152..48bc4da4 100644
--- a/controllers/batchtransformjob/batchtransformjob_controller_test.go
+++ b/controllers/batchtransformjob/batchtransformjob_controller_test.go
@@ -444,6 +444,34 @@ var _ = Describe("Reconciling a job with finalizer that is being deleted", func(
        Expect(job.Status.TransformJobStatus).To(ContainSubstring(string(sagemaker.TransformJobStatusStopping)))
    })

+    It("should update the status and retry if SageMaker throttles", func() {
+        rateExceededMessage := "Rate exceeded"
+        // Setup mock responses.
+        sageMakerClient := builder.
+            AddDescribeTransformJobErrorResponse("ThrottlingException", 400, "request id", rateExceededMessage).
+            Build()
+
+        // Instantiate controller and reconciliation request.
+        controller := createTransformJobReconcilerForSageMakerClient(k8sClient, sageMakerClient, 1)
+        request := CreateReconciliationRequest(job.ObjectMeta.Name, job.ObjectMeta.Namespace)
+
+        // Run test and verify expectations.
+        reconciliationResult, err := controller.Reconcile(request)
+
+        Expect(receivedRequests.Len()).To(Equal(1))
+        Expect(err).ToNot(HaveOccurred())
+        Expect(reconciliationResult.Requeue).To(Equal(false))
+        Expect(reconciliationResult.RequeueAfter).To(Equal(controller.PollInterval))
+
+        // Verify status is updated.
+        err = k8sClient.Get(context.Background(), types.NamespacedName{
+            Namespace: job.ObjectMeta.Namespace,
+            Name:      job.ObjectMeta.Name,
+        }, job)
+
+        Expect(job.Status.Additional).To(ContainSubstring(rateExceededMessage))
+    })
+
    It("should remove the finalizer and not requeue if the job is stopped", func() {
        description.TransformJobStatus = sagemaker.TransformJobStatusStopped
        // Setup mock responses.
diff --git a/controllers/controllertest/mock_sagemaker_client.go b/controllers/controllertest/mock_sagemaker_client.go
index 9b52355e..9f1ec205 100644
--- a/controllers/controllertest/mock_sagemaker_client.go
+++ b/controllers/controllertest/mock_sagemaker_client.go
@@ -274,9 +274,9 @@ func (m *MockSageMakerClientBuilder) AddDescribeEndpointErrorResponse(code strin
}

// Add a DescribeTrainingJob error response to the client.
-func (m *MockSageMakerClientBuilder) AddDescribeTransformJobErrorResponse(code string, statusCode int, reqId string) *MockSageMakerClientBuilder { +func (m *MockSageMakerClientBuilder) AddDescribeTransformJobErrorResponse(code string, statusCode int, reqId, message string) *MockSageMakerClientBuilder { m.responses.PushBack(describeTransformJobResponse{ - err: awserr.NewRequestFailure(awserr.New(code, "mock error message", fmt.Errorf(code)), statusCode, reqId), + err: awserr.NewRequestFailure(awserr.New(code, message, fmt.Errorf(code)), statusCode, reqId), data: nil, }) return m diff --git a/controllers/hyperparametertuningjob/hyperparametertuningjob_controller.go b/controllers/hyperparametertuningjob/hyperparametertuningjob_controller.go index 86ea9ab1..37f48795 100644 --- a/controllers/hyperparametertuningjob/hyperparametertuningjob_controller.go +++ b/controllers/hyperparametertuningjob/hyperparametertuningjob_controller.go @@ -161,7 +161,7 @@ func (r *HyperparameterTuningJobReconciler) reconcileJob(ctx reconcileRequestCon } else { ctx.Log.Info("Error getting HPO state in SageMaker", "requestErr", requestErr) - return r.handleSageMakerApiFailure(ctx, requestErr) + return r.handleSageMakerApiFailure(ctx, requestErr, false) } } @@ -184,12 +184,8 @@ func (r *HyperparameterTuningJobReconciler) reconcileJobDeletion(ctx reconcileRe } else { // Case 2 log.Info("Sagemaker returns 4xx or 5xx or unrecoverable API Error") - if requestErr.StatusCode() == 400 { - // handleSageMakerAPIFailure does not removes the finalizer - r.removeFinalizerAndUpdate(ctx) - } // Handle the 500 or unrecoverable API Error - return r.handleSageMakerApiFailure(ctx, requestErr) + return r.handleSageMakerApiFailure(ctx, requestErr, true) } } else { log.Info("Job exists in Sagemaker, lets delete it") @@ -231,7 +227,7 @@ func (r *HyperparameterTuningJobReconciler) deleteHyperparameterTuningJobIfFinal _, err := req.Send(ctx) if err != nil { log.Error(err, "Unable to stop the job in sagemaker", "context", ctx) - return r.handleSageMakerApiFailure(ctx, err) + return r.handleSageMakerApiFailure(ctx, err, false) } return RequeueImmediately() @@ -321,13 +317,13 @@ func (r *HyperparameterTuningJobReconciler) createHyperParameterTuningJob(ctx re return RequeueImmediately() } else { ctx.Log.Info("Unable to create HPO job", "createError", createError) - return r.handleSageMakerApiFailure(ctx, createError) + return r.handleSageMakerApiFailure(ctx, createError, false) } } // Update job status with error. If error had a 400 HTTP error code then do not requeue, otherwise requeue after interval. 
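// A rough summary of the resulting behavior, as read from the hunks below
// (not authoritative documentation of the controller):
//
//   throttled by SageMaker        -> requeue after r.PollInterval
//   other 400 during deletion     -> remove the finalizer and update
//                                    (allowRemoveFinalizer == true)
//   other 400 otherwise           -> no requeue
//   remaining errors (e.g. 5xx)   -> requeue after r.PollInterval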
-func (r *HyperparameterTuningJobReconciler) handleSageMakerApiFailure(ctx reconcileRequestContext, apiErr error) (ctrl.Result, error) { +func (r *HyperparameterTuningJobReconciler) handleSageMakerApiFailure(ctx reconcileRequestContext, apiErr error, allowRemoveFinalizer bool) (ctrl.Result, error) { if err := r.updateJobStatus(ctx, hpojobv1.HyperparameterTuningJobStatus{ Additional: apiErr.Error(), @@ -344,6 +340,11 @@ func (r *HyperparameterTuningJobReconciler) handleSageMakerApiFailure(ctx reconc ctx.Log.Info("SageMaker rate limit exceeded, will retry", "err", awsErr) return RequeueAfterInterval(r.PollInterval, nil) } else if awsErr.StatusCode() == 400 { + + if allowRemoveFinalizer { + return r.removeFinalizerAndUpdate(ctx) + } + return NoRequeue() } else { return RequeueAfterInterval(r.PollInterval, nil) diff --git a/controllers/hyperparametertuningjob/hyperparametertuningjob_controller_test.go b/controllers/hyperparametertuningjob/hyperparametertuningjob_controller_test.go index 51b8a224..6ca5006d 100644 --- a/controllers/hyperparametertuningjob/hyperparametertuningjob_controller_test.go +++ b/controllers/hyperparametertuningjob/hyperparametertuningjob_controller_test.go @@ -216,6 +216,34 @@ var _ = Describe("Reconciling a job with finalizer that is being deleted", func( Expect(receivedRequests.Len()).To(Equal(2)) }) + It("should update the status and retry if SageMaker throttles", func() { + rateExceededMessage := "Rate exceeded" + // Setup mock responses. + sageMakerClient := builder. + AddDescribeHyperParameterTuningJobErrorResponseWithMessage("ThrottlingException", 400, "request id", rateExceededMessage). + Build() + + // Instantiate controller and reconciliation request. + controller := createHpoReconcilerForSageMakerClient(k8sClient, sageMakerClient, "1s") + request := CreateReconciliationRequest(job.ObjectMeta.Name, job.ObjectMeta.Namespace) + + // Run test and verify expectations. + reconciliationResult, err := controller.Reconcile(request) + + Expect(receivedRequests.Len()).To(Equal(1)) + Expect(err).ToNot(HaveOccurred()) + Expect(reconciliationResult.Requeue).To(Equal(false)) + Expect(reconciliationResult.RequeueAfter).To(Equal(controller.PollInterval)) + + // Verify status is updated. + err = k8sClient.Get(context.Background(), types.NamespacedName{ + Namespace: job.ObjectMeta.Namespace, + Name: job.ObjectMeta.Name, + }, job) + + Expect(job.Status.Additional).To(ContainSubstring(rateExceededMessage)) + }) + It("should update the status and requeue if the job is stopping", func() { description.HyperParameterTuningJobStatus = sagemaker.HyperParameterTuningJobStatusStopping // Setup mock responses. diff --git a/tests/build_canary.sh b/tests/build_canary.sh index e41c519c..1e6ef46c 100755 --- a/tests/build_canary.sh +++ b/tests/build_canary.sh @@ -1 +1,3 @@ -docker build -f images/Dockerfile.canary . -t ${IMG} --build-arg DATA_BUCKET --build-arg COMMIT_SHA --build-arg RESULT_BUCKET \ No newline at end of file +#!/bin/bash + +docker build -f images/Dockerfile.canary . -t ${IMG:-canary-test-container} --build-arg DATA_BUCKET --build-arg COMMIT_SHA --build-arg RESULT_BUCKET \ No newline at end of file diff --git a/tests/build_integration.sh b/tests/build_integration.sh index 38d0b473..a8051349 100755 --- a/tests/build_integration.sh +++ b/tests/build_integration.sh @@ -1 +1,3 @@ -docker build -f images/Dockerfile.integration -t integration-test-container . 
\ No newline at end of file
+#!/bin/bash
+
+docker build -f images/Dockerfile.integration -t ${IMG:-integration-test-container} .
\ No newline at end of file
diff --git a/tests/codebuild/run_canarytest.sh b/tests/codebuild/run_canarytest.sh
index 916122fd..33e31d6b 100644
--- a/tests/codebuild/run_canarytest.sh
+++ b/tests/codebuild/run_canarytest.sh
@@ -5,7 +5,11 @@
# Build environment `Docker image` has all prerequisite setup and credentials are being passed using AWS system manager
CLUSTER_REGION=${CLUSTER_REGION:-us-east-1}
-CLUSTER_VERSION=${CLUSTER_VERSION:-1.12}
+CLUSTER_VERSION=${CLUSTER_VERSION:-1.13}
+
+# Define the list of optional subnets for the EKS test cluster
+CLUSTER_PUBLIC_SUBNETS=${CLUSTER_PUBLIC_SUBNETS:-}
+CLUSTER_PRIVATE_SUBNETS=${CLUSTER_PRIVATE_SUBNETS:-}

# Verbose trace of commands, helpful since test iteration takes a long time.
set -x

@@ -32,9 +36,12 @@ function cleanup {

    delete_tests

-    # Tear down the cluster if we set it up.
-    echo "need_setup_cluster is true, tearing down cluster we created."
-    eksctl delete cluster --name "${cluster_name}" --region "${CLUSTER_REGION}"
+    if [ -z "${USE_EXISTING_CLUSTER}" ]
+    then
+        # Tear down the cluster if we set it up.
+        echo "USE_EXISTING_CLUSTER is not set, tearing down the cluster we created."
+        eksctl delete cluster --name "${cluster_name}" --region "${CLUSTER_REGION}"
+    fi
}

# Set the trap to clean up resources
@@ -49,14 +56,24 @@ echo "Launching canary test for ${COMMIT_SHA}"

# Launch EKS cluster if we need to and define cluster_name,CLUSTER_REGION.
echo "Launching the cluster"
-readonly cluster_name="sagemaker-k8s-pipeline-"$(date '+%Y-%m-%d-%H-%M-%S')""
-# By default eksctl picks random AZ, which time to time leads to capacity issue.
-# Generally 1a, 1b, 1c are topmost available AZ, hence specifying it explicitly
-eksctl create cluster "${cluster_name}" --nodes 1 --node-type=c5.xlarge --timeout=40m --region "${CLUSTER_REGION}" --auto-kubeconfig --version ${CLUSTER_VERSION}
+cluster_name="sagemaker-k8s-canary-"$(date '+%Y-%m-%d-%H-%M-%S')""
+
+if [ -z "${USE_EXISTING_CLUSTER}" ]
+then
+    eksctl_args=( --nodes 1 --node-type=c5.xlarge --timeout=40m --region "${CLUSTER_REGION}" --auto-kubeconfig --version "${CLUSTER_VERSION}" )
+    [ "${CLUSTER_PUBLIC_SUBNETS}" != "" ] && eksctl_args+=( --vpc-public-subnets="${CLUSTER_PUBLIC_SUBNETS}" )
+    [ "${CLUSTER_PRIVATE_SUBNETS}" != "" ] && eksctl_args+=( --vpc-private-subnets="${CLUSTER_PRIVATE_SUBNETS}" )
+
+    eksctl create cluster "${cluster_name}" "${eksctl_args[@]}"
+
+    echo "Setting kubeconfig"
+    export KUBECONFIG="/root/.kube/eksctl/clusters/${cluster_name}"
+else
+    cluster_name="non-ephemeral-cluster"
+    aws eks update-kubeconfig --name "${cluster_name}" --region "${CLUSTER_REGION}"
+fi

-echo "Setting kubeconfig"
-export KUBECONFIG="/root/.kube/eksctl/clusters/${cluster_name}"

# Download the CRD
tar -xf sagemaker-k8s-operator.tar.gz
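# A hypothetical canary invocation using the new options (the region and subnet
# IDs below are placeholders):
#
#   CLUSTER_REGION=us-west-2 \
#   CLUSTER_PUBLIC_SUBNETS=subnet-0aaa1111,subnet-0bbb2222 \
#   bash tests/codebuild/run_canarytest.sh
#
# Setting USE_EXISTING_CLUSTER to any non-empty value targets the
# "non-ephemeral-cluster" EKS cluster instead and skips the teardown in cleanup.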