diff --git a/1.architectures/5.sagemaker-hyperpod/easy_smhp/README.md b/1.architectures/5.sagemaker-hyperpod/easy_smhp/README.md new file mode 100644 index 00000000..a3608b3d --- /dev/null +++ b/1.architectures/5.sagemaker-hyperpod/easy_smhp/README.md @@ -0,0 +1,13 @@ +# Automate SageMaker HyperPod Cluster Creation for EKS and Slurm orchestrator +A bash script that automates the manual cluster creation process for SageMaker HyperPod SLURM and EKS + +This automates the steps from the: +- [SageMaker HyperPod SLURM Workshop](https://catalog.workshops.aws/sagemaker-hyperpod/en-US) +- [SageMaker HyperPod EKS Workshop](https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US) + +## 🚀 Installation and Usage +Using this script is very simple. Run ```bash easy_smhp_x1.sh``` + +The script will walk you through creating the cluster configuration for your SageMaker HyperPod Slurm or EKS cluster based of the variables exported. Please read through the instructions provided while running the script for the best experience. + + diff --git a/1.architectures/5.sagemaker-hyperpod/easy_smhp/easy_smhp_x1.sh b/1.architectures/5.sagemaker-hyperpod/easy_smhp/easy_smhp_x1.sh new file mode 100644 index 00000000..b3a992a7 --- /dev/null +++ b/1.architectures/5.sagemaker-hyperpod/easy_smhp/easy_smhp_x1.sh @@ -0,0 +1,1152 @@ +#!/bin/bash +# Automatizing the full Workshop available at: +# - https://catalog.workshops.aws/sagemaker-hyperpod +# - https://catalog.workshops.aws/sagemaker-hyperpod-eks +unset az, region, controller, login, worker # DO NOT TOUCH +unset subnet_private_id, fsxl_id, fsxl_mount, sg, role_name, bucket # DO NOT TOUCH + +# INPUT from Customer +export orchestrator="eks" # slurm or eks +export verbose="true" # Display actions taken to run this script +export install_dir="/tmp/smhp_install" # folder to copy, create, install software and configuration files +export bin_dir="/usr/local/bin" # folder to store the binaries installed +export venv_smhp="${install_dir}/venv" # Virtual Environment for local Python installation +export VolumeSizeInGB=500 # Instances root EBS volume size +export region="us-west-2" # Virginia/us-east-1/use1-azX Ohio:us-east-2/use2-azX California/us-west-1/usw1-azX Oregon/us-west-2/usw2-azX Sidney/ap-southeast-2/apse2-azX Dublin/eu-west-1/euw1-azX +export az="usw2-az1" # set if you want to deploy a NEW VPC stack, unset if already deployed +export vpc_cf_stack_name="vpc-smhp-eks-test-1" # vpc_cf_stack_name="sagemaker-hyperpod" +export observability_cf_stack_name="Hyperpod-Cluster-Observability" # Slurm only - unset to deactivate +export SMHP_ClusterName="ml-cluster-gui1" # ClusterName="ml-cluster" +export EKS_ResourceNamePrefix="gui-smhp-eks-test-1" # "-cluster" will be added to it to name the EKS cluster + + +declare -A controller=( # controller node [instance_type]=count for Slurm orchestrator only + [ml.m5.4xlarge]="1" +) +declare -A login=( # login node [instance_type]=count for Slurm orchestrator only + [ml.m5.4xlarge]="1" + # [ml.m5.4xlarge]="2" + # [ml.m5.4xlarge]="2" +) +declare -A worker=( # worker node [instance_type]=count + # [ml.g5.8xlarge]="1" + [ml.m5.4xlarge]="1" +) +declare -A tag=( # SMHP cluster tags [Key]=Value + [project]="p1" + [cost]="c1" +) + + +### Do not set below variables except if you want to override the CloudFormation stack output's +# AmazonS3BucketName="" # S3 bucket for Life Cycle Scripts - if set, it overrides the VPC CloudFormation stack output's +# PrimaryPrivateSubnet="" # Private Subnet ID (subnet-xxxxxx) - if set, it overrides the VPC 
CloudFormation stack output's +# FSxLustreFilesystemMountname="" # FSx Lustre mount name (abcde1234) - if set, it overrides the VPC CloudFormation stack output's +# cf_FSxLustreFilesystemDNSname="" # FSx Lustre DNS name (fs-xxxxxxx.fsx.${region}.amazonaws.com) - if set, it overrides the VPC CloudFormation stack output's +# AmazonSagemakerClusterExecutionRoleArn="" # IAM Execution Role ARN - if set, it overrides the VPC CloudFormation stack output's +# FSxLustreFilesystemId="" # FSx Lustre FS ID (fs-xxxxxxxx) - if set, it overrides the VPC CloudFormation stack output's +# SecurityGroup="" # Security Group (sg-xxxxxxxxx) - if set, it overrides the VPC CloudFormation stack output's + +# Internal Enviromment variables - Should not be touched +export AWS_PAGER="" # deactivate AWS CLI paging + +awscli_pkg="awscli-exe-linux-x86_64.zip" # AWS CLI file name +awscli_url="https://awscli.amazonaws.com/${awscli_pkg}" # AWS CLI url to download and install it +gh_adt_url="aws-samples/awsome-distributed-training" # GitHub name of the awsome-distributed-training repo +gh_adt_dir="$(basename "${gh_adt_url}")" # awsome-distributed-training repo folder name +smhp_config="cluster-config.json" # SMHP cluster config file name +env_vars="env_vars" # environement variables exported to be used in further scripts +instances_types="${!controller[@]} ${!login[@]} ${!worker[@]}" # complete list of all instance types used in the SMHP cluster to check quotas + +case "${orchestrator}" in + slurm) + vpc_cf_stack_file="sagemaker-hyperpod.yaml" + vpc_cf_stack_url="https://awsome-distributed-training.s3.amazonaws.com/templates/${vpc_cf_stack_file}" # should use local file + observability_cf_stack_url="https://awsome-distributed-training.s3.amazonaws.com/templates/cluster-observability.yaml" + ;; + eks) + vpc_cf_stack_file="hyperpod-eks-full-stack.yaml" + vpc_cf_stack_url="https://ws-assets-prod-iad-r-pdx-f3b3f9f1a7d6a3d0.s3.us-west-2.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/${vpc_cf_stack_file}" # should use local file + observability_cf_stack_url="" + ;; + *) + pecho "ERROR: unknown orchestrator \"${orchestrator}\"" + exit 1 + ;; +esac + +# Slurm only +lcs_path="${gh_adt_dir}/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config" +smhp_provisioning="provisioning_parameters.json" +account_id="$(aws sts get-caller-identity | jq -r '.Account')" + +# EKS only +NodeRecovery="Automatic" # EKS only (string) default to "Automatic" - Enable node auto-recovery. Set to "None" to disable. 
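+# NOTE: the kubectl download below is pinned to Kubernetes 1.29.x, matching the default
+# KubernetesVersion ("1.29") in hyperpod-eks-full-stack.yaml; bump both together when upgrading.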
+kubectl_url="https://s3.${region}.amazonaws.com/amazon-eks/1.29.3/2024-04-19/bin/linux/amd64/kubectl" +eksctl_url="https://github.com/eksctl-io/eksctl/releases/latest/download" +helm_url="https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3" +smhp_eks_policy_name="hyperpod-eks-policy" +lcs_eks_path="https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh" +gh_smhpcli_dir="sagemaker-hyperpod-cli" # aws/sagemaker-hyperpod-cli.git +gh_smhpcli_url="aws/${gh_smhpcli_dir}" +aws_fsx_csi_url="https://kubernetes-sigs.github.io/aws-fsx-csi-driver" +role_user_name=$(aws sts get-caller-identity --query "Arn" --output text | cut -d':' -f6 |cut -d'/' -f2 ) +role_user_arn=$(aws sts get-caller-identity --query "Arn" --output text ) +# nsight_sidecar="https://helm.ngc.nvidia.com/nvidia/devtools/charts/devtools-sidecar-injector-1.0.7.tgz" + +# pretty echo +pecho(){ + echo + echo "############## $@" +} + +# execute a command, print it first if verbose is set, exit if return code is non valid +run(){ + cmd="$@" + if [[ "$verbose" == "true" ]] ;then + echo "##### Running: \"${cmd}\" >" + fi + eval "${cmd}" + ret="$?" + if [[ ${ret} -ne 0 ]] ;then + echo + echo "##### Failed #####" + exit ${ret} + fi +} + +# same as run() but dedicated for validate-config.py only +# workaround for validate-config.py which does not return a valid error when failing +run_spe(){ + cmd="$@" + if [[ "$verbose" == "true" ]] ;then + echo "##### Running: \"${cmd}\" >" + fi + output=$(eval "${cmd}") + echo "${output}" + echo "${output}" | grep invalid &>/dev/null + if [[ ${?} -eq 0 ]] ;then + echo + echo "##### Failed #####" + exit 1 + fi +} + +# check all commands in args exist in path +dep_check(){ + for c in ${@} ;do + # $c --version | head -n1 + if ! command -v $c &> /dev/null ; then + echo "[WARNING] Command \"${c}\" can not be found." >&2 + # exit -1 + fi + done +} + +# wait for the CF stack in arg to be complete +cf_wait(){ + local stack="${1}" + + if [[ -n ${stack} ]] ;then + pecho "Waiting on CloudFormation stack \"${stack}\" to be fully deployed..." + aws cloudformation wait stack-create-complete --stack-name "${stack}" + if [[ $? -eq 0 ]] ;then + echo "...CloudFormation stack \"${stack}\" is deployed." + else + echo "ERROR: failed to wait on CloudFormation stack \"${stack}\"" + exit 1 + fi + fi +} + +# get a specific var out of a CF stack +cf_get_var(){ + cf_stack_name="${1}" + var="${2}" + output=$(aws cloudformation describe-stacks \ + --stack-name "${cf_stack_name}" \ + --query "Stacks[0].Outputs[?OutputKey==\`${var}\`].OutputValue" \ + --output text) + # --region "${region}" + + echo "${output}" +} + +# unset all variables from the CF stack from the environment +cf_unset_vars_all(){ + cf_stack_name="${1}" + output=$(aws cloudformation describe-stacks --stack-name "${cf_stack_name}" --output json 2>/dev/null) + if [[ $? -eq 0 ]] ;then + echo -n "Unsetting: " + for var in $(echo "${output}" | jq -r ".Stacks[].Outputs[].OutputKey") ;do + unset "${var}" + echo -n "${var} " + done | xargs echo -n | sed 's/ /, /g' + echo '.' 
+    fi
+}
+
+# export all variables from the CF stack to the environment
+cf_export_all(){
+    cf_stack_name="${1}"
+    output=$(aws cloudformation describe-stacks --stack-name "${cf_stack_name}" --output json | jq ".Stacks[].Outputs")
+    # --region "${region}"
+    read -d '' -r -a key <<< $(echo "${output}" | jq -r ".[].OutputKey")
+    read -d '' -r -a val <<< $(echo "${output}" | jq -r ".[].OutputValue")
+    for ((i=0;i<${#key[@]};++i)); do
+        # export "cf_${key[i]}"="${val[i]}"
+        # echo export "cf_${key[i]}"="${val[i]}"
+        if [[ -z "${!key[i]}" ]] ;then
+            # from cf stack
+            export "${key[i]}"="${val[i]}"
+            echo export "${key[i]}"="${val[i]}" | tee -a ${env_vars}
+        else
+            # Recap Customer input, already exported
+            echo export "${key[i]}"="${!key[i]}" | tee -a ${env_vars}
+        fi
+    done
+}
+
+# pre-initialize the environment
+init_env_pre(){
+    pecho "Pre-initializing the environment"
+
+    dep_check jq aws curl wget sed tr awk column
+    pwd_previous="$(pwd)"
+    pecho "Moving to ${install_dir}"
+    mkdir -p "${install_dir}"
+    mkdir -p "${bin_dir}"
+    cd "${install_dir}"
+    rm -f "${env_vars}"
+
+    region_previous=$(aws configure get region)
+    pecho "Current aws-cli region is set to \"${region_previous}\", setting it to \"${region}\""
+    export AWS_REGION="$region"
+    run aws configure set region $region # to avoid differences between configurations
+}
+
+# check whether ${SMHP_ClusterName} is already used as an SMHP cluster name, failing if it exists
+smhp_check_cluster(){
+    pecho "Checking if the HyperPod cluster \"${SMHP_ClusterName}\" exists already:"
+
+    aws sagemaker describe-cluster --cluster-name "${SMHP_ClusterName}" &> /dev/null
+    if [[ $? -eq 0 ]] ;then
+        status=$(aws sagemaker describe-cluster --cluster-name "${SMHP_ClusterName}" | jq -r .ClusterStatus)
+        pecho "Stopping: the HyperPod cluster \"${SMHP_ClusterName}\" exists already with the status \"${status}\"."
+        echo "You can change the ClusterName variable or delete it with:"
+        echo "aws sagemaker delete-cluster --cluster-name \"${SMHP_ClusterName}\""
+        echo
+        exit 1
+    else
+        echo "using \"${SMHP_ClusterName}\" as cluster name."
+ fi +} + +# listing SMHP cluters +smhp_list(){ + pecho "Listing current sagemaker HyperPod clusters:" + run 'aws sagemaker list-clusters --output text | sed "s/CLUSTERSUMMARIES//g" | column -t' +} + + +# Downloading and Deploying the VPC stack ${vpc_cf_stack_name} using environment variables +# waiting on its deployement and exporting the output variables +vpc_stack_deploy(){ + pecho "Downloading and Deploying the VPC stack \"${vpc_cf_stack_name}\"" + run wget -q "${vpc_cf_stack_url}" # should use local file instead + + case "${orchestrator}" in + slurm) + run aws cloudformation deploy \ + --capabilities CAPABILITY_NAMED_IAM \ + --stack-name "${vpc_cf_stack_name}" \ + --template-file "${vpc_cf_stack_file}" \ + --parameter-overrides \ + VPCName="SageMaker HyperPod VPC"\ + PrimarySubnetAZ=${az}\ + BackupSubnetAZ=""\ + CreateS3Endpoint='true'\ + Capacity=1200\ + S3Bucket="sagemaker-lifecycle"\ + PerUnitStorageThroughput=250\ + Compression="LZ4" + ;; + eks) + run aws cloudformation deploy \ + --capabilities CAPABILITY_NAMED_IAM \ + --stack-name "${vpc_cf_stack_name}" \ + --template-file "${vpc_cf_stack_file}" \ + --parameter-overrides \ + AvailabilityZoneId="${az}" \ + ResourceNamePrefix="${EKS_ResourceNamePrefix}" + ;; + esac + + cf_wait "${vpc_cf_stack_name}" + cf_export_all "${vpc_cf_stack_name}" +} + +# checking the VPC stack has been deployed with the expected AZ +# deploying it if needed +vpc_stack_check_deploy(){ + if [[ ${vpc_cf_stack_name} ]] ;then + pecho "Checking about the VPC stack \"${vpc_cf_stack_name}\":" + aws cloudformation describe-stacks --stack-name "${vpc_cf_stack_name}" &>/dev/null + if [[ $? -eq 0 ]] ;then + cf_wait "${vpc_cf_stack_name}" + echo + cf_export_all "${vpc_cf_stack_name}" + echo + case "${orchestrator}" in + slurm) private_subnet="${PrimaryPrivateSubnet}" ;; + eks) private_subnet="${PrivateSubnet1}" ;; + esac + cf_az=$(aws ec2 describe-subnets --subnet-ids "${private_subnet}" --query 'Subnets[0].AvailabilityZoneId' --output text) + if [[ -n ${az} ]] ;then + echo "You set az=\"${az}\". But the VPC stack \"${vpc_cf_stack_name}\" does exist already. It's deployed in AZ \"${cf_az}\"" + if [[ "${az}" != "${cf_az}" ]] ;then + echo "WARNING: the requested AZ differs (\"${az}\") from the AZ retrieved (\"${cf_az}\") from the VPC stack!" + fi + else + echo "We are using the VPC stack \"${vpc_cf_stack_name}\" already deployed in AZ \"${cf_az}\"." + fi + else + if [[ -n ${az} ]] ;then + vpc_stack_deploy + else + echo "The VPC stack \"${vpc_cf_stack_name}\" does not exist yet, you need to set the \"az\" variable in this script." + fi + fi + fi +} + +# Checking about the Observability stack ${observability_cf_stack_name} and deploys it if needed +check_obs_stack_slurm(){ + if [[ ${observability_cf_stack_name} ]] ;then + pecho "Checking about the Observability stack \"${observability_cf_stack_name}\":" + aws cloudformation describe-stacks --stack-name "${observability_cf_stack_name}" &>/dev/null + if [[ $? -eq 0 ]] ;then + echo "The Observability stack \"${observability_cf_stack_name}\" is deployed." 
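+        # stack not found yet: create it from the hosted observability template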
+ else + echo "Deploying the Observability stack \"${observability_cf_stack_name}\"" + run aws cloudformation create-stack \ + --capabilities CAPABILITY_NAMED_IAM \ + --stack-name "${observability_cf_stack_name}" \ + --template-url "${observability_cf_stack_url}" + fi + fi +} + +# future, not working yet +grafana_import_dashboard(){ + cf_wait "${observability_cf_stack_name}" + cf_export_all "${observability_cf_stack_name}" + # AMPRemoteWriteURL + # GrafanWorkspaceURL + # https://grafana.com/grafana/dashboards/4323-slurm-dashboard/ + # https://grafana.com/grafana/dashboards/1860-node-exporter-full/ + # https://grafana.com/grafana/dashboards/12239-nvidia-dcgm-exporter-dashboard/ + # https://grafana.com/grafana/dashboards/20579-efa-metrics-dev/ + # https://grafana.com/grafana/dashboards/20906-fsx-lustre/ + # AWS Data Sources -- Data Source -- CloudWatch and Prometheus with Region + # https://g-xxx.grafana-workspace.us-west-2.amazonaws.com/a/aws-datasource-provisioner-app/?tab=datasources&id=prometheus + # https://g-xxx.grafana-workspace.us-west-2.amazonaws.com/a/aws-datasource-provisioner-app/?tab=datasources&id=cloudwatch +} + +# Generate environement variables based on inputs and CloudFormation stack outputs +init_env(){ + pecho "Generate environement variables based on inputs and CloudFormation stack outputs:" + # export role_arn="arn:aws:iam::${account_id}:role/${role_name}" + run export role_name=$(basename "${AmazonSagemakerClusterExecutionRoleArn}") + run export SourceS3Uri="s3://${AmazonS3BucketName}/src" +} + +# install awscli since the last version is usually required to use the last update from SMHP improvements +install_awscli(){ + pecho "Installing/Updating the AWS CLI" + echo "Pre-update: $(aws --version)" + curl -s "${awscli_url}" -o "${awscli_pkg}" + rm -rf ./aws + unzip "${awscli_pkg}" > /dev/null + # ./aws/install --help + # -i, --install-dir # default: /usr/local/aws-cli + # -b, --bin-dir # default: /usr/local/bin + sudo ./aws/install --update > /dev/null + rm -rf ./aws + echo "Post-update: $(aws --version)" +} + +# checking quotas for instances and and EBS size per instances +check_quota(){ + # no action(s) due to API throttling risk + local quota_file="sagemaker.quota" + aws service-quotas list-service-quotas --service-code sagemaker \ + --query 'Quotas[].{Name:QuotaName,Value:Value,Metric:UsageMetric.MetricDimensions.Resource,Code:QuotaCode}' \ + --output text | tr " " "_" > ${quota_file} + + pecho "Checking your Quotas for SageMaker EBS" + grep "cluster/ebs_per_instance_max" < ${quota_file} + + pecho "Checking your Quotas for SageMaker instances used" + for itype in $(sort -u <<< $( tr ' ' '\n' <<< ${instances_types})) ;do + grep "cluster/${itype}" < ${quota_file} + done | column -t + +} + +# adding required policies for the observability stack +obs_enable_iam_hp_role(){ + pecho "Adding Observability policies to the role \"${role_name}\"" + run aws iam attach-role-policy --role-name ${role_name} --policy-arn arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess + run aws iam attach-role-policy --role-name ${role_name} --policy-arn arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess + + pecho "Checking policies of the role \"${role_name}\"" + run aws iam list-attached-role-policies --role-name ${role_name} --query 'AttachedPolicies[].PolicyName' + run aws iam list-role-policies --role-name ${role_name} +} + +# clone a specific GitHub repo +gh_clone(){ + local repo="${1}" + local folder="$(basename "${repo}")" + + pecho "Cloning the GitHub repository \"${repo}\":" 
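+    # remove any previous checkout so repeated runs always start from a fresh clone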
+ rm -rf "./${folder}" + # run git clone --depth=1 "https://github.com/${repo}" + run git clone "https://github.com/${repo}" +} + +# Life Cycle Script option swap to add a feature for example +lcs_option_swap(){ + local key="${1}" + local val="${2}" + local conf="config.py" + pecho "Swapping \"${key}\" to \"${val}\" in Life Cycle Script configuration file \"${conf}\"" + echo "Before: $(grep "${key}.*=" ${lcs_path}/${conf})" + sed -i "${lcs_path}/${conf}" -e "s#${key} =.*#${key} = ${val}#g" + echo "After: $(grep "${key}.*=" ${lcs_path}/${conf})" +} + +# Generating the HyperPod Slurm cluster configuration file +gen_cluster_conf_slurm(){ + local cluster_conf="${1}" # smhp_config + # needs bash arrays: controller, login, worker + pecho "Generating the HyperPod Slurm cluster configuration ${cluster_conf}" + + cat > ${cluster_conf} << EOL +{ + "ClusterName": "${SMHP_ClusterName}", + "InstanceGroups": [ +EOL + + for itype in "${!controller[@]}" ;do + icount=1 # only one + cat >> ${cluster_conf} << EOL + { + "InstanceGroupName": "controller-machine", + "InstanceType": "${itype}", + "InstanceStorageConfigs": [ + { + "EbsVolumeConfig": { + "VolumeSizeInGB": ${VolumeSizeInGB} + } + } + ], + "InstanceCount": ${icount}, + "LifeCycleConfig": { + "SourceS3Uri": "${SourceS3Uri}", + "OnCreate": "on_create.sh" + }, + "ExecutionRole": "${AmazonSagemakerClusterExecutionRoleArn}", + "ThreadsPerCore": 2 + }, +EOL + break # only one Instance Type + done + + group=1 + for itype in "${!login[@]}" ;do + icount="${login[${itype}]}" + cat >> ${cluster_conf} << EOL + { + "InstanceGroupName": "login-group", + "InstanceType": "${itype}", + "InstanceStorageConfigs": [ + { + "EbsVolumeConfig": { + "VolumeSizeInGB": ${VolumeSizeInGB} + } + } + ], + "InstanceCount": ${icount}, + "LifeCycleConfig": { + "SourceS3Uri": "${SourceS3Uri}", + "OnCreate": "on_create.sh" + }, + "ExecutionRole": "${AmazonSagemakerClusterExecutionRoleArn}", + "ThreadsPerCore": 2 + }, +EOL + # "InstanceGroupName": "login-group-${group}", + ((group++)) + done + + group=1 + for itype in "${!worker[@]}" ;do + icount="${worker[${itype}]}" + if [[ $group -ge 2 ]] ;then echo ' ,' >> ${cluster_conf} ;fi + cat >> ${cluster_conf} << EOL + { + "InstanceGroupName": "worker-group-${group}", + "InstanceType": "${itype}", + "InstanceCount": ${icount}, + "InstanceStorageConfigs": [ + { + "EbsVolumeConfig": { + "VolumeSizeInGB": ${VolumeSizeInGB} + } + } + ], + "LifeCycleConfig": { + "SourceS3Uri": "${SourceS3Uri}", + "OnCreate": "on_create.sh" + }, + "ExecutionRole": "${AmazonSagemakerClusterExecutionRoleArn}", + "ThreadsPerCore": 1 + } +EOL + ((group++)) + done + echo ' ],' >> ${cluster_conf} + + if [[ ${#tag[@]} -ge 1 ]] ;then + echo ' "Tags": [' >> ${cluster_conf} + group=1 + for key in "${!tag[@]}" ;do + val="${tag[${key}]}" + if [[ $group -ge 2 ]] ;then echo ' ,' >> ${cluster_conf} ;fi + cat >> ${cluster_conf} << EOL + { + "Key": "${key}", + "Value": "${val}" + } +EOL + ((group++)) + done + echo ' ],' >> ${cluster_conf} + fi + + cat >> ${cluster_conf} << EOL + "VpcConfig": { + "SecurityGroupIds": ["${SecurityGroup}"], + "Subnets":["${PrimaryPrivateSubnet}"] + } +} +EOL +} + +# Generating the HyperPod cluster provisioning file +gen_cluster_provisioning_slurm(){ + local provisioning_conf="${1}" # smhp_provisioning + # needs bash arrays: controller, login, worker + pecho "Generating the HyperPod cluster provisioning ${provisioning_conf}" + + cat > ${provisioning_conf} << EOL +{ + "version": "1.0.0", + "workload_manager": "slurm", + "controller_group": 
"controller-machine", +EOL + + if [[ ${#login[@]} -ge 1 ]] ;then + echo ' "login_group": "login-group",' >> ${provisioning_conf} + # echo ' "login_groups": [' >> ${smhp_provisioning} + # group=1 + # for itype in "${!login[@]}" ;do + # if [[ $group -ge 2 ]] ;then echo ' ,' >> ${smhp_provisioning} ;fi + # cat >> ${smhp_provisioning} << EOL + # { + # "instance_group_name": "login-group-${group}" + # } + # EOL + # # "partition_name": "${itype}" + # ((group++)) + # done + fi + # echo " ]," >> ${smhp_provisioning} + + echo ' "worker_groups": [' >> ${provisioning_conf} + group=1 + for itype in "${!worker[@]}" ;do + if [[ $group -ge 2 ]] ;then echo ' ,' >> ${provisioning_conf} ;fi + cat >> ${provisioning_conf} << EOL + { + "instance_group_name": "worker-group-${group}", + "partition_name": "${itype}" + } +EOL + ((group++)) + done + echo " ]," >> ${provisioning_conf} + + cat >> ${provisioning_conf} << EOL + "fsx_dns_name": "${FSxLustreFilesystemDNSname}", + "fsx_mountname": "${FSxLustreFilesystemMountname}" +} +EOL +} + +# install a local python venv to validate the conf produced +install_boto_venv(){ + local venv="${1}" # venv_smhp + pecho "Installing Python3 boto3 in \"${venv}\"" + + rm -rf "${venv}" + python3 -m venv "${venv}" + source ${venv}/bin/activate + python3 -m pip install --upgrade pip > /dev/null + python3 -m pip install --upgrade boto3 > /dev/null + python3 -m pip install --upgrade jsonschema > /dev/null + deactivate +} + +# validate the conf produced +validate_slurm_config(){ + source ${venv_smhp}/bin/activate + pecho "Validating the configuration files:" + run_spe python3 ${gh_adt_dir}/1.architectures/5.sagemaker-hyperpod/validate-config.py \ + --cluster-config ${smhp_config} \ + --provisioning-parameters ${smhp_provisioning} + deactivate +} + +# Uploading Life Cycle Scripts and configuration to the S3 bucket +upload_lcs_slurm(){ + pecho "Uploading Life Cycle Scripts and configuration to the S3 bucket \"${SourceS3Uri}\"" + # run "aws s3 rm ${SourceS3Uri} --recursive > /dev/null" # WARNING + run "aws s3 cp ${smhp_provisioning} ${SourceS3Uri}/ > /dev/null" + run "aws s3 cp --recursive ${gh_adt_dir}/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/ ${SourceS3Uri} > /dev/null" + run aws s3 ls ${SourceS3Uri} --recursive --summarize --human-readable + + pecho "Checking \"${smhp_provisioning}\" upload:" + remote="$(aws s3 cp ${SourceS3Uri}/${smhp_provisioning} - | md5sum)" + loc="$(cat ${smhp_provisioning} | md5sum)" + if [[ "${remote}" == "${loc}" ]] ;then + echo "\"${smhp_provisioning}\" successfully uploaded." + else + echo "\"${smhp_provisioning}\" upload error, aborting." + exit 1 + fi +} + +# Trigger the HyperPod cluster creation +cluster_create(){ + pecho "Trigger the HyperPod cluster creation" + date + run aws sagemaker create-cluster --cli-input-json file://${smhp_config} --output text + # --region ${region} + +} + +# Waiting on the HyperPod cluster creation +cluster_wait(){ + echo -n "Waiting on the HyperPod cluster \"${SMHP_ClusterName}\" to be ready..." + while true ;do + status=$(aws sagemaker describe-cluster --cluster-name "${SMHP_ClusterName}" | jq -r .ClusterStatus) + case "${status}" in + "Creating") + echo -n "." 
+ sleep 5 + ;; + "Failed") + echo + echo "the HyperPod cluster \"${SMHP_ClusterName}\" status is \"$status\"" + date + aws sagemaker describe-cluster --cluster-name "${SMHP_ClusterName}" | jq -r .FailureMessage + echo + pecho "Deleting \"${SMHP_ClusterName}\":" + run aws sagemaker delete-cluster --cluster-name "${SMHP_ClusterName}" + exit 1 + break + ;; + "RollingBack") + echo + echo "the HyperPod cluster \"${SMHP_ClusterName}\" status is \"$status\"" + date + run 'aws sagemaker describe-cluster --cluster-name "${SMHP_ClusterName}" | jq -r .FailureMessage' + echo + echo "Once the cluster will be in \"Failed\" status, you can delete it with:" + echo "aws sagemaker delete-cluster --cluster-name \"${SMHP_ClusterName}\"" + exit 1 + break + ;; + *|"InService") + echo + echo "the HyperPod cluster \"${SMHP_ClusterName}\" status is \"$status\"" + date + smhp_list + break + ;; + esac + done +} + +# revert the awscli region conf setting +close_env(){ + pecho "Reversing back to your previous configuration " + cd "${pwd_previous}" + run aws configure set region ${region_previous} # revert client environment modification +} + +# preparing for SSH/SSM to clusters instances +gen_ssm_cli_slurm(){ + local cn="${1}" # SMHP_ClusterName + + cluster_id=$(aws sagemaker describe-cluster --cluster-name "${cn}" | jq -r '.ClusterArn' | awk -F/ '{gsub(/"/, "", $NF); print $NF}') + for node_group in $(aws sagemaker describe-cluster --cluster-name "${cn}" | jq -r ".InstanceGroups[].InstanceGroupName") ;do + node_group_list="$(aws sagemaker list-cluster-nodes --cluster-name "${cn}" --instance-group-name-contains "${node_group}" | jq -r '.ClusterNodeSummaries[].InstanceId')" + echo "For ${node_group} ($(echo ${node_group_list} | wc -w) nodes):" + for instance_id in ${node_group_list} ;do + cmd="aws ssm start-session --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id}" + echo " ${cmd}" + done + done + + pecho "Getting \"easy-ssh.sh\"" + run curl -s -O https://raw.githubusercontent.com/${gh_adt_url}/main/1.architectures/5.sagemaker-hyperpod/easy-ssh.sh + chmod +x easy-ssh.sh + echo "you can use ./easy-ssh.sh -c controller-machine ${cn}" +} + + +# EKS only - Install kubectl +install_kubectl(){ + local cmd="kubectl" + pecho "Install ${cmd}" + + curl -s -O "${kubectl_url}" + chmod +x ./${cmd} + sudo cp ./${cmd} ${bin_dir}/${cmd} +} + +# EKS only - Install eksctl +install_eksctl(){ + local arch=$(uname -m) + case ${arch} in + armv5*) arch="armv5" ;; + armv6*) arch="armv6" ;; + armv7*) arch="arm" ;; + aarch64) arch="arm64" ;; + x86) arch="386" ;; + x86_64) arch="amd64" ;; + i686) arch="386" ;; + i386) arch="386" ;; + esac + # local arch="amd64" # for ARM systems, set arch to: `arm64`, `armv6` or `armv7` + local platform="$(uname -s)_${arch}" + pecho "Install eksctl ${platform}" + + curl -sLO "${eksctl_url}/eksctl_${platform}.tar.gz" + # (Optional) Verify checksum + curl -sL "${eksctl_url}/eksctl_checksums.txt" | grep ${platform} | sha256sum --check + tar -xzf eksctl_${platform}.tar.gz -C ${install_dir}/ && rm eksctl_${platform}.tar.gz + sudo mv ${install_dir}/eksctl ${bin_dir}/ +} + +# EKS only - Install Helm +install_helm(){ + export HELM_INSTALL_DIR="${bin_dir}" + local script="get_helm.sh" + pecho "Install helm" + + curl -fsSL -o ${script} "${helm_url}" + chmod 700 ${script} + ./${script} + # rm -f ${script} +} + +# EKS only - Adding policy ${smhp_eks_policy_name} to ${role_user_name} +eks_hp_enable_iam_user_role(){ + local policy_file="hyperpod-eks-policy.json" + pecho "Adding policy 
\"${smhp_eks_policy_name}\" to \"${role_user_name}\"" + + cat > ${policy_file} << EOL +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "${AmazonSagemakerClusterExecutionRoleArn}" + }, + { + "Effect": "Allow", + "Action": [ + "sagemaker:CreateCluster", + "sagemaker:DeleteCluster", + "sagemaker:DescribeCluster", + "sagemaker:DescribeCluterNode", + "sagemaker:ListClusterNodes", + "sagemaker:ListClusters", + "sagemaker:UpdateCluster", + "sagemaker:UpdateClusterSoftware", + "sagemaker:DeleteClusterNodes", + "eks:DescribeCluster", + "eks:CreateAccessEntry", + "eks:DescribeAccessEntry", + "eks:DeleteAccessEntry", + "eks:AssociateAccessPolicy", + "iam:CreateServiceLinkedRole" + ], + "Resource": "*" + } + ] +} +EOL + + aws iam create-policy \ + --policy-name "${smhp_eks_policy_name}" \ + --policy-document file://${policy_file} + + aws iam attach-role-policy \ + --policy-arn "arn:aws:iam::${account_id}:policy/${smhp_eks_policy_name}" \ + --role-name "${role_user_name}" +} + +# EKS only - Upload the OnCreate script to ${SourceS3Uri} +upload_lcs_eks(){ + pecho "Upload the OnCreate LCS to ${SourceS3Uri}" + + local OnCreate="on_create.sh" + curl -s "${lcs_eks_path}" --output "${OnCreate}" + aws s3 cp "${OnCreate}" "${SourceS3Uri}/" +} + +# EKS only - Configure the EKS Cluster and add the required access-entry and access-policy +eks_configure(){ + pecho "Configure the EKS Cluster" + + run aws eks update-kubeconfig --name "${ClusterName}" + + ls -lh ${HOME}/.kube/config + cat ${HOME}/.kube/config + + run kubectl config current-context + + pecho "Create additional access entries, to give your IAM principal access your EKS cluster, with an access policy and its scope" + + aws eks create-access-entry --cluster-name ${ClusterName} --principal-arn ${role_user_arn} --type STANDARD + # --username ${role_user_name} + aws eks list-access-entries --cluster-name ${ClusterName} + aws eks associate-access-policy \ + --cluster-name ${ClusterName} --principal-arn ${role_user_arn} \ + --access-scope type=cluster --policy-arn "arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy" + aws eks list-associated-access-policies --cluster-name ${ClusterName} --principal-arn ${role_user_arn} +} + +# EKS only - Install the Helm Chart, update dependencies, dry run, deploy and list them, and locally test the helm chart +eks_install_depencies(){ + pecho "Install the Helm Chart, update dependencies, dry run, deploy and list them, and locally test the helm chart" + + gh_clone "${gh_smhpcli_url}" + cd "${gh_smhpcli_dir}/helm_chart" + helm lint HyperPodHelmChart + helm dependencies update HyperPodHelmChart + helm install dependencies HyperPodHelmChart --dry-run + helm install dependencies HyperPodHelmChart --namespace kube-system + run helm list --namespace kube-system + cd - + + cd "${gh_smhpcli_dir}" + source ${venv_smhp}/bin/activate + pip install . 
+ which hyperpod + deactivate + cd - + + # https://catalog.ngc.nvidia.com/orgs/nvidia/teams/devtools/helm-charts/devtools-sidecar-injector + # NVIDIA DevTools Sidecar Injector + # helm install -f custom_values.yaml devtools-sidecar-injector "${nsight_sidecar}" + +} + +# EKS only - Display everything which has been installed in EKS +eks_list(){ + pecho "Display everything which has been installed in EKS" + + run kubectl get svc + run kubectl get ds health-monitoring-agent -n aws-hyperpod + run kubectl get ds dependencies-nvidia-device-plugin -n kube-system + run kubectl get ds neuron-device-plugin-daemonset -n kube-system + run kubectl get ds dependencies-aws-efa-k8s-device-plugin -n kube-system + run kubectl get deploy dependencies-training-operators -n kubeflow + run kubectl get crd | grep kubeflow + run kubectl get deploy dependencies-mpi-operator -n kube-system + run kubectl get crd mpijobs.kubeflow.org -n kubeflow -o jsonpath='{.status.storedVersions[]}' + run kubectl get priorityclass + run kubectl describe pvc fsx-claim + run kubectl get storageclass + run kubectl get nodes -o wide + run helm list --namespace kube-system + run kubectl get namespaces +} + +# Generating the HyperPod EKS cluster configuration +gen_cluster_conf_eks(){ + local cluster_conf="${1}" # smhp_config + pecho "Generating the HyperPod EKS cluster configuration ${cluster_conf}" + + cat > ${cluster_conf} << EOL +{ + "ClusterName": "${SMHP_ClusterName}", + "Orchestrator": { + "Eks": + { + "ClusterArn": "${ClusterArn}" + } + }, + "InstanceGroups": [ +EOL + + group=1 + for itype in "${!worker[@]}" ;do + icount="${worker[${itype}]}" + if [[ $group -ge 2 ]] ;then echo ' ,' >> ${cluster_conf} ;fi + cat >> ${cluster_conf} << EOL + { + "InstanceGroupName": "worker-group-${group}", + "InstanceType": "${itype}", + "InstanceCount": ${icount}, + "InstanceStorageConfigs": [ + { + "EbsVolumeConfig": { + "VolumeSizeInGB": ${VolumeSizeInGB} + } + } + ], + "LifeCycleConfig": { + "SourceS3Uri": "${SourceS3Uri}", + "OnCreate": "on_create.sh" + }, + "ExecutionRole": "${AmazonSagemakerClusterExecutionRoleArn}", + "ThreadsPerCore": 1 +EOL + if [[ "${itype}" =~ ml\.[gp][0-9]+[a-z]*\.[0-9]+xlarge ]] ;then + echo ' "OnStartDeepHealthChecks": ["InstanceStress", "InstanceConnectivity"]' >> ${cluster_conf} + fi + echo ' }' >> ${cluster_conf} + ((group++)) + done + echo ' ],' >> ${cluster_conf} + + if [[ ${#tag[@]} -ge 1 ]] ;then + echo ' "Tags": [' >> ${cluster_conf} + group=1 + for key in "${!tag[@]}" ;do + val="${tag[${key}]}" + if [[ $group -ge 2 ]] ;then echo ' ,' >> ${cluster_conf} ;fi + cat >> ${cluster_conf} << EOL + { + "Key": "${key}", + "Value": "${val}" + } +EOL + ((group++)) + done + echo ' ],' >> ${cluster_conf} + fi + + cat >> ${cluster_conf} << EOL + "VpcConfig": { + "SecurityGroupIds": ["${NoIngressSecurityGroup}"], + "Subnets":["${PrivateSubnet1}"] + }, + "NodeRecovery": "${NodeRecovery}" +} +EOL +} + +# EKS only - Install the Amazon FSx for Lustre CSI Driver fsx-csi-controller +eks_setup_fsxl_csi(){ + local driver="$(basename "${aws_fsx_csi_url}")" # aws-fsx-csi-driver + local role_eks_fsxl="AmazonEKSFSxLustreCSIDriverFullAccess" + local role_sa_arn=$(aws iam get-role --role-name "${role_eks_fsxl}" --query 'Role.Arn' --output text) + local fsx_csi="fsx-csi-controller" + local sa_name="${fsx_csi}-sa" + + pecho "Install the Amazon FSx for Lustre CSI Driver fsx-csi-controller" + + eksctl utils associate-iam-oidc-provider --cluster "${ClusterName}" --approve + helm repo add "${driver}" "${aws_fsx_csi_url}" + helm repo update + 
helm upgrade --install "${driver}" "${driver}/${driver}" --namespace kube-system + + eksctl create iamserviceaccount \ + --name "${sa_name}" \ + --override-existing-serviceaccounts \ + --namespace kube-system \ + --cluster "${ClusterName}" \ + --attach-policy-arn "arn:aws:iam::aws:policy/AmazonFSxFullAccess" \ + --approve \ + --role-name "${role_eks_fsxl}" + + + kubectl annotate serviceaccount -n kube-system "${sa_name}" \ + eks.amazonaws.com/role-arn=${role_sa_arn} --overwrite=true + + kubectl get serviceaccount -n kube-system "${sa_name}" -oyaml + kubectl rollout restart deployment "${fsx_csi}" -n kube-system +} + +# EKS only - Install FSx for Lustre CSI Driver Dynamic Provisioning with StorageClass fsx-sc provisioner: fsx.csi.aws.com and PersistentVolumeClaim fsx-claim +eks_gen_fsxl_csi_dynamic(){ + sc_fsxl_conf="storageclass.yaml" + pvc_fsxl_conf="pvc.yaml" + pecho "Install FSx for Lustre CSI Driver Dynamic Provisioning with StorageClass fsx-sc provisioner: fsx.csi.aws.com and PersistentVolumeClaim fsx-claim" + + cat << EOF > ${sc_fsxl_conf} +kind: StorageClass +apiVersion: storage.k8s.io/v1 +metadata: + name: fsx-sc +provisioner: fsx.csi.aws.com +parameters: + subnetId: ${PrivateSubnet1} + securityGroupIds: ${NoIngressSecurityGroup} + deploymentType: PERSISTENT_2 + automaticBackupRetentionDays: "0" + copyTagsToBackups: "true" + perUnitStorageThroughput: "250" + dataCompressionType: "LZ4" + fileSystemTypeVersion: "2.12" +mountOptions: + - flock +EOF + + cat < ${pvc_fsxl_conf} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: fsx-claim +spec: + accessModes: + - ReadWriteMany + storageClassName: fsx-sc + resources: + requests: + storage: 1200Gi +EOF + + kubectl apply -f ${sc_fsxl_conf} + kubectl apply -f ${pvc_fsxl_conf} + +} + +# TBD +# eks_gen_fsxl_csi_static(){ +# pecho "FSx for Lustre CSI Driver Static Provisioning" +# } + + +# main for SMHP Slurm +main_slurm(){ + + init_env_pre + + check_quota + smhp_list + + smhp_check_cluster "${SMHP_ClusterName}" + vpc_stack_check_deploy + check_obs_stack_slurm + + init_env + + obs_enable_iam_hp_role + # grafana_import_dashboard & + + install_awscli + gh_clone "${gh_adt_url}" + + # lcs_option_swap enable_mount_s3 True + # lcs_option_swap data_bucket \"${AmazonS3BucketName}\" + lcs_option_swap "enable_observability" "True" + + gen_cluster_conf_slurm "${smhp_config}" + gen_cluster_provisioning_slurm "${smhp_provisioning}" + + install_boto_venv "${venv_smhp}" + validate_slurm_config + + upload_lcs_slurm + + cluster_create + cluster_wait + + # pecho "Wait on previous background fonctions..." ; wait ; echo "...all done." 
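+    # print per-node SSM connection commands (and fetch easy-ssh.sh), then restore the caller's AWS CLI region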
+ + gen_ssm_cli_slurm "${SMHP_ClusterName}" + close_env +} + +# main for SMHP EKS +main_eks(){ + + init_env_pre + + check_quota + smhp_list + + smhp_check_cluster "${SMHP_ClusterName}" + vpc_stack_check_deploy + + init_env + + install_awscli + install_kubectl + install_eksctl + install_helm + + eks_hp_enable_iam_user_role + upload_lcs_eks + eks_configure + eks_install_depencies + gen_cluster_conf_eks "${smhp_config}" + eks_setup_fsxl_csi + eks_gen_fsxl_csi_dynamic + # eks_gen_fsxl_csi_static + + eks_list + cluster_create + cluster_wait + + gen_ssm_cli_slurm "${SMHP_ClusterName}" + close_env +} + +# main +main(){ + case "${orchestrator}" in + slurm) + main_slurm $@ + ;; + eks) + main_eks $@ + ;; + *) + pecho "ERROR: unknown orchestrator \"${orchestrator}\"" + exit 1 + ;; + esac +} + + +main $@ + +exit + + + + + + diff --git a/1.architectures/5.sagemaker-hyperpod/easy_smhp/hyperpod-eks-full-stack.yaml b/1.architectures/5.sagemaker-hyperpod/easy_smhp/hyperpod-eks-full-stack.yaml new file mode 100644 index 00000000..024cdc9f --- /dev/null +++ b/1.architectures/5.sagemaker-hyperpod/easy_smhp/hyperpod-eks-full-stack.yaml @@ -0,0 +1,705 @@ +AWSTemplateFormatVersion: '2010-09-09' +Description: This template deploys a VPC, with three public and private subnets spread + across three Availability Zones. It deploys an internet gateway, with a default + route on the public subnets. It deploys a NAT gateway in each AZ, + and default routes for them in the private subnets. This template also deploys an EKS cluster, + an IAM execution role for SageMaker, and an S3 bucket for storing lifecycle scripts. + +Metadata: + AWS::CloudFormation::Interface: + ParameterGroups: + - + Label: + default: "Set the Stack Deployment Mode" + Parameters: + - CreateEKSCluster + - CreateSubnet + - ResourceNamePrefix + - + Label: + default: "Common Parameters (for Full and Integrative Deployment Modes)" + Parameters: + - AvailabilityZoneId + - PrivateSubnet1CIDR + - + Label: + default: "Full Deployment Mode Parameters" + Parameters: + - VpcCIDR + - PublicSubnet1CIDR + - PublicSubnet2CIDR + - PublicSubnet3CIDR + - EKSPrivateSubnet1CIDR + - EKSPrivateSubnet2CIDR + - EKSPrivateSubnet3CIDR + - KubernetesVersion + - + Label: + default: "Integrative Deployment Mode Parameters" + Parameters: + - VpcId + - NatGatewayId + - SecurityGroupId + + ParameterLabels: + CreateEKSCluster: + default: "[Full Deployment Mode] CreateEKSCluster: Do you need to create a new VPC and EKS cluster?" + CreateSubnet: + default: "[Integrative Deployment Mode] CreateSubnet: Do you need a new private /16 subnet for an existing VPC and EKS cluster?" + +Parameters: + + CreateEKSCluster: + Description: 'Boolean to enable or disable the creation of VPC and EKS Cluster resources' + Type: String + AllowedValues: ['true', 'false'] + Default: 'true' + + CreateSubnet: + Description: 'Boolean to enable or disable the creation of a /16 private subnet. Note, this parameter is only evaluated when the CreateEKSCluster parameter is set to false' + Type: String + AllowedValues: ['true', 'false'] + Default: 'false' + + ResourceNamePrefix: + Description: 'Prefix to be used for all resources created by this template' + Type: String + Default: 'hyperpod-eks' + +### ---------------- VPC Params ----------------### + AvailabilityZoneId: + Description: Please specify the Availability Zone Id you wish to deploy HyperPod ENIs into. Note, this parameter is not evaluated if you are using your own EKS Cluster and your own private subnet (CreateEKSCluster=false, CreateSubnet=false). 
+ Type: String + Default: usw2-az2 + AllowedPattern: ^[a-z]{3,4}[0-9]-az[0-9]$ + ConstraintDescription: The Availability Zone Id must match the expression ^[a-z]{3,4}[0-9]-az[0-9]$. For example, use1-az4, usw2-az2, or apse1-az2. + + VpcId: + Description: (OPTIONAL) If you are using an Existing EKS Cluster, please specify the Id of your VPC (CreateEKSCluster=false, CreateSubnet=true). + Type: String + Default: vpc-1234567890abcdef0 + + VpcCIDR: + Description: Please enter the IP range (CIDR notation) for this VPC + Type: String + Default: 10.192.0.0/16 + + SecurityGroupId: + Description: (OPTIONAL) If you are using an Existing EKS Cluster, please specify the Id of your cluster security group (CreateEKSCluster=false, CreateSubnet=true). + Type: String + Default: sg-1234567890abcdef0 + + NatGatewayId: + Description: (OPTIONAL) If you are using an Existing EKS Cluster, please specify the Id of a NAT Gateway to route enternet bound traffic to (CreateEKSCluster=false, CreateSubnet=true). + Type: String + Default: nat-1234567890abcdef0 + + PublicSubnet1CIDR: + Description: Please enter the IP range (CIDR notation) for the public subnet in the first Availability Zone + Type: String + Default: 10.192.10.0/24 + + PublicSubnet2CIDR: + Description: Please enter the IP range (CIDR notation) for the public subnet in the second Availability Zone + Type: String + Default: 10.192.11.0/24 + + PublicSubnet3CIDR: + Description: Please enter the IP range (CIDR notation) for the public subnet in the third Availability Zone + Type: String + Default: 10.192.12.0/24 + + PrivateSubnet1CIDR: + Description: Please enter the IP range (CIDR notation) for the private subnet in the first Availability Zone + Type: String + Default: 10.1.0.0/16 + +### ---------------- EKS Params ----------------### + KubernetesVersion: + Description: Kubernetes version to use for EKS cluster + Type: String + Default: '1.29' + + EKSPrivateSubnet1CIDR: + Description: Please enter the IP range (CIDR notation) for the EKS private subnet in the first Availability Zone. EKS will use this subnet to deploy cross-account ENIs. + Type: String + Default: 10.192.7.0/28 + + EKSPrivateSubnet2CIDR: + Description: Please enter the IP range (CIDR notation) for the EKS private subnet in the second Availability Zone. EKS will use this subnet to deploy cross-account ENIs. + Type: String + Default: 10.192.8.0/28 + + EKSPrivateSubnet3CIDR: + Description: Please enter the IP range (CIDR notation) for the EKS private subnet in the third Availability Zone. EKS will use this subnet to deploy cross-account ENIs. 
+ Type: String + Default: 10.192.9.0/28 + +### ---------------- Conditions ----------------### + +Conditions: + CreateEKSCluster: !Equals + - !Ref CreateEKSCluster + - 'true' + + CreateSubnet: !And + - !Not [!Condition CreateEKSCluster] + - !Equals [!Ref CreateSubnet, 'true'] + + EKSOrSubnet: !Or + - !Condition CreateEKSCluster + - !Condition CreateSubnet + +Resources: +### ---------------- VPC Resources ----------------### + VPC: + Type: AWS::EC2::VPC + Condition: CreateEKSCluster + Properties: + CidrBlock: !Ref VpcCIDR + EnableDnsSupport: true + EnableDnsHostnames: true + Tags: + - Key: Name + Value: !Sub '${ResourceNamePrefix}-vpc' + + AdditionalCidrBlock1: + Type: AWS::EC2::VPCCidrBlock + Condition: EKSOrSubnet + Properties: + VpcId: !If + - CreateEKSCluster + - !Ref VPC + - !Ref VpcId + CidrBlock: !Ref PrivateSubnet1CIDR + AmazonProvidedIpv6CidrBlock: false + + InternetGateway: + Type: AWS::EC2::InternetGateway + Condition: CreateEKSCluster + Properties: + Tags: + - Key: Name + Value: !Sub ${ResourceNamePrefix}-igw + + InternetGatewayAttachment: + Type: AWS::EC2::VPCGatewayAttachment + Condition: CreateEKSCluster + Properties: + InternetGatewayId: !Ref InternetGateway + VpcId: !Ref VPC + + PublicSubnet1: + Type: AWS::EC2::Subnet + Condition: CreateEKSCluster + Properties: + VpcId: !Ref VPC + AvailabilityZone: !Select [ 0, !GetAZs '' ] + CidrBlock: !Ref PublicSubnet1CIDR + MapPublicIpOnLaunch: true + Tags: + - Key: Name + Value: !Sub ${ResourceNamePrefix} Public Subnet (AZ1) + + PublicSubnet2: + Type: AWS::EC2::Subnet + Condition: CreateEKSCluster + Properties: + VpcId: !Ref VPC + AvailabilityZone: !Select [ 1, !GetAZs '' ] + CidrBlock: !Ref PublicSubnet2CIDR + MapPublicIpOnLaunch: true + Tags: + - Key: Name + Value: !Sub ${ResourceNamePrefix} Public Subnet (AZ2) + + PublicSubnet3: + Type: AWS::EC2::Subnet + Condition: CreateEKSCluster + Properties: + VpcId: !Ref VPC + AvailabilityZone: !Select [ 2, !GetAZs '' ] + CidrBlock: !Ref PublicSubnet3CIDR + MapPublicIpOnLaunch: true + Tags: + - Key: Name + Value: !Sub ${ResourceNamePrefix} Public Subnet (AZ3) + + # Used for HyperPod Instance Group Placement + PrivateSubnet1: + Type: AWS::EC2::Subnet + Condition: EKSOrSubnet + Properties: + VpcId: !If + - CreateEKSCluster + - !Ref VPC + - !Ref VpcId + AvailabilityZoneId: !Ref AvailabilityZoneId # select a specific AZ for capacity + CidrBlock: !Ref PrivateSubnet1CIDR + MapPublicIpOnLaunch: false + Tags: + - Key: Name + Value: !Sub ${ResourceNamePrefix} Private Subnet 1 + DependsOn: + - AdditionalCidrBlock1 + + EKSPrivateSubnet1: + Type: AWS::EC2::Subnet + Condition: CreateEKSCluster + Properties: + VpcId: !Ref VPC + AvailabilityZone: !Select [ 0, !GetAZs '' ] + CidrBlock: !Ref EKSPrivateSubnet1CIDR + MapPublicIpOnLaunch: false + Tags: + - Key: Name + Value: !Sub ${ResourceNamePrefix} EKS Cluster Private Subnet 1 + + EKSPrivateSubnet2: + Type: AWS::EC2::Subnet + Condition: CreateEKSCluster + Properties: + VpcId: !Ref VPC + AvailabilityZone: !Select [ 1, !GetAZs '' ] + CidrBlock: !Ref EKSPrivateSubnet2CIDR + MapPublicIpOnLaunch: false + Tags: + - Key: Name + Value: !Sub ${ResourceNamePrefix} EKS Cluster Private Subnet 2 + + EKSPrivateSubnet3: + Type: AWS::EC2::Subnet + Condition: CreateEKSCluster + Properties: + VpcId: !Ref VPC + AvailabilityZone: !Select [ 2, !GetAZs '' ] + CidrBlock: !Ref EKSPrivateSubnet3CIDR + MapPublicIpOnLaunch: false + Tags: + - Key: Name + Value: !Sub ${ResourceNamePrefix} EKS Cluster Private Subnet 3 + + NatGateway1EIP: + Type: AWS::EC2::EIP + Condition: CreateEKSCluster 
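+    # the NAT gateway EIPs require the internet gateway to be attached to the VPC first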
+ DependsOn: InternetGatewayAttachment + Properties: + Domain: vpc + + NatGateway2EIP: + Type: AWS::EC2::EIP + Condition: CreateEKSCluster + DependsOn: InternetGatewayAttachment + Properties: + Domain: vpc + + NatGateway1: + Type: AWS::EC2::NatGateway + Condition: CreateEKSCluster + Properties: + AllocationId: !GetAtt NatGateway1EIP.AllocationId + SubnetId: !Ref PublicSubnet1 + + NatGateway2: + Type: AWS::EC2::NatGateway + Condition: CreateEKSCluster + Properties: + AllocationId: !GetAtt NatGateway2EIP.AllocationId + SubnetId: !Ref PublicSubnet2 + + PublicRouteTable: + Type: AWS::EC2::RouteTable + Condition: CreateEKSCluster + Properties: + VpcId: !Ref VPC + Tags: + - Key: Name + Value: !Sub ${ResourceNamePrefix} Public Routes + + DefaultPublicRoute: + Type: AWS::EC2::Route + Condition: CreateEKSCluster + DependsOn: InternetGatewayAttachment + Properties: + RouteTableId: !Ref PublicRouteTable + DestinationCidrBlock: 0.0.0.0/0 + GatewayId: !Ref InternetGateway + + PublicSubnet1RouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Condition: CreateEKSCluster + Properties: + RouteTableId: !Ref PublicRouteTable + SubnetId: !Ref PublicSubnet1 + + PublicSubnet2RouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Condition: CreateEKSCluster + Properties: + RouteTableId: !Ref PublicRouteTable + SubnetId: !Ref PublicSubnet2 + + PublicSubnet3RouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Condition: CreateEKSCluster + Properties: + RouteTableId: !Ref PublicRouteTable + SubnetId: !Ref PublicSubnet3 + + PrivateRouteTable: + Type: AWS::EC2::RouteTable + Condition: EKSOrSubnet + Properties: + VpcId: !If + - CreateEKSCluster + - !Ref VPC + - !Ref VpcId + Tags: + - Key: Name + Value: !Sub ${ResourceNamePrefix} Private Routes (AZ1) + + DefaultPrivateRoute1: + Type: AWS::EC2::Route + Condition: EKSOrSubnet + Properties: + RouteTableId: !Ref PrivateRouteTable + DestinationCidrBlock: 0.0.0.0/0 + NatGatewayId: !If + - CreateEKSCluster + - !Ref NatGateway1 + - !Ref NatGatewayId + + PrivateSubnet1RouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Condition: EKSOrSubnet + Properties: + RouteTableId: !Ref PrivateRouteTable + SubnetId: !Ref PrivateSubnet1 + + NoIngressSecurityGroup: + Type: AWS::EC2::SecurityGroup + Condition: EKSOrSubnet + Properties: + GroupName: !Sub '${ResourceNamePrefix}-no-ingress-sg' + GroupDescription: "Security group with no ingress rule" + VpcId: !If + - CreateEKSCluster + - !Ref VPC + - !Ref VpcId + + IntraSGCommunicationRule: + Type: AWS::EC2::SecurityGroupIngress + Condition: EKSOrSubnet + Properties: + Description: "Allow traffic within the security group" + GroupId: !Ref NoIngressSecurityGroup + IpProtocol: -1 + SourceSecurityGroupId: !Ref NoIngressSecurityGroup + + IntraSGCommunicationRuleEgress: + Type: AWS::EC2::SecurityGroupEgress + Condition: EKSOrSubnet + Properties: + Description: "Allow traffic within the security group" + GroupId: !Ref NoIngressSecurityGroup + IpProtocol: -1 + DestinationSecurityGroupId: !Ref NoIngressSecurityGroup + + InternetCommunicationRuleEgress: + Type: AWS::EC2::SecurityGroupEgress + Condition: EKSOrSubnet + Properties: + Description: "Allow traffic to internet" + GroupId: !Ref NoIngressSecurityGroup + CidrIp: 0.0.0.0/0 + IpProtocol: -1 + + FSxForLustreRule1: + Type: AWS::EC2::SecurityGroupIngress + Condition: EKSOrSubnet + Properties: + Description: "Allows Lustre traffic between FSx for Lustre file servers and Lustre clients" + GroupId: !Ref NoIngressSecurityGroup + 
IpProtocol: tcp + FromPort: 988 + ToPort: 988 + SourceSecurityGroupId: !Ref NoIngressSecurityGroup + + FSxForLustreRule2: + Type: AWS::EC2::SecurityGroupIngress + Condition: EKSOrSubnet + Properties: + Description: "Allows Lustre traffic between FSx for Lustre file servers and Lustre clients" + GroupId: !Ref NoIngressSecurityGroup + IpProtocol: tcp + FromPort: 1018 + ToPort: 1023 + SourceSecurityGroupId: !Ref NoIngressSecurityGroup + + CrossSGRuleIngress: + Type: AWS::EC2::SecurityGroupIngress + Condition: CreateSubnet + Properties: + Description: "Allow traffic across security groups" + GroupId: !Ref NoIngressSecurityGroup + IpProtocol: -1 + SourceSecurityGroupId: !Ref SecurityGroupId + + CrossSGRuleEgress: + Type: AWS::EC2::SecurityGroupEgress + Condition: CreateSubnet + Properties: + Description: "Allow traffic across security groups" + GroupId: !Ref NoIngressSecurityGroup + IpProtocol: -1 + DestinationSecurityGroupId: !Ref SecurityGroupId + + CrossSGRuleIngress2: + Type: AWS::EC2::SecurityGroupIngress + Condition: CreateSubnet + Properties: + Description: "Allow traffic across security groups" + GroupId: !Ref SecurityGroupId + IpProtocol: -1 + SourceSecurityGroupId: !Ref NoIngressSecurityGroup + + CrossSGRuleEgress2: + Type: AWS::EC2::SecurityGroupEgress + Condition: CreateSubnet + Properties: + Description: "Allow traffic across security groups" + GroupId: !Ref SecurityGroupId + IpProtocol: -1 + DestinationSecurityGroupId: !Ref NoIngressSecurityGroup + +### ---------------- EKS Cluster Resources ----------------### + ClusterRole: + Type: 'AWS::IAM::Role' + Condition: CreateEKSCluster + Properties: + RoleName: !Sub '${ResourceNamePrefix}-cluster-role-${AWS::Region}' + AssumeRolePolicyDocument: + Version: 2012-10-17 + Statement: + - Effect: Allow + Principal: + Service: + - eks.amazonaws.com + Action: + - 'sts:AssumeRole' + Path: / + ManagedPolicyArns: + - "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy" + + EKSCluster: + Type: 'AWS::EKS::Cluster' + Condition: CreateEKSCluster + Properties: + Name: !Sub '${ResourceNamePrefix}-cluster' + Version: !Ref KubernetesVersion + RoleArn: !GetAtt ClusterRole.Arn + AccessConfig: + AuthenticationMode: API_AND_CONFIG_MAP + Logging: + ClusterLogging: + EnabledTypes: + - Type: api + - Type: audit + - Type: authenticator + - Type: controllerManager + - Type: scheduler + ResourcesVpcConfig: + SubnetIds: + - !Ref EKSPrivateSubnet1 + - !Ref EKSPrivateSubnet2 + - !Ref EKSPrivateSubnet3 + SecurityGroupIds: + - !Ref NoIngressSecurityGroup + + + VpcCNIAddOn: + Type: 'AWS::EKS::Addon' + Condition: CreateEKSCluster + Properties: + AddonName: vpc-cni + ClusterName: !Ref EKSCluster + ResolveConflicts: OVERWRITE + + KubeProxyAddOn: + Type: 'AWS::EKS::Addon' + Condition: CreateEKSCluster + Properties: + AddonName: kube-proxy + ClusterName: !Ref EKSCluster + ResolveConflicts: OVERWRITE + + CoreDNSAddOn: + Type: 'AWS::EKS::Addon' + Condition: CreateEKSCluster + Properties: + AddonName: coredns + ClusterName: !Ref EKSCluster + ResolveConflicts: OVERWRITE + + PodIdentityAddOn: + Type: 'AWS::EKS::Addon' + Condition: CreateEKSCluster + Properties: + AddonName: eks-pod-identity-agent + ClusterName: !Ref EKSCluster + ResolveConflicts: OVERWRITE + +### ---------------- SageMaker Execution and Service Roles ----------------### + ExecutionRole: + Type: 'AWS::IAM::Role' + Properties: + AssumeRolePolicyDocument: + Version: 2012-10-17 + Statement: + - Effect: Allow + Principal: + Service: + - sagemaker.amazonaws.com + Action: + - 'sts:AssumeRole' + Path: / + ManagedPolicyArns: + 
- 'arn:aws:iam::aws:policy/AmazonSageMakerClusterInstanceRolePolicy' + Policies: + - PolicyName: !Sub '${ResourceNamePrefix}-ExecutionRolePolicy-${AWS::Region}' + PolicyDocument: + Version: 2012-10-17 + Statement: + - Effect: Allow + Action: + - 'ec2:AssignPrivateIpAddresses' + - 'ec2:CreateNetworkInterface' + - 'ec2:CreateNetworkInterfacePermission' + - 'ec2:DeleteNetworkInterface' + - 'ec2:DeleteNetworkInterfacePermission' + - 'ec2:DescribeNetworkInterfaces' + - 'ec2:DescribeVpcs' + - 'ec2:DescribeDhcpOptions' + - 'ec2:DescribeSubnets' + - 'ec2:DescribeSecurityGroups' + - 'ec2:DetachNetworkInterface' + - 'ec2:ModifyNetworkInterfaceAttribute' + - 'ec2:UnassignPrivateIpAddresses' + - 'ecr:BatchGetImage' + - 'ecr:GetAuthorizationToken' + - 'ecr:GetDownloadUrlForLayer' + - 'eks-auth:AssumeRoleForPodIdentity' + Resource: '*' + - Effect: Allow + Action: + - 'ec2:CreateTags' + Resource: 'arn:aws:ec2:*:*:network-interface/*' + - Effect: Allow + Action: + - 's3:ListBucket' + - 's3:GetObject' + Resource: + - !GetAtt Bucket.Arn + - !Sub '${Bucket.Arn}/*' + RoleName: !Sub '${ResourceNamePrefix}-ExecutionRole-${AWS::Region}' + + Bucket: + Type: 'AWS::S3::Bucket' + Properties: + BucketName: !Sub '${ResourceNamePrefix}-bucket-${AWS::AccountId}-${AWS::Region}' + BucketEncryption: + ServerSideEncryptionConfiguration: + - ServerSideEncryptionByDefault: + SSEAlgorithm: AES256 + + S3Endpoint: + Type: AWS::EC2::VPCEndpoint + Condition: EKSOrSubnet + Properties: + PolicyDocument: + Version: 2012-10-17 + Statement: + - Effect: Allow + Principal: '*' + Action: + - '*' + Resource: + - '*' + RouteTableIds: + - !Ref PrivateRouteTable + ServiceName: !Join + - '' + - - com.amazonaws. + - !Ref AWS::Region + - .s3 + VpcId: !If + - CreateEKSCluster + - !Ref VPC + - !Ref VpcId + +Outputs: + VPC: + Condition: EKSOrSubnet + Description: A reference to the created VPC + Value: !If + - CreateEKSCluster + - !Ref VPC + - !Ref VpcId + + PublicSubnet1: + Condition: CreateEKSCluster + Description: A reference to the public subnet in the 1st Availability Zone + Value: !Ref PublicSubnet1 + + PublicSubnet2: + Condition: CreateEKSCluster + Description: A reference to the public subnet in the 2nd Availability Zone + Value: !Ref PublicSubnet2 + + PublicSubnet3: + Condition: CreateEKSCluster + Description: A reference to the public subnet in the 3rd Availability Zone + Value: !Ref PublicSubnet3 + + PrivateSubnet1: + Condition: EKSOrSubnet + Description: A reference to the private subnet used for HyperPod ENIs + Value: !Ref PrivateSubnet1 + + EKSPrivateSubnet1: + Condition: CreateEKSCluster + Description: A reference to the EKS cluster private subnet in the 1st Availability Zone + Value: !Ref EKSPrivateSubnet1 + + EKSPrivateSubnet2: + Condition: CreateEKSCluster + Description: A reference to the EKS cluster private subnet in the 2nd Availability Zone + Value: !Ref EKSPrivateSubnet2 + + EKSPrivateSubnet3: + Condition: CreateEKSCluster + Description: A reference to the EKS cluster private subnet in the 3rd Availability Zone + Value: !Ref EKSPrivateSubnet3 + + NoIngressSecurityGroup: + Condition: EKSOrSubnet + Description: Security group with no ingress rule + Value: !Ref NoIngressSecurityGroup + + ClusterArn: + Condition: CreateEKSCluster + Description: The ARN of the EKS cluster + Value: !GetAtt EKSCluster.Arn + + ClusterName: + Condition: CreateEKSCluster + Description: The name of the EKS cluster + Value: !Ref EKSCluster + + AmazonSagemakerClusterExecutionRoleArn: + Description: 'Execution Role Arn' + Value: !GetAtt ExecutionRole.Arn + 
+  AmazonS3BucketName:
+    Description: 'Bucket Name'
+    Value: !Ref Bucket
\ No newline at end of file
diff --git a/1.architectures/5.sagemaker-hyperpod/easy_smhp/sagemaker-hyperpod.yaml b/1.architectures/5.sagemaker-hyperpod/easy_smhp/sagemaker-hyperpod.yaml
new file mode 100644
index 00000000..c2a84a0f
--- /dev/null
+++ b/1.architectures/5.sagemaker-hyperpod/easy_smhp/sagemaker-hyperpod.yaml
@@ -0,0 +1,496 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+AWSTemplateFormatVersion: '2010-09-09'
+Description: >
+  This CloudFormation stack creates all the necessary prerequisites for Amazon SageMaker HyperPod:
+  a VPC with two subnets, a security group, an FSx for Lustre file system, an S3 bucket and an IAM role.
+  A public subnet and a private subnet are created in an Availability Zone that you provide as a parameter.
+  As part of the template you'll deploy an Internet Gateway and a NAT Gateway in
+  the public subnet. In addition, you deploy an S3 endpoint. The VPC contains two CIDR blocks, 10.0.0.0/16 and 10.1.0.0/16;
+  the first CIDR is used for the public subnet, the second for the private subnet.
+  The template creates an FSx for Lustre volume in the specified AZ with a default of
+  1.2 TiB of storage, which can be overridden by parameter. A role is also created to
+  help execute HyperPod cluster operations.
+
+
+####################
+## Stack Metadata ##
+####################
+
+Metadata:
+  AWS::CloudFormation::Interface:
+    ParameterGroups:
+      - Label:
+          default: General configuration
+        Parameters:
+          - VPCName
+      - Label:
+          default: Availability Zone configuration for the subnets
+        Parameters:
+          - PrimarySubnetAZ
+          - BackupSubnetAZ
+      - Label:
+          default: FSx Lustre storage size
+        Parameters:
+          - Capacity
+      - Label:
+          default: Network and S3 endpoints configuration
+        Parameters:
+          - CreateS3Endpoint
+      - Label:
+          default: S3 bucket name
+        Parameters:
+          - S3Bucket
+    ParameterLabels:
+      VPCName:
+        default: Name of your VPC
+      PrimarySubnetAZ:
+        default: Availability Zone ID in which to deploy the primary subnets
+      BackupSubnetAZ:
+        default: (Optional) Availability Zone ID in which to deploy the backup private subnet
+      CreateS3Endpoint:
+        default: Create an S3 endpoint
+
+######################
+## Stack Parameters ##
+######################
+
+Parameters:
+  VPCName:
+    Description: Name of your VPC
+    Default: 'SageMaker HyperPod VPC'
+    Type: String
+
+  PrimarySubnetAZ:
+    Description: Availability Zone ID in which the public subnet and primary private subnet will be created.
+    Type: String
+    Default: usw2-az4
+
+  BackupSubnetAZ:
+    Description: Availability Zone ID in which the backup private subnet will be created. Specify it when you need multiple AZs for other AWS services (e.g. AWS Directory Service). Leave empty if you don't need it.
+    Type: String
+    Default: ''
+
+  CreateS3Endpoint:
+    AllowedValues:
+      - 'true'
+      - 'false'
+    Default: 'true'
+    Description:
+      Set to false to avoid creating an S3 endpoint in your VPC.
+ Type: String + + Capacity: + Description: Storage capacity in GiB (1200 or increments of 2400) + Type: Number + Default: 1200 + + S3Bucket: + Description: S3 Bucket to save lifecycle configuration file + Type: String + Default: "sagemaker-lifecycle" + + + PerUnitStorageThroughput: + Description: Provisioned Read/Write (MB/s/TiB) + Type: Number + Default: 250 + AllowedValues: + - 125 + - 250 + - 500 + - 1000 + + Compression: + Description: Data compression type + Type: String + AllowedValues: + - "LZ4" + - "NONE" + Default: "LZ4" + + LustreVersion: + Description: Lustre software version + Type: String + AllowedValues: + - "2.15" + - "2.12" + Default: "2.15" + +############################### +## Conditions for Parameters ## +############################### + +Conditions: + S3EndpointCondition: !Equals [!Ref 'CreateS3Endpoint', 'true'] + BackupSubnetCondition: !Not [ !Equals [!Ref 'BackupSubnetAZ', ''] ] + + +########################## +## Rules for Parameters ## +########################## + +Rules: + AZsRule: + Assertions: + - Assert: !Not + - !Equals + - !Ref PrimarySubnetAZ + - !Ref BackupSubnetAZ + AssertDescription: Backup AZ has to be different from the primary AZ. + + +######################### +## VPC & Network Setup ## +######################### + +Mappings: + Networking: + VPC: + CIDR0: 10.0.0.0/16 + CIDR1: 10.1.0.0/16 + +Resources: + # Create a VPC + VPC: + Type: AWS::EC2::VPC + Properties: + EnableDnsSupport: true + EnableDnsHostnames: true + CidrBlock: !FindInMap [Networking, VPC, CIDR0] + Tags: + - Key: Name + Value: SageMaker HyperPod VPC + + VpcCidrBlock: + Type: AWS::EC2::VPCCidrBlock + DependsOn: VPC + Properties: + VpcId: !Ref VPC + CidrBlock: !FindInMap [Networking, VPC, CIDR1] + + FlowLogsRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Principal: + Service: vpc-flow-logs.amazonaws.com + Action: sts:AssumeRole + Policies: + - PolicyName: flowlogs-policy + PolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Action: + - logs:CreateLogStream + - logs:PutLogEvents + - logs:DescribeLogGroups + - logs:DescribeLogStreams + Resource: !GetAtt FlowLogsGroup.Arn + FlowLogsGroup: + Type: AWS::Logs::LogGroup + Properties: + RetentionInDays: 7 + + FlowLogVPC: + Type: AWS::EC2::FlowLog + Properties: + DeliverLogsPermissionArn: !GetAtt FlowLogsRole.Arn + LogGroupName: FlowLogsGroup + ResourceId: !Ref VPC + ResourceType: VPC + TrafficType: ALL + + # Create an IGW and add it to the VPC + InternetGateway: + Type: AWS::EC2::InternetGateway + + GatewayToInternet: + Type: AWS::EC2::VPCGatewayAttachment + Properties: + VpcId: !Ref VPC + InternetGatewayId: !Ref InternetGateway + + # Create a NAT GW then add it to the public subnet + NATGateway: + Type: AWS::EC2::NatGateway + Properties: + AllocationId: !GetAtt ElasticIP.AllocationId + SubnetId: !Ref PublicSubnet + + ElasticIP: + Type: AWS::EC2::EIP + Properties: + Domain: vpc + + # NOTE: when you create additional security groups, you must ensure that every + # security group has ingress/egress from/to its own security group id. Failure + # to do so may cause trn1/p4d/p4de/p5 SMHP cluster creation to fail: + # + # Instance i-aaaabbbbccccddddf failed to provision with the following + # error: "EFA health checks did not run successfully. Ensure that your + # VPC and security groups are properly configured before attempting to + # create a new cluster." Note that multiple instances may be impacted." 
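+  # Illustrative only (not part of the original template): any ADDITIONAL security group you attach
+  # to the cluster needs the same all-to-all self-referencing rules as the EFASecurityGroup*
+  # resources below. With the AWS CLI this could look like (sg-EXAMPLE is a placeholder):
+  #   aws ec2 authorize-security-group-ingress --group-id sg-EXAMPLE \
+  #     --ip-permissions 'IpProtocol=-1,UserIdGroupPairs=[{GroupId=sg-EXAMPLE}]'
+  #   aws ec2 authorize-security-group-egress --group-id sg-EXAMPLE \
+  #     --ip-permissions 'IpProtocol=-1,UserIdGroupPairs=[{GroupId=sg-EXAMPLE}]'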
+ SecurityGroup: + Type: AWS::EC2::SecurityGroup + Properties: + GroupDescription: Allow EFA communication for Multi-Node Parallel Batch jobs + VpcId: !Ref VPC + EFASecurityGroupIngress: + Type: AWS::EC2::SecurityGroupIngress + Properties: + Description: All to all communication for EFA Ingress within Security Group + IpProtocol: -1 + FromPort: -1 + ToPort: -1 + GroupId: !Ref SecurityGroup + SourceSecurityGroupId: !Ref SecurityGroup + EFASecurityGroupEgress: + Type: AWS::EC2::SecurityGroupEgress + Properties: + Description: All to all communication for EFA Egress within Security Group + IpProtocol: -1 + FromPort: -1 + ToPort: -1 + GroupId: !Ref SecurityGroup + DestinationSecurityGroupId: !Ref SecurityGroup + EFASecurityGroupEgressECS: + Type: AWS::EC2::SecurityGroupEgress + Properties: + Description: All to all communication for Egress to all + IpProtocol: -1 + FromPort: -1 + ToPort: -1 + GroupId: !Ref SecurityGroup + CidrIp: 0.0.0.0/0 + + # Build the public subnet + PublicSubnet: + Type: AWS::EC2::Subnet + DependsOn: VPC + Properties: + MapPublicIpOnLaunch: true + VpcId: !Ref VPC + CidrBlock: !Select [ 0, !Cidr [ !GetAtt VPC.CidrBlock, 2, 15 ]] + AvailabilityZoneId: !Ref PrimarySubnetAZ + Tags: + - Key: Name + Value: !Join [ ' ', [ !Ref VPCName, 'Public Subnet -', !Ref PrimarySubnetAZ ] ] + + # Create the primary private subnet + PrimaryPrivateSubnet: + Type: AWS::EC2::Subnet + DependsOn: [VpcCidrBlock] + Properties: + VpcId: !Ref VPC + CidrBlock: !Select [ 0, !Cidr [ !FindInMap [Networking, VPC, CIDR1], 2, 15 ]] + AvailabilityZoneId: !Ref PrimarySubnetAZ + Tags: + - Key: Name + Value: !Join [ ' ', [ !Ref VPCName, 'Private Subnet -', !Ref PrimarySubnetAZ ] ] + + # Create the backup private subnet + BackupPrivateSubnet: + Condition: BackupSubnetCondition + Type: AWS::EC2::Subnet + DependsOn: [VpcCidrBlock] + Properties: + VpcId: !Ref VPC + CidrBlock: !Select [ 1, !Cidr [ !FindInMap [Networking, VPC, CIDR1], 2, 15 ]] + AvailabilityZoneId: !Ref BackupSubnetAZ + Tags: + - Key: Name + Value: !Join [ ' ', [ !Ref VPCName, 'Private Subnet -', !Ref BackupSubnetAZ ] ] + + # Create and set the public route table + PublicRouteTable: + Type: AWS::EC2::RouteTable + Properties: + VpcId: !Ref VPC + + PublicRoute: + Type: AWS::EC2::Route + Properties: + RouteTableId: !Ref PublicRouteTable + DestinationCidrBlock: 0.0.0.0/0 + GatewayId: !Ref InternetGateway + + # Then the private route table + PrivateRouteTable: + Type: AWS::EC2::RouteTable + Properties: + VpcId: !Ref VPC + + PrivateRouteToInternet: + Type: AWS::EC2::Route + Properties: + RouteTableId: !Ref PrivateRouteTable + DestinationCidrBlock: 0.0.0.0/0 + NatGatewayId: !Ref NATGateway + + # Associate the public route table to the public subnet + PublicSubnetRouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + SubnetId: !Ref PublicSubnet + RouteTableId: !Ref PublicRouteTable + + # and the primary private subnet to the private route table + PrimaryPrivateSubnetRTAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + SubnetId: !Ref PrimaryPrivateSubnet + RouteTableId: !Ref PrivateRouteTable + + # and the backup private subnet to the private route table + BackupPrivateSubnetRTAssociation: + Condition: BackupSubnetCondition + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + SubnetId: !Ref BackupPrivateSubnet + RouteTableId: !Ref PrivateRouteTable + + # S3 endpoint + S3Endpoint: + Condition: S3EndpointCondition + Type: AWS::EC2::VPCEndpoint + Properties: + PolicyDocument: + Version: 2012-10-17 + 
Statement: + - Effect: Allow + Principal: '*' + Action: + - '*' + Resource: + - '*' + RouteTableIds: + - !Ref PublicRouteTable + - !Ref PrivateRouteTable + ServiceName: !Join + - '' + - - com.amazonaws. + - !Ref AWS::Region + - .s3 + VpcId: !Ref VPC + + FSxLFilesystem: + Type: AWS::FSx::FileSystem + DeletionPolicy: Delete + UpdateReplacePolicy: Delete + Properties: + FileSystemType: LUSTRE + StorageType: SSD + FileSystemTypeVersion: !Ref LustreVersion + StorageCapacity: !Ref Capacity + SecurityGroupIds: + - !Ref SecurityGroup + SubnetIds: + - !Ref PrimaryPrivateSubnet + LustreConfiguration: + DataCompressionType: !Ref Compression + DeploymentType: PERSISTENT_2 + PerUnitStorageThroughput: !Ref PerUnitStorageThroughput + MetadataConfiguration: + Mode: AUTOMATIC + + AmazonSagemakerClusterExecutionRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: Allow + Principal: + Service: + - sagemaker.amazonaws.com + Action: + - sts:AssumeRole + ManagedPolicyArns: + - "arn:aws:iam::aws:policy/AmazonSageMakerClusterInstanceRolePolicy" + Policies: + - PolicyName: AmazonSagemakerClusterVPCPolicy + PolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + [ + "ec2:CreateNetworkInterface", + "ec2:CreateNetworkInterfacePermission", + "ec2:DeleteNetworkInterface", + "ec2:DeleteNetworkInterfacePermission", + "ec2:DescribeNetworkInterfaces", + "ec2:DescribeVpcs", + "ec2:DescribeDhcpOptions", + "ec2:DescribeSubnets", + "ec2:DescribeSecurityGroups", + "ec2:DetachNetworkInterface", + "ec2:CreateTags" + ] + Resource: "*" + LCScriptsBucket: + Type: 'AWS::S3::Bucket' + DeletionPolicy: Retain + Properties: + BucketName: + !Sub + - '${S3Bucket}-${RandomGUID}' + - { RandomGUID: !Select [0, !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId ]]]] } + + +############# +## Outputs ## +############# +Outputs: + VPC: + Value: !Ref VPC + Description: ID of the VPC + Export: + Name: !Sub ${AWS::StackName}-VPC + PublicSubnet: + Value: !Ref PublicSubnet + Description: ID of the public subnet + Export: + Name: !Sub ${AWS::StackName}-PublicSubnet + PrimaryPrivateSubnet: + Value: !Ref PrimaryPrivateSubnet + Description: ID of the primary private subnet + Export: + Name: !Sub ${AWS::StackName}-PrimaryPrivateSubnet + SecurityGroup: + Value: !Ref SecurityGroup + Description: SecurityGroup for Batch + Export: + Name: !Sub ${AWS::StackName}-SecurityGroup + FSxLustreFilesystemMountname: + Description: The ID of the FSxL filesystem that has been created + Value: !GetAtt FSxLFilesystem.LustreMountName + Export: + Name: !Sub ${AWS::StackName}-FSxLustreFilesystemMountname + FSxLustreFilesystemDNSname: + Description: The DNS of the FSxL filesystem that has been created + Value: !GetAtt FSxLFilesystem.DNSName + Export: + Name: !Sub ${AWS::StackName}-FSxLustreFilesystemDNSname + FSxLustreFilesystemId: + Description: The ID of the FSxL filesystem that has been created + Value: !Ref FSxLFilesystem + Export: + Name: !Sub ${AWS::StackName}-FSxLustreFilesystemId + AmazonSagemakerClusterExecutionRoleArn: + Description: The Role ARN used for cluster creation + Value: !GetAtt AmazonSagemakerClusterExecutionRole.Arn + Export: + Name: !Sub ${AWS::StackName}-AmazonSagemakerClusterExecutionRoleArn + + AmazonS3BucketName: + Description: The S3 bucket where Lifecycle scripts are stored + Value: !Ref LCScriptsBucket + Export: + Name: !Sub ${AWS::StackName}-AmazonSagemakerLCScriptsBucketName diff --git a/1.architectures/9.mlflow_server/README.md 
b/1.architectures/9.mlflow_server/README.md
new file mode 100644
index 00000000..61192ff4
--- /dev/null
+++ b/1.architectures/9.mlflow_server/README.md
@@ -0,0 +1,67 @@
+# Deploy an MLflow server on Amazon EC2.
+
+This sample sets up an MLflow server on an Amazon EC2 instance for HPC and ML clusters.
+It contains two groups:
+
+1. clusteradmins: for people who will administer the cluster
+1. clusterusers: for people who will use the cluster
+
+It provides a UI based on `MLflow HTTP` that enables you to ...
+
+## Prerequisites
+
+This solution requires a security group with the following rules:
+
+Inbound rules
+
+| Type   | Protocol | Port Range | Source | Description |
+| ------ | -------- | ---------- | ------ | ----------- |
+| MLflow | TCP | 389 | Choose Custom and enter the security group ID of the security group that you just created | Allows access to the MLflow server |
+| HTTP   | TCP | 80  | Enter the IP range or security group ID from which you want to access the UI | Allows access to the UI |
+| HTTPS  | TCP | 443 | Enter the IP range or security group ID from which you want to access the UI | Allows access to the UI |
+
+Outbound rules
+
+| Type   | Protocol | Port Range | Destination | Description |
+| ------ | -------- | ---------- | ----------- | ----------- |
+| MLflow | TCP | 389 | Choose Custom and enter the security group ID of the security group that you just created | Allows access to the MLflow server |
+| HTTPS  | TCP | 443 | 0.0.0.0/0 | Allows access to the internet |
+
+## Deploy
+
+1. Download the `cf_mlflow_server.yaml` file
+1. Run the following command
+   ```bash
+   aws cloudformation deploy --stack-name mlflow-server \
+       --template-file cf_mlflow_server.yaml \
+       --capabilities CAPABILITY_IAM \
+       --parameter-overrides SubnetId=XXX SecurityGroupIds=XXX,XXX
+   ```
+
+## Connect to the UI
+
+1. Retrieve the `LdapUIUrl` output to connect to the MLflow user interface.
+   ```bash
+   aws cloudformation describe-stacks --stack-name mlflow-server \
+       --query 'Stacks[0].Outputs[?OutputKey==`LdapUIUrl`].OutputValue' \
+       --output text
+   ```
+   Copy the URL into a web browser.
+
+## Get the MLflow Password
+
+The password to access MLflow is generated randomly and stored in AWS Secrets Manager; its ARN is exposed as the `LdapPassword` output of the CloudFormation stack.
+
+1. Get the Secret ARN
+   ```bash
+   SECRET_ARN=$(aws cloudformation describe-stacks --stack-name mlflow-server \
+       --query 'Stacks[0].Outputs[?OutputKey==`LdapPassword`].OutputValue' \
+       --output text)
+   ```
+
+1. Get the password that you will use to login
+   ```bash
+   aws secretsmanager get-secret-value --secret-id $SECRET_ARN \
+       --query SecretString \
+       --output text
+   ```
diff --git a/1.architectures/9.mlflow_server/cf_mlflow_server.yaml b/1.architectures/9.mlflow_server/cf_mlflow_server.yaml
new file mode 100644
index 00000000..8c51e111
--- /dev/null
+++ b/1.architectures/9.mlflow_server/cf_mlflow_server.yaml
@@ -0,0 +1,266 @@
+AWSTemplateFormatVersion: '2010-09-09'
+Description: >
+  Setup an LDAP server
+  Author: maxhaws@amazon.com
+
+Parameters:
+  InstanceType:
+    Description: EC2 instance type
+    Type: String
+    Default: t3a.micro
+    AllowedValues:
+      - t3a.micro
+      - t3a.small
+      - t3a.medium
+      - t3a.large
+      - t2.small
+  LatestUbuntuAmiId:
+    Type: AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>
+    Default: '/aws/service/canonical/ubuntu/server/22.04/stable/current/amd64/hvm/ebs-gp2/ami-id'
+    Description: 'Ubuntu 22.04 AMI Id'
+  EBSBootSize:
+    Type: Number
+    Default: 20
+    Description: 'Size in GiB of EBS root volume'
+  SubnetId:
+    Type: AWS::EC2::Subnet::Id
+    Description: 'Subnet Id'
+  SecurityGroupIds:
+    Type: List<AWS::EC2::SecurityGroup::Id>
+    Description: 'Security Group Ids'
+
+############################
+## LDAP Server
+Resources:
+  LdapKeyPair:
+    Type: AWS::EC2::KeyPair
+    Properties:
+      KeyName: !Sub '${AWS::StackName}-key'
+      KeyType: ed25519
+
+  LdapPassword:
+    Type: AWS::SecretsManager::Secret
+    Properties:
+      GenerateSecretString:
+        RequireEachIncludedType: True
+        PasswordLength: 32
+        ExcludePunctuation: True
+      Tags:
+        - Key: Name
+          Value: 'LDAP Server'
+
+  LdapSecretPolicy:
+    Type: AWS::IAM::Policy
+    Properties:
+      PolicyName: MyPolicy
+      PolicyDocument:
+        Version: 2012-10-17
+        Statement:
+          - Effect: Allow
+            Action:
+              - secretsmanager:GetSecretValue
+            Resource:
+              - !Ref LdapPassword
+      Roles:
+        - !Ref LdapInstanceRole
+
+  LdapInstanceRole:
+    Type: AWS::IAM::Role
+    Properties:
+      AssumeRolePolicyDocument:
+        Version: 2012-10-17
+        Statement:
+          - Effect: Allow
+            Principal:
+              Service:
+                - ec2.amazonaws.com
+            Action:
+              - sts:AssumeRole
+      ManagedPolicyArns:
+        - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
+
+  LdapInstanceProfile:
+    Type: AWS::IAM::InstanceProfile
+    DependsOn: LdapSecretPolicy
+    Properties:
+      Roles:
+        - !Ref LdapInstanceRole
+
+  LdapServer:
+    Type: 'AWS::EC2::Instance'
+    Metadata:
+      AWS::CloudFormation::Init:
+        configSets:
+          full_install:
+            - install_and_enable_cfn_hup
+        install_and_enable_cfn_hup:
+          files:
+            /etc/cfn/cfn-hup.conf:
+              content: !Sub |
+                [main]
+                stack=${AWS::StackId}
+                region=${AWS::Region}
+              mode: "000400"
+              owner: root
+              group: root
+            /etc/cfn/hooks.d/cfn-auto-reloader.conf:
+              content: !Sub |
+                [cfn-auto-reloader-hook]
+                triggers=post.update
+                path=Resources.LdapServer.Metadata.AWS::CloudFormation::Init
+                action=/usr/local/bin/cfn-init -v --stack ${AWS::StackName} --resource LdapServer --configsets full_install --region ${AWS::Region}
+                runas=root
+              mode: "000400"
+              owner: root
+              group: root
+            /lib/systemd/system/cfn-hup.service:
+              content: |
+                [Unit]
+                Description=cfn-hup daemon
+                [Service]
+                Type=simple
+                ExecStart=/usr/local/bin/cfn-hup
+                Restart=always
+                [Install]
+                WantedBy=multi-user.target
+          commands:
+            01enable_cfn_hup:
+              command: systemctl enable cfn-hup.service
+            02start_cfn_hup:
+              command: systemctl start cfn-hup.service
+    Properties:
+      BlockDeviceMappings:
+        - DeviceName: '/dev/sda1'
+          Ebs:
+            DeleteOnTermination: false
+            Encrypted: true
+            Iops: 3000
+            VolumeSize: !Ref EBSBootSize
+            VolumeType: 'gp3'
+      KeyName: !Ref LdapKeyPair
+      IamInstanceProfile: !Ref LdapInstanceProfile
+      ImageId: !Ref
LatestUbuntuAmiId + InstanceType: !Ref InstanceType + SecurityGroupIds: !Ref SecurityGroupIds + SubnetId: !Ref SubnetId + Tags: + - Key: Name + Value: 'LDAP Server' + UserData: + Fn::Base64: !Sub + - | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="==MYBOUNDARY==" + + --==MYBOUNDARY== + Content-Type: text/x-shellscript; charset="us-ascii" + + #!/bin/bash + + export DEBIAN_FRONTEND='non-interactive' + + apt-get update -y + apt-get -y install python3-pip awscli + mkdir -p /opt/aws/ + pip3 install https://s3.amazonaws.com/cloudformation-examples/aws-cfn-bootstrap-py3-latest.tar.gz + ln -s /usr/local/init/ubuntu/cfn-hup /etc/init.d/cfn-hup + + PASSWD=$(aws secretsmanager get-secret-value --secret-id ${SECRET} --query SecretString --output text --region ${AWS::Region}) + + debconf-set-selections < /etc/ldap/ldap.conf << EOF + BASE dc=example,dc=com + URI ldap://localhost + EOF + + systemctl restart slapd + + #Create schema for ldapPublicKey + cat > /tmp/openssh-lpk.ldif << EOF + dn: cn=openssh-lpk,cn=schema,cn=config + objectClass: olcSchemaConfig + cn: openssh-lpk + olcAttributeTypes: ( 1.3.6.1.4.1.24552.500.1.1.1.13 NAME 'sshPublicKey' + DESC 'MANDATORY: OpenSSH Public key' + EQUALITY octetStringMatch + SYNTAX 1.3.6.1.4.1.1466.115.121.1.40 ) + olcObjectClasses: ( 1.3.6.1.4.1.24552.500.1.1.2.0 NAME 'ldapPublicKey' SUP top AUXILIARY + DESC 'MANDATORY: OpenSSH LPK objectclass' + MAY ( sshPublicKey $ uid ) + ) + EOF + + # Add ldapPublicKey schema + ldapadd -Y EXTERNAL -H ldapi:/// -f /tmp/openssh-lpk.ldif + + cat > /tmp/disable_anon.ldif << EOF + dn: cn=config + changetype: modify + add: olcDisallows + olcDisallows: bind_anon + EOF + + ldapadd -Y EXTERNAL -H ldapi:/// -f /tmp/disable_anon.ldif + + # Create cluster groups + cat > /tmp/groups.ldif << EOF + dn: cn=clusteradmins,dc=example,dc=com + objectClass: posixGroup + cn: clusteradmin + gidNumber: 2000 + + dn: cn=clusterusers,dc=example,dc=com + objectClass: posixGroup + cn: clusterusers + gidNumber: 3000 + EOF + + ldapadd -x -D cn=admin,dc=example,dc=com -f /tmp/groups.ldif -w "$PASSWD" + + /usr/local/bin/cfn-init -v --stack ${AWS::StackName} \ + --resource LdapServer \ + --configsets full_install \ + --region ${AWS::Region} + + /usr/local/bin/cfn-signal \ + -e $? \ + --stack ${AWS::StackName} \ + --region ${AWS::Region} \ + --resource LdapServer + + --==MYBOUNDARY==-- + - { + SECRET: !Ref LdapPassword + } + + CreationPolicy: + ResourceSignal: + Timeout: PT10M + +############# +## Outputs ## +############# +Outputs: + LdapServerPublicIp: + Value: !GetAtt LdapServer.PublicIp + LdapServerInstanceId: + Value: !GetAtt LdapServer.InstanceId + LdapUIUrl: + Value: !Join ['',[!GetAtt LdapServer.PublicIp,'/phpldapadmin']] + LdapPassword: + Value: !Ref LdapPassword \ No newline at end of file
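Following up on the README's "Connect to the UI" and "Get the MLflow Password" steps above, the two stack-output lookups and the secret retrieval can be combined into a single helper. This is a minimal sketch, assuming the `mlflow-server` stack name used in the README examples:

```bash
#!/bin/bash
# Illustrative helper: print the UI URL and the generated password for the mlflow-server stack.
STACK=mlflow-server

UI_URL=$(aws cloudformation describe-stacks --stack-name "$STACK" \
    --query 'Stacks[0].Outputs[?OutputKey==`LdapUIUrl`].OutputValue' --output text)

SECRET_ARN=$(aws cloudformation describe-stacks --stack-name "$STACK" \
    --query 'Stacks[0].Outputs[?OutputKey==`LdapPassword`].OutputValue' --output text)

PASSWORD=$(aws secretsmanager get-secret-value --secret-id "$SECRET_ARN" \
    --query SecretString --output text)

echo "UI:       $UI_URL"
echo "Password: $PASSWORD"
```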