diff --git a/_run_shellcheck_tests.sh b/_run_shellcheck_tests.sh index 93cd382b8..b2c176144 100755 --- a/_run_shellcheck_tests.sh +++ b/_run_shellcheck_tests.sh @@ -19,7 +19,7 @@ # You should have received a copy of the GNU General Public License # along with KuberDock; if not, see . # -ERRORS_TRESHOLD=950 +ERRORS_TRESHOLD=960 tmpfile=$(mktemp /tmp/shellcheck-parse.XXXXXX) find -iname '*.sh' | xargs shellcheck -s bash | tee $tmpfile diff --git a/aws-kd-deploy/aws-kd-ami.sh b/aws-kd-deploy/aws-kd-ami.sh new file mode 100755 index 000000000..1bc7b3f35 --- /dev/null +++ b/aws-kd-deploy/aws-kd-ami.sh @@ -0,0 +1,916 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset +set -o pipefail + + +KUBERDOCK_DIR=kuberdock-files +KUBE_ROOT=$(dirname "${BASH_SOURCE}") + +if [ -f "${KUBE_ROOT}/cluster/env.sh" ]; then + source "${KUBE_ROOT}/cluster/env.sh" +fi + +# AWS_CMD, AWS_SSH_KEY, AWS_SSH_KEY_NAME, SUBNET_CIDR, ssh-key-setup +source "${KUBE_ROOT}/cluster/aws/util.sh" + +AMI_BASE_IMAGE=${AMI_BASE_IMAGE:-ami-66728470} +AMI_INSTANCE_TYPE=${AMI_INSTANCE_TYPE:-t2.micro} +AMI_PREFIX=${AMI_PREFIX:-kuberdock} +AMI_CLUSTER_NAME=${AMI_PREFIX}-ami +AMI_VERSION=${AMI_VERSION:-} +AWS_ACCOUNT_ID=${AMI_AWS_ACCOUNT_ID:-256284446245} +AWS_AVAILABILITY_ZONE=${AMI_AWS_AVAILABILITY_ZONE:-us-east-1b} +AWS_DEFAULT_REGION=${AWS_AVAILABILITY_ZONE%?} + + +DO_CLEANUP= +DO_DEPLOY="yes" +DO_HELP= +DO_IMAGE="yes" +DO_INFO= +DO_LIST= +DO_RELEASE= +DO_SETUP="yes" +FORCED_VERSION= +REGIONS="-" +SKIP_CLEANUP= +WITH_TESTING=${AMI_WITH_TESTING:-} + + +ami_name() { echo "${AMI_PREFIX}-${1}-$(ami_version)" ;} +aws_cmd() { ${AWS_CMD} "${@}" ;} +aws_filter() { echo "Name=${1},Values=${2}" ;} +check() { [ -n "${1:-}" ] ;} +check_echo() { [ -n "${1:-}" ] && echo "${@}" ;} +hash_file() { md5sum "${1}" | cut -d " " -f 1 ;} + +do_cleanup() { check "${DO_CLEANUP}" ;} +do_deploy() { check "${DO_DEPLOY}" ;} +do_help() { check "${DO_HELP}" ;} +do_image() { check "${DO_IMAGE}" ;} +do_info() { check "${DO_INFO}" ;} +do_list() { check "${DO_LIST}" ;} +do_release() { check "${DO_RELEASE}" ;} +do_setup() { check "${DO_SETUP}" ;} + + +ami_hash() +{ + local data= + local files=" + aws-kd-deploy/aws-kd-ami.sh \ + deploy.sh \ + kuberdock.spec \ + node_install.sh \ + node_install_ami.sh \ + node_install_common.sh \ + node_prepare_ami.sh \ + " + + check_echo "${AMI_HASH:-}" && return + + pushd "${KUBE_ROOT}"/.. 
>> /dev/null + AMI_HASH=$(cat ${files} | md5sum | cut -c-8) + popd >> /dev/null + + echo "${AMI_HASH}" +} + + +ami_regions() +{ + if [ "${REGIONS}" = "*" ]; then + echo \ + us-east-1 \ + us-east-2 \ + us-west-1 \ + us-west-2 \ + ca-central-1 \ + ap-south-1 \ + ap-northeast-2 \ + ap-southeast-1 \ + ap-southeast-2 \ + ap-northeast-1 \ + eu-central-1 \ + eu-west-1 \ + eu-west-2 \ + sa-east-1 \ + ; + return + fi + + echo "${REGIONS}" | tr ',' '\n'| sed "s/^-$/${AWS_DEFAULT_REGION}/" | sort -u | tr '\n' ' ' +} + + +ami_version() +{ + check_echo "${FORCED_VERSION}" && return + do_release || { check_echo "${AMI_VERSION}" && return ;} + + if do_release; then + kd_version + else + ami_hash + fi +} + + +aws_filter_version() +{ + aws_filter tag:KuberDockAmiVersion "$(ami_version)" +} + + +k8s_node_version() +{ + local version + + check_echo "${K8S_NODE_VERSION:-}" && return + version=$(grep "Requires: kubernetes-node" "${KUBE_ROOT}"/../kuberdock.spec | cut -d ' ' -f 4 | cut -d ':' -f 2) + K8S_NODE_VERSION=kubernetes-node-${version} + + echo "${K8S_NODE_VERSION}" +} + + +kd_version() +{ + local version + + check_echo "${KD_VERSION:-}" && return + version=$(git tag -l --points-at HEAD | awk -F 'kuberdock@' '{print $2}') + if ! check "${version}"; then + >&2 echo "ERROR: there is no tag on the current commit" + return 1 + fi + KD_VERSION=${version} + + echo "${KD_VERSION}" +} + + +print_help() +{ + for help_line in "${@}"; do + echo "${help_line}" | sed -e 's/^[[:space:]]*//' -e 's/^:/ /' + done +} + + +scp_wrapper() +{ + scp -i "${AWS_SSH_KEY}" -o StrictHostKeyChecking=no -q "${@:3}" "${SSH_USER}@${1}:${2}" +} + + +ssh_wrapper() +{ + echo "ssh ${1}: ${*:2}" + # "${@:2}" should be expanded on client side + # shellcheck disable=SC2029 + ssh -i "${AWS_SSH_KEY}" -o StrictHostKeyChecking=no -q "${SSH_USER}@${1}" "${@:2}" +} + + +copy_image() +{ + local image_id + local source_image_id + local source_region + + source_image_id=$(get_image "${1}") + + if [ "${AWS_DEFAULT_REGION}" = "${2}" ]; then + echo "${source_image_id}" + return + fi + + source_region=${AWS_DEFAULT_REGION} + local AWS_DEFAULT_REGION=${2} + + image_id=$(get_image "${1}") + if ! 
check "${image_id}"; then + image_id=$(aws_cmd copy-image \ + --name "$(ami_name "${1}")" \ + --source-region "${source_region}" \ + --source-image-id "${source_image_id}") + aws_cmd wait image-exists --image-ids "${image_id}" + create_tags "${image_id}" "$(ami_name "${1}")" + create_role_tag "${image_id}" "${1}" + fi + echo "${image_id}" +} + + +create_image() +{ + local image_id + + image_id=$(aws_cmd create-image \ + --instance-id "${instance_id}" \ + --name "$(ami_name "${1}")" \ + --query 'ImageId') + aws_cmd wait image-exists --image-ids "${image_id}" + create_tags "${image_id}" "$(ami_name "${1}")" + create_role_tag "${image_id}" "${1}" + + echo "${image_id}" +} + + +create_internet_gateway() +{ + local internet_gateway_id + + internet_gateway_id=$(aws_cmd create-internet-gateway \ + --query 'InternetGateway.InternetGatewayId') + wait_internet_gateway "${internet_gateway_id}" + create_tags "${internet_gateway_id}" "${AMI_CLUSTER_NAME}" + aws_cmd attach-internet-gateway \ + --internet-gateway-id "${internet_gateway_id}" \ + --vpc-id "${1}" >> /dev/null + + echo "${internet_gateway_id}" +} + + +create_role_tag() +{ + aws_cmd create-tags --resources "${1}" --tags Key=KuberDockClusterRole,Value="${2}" +} + + +create_route_table() +{ + local route_table_id + + route_table_id=$(aws_cmd create-route-table \ + --vpc-id "${1}" \ + --query 'RouteTable.RouteTableId') + wait_route_table "${route_table_id}" + create_tags "${route_table_id}" "${AMI_CLUSTER_NAME}" + aws_cmd associate-route-table \ + --route-table-id "${route_table_id}" \ + --subnet-id "${2}" >> /dev/null + aws_cmd create-route \ + --route-table-id "${route_table_id}" \ + --destination-cidr-block 0.0.0.0/0 \ + --gateway-id "${3}" >> /dev/null + + echo "${route_table_id}" +} + + +create_security_group() +{ + local security_group_id + + security_group_id=$(aws_cmd create-security-group \ + --group-name "${AMI_CLUSTER_NAME}" \ + --description "${AMI_CLUSTER_NAME}" \ + --vpc-id "${1}" \ + --query 'GroupId') + wait_security_group "${security_group_id}" + create_tags "${security_group_id}" "${AMI_CLUSTER_NAME}" + aws_cmd authorize-security-group-ingress \ + --group-id "${security_group_id}" \ + --protocol tcp \ + --port 22 \ + --cidr 0.0.0.0/0 >> /dev/null + + echo "${security_group_id}" +} + + +create_subnet() +{ + local subnet_id + + subnet_id=$(aws_cmd create-subnet \ + --cidr-block "${SUBNET_CIDR}" \ + --vpc-id "${1}" \ + --availability-zone "${AWS_AVAILABILITY_ZONE}" \ + --query 'Subnet.SubnetId') + aws_cmd wait subnet-available --subnet-ids "${subnet_id}" + create_tags "${subnet_id}" "${AMI_CLUSTER_NAME}" + + echo "${subnet_id}" +} + + +create_tags() +{ + aws_cmd create-tags \ + --resources "${1}" \ + --tags \ + Key=Name,Value="${2}" \ + Key=KuberDockAmiVersion,Value="$(ami_version)" +} + + +create_vpc() +{ + local vpc_id + + vpc_id=$(aws_cmd create-vpc \ + --cidr-block "${VPC_CIDR}" \ + --query 'Vpc.VpcId') + aws_cmd wait vpc-exists --vpc-ids "${vpc_id}" + create_tags "${vpc_id}" "${AMI_CLUSTER_NAME}" + + echo "${vpc_id}" +} + + +delete_image() +{ + local snapshot_id + + for image_id in $(get_image "${1}"); do + image_is_public "${image_id}" && continue + snapshot_id=$(get_image_snapshot "${image_id}") + echo "Deregister AMI: ${image_id}" + aws_cmd deregister-image --image-id "${image_id}" + echo "Delete Snapshot: ${snapshot_id}" + aws_cmd delete-snapshot --snapshot-id "${snapshot_id}" + done +} + + + +delete_internet_gateway() +{ + local vpc_ids + + for internet_gateway_id in $(get_internet_gateway); do + vpc_ids=$(aws_cmd 
describe-internet-gateways \ + --internet-gateway-ids "${internet_gateway_id}" \ + --query 'InternetGateways[].Attachments[].VpcId') + for vpc_id in ${vpc_ids}; do + aws_cmd detach-internet-gateway \ + --internet-gateway-id "${internet_gateway_id}" \ + --vpc-id "${vpc_id}" + done + echo "Delete Internet Gateway: ${internet_gateway_id}" + aws_cmd delete-internet-gateway --internet-gateway-id "${internet_gateway_id}" + done +} + + +delete_route_table() +{ + for route_table_id in $(get_route_table); do + echo "Delete Route Table: ${route_table_id}" + aws_cmd delete-route \ + --route-table-id "${route_table_id}" \ + --destination-cidr-block 0.0.0.0/0 + aws_cmd delete-route-table --route-table-id "${route_table_id}" + done +} + + +delete_security_group() +{ + for security_group_id in $(get_security_group); do + echo "Delete Security Group: ${security_group_id}" + aws_cmd delete-security-group --group-id "${security_group_id}" + done +} + + +delete_subnet() +{ + for subnet_id in $(get_subnet); do + echo "Delete Subnet: ${subnet_id}" + aws_cmd delete-subnet --subnet-id "${subnet_id}" + done +} + + +delete_vpc() +{ + for vpc_id in $(get_vpc); do + echo "Delete VPC: ${vpc_id}" + aws_cmd delete-vpc --vpc-id "${vpc_id}" + done +} + + +get_image() +{ + check_echo "$(aws_cmd describe-images \ + --filters \ + "$(aws_filter owner-id "${AWS_ACCOUNT_ID}")" \ + "$(aws_filter tag:KuberDockClusterRole "${1}")" \ + "$(aws_filter_version)" \ + --query 'Images[].ImageId')" +} + + +get_image_snapshot() +{ + root_device=$(aws_cmd describe-images \ + --image-ids "${1}" \ + --query 'Images[].RootDeviceName') + check_echo "$(aws_cmd describe-images \ + --image-ids "${1}" \ + --query 'Images[].BlockDeviceMappings[?DeviceName==`'"${root_device}"'`].Ebs[].SnapshotId')" +} + + +get_instance() +{ + check_echo "$(aws_cmd describe-instances \ + --filters \ + "$(aws_filter tag:KuberDockClusterRole "${1}")" \ + "$(aws_filter_version)" \ + --query 'Reservations[].Instances[].InstanceId')" +} + + +get_live_instance() +{ + aws_cmd describe-instances \ + --filters \ + "$(aws_filter instance-state-name pending,rebooting,running,stopped,stopping)" \ + "$(aws_filter tag:KuberDockClusterRole "${1}")" \ + "$(aws_filter_version)" \ + --query 'Reservations[].Instances[].InstanceId' +} + + +get_internet_gateway() +{ + check_echo "$(aws_cmd describe-internet-gateways \ + --filters "$(aws_filter_version)" \ + --query 'InternetGateways[].InternetGatewayId')" +} + + +get_route_table() +{ + check_echo "$(aws_cmd describe-route-tables \ + --filters "$(aws_filter_version)" \ + --query 'RouteTables[].RouteTableId')" +} + + +get_public_ip() +{ + aws_cmd describe-instances \ + --instance-ids "${1}" \ + --query 'Reservations[].Instances[].PublicIpAddress' +} + + +get_security_group() +{ + check_echo "$(aws_cmd describe-security-groups \ + --filters "$(aws_filter_version)" \ + --query 'SecurityGroups[].GroupId')" +} + + +get_subnet() +{ + check_echo "$(aws_cmd describe-subnets \ + --filters "$(aws_filter_version)" \ + --query 'Subnets[].SubnetId')" +} + + +get_vpc() +{ + check_echo "$(aws_cmd describe-vpcs \ + --filters "$(aws_filter_version)" \ + --query 'Vpcs[].VpcId')" +} + + +image_is_public() +{ + check "$(aws_cmd describe-images \ + --filters \ + "$(aws_filter is-public true)" \ + "$(aws_filter_version)" \ + --image-ids "${1}" \ + --query 'Images[].ImageId')" +} + + +run_instance() +{ + local block_device_mappings + local instance_id + local root_device + + root_device=$(aws_cmd describe-images \ + --image-ids "${AMI_BASE_IMAGE}" \ + --query 
'Images[].RootDeviceName') + block_device_mappings='[{"DeviceName":"'${root_device}'","Ebs":{"DeleteOnTermination":true}}]' + instance_id=$(aws_cmd run-instances \ + --image-id "${AMI_BASE_IMAGE}" \ + --instance-type "${AMI_INSTANCE_TYPE}" \ + --subnet-id "${2}" \ + --key-name "${AWS_SSH_KEY_NAME}" \ + --security-group-ids "${3}" \ + --associate-public-ip-address \ + --block-device-mappings "${block_device_mappings}" \ + --query 'Instances[].InstanceId') + create_tags "${instance_id}" "$(ami_name "${1}")" + create_role_tag "${instance_id}" "${1}" + + echo "${instance_id}" +} + + +tag_snapshot() +{ + local snapshot_id + + aws_cmd wait image-available --image-ids "${2}" + snapshot_id=$(get_image_snapshot "${2}") + create_tags "${snapshot_id}" "$(ami_name "${1}")" + create_role_tag "${snapshot_id}" "${1}" + + echo "${snapshot_id}" +} + + +terminate_instance() +{ + local instance_id + + instance_id=$(get_live_instance "${1}") + if check "${instance_id}"; then + echo "Terminate Instance: ${instance_id}" + aws_cmd terminate-instances --instance-ids "${instance_id}" >> /dev/null + aws_cmd delete-tags --resources "${instance_id}" + if check "${2:-}"; then + aws_cmd wait instance-terminated --instance-ids "${instance_id}" + fi + fi +} + + +wait_accessible() +{ + for _ in $(seq 40); do + ssh_wrapper "${1}" -o BatchMode=yes -o ConnectTimeout=1 true >> /dev/null && break + sleep 15 + done +} + + +wait_image() +{ + for _ in $(seq 40); do + get_image "${1}" >> /dev/null && break + sleep 15 + done +} + + +wait_internet_gateway() +{ + for _ in $(seq 40); do + check "$(aws_cmd describe-internet-gateways \ + --internet-gateway-ids "${@}" 2> /dev/null)" && break + sleep 15 + done +} + + +wait_route_table() +{ + for _ in $(seq 40); do + check "$(aws_cmd describe-route-tables \ + --route-table-ids "${@}" 2> /dev/null)" && break + sleep 15 + done +} + + +wait_security_group() +{ + for _ in $(seq 40); do + check "$(aws_cmd describe-security-groups \ + --group-ids "${@}" 2> /dev/null)" && break + sleep 15 + done +} + + +ami_cleanup() +{ + echo "* Cleanup *" + + delete_image node + terminate_instance node wait + delete_security_group + delete_subnet + delete_internet_gateway + delete_route_table + delete_vpc +} + + +ami_deploy_node_copy_files() +{ + ssh_wrapper "${1}" rm -fr "${KUBERDOCK_DIR}" + ssh_wrapper "${1}" mkdir -p "${KUBERDOCK_DIR}"/node_storage_manage + + pushd "${KUBE_ROOT}"/.. 
>> /dev/null + + scp_wrapper "${1}" "${KUBERDOCK_DIR}" \ + backup_node.py \ + backup_node_merge.py \ + fslimit.py \ + kubelet_args.py \ + node_install.sh \ + node_install_ami.sh \ + node_install_common.sh \ + node_prepare_ami.sh \ + pd.sh \ + node_scripts/kd-docker-exec.sh \ + node_scripts/kd-ssh-gc \ + node_scripts/kd-ssh-user.sh \ + node_scripts/kd-ssh-user-update.sh + + scp_wrapper "${1}" "${KUBERDOCK_DIR}"/node_storage_manage \ + node_storage_manage/__init__.py \ + node_storage_manage/aws.py \ + node_storage_manage/common.py \ + node_storage_manage/manage.py \ + node_storage_manage/node_lvm_manage.py \ + node_storage_manage/node_zfs_manage.py + + popd >> /dev/null + + ssh_wrapper "${1}" -t sudo mv -f "${KUBERDOCK_DIR}"/backup_node.py /usr/bin/kd-backup-node + ssh_wrapper "${1}" -t sudo mv -f "${KUBERDOCK_DIR}"/backup_node_merge.py /usr/bin/kd-backup-node-merge + ssh_wrapper "${1}" -t sudo mv -f "${KUBERDOCK_DIR}"/* / + ssh_wrapper "${1}" rm -fr "${KUBERDOCK_DIR}" +} + + +ami_deploy_node_prepare_ami() +{ + node_k8s=$(k8s_node_version) + ssh_wrapper "${1}" -t "cd / && sudo AMI=True AWS=True NODE_KUBERNETES=${node_k8s} WITH_TESTING=${WITH_TESTING} ZFS=yes bash node_install.sh" + ssh_wrapper "${1}" -t "cd / && sudo bash node_prepare_ami.sh" +} + + +ami_deploy_node() +{ + echo "* Deploy Node *" + + local node_instance_id + local node_public_ip + + node_instance_id=$(get_instance node) + aws_cmd wait instance-running --instance-ids "${node_instance_id}" + + node_public_ip=$(get_public_ip "${node_instance_id}") + wait_accessible "${node_public_ip}" + + ami_deploy_node_copy_files "${node_public_ip}" + ami_deploy_node_prepare_ami "${node_public_ip}" +} + + +ami_help() +{ + print_help "\ + bash ${BASH_SOURCE} [-h|--help] [-i|--info] [-l|--list] [-f|--force-version] [-r|--regions REGIONS] [-t|--with-testing] [-c|--cleanup] [--release] [--skip-setup] [--skip-deploy] [--skip-ami] [--skip-cleanup] [--use-ami AMI_ID] + : -h|--help : print this help + : -i|--info : print some info + : -l|--list : list available AMIs + : -f|--force-version : force AMI version + : -r|--regions REGIONS : comma-separated list of regions to use with + : : -l|--list or --release + : : default is '-' -- current region configured + : : to operate in + : -t|--with-testing : deploy with kube-testing repo + : -c|--cleanup : do cleanup + : --release : make release AMI + : --skip-setup : skip setup + : --skip-deploy : skip deploy + : --skip-ami : skip AMI creation + : --skip-cleanup : skip cleanup (skipped by default if --release + : : is not specified) + : --use-ami AMI_ID : use specified AMI to run instance + " +} + + +ami_image() +{ + echo "* Create AMI *" + + delete_image "${1}" + + local image_id + local instance_id + local snapshot_id + + instance_id=$(get_instance "${1}") + if check "${instance_id}"; then + aws_cmd wait instance-running --instance-ids "${instance_id}" + image_id=$(get_image "${1}" || create_image "${1}") + echo "AMI: ${image_id}" + snapshot_id=$(tag_snapshot "${1}" "${image_id}") + echo "Snapshot: ${snapshot_id}" + fi +} + + +ami_info() { + local image_id + local internet_gateway_id + local node_instance_id + local node_public_ip + local route_table_id + local security_group_id + local snapshot_id + local subnet_id + local vpc_id + + echo "AMI Version: $(ami_version)" + echo "AWS Region: ${AWS_DEFAULT_REGION}" + echo "AWS Availability Zone: ${AWS_AVAILABILITY_ZONE}" + echo "Base AMI: ${AMI_BASE_IMAGE}" + vpc_id=$(get_vpc) && \ + echo "VPC: ${vpc_id}" + subnet_id=$(get_subnet) && \ + echo "Subnet: 
${subnet_id}" + internet_gateway_id=$(get_internet_gateway) && \ + echo "Internet Gateway: ${internet_gateway_id}" + route_table_id=$(get_route_table) && \ + echo "Route Table: ${route_table_id}" + security_group_id=$(get_security_group) && \ + echo "Security Group: ${security_group_id}" + node_instance_id=$(get_instance node) && \ + echo "Node Instance: ${node_instance_id}" + if check "${node_instance_id}"; then + node_public_ip=$(get_public_ip "${node_instance_id}") && \ + echo "Node Public IP: ${node_public_ip}" + fi + image_id=$(get_image node) && \ + echo "Node AMI: ${image_id}" + if check "${image_id}"; then + snapshot_id=$(get_image_snapshot "${image_id}") && \ + echo "Node Snapshot: ${snapshot_id}" + fi +} + + +ami_list() +{ + for region in $(ami_regions); do + echo "${region}:" + check_echo "$(AWS_DEFAULT_REGION=${region} aws_cmd describe-images \ + --filters \ + "$(aws_filter owner-id "${AWS_ACCOUNT_ID}")" \ + "$(aws_filter tag-key KuberDockAmiVersion)" \ + --query 'Images[].[Name,ImageId,BlockDeviceMappings[0].Ebs.SnapshotId,Public]')" \ + || echo "- no AMIs -" + done +} + + +ami_release() +{ + echo "* Release *" + + local node_image_id + local node_snapshot_id + + wait_image node + + for region in $(ami_regions); do + echo "${region}:" + node_image_id=$(copy_image node "${region}") + echo "Node AMI: ${node_image_id}" + node_snapshot_id=$(AWS_DEFAULT_REGION=${region} tag_snapshot node "${node_image_id}") + echo "Node Snapshot: ${node_snapshot_id}" + AWS_DEFAULT_REGION=${region} aws_cmd modify-image-attribute \ + --image-id "${node_image_id}" \ + --launch-permission '{"Add":[{"Group":"all"}]}' + done +} + + +ami_setup() +{ + echo "* Setup *" + + local internet_gateway_id + local node_instance_id + local node_public_ip + local route_table_id + local security_group_id + local subnet_id + local vpc_id + + terminate_instance node + + ssh-key-setup >> /dev/null + + vpc_id=$(get_vpc || create_vpc) + echo "VPC: ${vpc_id}" + + subnet_id=$(get_subnet || create_subnet "${vpc_id}") + echo "Subnet: ${subnet_id}" + + internet_gateway_id=$(get_internet_gateway || create_internet_gateway "${vpc_id}") + echo "Internet Gateway: ${internet_gateway_id}" + + route_table_id=$(get_route_table || create_route_table "${vpc_id}" "${subnet_id}" "${internet_gateway_id}") + echo "Route Table: ${route_table_id}" + + security_group_id=$(get_security_group || create_security_group "${vpc_id}") + echo "Security Group: ${security_group_id}" + + node_instance_id=$(run_instance node "${subnet_id}" "${security_group_id}") + echo "Node Instance: ${node_instance_id}" + + node_public_ip=$(get_public_ip "${node_instance_id}") + echo "Node Public IP: ${node_public_ip}" +} + + +while [ $# -gt 0 ]; do + key=${1} + case ${key} in + -c|--cleanup) + DO_CLEANUP=yes + DO_DEPLOY= + DO_IMAGE= + DO_SETUP= + ;; + -f|--force-version) + FORCED_VERSION=${2} + shift + ;; + -h|--help) + DO_HELP=yes + ;; + -i|--info) + DO_INFO=yes + ;; + -l|--list) + DO_LIST=yes + ;; + -r|--regions) + REGIONS=${2} + shift + ;; + -t|--with-testing) + WITH_TESTING=yes + ;; + --release) + check "${SKIP_CLEANUP}" || DO_CLEANUP=yes + DO_RELEASE=yes + WITH_TESTING=${WITH_TESTING:-} + ;; + --skip-cleanup) + DO_CLEANUP= + SKIP_CLEANUP=yes + ;; + --skip-deploy) + DO_DEPLOY= + ;; + --skip-ami) + DO_IMAGE= + ;; + --skip-setup) + DO_SETUP= + ;; + --use-ami) + AMI_BASE_IMAGE=${2} + shift + ;; + *) + ami_help + exit 1 + ;; + esac + shift +done + + +do_release && ami_version >> /dev/null + +do_help && { ami_help; exit; } +do_info && { ami_info; exit; } +do_list && { 
ami_list; exit; } +do_setup && ami_setup +do_deploy && ami_deploy_node +do_image && ami_image node +do_release && ami_release +do_cleanup && ami_cleanup diff --git a/aws-kd-deploy/cluster/aws/util.sh b/aws-kd-deploy/cluster/aws/util.sh index a65382892..7149bdf19 100755 --- a/aws-kd-deploy/cluster/aws/util.sh +++ b/aws-kd-deploy/cluster/aws/util.sh @@ -1224,7 +1224,7 @@ function deploy-master(){ fi ssh -oStrictHostKeyChecking=no -i "${AWS_SSH_KEY}" -tt "${SSH_USER}@${KUBE_MASTER_IP}" mkdir /home/${SSH_USER}/kuberdock-files 2>"$LOG" - ssh -oStrictHostKeyChecking=no -i "${AWS_SSH_KEY}" -tt "${SSH_USER}@${KUBE_MASTER_IP}" cp /var/opt/kuberdock/{node_install.sh,pd.sh} /etc/pki/etcd/ca.crt /etc/pki/etcd/etcd-client.crt /etc/pki/etcd/etcd-client.key /home/${SSH_USER}/kuberdock-files 2>"$LOG" + ssh -oStrictHostKeyChecking=no -i "${AWS_SSH_KEY}" -tt "${SSH_USER}@${KUBE_MASTER_IP}" cp /var/opt/kuberdock/node_install.sh /var/opt/kuberdock/node_install_common.sh /var/opt/kuberdock/pd.sh /etc/pki/etcd/ca.crt /etc/pki/etcd/etcd-client.crt /etc/pki/etcd/etcd-client.key /home/${SSH_USER}/kuberdock-files 2>"$LOG" ssh -oStrictHostKeyChecking=no -i "${AWS_SSH_KEY}" -tt "${SSH_USER}@${KUBE_MASTER_IP}" "sudo cp /var/lib/nginx/.ssh/id_rsa.pub /home/${SSH_USER}/kuberdock-files" 2>"$LOG" ssh -oStrictHostKeyChecking=no -i "${AWS_SSH_KEY}" -tt "${SSH_USER}@${KUBE_MASTER_IP}" "sudo chown ${SSH_USER} /home/${SSH_USER}/kuberdock-files/*" < <(cat) 2>"$LOG" diff --git a/kubedock/kapi/elasticsearch_utils.py b/kubedock/kapi/elasticsearch_utils.py index 329971201..eefceabe6 100644 --- a/kubedock/kapi/elasticsearch_utils.py +++ b/kubedock/kapi/elasticsearch_utils.py @@ -26,7 +26,7 @@ from ..nodes.models import Node from ..pods.models import Pod from ..users.models import User -from .nodes import get_kuberdock_logs_pod_name +from .service_pods import get_kuberdock_logs_pod_name from ..settings import ( ELASTICSEARCH_REST_PORT, KUBE_API_PORT, KUBE_API_HOST, KUBE_API_VERSION) diff --git a/kubedock/kapi/es_logs.py b/kubedock/kapi/es_logs.py index d92f4b9aa..68b2b5266 100644 --- a/kubedock/kapi/es_logs.py +++ b/kubedock/kapi/es_logs.py @@ -25,8 +25,8 @@ from dateutil.parser import parse as parse_dt from .elasticsearch_utils import execute_es_query, LogsError -from .nodes import get_kuberdock_logs_pod_name from .podcollection import PodCollection, POD_STATUSES +from .service_pods import get_kuberdock_logs_pod_name from ..core import ConnectionPool from ..exceptions import APIError from ..nodes.models import Node diff --git a/kubedock/kapi/migrate_pod_utils.py b/kubedock/kapi/migrate_pod_utils.py new file mode 100644 index 000000000..082363cbb --- /dev/null +++ b/kubedock/kapi/migrate_pod_utils.py @@ -0,0 +1,193 @@ +"""Utilities to migrate pods from one node to another if those pods are binded +to a node. +""" +from flask import current_app + +from kubedock.core import db +from kubedock.kapi import node_utils, node_utils_aws, service_pods +from kubedock.kapi.podcollection import PodCollection +from kubedock.nodes.models import Node +from kubedock.pods.models import PersistentDisk, Pod +from kubedock.settings import AWS, WITH_TESTING +from kubedock.users.models import User +from kubedock.utils import NODE_STATUSES, POD_STATUSES + + +def change_pods_node_selectors(from_node, to_node): + """Changes node selector of pods from 'from_node' to 'to_node' + :param from_node: node which has binded pods (object of model Node) + :param to_node: to this node should be binded pods. 
Also object of model + Node + """ + current_app.logger.debug( + "Starting pods migration from '{}' to '{}'".format( + from_node.hostname, to_node.hostname + ) + ) + pc = PodCollection() + internal_user = User.get_internal() + PersistentDisk.get_by_node_id(from_node.id).update( + {PersistentDisk.node_id: to_node.id}, + synchronize_session=False + ) + dbpods = Pod.query.filter( + Pod.owner_id != internal_user.id, + Pod.status != 'deleted' + ).all() + + for pod in dbpods: + if pod.pinned_node != from_node.hostname: + continue + current_app.logger.debug( + "Update pod ({0}) config to use new node '{1}'".format( + pod.id, to_node.hostname + ) + ) + pc.update( + pod.id, {'command': 'change_config', 'node': to_node.hostname} + ) + k8s_pod = pc._get_by_id(pod.id) + if k8s_pod.status in (POD_STATUSES.stopping, POD_STATUSES.stopped): + continue + + current_app.logger.debug( + "Restarting pod after migration: {}".format(pod.id)) + try: + pc.update(pod.id, {'command': 'redeploy'}) + # current_app.logger.debug("Skip restarting") + except: + current_app.logger.exception( + "Failed to redeploy pod after migration: {}".format(pod.id) + ) + + +def manage_failed_aws_node(node_hostname, ssh, deploy_func): + current_app.logger.debug( + "Try to migrate from failed node '{0}'".format(node_hostname) + ) + + failed_node = db.session.query(Node).filter( + Node.hostname == node_hostname + ).first() + if not failed_node: + current_app.logger.error( + "Node '{0}' not found in DB".format(node_hostname) + ) + return + + reserve_node_hostname, reserve_node_ip, reserve_node_fast = \ + node_utils_aws.spawn_reserving_node(node_hostname) + current_app.logger.debug( + 'Created reserve node: {0}'.format(reserve_node_hostname) + ) + + current_app.logger.debug('Setting failed node ({0}) ' + 'as unschedulable'.format(node_hostname)) + node_utils.set_node_schedulable_flag(node_hostname, False) + + current_app.logger.debug( + 'Add node to database {0}'.format(reserve_node_hostname) + ) + reserve_node = Node( + ip=reserve_node_ip, + hostname=reserve_node_hostname, + kube_id=failed_node.kube_id, + state=NODE_STATUSES.pending, + ) + reserve_node = node_utils.add_node_to_db(reserve_node) + + current_app.logger.debug('Waiting for node is running ' + '{0}'.format(reserve_node_hostname)) + node_utils_aws.wait_node_running(reserve_node_hostname) + current_app.logger.debug( + 'Node is running {0}'.format(reserve_node_hostname) + ) + current_app.logger.debug('Waiting for node is accessible ' + '{0}'.format(reserve_node_hostname)) + target_ssh = node_utils_aws.wait_node_accessible(reserve_node_hostname) + current_app.logger.debug( + 'Node is accessible {0}'.format(reserve_node_hostname) + ) + + current_app.logger.debug( + "Create logging pod for '{0}'".format(reserve_node_hostname) + ) + ku = User.get_internal() + log_pod = service_pods.create_logs_pod(reserve_node_hostname, ku) + + current_app.logger.debug( + "Deploying reserving node '{0}'".format(reserve_node_hostname) + ) + current_app.logger.debug( + 'Fast deploy' if reserve_node_fast else 'Slow deploy' + ) + deploy_func(reserve_node_fast, reserve_node.id, log_pod['podIP']) + + if ssh: + current_app.logger.debug( + "Stopping k8s services on node '{0}'".format(node_hostname) + ) + node_utils.stop_node_k8_services(ssh) + else: + current_app.logger.warning( + "Failed node '{0}' is not accessible via ssh. 
" + "Can't stop k8s services on the node.".format(node_hostname)) + + current_app.logger.debug( + "Moving storage from '{0}' to '{1}'".format( + node_hostname, reserve_node_hostname) + ) + if not node_utils.move_aws_node_storage(ssh, target_ssh, + failed_node, reserve_node): + current_app.logger.error( + "Failed to move aws storage from '{0}' to '{1}'".format( + node_hostname, reserve_node_hostname) + ) + return + + current_app.logger.debug( + "Changing pods selectors to node '{0}'".format(reserve_node_hostname) + ) + change_pods_node_selectors(failed_node, reserve_node) + + current_app.logger.debug( + 'Delete node from KuberDock {0}'.format(node_hostname) + ) + try: + node_utils.revoke_celery_node_install_task(failed_node) + node_utils.delete_node_from_db(failed_node) + res = node_utils.remove_node_from_k8s(node_hostname) + if res['status'] == 'Failure' and res['code'] != 404: + raise Exception('Failed to remove node {0} ' + 'from Kubernetes'.format(node_hostname)) + node_utils.remove_node_install_log(node_hostname) + node_utils.cleanup_node_network_policies(node_hostname) + service_pods.delete_logs_pod(node_hostname) + node_utils_aws.terminate_node(node_hostname) + except Exception as e: + current_app.logger.error( + "Failed to delete node '{0}': {1}".format(node_hostname, e) + ) + + try: + db.session.commit() + except: + db.session.rollback() + raise + + current_app.logger.error("Migration to node '{0}' completed " + "successfully".format(reserve_node_hostname)) + + +def manage_failed_node(node_hostname, ssh, deploy_func): + """Does some actions with the failed node if it is possible. + Now works only for AWS setup configured with reserved nodes: + if some node has failed, then take one from reserve, move there storage + of failed node. + """ + if AWS: + manage_failed_aws_node(node_hostname, ssh, deploy_func) + else: + current_app.logger.warning( + 'Failed node detected ({0}), ' + 'but no recovery procedure available.'.format(node_hostname)) diff --git a/kubedock/kapi/node_utils.py b/kubedock/kapi/node_utils.py index 6d61c9fd9..c7b397776 100644 --- a/kubedock/kapi/node_utils.py +++ b/kubedock/kapi/node_utils.py @@ -23,6 +23,7 @@ import json import uuid import re +import os from flask import current_app @@ -38,8 +39,11 @@ NODE_INSTALL_LOG_FILE, AWS, CEPH, PD_NAMESPACE, PD_NS_SEPARATOR, NODE_STORAGE_MANAGE_CMD, ZFS, ETCD_CALICO_HOST_ENDPOINT_KEY_PATH_TEMPLATE, ETCD_CALICO_HOST_CONFIG_KEY_PATH_TEMPLATE, ETCD_NETWORK_POLICY_NODES, - KD_NODE_HOST_ENDPOINT_ROLE) + KD_NODE_HOST_ENDPOINT_ROLE, NODE_CEPH_AWARE_KUBERDOCK_LABEL, + NODE_INSTALL_TASK_ID) +from ..kd_celery import celery from .network_policies import get_node_host_endpoint_policy +from .node import Node as K8SNode def get_nodes_collection(kube_type=None): @@ -759,3 +763,200 @@ def cleanup_node_network_policies(node_hostname): Etcd(ETCD_NETWORK_POLICY_NODES).delete( '{}-{}'.format(i, node_hostname) ) + + +def stop_node_k8_services(ssh): + """Tries to stop docker daemon and kubelet service on the node + :param ssh: ssh connection to the node + """ + services = ('kubelet', 'kube-proxy', 'docker') + cmd = 'systemctl stop ' + for service in services: + _, o, e = ssh.exec_command(cmd + service) + if o.channel.recv_exit_status(): + current_app.logger.warning( + u"Failed to stop service '{}' on a failed node: {}".format( + service, e.read()) + ) + + +def stop_aws_storage(ssh): + """Stops aws storage (zpool or lvm export), + detaches ebs volumes of the storage. 
+ """ + cmd = NODE_STORAGE_MANAGE_CMD + ' export-storage' + from kubedock.settings import AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY + cmd += ' --detach-ebs '\ + '--aws-access-key-id {0} --aws-secret-access-key {1}'.format( + AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) + _, o, e = ssh.exec_command(cmd) + if o.channel.recv_exit_status(): + current_app.logger.error( + 'Failed to stop zpool on AWS node: {0}'.format(e.read()) + ) + return False + result = o.read() + try: + data = json.loads(result) + except (ValueError, TypeError): + current_app.logger.error( + 'Unknown answer format from remote script: {0}\n' + '==================\nSTDERR:\n{1}'.format(result, e.read()) + ) + return False + if data['status'] != 'OK': + current_app.logger.error( + 'Failed to stop storage: {0}'.format( + data.get('data', {}).get('message') + ) + ) + return False + return True + + +def import_aws_storage(ssh, volumes, force_detach=False): + """Imports aws storage to node given by ssh + """ + cmd = NODE_STORAGE_MANAGE_CMD + ' import-aws-storage' + from kubedock.settings import AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY + cmd += ' --aws-access-key-id {0} --aws-secret-access-key {1}'.format( + AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY + ) + if force_detach: + cmd += ' --force-detach' + cmd += ' --ebs-volumes {0}'.format(' '.join(volumes)) + current_app.logger.debug('Executing command: {}'.format(cmd)) + _, o, e = ssh.exec_command(cmd) + if o.channel.recv_exit_status(): + current_app.logger.error( + u"Failed to import zpool on AWS node: {}".format(e.read()) + ) + return False + result = o.read() + try: + data = json.loads(result) + except (ValueError, TypeError): + current_app.logger.error( + 'Unknown answer format from remote script: {0}\n' + '==================\nSTDERR:\n{1}'.format(result, e.read()) + ) + return False + if data['status'] != 'OK': + current_app.logger.error( + 'Failed to import storage: {0}'.format( + data.get('data', {}).get('message') + ) + ) + return False + current_app.logger.debug( + 'Result of importing storage: {0}'.format(o.read()) + ) + return True + + +def move_aws_node_storage(from_node_ssh, to_node_ssh, + from_node, to_node): + """Moves EBS volumes with KD zpool from one node to another on AWS setup. + """ + # Flag show should we force detach volumes before attaching it to new + # instance. It will be set in case if ssh to source node is unavailable, + # so we can't detach volumes on that node. + force_detach = False + if from_node_ssh: + # If we can get ssh to the old node, then try to do some preparations + # before moving volumes: + # * export zpool (also should unmount volumes) + # * detach EBS volumes + current_app.logger.debug( + "Stopping node '{0}' storage".format(from_node.hostname) + ) + if not stop_aws_storage(from_node_ssh): + force_detach = True + else: + current_app.logger.debug( + 'Failed node is inaccessible. 
EBS volumes will be force detached' + ) + force_detach = True + + ls_devices = db.session.query(LocalStorageDevices).filter( + LocalStorageDevices.node_id == from_node.id + ).all() + volumes = [item.volume_name for item in ls_devices] + current_app.logger.debug( + "Importing on node '{0}' volumes: {1}".format(to_node.hostname, + volumes) + ) + if not import_aws_storage(to_node_ssh, volumes, force_detach=force_detach): + return False + for dev in ls_devices: + dev.node_id = to_node.id + return True + + +def add_node_to_k8s(host, kube_type, is_ceph_installed=False): + """ + :param host: Node hostname + :param kube_type: Kuberdock kube type (integer id) + :return: Error text if error else False + """ + # TODO handle connection errors except requests.RequestException + data = { + 'metadata': { + 'name': host, + 'labels': { + 'kuberdock-node-hostname': host, + 'kuberdock-kube-type': 'type_' + str(kube_type) + }, + 'annotations': { + K8SNode.FREE_PUBLIC_IP_COUNTER_FIELD: '0' + } + }, + 'spec': { + 'externalID': host, + } + } + if is_ceph_installed: + data['metadata']['labels'][NODE_CEPH_AWARE_KUBERDOCK_LABEL] = 'True' + res = requests.post(get_api_url('nodes', namespace=False), + json=data) + return res.text if not res.ok else False + + +def set_node_schedulable_flag(node_hostname, schedulable): + """Marks given node as schedulable or unschedulable depending of + 'schedulable' flag value. + :param node_hostname: name of the node in kubernetes + :param schedulable: bool flag, if it is True, then node will be marked as + schedulable, if Flase - unschedulable + """ + url = get_api_url('nodes', node_hostname, namespace=False) + try_times = 100 + for _ in range(try_times): + try: + node = requests.get(url).json() + node['spec']['unschedulable'] = not schedulable + res = requests.put(url, data=json.dumps(node)) + except (requests.RequestException, ValueError, KeyError): + continue + if res.ok: + return True + return False + + +def remove_node_from_k8s(host): + r = requests.delete(get_api_url('nodes', host, namespace=False)) + return r.json() + + +def remove_node_install_log(hostname): + try: + os.remove(NODE_INSTALL_LOG_FILE.format(hostname)) + except OSError: + pass + + +def revoke_celery_node_install_task(node): + celery.control.revoke( + task_id=NODE_INSTALL_TASK_ID.format(node.hostname, node.id), + terminate=True, + ) diff --git a/kubedock/kapi/node_utils_aws.py b/kubedock/kapi/node_utils_aws.py new file mode 100644 index 000000000..4264776b0 --- /dev/null +++ b/kubedock/kapi/node_utils_aws.py @@ -0,0 +1,383 @@ +import hashlib +import socket +import time + +from boto import ec2 +from boto.ec2 import blockdevicemapping as ec2_bdm +from boto.ec2 import networkinterface as ec2_ni + +from kubedock.core import db, ssh_connect +from kubedock.kapi.node_utils import ( + add_node_to_k8s, + complete_calico_node_config, +) +from kubedock.nodes.models import Node +from kubedock.rbac.models import Role +from kubedock.settings import ( + AWS, + AWS_ACCESS_KEY_ID, + AWS_SECRET_ACCESS_KEY, + AWS_INSTANCE_RUNNING_INTERVAL, + AWS_INSTANCE_RUNNING_MAX_ATTEMTPS, + FIXED_IP_POOLS, + MASTER_IP, + NODE_INSTALL_LOG_FILE, + NODE_INSTALL_TIMEOUT_SEC, + NODE_SSH_COMMAND_SHORT_EXEC_TIMEOUT, + REGION, + SSH_PUB_FILENAME, + IS_PRODUCTION_PKG, +) +from kubedock.system_settings.models import SystemSettings +from kubedock.users.models import User, SessionData +from kubedock.utils import ( + NODE_STATUSES, + get_timezone, + get_version, + send_event, + send_logs, +) + + +CENTOS_AMI = { + 'ap-northeast-1': 'ami-eec1c380', + 
'ap-northeast-2': 'ami-c74789a9', + 'ap-southeast-1': 'ami-f068a193', + 'ap-southeast-2': 'ami-fedafc9d', + 'eu-west-1': 'ami-7abd0209', + 'eu-central-1': 'ami-9bf712f4', + 'sa-east-1': 'ami-26b93b4a', + 'us-east-1': 'ami-6d1c2007', + 'us-west-1': 'ami-af4333cf', + 'us-west-2': 'ami-d2c924b2', +} + + +HASH_FILES = [ + 'aws-kd-deploy/aws-kd-ami.sh', + 'deploy.sh', + 'kuberdock.spec', + 'node_install.sh', + 'node_install_ami.sh', + 'node_install_common.sh', + 'node_prepare_ami.sh', +] + + +USER_DATA_TEMPLATE = """\ +#!/bin/bash +echo "{0}" > /root/.ssh/authorized_keys +""" + + +class AMIException(Exception): + pass + + +def create_instance(image_id, node_root_disk_size, node_root_disk_type, + iam_profile_node_arn, node_size, subnet_id, + aws_ssh_key_name, security_group_ids, tags): + connection = get_connection() + + image = connection.get_image(image_id=image_id) + + block_device = ec2_bdm.EBSBlockDeviceType() + block_device.delete_on_termination = True + block_device.size = node_root_disk_size + block_device.volume_type = node_root_disk_type + + block_device_map = ec2_bdm.BlockDeviceMapping() + block_device_map[image.root_device_name] = block_device + + network_interface = ec2_ni.NetworkInterfaceSpecification( + subnet_id=subnet_id, + groups=security_group_ids, + associate_public_ip_address=True, + ) + network_interfaces = ec2_ni.NetworkInterfaceCollection(network_interface) + + user_data = get_user_data() + + reservation = connection.run_instances( + image_id=image_id, + instance_profile_arn=iam_profile_node_arn, + instance_type=node_size, + key_name=aws_ssh_key_name, + block_device_map=block_device_map, + network_interfaces=network_interfaces, + user_data=user_data, + ) + + instance = reservation.instances[0] + + connection.create_tags(resource_ids=[instance.id], tags=tags) + instance.modify_attribute(attribute='sourceDestCheck', value=False) + + return instance + + +def deploy_node(node_id, log_pod_ip): + db_node = Node.get_by_id(node_id) + admin_rid = Role.query.filter_by(rolename='Admin').one().id + channels = [i.id for i in SessionData.query.filter_by(role_id=admin_rid)] + initial_evt_sent = False + host = db_node.hostname + kube_type = db_node.kube_id + cpu_multiplier = SystemSettings.get_by_name('cpu_multiplier') + memory_multiplier = SystemSettings.get_by_name('memory_multiplier') + ku = User.get_internal() + token = ku.get_token() + with open(NODE_INSTALL_LOG_FILE.format(host), 'w') as log_file: + try: + timezone = get_timezone() + except OSError as e: + raise AMIException('Cannot get master timezone: {0}'.format(e)) + + send_logs( + db_node.id, + 'Connecting to {0} with ssh with user "root" ...'.format(host), + log_file, + channels + ) + + ssh, err = ssh_connect(host) + if err: + raise AMIException(err) + + sftp = ssh.open_sftp() + sftp.get_channel().settimeout(NODE_SSH_COMMAND_SHORT_EXEC_TIMEOUT) + sftp.put('/etc/kubernetes/configfile_for_nodes', + '/etc/kubernetes/configfile') + sftp.put('/etc/pki/etcd/ca.crt', '/etc/pki/etcd/ca.crt') + sftp.put('/etc/pki/etcd/etcd-client.crt', + '/etc/pki/etcd/etcd-client.crt') + sftp.put('/etc/pki/etcd/etcd-client.key', + '/etc/pki/etcd/etcd-client.key') + sftp.put('/etc/pki/etcd/etcd-dns.crt', '/etc/pki/etcd/etcd-dns.crt') + sftp.put('/etc/pki/etcd/etcd-dns.key', '/etc/pki/etcd/etcd-dns.key') + sftp.close() + + deploy_vars = { + 'NODE_IP': db_node.ip, + 'AWS': AWS, + 'MASTER_IP': MASTER_IP, + 'TZ': timezone, + 'NODENAME': host, + 'CPU_MULTIPLIER': cpu_multiplier, + 'MEMORY_MULTIPLIER': memory_multiplier, + 'FIXED_IP_POOLS': FIXED_IP_POOLS, 
+ 'TOKEN': token, + 'LOG_POD_IP': log_pod_ip, + } + deploy_cmd = 'bash /node_install_ami.sh' + + set_vars_str = ' '.join('{key}="{value}"'.format(key=k, value=v) + for k, v in deploy_vars.items()) + cmd = '{set_vars} {deploy_cmd}'.format(set_vars=set_vars_str, + deploy_cmd=deploy_cmd) + + s_time = time.time() + i, o, e = ssh.exec_command(cmd, + timeout=NODE_INSTALL_TIMEOUT_SEC, + get_pty=True) + try: + while not o.channel.exit_status_ready(): + data = o.channel.recv(1024) + while data: + if not initial_evt_sent: + send_event('node:change', {'id': db_node.id}) + initial_evt_sent = True + for line in data.split('\n'): + send_logs(db_node.id, line, log_file, channels) + data = o.channel.recv(1024) + if (time.time() - s_time) > NODE_INSTALL_TIMEOUT_SEC: + raise socket.timeout() + time.sleep(0.2) + except socket.timeout: + raise AMIException('Timeout hit during node install ' + '{0} cmd'.format(cmd)) + + ret_code = o.channel.recv_exit_status() + if ret_code != 0: + raise AMIException( + 'Node install failed cmd {0} ' + 'with retcode {1}'.format(cmd, ret_code) + ) + + complete_calico_node_config(host, db_node.ip) + + time.sleep(2) + + err = add_node_to_k8s(host, kube_type) + if err: + raise AMIException('ERROR adding node', log_file, channels) + + send_logs(db_node.id, 'Adding Node completed successful.', + log_file, channels) + send_logs(db_node.id, '===================================', + log_file, channels) + + ssh.close() + + db_node.state = NODE_STATUSES.completed + db.session.commit() + send_event('node:change', {'id': db_node.id}) + + +def detect_ami_image(role): + connection = get_connection() + + try: + images = connection.get_all_images( + filters={ + 'tag:KuberDockAmiVersion': get_ami_version(), + 'tag:KuberDockClusterRole': role, + } + ) + image = images[0] + except (AttributeError, IndexError, TypeError) as e: + raise AMIException(e) + + return image + + +def get_ami_hash(): + md5 = hashlib.md5() + for f in HASH_FILES: + with open(f) as fd: + md5.update(fd.read()) + return md5.hexdigest()[:8] + + +def get_ami_version(): + return get_version('kuberdock') if IS_PRODUCTION_PKG else get_ami_hash() + + +def get_connection(): + connection = ec2.connect_to_region( + REGION, + aws_access_key_id=AWS_ACCESS_KEY_ID, + aws_secret_access_key=AWS_SECRET_ACCESS_KEY, + ) + if connection is None: + raise AMIException('Unable to connect to region {0}'.format(REGION)) + return connection + + +def get_instance(hostname): + connection = get_connection() + + try: + instances = connection.get_only_instances( + filters={'private_dns_name': hostname} + ) + instance = instances[0] + except (AttributeError, IndexError, TypeError) as e: + raise AMIException(e) + + return instance + + +def get_instance_profile_arn(instance): + try: + instance_profile_arn = instance.instance_profile['arn'] + except (AttributeError, KeyError) as e: + raise AMIException(e) + return instance_profile_arn + + +def get_nginx_id_rsa_pub(): + with open(SSH_PUB_FILENAME) as id_rsa_pub_file: + id_rsa_pub = id_rsa_pub_file.read() + return id_rsa_pub + + +def get_node_data(hostname): + fast = True + try: + image = detect_ami_image('node') + except AMIException: + fast = False + image = CENTOS_AMI.get(REGION) + if image is None: + raise AMIException( + 'There is no CentOS AMI for region {0}'.format(REGION) + ) + instance = get_instance(hostname) + iam_profile = get_instance_profile_arn(instance) + root_volume = get_root_volume(instance) + + data = dict( + image_id=image.id, + node_root_disk_size=root_volume.size, + 
node_root_disk_type=root_volume.type, + iam_profile_node_arn=iam_profile, + node_size=instance.instance_type, + subnet_id=instance.subnet_id, + aws_ssh_key_name=instance.key_name, + security_group_ids=[group.id for group in instance.groups], + tags=instance.tags, + ) + + return data, fast + + +def get_root_volume(instance): + connection = get_connection() + + try: + block_device = instance.block_device_mapping[instance.root_device_name] + volumes = connection.get_all_volumes( + filters={'volume_id': block_device.volume_id} + ) + volume = volumes[0] + except (AttributeError, IndexError, KeyError) as e: + raise AMIException(e) + + return volume + + +def get_user_data(): + nginx_id_rsa_pub = get_nginx_id_rsa_pub() + user_data = USER_DATA_TEMPLATE.format(nginx_id_rsa_pub) + return user_data + + +def spawn_reserving_node(hostname): + node_data, fast = get_node_data(hostname) + instance = create_instance(**node_data) + return instance.private_dns_name, instance.private_ip_address, fast + + +def terminate_node(hostname): + instance = get_instance(hostname) + instance.connection.terminate_instances([instance.id]) + + +def wait_node_accessible(hostname): + attempt = AWS_INSTANCE_RUNNING_MAX_ATTEMTPS + while attempt > 0: + ssh, err = ssh_connect(hostname) + if err is None: + break + attempt -= 1 + time.sleep(AWS_INSTANCE_RUNNING_INTERVAL) + else: + raise AMIException( + 'Timeout waiting node {0} to be accessible'.format(hostname) + ) + return ssh + + +def wait_node_running(hostname): + attempt = AWS_INSTANCE_RUNNING_MAX_ATTEMTPS + while attempt > 0: + instance = get_instance(hostname) + if instance.state == 'running': + break + attempt -= 1 + time.sleep(AWS_INSTANCE_RUNNING_INTERVAL) + else: + raise AMIException( + 'Timeout waiting node {0} to be running'.format(hostname) + ) diff --git a/kubedock/kapi/nodes.py b/kubedock/kapi/nodes.py index c103464d8..83122cff5 100644 --- a/kubedock/kapi/nodes.py +++ b/kubedock/kapi/nodes.py @@ -20,56 +20,45 @@ """Management functions for nodes. """ -import math import os import socket from flask import current_app -from sqlalchemy.exc import IntegrityError from kubedock.exceptions import InternalPodsCreationError from . import ingress from . import pstorage -from .network_policies import get_dns_policy_config, get_logs_policy_config from .node_utils import ( add_node_to_db, delete_node_from_db, remove_ls_storage, - cleanup_node_network_policies) -from .podcollection import PodCollection + cleanup_node_network_policies, revoke_celery_node_install_task, + remove_node_install_log, remove_node_from_k8s) +from .service_pods import ( + create_dns_pod, + create_logs_pod, + create_policy_pod, + delete_logs_pod, +) from .. 
import tasks from ..billing import kubes_to_limits from ..billing.models import Kube from ..core import db, ExclusiveLockContextManager from ..domains.models import BaseDomain from ..exceptions import APIError -from ..kd_celery import celery from ..nodes.models import Node, NodeFlag, NodeFlagNames -from ..pods.models import Pod, PodIP +from ..pods.models import PodIP from ..settings import ( - MASTER_IP, KUBERDOCK_SETTINGS_FILE, KUBERDOCK_INTERNAL_USER, - ELASTICSEARCH_REST_PORT, NODE_INSTALL_LOG_FILE, NODE_INSTALL_TASK_ID, - ETCD_NETWORK_POLICY_SERVICE, ELASTICSEARCH_PUBLISH_PORT, CALICO, - ZFS, AWS, ETCD_CACERT, DNS_CLIENT_CRT, DNS_CLIENT_KEY, DNS_SERVICE_IP) + MASTER_IP, KUBERDOCK_SETTINGS_FILE, NODE_INSTALL_LOG_FILE, + NODE_INSTALL_TASK_ID, ZFS, AWS) from ..users.models import User from ..utils import ( - Etcd, send_event_to_role, run_ssh_command, - retry, NODE_STATUSES, get_node_token, ip2int, get_current_host_ips, get_node_interface, ) -from ..validation import check_internal_pod_data - -KUBERDOCK_DNS_POD_NAME = 'kuberdock-dns' -KUBERDOCK_POLICY_POD_NAME = 'kuberdock-policy-agent' -KUBERDOCK_LOGS_MEMORY_LIMIT = 256 * 1024 * 1024 - - -def get_kuberdock_logs_pod_name(node): - return 'kuberdock-logs-{0}'.format(node) def create_node(ip, hostname, kube_id, public_interface=None, @@ -168,120 +157,6 @@ def _create_internal_pods(hostname, token): return log_pod -def create_logs_pod(hostname, owner): - def _create_pod(): - pod_name = get_kuberdock_logs_pod_name(hostname) - dbpod = db.session.query(Pod).filter(Pod.name == pod_name, - Pod.owner_id == owner.id).first() - if dbpod: - return PodCollection(owner).get(dbpod.id, as_json=False) - - try: - logs_kubes = 1 - logcollector_kubes = logs_kubes - logstorage_kubes = logs_kubes - node_resources = kubes_to_limits( - logs_kubes, Kube.get_internal_service_kube_type() - )['resources'] - logs_memory_limit = node_resources['limits']['memory'] - if logs_memory_limit < KUBERDOCK_LOGS_MEMORY_LIMIT: - logs_kubes = int(math.ceil( - float(KUBERDOCK_LOGS_MEMORY_LIMIT) / logs_memory_limit - )) - - if logs_kubes > 1: - # allocate total log cubes to log collector and to log - # storage/search containers as 1 : 3 - total_kubes = logs_kubes * 2 - logcollector_kubes = int(math.ceil(float(total_kubes) / 4)) - logstorage_kubes = total_kubes - logcollector_kubes - internal_ku_token = owner.get_token() - - logs_config = get_kuberdock_logs_config( - hostname, pod_name, - Kube.get_internal_service_kube_type(), - logcollector_kubes, - logstorage_kubes, - MASTER_IP, - internal_ku_token - ) - check_internal_pod_data(logs_config, owner) - logs_pod = PodCollection(owner).add(logs_config, skip_check=True) - pod_id = logs_pod['id'] - PodCollection(owner).update( - pod_id, {'command': 'synchronous_start'} - ) - if CALICO: - logs_policy = get_logs_policy_config( - owner.id, pod_id, pod_name) - Etcd(ETCD_NETWORK_POLICY_SERVICE).put( - pod_name, value=logs_policy) - return PodCollection(owner).get(pod_id, as_json=False) - - except (IntegrityError, APIError): - # Either pod already exists or an error occurred during it's - # creation - log and retry - current_app.logger.exception( - 'During "{}" node creation tried to create a Logs service ' - 'pod but got an error.'.format(hostname)) - - return retry(_create_pod, 1, 5, exc=APIError('Could not create Log ' - 'service POD')) - - -def create_dns_pod(hostname, owner): - def _create_pod(): - if db.session.query(Pod) \ - .filter_by(name=KUBERDOCK_DNS_POD_NAME, owner=owner).first(): - return True - - try: - dns_config = 
get_dns_pod_config() - check_internal_pod_data(dns_config, owner) - dns_pod = PodCollection(owner).add(dns_config, skip_check=True) - PodCollection(owner).update(dns_pod['id'], - {'command': 'synchronous_start'}) - if CALICO: - dns_policy = get_dns_policy_config(owner.id, dns_pod['id']) - Etcd(ETCD_NETWORK_POLICY_SERVICE).put( - KUBERDOCK_DNS_POD_NAME, value=dns_policy - ) - return True - except (IntegrityError, APIError): - # Either pod already exists or an error occurred during it's - # creation - log and retry - current_app.logger.exception( - 'During "{}" node creation tried to create a DNS service ' - 'pod but got an error.'.format(hostname)) - - return retry(_create_pod, 1, 5, exc=APIError('Could not create DNS ' - 'service POD')) - - -def create_policy_pod(hostname, owner, token): - def _create_pod(): - if db.session.query(Pod).filter_by( - name=KUBERDOCK_POLICY_POD_NAME, owner=owner).first(): - return True - - try: - policy_conf = get_policy_agent_config(MASTER_IP, token) - check_internal_pod_data(policy_conf, owner) - policy_pod = PodCollection(owner).add(policy_conf, skip_check=True) - PodCollection(owner).update(policy_pod['id'], - {'command': 'synchronous_start'}) - return True - except (IntegrityError, APIError): - # Either pod already exists or an error occurred during it's - # creation - log and retry - current_app.logger.exception( - 'During "{}" node creation tried to create a Network Policy ' - 'service pod but got an error.'.format(hostname)) - - return retry(_create_pod, 1, 5, exc=APIError('Could not create Network ' - 'Policy service POD')) - - def mark_node_as_being_deleted(node_id): node = Node.query.filter(Node.id == node_id).first() if node is None: @@ -313,20 +188,11 @@ def delete_node(node_id=None, node=None, force=False, verbose=True): _check_node_can_be_deleted(node) try: - celery.control.revoke(task_id=NODE_INSTALL_TASK_ID.format( - node.hostname, node.id - ), - terminate=True) + revoke_celery_node_install_task(node) except Exception as e: raise APIError('Couldn\'t cancel node deployment. Error: {}'.format(e)) - ku = User.query.filter_by(username=KUBERDOCK_INTERNAL_USER).first() - - logs_pod_name = get_kuberdock_logs_pod_name(node.hostname) - logs_pod = db.session.query(Pod).filter_by(name=logs_pod_name, - owner=ku).first() - if logs_pod: - PodCollection(ku).delete(logs_pod.id, force=True) + delete_logs_pod(node.hostname) try: delete_node_from_db(node) @@ -334,15 +200,13 @@ def delete_node(node_id=None, node=None, force=False, verbose=True): raise APIError(u'Failed to delete node from DB: {}'.format(e)) ls_clean_error = remove_ls_storage(hostname, raise_on_error=False) - res = tasks.remove_node_by_host(hostname) + res = remove_node_from_k8s(hostname) if res['status'] == 'Failure' and res['code'] != 404: raise APIError('Failed to delete node in k8s: {0}, code: {1}. \n' 'Please check and remove it manually if needed.' .format(res['message'], res['code'])) - try: - os.remove(NODE_INSTALL_LOG_FILE.format(hostname)) - except OSError: - pass + + remove_node_install_log(hostname) try: cleanup_node_network_policies(hostname) @@ -478,326 +342,3 @@ def _deploy_node(dbnode, log_pod_ip, do_deploy, with_testing, else: dbnode.state = NODE_STATUSES.completed db.session.commit() - - -def get_kuberdock_logs_config(node, name, kube_type, - collector_kubes, storage_kubes, master_ip, - internal_ku_token): - # Give 2/3 of elastic kubes limits to elastic heap. It's recommended do not - # give all memory to the heap, and leave some to Lucene. 
- es_memory_limit = kubes_to_limits( - storage_kubes, kube_type - )['resources']['limits']['memory'] - es_heap_limit = (es_memory_limit * 2) / 3 - return { - "name": name, - "replicas": 1, - "kube_type": kube_type, - "node": node, - "restartPolicy": "Always", - "volumes": [ - { - "name": "docker-containers", - # path is allowed only for kuberdock-internal - "localStorage": {"path": "/var/lib/docker/containers"} - }, - { - "name": "es-persistent-storage", - # path is allowed only for kuberdock-internal - "localStorage": {"path": "/var/lib/elasticsearch"}, - } - ], - "containers": [ - { - "command": ["./run.sh"], - "kubes": collector_kubes, - "image": "kuberdock/fluentd:1.8", - "name": "fluentd", - "env": [ - { - "name": "NODENAME", - "value": node - }, - { - "name": "ES_HOST", - "value": "127.0.0.1" - } - ], - "ports": [ - { - "isPublic": False, - "protocol": "UDP", - "containerPort": 5140, - "hostPort": 5140 - } - ], - "volumeMounts": [ - { - "name": "docker-containers", - "mountPath": "/var/lib/docker/containers" - } - ], - "workingDir": "/root", - "terminationMessagePath": None - }, - { - "kubes": storage_kubes, - "image": "kuberdock/elasticsearch:2.2", - "name": "elasticsearch", - "env": [ - { - "name": "MASTER", - "value": master_ip - }, - { - "name": "TOKEN", - "value": internal_ku_token - }, - { - "name": "NODENAME", - "value": node - }, - { - "name": "ES_HEAP_SIZE", - "value": "{}m".format(es_heap_limit / (1024 * 1024)) - } - ], - "ports": [ - { - "isPublic": False, - "protocol": "TCP", - "containerPort": ELASTICSEARCH_REST_PORT, - "hostPort": ELASTICSEARCH_REST_PORT - }, - { - "isPublic": False, - "protocol": "TCP", - "containerPort": ELASTICSEARCH_PUBLISH_PORT, - "hostPort": ELASTICSEARCH_PUBLISH_PORT - } - ], - "volumeMounts": [ - { - "name": "es-persistent-storage", - "mountPath": "/elasticsearch/data" - } - ], - "workingDir": "", - "terminationMessagePath": None - } - ] - } - - -def get_dns_pod_config(domain='kuberdock', ip=DNS_SERVICE_IP): - """Returns config of k8s DNS service pod.""" - # Based on - # https://github.com/kubernetes/kubernetes/blob/release-1.2/ - # cluster/addons/dns/skydns-rc.yaml.in - # TODO AC-3377: migrate on yaml-based templates - # TODO AC-3378: integrate exechealthz container - return { - "name": KUBERDOCK_DNS_POD_NAME, - "podIP": ip, - "replicas": 1, - "kube_type": Kube.get_internal_service_kube_type(), - "node": None, - "restartPolicy": "Always", - "dnsPolicy": "Default", - "volumes": [ - { - "name": "kubernetes-config", - # path is allowed only for kuberdock-internal - "localStorage": {"path": "/etc/kubernetes"} - }, - { - "name": "etcd-pki", - # path is allowed only for kuberdock-internal - "localStorage": {"path": "/etc/pki/etcd"} - } - ], - "containers": [ - { - "name": "etcd", - "command": [ - "/usr/local/bin/etcd", - "-data-dir", - "/var/etcd/data", - "-listen-client-urls", - "https://0.0.0.0:2379,http://127.0.0.1:4001", - "-advertise-client-urls", - "https://0.0.0.0:2379,http://127.0.0.1:4001", - "-initial-cluster-token", - "skydns-etcd", - "--ca-file", - ETCD_CACERT, - "--cert-file", - "/etc/pki/etcd/etcd-dns.crt", - "--key-file", - "/etc/pki/etcd/etcd-dns.key" - ], - "kubes": 1, - "image": "gcr.io/google_containers/etcd-amd64:2.2.1", - "env": [], - "ports": [ - { - "isPublic": False, - "protocol": "TCP", - "containerPort": 2379 - } - ], - "volumeMounts": [ - { - "name": "etcd-pki", - "mountPath": "/etc/pki/etcd" - } - ], - "workingDir": "", - "terminationMessagePath": None - }, - { - "name": "kube2sky", - "args": [ - 
"--domain={0}".format(domain), - "--kubecfg-file=/etc/kubernetes/configfile", - "--kube-master-url=https://10.254.0.1", - ], - "kubes": 1, - "image": "kuberdock/kube2sky:1.2", - "env": [], - "ports": [], - "volumeMounts": [ - { - "name": "kubernetes-config", - "mountPath": "/etc/kubernetes" - } - ], - "workingDir": "", - "terminationMessagePath": None, - "readinessProbe": { - "httpGet": { - "path": "/readiness", - "port": 8081, - "scheme": "HTTP", - }, - "initialDelaySeconds": 30, - "timeoutSeconds": 5 - }, - "livenessProbe": { - "httpGet": { - "path": "/healthz", - "port": 8080, - "scheme": "HTTP" - }, - "initialDelaySeconds": 60, - "timeoutSeconds": 5, - "successThreshold": 1, - "failureThreshold": 5, - } - }, - { - "name": "skydns", - "args": [ - "-machines=http://127.0.0.1:4001", - "-addr=0.0.0.0:53", - "-ns-rotate=false", - "-domain={0}.".format(domain) - ], - "kubes": 1, - "image": "gcr.io/google_containers/skydns:2015-10-13-8c72f8c", - "env": [], - "ports": [ - { - "isPublic": False, - "protocol": "UDP", - "containerPort": 53 - }, - { - "isPublic": False, - "protocol": "TCP", - "containerPort": 53 - } - ], - "volumeMounts": [], - "workingDir": "", - "terminationMessagePath": None - }, - { - "name": "healthz", - "image": "gcr.io/google_containers/exechealthz:1.0", - "args": [ - "-cmd=nslookup {0} 127.0.0.1 >/dev/null".format(domain), - "-port=8080" - ], - "ports": [{ - "protocol": "TCP", - "containerPort": 8080 - }] - } - ] - } - - -def get_policy_agent_config(master, token): - return { - "name": "kuberdock-policy-agent", - "replicas": 1, - "kube_type": Kube.get_internal_service_kube_type(), - "node": None, - "restartPolicy": "Always", - "hostNetwork": True, - "volumes": [ - { - "name": "etcd-pki", - # path is allowed only for kuberdock-internal - "localStorage": {"path": "/etc/pki/etcd"} - } - ], - "containers": [ - { - "command": [], - "kubes": 1, - "image": "kuberdock/k8s-policy-agent:v0.1.4-kd2", - "name": "policy-agent", - "env": [ - { - "name": "ETCD_AUTHORITY", - "value": "{0}:2379".format(master) - }, - { - "name": "ETCD_SCHEME", - "value": "https" - }, - { - "name": "ETCD_CA_CERT_FILE", - "value": ETCD_CACERT - }, - { - "name": "ETCD_CERT_FILE", - "value": DNS_CLIENT_CRT - }, - { - "name": "ETCD_KEY_FILE", - "value": DNS_CLIENT_KEY - }, - { - "name": "K8S_API", - "value": "https://{0}:6443".format(master) - }, - { - "name": "K8S_AUTH_TOKEN", - "value": token - } - ], - "ports": [], - "volumeMounts": [ - { - "name": "etcd-pki", - "mountPath": "/etc/pki/etcd" - } - ], - "workingDir": "", - "terminationMessagePath": None - }, - ] - } diff --git a/kubedock/kapi/service_pods.py b/kubedock/kapi/service_pods.py new file mode 100644 index 000000000..dce7e113f --- /dev/null +++ b/kubedock/kapi/service_pods.py @@ -0,0 +1,479 @@ +import math + +from flask import current_app +from sqlalchemy.exc import IntegrityError + +from .network_policies import get_dns_policy_config, get_logs_policy_config +from .podcollection import PodCollection +from ..billing import kubes_to_limits +from ..billing.models import Kube +from ..core import db +from ..exceptions import APIError +from ..pods.models import Pod +from ..settings import ( + DNS_CLIENT_CRT, + DNS_CLIENT_KEY, + DNS_SERVICE_IP, + ELASTICSEARCH_PUBLISH_PORT, + ELASTICSEARCH_REST_PORT, + ETCD_CACERT, + ETCD_NETWORK_POLICY_SERVICE, + MASTER_IP, +) +from ..users.models import User +from ..utils import Etcd, retry +from ..validation import check_internal_pod_data + + +KUBERDOCK_DNS_POD_NAME = 'kuberdock-dns' +KUBERDOCK_POLICY_POD_NAME = 
'kuberdock-policy-agent' +KUBERDOCK_LOGS_MEMORY_LIMIT = 256 * 1024 * 1024 + + +def get_kuberdock_logs_pod_name(node): + return 'kuberdock-logs-{0}'.format(node) + + +def create_logs_pod(hostname, owner): + def _create_pod(): + pod_name = get_kuberdock_logs_pod_name(hostname) + dbpod = db.session.query(Pod).filter(Pod.name == pod_name, + Pod.owner_id == owner.id).first() + if dbpod: + return PodCollection(owner).get(dbpod.id, as_json=False) + + try: + logs_kubes = 1 + logcollector_kubes = logs_kubes + logstorage_kubes = logs_kubes + node_resources = kubes_to_limits( + logs_kubes, Kube.get_internal_service_kube_type() + )['resources'] + logs_memory_limit = node_resources['limits']['memory'] + if logs_memory_limit < KUBERDOCK_LOGS_MEMORY_LIMIT: + logs_kubes = int(math.ceil( + float(KUBERDOCK_LOGS_MEMORY_LIMIT) / logs_memory_limit + )) + + if logs_kubes > 1: + # allocate total log cubes to log collector and to log + # storage/search containers as 1 : 3 + total_kubes = logs_kubes * 2 + logcollector_kubes = int(math.ceil(float(total_kubes) / 4)) + logstorage_kubes = total_kubes - logcollector_kubes + internal_ku_token = owner.get_token() + + logs_config = get_kuberdock_logs_config( + hostname, pod_name, + Kube.get_internal_service_kube_type(), + logcollector_kubes, + logstorage_kubes, + MASTER_IP, + internal_ku_token + ) + check_internal_pod_data(logs_config, owner) + logs_pod = PodCollection(owner).add(logs_config, skip_check=True) + pod_id = logs_pod['id'] + PodCollection(owner).update( + pod_id, {'command': 'synchronous_start'} + ) + logs_policy = get_logs_policy_config( + owner.id, pod_id, pod_name) + Etcd(ETCD_NETWORK_POLICY_SERVICE).put( + pod_name, value=logs_policy) + return PodCollection(owner).get(pod_id, as_json=False) + + except (IntegrityError, APIError): + # Either pod already exists or an error occurred during it's + # creation - log and retry + current_app.logger.exception( + 'During "{}" node creation tried to create a Logs service ' + 'pod but got an error.'.format(hostname)) + + return retry(_create_pod, 1, 5, exc=APIError('Could not create Log ' + 'service POD')) + + +def create_dns_pod(hostname, owner): + def _create_pod(): + if db.session.query(Pod) \ + .filter_by(name=KUBERDOCK_DNS_POD_NAME, owner=owner).first(): + return True + + try: + dns_config = get_dns_pod_config() + check_internal_pod_data(dns_config, owner) + dns_pod = PodCollection(owner).add(dns_config, skip_check=True) + PodCollection(owner).update(dns_pod['id'], + {'command': 'synchronous_start'}) + dns_policy = get_dns_policy_config(owner.id, dns_pod['id']) + Etcd(ETCD_NETWORK_POLICY_SERVICE).put( + KUBERDOCK_DNS_POD_NAME, value=dns_policy + ) + return True + except (IntegrityError, APIError): + # Either pod already exists or an error occurred during it's + # creation - log and retry + current_app.logger.exception( + 'During "{}" node creation tried to create a DNS service ' + 'pod but got an error.'.format(hostname)) + + return retry(_create_pod, 1, 5, exc=APIError('Could not create DNS ' + 'service POD')) + + +def create_policy_pod(hostname, owner, token): + def _create_pod(): + if db.session.query(Pod).filter_by( + name=KUBERDOCK_POLICY_POD_NAME, owner=owner).first(): + return True + + try: + policy_conf = get_policy_agent_config(MASTER_IP, token) + check_internal_pod_data(policy_conf, owner) + policy_pod = PodCollection(owner).add(policy_conf, skip_check=True) + PodCollection(owner).update(policy_pod['id'], + {'command': 'synchronous_start'}) + return True + except (IntegrityError, APIError): + # Either 
pod already exists or an error occurred during it's + # creation - log and retry + current_app.logger.exception( + 'During "{}" node creation tried to create a Network Policy ' + 'service pod but got an error.'.format(hostname)) + + return retry(_create_pod, 1, 5, exc=APIError('Could not create Network ' + 'Policy service POD')) + + +def delete_logs_pod(hostname): + ku = User.get_internal() + + logs_pod_name = get_kuberdock_logs_pod_name(hostname) + logs_pod = db.session.query(Pod).filter_by(name=logs_pod_name, + owner=ku).first() + if logs_pod: + PodCollection(ku).delete(logs_pod.id, force=True) + + +def get_kuberdock_logs_config(node, name, kube_type, + collector_kubes, storage_kubes, master_ip, + internal_ku_token): + # Give 2/3 of elastic kubes limits to elastic heap. It's recommended do not + # give all memory to the heap, and leave some to Lucene. + es_memory_limit = kubes_to_limits( + storage_kubes, kube_type + )['resources']['limits']['memory'] + es_heap_limit = (es_memory_limit * 2) / 3 + return { + "name": name, + "replicas": 1, + "kube_type": kube_type, + "node": node, + "restartPolicy": "Always", + "volumes": [ + { + "name": "docker-containers", + # path is allowed only for kuberdock-internal + "localStorage": {"path": "/var/lib/docker/containers"} + }, + { + "name": "es-persistent-storage", + # path is allowed only for kuberdock-internal + "localStorage": {"path": "/var/lib/elasticsearch"}, + } + ], + "containers": [ + { + "command": ["./run.sh"], + "kubes": collector_kubes, + "image": "kuberdock/fluentd:1.8", + "name": "fluentd", + "env": [ + { + "name": "NODENAME", + "value": node + }, + { + "name": "ES_HOST", + "value": "127.0.0.1" + } + ], + "ports": [ + { + "isPublic": False, + "protocol": "UDP", + "containerPort": 5140, + "hostPort": 5140 + } + ], + "volumeMounts": [ + { + "name": "docker-containers", + "mountPath": "/var/lib/docker/containers" + } + ], + "workingDir": "/root", + "terminationMessagePath": None + }, + { + "kubes": storage_kubes, + "image": "kuberdock/elasticsearch:2.2", + "name": "elasticsearch", + "env": [ + { + "name": "MASTER", + "value": master_ip + }, + { + "name": "TOKEN", + "value": internal_ku_token + }, + { + "name": "NODENAME", + "value": node + }, + { + "name": "ES_HEAP_SIZE", + "value": "{}m".format(es_heap_limit / (1024 * 1024)) + } + ], + "ports": [ + { + "isPublic": False, + "protocol": "TCP", + "containerPort": ELASTICSEARCH_REST_PORT, + "hostPort": ELASTICSEARCH_REST_PORT + }, + { + "isPublic": False, + "protocol": "TCP", + "containerPort": ELASTICSEARCH_PUBLISH_PORT, + "hostPort": ELASTICSEARCH_PUBLISH_PORT + } + ], + "volumeMounts": [ + { + "name": "es-persistent-storage", + "mountPath": "/elasticsearch/data" + } + ], + "workingDir": "", + "terminationMessagePath": None + } + ] + } + + +def get_dns_pod_config(domain='kuberdock', ip=DNS_SERVICE_IP): + """Returns config of k8s DNS service pod.""" + # Based on + # https://github.com/kubernetes/kubernetes/blob/release-1.2/ + # cluster/addons/dns/skydns-rc.yaml.in + # TODO AC-3377: migrate on yaml-based templates + # TODO AC-3378: integrate exechealthz container + return { + "name": KUBERDOCK_DNS_POD_NAME, + "podIP": ip, + "replicas": 1, + "kube_type": Kube.get_internal_service_kube_type(), + "node": None, + "restartPolicy": "Always", + "dnsPolicy": "Default", + "volumes": [ + { + "name": "kubernetes-config", + # path is allowed only for kuberdock-internal + "localStorage": {"path": "/etc/kubernetes"} + }, + { + "name": "etcd-pki", + # path is allowed only for kuberdock-internal + 
"localStorage": {"path": "/etc/pki/etcd"} + } + ], + "containers": [ + { + "name": "etcd", + "command": [ + "/usr/local/bin/etcd", + "-data-dir", + "/var/etcd/data", + "-listen-client-urls", + "https://0.0.0.0:2379,http://127.0.0.1:4001", + "-advertise-client-urls", + "https://0.0.0.0:2379,http://127.0.0.1:4001", + "-initial-cluster-token", + "skydns-etcd", + "--ca-file", + ETCD_CACERT, + "--cert-file", + "/etc/pki/etcd/etcd-dns.crt", + "--key-file", + "/etc/pki/etcd/etcd-dns.key" + ], + "kubes": 1, + "image": "gcr.io/google_containers/etcd-amd64:2.2.1", + "env": [], + "ports": [ + { + "isPublic": False, + "protocol": "TCP", + "containerPort": 2379 + } + ], + "volumeMounts": [ + { + "name": "etcd-pki", + "mountPath": "/etc/pki/etcd" + } + ], + "workingDir": "", + "terminationMessagePath": None + }, + { + "name": "kube2sky", + "args": [ + "--domain={0}".format(domain), + "--kubecfg-file=/etc/kubernetes/configfile", + "--kube-master-url=https://10.254.0.1", + ], + "kubes": 1, + "image": "kuberdock/kube2sky:1.2", + "env": [], + "ports": [], + "volumeMounts": [ + { + "name": "kubernetes-config", + "mountPath": "/etc/kubernetes" + } + ], + "workingDir": "", + "terminationMessagePath": None, + "readinessProbe": { + "httpGet": { + "path": "/readiness", + "port": 8081, + "scheme": "HTTP", + }, + "initialDelaySeconds": 30, + "timeoutSeconds": 5 + }, + "livenessProbe": { + "httpGet": { + "path": "/healthz", + "port": 8080, + "scheme": "HTTP" + }, + "initialDelaySeconds": 60, + "timeoutSeconds": 5, + "successThreshold": 1, + "failureThreshold": 5, + } + }, + { + "name": "skydns", + "args": [ + "-machines=http://127.0.0.1:4001", + "-addr=0.0.0.0:53", + "-ns-rotate=false", + "-domain={0}.".format(domain) + ], + "kubes": 1, + "image": "gcr.io/google_containers/skydns:2015-10-13-8c72f8c", + "env": [], + "ports": [ + { + "isPublic": False, + "protocol": "UDP", + "containerPort": 53 + }, + { + "isPublic": False, + "protocol": "TCP", + "containerPort": 53 + } + ], + "volumeMounts": [], + "workingDir": "", + "terminationMessagePath": None + }, + { + "name": "healthz", + "image": "gcr.io/google_containers/exechealthz:1.0", + "args": [ + "-cmd=nslookup {0} 127.0.0.1 >/dev/null".format(domain), + "-port=8080" + ], + "ports": [{ + "protocol": "TCP", + "containerPort": 8080 + }] + } + ] + } + + +def get_policy_agent_config(master, token): + return { + "name": "kuberdock-policy-agent", + "replicas": 1, + "kube_type": Kube.get_internal_service_kube_type(), + "node": None, + "restartPolicy": "Always", + "hostNetwork": True, + "volumes": [ + { + "name": "etcd-pki", + # path is allowed only for kuberdock-internal + "localStorage": {"path": "/etc/pki/etcd"} + } + ], + "containers": [ + { + "command": [], + "kubes": 1, + "image": "kuberdock/k8s-policy-agent:v0.1.4-kd2", + "name": "policy-agent", + "env": [ + { + "name": "ETCD_AUTHORITY", + "value": "{0}:2379".format(master) + }, + { + "name": "ETCD_SCHEME", + "value": "https" + }, + { + "name": "ETCD_CA_CERT_FILE", + "value": ETCD_CACERT + }, + { + "name": "ETCD_CERT_FILE", + "value": DNS_CLIENT_CRT + }, + { + "name": "ETCD_KEY_FILE", + "value": DNS_CLIENT_KEY + }, + { + "name": "K8S_API", + "value": "https://{0}:6443".format(master) + }, + { + "name": "K8S_AUTH_TOKEN", + "value": token + } + ], + "ports": [], + "volumeMounts": [ + { + "name": "etcd-pki", + "mountPath": "/etc/pki/etcd" + } + ], + "workingDir": "", + "terminationMessagePath": None + }, + ] + } diff --git a/kubedock/kapi/tests/test_elasticsearch_utils.py b/kubedock/kapi/tests/test_elasticsearch_utils.py 
index e0972dfcb..031b4cffa 100644 --- a/kubedock/kapi/tests/test_elasticsearch_utils.py +++ b/kubedock/kapi/tests/test_elasticsearch_utils.py @@ -33,7 +33,7 @@ from kubedock.settings import KUBE_API_HOST, KUBE_API_PORT from kubedock.billing.models import Kube from kubedock.nodes.models import Node -from kubedock.kapi.nodes import get_kuberdock_logs_pod_name +from kubedock.kapi.service_pods import get_kuberdock_logs_pod_name class TestElasticsearchUtils(DBTestCase): diff --git a/kubedock/kapi/tests/test_nodes.py b/kubedock/kapi/tests/test_nodes.py index db2d57876..83a02ba1f 100644 --- a/kubedock/kapi/tests/test_nodes.py +++ b/kubedock/kapi/tests/test_nodes.py @@ -32,9 +32,8 @@ from kubedock.billing.models import Kube from kubedock.core import db from kubedock.exceptions import APIError -from kubedock.kapi import nodes, network_policies +from kubedock.kapi import nodes from kubedock.nodes.models import Node -from kubedock.users.models import User from kubedock.pods.models import IPPool from kubedock.testutils.testcases import DBTestCase from kubedock.utils import NODE_STATUSES @@ -74,19 +73,20 @@ def tearDown(self): @mock.patch.object(nodes, 'get_current_host_ips') @mock.patch.object(nodes, '_check_node_ip') @mock.patch.object(nodes, 'PodIP') + @mock.patch.object(nodes, 'create_policy_pod') @mock.patch.object(nodes, 'create_logs_pod') - @mock.patch.object(nodes, 'Etcd') + @mock.patch.object(nodes, 'create_dns_pod') @mock.patch.object(nodes, 'get_node_token') @mock.patch.object(nodes, '_check_node_hostname') @mock.patch.object(nodes, '_deploy_node') - @mock.patch.object(nodes, 'PodCollection') @mock.patch.object(nodes.socket, 'gethostbyname') - def test_create_node(self, gethostbyname_mock, podcollection_mock, + def test_create_node(self, gethostbyname_mock, deploy_node_mock, check_node_hostname_mock, get_node_token_mock, - etcd_mock, + create_dns_pod_mock, create_logs_pod_mock, + create_policy_pod_mock, pod_ip_mock, check_node_ip_mock, get_current_host_ips_mock): @@ -99,13 +99,10 @@ def test_create_node(self, gethostbyname_mock, podcollection_mock, gethostbyname_mock.return_value = ip get_current_host_ips_mock.return_value = fake_master_ips - podcollection_mock.return_value.add.return_value = {'id': 1} get_node_token_mock.return_value = 'some-token' log_pod_ip = '123.123.123.123' create_logs_pod_mock.return_value = {'podIP': log_pod_ip} - nodes.CALICO = True - # add a Node with Pod IP pod_ip = mock.Mock() pod_ip.ip_address = 3232235778 # '192.168.1.2' @@ -135,7 +132,9 @@ def test_create_node(self, gethostbyname_mock, podcollection_mock, check_node_ip_mock.assert_called_once_with(node.ip, hostname) get_node_token_mock.assert_called_once_with() # one call for dns pod - self.assertEqual(etcd_mock.call_count, 1) + self.assertEqual(create_dns_pod_mock.call_count, 1) + # one call for policy pod + self.assertEqual(create_policy_pod_mock.call_count, 1) # add a node with the same IP with self.assertRaises(APIError): @@ -147,30 +146,9 @@ def test_create_node(self, gethostbyname_mock, podcollection_mock, with self.assertRaises(APIError): nodes.create_node(None, 'anotherhost', kube_id) - @mock.patch.object(network_policies, 'get_calico_ip_tunnel_address') - @mock.patch.object(nodes, 'Etcd') - @mock.patch.object(nodes, 'PodCollection') - def test_create_logs_pod(self, podcollection_mock, etcd_mock, - get_calico_ip_mock): - hostname = 'qwerty' - test_result = 3131313 - pod_id = '424242' - podcollection_mock.return_value.add.return_value = {'id': pod_id} - podcollection_mock.return_value.get.return_value = 
test_result - get_calico_ip_mock.return_value = '12.12.12.12' - nodes.CALICO = True - owner = User.get_internal() - res = nodes.create_logs_pod(hostname, owner) - self.assertEqual(res, test_result) - self.assertEqual(etcd_mock.call_count, 1) - get_calico_ip_mock.assert_called_once_with() - podcollection_mock.return_value.update.assert_called_once_with( - pod_id, {'command': 'synchronous_start'} - ) - @mock.patch.object(nodes, 'remove_ls_storage') - @mock.patch.object(nodes.tasks, 'remove_node_by_host') - def test_delete_node(self, remove_by_host_mock, remove_ls_storage_mock): + @mock.patch.object(nodes, 'remove_node_from_k8s') + def test_delete_node(self, remove_node_mock, remove_ls_storage_mock): """Test for kapi.nodes.delete_node function.""" node1, node2 = self.add_two_nodes() id1 = node1.id @@ -179,7 +157,7 @@ def test_delete_node(self, remove_by_host_mock, remove_ls_storage_mock): nodes.delete_node(id1) nodes_ = Node.get_all() - remove_by_host_mock.assert_called_once_with(node1.hostname) + remove_node_mock.assert_called_once_with(node1.hostname) remove_ls_storage_mock.assert_called_once_with( node1.hostname, raise_on_error=False) self.assertEqual(nodes_, [node2]) @@ -188,9 +166,9 @@ def test_delete_node(self, remove_by_host_mock, remove_ls_storage_mock): nodes.delete_node(id1) @mock.patch.object(nodes, 'remove_ls_storage') - @mock.patch.object(nodes.tasks, 'remove_node_by_host') + @mock.patch.object(nodes, 'remove_node_from_k8s') def test_node_cant_be_deleted_in_fixed_ip_pools_mode_with_active_ip_pools( - self, remove_by_host_mock, remove_ls_storage_mock): + self, remove_node_mock, remove_ls_storage_mock): remove_ls_storage_mock.return_value = '' current_app.config['FIXED_IP_POOLS'] = True diff --git a/kubedock/kapi/tests/test_service_pods.py b/kubedock/kapi/tests/test_service_pods.py new file mode 100644 index 000000000..d2c715bd4 --- /dev/null +++ b/kubedock/kapi/tests/test_service_pods.py @@ -0,0 +1,28 @@ +import mock + +from kubedock.kapi import network_policies, service_pods +from kubedock.users.models import User +from kubedock.testutils.testcases import DBTestCase + + +class TestServicePods(DBTestCase): + + @mock.patch.object(network_policies, 'get_calico_ip_tunnel_address') + @mock.patch.object(service_pods, 'Etcd') + @mock.patch.object(service_pods, 'PodCollection') + def test_create_logs_pod(self, podcollection_mock, etcd_mock, + get_calico_ip_mock): + hostname = 'qwerty' + test_result = 3131313 + pod_id = '424242' + podcollection_mock.return_value.add.return_value = {'id': pod_id} + podcollection_mock.return_value.get.return_value = test_result + get_calico_ip_mock.return_value = '12.12.12.12' + owner = User.get_internal() + res = service_pods.create_logs_pod(hostname, owner) + self.assertEqual(res, test_result) + self.assertEqual(etcd_mock.call_count, 1) + get_calico_ip_mock.assert_called_once_with() + podcollection_mock.return_value.update.assert_called_once_with( + pod_id, {'command': 'synchronous_start'} + ) diff --git a/kubedock/network_policies_utils.py b/kubedock/network_policies_utils.py index 547220abc..04ca62293 100644 --- a/kubedock/network_policies_utils.py +++ b/kubedock/network_policies_utils.py @@ -32,7 +32,7 @@ get_tiers, ) from .kapi.node_utils import complete_calico_node_config -from .kapi.nodes import ( +from .kapi.service_pods import ( KUBERDOCK_DNS_POD_NAME, get_kuberdock_logs_pod_name, ) diff --git a/kubedock/settings.py b/kubedock/settings.py index 2d8d614a5..a8e4b5ec0 100644 --- a/kubedock/settings.py +++ b/kubedock/settings.py @@ -191,6 +191,7 @@ 
def _get_remote_sentry_settings(): # If None, defaults will be used SSH_KEY_FILENAME = '/var/lib/nginx/.ssh/id_rsa' +SSH_PUB_FILENAME = '/var/lib/nginx/.ssh/id_rsa.pub' INFLUXDB_HOST = os.environ.get('INFLUXDB_HOST', '127.0.0.1') INFLUXDB_PORT = 8086 @@ -284,12 +285,15 @@ def _get_remote_sentry_settings(): ZFS = False AWS = False +AWS_ACCESS_KEY_ID = AWS_SECRET_ACCESS_KEY = REGION = '' # Default EBS volume type for node storage on AWS. # Available types are: 'standard', 'io1', 'gp2' AWS_DEFAULT_EBS_VOLUME_TYPE = 'standard' AWS_DEFAULT_EBS_VOLUME_IOPS = 1000 # AWS EBS volume types which support provisioned iops AWS_IOPS_PROVISION_VOLUME_TYPES = ('io1',) +AWS_INSTANCE_RUNNING_INTERVAL = 3 +AWS_INSTANCE_RUNNING_MAX_ATTEMTPS = 30 MASTER_IP = '' MASTER_TOBIND_FLANNEL = 'enp0s5' diff --git a/kubedock/tasks.py b/kubedock/tasks.py index fcf6f2f15..ec531546b 100644 --- a/kubedock/tasks.py +++ b/kubedock/tasks.py @@ -32,10 +32,12 @@ from . import dns_management from .core import db, ssh_connect +from .kapi import migrate_pod_utils, node_utils_aws from .kapi.collect import collect, send from .kapi.helpers import KubeQuery, raise_if_failure from .kapi.node import Node as K8SNode from .kapi.node_utils import ( + add_node_to_k8s, remove_node_from_k8s, setup_storage_to_aws_node, add_volume_to_node_ls, complete_calico_node_config) from .kapi.pstorage import ( @@ -48,15 +50,16 @@ from .rbac.models import Role from .settings import ( NODE_INSTALL_LOG_FILE, MASTER_IP, AWS, NODE_INSTALL_TIMEOUT_SEC, - NODE_CEPH_AWARE_KUBERDOCK_LABEL, CEPH, CEPH_KEYRING_PATH, + CEPH, CEPH_KEYRING_PATH, CEPH_POOL_NAME, CEPH_CLIENT_USER, KUBERDOCK_INTERNAL_USER, NODE_SSH_COMMAND_SHORT_EXEC_TIMEOUT, - CALICO, NODE_STORAGE_MANAGE_DIR, ZFS) + CALICO, NODE_STORAGE_MANAGE_DIR, ZFS, WITH_TESTING) from .system_settings.models import SystemSettings +from .updates.helpers import get_maintenance from .users.models import SessionData from .utils import ( update_dict, get_api_url, send_event, send_event_to_role, send_logs, - k8s_json_object_hook, get_timezone, NODE_STATUSES, POD_STATUSES + k8s_json_object_hook, get_timezone, NODE_STATUSES, POD_STATUSES, ) @@ -125,43 +128,18 @@ def delete_service_nodelay(item, namespace=None): return r.json() -def remove_node_by_host(host): - r = requests.delete(get_api_url('nodes', host, namespace=False)) - return r.json() - - -def add_node_to_k8s(host, kube_type, is_ceph_installed=False): - """ - :param host: Node hostname - :param kube_type: Kuberdock kube type (integer id) - :return: Error text if error else False - """ - # TODO handle connection errors except requests.RequestException - data = { - 'metadata': { - 'name': host, - 'labels': { - 'kuberdock-node-hostname': host, - 'kuberdock-kube-type': 'type_' + str(kube_type) - }, - 'annotations': { - K8SNode.FREE_PUBLIC_IP_COUNTER_FIELD: '0' - } - }, - 'spec': { - 'externalID': host, - } - } - if is_ceph_installed: - data['metadata']['labels'][NODE_CEPH_AWARE_KUBERDOCK_LABEL] = 'True' - res = requests.post(get_api_url('nodes', namespace=False), - json=data) - return res.text if not res.ok else False +def deploy_node(fast, node_id, log_pod_ip): + if fast: + return node_utils_aws.deploy_node(node_id, log_pod_ip) + else: + return add_new_node(node_id, log_pod_ip, with_testing=WITH_TESTING, + skip_storate=True) @celery.task(base=AddNodeTask) def add_new_node(node_id, log_pod_ip, with_testing=False, redeploy=False, - ls_devices=None, ebs_volume=None, deploy_options=None): + ls_devices=None, ebs_volume=None, deploy_options=None, + skip_storate=False): db_node 
= Node.get_by_id(node_id) admin_rid = Role.query.filter_by(rolename="Admin").one().id channels = [i.id for i in SessionData.query.filter_by(role_id=admin_rid)] @@ -194,7 +172,7 @@ def add_new_node(node_id, log_pod_ip, with_testing=False, redeploy=False, send_logs(node_id, 'Redeploy.', log_file, channels) send_logs(node_id, 'Remove node {0} from kubernetes...'.format( host), log_file, channels) - result = remove_node_by_host(host) + result = remove_node_from_k8s(host) send_logs(node_id, json.dumps(result, indent=2), log_file, channels) @@ -219,6 +197,7 @@ def add_new_node(node_id, log_pod_ip, with_testing=False, redeploy=False, sftp.put('fslimit.py', '/fslimit.py') sftp.put('make_elastic_config.py', '/make_elastic_config.py') sftp.put('node_install.sh', '/node_install.sh') + sftp.put('node_install_common.sh', '/node_install_common.sh') if not CALICO: sftp.put('node_network_plugin.sh', '/node_network_plugin.sh') sftp.put('node_network_plugin.py', '/node_network_plugin.py') @@ -345,6 +324,8 @@ def add_new_node(node_id, log_pod_ip, with_testing=False, redeploy=False, ssh.exec_command('rm /node_install.sh', timeout=NODE_SSH_COMMAND_SHORT_EXEC_TIMEOUT) + ssh.exec_command('rm /node_install_common.sh', + timeout=NODE_SSH_COMMAND_SHORT_EXEC_TIMEOUT) if CEPH: NodeFlag.save_flag( @@ -363,10 +344,14 @@ def add_new_node(node_id, log_pod_ip, with_testing=False, redeploy=False, send_logs(node_id, err_message, log_file, channels) raise NodeInstallException(err_message) else: - send_logs(node_id, 'Setup persistent storage...', log_file, - channels) - setup_node_storage(ssh, node_id, ls_devices, ebs_volume, - channels) + if skip_storate: + send_logs(node_id, 'Skip storage setup, node is reserving...', + log_file, channels) + else: + send_logs(node_id, 'Setup persistent storage...', log_file, + channels) + setup_node_storage(ssh, node_id, ls_devices, ebs_volume, + channels) complete_calico_node_config(host, db_node.ip) @@ -547,23 +532,37 @@ def clean_drives_for_deleted_users(): @celery.task(ignore_result=True) def check_if_node_down(hostname): - # In some cases kubelet doesn't post it's status, and restart may help - # to make it alive. It's a workaround for kubelet bug. - # TODO: research the bug and remove the workaround + """ In some cases kubelet doesn't post it's status, and restart may help + to make it alive. It's a workaround for kubelet bug. 
+ TODO: research the bug and remove the workaround + """ + # Do not try to recover if maintenance mode is on + if get_maintenance(): + current_app.warning( + "Skip recover kubelet on '{}', because of maintenance mode is on" + .format(hostname) + ) + return + node_is_failed = False ssh, error_message = ssh_connect(hostname, timeout=3) if error_message: current_app.logger.debug( 'Failed connect to node %s: %s', hostname, error_message ) - return - i, o, e = ssh.exec_command('systemctl restart kubelet') - exit_status = o.channel.recv_exit_status() - if exit_status != 0: - current_app.logger.debug( - 'Failed to restart kubelet on node: %s, exit status: %s', - hostname, exit_status - ) + node_is_failed = True + ssh = None + else: + i, o, e = ssh.exec_command('systemctl restart kubelet') + exit_status = o.channel.recv_exit_status() + if exit_status != 0: + node_is_failed = True + current_app.logger.debug( + 'Failed to restart kubelet on node: %s, exit status: %s', + hostname, exit_status + ) + if node_is_failed: + migrate_pod_utils.manage_failed_node(hostname, ssh, deploy_node) @celery.task() diff --git a/kubedock/updates/health_check.py b/kubedock/updates/health_check.py index a7edbe59c..ffa450663 100644 --- a/kubedock/updates/health_check.py +++ b/kubedock/updates/health_check.py @@ -39,7 +39,7 @@ from kubedock.users.models import User from kubedock.kapi.node_utils import get_nodes_collection from kubedock.kapi.podcollection import PodCollection -from kubedock.kapi.nodes import get_kuberdock_logs_pod_name +from kubedock.kapi.service_pods import get_kuberdock_logs_pod_name from kubedock.kapi.helpers import LocalService from kubedock.kapi import pstorage from kubedock.pods.models import Pod diff --git a/node_install.sh b/node_install.sh index 5d4964e53..f566bcb9d 100644 --- a/node_install.sh +++ b/node_install.sh @@ -24,34 +24,15 @@ # IMPORTANT: each package must be installed with separate command because of # yum incorrect error handling! -# ========================= DEFINED VARS =============================== -AWS=${AWS} -KUBERNETES_CONF_DIR='/etc/kubernetes' -EXIT_MESSAGE="Installation error." 
-KUBE_REPO='/etc/yum.repos.d/kube-cloudlinux.repo' -KUBE_TEST_REPO='/etc/yum.repos.d/kube-cloudlinux-testing.repo' -PLUGIN_DIR_BASE='/usr/libexec/kubernetes' -KD_WATCHER_SERVICE='/etc/systemd/system/kuberdock-watcher.service' -KD_KERNEL_VARS='/etc/sysctl.d/75-kuberdock.conf' -KD_RSYSLOG_CONF='/etc/rsyslog.d/kuberdock.conf' -KD_ELASTIC_LOGS='/var/lib/elasticsearch' -FSTAB_BACKUP="/var/lib/kuberdock/backups/fstab.pre-swapoff" -CEPH_VERSION=hammer -CEPH_BASE='/etc/yum.repos.d/ceph-base' -CEPH_REPO='/etc/yum.repos.d/ceph.repo' - -KD_SSH_GC_PATH="/var/lib/kuberdock/scripts/kd-ssh-gc" -KD_SSH_GC_LOCK="/var/run/kuberdock-ssh-gc.lock" -KD_SSH_GC_CMD="flock -n $KD_SSH_GC_LOCK -c '$KD_SSH_GC_PATH;rm $KD_SSH_GC_LOCK'" -KD_SSH_GC_CRON="@hourly $KD_SSH_GC_CMD >/dev/null 2>&1" - -ZFS_MODULES_LOAD_CONF="/etc/modules-load.d/kuberdock-zfs.conf" -ZFS_POOLS_LOAD_CONF="/etc/modprobe.d/kuberdock-zfs.conf" - -NODE_STORAGE_MANAGE_DIR=node_storage_manage -# ======================= // DEFINED VARS =============================== +source "$(dirname "${BASH_SOURCE}")/node_install_common.sh" +ami() +{ + [ -n "${AMI}" ] +} +check_iface_and_ip() +{ # check public interface and node ip if [ -z "$NODE_IP" ]; then >&2 echo "NODE_IP is not set" @@ -76,11 +57,13 @@ else fi fi # // check public interface and node ip +} +ami || check_iface_and_ip echo "Set locale to en_US.UTF-8" export LANG=en_US.UTF-8 -echo "Using MASTER_IP=${MASTER_IP}" +ami || echo "Using MASTER_IP=${MASTER_IP}" if [ "$ZFS" = yes ]; then echo "Using ZFS as storage backend" elif [ ! -z "$CEPH_CONF" ]; then @@ -88,8 +71,7 @@ elif [ ! -z "$CEPH_CONF" ]; then else echo "Using LVM as storage backend" fi -echo "Set time zone to $TZ" -timedatectl set-timezone "$TZ" +ami || set_timezone echo "Deploy started: $(date)" # This should be as early as possible because outdated metadata (one that was @@ -100,16 +82,6 @@ yum clean metadata # ======================== JUST COMMON HELPERS ================================ # SHOULD BE DEFINED FIRST, AND BEFORE USE # Most common helpers are defined first. -check_status() -{ - local temp=$? - if [ $temp -ne 0 ];then - echo $EXIT_MESSAGE - exit $temp - fi -} - - yum_wrapper() { if [ -z "$WITH_TESTING" ];then @@ -363,7 +335,7 @@ check_xfs() } check_release -check_mem +ami || check_mem check_selinux check_xfs "/var/lib/docker/overlay" @@ -378,7 +350,7 @@ if [[ $ERRORS ]]; then exit 3 fi -check_disk +ami || check_disk if [[ $ERRORS ]]; then printf "Following noncompliances of KD cluster requirements have been detected:\n" @@ -387,7 +359,7 @@ if [[ $ERRORS ]]; then exit 3 fi -check_disk_for_production +ami || check_disk_for_production if [[ $WARNS ]]; then printf "Warning:\n" @@ -411,11 +383,6 @@ setup_ntpd () local ntp_config="/etc/ntp.conf" - # Backup ntp.conf before any modifications - backup_ntp_config="${ntp_config}.kd.backup.$(date --iso-8601=ns --utc)" - echo "Save current $ntp_config to $backup_ntp_config" - cp "$ntp_config" "$backup_ntp_config" - _sync_time() { grep '^server' "$ntp_config" | awk '{print $2}' | xargs ntpdate -u } @@ -431,11 +398,7 @@ setup_ntpd () _sync_time check_status - sed -i "/^server /d; /^tinker /d" "$ntp_config" - # NTP on master server should work at least a few minutes before ntp - # clients start trusting him. 
Thus we postpone the sync with it - echo "server ${MASTER_IP} iburst minpoll 3 maxpoll 4" >> "$ntp_config" - echo "tinker panic 0" >> "$ntp_config" + ami || configure_ntpd systemctl daemon-reload systemctl restart ntpd @@ -667,9 +630,10 @@ fi # 4 copy kubelet auth token and etcd certs +copy_kubelet_auth_and_etcd_certs() +{ echo "Copy certificates and tokens..." mv /configfile $KUBERNETES_CONF_DIR/configfile -mkdir -p /etc/pki/etcd check_status mv /ca.crt /etc/pki/etcd/ mv /etcd-client.crt /etc/pki/etcd/ @@ -677,6 +641,10 @@ mv /etcd-client.key /etc/pki/etcd/ mv /etcd-dns.crt /etc/pki/etcd/ mv /etcd-dns.key /etc/pki/etcd/ check_status +} + +mkdir -p /etc/pki/etcd +ami || copy_kubelet_auth_and_etcd_certs # 4.1 create and populate scripts directory # TODO refactor this staff to kdnode package or copy folder ones @@ -734,72 +702,17 @@ setup_cron mv /${NODE_STORAGE_MANAGE_DIR} /var/lib/kuberdock/scripts/ - -if [ "$FIXED_IP_POOLS" = True ]; then - fixed_ip_pools="yes" -else - fixed_ip_pools="no" -fi -cat < "/var/lib/kuberdock/kuberdock.json" -{"fixed_ip_pools": "$fixed_ip_pools", -"master": "$MASTER_IP", -"node": "$NODENAME", -"network_interface": "$PUBLIC_INTERFACE", -"token": "$TOKEN"} -EOF +ami || kuberdock_json # 5. configure Node config -echo "Configuring kubernetes..." -sed -i "/^KUBE_MASTER/ {s|http://127.0.0.1:8080|https://${MASTER_IP}:6443|}" $KUBERNETES_CONF_DIR/config -sed -i '/^KUBELET_HOSTNAME/s/^/#/' $KUBERNETES_CONF_DIR/kubelet - -# Kubelet's 10255 port (built-in cadvisor) should be accessible from master, -# because heapster.service use it to gather data for our "usage statistics" -# feature. Master-only access is ensured by our cluster-wide firewall -sed -i "/^KUBELET_ADDRESS/ {s|127.0.0.1|0.0.0.0|}" $KUBERNETES_CONF_DIR/kubelet -check_status - -sed -i "/^KUBELET_API_SERVER/ {s|http://127.0.0.1:8080|https://${MASTER_IP}:6443|}" $KUBERNETES_CONF_DIR/kubelet -if [ "$AWS" = True ];then - sed -i '/^KUBELET_ARGS/ {s|""|"--cloud-provider=aws --kubeconfig=/etc/kubernetes/configfile --cluster_dns=10.254.0.10 --cluster_domain=kuberdock --register-node=false --network-plugin=kuberdock --maximum-dead-containers=1 --maximum-dead-containers-per-container=1 --minimum-container-ttl-duration=10s --cpu-cfs-quota=true --cpu-multiplier='${CPU_MULTIPLIER}' --memory-multiplier='${MEMORY_MULTIPLIER}' --node-ip='${NODE_IP}'"|}' $KUBERNETES_CONF_DIR/kubelet -else - sed -i '/^KUBELET_ARGS/ {s|""|"--kubeconfig=/etc/kubernetes/configfile --cluster_dns=10.254.0.10 --cluster_domain=kuberdock --register-node=false --network-plugin=kuberdock --maximum-dead-containers=1 --maximum-dead-containers-per-container=1 --minimum-container-ttl-duration=10s --cpu-cfs-quota=true --cpu-multiplier='${CPU_MULTIPLIER}' --memory-multiplier='${MEMORY_MULTIPLIER}' --node-ip='${NODE_IP}'"|}' $KUBERNETES_CONF_DIR/kubelet -fi -sed -i '/^KUBE_PROXY_ARGS/ {s|""|"--kubeconfig=/etc/kubernetes/configfile --proxy-mode iptables"|}' $KUBERNETES_CONF_DIR/proxy -check_status - +ami || configure_kubelet # 6a. configure Calico CNI plugin echo "Enabling Calico CNI plugin ..." 
yum_wrapper -y install calico-cni-1.3.1-3.el7 -echo >> $KUBERNETES_CONF_DIR/config -echo "# Calico etcd authority" >> $KUBERNETES_CONF_DIR/config -echo ETCD_AUTHORITY="$MASTER_IP:2379" >> $KUBERNETES_CONF_DIR/config -echo ETCD_SCHEME="https" >> $KUBERNETES_CONF_DIR/config -echo ETCD_CA_CERT_FILE="/etc/pki/etcd/ca.crt" >> $KUBERNETES_CONF_DIR/config -echo ETCD_CERT_FILE="/etc/pki/etcd/etcd-client.crt" >> $KUBERNETES_CONF_DIR/config -echo ETCD_KEY_FILE="/etc/pki/etcd/etcd-client.key" >> $KUBERNETES_CONF_DIR/config - -TOKEN=$(grep token /etc/kubernetes/configfile | grep -oP '[a-zA-Z0-9]+$') - -mkdir -p /etc/cni/net.d -cat > /etc/cni/net.d/10-calico.conf << EOF -{ - "name": "calico-k8s-network", - "type": "calico", - "log_level": "info", - "ipam": { - "type": "calico-ipam" - }, - "policy": { - "type": "k8s", - "k8s_api_root": "https://$MASTER_IP:6443/api/v1/", - "k8s_auth_token": "$TOKEN" - } -} -EOF +ami || configure_cni pushd /var/lib/kuberdock/scripts python kubelet_args.py --network-plugin= @@ -829,13 +742,7 @@ EOF # 8. setup rsyslog forwarding -echo "Reconfiguring rsyslog..." -cat > $KD_RSYSLOG_CONF << EOF -\$LocalHostName $NODENAME -\$template LongTagForwardFormat,"<%PRI%>%TIMESTAMP:::date-rfc3339% %HOSTNAME% %syslogtag%%msg:::sp-if-no-1st-sp%%msg%" -*.* @${LOG_POD_IP}:5140;LongTagForwardFormat -EOF - +ami || configure_rsyslog echo 'Configuring docker...' @@ -917,7 +824,7 @@ prjquota_enable "/var/lib/kuberdock/storage" # 10. enable services echo "Enabling services..." systemctl daemon-reload -systemctl reenable kubelet +ami || systemctl reenable kubelet check_status mkdir -p /etc/systemd/system/kube-proxy.service.d @@ -929,7 +836,7 @@ Restart=always RestartSec=5s" > /etc/systemd/system/kube-proxy.service.d/restart.conf systemctl daemon-reload -systemctl reenable kube-proxy +ami || systemctl reenable kube-proxy check_status # 11. disable swap for best performance @@ -1039,9 +946,7 @@ docker pull "$CALICO_NODE_IMAGE" > /dev/null time sync #sleep 10 # even harder workaround -echo "Starting Calico node..." -ETCD_AUTHORITY="$MASTER_IP:2379" ETCD_SCHEME=https ETCD_CA_CERT_FILE=/etc/pki/etcd/ca.crt ETCD_CERT_FILE=/etc/pki/etcd/etcd-client.crt ETCD_KEY_FILE=/etc/pki/etcd/etcd-client.key HOSTNAME="$NODENAME" /opt/bin/calicoctl node --ip="$NODE_IP" --node-image="$CALICO_NODE_IMAGE" -check_status +ami || start_calico_node # 16. Reboot will be executed in python function echo "Node deploy script finished: $(date)" diff --git a/node_install_ami.sh b/node_install_ami.sh new file mode 100644 index 000000000..d8c2c15f0 --- /dev/null +++ b/node_install_ami.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +source "$(dirname "${BASH_SOURCE}")/node_install_common.sh" + +set_timezone +configure_ntpd +configure_rsyslog +configure_cni +configure_kubelet +kuberdock_json + +for service in ntpd rsyslog kubelet kube-proxy +do + systemctl restart ${service} +done + +start_calico_node + +rm -f /node_install_ami.sh +rm -f /node_install_common.sh diff --git a/node_install_common.sh b/node_install_common.sh new file mode 100644 index 000000000..9583d6239 --- /dev/null +++ b/node_install_common.sh @@ -0,0 +1,153 @@ +#!/usr/bin/env bash +# ========================= DEFINED VARS =============================== +export AWS=${AWS} +export KUBERNETES_CONF_DIR='/etc/kubernetes' +export EXIT_MESSAGE="Installation error." 
+export KUBE_REPO='/etc/yum.repos.d/kube-cloudlinux.repo' +export KUBE_TEST_REPO='/etc/yum.repos.d/kube-cloudlinux-testing.repo' +export PLUGIN_DIR_BASE='/usr/libexec/kubernetes' +export KD_WATCHER_SERVICE='/etc/systemd/system/kuberdock-watcher.service' +export KD_KERNEL_VARS='/etc/sysctl.d/75-kuberdock.conf' +export KD_RSYSLOG_CONF='/etc/rsyslog.d/kuberdock.conf' +export KD_ELASTIC_LOGS='/var/lib/elasticsearch' +export FSTAB_BACKUP="/var/lib/kuberdock/backups/fstab.pre-swapoff" +export CEPH_VERSION=hammer +export CEPH_BASE='/etc/yum.repos.d/ceph-base' +export CEPH_REPO='/etc/yum.repos.d/ceph.repo' + +export KD_SSH_GC_PATH="/var/lib/kuberdock/scripts/kd-ssh-gc" +export KD_SSH_GC_LOCK="/var/run/kuberdock-ssh-gc.lock" +export KD_SSH_GC_CMD="flock -n ${KD_SSH_GC_LOCK} -c '${KD_SSH_GC_PATH}; rm ${KD_SSH_GC_LOCK}'" +export KD_SSH_GC_CRON="@hourly ${KD_SSH_GC_CMD} > /dev/null 2>&1" + +export ZFS_MODULES_LOAD_CONF="/etc/modules-load.d/kuberdock-zfs.conf" +export ZFS_POOLS_LOAD_CONF="/etc/modprobe.d/kuberdock-zfs.conf" + +export NODE_STORAGE_MANAGE_DIR=node_storage_manage +# ======================= // DEFINED VARS =============================== + + +check_status() +{ + local temp=$? + if [ "${temp}" -ne 0 ];then + echo "${EXIT_MESSAGE}" + exit "${temp}" + fi +} + + +configure_ntpd() +{ + local ntp_config="/etc/ntp.conf" + + # Backup ntp.conf before any modifications + backup_ntp_config="${ntp_config}.kd.backup.$(date --iso-8601=ns --utc)" + echo "Save current ${ntp_config} to ${backup_ntp_config}" + cp "${ntp_config}" "${backup_ntp_config}" + + sed -i "/^server /d; /^tinker /d" "${ntp_config}" + # NTP on master server should work at least a few minutes before ntp + # clients start trusting him. Thus we postpone the sync with it + echo "server ${MASTER_IP} iburst minpoll 3 maxpoll 4" >> "${ntp_config}" + echo "tinker panic 0" >> "${ntp_config}" +} + + +kuberdock_json() +{ +if [ "${FIXED_IP_POOLS}" = True ]; then + fixed_ippools="yes" +else + fixed_ippools="no" +fi +cat << EOF > "/var/lib/kuberdock/kuberdock.json" +{"fixed_ip_pools": "${fixed_ippools}", +"master": "${MASTER_IP}", +"node": "${NODENAME}", +"network_interface": "${PUBLIC_INTERFACE}", +"token": "${TOKEN}"} +EOF +} + + +configure_kubelet() +{ +echo "Configuring kubernetes..." +sed -i "/^KUBE_MASTER/ {s|http://127.0.0.1:8080|https://${MASTER_IP}:6443|}" "${KUBERNETES_CONF_DIR}"/config +sed -i '/^KUBELET_HOSTNAME/s/^/#/' "${KUBERNETES_CONF_DIR}"/kubelet + +# Kubelet's 10255 port (built-in cadvisor) should be accessible from master, +# because heapster.service use it to gather data for our "usage statistics" +# feature. 
Master-only access is ensured by our cluster-wide firewall +sed -i "/^KUBELET_ADDRESS/ {s|127.0.0.1|0.0.0.0|}" "${KUBERNETES_CONF_DIR}"/kubelet +check_status + +sed -i "/^KUBELET_API_SERVER/ {s|http://127.0.0.1:8080|https://${MASTER_IP}:6443|}" "${KUBERNETES_CONF_DIR}"/kubelet +if [ "${AWS}" = True ];then + sed -i '/^KUBELET_ARGS/ {s|""|"--cloud-provider=aws --kubeconfig=/etc/kubernetes/configfile --cluster_dns=10.254.0.10 --cluster_domain=kuberdock --register-node=false --network-plugin=kuberdock --maximum-dead-containers=1 --maximum-dead-containers-per-container=1 --minimum-container-ttl-duration=10s --cpu-cfs-quota=true --cpu-multiplier='"${CPU_MULTIPLIER}"' --memory-multiplier='"${MEMORY_MULTIPLIER}"' --node-ip='"${NODE_IP}"'"|}' "${KUBERNETES_CONF_DIR}"/kubelet +else + sed -i '/^KUBELET_ARGS/ {s|""|"--kubeconfig=/etc/kubernetes/configfile --cluster_dns=10.254.0.10 --cluster_domain=kuberdock --register-node=false --network-plugin=kuberdock --maximum-dead-containers=1 --maximum-dead-containers-per-container=1 --minimum-container-ttl-duration=10s --cpu-cfs-quota=true --cpu-multiplier='"${CPU_MULTIPLIER}"' --memory-multiplier='"${MEMORY_MULTIPLIER}"' --node-ip='"${NODE_IP}"'"|}' "${KUBERNETES_CONF_DIR}"/kubelet +fi +sed -i '/^KUBE_PROXY_ARGS/ {s|""|"--kubeconfig=/etc/kubernetes/configfile --proxy-mode iptables"|}' "$KUBERNETES_CONF_DIR"/proxy +check_status +} + + +configure_cni() +{ +{ +echo +echo "# Calico etcd authority" +echo ETCD_AUTHORITY="${MASTER_IP}:2379" +echo ETCD_SCHEME="https" +echo ETCD_CA_CERT_FILE="/etc/pki/etcd/ca.crt" +echo ETCD_CERT_FILE="/etc/pki/etcd/etcd-client.crt" +echo ETCD_KEY_FILE="/etc/pki/etcd/etcd-client.key" +} >> "${KUBERNETES_CONF_DIR}"/config + +K8S_TOKEN=$(grep token /etc/kubernetes/configfile | grep -oP '[a-zA-Z0-9]+$') + +mkdir -p /etc/cni/net.d +cat > /etc/cni/net.d/10-calico.conf << EOF +{ + "name": "calico-k8s-network", + "type": "calico", + "log_level": "info", + "ipam": { + "type": "calico-ipam" + }, + "policy": { + "type": "k8s", + "k8s_api_root": "https://${MASTER_IP}:6443/api/v1/", + "k8s_auth_token": "${K8S_TOKEN}" + } +} +EOF +} + + +configure_rsyslog() +{ +echo "Reconfiguring rsyslog..." +cat > "${KD_RSYSLOG_CONF}" << EOF +\$LocalHostName ${NODENAME} +\$template LongTagForwardFormat,"<%PRI%>%TIMESTAMP:::date-rfc3339% %HOSTNAME% %syslogtag%%msg:::sp-if-no-1st-sp%%msg%" +*.* @${LOG_POD_IP}:5140;LongTagForwardFormat +EOF +} + + +set_timezone() +{ +echo Set time zone to "${TZ}" +timedatectl set-timezone "${TZ}" +} + + +start_calico_node() +{ +echo "Starting Calico node..." 
+ETCD_AUTHORITY="${MASTER_IP}:2379" ETCD_SCHEME=https ETCD_CA_CERT_FILE=/etc/pki/etcd/ca.crt ETCD_CERT_FILE=/etc/pki/etcd/etcd-client.crt ETCD_KEY_FILE=/etc/pki/etcd/etcd-client.key HOSTNAME="${NODENAME}" /opt/bin/calicoctl node --ip="${NODE_IP}" --node-image="${CALICO_NODE_IMAGE}" +check_status +} diff --git a/node_prepare_ami.sh b/node_prepare_ami.sh new file mode 100644 index 000000000..fe02cfc1f --- /dev/null +++ b/node_prepare_ami.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +for image in \ + gcr.io/google_containers/etcd-amd64:2.2.1 \ + gcr.io/google_containers/exechealthz:1.0 \ + gcr.io/google_containers/skydns:2015-10-13-8c72f8c \ + gcr.io/google_containers/pause:2.0 \ + jetstack/kube-lego:0.1.3 \ + kuberdock/calico-node:0.22.0-kd2 \ + kuberdock/defaultbackend:0.0.1 \ + kuberdock/elasticsearch:2.2 \ + kuberdock/fluentd:1.8 \ + kuberdock/k8s-policy-agent:v0.1.4-kd2 \ + kuberdock/kube2sky:1.2 \ + kuberdock/nginx-ingress-controller:0.2.0 \ + ; do + docker pull ${image} +done + +systemctl disable rsyslog +systemctl stop rsyslog + +rm -f /node_install.sh + +yum clean packages diff --git a/node_storage_manage/aws.py b/node_storage_manage/aws.py index 5b8305290..766c6f131 100644 --- a/node_storage_manage/aws.py +++ b/node_storage_manage/aws.py @@ -39,6 +39,7 @@ def get_aws_instance_meta(utils): + """Returns some metadata for the instance.""" identity = utils.get_instance_identity() meta = utils.get_instance_metadata() return { @@ -49,6 +50,14 @@ def get_aws_instance_meta(utils): def get_aws_block_device_mapping(connection, instance_id): + """Returns dict of { + : + } + where device name is string in form of '/dev/xvdX' + block device type is an object of class + boto.ec2.blockdevicemapping.BlockDeviceType + (we need only volume_id from it) + """ return connection.get_instance_attribute( instance_id=instance_id, attribute='blockDeviceMapping' @@ -56,6 +65,11 @@ def get_aws_block_device_mapping(connection, instance_id): def detach_ebs(aws_access_key_id, aws_secret_access_key, devices): + """Detaches volumes for given devices if they are attached to the instance. + Will wait some time until all detached volumes become 'detached'. + Returns true if all devices was successfully detached. + False - otherwise. + """ import boto import boto.ec2 meta = get_aws_instance_meta(boto.utils) @@ -67,9 +81,99 @@ def detach_ebs(aws_access_key_id, aws_secret_access_key, devices): bd_mapping = get_aws_block_device_mapping( connection, meta['instance-id'] ) - for key, bd in bd_mapping.iteritems(): - if key in devices: - connection.detach_volume(bd.volume_id) + volume_ids = [bd.volume_id for key, bd in bd_mapping.iteritems() + if key in devices] + return _detach_ebs_volumes(connection, volume_ids, force=False) + + +def _detach_ebs_volumes(connection, volume_ids, force=False): + for volume_id in volume_ids: + connection.detach_volume(volume_id, force=force) + if not wait_for_detached_state(connection, volume_ids): + return False + return True + + +def _volumes_are_available(connection, volume_ids): + volumes = connection.get_all_volumes(volume_ids) + # In AWS documentation there are the following states: + # (attaching | attached | detaching | detached) + # But actually boto returns None for detached volume, so check it + # both here. + return all( + ( + item.attachment_state() == 'detached' or + item.attachment_state() is None + ) and + item.status == 'available' for item in volumes + ) + + +def wait_for_detached_state(connection, volume_ids): + """Will wait until volumes with given ids become detached. 
+ It will check volumes states 180 times with 1 second pause, so max + wait time will be approximately 3 minutes. + Returns True if all volumes become detached. False if one or more volumes + will not become detached. + """ + retry_count = 60 * 3 + pause = 1 + for _ in xrange(retry_count): + if _volumes_are_available(connection, volume_ids): + return True + time.sleep(pause) + return False + + +def attach_existing_volumes(call_args): + """Attaches volumes to the instance. Volumes names must be passed via + call_args.ebs_volumes list. + Returns tuple of success flag and list of dicts described attached volumes. + Each element of this list will include fields: + 'name': , + 'instance_id': , + 'device': + """ + import boto + import boto.ec2 + meta = get_aws_instance_meta(boto.utils) + region = meta['region'] + instance_id = meta['instance-id'] + connection = boto.ec2.connect_to_region( + region, + aws_access_key_id=call_args.aws_access_key_id, + aws_secret_access_key=call_args.aws_secret_access_key + ) + names_to_attach = call_args.ebs_volumes + force_detach = call_args.force_detach + + existing_volumes = connection.get_all_volumes() + volumes = [] + for item in existing_volumes: + if item.tags.get('Name', 'Nameless') in names_to_attach: + volumes.append(item) + break + if len(volumes) != len(names_to_attach): + return False, 'Not all EBS volumes were found' + + volume_ids = [vol.id for vol in volumes] + if not _volumes_are_available(connection, volume_ids): + if force_detach: + _detach_ebs_volumes(connection, volume_ids, force=True) + # update volumes, because it's state may be changed + volumes = connection.get_all_volumes(volume_ids) + + dev_list = [] + for volume in volumes: + try: + ok, result = attach_ebs_volume(connection, instance_id, volume) + if ok != OK: + return False, result + dev_list.append(result) + except (boto.exception.BotoClientError, + boto.exception.BotoServerError) as err: + return False, 'Failed to attach volume: {}'.format(err) + return True, dev_list def do_ebs_attach(call_args): diff --git a/node_storage_manage/manage.py b/node_storage_manage/manage.py index f42328d6d..8d4df6d21 100644 --- a/node_storage_manage/manage.py +++ b/node_storage_manage/manage.py @@ -98,7 +98,8 @@ from .storage import ( do_get_info, do_add_volume, do_remove_storage, VOLUME_MANAGE_NAME, - do_create_volume, do_remove_volume, do_resize_volume + do_create_volume, do_remove_volume, do_resize_volume, + do_export_storage, do_import_storage, ) @@ -113,6 +114,28 @@ def _do_remove_storage(call_args): return ERROR, {'message': u'Remove storage failed: {}'.format(result)} # we expect device list in result in case of successful storage removing devices = result + if result and call_args.detach_ebs: + if not aws.detach_ebs(call_args.aws_access_key_id, + call_args.aws_secret_access_key, + devices): + return ERROR, {'message': 'Failed to detach EBS volumes'} + return OK, { + 'message': 'Localstorage ({}) has been deleted' + .format(VOLUME_MANAGE_NAME) + } + + +def _do_export_storage(call_args): + """Prepare storage to be exported to another host. + Now supported only by ZFS storage backend. + TODO: implement for LVM. 
+ """ + ok, result = do_export_storage(call_args) + if not ok: + return ERROR, { + 'message': u'Export storage failed: {}'.format(result)} + # we expect device list in result in case of successful storage export + devices = result if result and call_args.detach_ebs: aws.detach_ebs( call_args.aws_access_key_id, @@ -120,10 +143,28 @@ def _do_remove_storage(call_args): devices ) return OK, { - 'message': 'Localstorage ({}) has been deleted' + 'message': 'Localstorage ({}) has been exported' .format(VOLUME_MANAGE_NAME) } + +def _do_import_aws_storage(call_args): + """Imports storage on AWS. + """ + ok, result = aws.attach_existing_volumes(call_args) + if not ok: + return ERROR, { + 'message': u'Failed to attach EBS volumes: {}'.format(result) + } + ok, import_result = do_import_storage(result) + if not ok: + return ERROR, { + 'message': u'Failed to import storage: {}'.format( + import_result) + } + return OK, result + + def _do_resize_volume(call_args): """Calls do_resize_volume of current storage. Before calling the method of current storage checks if path exists, and @@ -146,9 +187,11 @@ def _do_resize_volume(call_args): 'add-volume': do_add_volume, 'get-info': do_get_info, 'remove-storage': _do_remove_storage, + 'export-storage': _do_export_storage, 'create-volume': do_create_volume, 'remove-volume': do_remove_volume, 'resize-volume': _do_resize_volume, + 'import-aws-storage': _do_import_aws_storage, } @@ -232,6 +275,27 @@ def process_args(): dest='aws_secret_access_key', required=False, help='AWS secret access key' ) + + export_storage = subparsers.add_parser( + 'export-storage', + help='Unmount and export localstorage volume group from node.' + ) + export_storage.add_argument( + '--detach-ebs', dest='detach_ebs', + default=False, action='store_true', + help='Detach EBS volumes that were in LS volume group' + ) + export_storage.add_argument( + '--aws-access-key-id', + dest='aws_access_key_id', required=False, + help='AWS access key ID' + ) + export_storage.add_argument( + '--aws-secret-access-key', + dest='aws_secret_access_key', required=False, + help='AWS secret access key' + ) + create_volume = subparsers.add_parser( 'create-volume', help='Create persistent volume with current localstorage backend' @@ -264,6 +328,37 @@ def process_args(): help='New size quota (GB) of the volume' ) + import_aws_storage = subparsers.add_parser( + 'import-aws-storage', + help='Imports storage to the AWS node' + ) + import_aws_storage.add_argument( + '--aws-access-key-id', + dest='aws_access_key_id', + required=True, + help='AWS access key ID' + ) + import_aws_storage.add_argument( + '--aws-secret-access-key', + dest='aws_secret_access_key', + required=True, + help='AWS secret access key' + ) + import_aws_storage.add_argument( + '--force-detach', + dest='force_detach', + required=False, default=False, + help='Try to force detach volumes before attaching', + action='store_true' + ) + import_aws_storage.add_argument( + '--ebs-volumes', + dest='ebs_volumes', + nargs='+', + required=True, + help='List of EBS volume names which contains importing storage' + ) + return parser.parse_args() diff --git a/node_storage_manage/node_lvm_manage.py b/node_storage_manage/node_lvm_manage.py index f515d791c..3c5e88008 100644 --- a/node_storage_manage/node_lvm_manage.py +++ b/node_storage_manage/node_lvm_manage.py @@ -108,6 +108,20 @@ def do_remove_storage(_): vg.close() +def do_export_storage(_): + """Prepares storage to be used in another host. + Not implemented for LVM. 
+ """ + return False, u'The operation is not implemented for LVM' + + +def do_import_storage(_): + """Prepares imports a storage detached from another node. + Not implemented for LVM. + """ + return False, u'The operation is not implemented for LVM' + + def remove_ls_mount(): save_file = '/etc/fstab.kdsave' fstab = '/etc/fstab' diff --git a/node_storage_manage/node_zfs_manage.py b/node_storage_manage/node_zfs_manage.py index 341f63db3..d1210e746 100644 --- a/node_storage_manage/node_zfs_manage.py +++ b/node_storage_manage/node_zfs_manage.py @@ -172,10 +172,64 @@ def do_remove_storage(_): return readable statuses of performed operation. """ + return _perform_zpool_stop_operation('destroy') + + +def do_export_storage(_): + """Prepares storage to be used in another host. + Runs 'zpool export' command. + Returns success flag and list of devices used in zpool, or error message + in case of an error. + """ + return _perform_zpool_stop_operation('export') + + +def do_import_storage(_): + """Prepares imports a storage detached from another node. + Executes 'zpool import' operation. + """ + try: + all_names = _list_zpools() + except: + return False, ('Unable to list ZFS pools. Maybe ZFS is not properly ' + 'installed yet, ' + 'skip this if this is during node cleanup process') + if KD_ZPOOL_NAME in all_names: + return False, 'Zpool {} already exists.'.format(KD_ZPOOL_NAME) + try: + silent_call(['zpool', 'import', '-f', KD_ZPOOL_NAME]) + except: + return False, 'Failed to import zpool' + + try: + silent_call(['zfs', 'mount', '-a']) + except: + return False, 'Failed to mount zfs volumes' + + try: + devices = get_device_list(KD_ZPOOL_NAME) + except: + return ( + False, + 'Failed to get device list in zpool "{}"'.format(KD_ZPOOL_NAME) + ) + return True, devices + + +def _perform_zpool_stop_operation(operation): + """Performs one of operations on existing zpool: + ('destroy', 'export'). Returns list of device names which was used by the + zpool. + """ + allowed_operations = ('destroy', 'export') + if operation not in allowed_operations: + return False, u'Invalid operation name: {}'.format(operation) + try: all_names = _list_zpools() except Exception: - return False, ('Unable to list ZFS pools. Maybe ZFS is not properly installed yet, ' + return False, ('Unable to list ZFS pools. Maybe ZFS is not properly ' + 'installed yet, ' 'skip this if this is during node cleanup process') if KD_ZPOOL_NAME not in all_names: return True, [] @@ -187,10 +241,10 @@ def do_remove_storage(_): 'Failed to get device list in zpool "{}"'.format(KD_ZPOOL_NAME) ) try: - silent_call(['zpool', 'destroy', '-f', KD_ZPOOL_NAME]) + silent_call(['zpool', operation, '-f', KD_ZPOOL_NAME]) return True, devices except: - return False, 'Failed to delete zpool {}'.format(KD_ZPOOL_NAME) + return False, 'Failed to {} zpool {}'.format(operation, KD_ZPOOL_NAME) def add_devices_to_localstorage(devices):