Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 16 additions & 36 deletions parts/linux/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ configureSwapFile() {
# https://learn.microsoft.com/en-us/troubleshoot/azure/virtual-machines/troubleshoot-device-names-problems#identify-disk-luns
swap_size_kb=$(expr ${SWAP_FILE_SIZE_MB} \* 1000)
swap_location=""

# Attempt to use the resource disk
if [ -L /dev/disk/azure/resource-part1 ]; then
resource_disk_path=$(findmnt -nr -o target -S $(readlink -f /dev/disk/azure/resource-part1))
Expand Down Expand Up @@ -154,7 +154,7 @@ configureCustomCaCertificate() {
chmod 755 /opt/certs
for i in $(seq 0 $((${CUSTOM_CA_TRUST_COUNT} - 1))); do
# declare dynamically and use "!" to avoid bad substitution errors
declare varname=CUSTOM_CA_CERT_${i}
declare varname=CUSTOM_CA_CERT_${i}
echo "${!varname}" | base64 -d > /opt/certs/00000000000000cert${i}.crt
done
# blocks until svc is considered active, which will happen when ExecStart command terminates with code 0
Expand Down Expand Up @@ -190,7 +190,7 @@ configureAzureJson() {
fi
SERVICE_PRINCIPAL_CLIENT_SECRET=${SERVICE_PRINCIPAL_CLIENT_SECRET//\\/\\\\}
SERVICE_PRINCIPAL_CLIENT_SECRET=${SERVICE_PRINCIPAL_CLIENT_SECRET//\"/\\\"}

cat << EOF > "${AZURE_JSON_PATH}"
{
"cloud": "${TARGET_CLOUD}",
Expand Down Expand Up @@ -331,7 +331,7 @@ ensureContainerd() {
if [ "${TELEPORT_ENABLED}" = "true" ]; then
ensureTeleportd
fi
mkdir -p "/etc/systemd/system/containerd.service.d"
mkdir -p "/etc/systemd/system/containerd.service.d"
tee "/etc/systemd/system/containerd.service.d/exec_start.conf" > /dev/null <<EOF
[Service]
ExecStartPost=/sbin/iptables -P FORWARD ACCEPT
Expand All @@ -358,7 +358,7 @@ EOF
logs_to_events "AKS.CSE.ensureContainerd.configureContainerdRegistryHost" configureContainerdRegistryHost
fi

tee "/etc/sysctl.d/99-force-bridge-forward.conf" > /dev/null <<EOF
tee "/etc/sysctl.d/99-force-bridge-forward.conf" > /dev/null <<EOF
net.ipv4.ip_forward = 1
net.ipv4.conf.all.forwarding = 1
net.ipv6.conf.all.forwarding = 1
Expand Down Expand Up @@ -407,33 +407,13 @@ ensureArtifactStreaming() {
systemctl link /opt/overlaybd/snapshotter/overlaybd-snapshotter.service
systemctlEnableAndStart overlaybd-tcmu.service 30
systemctlEnableAndStart overlaybd-snapshotter.service 30

}

# Enables and starts the acr-nodemon unit through the shared
# systemctlEnableAndStart helper.
# Returns: whatever status systemctlEnableAndStart returns.
ensureAcrNodeMon() {
    local -r unit_name="acr-nodemon"
    # Second argument is passed through unchanged; presumably a timeout in
    # seconds -- confirm against systemctlEnableAndStart.
    local -r helper_arg=30

    systemctlEnableAndStart "${unit_name}" "${helper_arg}"
}

# Prepares and starts the docker service.
#
# Steps, in order:
#   1. Appends ${ADMINUSER} to the "docker" group.
#   2. Polls (up to 1200 attempts, 1s apart) until /etc/docker/daemon.json is
#      non-empty and accepted by jq; exits with $ERR_FILE_WATCH_TIMEOUT when
#      the last attempt still fails.
#   3. If containerd is reported active by systemctl, calls
#      `systemctl_disable 20 30 120 containerd`; exits with
#      $ERR_SYSTEMD_CONTAINERD_STOP_FAIL when that call fails.
#   4. Calls `systemctlEnableAndStart docker 30`; exits with
#      $ERR_DOCKER_START_FAIL when that call fails.
#
# Globals written: DOCKER_SERVICE_EXEC_START_FILE,
#                  DOCKER_MOUNT_FLAGS_SYSTEMD_FILE, DOCKER_JSON_FILE
# Globals read:    ADMINUSER, ERR_FILE_WATCH_TIMEOUT,
#                  ERR_SYSTEMD_CONTAINERD_STOP_FAIL, ERR_DOCKER_START_FAIL
ensureDocker() {
    local i

    # These two paths are not used inside this function; they are kept as
    # globals because other code may read them -- TODO confirm.
    DOCKER_SERVICE_EXEC_START_FILE=/etc/systemd/system/docker.service.d/exec_start.conf
    usermod -aG docker "${ADMINUSER}"
    DOCKER_MOUNT_FLAGS_SYSTEMD_FILE=/etc/systemd/system/docker.service.d/clear_mount_propagation_flags.conf
    DOCKER_JSON_FILE=/etc/docker/daemon.json

    # Wait for the daemon config to exist and be valid JSON.
    for ((i = 1; i <= 1200; i++)); do
        if [ -s "${DOCKER_JSON_FILE}" ]; then
            jq '.' < "${DOCKER_JSON_FILE}" && break
        fi
        if [ "${i}" -eq 1200 ]; then
            exit $ERR_FILE_WATCH_TIMEOUT
        else
            sleep 1
        fi
    done

    # The exit must run in the current shell. Inside a ( ) subshell it would
    # only end the subshell, and the function would carry on to start docker
    # even though the containerd call failed.
    if systemctl is-active --quiet containerd; then
        systemctl_disable 20 30 120 containerd || exit $ERR_SYSTEMD_CONTAINERD_STOP_FAIL
    fi

    systemctlEnableAndStart docker 30 || exit $ERR_DOCKER_START_FAIL
}

ensureDHCPv6() {
systemctlEnableAndStart dhcpv6 30 || exit $ERR_SYSTEMCTL_START_FAIL
retrycmd_if_failure 120 5 25 modprobe ip6_tables || exit $ERR_MODPROBE_FAIL
Expand Down Expand Up @@ -461,7 +441,7 @@ getPrimaryNicIP() {

generateSelfSignedKubeletServingCertificate() {
mkdir -p "/etc/kubernetes/certs"

KUBELET_SERVER_PRIVATE_KEY_PATH="/etc/kubernetes/certs/kubeletserver.key"
KUBELET_SERVER_CERT_PATH="/etc/kubernetes/certs/kubeletserver.crt"

Expand Down Expand Up @@ -663,7 +643,7 @@ current-context: bootstrap-context
EOF
else
echo "generating kubeconfig referencing the provided kubelet client certificate"

KUBECONFIG_FILE=/var/lib/kubelet/kubeconfig
mkdir -p "$(dirname "${KUBECONFIG_FILE}")"
touch "${KUBECONFIG_FILE}"
Expand Down Expand Up @@ -691,7 +671,7 @@ EOF
fi

set -x

KUBELET_RUNTIME_CONFIG_SCRIPT_FILE=/opt/azure/containers/kubelet.sh
tee "${KUBELET_RUNTIME_CONFIG_SCRIPT_FILE}" > /dev/null <<EOF
#!/bin/bash
Expand Down Expand Up @@ -740,7 +720,7 @@ EOF
fi
else
logs_to_events "AKS.CSE.ensureKubelet.installCredentialProviderFromPMC" "installCredentialProviderFromPMC ${KUBERNETES_VERSION}"
fi
fi
fi
fi

Expand Down Expand Up @@ -871,7 +851,7 @@ configGPUDrivers() {
fi
ctr -n k8s.io images rm --sync $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG
else
bash -c "$DOCKER_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG install"
bash -c "$DOCKER_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG install"
ret=$?
if [ "$ret" -ne 0 ]; then
echo "Failed to install GPU driver, exiting..."
Expand All @@ -883,7 +863,7 @@ configGPUDrivers() {
downloadGPUDrivers
installNvidiaContainerToolkit
enableNvidiaPersistenceMode
else
else
echo "os $OS $OS_VARIANT not supported at this time. skipping configGPUDrivers"
exit 1
fi
Expand All @@ -896,7 +876,7 @@ configGPUDrivers() {
if isMarinerOrAzureLinux "$OS"; then
createNvidiaSymlinkToAllDeviceNodes
fi

if [ "${CONTAINER_RUNTIME}" = "containerd" ]; then
retrycmd_if_failure 120 5 25 pkill -SIGHUP containerd || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT
else
Expand All @@ -910,7 +890,7 @@ validateGPUDrivers() {
fi

retrycmd_if_failure 24 5 25 nvidia-modprobe -u -c0 && echo "gpu driver loaded" || configGPUDrivers || exit $ERR_GPU_DRIVERS_START_FAIL

if which nvidia-smi; then
SMI_RESULT=$(retrycmd_if_failure 24 5 300 nvidia-smi)
else
Expand Down Expand Up @@ -951,7 +931,7 @@ disableSSH() {
configureSSHPubkeyAuth() {
local disable_pubkey_auth="$1"
local ssh_use_pubkey_auth

# Determine the desired pubkey auth setting
if [ "${disable_pubkey_auth}" = "true" ]; then
ssh_use_pubkey_auth="no"
Expand Down Expand Up @@ -1098,7 +1078,7 @@ enableLocalDNS() {
echo "Enable localdns succeeded."
}

# localdns corefile used by localdns systemd unit.
# localdns corefile used by localdns systemd unit.
LOCALDNS_COREFILE="/opt/azure/containers/localdns/localdns.corefile"
# localdns slice file used by localdns systemd unit.
LOCALDNS_SLICEFILE="/etc/systemd/system/localdns.slice"
Expand Down
18 changes: 6 additions & 12 deletions parts/linux/cloud-init/artifacts/cse_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ function basePrep {
if [ -n "${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER}" ]; then
registry_domain_name="${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER%%/*}"
fi

logs_to_events "AKS.CSE.orasLogin.oras_login_with_kubelet_identity" oras_login_with_kubelet_identity "${registry_domain_name}" $USER_ASSIGNED_IDENTITY_ID $TENANT_ID || exit $?
fi

Expand Down Expand Up @@ -200,8 +200,6 @@ function basePrep {
if [ "${NEEDS_CONTAINERD}" = "true" ]; then
# containerd should not be configured until cni has been configured first
logs_to_events "AKS.CSE.ensureContainerd" ensureContainerd
else
logs_to_events "AKS.CSE.ensureDocker" ensureDocker
fi

if [ -n "${MESSAGE_OF_THE_DAY}" ]; then
Expand Down Expand Up @@ -344,11 +342,11 @@ function nodePrep {
# This file indicates the cluster doesn't have outbound connectivity and should be excluded in future external outbound checks
touch /var/run/outbound-check-skipped
fi

# Determine if GPU driver installation should be skipped
export -f should_skip_nvidia_drivers
skip_nvidia_driver_install=$(retrycmd_silent 10 1 10 bash -cx should_skip_nvidia_drivers)

if [ "$?" -ne 0 ]; then
echo "Failed to determine if nvidia driver install should be skipped"
exit $ERR_NVIDIA_DRIVER_INSTALL
Expand All @@ -365,10 +363,10 @@ function nodePrep {
# Install and configure GPU drivers if this is a GPU node
if [ "${GPU_NODE}" = "true" ] && [ "${skip_nvidia_driver_install}" != "true" ]; then
echo $(date),$(hostname), "Start configuring GPU drivers"

# Install GPU drivers
logs_to_events "AKS.CSE.ensureGPUDrivers" ensureGPUDrivers

# Install fabric manager if needed
if [ "${GPU_NEEDS_FABRIC_MANAGER}" = "true" ]; then
# fabric manager trains nvlink connections between multi instance gpus.
Expand Down Expand Up @@ -418,7 +416,7 @@ EOF
else
logs_to_events "AKS.CSE.stop.nvidia-device-plugin" "systemctlDisableAndStop nvidia-device-plugin"
fi

echo $(date),$(hostname), "End configuring GPU drivers"
fi

Expand Down Expand Up @@ -479,10 +477,6 @@ EOF

logs_to_events "AKS.CSE.ensureKubelet" ensureKubelet

if [ "${ARTIFACT_STREAMING_ENABLED}" = "true" ]; then
logs_to_events "AKS.CSE.ensureContainerd.ensureAcrNodeMon" ensureAcrNodeMon || exit $ERR_ARTIFACT_STREAMING_ACR_NODEMON_START_FAIL
fi

if $REBOOTREQUIRED; then
echo 'reboot required, rebooting node in 1 minute'
/bin/bash -c "shutdown -r 1 &"
Expand Down
Loading
Loading