From 4ddffa9ef424b61dc3dca7ff6c923f2f3c5a4dea Mon Sep 17 00:00:00 2001 From: Steven Fairchild Date: Fri, 12 Apr 2024 07:44:34 -0400 Subject: [PATCH] Add wait for all rpm related transactions to avoid rpm database corruption. Also add a starting log message for each function call. RPM database corruption has been seen in testing and in Prod deployments due to concurrent rpm database operations between rpm and dnf, so the script now waits for each dnf transaction to finish before continuing. --- pkg/deploy/generator/scripts/rpVMSS.sh | 93 ++++++++++++++++++++------ 1 file changed, 74 insertions(+), 19 deletions(-) diff --git a/pkg/deploy/generator/scripts/rpVMSS.sh b/pkg/deploy/generator/scripts/rpVMSS.sh index c08656c2995..e2ba4c71ee7 100644 --- a/pkg/deploy/generator/scripts/rpVMSS.sh +++ b/pkg/deploy/generator/scripts/rpVMSS.sh @@ -3,8 +3,6 @@ set -o errexit \ -o nounset -trap 'catch' ERR - main() { configure_sshd configure_and_install_dnf_pkgs_repos @@ -23,6 +21,7 @@ main() { # We need to configure PasswordAuthentication to yes in order for the VMSS Access JIT to work configure_sshd() { + log "starting" log "setting ssh password authentication" sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/g' /etc/ssh/sshd_config @@ -32,9 +31,16 @@ configure_sshd() { # configure_and_install_dnf_pkgs_repos configure_and_install_dnf_pkgs_repos() { + log "starting" configure_rhui_repo create_azure_rpm_repos - dnf_update_pkgs + + local -ar exclude_pkgs=( + "-x WALinuxAgent" + "-x WALinuxAgent-udev" + ) + + dnf_update_pkgs exclude_pkgs local -ra rpm_keys=( https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8 @@ -67,35 +73,44 @@ configure_and_install_dnf_pkgs_repos() { # configure_rhui_repo configure_rhui_repo() { + log "starting" log "running RHUI package updates" + #Adding retry logic to yum commands in order to avoid stalling out on resource locks for attempt in {1..5}; do + log "attempt #${attempt} - running dnf update" dnf update \ -y \ --disablerepo='*' \ - --enablerepo='rhui-microsoft-azure*' \ - && break + 
--enablerepo='rhui-microsoft-azure*' & + + wait $! && break if [[ ${attempt} -lt 5 ]]; then sleep 10 else - abort "failed to run dnf update" + abort "attempt #${attempt} - Failed to update packages" fi done } # dnf_update_pkgs dnf_update_pkgs() { + local -n excludes="$1" + log "starting" + for attempt in {1..5}; do - log "running dnf update attempt #${attempt}" + log "attempt #${attempt} - running dnf update" + # shellcheck disable=SC2068 dnf -y \ - -x WALinuxAgent \ - -x WALinuxAgent-udev \ - update --allowerasing \ - && break + ${excludes[@]} \ + update \ + --allowerasing & + + wait $! && break if [[ ${attempt} -lt 5 ]]; then sleep 10 else - abort "Failed to update packages after ${attempt} attempts" + abort "attempt #${attempt} - Failed to update packages" fi done } @@ -103,16 +118,19 @@ dnf_update_pkgs() { # dnf_install_pkgs dnf_install_pkgs() { local -n pkgs="$1" + log "starting" + for attempt in {1..5}; do - log "Installing packages ${pkgs[*]} attempt #$attempt" + log "attempt #$attempt - Installing packages ${pkgs[*]}" dnf -y \ install \ - "${install_pkgs[@]}" \ - && break + "${pkgs[@]}" & + + wait $! 
&& break if [[ ${attempt} -lt 5 ]]; then sleep 10 else - abort "Failed to install packages ${pkgs[*]} after $attempt attempts" + abort "attempt #${attempt} - Failed to install packages ${pkgs[*]}" fi done } @@ -120,13 +138,15 @@ dnf_install_pkgs() { # rpm_import_keys rpm_import_keys() { local -n keys="$1" + log "starting" + # shellcheck disable=SC2068 for key in ${keys[@]}; do if [ ${#keys[@]} -eq 0 ]; then break fi for attempt in {1..5}; do - log "importing rpm repository key $key attempt #$attempt" + log "attempt #$attempt - importing rpm repository key $key" rpm --import \ -v \ "$key" \ @@ -136,14 +156,16 @@ rpm_import_keys() { if [ -z ${key+x} ] && [[ ${attempt} -lt 5 ]]; then sleep 10 else - abort "Failed to import rpm repository key $key after $attempt attempts" + abort "attempt #${attempt} - Failed to import rpm repository key $key" fi done } # configure_disk_partitions configure_disk_partitions() { + log "starting" log "extending partition table" + # Linux block devices are inconsistently named # it's difficult to tie the lvm pv to the physical disk using /dev/disk files, which is why lvs is used here physicalDisk="$(lvs -o devices -a | head -n2 | tail -n1 | cut -d ' ' -f 3 | cut -d \( -f 1 | tr -d '[:digit:]')" @@ -163,6 +185,8 @@ configure_disk_partitions() { # configure_logrotate clobbers /etc/logrotate.conf configure_logrotate() { + log "starting" + local -r logrotate_conf_filename='/etc/logrotate.conf' local -r logrotate_conf_file='# see "man logrotate" for details # rotate log files weekly @@ -203,6 +227,8 @@ include /etc/logrotate.d # create_azure_rpm_repos creates /etc/yum.repos.d/azure.repo repository file create_azure_rpm_repos() { + log "starting" + local -r azure_repo_filename='/etc/yum.repos.d/azure.repo' local -r azure_repo_file='[azure-cli] name=azure-cli @@ -216,11 +242,13 @@ baseurl=https://packages.microsoft.com/yumrepos/azurecore enabled=yes gpgcheck=no' - write_file azure_repo_filename azure_repo_file + write_file azure_repo_filename 
azure_repo_file true } # configure_selinux configure_selinux() { + log "starting" + local -r relabel="${1:-false}" already_defined_ignore_error="File context for /var/log/journal(/.*)? already defined" @@ -234,6 +262,8 @@ configure_selinux() { # configure_firewalld_rules configure_firewalld_rules() { + log "starting" + # https://access.redhat.com/security/cve/cve-2020-13401 local -r prefix="/etc/sysctl.d" local -r diable_accept_ra_conf="$prefix/02-disable-accept-ra.conf" @@ -267,6 +297,8 @@ EOF # pull_container_images pull_container_images() { + log "starting" + echo "logging into prod acr" az login -i --allow-no-subscriptions @@ -309,6 +341,8 @@ configure_system_services() { # enable_aro_services enables all services required for aro rp enable_aro_services() { + log "starting" + local -ra aro_services=( "aro-dbtoken" "aro-monitor" @@ -332,7 +366,9 @@ enable_aro_services() { # configure_service_fluentbit configure_service_fluentbit() { + log "starting" log "configuring fluentbit service" + mkdir -p /etc/fluentbit/ mkdir -p /var/lib/fluent @@ -417,6 +453,8 @@ WantedBy=multi-user.target" # configure_certs configure_certs() { + log "starting" + mkdir /etc/aro-rp base64 -d <<<"$ADMINAPICABUNDLE" >/etc/aro-rp/admin-ca-bundle.pem if [[ -n "$ARMAPICABUNDLE" ]]; then @@ -448,6 +486,7 @@ configure_certs() { # configure_service_mdm configure_service_mdm() { + log "starting" log "configuring mdm service" local -r sysconfig_mdm_filename="/etc/sysconfig/mdm" @@ -499,6 +538,8 @@ WantedBy=multi-user.target" # configure_timers_mdm_mdsd configure_timers_mdm_mdsd() { + log "starting" + for var in "mdsd" "mdm"; do local download_creds_service_filename="/etc/systemd/system/download-$var-credentials.service" local download_creds_service_file="[Unit] @@ -637,6 +678,8 @@ WantedBy=multi-user.target' # configure_service_aro_rp configure_service_aro_rp() { + log "starting" + local -r aro_rp_conf_filename='/etc/sysconfig/aro-rp' local -r aro_rp_conf_file="ACR_RESOURCE_ID='$ACRRESOURCEID' 
ADMIN_API_CLIENT_CERT_COMMON_NAME='$ADMINAPICLIENTCERTCOMMONNAME' @@ -725,6 +768,8 @@ WantedBy=multi-user.target" # configure_service_aro_dbtoken configure_service_aro_dbtoken() { + log "starting" + local -r aro_dbtoken_service_conf_filename='/etc/sysconfig/aro-dbtoken' local -r aro_dbtoken_service_conf_file="DATABASE_ACCOUNT_NAME='$DATABASEACCOUNTNAME' AZURE_DBTOKEN_CLIENT_ID='$DBTOKENCLIENTID' @@ -775,7 +820,9 @@ WantedBy=multi-user.target" # configure_service_aro_monitor configure_service_aro_monitor() { + log "starting" log "configuring aro-monitor service" + # DOMAIN_NAME, CLUSTER_MDSD_ACCOUNT, CLUSTER_MDSD_CONFIG_VERSION, GATEWAY_DOMAINS, GATEWAY_RESOURCEGROUP, MDSD_ENVIRONMENT CLUSTER_MDSD_NAMESPACE # are not used, but can't easily be refactored out. Should be revisited in the future. local -r aro_monitor_service_conf_filename='/etc/sysconfig/aro-monitor' @@ -841,6 +888,8 @@ WantedBy=multi-user.target" # configure_service_aro_portal configure_service_aro_portal() { + log "starting" + local -r aro_portal_service_conf_filename='/etc/sysconfig/aro-portal' local -r aro_portal_service_conf_file="AZURE_PORTAL_ACCESS_GROUP_IDS='$PORTALACCESSGROUPIDS' AZURE_PORTAL_CLIENT_ID='$PORTALCLIENTID' @@ -894,6 +943,8 @@ WantedBy=multi-user.target" # configure_service_mdsd configure_service_mdsd() { + log "starting" + local -r mdsd_service_dir="/etc/systemd/system/mdsd.service.d" mkdir "$mdsd_service_dir" @@ -928,6 +979,8 @@ export MDSD_MSGPACK_SORT_COLUMNS=1\"" # run_azsecd_config_scan run_azsecd_config_scan() { + log "starting" + local -ar configs=( "baseline" "clamav" @@ -963,6 +1016,8 @@ write_file() { # reboot_vm restores all selinux file contexts, waits 30 seconds then reboots reboot_vm() { + log "starting" + configure_selinux "true" (sleep 30 && log "rebooting vm now"; reboot) & }