diff --git a/.pipelines/e2e.yml b/.pipelines/e2e.yml
index 90edc5c18ed..068d2b60df1 100644
--- a/.pipelines/e2e.yml
+++ b/.pipelines/e2e.yml
@@ -30,7 +30,7 @@ jobs:
   - script: |
       set -xe
       sudo rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
-      sudo dnf install -y openvpn make podman
+      sudo dnf install -y openvpn make podman jq
    displayName: Setup (Container)
    target: container
@@ -84,6 +84,12 @@ jobs:
       . ./hack/e2e/run-rp-and-e2e.sh
       hack/get-admin-kubeconfig.sh /subscriptions/$AZURE_SUBSCRIPTION_ID/resourceGroups/$CLUSTER/providers/Microsoft.RedHatOpenShift/openShiftClusters/$CLUSTER >admin.kubeconfig
+    displayName: Get admin kubeconfig for must-gather
+    condition: failed()
+  # must-gather collection must be run inside the container so it can access the VPN
+  - script: |
+      export CI=true
+      . ./hack/e2e/run-rp-and-e2e.sh
       export KUBECONFIG=admin.kubeconfig
       wget -nv https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/$(OpenShiftVersion)/openshift-client-linux-$(OpenShiftVersion).tar.gz
@@ -91,6 +97,7 @@
       ./oc adm must-gather
       tar cf must-gather.tar.gz must-gather.local.*
    displayName: Collect must-gather
+   target: container
    condition: failed()
  - publish: must-gather.tar.gz
    artifact: must-gather
diff --git a/cmd/aro/update_ocp_versions.go b/cmd/aro/update_ocp_versions.go
index 43e9a72a522..2b0d60d3893 100644
--- a/cmd/aro/update_ocp_versions.go
+++ b/cmd/aro/update_ocp_versions.go
@@ -21,6 +21,29 @@ import (
     "github.com/Azure/ARO-RP/pkg/util/version"
 )
 
+func getInstallerImageDigests(envKey string) (map[string]string, error) {
+    var installerImageDigests map[string]string
+    var err error
+
+    jsonData := []byte(os.Getenv(envKey))
+
+    // For Azure DevOps pipelines, the JSON data is Base64-encoded
+    // since it's embedded in JSON-formatted build artifacts. But
+    // let's not force that on local development mode.
+    if !env.IsLocalDevelopmentMode() {
+        jsonData, err = base64.StdEncoding.DecodeString(string(jsonData))
+        if err != nil {
+            return nil, fmt.Errorf("%s: Failed to decode base64: %v", envKey, err)
+        }
+    }
+
+    if err = json.Unmarshal(jsonData, &installerImageDigests); err != nil {
+        return nil, fmt.Errorf("%s: Failed to parse JSON: %v", envKey, err)
+    }
+
+    return installerImageDigests, nil
+}
+
 func getLatestOCPVersions(ctx context.Context, log *logrus.Entry) ([]api.OpenShiftVersion, error) {
     env, err := env.NewCoreForCI(ctx, log)
     if err != nil {
@@ -36,20 +59,9 @@ func getLatestOCPVersions(ctx context.Context, log *logrus.Entry) ([]api.OpenShi
     // the aro-installer wrapper digest. This allows us to utilize
     // Azure Safe Deployment Practices (SDP) instead of pushing the
     // version tag and deploying to all regions at once.
-    var installerImageDigests map[string]string
-    jsonData := []byte(os.Getenv("INSTALLER_IMAGE_DIGESTS"))
-
-    // For Azure DevOps pipelines, the JSON data is Base64-encoded
-    // since it's embedded in JSON-formatted build artifacts. But
-    // let's not force that on local development mode.
-    if !env.IsLocalDevelopmentMode() {
-        jsonData, err = base64.StdEncoding.DecodeString(string(jsonData))
-        if err != nil {
-            return nil, fmt.Errorf("INSTALLER_IMAGE_DIGESTS: Failed to decode base64: %v", err)
-        }
-    }
-    if err = json.Unmarshal(jsonData, &installerImageDigests); err != nil {
-        return nil, fmt.Errorf("INSTALLER_IMAGE_DIGESTS: Failed to parse JSON: %v", err)
+    installerImageDigests, err := getInstallerImageDigests("INSTALLER_IMAGE_DIGESTS")
+    if err != nil {
+        return nil, err
     }
 
     for _, vers := range version.HiveInstallStreams {
diff --git a/pkg/cluster/generate.go b/pkg/cluster/generate.go
index 0076f15d61b..2eb197fb00b 100644
--- a/pkg/cluster/generate.go
+++ b/pkg/cluster/generate.go
@@ -4,4 +4,6 @@ package cluster
 // Licensed under the Apache License 2.0.
 
 //go:generate go run ../../vendor/github.com/golang/mock/mockgen -destination=../util/mocks/$GOPACKAGE/$GOPACKAGE.go github.com/Azure/ARO-RP/pkg/$GOPACKAGE Interface
+//go:generate go run ../../vendor/github.com/golang/mock/mockgen -destination=../util/mocks/samplesclient/versioned.go github.com/openshift/client-go/samples/clientset/versioned Interface
+//go:generate go run ../../vendor/github.com/golang/mock/mockgen -destination=../util/mocks/samples/samples.go github.com/openshift/client-go/samples/clientset/versioned/typed/samples/v1 SamplesV1Interface,ConfigInterface
 //go:generate go run ../../vendor/golang.org/x/tools/cmd/goimports -local=github.com/Azure/ARO-RP -e -w ../util/mocks/$GOPACKAGE/$GOPACKAGE.go
diff --git a/pkg/cluster/samples.go b/pkg/cluster/samples.go
index 61077b4ef5f..befa7962422 100644
--- a/pkg/cluster/samples.go
+++ b/pkg/cluster/samples.go
@@ -8,6 +8,7 @@ import (
 
     configv1 "github.com/openshift/api/config/v1"
     operatorv1 "github.com/openshift/api/operator/v1"
+    "k8s.io/apimachinery/pkg/api/errors"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/client-go/util/retry"
 )
@@ -19,17 +20,22 @@ func (m *manager) disableSamples(ctx context.Context) error {
         return nil
     }
 
-    return retry.RetryOnConflict(retry.DefaultRetry, func() error {
-        c, err := m.samplescli.SamplesV1().Configs().Get(ctx, "cluster", metav1.GetOptions{})
-        if err != nil {
-            return err
-        }
+    return retry.OnError(
+        retry.DefaultRetry,
+        func(err error) bool {
+            return errors.IsConflict(err) || errors.IsNotFound(err)
+        },
+        func() error {
+            c, err := m.samplescli.SamplesV1().Configs().Get(ctx, "cluster", metav1.GetOptions{})
+            if err != nil {
+                return err
+            }
 
-        c.Spec.ManagementState = operatorv1.Removed
+            c.Spec.ManagementState = operatorv1.Removed
 
-        _, err = m.samplescli.SamplesV1().Configs().Update(ctx, c, metav1.UpdateOptions{})
-        return err
-    })
+            _, err = m.samplescli.SamplesV1().Configs().Update(ctx, c, metav1.UpdateOptions{})
+            return err
+        })
 }
 
 // disableOperatorHubSources disables operator hub sources if there's no
diff --git a/pkg/cluster/samples_test.go b/pkg/cluster/samples_test.go
new file mode 100644
index 00000000000..1fb4ecb5981
--- /dev/null
+++ b/pkg/cluster/samples_test.go
@@ -0,0 +1,105 @@
+package cluster
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the Apache License 2.0.
+
+import (
+    "context"
+    "errors"
+    "testing"
+
+    "github.com/golang/mock/gomock"
+    operatorv1 "github.com/openshift/api/operator/v1"
+    samplesv1 "github.com/openshift/api/samples/v1"
+    kerrors "k8s.io/apimachinery/pkg/api/errors"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/runtime/schema"
+
+    "github.com/Azure/ARO-RP/pkg/api"
+    mock_env "github.com/Azure/ARO-RP/pkg/util/mocks/env"
+    mock_samples "github.com/Azure/ARO-RP/pkg/util/mocks/samples"
+    mock_samplesclient "github.com/Azure/ARO-RP/pkg/util/mocks/samplesclient"
+    utilerror "github.com/Azure/ARO-RP/test/util/error"
+)
+
+func Test_manager_disableSamples(t *testing.T) {
+    ctx := context.Background()
+    samplesConfig := &samplesv1.Config{
+        TypeMeta:   metav1.TypeMeta{},
+        ObjectMeta: metav1.ObjectMeta{},
+        Spec:       samplesv1.ConfigSpec{},
+        Status:     samplesv1.ConfigStatus{},
+    }
+    tests := []struct {
+        name                        string
+        samplesConfig               *samplesv1.Config
+        samplesCRGetError           error
+        samplesCRUpdateError        error
+        expectedMinNumberOfGetCalls int
+        expectedMaxNumberOfGetCalls int
+        wantErr                     string
+    }{
+        {
+            name:                        "samples cr is found and updated",
+            samplesConfig:               samplesConfig,
+            expectedMinNumberOfGetCalls: 1,
+            expectedMaxNumberOfGetCalls: 1,
+            wantErr:                     "",
+        },
+        {
+            name:                        "samples cr is not found and retried",
+            samplesCRGetError:           kerrors.NewNotFound(schema.GroupResource{}, "samples"),
+            expectedMinNumberOfGetCalls: 2,
+            expectedMaxNumberOfGetCalls: 15,
+            wantErr:                     " \"samples\" not found",
+        },
+        {
+            name:                        "samples cr update is conflicting and retried",
+            samplesConfig:               samplesConfig,
+            expectedMinNumberOfGetCalls: 2,
+            expectedMaxNumberOfGetCalls: 15,
+            samplesCRUpdateError:        kerrors.NewConflict(schema.GroupResource{}, "samples", errors.New("conflict")),
+            wantErr:                     "Operation cannot be fulfilled on \"samples\": conflict",
+        },
+    }
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            controller := gomock.NewController(t)
+            defer controller.Finish()
+            env := mock_env.NewMockInterface(controller)
+            samplescli := mock_samplesclient.NewMockInterface(controller)
+            samplesInterface := mock_samples.NewMockSamplesV1Interface(controller)
+            configInterface := mock_samples.NewMockConfigInterface(controller)
+
+            env.EXPECT().IsLocalDevelopmentMode().Return(false)
+            samplescli.EXPECT().SamplesV1().AnyTimes().Return(samplesInterface)
+            samplesInterface.EXPECT().Configs().AnyTimes().Return(configInterface)
+            configInterface.EXPECT().Get(gomock.Any(), "cluster", metav1.GetOptions{}).
+                MinTimes(tt.expectedMinNumberOfGetCalls).
+                MaxTimes(tt.expectedMaxNumberOfGetCalls).
+                Return(tt.samplesConfig, tt.samplesCRGetError)
+
+            if tt.samplesConfig != nil {
+                samplesConfig.Spec.ManagementState = operatorv1.Removed
+                configInterface.EXPECT().Update(gomock.Any(), samplesConfig, metav1.UpdateOptions{}).AnyTimes().Return(samplesConfig, tt.samplesCRUpdateError)
+            }
+
+            m := &manager{
+                env: env,
+                doc: &api.OpenShiftClusterDocument{
+                    OpenShiftCluster: &api.OpenShiftCluster{
+                        Properties: api.OpenShiftClusterProperties{
+                            ClusterProfile: api.ClusterProfile{
+                                ResourceGroupID: "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/clusterRGName",
+                            },
+                        },
+                    },
+                },
+                samplescli: samplescli,
+            }
+
+            err := m.disableSamples(ctx)
+            utilerror.AssertErrorMessage(t, err, tt.wantErr)
+        })
+    }
+}
diff --git a/pkg/deploy/assets/gateway-production.json b/pkg/deploy/assets/gateway-production.json
index 2112428c0d4..3ce7be4d58a 100644
--- a/pkg/deploy/assets/gateway-production.json
+++ b/pkg/deploy/assets/gateway-production.json
@@ -309,7 +309,7 @@
           "autoUpgradeMinorVersion": true,
           "settings": {},
           "protectedSettings": {
-            "script": "[base64(concat(base64ToString('c2V0IC1leAoK'),'ACRRESOURCEID=$(base64 -d \u003c\u003c\u003c''',base64(parameters('acrResourceId')),''')\n','AZURECLOUDNAME=$(base64 -d \u003c\u003c\u003c''',base64(parameters('azureCloudName')),''')\n','AZURESECPACKQUALYSURL=$(base64 -d \u003c\u003c\u003c''',base64(parameters('azureSecPackQualysUrl')),''')\n','AZURESECPACKVSATENANTID=$(base64 -d \u003c\u003c\u003c''',base64(parameters('azureSecPackVSATenantId')),''')\n','DATABASEACCOUNTNAME=$(base64 -d \u003c\u003c\u003c''',base64(parameters('databaseAccountName')),''')\n','DBTOKENCLIENTID=$(base64 -d \u003c\u003c\u003c''',base64(parameters('dbtokenClientId')),''')\n','DBTOKENURL=$(base64 -d \u003c\u003c\u003c''',base64(parameters('dbtokenUrl')),''')\n','MDMFRONTENDURL=$(base64 -d \u003c\u003c\u003c''',base64(parameters('mdmFrontendUrl')),''')\n','MDSDENVIRONMENT=$(base64 -d \u003c\u003c\u003c''',base64(parameters('mdsdEnvironment')),''')\n','FLUENTBITIMAGE=$(base64 -d \u003c\u003c\u003c''',base64(parameters('fluentbitImage')),''')\n','GATEWAYMDSDCONFIGVERSION=$(base64 -d \u003c\u003c\u003c''',base64(parameters('gatewayMdsdConfigVersion')),''')\n','GATEWAYDOMAINS=$(base64 -d \u003c\u003c\u003c''',base64(parameters('gatewayDomains')),''')\n','GATEWAYFEATURES=$(base64 -d \u003c\u003c\u003c''',base64(parameters('gatewayFeatures')),''')\n','KEYVAULTDNSSUFFIX=$(base64 -d \u003c\u003c\u003c''',base64(parameters('keyvaultDNSSuffix')),''')\n','KEYVAULTPREFIX=$(base64 -d \u003c\u003c\u003c''',base64(parameters('keyvaultPrefix')),''')\n','RPIMAGE=$(base64 -d \u003c\u003c\u003c''',base64(parameters('rpImage')),''')\n','RPMDMACCOUNT=$(base64 -d \u003c\u003c\u003c''',base64(parameters('rpMdmAccount')),''')\n','RPMDSDACCOUNT=$(base64 -d \u003c\u003c\u003c''',base64(parameters('rpMdsdAccount')),''')\n','RPMDSDNAMESPACE=$(base64 -d \u003c\u003c\u003c''',base64(parameters('rpMdsdNamespace')),''')\n','MDMIMAGE=''/genevamdm:2.2023.609.2051-821f47-20230706t0953''\n','LOCATION=$(base64 -d \u003c\u003c\u003c''',base64(resourceGroup().location),''')\n','SUBSCRIPTIONID=$(base64 -d \u003c\u003c\u003c''',base64(subscription().subscriptionId),''')\n','RESOURCEGROUPNAME=$(base64 -d \u003c\u003c\u003c''',base64(resourceGroup().name),''')\n','\n',base64ToString('#!/bin/bash

echo "setting ssh password authentication"
# We need to manually set PasswordAuthentication to true in order for the VMSS Access JIT to work
sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/g' /etc/ssh/sshd_config
systemctl reload sshd.service

echo "running RHUI fix"
yum update -y --disablerepo='*' --enablerepo='rhui-microsoft-azure*'

echo "running yum update"
yum -y -x WALinuxAgent -x WALinuxAgent-udev update --allowerasing

echo "extending partition table"
# Linux block devices are inconsistently named
# it's difficult to tie the lvm pv to the physical disk using /dev/disk files, which is why lvs is used here
physicalDisk="$(lvs -o devices -a | head -n2 | tail -n1 | cut -d ' ' -f 3 | cut -d \( -f 1 | tr -d '[:digit:]')"
growpart "$physicalDisk" 2

echo "extending filesystems"
lvextend -l +20%FREE /dev/rootvg/rootlv
xfs_growfs /

lvextend -l +100%FREE /dev/rootvg/varlv
xfs_growfs /var

rpm --import https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8
rpm --import https://packages.microsoft.com/keys/microsoft.asc

for attempt in {1..5}; do
  yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm && break
  if [[ ${attempt} -lt 5 ]]; then sleep 10; else exit 1; fi
done

echo "configuring logrotate"
cat >/etc/logrotate.conf <<'EOF'
# see "man logrotate" for details
# rotate log files weekly
weekly

# keep 2 weeks worth of backlogs
rotate 2

# create new (empty) log files after rotating old ones
create

# use date as a suffix of the rotated file
dateext

# uncomment this if you want your log files compressed
compress

# RPM packages drop log rotation information into this directory
include /etc/logrotate.d

# no packages own wtmp and btmp -- we'll rotate them here
/var/log/wtmp {
    monthly
    create 0664 root utmp
    minsize 1M
    rotate 1
}

/var/log/btmp {
    missingok
    monthly
    create 0600 root utmp
    rotate 1
}
EOF

echo "configuring yum repository and running yum update"
cat >/etc/yum.repos.d/azure.repo <<'EOF'
[azure-cli]
name=azure-cli
baseurl=https://packages.microsoft.com/yumrepos/azure-cli
enabled=yes
gpgcheck=yes

[azurecore]
name=azurecore
baseurl=https://packages.microsoft.com/yumrepos/azurecore
enabled=yes
gpgcheck=no
EOF

semanage fcontext -a -t var_log_t "/var/log/journal(/.*)?"
mkdir -p /var/log/journal

for attempt in {1..5}; do
  yum -y install clamav azsec-clamav azsec-monitor azure-cli azure-mdsd azure-security podman-docker openssl-perl python3 && break
  # hack - we are installing python3 on hosts due to an issue with Azure Linux Extensions https://github.com/Azure/azure-linux-extensions/pull/1505
  if [[ ${attempt} -lt 5 ]]; then sleep 10; else exit 1; fi
done

echo "applying firewall rules"
# https://access.redhat.com/security/cve/cve-2020-13401
cat >/etc/sysctl.d/02-disable-accept-ra.conf <<'EOF'
net.ipv6.conf.all.accept_ra=0
EOF

cat >/etc/sysctl.d/01-disable-core.conf <<'EOF'
kernel.core_pattern = |/bin/true
EOF
sysctl --system

firewall-cmd --add-port=80/tcp --permanent
firewall-cmd --add-port=8081/tcp --permanent
firewall-cmd --add-port=443/tcp --permanent

echo "logging into prod acr"
export AZURE_CLOUD_NAME=$AZURECLOUDNAME
az login -i --allow-no-subscriptions

# The managed identity that the VM runs as only has a single role assignment.
# This role assignment is ACRPull which is not necessarily present in the
# subscription we're deploying into.  If the identity does not have any
# role assignments scoped on the subscription we're deploying into, it will
# not show on az login -i, which is why the below line is commented.
# az account set -s "$SUBSCRIPTIONID"

# Suppress emulation output for podman instead of docker for az acr compatibility
mkdir -p /etc/containers/
touch /etc/containers/nodocker

mkdir -p /root/.docker
REGISTRY_AUTH_FILE=/root/.docker/config.json az acr login --name "$(sed -e 's|.*/||' <<<"$ACRRESOURCEID")"

MDMIMAGE="${RPIMAGE%%/*}/${MDMIMAGE##*/}"
docker pull "$MDMIMAGE"
docker pull "$RPIMAGE"
docker pull "$FLUENTBITIMAGE"

az logout

echo "configuring fluentbit service"
mkdir -p /etc/fluentbit/
mkdir -p /var/lib/fluent

cat >/etc/fluentbit/fluentbit.conf <<'EOF'
[INPUT]
	Name systemd
	Tag journald
	Systemd_Filter _COMM=aro
	DB /var/lib/fluent/journaldb

[FILTER]
	Name modify
	Match journald
	Remove_wildcard _
	Remove TIMESTAMP

[OUTPUT]
	Name forward
	Match *
	Port 29230
EOF

echo "FLUENTBITIMAGE=$FLUENTBITIMAGE" >/etc/sysconfig/fluentbit

cat >/etc/systemd/system/fluentbit.service <<'EOF'
[Unit]
After=network-online.target
Wants=network-online.target
StartLimitIntervalSec=0

[Service]
RestartSec=1s
EnvironmentFile=/etc/sysconfig/fluentbit
ExecStartPre=-/usr/bin/docker rm -f %N
ExecStart=/usr/bin/docker run \
  --security-opt label=disable \
  --entrypoint /opt/td-agent-bit/bin/td-agent-bit \
  --net=host \
  --hostname %H \
  --name %N \
  --rm \
  --cap-drop net_raw \
  -v /etc/fluentbit/fluentbit.conf:/etc/fluentbit/fluentbit.conf \
  -v /var/lib/fluent:/var/lib/fluent:z \
  -v /var/log/journal:/var/log/journal:ro \
  -v /etc/machine-id:/etc/machine-id:ro \
  $FLUENTBITIMAGE \
  -c /etc/fluentbit/fluentbit.conf

ExecStop=/usr/bin/docker stop %N
Restart=always
RestartSec=5
StartLimitInterval=0

[Install]
WantedBy=multi-user.target
EOF

echo "configuring mdm service"
cat >/etc/sysconfig/mdm <<EOF
MDMFRONTENDURL='$MDMFRONTENDURL'
MDMIMAGE='$MDMIMAGE'
MDMSOURCEENVIRONMENT='$LOCATION'
MDMSOURCEROLE=gateway
MDMSOURCEROLEINSTANCE='$(hostname)'
EOF

mkdir /var/etw
cat >/etc/systemd/system/mdm.service <<'EOF'
[Unit]
After=network-online.target
Wants=network-online.target

[Service]
EnvironmentFile=/etc/sysconfig/mdm
ExecStartPre=-/usr/bin/docker rm -f %N
ExecStart=/usr/bin/docker run \
  --entrypoint /usr/sbin/MetricsExtension \
  --hostname %H \
  --name %N \
  --rm \
  --cap-drop net_raw \
  -m 2g \
  -v /etc/mdm.pem:/etc/mdm.pem \
  -v /var/etw:/var/etw:z \
  $MDMIMAGE \
  -CertFile /etc/mdm.pem \
  -FrontEndUrl $MDMFRONTENDURL \
  -Logger Console \
  -LogLevel Warning \
  -PrivateKeyFile /etc/mdm.pem \
  -SourceEnvironment $MDMSOURCEENVIRONMENT \
  -SourceRole $MDMSOURCEROLE \
  -SourceRoleInstance $MDMSOURCEROLEINSTANCE
ExecStop=/usr/bin/docker stop %N
Restart=always
RestartSec=1
StartLimitInterval=0

[Install]
WantedBy=multi-user.target
EOF

echo "configuring aro-gateway service"
cat >/etc/sysconfig/aro-gateway <<EOF
ACR_RESOURCE_ID='$ACRRESOURCEID'
DATABASE_ACCOUNT_NAME='$DATABASEACCOUNTNAME'
AZURE_DBTOKEN_CLIENT_ID='$DBTOKENCLIENTID'
DBTOKEN_URL='$DBTOKENURL'
MDM_ACCOUNT="$RPMDMACCOUNT"
MDM_NAMESPACE=Gateway
GATEWAY_DOMAINS='$GATEWAYDOMAINS'
GATEWAY_FEATURES='$GATEWAYFEATURES'
RPIMAGE='$RPIMAGE'
EOF

cat >/etc/systemd/system/aro-gateway.service <<'EOF'
[Unit]
After=network-online.target
Wants=network-online.target

[Service]
EnvironmentFile=/etc/sysconfig/aro-gateway
ExecStartPre=-/usr/bin/docker rm -f %N
ExecStart=/usr/bin/docker run \
  --hostname %H \
  --name %N \
  --rm \
  --cap-drop net_raw \
  -e ACR_RESOURCE_ID \
  -e DATABASE_ACCOUNT_NAME \
  -e AZURE_DBTOKEN_CLIENT_ID \
  -e DBTOKEN_URL \
  -e GATEWAY_DOMAINS \
  -e GATEWAY_FEATURES \
  -e MDM_ACCOUNT \
  -e MDM_NAMESPACE \
  -m 2g \
  -p 80:8080 \
  -p 8081:8081 \
  -p 443:8443 \
  -v /run/systemd/journal:/run/systemd/journal \
  -v /var/etw:/var/etw:z \
  $RPIMAGE \
  gateway
ExecStop=/usr/bin/docker stop -t 3600 %N
TimeoutStopSec=3600
Restart=always
RestartSec=1
StartLimitInterval=0

[Install]
WantedBy=multi-user.target
EOF

chcon -R system_u:object_r:var_log_t:s0 /var/opt/microsoft/linuxmonagent

mkdir -p /var/lib/waagent/Microsoft.Azure.KeyVault.Store

echo "configuring mdsd and mdm services"
for var in "mdsd" "mdm"; do
cat >/etc/systemd/system/download-$var-credentials.service <<EOF
[Unit]
Description=Periodic $var credentials refresh

[Service]
Type=oneshot
ExecStart=/usr/local/bin/download-credentials.sh $var
EOF

cat >/etc/systemd/system/download-$var-credentials.timer <<EOF
[Unit]
Description=Periodic $var credentials refresh
After=network-online.target
Wants=network-online.target

[Timer]
OnBootSec=0min
OnCalendar=0/12:00:00
AccuracySec=5s

[Install]
WantedBy=timers.target
EOF
done

cat >/usr/local/bin/download-credentials.sh <<EOF
#!/bin/bash
set -eu

COMPONENT="\$1"
echo "Download \$COMPONENT credentials"

TEMP_DIR=\$(mktemp -d)
export AZURE_CONFIG_DIR=\$(mktemp -d)

echo "Logging into Azure..."
RETRIES=3
while [ "\$RETRIES" -gt 0 ]; do
    if az login -i --allow-no-subscriptions
    then
        echo "az login successful"
        break
    else
        echo "az login failed. Retrying..."
        let RETRIES-=1
        sleep 5
    fi
done

trap "cleanup" EXIT

cleanup() {
  az logout
  [[ "\$TEMP_DIR" =~ /tmp/.+ ]] && rm -rf \$TEMP_DIR
  [[ "\$AZURE_CONFIG_DIR" =~ /tmp/.+ ]] && rm -rf \$AZURE_CONFIG_DIR
}

if [ "\$COMPONENT" = "mdm" ]; then
  CURRENT_CERT_FILE="/etc/mdm.pem"
elif [ "\$COMPONENT" = "mdsd" ]; then
  CURRENT_CERT_FILE="/var/lib/waagent/Microsoft.Azure.KeyVault.Store/mdsd.pem"
else
  echo Invalid usage && exit 1
fi

SECRET_NAME="gwy-\${COMPONENT}"
NEW_CERT_FILE="\$TEMP_DIR/\$COMPONENT.pem"
for attempt in {1..5}; do
  az keyvault secret download --file \$NEW_CERT_FILE --id "https://$KEYVAULTPREFIX-gwy.$KEYVAULTDNSSUFFIX/secrets/\$SECRET_NAME" && break
  if [[ \$attempt -lt 5 ]]; then sleep 10; else exit 1; fi
done

if [ -f \$NEW_CERT_FILE ]; then
  if [ "\$COMPONENT" = "mdsd" ]; then
    chown syslog:syslog \$NEW_CERT_FILE
  else
    sed -i -ne '1,/END CERTIFICATE/ p' \$NEW_CERT_FILE
  fi
  if ! diff \$NEW_CERT_FILE \$CURRENT_CERT_FILE >/dev/null 2>&1; then
    chmod 0600 \$NEW_CERT_FILE
    mv \$NEW_CERT_FILE \$CURRENT_CERT_FILE
  fi
else
  echo Failed to refresh certificate for \$COMPONENT && exit 1
fi
EOF

chmod u+x /usr/local/bin/download-credentials.sh

systemctl enable download-mdsd-credentials.timer
systemctl enable download-mdm-credentials.timer

/usr/local/bin/download-credentials.sh mdsd
/usr/local/bin/download-credentials.sh mdm
MDSDCERTIFICATESAN=$(openssl x509 -in /var/lib/waagent/Microsoft.Azure.KeyVault.Store/mdsd.pem -noout -subject | sed -e 's/.*CN = //')

cat >/etc/systemd/system/watch-mdm-credentials.service <<EOF
[Unit]
Description=Watch for changes in mdm.pem and restart the mdm service

[Service]
Type=oneshot
ExecStart=/usr/bin/systemctl restart mdm.service

[Install]
WantedBy=multi-user.target
EOF

cat >/etc/systemd/system/watch-mdm-credentials.path <<EOF
[Path]
PathModified=/etc/mdm.pem

[Install]
WantedBy=multi-user.target
EOF

systemctl enable watch-mdm-credentials.path
systemctl start watch-mdm-credentials.path

mkdir /etc/systemd/system/mdsd.service.d
cat >/etc/systemd/system/mdsd.service.d/override.conf <<'EOF'
[Unit]
After=network-online.target
EOF

cat >/etc/default/mdsd <<EOF
MDSD_ROLE_PREFIX=/var/run/mdsd/default
MDSD_OPTIONS="-A -d -r \$MDSD_ROLE_PREFIX"

export MONITORING_GCS_ENVIRONMENT='$MDSDENVIRONMENT'
export MONITORING_GCS_ACCOUNT='$RPMDSDACCOUNT'
export MONITORING_GCS_REGION='$LOCATION'
export MONITORING_GCS_AUTH_ID_TYPE=AuthKeyVault
export MONITORING_GCS_AUTH_ID='$MDSDCERTIFICATESAN'
export MONITORING_GCS_NAMESPACE='$RPMDSDNAMESPACE'
export MONITORING_CONFIG_VERSION='$GATEWAYMDSDCONFIGVERSION'
export MONITORING_USE_GENEVA_CONFIG_SERVICE=true

export MONITORING_TENANT='$LOCATION'
export MONITORING_ROLE=gateway
export MONITORING_ROLE_INSTANCE='$(hostname)'

export MDSD_MSGPACK_SORT_COLUMNS=1
EOF

# setting MONITORING_GCS_AUTH_ID_TYPE=AuthKeyVault seems to have caused mdsd not
# to honour SSL_CERT_FILE any more, heaven only knows why.
mkdir -p /usr/lib/ssl/certs
csplit -f /usr/lib/ssl/certs/cert- -b %03d.pem /etc/pki/tls/certs/ca-bundle.crt /^$/1 {*} >/dev/null
c_rehash /usr/lib/ssl/certs

# we leave clientId blank as long as only one managed identity is assigned to the vmss
# if we have more than one, we will need to populate it with the clientId used for off-node scanning
cat >/etc/default/vsa-nodescan-agent.config <<EOF
{
    "Nice": 19,
    "Timeout": 10800,
    "ClientId": "",
    "TenantId": "$AZURESECPACKVSATENANTID",
    "QualysStoreBaseUrl": "$AZURESECPACKQUALYSURL",
    "ProcessTimeout": 300,
    "CommandDelay": 0
  }
EOF

# we run a cron job every hour to ensure the directory below is accessible by the
# correct user: it gets created by root, and a race condition can otherwise leave
# root owning the dir instead of syslog
# TODO: https://msazure.visualstudio.com/AzureRedHatOpenShift/_workitems/edit/12591207
cat >/etc/cron.d/mdsd-chown-workaround <<EOF
SHELL=/bin/bash
PATH=/bin
0 * * * * root chown syslog:syslog /var/opt/microsoft/linuxmonagent/eh/EventNotice/arorplogs*
EOF

echo "enabling aro services"
for service in aro-gateway auoms azsecd azsecmond mdsd mdm chronyd fluentbit; do
  systemctl enable $service.service
done

for scan in baseline clamav software; do
  /usr/local/bin/azsecd config -s $scan -d P1D
done

echo "rebooting"
restorecon -RF /var/log/*
(sleep 30; reboot) &
')))]" + "script": "[base64(concat(base64ToString('c2V0IC1leAoK'),'ACRRESOURCEID=$(base64 -d \u003c\u003c\u003c''',base64(parameters('acrResourceId')),''')\n','AZURECLOUDNAME=$(base64 -d \u003c\u003c\u003c''',base64(parameters('azureCloudName')),''')\n','AZURESECPACKQUALYSURL=$(base64 -d \u003c\u003c\u003c''',base64(parameters('azureSecPackQualysUrl')),''')\n','AZURESECPACKVSATENANTID=$(base64 -d \u003c\u003c\u003c''',base64(parameters('azureSecPackVSATenantId')),''')\n','DATABASEACCOUNTNAME=$(base64 -d \u003c\u003c\u003c''',base64(parameters('databaseAccountName')),''')\n','DBTOKENCLIENTID=$(base64 -d \u003c\u003c\u003c''',base64(parameters('dbtokenClientId')),''')\n','DBTOKENURL=$(base64 -d \u003c\u003c\u003c''',base64(parameters('dbtokenUrl')),''')\n','MDMFRONTENDURL=$(base64 -d \u003c\u003c\u003c''',base64(parameters('mdmFrontendUrl')),''')\n','MDSDENVIRONMENT=$(base64 -d \u003c\u003c\u003c''',base64(parameters('mdsdEnvironment')),''')\n','FLUENTBITIMAGE=$(base64 -d \u003c\u003c\u003c''',base64(parameters('fluentbitImage')),''')\n','GATEWAYMDSDCONFIGVERSION=$(base64 -d \u003c\u003c\u003c''',base64(parameters('gatewayMdsdConfigVersion')),''')\n','GATEWAYDOMAINS=$(base64 -d \u003c\u003c\u003c''',base64(parameters('gatewayDomains')),''')\n','GATEWAYFEATURES=$(base64 -d \u003c\u003c\u003c''',base64(parameters('gatewayFeatures')),''')\n','KEYVAULTDNSSUFFIX=$(base64 -d \u003c\u003c\u003c''',base64(parameters('keyvaultDNSSuffix')),''')\n','KEYVAULTPREFIX=$(base64 -d \u003c\u003c\u003c''',base64(parameters('keyvaultPrefix')),''')\n','RPIMAGE=$(base64 -d \u003c\u003c\u003c''',base64(parameters('rpImage')),''')\n','RPMDMACCOUNT=$(base64 -d \u003c\u003c\u003c''',base64(parameters('rpMdmAccount')),''')\n','RPMDSDACCOUNT=$(base64 -d \u003c\u003c\u003c''',base64(parameters('rpMdsdAccount')),''')\n','RPMDSDNAMESPACE=$(base64 -d \u003c\u003c\u003c''',base64(parameters('rpMdsdNamespace')),''')\n','MDMIMAGE=''/genevamdm:2.2023.609.2051-821f47-20230706t0953''\n','LOCATION=$(base64 -d \u003c\u003c\u003c''',base64(resourceGroup().location),''')\n','SUBSCRIPTIONID=$(base64 -d \u003c\u003c\u003c''',base64(subscription().subscriptionId),''')\n','RESOURCEGROUPNAME=$(base64 -d \u003c\u003c\u003c''',base64(resourceGroup().name),''')\n','\n',base64ToString('#!/bin/bash

echo "setting ssh password authentication"
# We need to manually set PasswordAuthentication to true in order for the VMSS Access JIT to work
sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/g' /etc/ssh/sshd_config
systemctl reload sshd.service

echo "running RHUI fix"
yum update -y --disablerepo='*' --enablerepo='rhui-microsoft-azure*'

echo "running yum update"
yum -y -x WALinuxAgent -x WALinuxAgent-udev update --allowerasing

echo "extending partition table"
# Linux block devices are inconsistently named
# it's difficult to tie the lvm pv to the physical disk using /dev/disk files, which is why lvs is used here
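# A worked example of the pipeline below (values are illustrative, not taken from a real host):
# "lvs -o devices -a" prints a Devices header and then one line per LV, e.g. "  /dev/sda4(0)";
# head -n2 | tail -n1 grabs the first data line, cut -d ' ' -f 3 strips the leading spaces,
# cut -d \( -f 1 drops the "(0)" extent suffix, and tr -d '[:digit:]' removes the partition
# number, leaving the bare disk (e.g. /dev/sda) for growpart to grow partition 2 on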
physical_disk="$(lvs -o devices -a | head -n2 | tail -n1 | cut -d ' ' -f 3 | cut -d \( -f 1 | tr -d '[:digit:]')"
growpart "$physical_disk" 2

echo "extending filesystems"
lvextend -l +20%FREE /dev/rootvg/rootlv
xfs_growfs /

lvextend -l +100%FREE /dev/rootvg/varlv
xfs_growfs /var

rpm --import https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8
rpm --import https://packages.microsoft.com/keys/microsoft.asc

for attempt in {1..5}; do
  yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm && break
  if [[ ${attempt} -lt 5 ]]; then sleep 10; else exit 1; fi
done

echo "configuring logrotate"

# gateway_logdir is a readonly variable that specifies the host path mount point for the gateway container log file
# for the purpose of rotating the gateway logs
declare -r gateway_logdir='/var/log/aro-gateway'

cat >/etc/logrotate.conf <<EOF
# see "man logrotate" for details
# rotate log files weekly
weekly

# keep 2 weeks worth of backlogs
rotate 2

# create new (empty) log files after rotating old ones
create

# use date as a suffix of the rotated file
dateext

# uncomment this if you want your log files compressed
compress

# RPM packages drop log rotation information into this directory
include /etc/logrotate.d

# no packages own wtmp and btmp -- we'll rotate them here
/var/log/wtmp {
    monthly
    create 0664 root utmp
    minsize 1M
    rotate 1
}

/var/log/btmp {
    missingok
    monthly
    create 0600 root utmp
    rotate 1
}

# Maximum log directory size is 100G with this configuration
# Setting limit to 100G to allow space for other logging services
# copytruncate is a critical option used to prevent logs from being shipped twice
${gateway_logdir} {
    size 20G
    rotate 5
    create 0600 root root
    copytruncate
    noolddir
    compress
}
EOF
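# To sanity-check this configuration by hand, "logrotate -d /etc/logrotate.conf" parses the
# file and reports what would be rotated without modifying any logs (debug/dry-run mode)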

echo "configuring yum repository and running yum update"
cat >/etc/yum.repos.d/azure.repo <<'EOF'
[azure-cli]
name=azure-cli
baseurl=https://packages.microsoft.com/yumrepos/azure-cli
enabled=yes
gpgcheck=yes

[azurecore]
name=azurecore
baseurl=https://packages.microsoft.com/yumrepos/azurecore
enabled=yes
gpgcheck=no
EOF

semanage fcontext -a -t var_log_t "/var/log/journal(/.*)?"
mkdir -p /var/log/journal
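# The var_log_t context registered above is applied by the restorecon run at the end of this
# script; creating /var/log/journal switches journald to persistent on-disk storage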

for attempt in {1..5}; do
  yum -y install clamav azsec-clamav azsec-monitor azure-cli azure-mdsd azure-security podman-docker openssl-perl python3 && break
  # hack - we are installing python3 on hosts due to an issue with Azure Linux Extensions https://github.com/Azure/azure-linux-extensions/pull/1505
  if [[ ${attempt} -lt 5 ]]; then sleep 10; else exit 1; fi
done

echo "applying firewall rules"
# https://access.redhat.com/security/cve/cve-2020-13401
cat >/etc/sysctl.d/02-disable-accept-ra.conf <<'EOF'
net.ipv6.conf.all.accept_ra=0
EOF

cat >/etc/sysctl.d/01-disable-core.conf <<'EOF'
kernel.core_pattern = |/bin/true
EOF
sysctl --system

firewall-cmd --add-port=80/tcp --permanent
firewall-cmd --add-port=8081/tcp --permanent
firewall-cmd --add-port=443/tcp --permanent

echo "logging into prod acr"
export AZURE_CLOUD_NAME=$AZURECLOUDNAME
az login -i --allow-no-subscriptions

# The managed identity that the VM runs as only has a single role assignment.
# This role assignment is ACRPull which is not necessarily present in the
# subscription we're deploying into.  If the identity does not have any
# role assignments scoped on the subscription we're deploying into, it will
# not show on az login -i, which is why the below line is commented.
# az account set -s "$SUBSCRIPTIONID"

# Suppress emulation output for podman instead of docker for az acr compatibility
mkdir -p /etc/containers/
touch /etc/containers/nodocker

mkdir -p /root/.docker
REGISTRY_AUTH_FILE=/root/.docker/config.json az acr login --name "$(sed -e 's|.*/||' <<<"$ACRRESOURCEID")"
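# The sed above keeps everything after the last "/" of the ACR resource ID, i.e. the registry
# name: a hypothetical ".../registries/arosvc" becomes "arosvc", which is the value that
# az acr login expects for --name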

MDMIMAGE="${RPIMAGE%%/*}/${MDMIMAGE##*/}"
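# Shell parameter expansion above: ${RPIMAGE%%/*} keeps the registry host (everything before
# the first "/") and ${MDMIMAGE##*/} keeps the repository:tag (everything after the last "/"),
# so the Geneva MDM image is pulled from the same registry as the RP image. Illustrative only:
# RPIMAGE=arosvc.azurecr.io/aro:tag and MDMIMAGE=/genevamdm:tag combine to
# arosvc.azurecr.io/genevamdm:tag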
docker pull "$MDMIMAGE"
docker pull "$RPIMAGE"
docker pull "$FLUENTBITIMAGE"

az logout

echo "configuring fluentbit service"
mkdir -p /etc/fluentbit/
mkdir -p /var/lib/fluent

cat >/etc/fluentbit/fluentbit.conf <<'EOF'
[INPUT]
	Name systemd
	Tag journald
	Systemd_Filter _COMM=aro
	DB /var/lib/fluent/journaldb

[FILTER]
	Name modify
	Match journald
	Remove_wildcard _
	Remove TIMESTAMP

[OUTPUT]
	Name forward
	Match *
	Port 29230
EOF

echo "FLUENTBITIMAGE=$FLUENTBITIMAGE" >/etc/sysconfig/fluentbit

cat >/etc/systemd/system/fluentbit.service <<'EOF'
[Unit]
After=network-online.target
Wants=network-online.target
StartLimitIntervalSec=0

[Service]
RestartSec=1s
EnvironmentFile=/etc/sysconfig/fluentbit
ExecStartPre=-/usr/bin/docker rm -f %N
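# %N and %H below are systemd specifiers: %N expands to the unit name (fluentbit) and %H to
# the machine hostname, so the container is named after the unit that runs it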
ExecStart=/usr/bin/docker run \
  --security-opt label=disable \
  --entrypoint /opt/td-agent-bit/bin/td-agent-bit \
  --net=host \
  --hostname %H \
  --name %N \
  --rm \
  --cap-drop net_raw \
  -v /etc/fluentbit/fluentbit.conf:/etc/fluentbit/fluentbit.conf \
  -v /var/lib/fluent:/var/lib/fluent:z \
  -v /var/log/journal:/var/log/journal:ro \
  -v /etc/machine-id:/etc/machine-id:ro \
  $FLUENTBITIMAGE \
  -c /etc/fluentbit/fluentbit.conf

ExecStop=/usr/bin/docker stop %N
Restart=always
RestartSec=5
StartLimitInterval=0

[Install]
WantedBy=multi-user.target
EOF

echo "configuring mdm service"
cat >/etc/sysconfig/mdm <<EOF
MDMFRONTENDURL='$MDMFRONTENDURL'
MDMIMAGE='$MDMIMAGE'
MDMSOURCEENVIRONMENT='$LOCATION'
MDMSOURCEROLE=gateway
MDMSOURCEROLEINSTANCE='$(hostname)'
EOF

mkdir /var/etw
cat >/etc/systemd/system/mdm.service <<'EOF'
[Unit]
After=network-online.target
Wants=network-online.target

[Service]
EnvironmentFile=/etc/sysconfig/mdm
ExecStartPre=-/usr/bin/docker rm -f %N
ExecStart=/usr/bin/docker run \
  --entrypoint /usr/sbin/MetricsExtension \
  --hostname %H \
  --name %N \
  --rm \
  --cap-drop net_raw \
  -m 2g \
  -v /etc/mdm.pem:/etc/mdm.pem \
  -v /var/etw:/var/etw:z \
  $MDMIMAGE \
  -CertFile /etc/mdm.pem \
  -FrontEndUrl $MDMFRONTENDURL \
  -Logger Console \
  -LogLevel Warning \
  -PrivateKeyFile /etc/mdm.pem \
  -SourceEnvironment $MDMSOURCEENVIRONMENT \
  -SourceRole $MDMSOURCEROLE \
  -SourceRoleInstance $MDMSOURCEROLEINSTANCE
ExecStop=/usr/bin/docker stop %N
Restart=always
RestartSec=1
StartLimitInterval=0

[Install]
WantedBy=multi-user.target
EOF

echo "configuring aro-gateway service"
cat >/etc/sysconfig/aro-gateway <<EOF
ACR_RESOURCE_ID='$ACRRESOURCEID'
DATABASE_ACCOUNT_NAME='$DATABASEACCOUNTNAME'
AZURE_DBTOKEN_CLIENT_ID='$DBTOKENCLIENTID'
DBTOKEN_URL='$DBTOKENURL'
MDM_ACCOUNT="$RPMDMACCOUNT"
MDM_NAMESPACE=Gateway
GATEWAY_DOMAINS='$GATEWAYDOMAINS'
GATEWAY_FEATURES='$GATEWAYFEATURES'
RPIMAGE='$RPIMAGE'
EOF
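# Unlike the other unit heredocs in this script, the one below uses an unquoted EOF delimiter
# so that ${gateway_logdir} is expanded when the unit file is written. Runtime variables such
# as $RPIMAGE are therefore escaped as \$RPIMAGE: they land literally in the unit file and are
# resolved by systemd from the EnvironmentFile at ExecStart time.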

cat >/etc/systemd/system/aro-gateway.service <<EOF
[Unit]
After=network-online.target
Wants=network-online.target

[Service]
EnvironmentFile=/etc/sysconfig/aro-gateway
ExecStartPre=-/usr/bin/docker rm -f %N
ExecStartPre=/usr/bin/mkdir -p ${gateway_logdir}
ExecStart=/usr/bin/docker run \
  --hostname %H \
  --name %N \
  --rm \
  --cap-drop net_raw \
  -e ACR_RESOURCE_ID \
  -e DATABASE_ACCOUNT_NAME \
  -e AZURE_DBTOKEN_CLIENT_ID \
  -e DBTOKEN_URL \
  -e GATEWAY_DOMAINS \
  -e GATEWAY_FEATURES \
  -e MDM_ACCOUNT \
  -e MDM_NAMESPACE \
  -m 2g \
  -p 80:8080 \
  -p 8081:8081 \
  -p 443:8443 \
  -v /run/systemd/journal:/run/systemd/journal \
  -v /var/etw:/var/etw:z \
  -v ${gateway_logdir}:/ctr.log:z \
  \$RPIMAGE \
  gateway
ExecStop=/usr/bin/docker stop -t 3600 %N
TimeoutStopSec=3600
Restart=always
RestartSec=1
StartLimitInterval=0

[Install]
WantedBy=multi-user.target
EOF

chcon -R system_u:object_r:var_log_t:s0 /var/opt/microsoft/linuxmonagent

mkdir -p /var/lib/waagent/Microsoft.Azure.KeyVault.Store

echo "configuring mdsd and mdm services"
for var in "mdsd" "mdm"; do
cat >/etc/systemd/system/download-$var-credentials.service <<EOF
[Unit]
Description=Periodic $var credentials refresh

[Service]
Type=oneshot
ExecStart=/usr/local/bin/download-credentials.sh $var
EOF

cat >/etc/systemd/system/download-$var-credentials.timer <<EOF
[Unit]
Description=Periodic $var credentials refresh
After=network-online.target
Wants=network-online.target

[Timer]
OnBootSec=0min
OnCalendar=0/12:00:00
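# OnCalendar=0/12:00:00 fires at 00:00 and 12:00 each day, so credentials are refreshed every
# twelve hours on top of the immediate refresh at boot (OnBootSec=0min)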
AccuracySec=5s

[Install]
WantedBy=timers.target
EOF
done
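
# The generated script below relies on two-level expansion: $KEYVAULTPREFIX and
# $KEYVAULTDNSSUFFIX are substituted now, while every variable escaped as \$NAME is written
# out literally and only evaluated when download-credentials.sh itself runs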

cat >/usr/local/bin/download-credentials.sh <<EOF
#!/bin/bash
set -eu

COMPONENT="\$1"
echo "Download \$COMPONENT credentials"

TEMP_DIR=\$(mktemp -d)
export AZURE_CONFIG_DIR=\$(mktemp -d)

echo "Logging into Azure..."
RETRIES=3
while [ "\$RETRIES" -gt 0 ]; do
    if az login -i --allow-no-subscriptions
    then
        echo "az login successful"
        break
    else
        echo "az login failed. Retrying..."
        let RETRIES-=1
        sleep 5
    fi
done

trap "cleanup" EXIT

cleanup() {
  az logout
  [[ "\$TEMP_DIR" =~ /tmp/.+ ]] && rm -rf \$TEMP_DIR
  [[ "\$AZURE_CONFIG_DIR" =~ /tmp/.+ ]] && rm -rf \$AZURE_CONFIG_DIR
}

if [ "\$COMPONENT" = "mdm" ]; then
  CURRENT_CERT_FILE="/etc/mdm.pem"
elif [ "\$COMPONENT" = "mdsd" ]; then
  CURRENT_CERT_FILE="/var/lib/waagent/Microsoft.Azure.KeyVault.Store/mdsd.pem"
else
  echo Invalid usage && exit 1
fi

SECRET_NAME="gwy-\${COMPONENT}"
NEW_CERT_FILE="\$TEMP_DIR/\$COMPONENT.pem"
for attempt in {1..5}; do
  az keyvault secret download --file \$NEW_CERT_FILE --id "https://$KEYVAULTPREFIX-gwy.$KEYVAULTDNSSUFFIX/secrets/\$SECRET_NAME" && break
  if [[ \$attempt -lt 5 ]]; then sleep 10; else exit 1; fi
done

if [ -f \$NEW_CERT_FILE ]; then
  if [ "\$COMPONENT" = "mdsd" ]; then
    chown syslog:syslog \$NEW_CERT_FILE
  else
    sed -i -ne '1,/END CERTIFICATE/ p' \$NEW_CERT_FILE
  fi
  if ! diff \$NEW_CERT_FILE \$CURRENT_CERT_FILE >/dev/null 2>&1; then
    chmod 0600 \$NEW_CERT_FILE
    mv \$NEW_CERT_FILE \$CURRENT_CERT_FILE
  fi
else
  echo Failed to refresh certificate for \$COMPONENT && exit 1
fi
EOF

chmod u+x /usr/local/bin/download-credentials.sh

systemctl enable download-mdsd-credentials.timer
systemctl enable download-mdm-credentials.timer

/usr/local/bin/download-credentials.sh mdsd
/usr/local/bin/download-credentials.sh mdm
MDSDCERTIFICATESAN=$(openssl x509 -in /var/lib/waagent/Microsoft.Azure.KeyVault.Store/mdsd.pem -noout -subject | sed -e 's/.*CN = //')
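# The -subject output looks something like "subject=CN = gwy.example.com" (the exact format
# varies with the openssl version; assumed here), so the sed strips everything up to "CN = ",
# leaving the certificate's common name to use as the mdsd Geneva auth identity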

cat >/etc/systemd/system/watch-mdm-credentials.service <<EOF
[Unit]
Description=Watch for changes in mdm.pem and restart the mdm service

[Service]
Type=oneshot
ExecStart=/usr/bin/systemctl restart mdm.service

[Install]
WantedBy=multi-user.target
EOF

cat >/etc/systemd/system/watch-mdm-credentials.path <<EOF
[Path]
PathModified=/etc/mdm.pem

[Install]
WantedBy=multi-user.target
EOF
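# A .path unit with no explicit Unit= setting activates the service of the same name, so any
# modification of /etc/mdm.pem triggers watch-mdm-credentials.service, which in turn restarts
# mdm.service to pick up the rotated certificate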

systemctl enable watch-mdm-credentials.path
systemctl start watch-mdm-credentials.path

mkdir /etc/systemd/system/mdsd.service.d
cat >/etc/systemd/system/mdsd.service.d/override.conf <<'EOF'
[Unit]
After=network-online.target
EOF

cat >/etc/default/mdsd <<EOF
MDSD_ROLE_PREFIX=/var/run/mdsd/default
MDSD_OPTIONS="-A -d -r \$MDSD_ROLE_PREFIX"

export MONITORING_GCS_ENVIRONMENT='$MDSDENVIRONMENT'
export MONITORING_GCS_ACCOUNT='$RPMDSDACCOUNT'
export MONITORING_GCS_REGION='$LOCATION'
export MONITORING_GCS_AUTH_ID_TYPE=AuthKeyVault
export MONITORING_GCS_AUTH_ID='$MDSDCERTIFICATESAN'
export MONITORING_GCS_NAMESPACE='$RPMDSDNAMESPACE'
export MONITORING_CONFIG_VERSION='$GATEWAYMDSDCONFIGVERSION'
export MONITORING_USE_GENEVA_CONFIG_SERVICE=true

export MONITORING_TENANT='$LOCATION'
export MONITORING_ROLE=gateway
export MONITORING_ROLE_INSTANCE='$(hostname)'

export MDSD_MSGPACK_SORT_COLUMNS=1
EOF

# setting MONITORING_GCS_AUTH_ID_TYPE=AuthKeyVault seems to have caused mdsd not
# to honour SSL_CERT_FILE any more, heaven only knows why.
mkdir -p /usr/lib/ssl/certs
csplit -f /usr/lib/ssl/certs/cert- -b %03d.pem /etc/pki/tls/certs/ca-bundle.crt /^$/1 {*} >/dev/null
c_rehash /usr/lib/ssl/certs
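# csplit splits the single CA bundle into one cert-NNN.pem file per certificate (breaking on
# the blank line between PEM blocks), and c_rehash creates the subject-hash symlinks that
# OpenSSL needs to look up certificates in a directory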

# we leave clientId blank as long as only one managed identity is assigned to the vmss
# if we have more than one, we will need to populate it with the clientId used for off-node scanning
cat >/etc/default/vsa-nodescan-agent.config <<EOF
{
    "Nice": 19,
    "Timeout": 10800,
    "ClientId": "",
    "TenantId": "$AZURESECPACKVSATENANTID",
    "QualysStoreBaseUrl": "$AZURESECPACKQUALYSURL",
    "ProcessTimeout": 300,
    "CommandDelay": 0
  }
EOF

# we run a cron job every hour to ensure the directory below is accessible by the
# correct user: it gets created by root, and a race condition can otherwise leave
# root owning the dir instead of syslog
# TODO: https://msazure.visualstudio.com/AzureRedHatOpenShift/_workitems/edit/12591207
cat >/etc/cron.d/mdsd-chown-workaround <<EOF
SHELL=/bin/bash
PATH=/bin
0 * * * * root chown syslog:syslog /var/opt/microsoft/linuxmonagent/eh/EventNotice/arorplogs*
EOF

echo "enabling aro services"
for service in aro-gateway auoms azsecd azsecmond mdsd mdm chronyd fluentbit; do
  systemctl enable $service.service
done

for scan in baseline clamav software; do
  /usr/local/bin/azsecd config -s $scan -d P1D
done

echo "rebooting"
restorecon -RF /var/log/*
(sleep 30; reboot) &
')))]" } } } diff --git a/pkg/deploy/generator/scripts/gatewayVMSS.sh b/pkg/deploy/generator/scripts/gatewayVMSS.sh index 856a549a539..7e46aa7404e 100644 --- a/pkg/deploy/generator/scripts/gatewayVMSS.sh +++ b/pkg/deploy/generator/scripts/gatewayVMSS.sh @@ -14,8 +14,8 @@ yum -y -x WALinuxAgent -x WALinuxAgent-udev update --allowerasing echo "extending partition table" # Linux block devices are inconsistently named # it's difficult to tie the lvm pv to the physical disk using /dev/disk files, which is why lvs is used here -physicalDisk="$(lvs -o devices -a | head -n2 | tail -n1 | cut -d ' ' -f 3 | cut -d \( -f 1 | tr -d '[:digit:]')" -growpart "$physicalDisk" 2 +physical_disk="$(lvs -o devices -a | head -n2 | tail -n1 | cut -d ' ' -f 3 | cut -d \( -f 1 | tr -d '[:digit:]')" +growpart "$physical_disk" 2 echo "extending filesystems" lvextend -l +20%FREE /dev/rootvg/rootlv @@ -33,7 +33,12 @@ for attempt in {1..5}; do done echo "configuring logrotate" -cat >/etc/logrotate.conf <<'EOF' + +# gateway_logdir is a readonly variable that specifies the host path mount point for the gateway container log file +# for the purpose of rotating the gateway logs +declare -r gateway_logdir='/var/log/aro-gateway' + +cat >/etc/logrotate.conf </etc/systemd/system/aro-gateway.service <<'EOF' +cat >/etc/systemd/system/aro-gateway.service < 1 { + return nil, fmt.Errorf("found multiple etcd pods with conflicting IP addresses, only one degraded etcd is supported, unable to recover. Conflicting IPs found: %v", degradedEtcds) + // happens if the env variables are empty, check statuses next + } else if len(degradedEtcds) == 0 { + de = °radedEtcd{} + } else { + // array is no longer needed + de = °radedEtcds[0] + } + + return de, nil +} + +// comparePodEnvToIp compares the etcd container's environment variables to the pod's actual IP address +func findDegradedEtcd(log *logrus.Entry, pods *corev1.PodList) (*degradedEtcd, error) { + de, err := comparePodEnvToIp(log, pods) + if err != nil { + return °radedEtcd{}, err + } + + crashingPodSearchDe, err := findCrashloopingPods(log, pods) + log.Infof("Found degraded etcd while searching by Pod statuses: %v", crashingPodSearchDe) + if err != nil { + return °radedEtcd{}, err + } + + // Sanity check + // Since we are checking for both an etcd Pod with an IP mis match, and the statuses of all etcd pods, let's make sure the Pod's returned by both are the same + if de.Pod != crashingPodSearchDe.Pod && de.Pod != "" { + return de, fmt.Errorf("etcd Pod found in crashlooping state %s is not equal to etcd Pod with IP ENV mis match %s... 
failed sanity check", de.Pod, crashingPodSearchDe.Pod) + } + + // If no conflict is found a recent IP change may still be causing an issue + // Sometimes etcd can recovery the deployment itself, however there is still a data directory with the previous member's IP address present causing a failure + // This can still be remediated by relying on the pod statuses + if de.Node == "" { + log.Info("Unable to find an IP address conflict, using etcd Pod found during search by statuses") + return crashingPodSearchDe, nil + } + + return de, nil +} + +func ipFromEnv(containers []corev1.Container, podName string) string { + for _, c := range containers { + if c.Name == "etcd" { + for _, e := range c.Env { + // The environment variable that contains etcd's IP address has the following naming convention + // NODE_cluster_name_infra_ID_master_0_IP + // while the pod looks like this + // etcd-cluster-name-infra-id-master-0 + // To find the pod's IP address by variable name we use the pod's name + envName := strings.ReplaceAll(strings.ReplaceAll(podName, "-", "_"), "etcd_", "NODE_") + if e.Name == fmt.Sprintf("%s_IP", envName) { + return e.Value + } + } + } + } + + return "" +} + +func findCrashloopingPods(log *logrus.Entry, pods *corev1.PodList) (*degradedEtcd, error) { + // pods are collected in a list to check for multiple crashing etcd instances + // multiple etcd failures aren't supported so an error will be returned, rather than assuming the first found is the only one + crashingPods := &corev1.PodList{} + for _, p := range pods.Items { + for _, c := range p.Status.ContainerStatuses { + if !c.Ready && c.Name == "etcd" { + log.Infof("Found etcd container with status: %v", c) + crashingPods.Items = append(crashingPods.Items, p) + } + } + } + + if len(crashingPods.Items) > 1 { + // log multiple names in a readable way + names := []string{} + for _, c := range crashingPods.Items { + names = append(names, c.Name) + } + return nil, fmt.Errorf("only a single degraded etcd pod can can be recovered from, more than one NotReady etcd pods were found: %v", names) + } else if len(crashingPods.Items) == 0 { + return nil, errors.New("no etcd pod's were found in a CrashLoopBackOff state, unable to remediate etcd deployment") + } + crashingPod := &crashingPods.Items[0] + + return °radedEtcd{ + Node: strings.ReplaceAll(crashingPod.Name, "etcd-", ""), + Pod: crashingPod.Name, + OldIP: "unknown", + NewIP: "unknown", + }, nil +} diff --git a/pkg/frontend/fixetcd_test.go b/pkg/frontend/fixetcd_test.go new file mode 100644 index 00000000000..96d6bf28997 --- /dev/null +++ b/pkg/frontend/fixetcd_test.go @@ -0,0 +1,751 @@ +package frontend + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. 
+ +import ( + "bytes" + "context" + "errors" + "strings" + "testing" + + "github.com/Azure/go-autorest/autorest/to" + "github.com/golang/mock/gomock" + operatorv1fake "github.com/openshift/client-go/operator/clientset/versioned/typed/operator/v1/fake" + "github.com/ugorji/go/codec" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/watch" + ktesting "k8s.io/client-go/testing" + + "github.com/Azure/ARO-RP/pkg/api" + "github.com/Azure/ARO-RP/pkg/metrics/noop" + mock_adminactions "github.com/Azure/ARO-RP/pkg/util/mocks/adminactions" + testdatabase "github.com/Azure/ARO-RP/test/database" +) + +const degradedNode = "master-2" + +func TestFixEtcd(t *testing.T) { + // Context leak is intentional to make use of cancel function, and make it to our error check + ctx, ctxCancel := context.WithCancel(context.Background()) + const ( + mockSubID = "00000000-0000-0000-0000-000000000000" + mockTenantID = mockSubID + ) + resourceID := testdatabase.GetResourcePath(mockSubID, "cluster") + doc := &api.OpenShiftClusterDocument{ + Key: strings.ToLower(resourceID), + OpenShiftCluster: &api.OpenShiftCluster{ + Name: "cluster", + ID: resourceID, + Type: "Microsoft.RedHatOpenShift/openshiftClusters", + Properties: api.OpenShiftClusterProperties{ + InfraID: "zfsbk", + }, + }, + } + + type test struct { + name string + mocks func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) + wantErr string + pods *corev1.PodList + ctxCancel context.CancelFunc + cancel bool + } + + for _, tt := range []*test{ + { + name: "fail: list pods", + wantErr: "500: InternalServerError: : oh no, can't list pods", + mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) { + k.EXPECT().KubeList(ctx, "Pod", namespaceEtcds).MaxTimes(1).Return(nil, errors.New("oh no, can't list pods")) + }, + }, + { + name: "fail: invalid json, can't decode pods", + wantErr: "500: InternalServerError: : failed to decode pods, json decode error [pos 1]: only encoded map or array can decode into struct", + mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) { + buf := &bytes.Buffer{} + err := codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(`{`) + if err != nil { + t.Fatalf("failed to encode pods, %s", err.Error()) + } + k.EXPECT().KubeList(ctx, "Pod", namespaceEtcds).MaxTimes(1).Return(buf.Bytes(), nil) + }, + }, + { + name: "pass: Expected degraded etcd scenario", + pods: newEtcdPods(t, doc, false, false, false), + mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) { + buf := &bytes.Buffer{} + err := codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(pods) + if err != nil { + t.Fatalf("%s failed to encode pods, %s", t.Name(), err.Error()) + } + k.EXPECT().KubeList(ctx, "Pod", namespaceEtcds).MaxTimes(1).Return(buf.Bytes(), nil) + + // backupEtcd + jobBackupEtcd := createBackupEtcdDataJob(doc.OpenShiftCluster.Name, buildNodeName(doc, degradedNode)) + k.EXPECT().KubeCreateOrUpdate(ctx, jobBackupEtcd).MaxTimes(1).Return(nil) + + expectWatchEvent(gomock.Any(), jobBackupEtcd, k, "app", corev1.PodSucceeded, false)() + + k.EXPECT().KubeGetPodLogs(ctx, jobBackupEtcd.GetNamespace(), jobBackupEtcd.GetName(), 
jobBackupEtcd.GetName()).Times(1).Return([]byte("Backup job doing backup things..."), nil) + + propPolicy := metav1.DeletePropagationBackground + k.EXPECT().KubeDelete(ctx, "Job", namespaceEtcds, jobBackupEtcd.GetName(), true, &propPolicy).Times(1).Return(nil) + + // fixPeers + // createPrivilegedServiceAccount + serviceAcc := newServiceAccount(serviceAccountName, doc.OpenShiftCluster.Name) + clusterRole := newClusterRole(kubeServiceAccount, doc.OpenShiftCluster.Name) + crb := newClusterRoleBinding(serviceAccountName, doc.OpenShiftCluster.Name) + scc := newSecurityContextConstraint(serviceAccountName, doc.OpenShiftCluster.Name, kubeServiceAccount) + + k.EXPECT().KubeCreateOrUpdate(ctx, serviceAcc).Times(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, clusterRole).Times(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, crb).Times(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, scc).Times(1).Return(nil) + + de, err := findDegradedEtcd(ti.log, pods) + if err != nil { + t.Fatal(err) + } + peerPods, err := getPeerPods(pods.Items, de, doc.OpenShiftCluster.Name) + if err != nil { + t.Fatal(err) + } + + jobFixPeers := newJobFixPeers(doc.OpenShiftCluster.Name, peerPods, de.Node) + k.EXPECT().KubeCreateOrUpdate(ctx, jobFixPeers).Times(1).Return(nil) + expectWatchEvent(gomock.Any(), jobFixPeers, k, "app", corev1.PodSucceeded, false)() + + k.EXPECT().KubeGetPodLogs(ctx, jobFixPeers.GetNamespace(), jobFixPeers.GetName(), jobFixPeers.GetName()).Times(1).Return([]byte("Fix peer job fixing peers..."), nil) + k.EXPECT().KubeDelete(ctx, "Job", namespaceEtcds, jobFixPeers.GetName(), true, &propPolicy).Times(1).Return(nil) + + // cleanup + k.EXPECT().KubeDelete(ctx, serviceAcc.GetKind(), serviceAcc.GetNamespace(), serviceAcc.GetName(), true, nil).Times(1).Return(nil) + k.EXPECT().KubeDelete(ctx, scc.GetKind(), scc.GetNamespace(), scc.GetName(), true, nil).Times(1).Return(nil) + k.EXPECT().KubeDelete(ctx, clusterRole.GetKind(), clusterRole.GetNamespace(), clusterRole.GetName(), true, nil).Times(1).Return(nil) + k.EXPECT().KubeDelete(ctx, crb.GetKind(), crb.GetNamespace(), crb.GetName(), true, nil).Times(1).Return(nil) + + err = codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(&operatorv1fake.FakeEtcds{}) + if err != nil { + t.Fatal(err) + } + k.EXPECT().KubeGet(ctx, "Etcd", "", doc.OpenShiftCluster.Name).MaxTimes(1).Return(buf.Bytes(), nil) + + // delete secrets + for _, prefix := range []string{"etcd-peer-", "etcd-serving-", "etcd-serving-metrics-"} { + k.EXPECT().KubeDelete(ctx, "Secret", namespaceEtcds, prefix+buildNodeName(doc, degradedNode), false, nil) + } + }, + }, + { + name: "pass: Empty env vars scenario", + pods: newEtcdPods(t, doc, false, false, true), + mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) { + buf := &bytes.Buffer{} + err := codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(pods) + if err != nil { + t.Fatalf("%s failed to encode pods, %s", t.Name(), err.Error()) + } + k.EXPECT().KubeList(ctx, "Pod", namespaceEtcds).MaxTimes(1).Return(buf.Bytes(), nil) + + // backupEtcd + jobBackupEtcd := createBackupEtcdDataJob(doc.OpenShiftCluster.Name, buildNodeName(doc, degradedNode)) + k.EXPECT().KubeCreateOrUpdate(ctx, jobBackupEtcd).MaxTimes(1).Return(nil) + expectWatchEvent(gomock.Any(), jobBackupEtcd, k, "app", corev1.PodSucceeded, false)() + k.EXPECT().KubeGetPodLogs(ctx, jobBackupEtcd.GetNamespace(), jobBackupEtcd.GetName(), jobBackupEtcd.GetName()).MaxTimes(1).Return([]byte("Backup job 
doing backup things..."), nil) + propPolicy := metav1.DeletePropagationBackground + k.EXPECT().KubeDelete(ctx, "Job", namespaceEtcds, jobBackupEtcd.GetName(), true, &propPolicy).MaxTimes(1).Return(nil) + + // fixPeers + // createPrivilegedServiceAccount + serviceAcc := newServiceAccount(serviceAccountName, doc.OpenShiftCluster.Name) + clusterRole := newClusterRole(kubeServiceAccount, doc.OpenShiftCluster.Name) + crb := newClusterRoleBinding(serviceAccountName, doc.OpenShiftCluster.Name) + scc := newSecurityContextConstraint(serviceAccountName, doc.OpenShiftCluster.Name, kubeServiceAccount) + + k.EXPECT().KubeCreateOrUpdate(ctx, serviceAcc).MaxTimes(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, clusterRole).MaxTimes(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, crb).MaxTimes(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, scc).MaxTimes(1).Return(nil) + + de, err := findDegradedEtcd(ti.log, pods) + if err != nil { + t.Fatal(err) + } + peerPods, err := getPeerPods(pods.Items, de, doc.OpenShiftCluster.Name) + if err != nil { + t.Fatal(err) + } + + jobFixPeers := newJobFixPeers(doc.OpenShiftCluster.Name, peerPods, de.Node) + k.EXPECT().KubeCreateOrUpdate(ctx, jobFixPeers).MaxTimes(1).Return(nil) + expectWatchEvent(gomock.Any(), jobFixPeers, k, "app", corev1.PodSucceeded, false)() + k.EXPECT().KubeGetPodLogs(ctx, jobFixPeers.GetNamespace(), jobFixPeers.GetName(), jobFixPeers.GetName()).MaxTimes(1).Return([]byte("Fix peer job fixing peers..."), nil) + k.EXPECT().KubeDelete(ctx, "Job", namespaceEtcds, jobFixPeers.GetName(), true, &propPolicy).MaxTimes(1).Return(nil) + + // cleanup + k.EXPECT().KubeDelete(ctx, serviceAcc.GetKind(), serviceAcc.GetNamespace(), serviceAcc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, scc.GetKind(), scc.GetNamespace(), scc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, clusterRole.GetKind(), clusterRole.GetNamespace(), clusterRole.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, crb.GetKind(), crb.GetNamespace(), crb.GetName(), true, nil).MaxTimes(1).Return(nil) + + err = codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(&operatorv1fake.FakeEtcds{}) + if err != nil { + t.Fatal(err) + } + k.EXPECT().KubeGet(ctx, "Etcd", "", doc.OpenShiftCluster.Name).MaxTimes(1).Return(buf.Bytes(), nil) + + // delete secrets + for _, prefix := range []string{"etcd-peer-", "etcd-serving-", "etcd-serving-metrics-"} { + k.EXPECT().KubeDelete(ctx, "Secret", namespaceEtcds, prefix+buildNodeName(doc, degradedNode), false, nil) + } + }, + }, + { + name: "fail: Multiple degraded etcd instances scenario", + wantErr: "500: InternalServerError: : only a single degraded etcd pod can can be recovered from, more than one NotReady etcd pods were found: [etcd-cluster-zfsbk-master-0 etcd-cluster-zfsbk-master-1 etcd-cluster-zfsbk-master-2]", + pods: newEtcdPods(t, doc, false, true, true), + mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) { + buf := &bytes.Buffer{} + err := codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(pods) + if err != nil { + t.Fatalf("%s failed to encode pods, %s", t.Name(), err.Error()) + } + k.EXPECT().KubeList(ctx, "Pod", namespaceEtcds).MaxTimes(1).Return(buf.Bytes(), nil) + }, + }, + { + name: "fail: empty/correct pod env and no bad container statuses", + wantErr: "500: InternalServerError: : no etcd pod's were found in a CrashLoopBackOff state, unable to remediate etcd deployment", + 
pods: newEtcdPods(t, doc, true, false, false), + mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) { + buf := &bytes.Buffer{} + err := codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(pods) + if err != nil { + t.Fatalf("%s failed to encode pods, %s", t.Name(), err.Error()) + } + k.EXPECT().KubeList(ctx, "Pod", namespaceEtcds).MaxTimes(1).Return(buf.Bytes(), nil) + }, + }, + { + name: "fail: create job data backup", + wantErr: "500: InternalServerError: : oh no, can't create job data backup", + pods: newEtcdPods(t, doc, false, false, false), + mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) { + buf := &bytes.Buffer{} + err := codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(pods) + if err != nil { + t.Fatalf("%s failed to encode pods, %s", t.Name(), err.Error()) + } + k.EXPECT().KubeList(ctx, "Pod", namespaceEtcds).MaxTimes(1).Return(buf.Bytes(), nil) + + // backupEtcd + jobBackupEtcd := createBackupEtcdDataJob(doc.OpenShiftCluster.Name, buildNodeName(doc, degradedNode)) + k.EXPECT().KubeCreateOrUpdate(ctx, jobBackupEtcd).MaxTimes(1).Return(errors.New("oh no, can't create job data backup")) + }, + }, + { + name: "fail: create job fix peers", + wantErr: "500: InternalServerError: : oh no, can't create job fix peers", + pods: newEtcdPods(t, doc, false, false, false), + mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) { + buf := &bytes.Buffer{} + err := codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(pods) + if err != nil { + t.Fatalf("%s failed to encode pods, %s", t.Name(), err.Error()) + } + k.EXPECT().KubeList(ctx, "Pod", namespaceEtcds).MaxTimes(1).Return(buf.Bytes(), nil) + + // backupEtcd + jobBackupEtcd := createBackupEtcdDataJob(doc.OpenShiftCluster.Name, buildNodeName(doc, degradedNode)) + k.EXPECT().KubeCreateOrUpdate(ctx, jobBackupEtcd).MaxTimes(1).Return(nil) + expectWatchEvent(gomock.Any(), jobBackupEtcd, k, "app", corev1.PodSucceeded, false)() + k.EXPECT().KubeGetPodLogs(ctx, jobBackupEtcd.GetNamespace(), jobBackupEtcd.GetName(), jobBackupEtcd.GetName()).MaxTimes(1).Return([]byte("Backup job doing backup things..."), nil) + propPolicy := metav1.DeletePropagationBackground + k.EXPECT().KubeDelete(ctx, "Job", namespaceEtcds, jobBackupEtcd.GetName(), true, &propPolicy).MaxTimes(1).Return(nil) + + // fixPeers + // createPrivilegedServiceAccount + serviceAcc := newServiceAccount(serviceAccountName, doc.OpenShiftCluster.Name) + clusterRole := newClusterRole(kubeServiceAccount, doc.OpenShiftCluster.Name) + crb := newClusterRoleBinding(serviceAccountName, doc.OpenShiftCluster.Name) + scc := newSecurityContextConstraint(serviceAccountName, doc.OpenShiftCluster.Name, kubeServiceAccount) + + k.EXPECT().KubeCreateOrUpdate(ctx, serviceAcc).MaxTimes(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, clusterRole).MaxTimes(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, crb).MaxTimes(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, scc).MaxTimes(1).Return(nil) + + de, err := findDegradedEtcd(ti.log, pods) + if err != nil { + t.Fatal(err) + } + peerPods, err := getPeerPods(pods.Items, de, doc.OpenShiftCluster.Name) + if err != nil { + t.Fatal(err) + } + + jobFixPeers := newJobFixPeers(doc.OpenShiftCluster.Name, peerPods, de.Node) + k.EXPECT().KubeCreateOrUpdate(ctx, jobFixPeers).MaxTimes(1).Return(errors.New("oh 
no, can't create job fix peers")) + expectWatchEvent(gomock.Any(), jobFixPeers, k, "app", corev1.PodSucceeded, false)() + + // cleanup + k.EXPECT().KubeDelete(ctx, serviceAcc.GetKind(), serviceAcc.GetNamespace(), serviceAcc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, scc.GetKind(), scc.GetNamespace(), scc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, clusterRole.GetKind(), clusterRole.GetNamespace(), clusterRole.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, crb.GetKind(), crb.GetNamespace(), crb.GetName(), true, nil).MaxTimes(1).Return(nil) + }, + }, + { + name: "fail: create service account", + wantErr: "500: InternalServerError: : oh no, can't create service account %!!(MISSING)s()", + pods: newEtcdPods(t, doc, false, false, false), + mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) { + buf := &bytes.Buffer{} + err := codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(pods) + if err != nil { + t.Fatalf("%s failed to encode pods, %s", t.Name(), err.Error()) + } + k.EXPECT().KubeList(ctx, "Pod", namespaceEtcds).MaxTimes(1).Return(buf.Bytes(), nil) + + // backupEtcd + jobBackupEtcd := createBackupEtcdDataJob(doc.OpenShiftCluster.Name, buildNodeName(doc, degradedNode)) + k.EXPECT().KubeCreateOrUpdate(ctx, jobBackupEtcd).MaxTimes(1).Return(nil) + expectWatchEvent(gomock.Any(), jobBackupEtcd, k, "app", corev1.PodSucceeded, false)() + k.EXPECT().KubeGetPodLogs(ctx, jobBackupEtcd.GetNamespace(), jobBackupEtcd.GetName(), jobBackupEtcd.GetName()).MaxTimes(1).Return([]byte("Backup job doing backup things..."), nil) + + // fixPeers + serviceAcc := newServiceAccount(serviceAccountName, doc.OpenShiftCluster.Name) + + k.EXPECT().KubeCreateOrUpdate(ctx, serviceAcc).MaxTimes(1).Return(errors.New("oh no, can't create service account")) + + // nested cleanup + propPolicy := metav1.DeletePropagationBackground + k.EXPECT().KubeDelete(ctx, "ServiceAccount", namespaceEtcds, serviceAcc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "SecurityContextConstraints", "", serviceAcc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "ClusterRole", "", "system:serviceaccountopenshift-etcd:etcd-recovery-privileged", true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "ClusterRoleBinding", "", serviceAcc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "Job", namespaceEtcds, jobBackupEtcd.GetName(), true, &propPolicy).MaxTimes(1).Return(nil) + }, + }, + { + name: "fail: create cluster role", + wantErr: "500: InternalServerError: : oh no, can't create cluster role %!!(MISSING)s()", + pods: newEtcdPods(t, doc, false, false, false), + mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) { + buf := &bytes.Buffer{} + err := codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(pods) + if err != nil { + t.Fatalf("%s failed to encode pods, %s", t.Name(), err.Error()) + } + k.EXPECT().KubeList(ctx, "Pod", namespaceEtcds).MaxTimes(1).Return(buf.Bytes(), nil) + + // backupEtcd + jobBackupEtcd := createBackupEtcdDataJob(doc.OpenShiftCluster.Name, buildNodeName(doc, degradedNode)) + k.EXPECT().KubeCreateOrUpdate(ctx, jobBackupEtcd).MaxTimes(1).Return(nil) + expectWatchEvent(gomock.Any(), jobBackupEtcd, k, "app", corev1.PodSucceeded, false)() + k.EXPECT().KubeGetPodLogs(ctx, jobBackupEtcd.GetNamespace(), jobBackupEtcd.GetName(), jobBackupEtcd.GetName()).MaxTimes(1).Return([]byte("Backup job doing backup things..."), nil) + propPolicy := metav1.DeletePropagationBackground + k.EXPECT().KubeDelete(ctx, "Job", namespaceEtcds, jobBackupEtcd.GetName(), true, &propPolicy).MaxTimes(1).Return(nil) + + // fixPeers + // createPrivilegedServiceAccount + serviceAcc := newServiceAccount(serviceAccountName, doc.OpenShiftCluster.Name) + clusterRole := newClusterRole(kubeServiceAccount, doc.OpenShiftCluster.Name) + + k.EXPECT().KubeCreateOrUpdate(ctx, serviceAcc).MaxTimes(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, clusterRole).MaxTimes(1).Return(errors.New("oh no, can't create cluster role")) + k.EXPECT().KubeDelete(ctx, "ServiceAccount", namespaceEtcds, serviceAcc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "SecurityContextConstraints", "", serviceAcc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "ClusterRole", "", "system:serviceaccountopenshift-etcd:etcd-recovery-privileged", true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "ClusterRoleBinding", "", serviceAcc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "Job", namespaceEtcds, jobBackupEtcd.GetName(), true, &propPolicy).MaxTimes(1).Return(nil) + }, + }, + { + name: "fail: create cluster role binding", + wantErr: "500: InternalServerError: : oh no, can't create cluster role binding %!!(MISSING)s()", + pods: newEtcdPods(t, doc, false, false, false), + mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) { + buf := &bytes.Buffer{} + err := codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(pods) + if err != nil { + t.Fatalf("%s failed to encode pods, %s", t.Name(), err.Error()) + } + k.EXPECT().KubeList(ctx, "Pod", namespaceEtcds).MaxTimes(1).Return(buf.Bytes(), nil) + + // backupEtcd + jobBackupEtcd := createBackupEtcdDataJob(doc.OpenShiftCluster.Name, buildNodeName(doc, degradedNode)) + k.EXPECT().KubeCreateOrUpdate(ctx, jobBackupEtcd).MaxTimes(1).Return(nil) + expectWatchEvent(gomock.Any(), jobBackupEtcd, k, "app", corev1.PodSucceeded, false)() + k.EXPECT().KubeGetPodLogs(ctx, jobBackupEtcd.GetNamespace(), jobBackupEtcd.GetName(), jobBackupEtcd.GetName()).MaxTimes(1).Return([]byte("Backup job doing backup things..."), nil) + propPolicy := metav1.DeletePropagationBackground + k.EXPECT().KubeDelete(ctx, "Job", namespaceEtcds, jobBackupEtcd.GetName(), true, &propPolicy).MaxTimes(1).Return(nil) + + // fixPeers + // createPrivilegedServiceAccount + serviceAcc := newServiceAccount(serviceAccountName, doc.OpenShiftCluster.Name) + clusterRole := newClusterRole(kubeServiceAccount, doc.OpenShiftCluster.Name) + crb := newClusterRoleBinding(serviceAccountName, doc.OpenShiftCluster.Name) + + k.EXPECT().KubeCreateOrUpdate(ctx, serviceAcc).MaxTimes(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, clusterRole).MaxTimes(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, crb).MaxTimes(1).Return(errors.New("oh no, can't create cluster role binding")) + + // cleanup + k.EXPECT().KubeDelete(ctx, "ServiceAccount", namespaceEtcds, serviceAcc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "SecurityContextConstraints", "", serviceAcc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "ClusterRole", "", 
"system:serviceaccountopenshift-etcd:etcd-recovery-privileged", true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "ClusterRoleBinding", "", serviceAcc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "Job", namespaceEtcds, jobBackupEtcd.GetName(), true, &propPolicy).MaxTimes(1).Return(nil) + }, + }, + { + name: "fail: create security context constraint", + wantErr: "500: InternalServerError: : oh no, can't create security context constraint %!!(MISSING)s()", + pods: newEtcdPods(t, doc, false, false, false), + mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) { + buf := &bytes.Buffer{} + err := codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(pods) + if err != nil { + t.Fatalf("%s failed to encode pods, %s", t.Name(), err.Error()) + } + k.EXPECT().KubeList(ctx, "Pod", namespaceEtcds).MaxTimes(1).Return(buf.Bytes(), nil) + + // backupEtcd + jobBackupEtcd := createBackupEtcdDataJob(doc.OpenShiftCluster.Name, buildNodeName(doc, degradedNode)) + k.EXPECT().KubeCreateOrUpdate(ctx, jobBackupEtcd).MaxTimes(1).Return(nil) + expectWatchEvent(gomock.Any(), jobBackupEtcd, k, "app", corev1.PodSucceeded, false)() + k.EXPECT().KubeGetPodLogs(ctx, jobBackupEtcd.GetNamespace(), jobBackupEtcd.GetName(), jobBackupEtcd.GetName()).MaxTimes(1).Return([]byte("Backup job doing backup things..."), nil) + propPolicy := metav1.DeletePropagationBackground + k.EXPECT().KubeDelete(ctx, "Job", namespaceEtcds, jobBackupEtcd.GetName(), true, &propPolicy).MaxTimes(1).Return(nil) + + // fixPeers + // createPrivilegedServiceAccount + serviceAcc := newServiceAccount(serviceAccountName, doc.OpenShiftCluster.Name) + clusterRole := newClusterRole(kubeServiceAccount, doc.OpenShiftCluster.Name) + crb := newClusterRoleBinding(serviceAccountName, doc.OpenShiftCluster.Name) + scc := newSecurityContextConstraint(serviceAccountName, doc.OpenShiftCluster.Name, kubeServiceAccount) + + k.EXPECT().KubeCreateOrUpdate(ctx, serviceAcc).MaxTimes(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, clusterRole).MaxTimes(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, crb).MaxTimes(1).Return(nil) + k.EXPECT().KubeCreateOrUpdate(ctx, scc).MaxTimes(1).Return(errors.New("oh no, can't create security context constraint")) + + // cleanup + k.EXPECT().KubeDelete(ctx, "ServiceAccount", namespaceEtcds, serviceAcc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "SecurityContextConstraints", "", serviceAcc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "ClusterRole", "", "system:serviceaccountopenshift-etcd:etcd-recovery-privileged", true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "ClusterRoleBinding", "", serviceAcc.GetName(), true, nil).MaxTimes(1).Return(nil) + k.EXPECT().KubeDelete(ctx, "Job", namespaceEtcds, jobBackupEtcd.GetName(), true, &propPolicy).MaxTimes(1).Return(nil) + }, + }, + { + name: "fail: Backup job Pod failed", + wantErr: "500: InternalServerError: : pod etcd-recovery-data-backup event Failed received with message Pod Failed for reasons XYZ...", + pods: newEtcdPods(t, doc, false, false, false), + mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) { + buf := &bytes.Buffer{} + err := codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(pods) + if err != nil { + t.Fatalf("%s failed to encode pods, %s", t.Name(), err.Error()) + } + 
k.EXPECT().KubeList(ctx, "Pod", namespaceEtcds).MaxTimes(1).Return(buf.Bytes(), nil) + + // backupEtcd + jobBackupEtcd := createBackupEtcdDataJob(doc.OpenShiftCluster.Name, buildNodeName(doc, degradedNode)) + k.EXPECT().KubeCreateOrUpdate(ctx, jobBackupEtcd).MaxTimes(1).Return(nil) + expectWatchEvent(gomock.Any(), jobBackupEtcd, k, "app", corev1.PodFailed, false)() + k.EXPECT().KubeGetPodLogs(ctx, jobBackupEtcd.GetNamespace(), jobBackupEtcd.GetName(), jobBackupEtcd.GetName()).MaxTimes(1).Return([]byte("oh no, Pod is in a failed state"), nil) + propPolicy := metav1.DeletePropagationBackground + k.EXPECT().KubeDelete(ctx, "Job", namespaceEtcds, jobBackupEtcd.GetName(), true, &propPolicy).MaxTimes(1).Return(nil) + }, + }, + { + name: "fail: Context cancelled", + wantErr: "500: InternalServerError: : context was cancelled while waiting for etcd-recovery-data-backup because context canceled", + pods: newEtcdPods(t, doc, false, false, false), + cancel: true, + ctxCancel: ctxCancel, + mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList, ctxCancel context.CancelFunc) { + buf := &bytes.Buffer{} + err := codec.NewEncoder(buf, &codec.JsonHandle{}).Encode(pods) + if err != nil { + t.Fatalf("%s failed to encode pods, %s", t.Name(), err.Error()) + } + k.EXPECT().KubeList(ctx, "Pod", namespaceEtcds).MaxTimes(1).Return(buf.Bytes(), nil) + + // backupEtcd + jobBackupEtcd := createBackupEtcdDataJob(doc.OpenShiftCluster.Name, buildNodeName(doc, degradedNode)) + k.EXPECT().KubeCreateOrUpdate(ctx, jobBackupEtcd).MaxTimes(1).Return(nil) + expectWatchEvent(gomock.Any(), jobBackupEtcd, k, "app", corev1.PodPending, true) + if tt.cancel { + tt.ctxCancel() + } + k.EXPECT().KubeGetPodLogs(ctx, jobBackupEtcd.GetNamespace(), jobBackupEtcd.GetName(), jobBackupEtcd.GetName()).MaxTimes(1).Return([]byte(tt.wantErr), nil) + propPolicy := metav1.DeletePropagationBackground + k.EXPECT().KubeDelete(ctx, "Job", namespaceEtcds, jobBackupEtcd.GetName(), true, &propPolicy).MaxTimes(1).Return(nil) + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + ti := newTestInfra(t).WithOpenShiftClusters().WithSubscriptions() + defer ti.done() + + k := mock_adminactions.NewMockKubeActions(ti.controller) + tt.mocks(tt, t, ti, k, tt.pods, ctxCancel) + + ti.fixture.AddOpenShiftClusterDocuments(doc) + ti.fixture.AddSubscriptionDocuments(&api.SubscriptionDocument{ + ID: mockSubID, + Subscription: &api.Subscription{ + State: api.SubscriptionStateRegistered, + Properties: &api.SubscriptionProperties{ + TenantID: mockTenantID, + }, + }, + }) + + f, err := NewFrontend(ctx, + ti.audit, + ti.log, + ti.env, + ti.asyncOperationsDatabase, + ti.clusterManagerDatabase, + ti.openShiftClustersDatabase, + ti.subscriptionsDatabase, + nil, + api.APIs, + &noop.Noop{}, + nil, + nil, + nil, + nil, + ti.enricher) + if err != nil { + t.Fatal(err) + } + + containerLogs, err := f.fixEtcd(ctx, ti.log, ti.env, doc, k, &operatorv1fake.FakeEtcds{ + Fake: &operatorv1fake.FakeOperatorV1{ + Fake: &ktesting.Fake{}, + }, + }) + ti.log.Infof("Container logs: \n%s", containerLogs) + if err != nil && err.Error() != tt.wantErr || + err == nil && tt.wantErr != "" { + t.Errorf("\n%v\n !=\n%s", err, tt.wantErr) + } + }) + } +} + +func expectWatchEvent(ctx gomock.Matcher, o *unstructured.Unstructured, k *mock_adminactions.MockKubeActions, labelKey string, podPhase corev1.PodPhase, noUpdates bool) func() { + message := "" + switch podPhase { + case corev1.PodSucceeded: + message = "Pod succeeded 
Successfully" + case corev1.PodFailed: + message = "Pod Failed for reasons XYZ..." + case corev1.PodPending: + message = "Pod is pending..." + case corev1.PodUnknown: + message = "Pod status is unknown..." + } + w := watch.NewFake() + k.EXPECT().KubeWatch(ctx, o, labelKey).MaxTimes(1).Return(watch.Interface(w), nil) + return func() { + go func() { + w.Add(&corev1.Pod{ + TypeMeta: metav1.TypeMeta{ + Kind: "Pod", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: o.GetName(), + Namespace: o.GetNamespace(), + }, + Status: corev1.PodStatus{ + Phase: podPhase, + Message: message, + }, + }) + w.Reset() + }() + } +} + +func buildClusterName(doc *api.OpenShiftClusterDocument) string { + return doc.OpenShiftCluster.Name + "-" + doc.OpenShiftCluster.Properties.InfraID +} + +func buildNodeName(doc *api.OpenShiftClusterDocument, node string) string { + c := buildClusterName(doc) + return c + "-" + node +} + +func newEtcdPods(t *testing.T, doc *api.OpenShiftClusterDocument, healthy, multiDegraded, emptyEnv bool) *corev1.PodList { + var ( + degradedNodeMaster2 = buildNodeName(doc, degradedNode) + nodeMaster0 = buildNodeName(doc, "master-0") + nodeMaster1 = buildNodeName(doc, "master-1") + ) + const ( + master0IP = "10.0.0.1" + master1IP = "10.0.0.2" + master2IP = "10.0.0.3" + master2ChangedIP = "10.0.0.9" + ) + + if healthy && multiDegraded { + t.Fatalf("TEST %s: healthy (value %t) and multiDegraded (value %t) cannot both be true, failed sanity check", t.Name(), healthy, multiDegraded) + } + + // Used to test scenario when etcd's env vars are empty, or there is no conflict found + // then statuses will be tests + envs := []corev1.EnvVar{ + { + Name: "NODE_" + doc.OpenShiftCluster.Name + "_" + doc.OpenShiftCluster.Properties.InfraID + "_master_0_IP", + Value: master0IP, + }, + { + Name: "NODE_ " + doc.OpenShiftCluster.Name + "_" + doc.OpenShiftCluster.Properties.InfraID + "_master_1_IP", + Value: master1IP, + }, + { + Name: "NODE_" + doc.OpenShiftCluster.Name + "_" + doc.OpenShiftCluster.Properties.InfraID + "_master_2_IP", + Value: master2IP, + }, + } + if emptyEnv { + envs = []corev1.EnvVar{} + } + containerID := "quay://etcd-container-id" + badStatus := []corev1.ContainerStatus{ + { + Name: "etcd", + Ready: false, + Started: to.BoolPtr(false), + RestartCount: 50, + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "Container is in a crashloop backoff", + Message: "Container crashloop backoff", + }, + }, + ContainerID: containerID, + }, + } + + statuses := []corev1.ContainerStatus{ + { + State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, + ContainerID: containerID, + }, + } + if multiDegraded { + statuses = badStatus + } + + pods := &corev1.PodList{ + TypeMeta: metav1.TypeMeta{ + Kind: "Etcd", + }, + Items: []corev1.Pod{ + // healthy pod + { + TypeMeta: metav1.TypeMeta{}, + ObjectMeta: metav1.ObjectMeta{ + Name: "etcd-" + nodeMaster0, + Namespace: namespaceEtcds, + }, + Status: corev1.PodStatus{ + ContainerStatuses: statuses, + PodIPs: []corev1.PodIP{ + { + IP: master0IP, + }, + }, + }, + Spec: corev1.PodSpec{ + NodeName: nodeMaster0, + Containers: []corev1.Container{ + { + Name: "etcd", + Env: envs, + }, + }, + }, + }, + // healthy pod + { + TypeMeta: metav1.TypeMeta{}, + ObjectMeta: metav1.ObjectMeta{ + Name: "etcd-" + nodeMaster1, + Namespace: namespaceEtcds, + }, + Status: corev1.PodStatus{ + ContainerStatuses: statuses, + PodIPs: []corev1.PodIP{ + { + IP: master1IP, + }, + }, + }, + Spec: corev1.PodSpec{ + NodeName: nodeMaster1, + 
Containers: []corev1.Container{ + { + Name: "etcd", + Env: envs, + }, + }, + }, + }, + // degraded pod + { + TypeMeta: metav1.TypeMeta{}, + ObjectMeta: metav1.ObjectMeta{ + Name: "etcd-" + degradedNodeMaster2, + Namespace: namespaceEtcds, + }, + Status: corev1.PodStatus{ + ContainerStatuses: badStatus, + PodIPs: []corev1.PodIP{ + { + IP: master2ChangedIP, + }, + }, + }, + Spec: corev1.PodSpec{ + NodeName: degradedNodeMaster2, + Containers: []corev1.Container{ + { + Name: "etcd", + Env: envs, + }, + }, + }, + }, + }, + } + + if healthy { + pods.Items[len(pods.Items)-1].Status.ContainerStatuses = statuses + pods.Items[len(pods.Items)-1].Status.PodIPs = []corev1.PodIP{ + { + IP: master2IP, + }, + } + } + + return pods +} diff --git a/pkg/frontend/frontend.go b/pkg/frontend/frontend.go index 24c461f8f57..5f7abe2f5ed 100644 --- a/pkg/frontend/frontend.go +++ b/pkg/frontend/frontend.go @@ -294,6 +294,11 @@ func (f *frontend) chiAuthenticatedRoutes(router chi.Router) { }) r.Get("/supportedvmsizes", f.supportedvmsizes) + r.Route("/subscriptions/{subscriptionId}/resourcegroups/{resourceGroupName}/providers/{resourceProviderNamespace}/{resourceType}/{resourceName}/etcdrecovery", + func(r chi.Router) { + r.Post("/", f.postAdminOpenShiftClusterEtcdRecovery) + }) + r.Route("/subscriptions/{subscriptionId}/resourcegroups/{resourceGroupName}/providers/{resourceProviderNamespace}/{resourceType}/{resourceName}/kubernetesobjects", func(r chi.Router) { r.Get("/", f.getAdminKubernetesObjects) diff --git a/pkg/frontend/scripts.go b/pkg/frontend/scripts.go new file mode 100644 index 00000000000..d04efe91a34 --- /dev/null +++ b/pkg/frontend/scripts.go @@ -0,0 +1,9 @@ +package frontend + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import _ "embed" + +//go:embed scripts/backupandfixetcd.sh +var backupOrFixEtcd string diff --git a/pkg/frontend/scripts/backupandfixetcd.sh b/pkg/frontend/scripts/backupandfixetcd.sh new file mode 100755 index 00000000000..10520850fb7 --- /dev/null +++ b/pkg/frontend/scripts/backupandfixetcd.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# +# See for more information: https://docs.openshift.com/container-platform/4.10/backup_and_restore/control_plane_backup_and_restore/replacing-unhealthy-etcd-member.html + +remove_peer_members() { + echo "${PEER_PODS}" + for p in ${PEER_PODS}; do + echo "Attempting to get ID for pod/${p}" + members="$(oc rsh -n openshift-etcd -c etcdctl "pod/${p}" etcdctl member list -w json --hex true)" + id="$(jq -r --arg node "$DEGRADED_NODE" '.members[] | select( .name == $node).ID' <<< "$members")" + echo "id: ${id:-Not Found}" + if [[ -n $id ]]; then + echo "rshing into pod/${p} now to remove member id $id" + oc rsh \ + -n openshift-etcd \ + -c etcdctl \ + "pod/${p}" etcdctl member remove "$id" + else + echo "${DEGRADED_NODE} id not found in etcd member list for pod ${p}" + fi + done +} + +# jq expects its required shared libraries to be present in /usr/lib64, not /host/usr/lib64. +# Because we use the jq binary mounted under /host and haven't installed jq, those libraries exist under /host/usr/lib64 rather than /usr/lib64. +# Creating the symbolic links allows jq to resolve its libraries without needing to install it. +create_sym_links() { + jq_lib1="/usr/lib64/libjq.so.1" + jq_lib2="/usr/lib64/libonig.so.5" + + if [[ ! -f $jq_lib1 ]]; then + ln -s "/host${jq_lib1}" "$jq_lib1" + fi + if [[ ! -f $jq_lib2 ]]; then + ln -s "/host${jq_lib2}" "$jq_lib2" + fi +} + +backup_etcd() { + local bdir etcd_yaml etcd_dir + bdir=/var/lib/etcd-backup + etcd_yaml=/etc/kubernetes/manifests/etcd-pod.yaml + etcd_dir=/var/lib/etcd + if [[ -d $etcd_dir ]] && [[ -f $etcd_yaml ]]; then + echo "Creating $bdir" + mkdir -p "$bdir" || abort "failed to make backup directory" + echo "Moving $etcd_yaml to $bdir" + mv "$etcd_yaml" "$bdir" || abort "failed to move $etcd_yaml to $bdir" + echo "Moving $etcd_dir to /tmp" + mv "$etcd_dir" /tmp || abort "failed to move $etcd_dir to /tmp" + else + echo "$etcd_dir doesn't exist or $etcd_yaml has already been moved" + echo "Not taking host etcd backup" + fi +} + +abort() { + echo "${1}, Aborting." + exit 1 +} + +# The caller sets BACKUP or FIX_PEERS in the job environment to select which action runs. +if [[ -n $FIX_PEERS ]]; then + PATH="${PATH}:/host/usr/bin" + create_sym_links + echo "Starting peer etcd member removal" + remove_peer_members +elif [[ -n $BACKUP ]]; then + echo "Starting etcd data backup" + backup_etcd +else + abort "BACKUP and FIX_PEERS are unset, no actions taken." +fi diff --git a/pkg/operator/controllers/clusteroperatoraro/clusteroperatoraro_controller_test.go b/pkg/operator/controllers/clusteroperatoraro/clusteroperatoraro_controller_test.go new file mode 100644 index 00000000000..b5102004f3f --- /dev/null +++ b/pkg/operator/controllers/clusteroperatoraro/clusteroperatoraro_controller_test.go @@ -0,0 +1,274 @@ +package clusteroperatoraro + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "context" + "testing" + "time" + + "github.com/google/go-cmp/cmp/cmpopts" + configv1 "github.com/openshift/api/config/v1" + operatorv1 "github.com/openshift/api/operator/v1" + "github.com/sirupsen/logrus" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/pointer" + ctrl "sigs.k8s.io/controller-runtime" + ctrlfake "sigs.k8s.io/controller-runtime/pkg/client/fake" + + arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1" + "github.com/Azure/ARO-RP/pkg/util/cmp" + _ "github.com/Azure/ARO-RP/pkg/util/scheme" + "github.com/Azure/ARO-RP/pkg/util/version" + utilconditions "github.com/Azure/ARO-RP/test/util/conditions" + utilerror "github.com/Azure/ARO-RP/test/util/error" +) + +func TestConditions(t *testing.T) { + tests := []struct { + name string + controllerConditions []operatorv1.OperatorCondition + wantConditions []configv1.ClusterOperatorStatusCondition + wantErr string + }{ + { + name: "no conditions sets defaults", + controllerConditions: []operatorv1.OperatorCondition{}, + wantConditions: []configv1.ClusterOperatorStatusCondition{ + { + Type: configv1.OperatorAvailable, + Status: configv1.ConditionUnknown, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "NoData", + }, + { + Type: configv1.OperatorProgressing, + Status: configv1.ConditionUnknown, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "NoData", + }, + { + Type: configv1.OperatorDegraded, + Status: configv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "AsExpected", + }, + }, + }, + { + name: "All controllers available sets Available=True", + controllerConditions: []operatorv1.OperatorCondition{ + utilconditions.ControllerDefaultAvailable("ControllerA"), + utilconditions.ControllerDefaultAvailable("ControllerB"), + utilconditions.ControllerDefaultAvailable("ControllerC"), + }, + wantConditions: []configv1.ClusterOperatorStatusCondition{ + { + Type: configv1.OperatorAvailable, + Status: configv1.ConditionTrue, + 
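+ // LastTransitionTime is populated with time.Now() because the assertions below compare conditions using cmpopts.EquateApproxTime.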
LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "AsExpected", + Message: "All is well", + }, + { + Type: configv1.OperatorProgressing, + Status: configv1.ConditionUnknown, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "NoData", + }, + { + Type: configv1.OperatorDegraded, + Status: configv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "AsExpected", + }, + }, + }, + { + name: "Controller not available sets Available=False", + controllerConditions: []operatorv1.OperatorCondition{ + utilconditions.ControllerDefaultAvailable("ControllerA"), + { + Type: "ControllerBAvailable", + Status: operatorv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "SomeError", + Message: "An error occurred", + }, + utilconditions.ControllerDefaultAvailable("ControllerC"), + }, + wantConditions: []configv1.ClusterOperatorStatusCondition{ + { + Type: configv1.OperatorAvailable, + Status: configv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "ControllerB_SomeError", + Message: "ControllerBAvailable: An error occurred", + }, + { + Type: configv1.OperatorProgressing, + Status: configv1.ConditionUnknown, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "NoData", + }, + { + Type: configv1.OperatorDegraded, + Status: configv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "AsExpected", + }, + }, + }, + { + name: "All controllers not progressing sets Progressing=False", + controllerConditions: []operatorv1.OperatorCondition{ + utilconditions.ControllerDefaultProgressing("ControllerA"), + utilconditions.ControllerDefaultProgressing("ControllerB"), + utilconditions.ControllerDefaultProgressing("ControllerC"), + }, + wantConditions: []configv1.ClusterOperatorStatusCondition{ + { + Type: configv1.OperatorAvailable, + Status: configv1.ConditionUnknown, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "NoData", + }, + { + Type: configv1.OperatorProgressing, + Status: configv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "AsExpected", + Message: "All is well", + }, + { + Type: configv1.OperatorDegraded, + Status: configv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "AsExpected", + }, + }, + }, + { + name: "Controller progressing sets Progressing=True", + controllerConditions: []operatorv1.OperatorCondition{ + utilconditions.ControllerDefaultProgressing("ControllerA"), + { + Type: "ControllerBProgressing", + Status: operatorv1.ConditionTrue, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "SomeProcess", + Message: "Something is happening", + }, + utilconditions.ControllerDefaultProgressing("ControllerC"), + }, + wantConditions: []configv1.ClusterOperatorStatusCondition{ + { + Type: configv1.OperatorAvailable, + Status: configv1.ConditionUnknown, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "NoData", + }, + { + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "ControllerB_SomeProcess", + Message: "ControllerBProgressing: Something is happening", + }, + { + Type: configv1.OperatorDegraded, + Status: configv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "AsExpected", + }, + }, + }, + { + name: "Controller degraded does NOT set Degraded=True", + controllerConditions: []operatorv1.OperatorCondition{ + utilconditions.ControllerDefaultDegraded("ControllerA"), + { + Type: 
"ControllerBDegraded", + Status: operatorv1.ConditionTrue, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "SomeProcess", + Message: "Something bad is happening", + }, + utilconditions.ControllerDefaultDegraded("ControllerC"), + }, + wantConditions: []configv1.ClusterOperatorStatusCondition{ + { + Type: configv1.OperatorAvailable, + Status: configv1.ConditionUnknown, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "NoData", + }, + { + Type: configv1.OperatorProgressing, + Status: configv1.ConditionUnknown, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "NoData", + }, + { + Type: configv1.OperatorDegraded, + Status: configv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Now()), + Reason: "AsExpected", + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cluster := &arov1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{Name: arov1alpha1.SingletonClusterName}, + Status: arov1alpha1.ClusterStatus{ + Conditions: tt.controllerConditions, + }, + } + clientFake := ctrlfake.NewClientBuilder(). + WithObjects(cluster). + Build() + + r := NewReconciler(logrus.NewEntry(logrus.StandardLogger()), clientFake) + + request := ctrl.Request{} + ctx := context.Background() + + _, err := r.Reconcile(ctx, request) + + utilerror.AssertErrorMessage(t, err, tt.wantErr) + + operator := &configv1.ClusterOperator{} + if err := clientFake.Get(ctx, types.NamespacedName{Name: clusterOperatorName}, operator); err != nil { + t.Error(err) + } + if diff := cmp.Diff(tt.wantConditions, operator.Status.Conditions, cmpopts.EquateApproxTime(time.Second)); diff != "" { + t.Error(diff) + } + + // static checks - these should always be set on the operator resource after every reconcile + wantVersion := []configv1.OperandVersion{{ + Name: "operator", + Version: version.GitCommit, + }} + if diff := cmp.Diff(wantVersion, operator.Status.Versions); diff != "" { + t.Error(diff) + } + + wantOwnerReference := []metav1.OwnerReference{{ + APIVersion: arov1alpha1.GroupVersion.Identifier(), + Kind: "Cluster", + Name: arov1alpha1.SingletonClusterName, + Controller: pointer.BoolPtr(true), + BlockOwnerDeletion: pointer.BoolPtr(true), + }} + if diff := cmp.Diff(wantOwnerReference, operator.ObjectMeta.OwnerReferences); diff != "" { + t.Error(diff) + } + }) + } +} diff --git a/pkg/operator/controllers/dnsmasq/cluster_controller.go b/pkg/operator/controllers/dnsmasq/cluster_controller.go index b316b3e5709..d898e4558f3 100644 --- a/pkg/operator/controllers/dnsmasq/cluster_controller.go +++ b/pkg/operator/controllers/dnsmasq/cluster_controller.go @@ -9,7 +9,6 @@ import ( mcv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1" "github.com/sirupsen/logrus" kruntime "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" @@ -17,6 +16,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1" + "github.com/Azure/ARO-RP/pkg/operator/controllers/base" "github.com/Azure/ARO-RP/pkg/util/dynamichelper" ) @@ -27,49 +27,51 @@ const ( ) type ClusterReconciler struct { - log *logrus.Entry - + base.AROController dh dynamichelper.Interface - - client client.Client } func NewClusterReconciler(log *logrus.Entry, client client.Client, dh dynamichelper.Interface) *ClusterReconciler { return &ClusterReconciler{ - log: log, - dh: 
dh, - client: client, + AROController: base.AROController{ + Log: log, + Client: client, + Name: ClusterControllerName, + }, + dh: dh, } } // Reconcile watches the ARO object, and if it changes, reconciles all the // 99-%s-aro-dns machineconfigs func (r *ClusterReconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl.Result, error) { - instance := &arov1alpha1.Cluster{} - err := r.client.Get(ctx, types.NamespacedName{Name: arov1alpha1.SingletonClusterName}, instance) + instance, err := r.GetCluster(ctx) if err != nil { return reconcile.Result{}, err } if !instance.Spec.OperatorFlags.GetSimpleBoolean(controllerEnabled) { - r.log.Debug("controller is disabled") + r.Log.Debug("controller is disabled") return reconcile.Result{}, nil } - r.log.Debug("running") + r.Log.Debug("running") mcps := &mcv1.MachineConfigPoolList{} - err = r.client.List(ctx, mcps) + err = r.Client.List(ctx, mcps) if err != nil { - r.log.Error(err) + r.Log.Error(err) + r.SetDegraded(ctx, err) return reconcile.Result{}, err } err = reconcileMachineConfigs(ctx, instance, r.dh, mcps.Items...) if err != nil { - r.log.Error(err) + r.Log.Error(err) + r.SetDegraded(ctx, err) return reconcile.Result{}, err } + r.ClearConditions(ctx) return reconcile.Result{}, nil } diff --git a/pkg/operator/controllers/dnsmasq/cluster_controller_test.go b/pkg/operator/controllers/dnsmasq/cluster_controller_test.go index 55e39429b23..f2830ac5cb3 100644 --- a/pkg/operator/controllers/dnsmasq/cluster_controller_test.go +++ b/pkg/operator/controllers/dnsmasq/cluster_controller_test.go @@ -6,8 +6,10 @@ package dnsmasq import ( "context" "testing" + "time" "github.com/golang/mock/gomock" + operatorv1 "github.com/openshift/api/operator/v1" mcv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1" "github.com/sirupsen/logrus" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -17,20 +19,28 @@ import ( arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1" mock_dynamichelper "github.com/Azure/ARO-RP/pkg/util/mocks/dynamichelper" + utilconditions "github.com/Azure/ARO-RP/test/util/conditions" utilerror "github.com/Azure/ARO-RP/test/util/error" ) func TestClusterReconciler(t *testing.T) { + transitionTime := metav1.Time{Time: time.Now()} + defaultAvailable := utilconditions.ControllerDefaultAvailable(ClusterControllerName) + defaultProgressing := utilconditions.ControllerDefaultProgressing(ClusterControllerName) + defaultDegraded := utilconditions.ControllerDefaultDegraded(ClusterControllerName) + defaultConditions := []operatorv1.OperatorCondition{defaultAvailable, defaultProgressing, defaultDegraded} + fakeDh := func(controller *gomock.Controller) *mock_dynamichelper.MockInterface { return mock_dynamichelper.NewMockInterface(controller) } tests := []struct { - name string - objects []client.Object - mocks func(mdh *mock_dynamichelper.MockInterface) - request ctrl.Request - wantErrMsg string + name string + objects []client.Object + mocks func(mdh *mock_dynamichelper.MockInterface) + request ctrl.Request + wantErrMsg string + wantConditions []operatorv1.OperatorCondition }{ { name: "no cluster", @@ -44,7 +54,9 @@ func TestClusterReconciler(t *testing.T) { objects: []client.Object{ &arov1alpha1.Cluster{ ObjectMeta: metav1.ObjectMeta{Name: "cluster"}, - Status: arov1alpha1.ClusterStatus{}, + Status: arov1alpha1.ClusterStatus{ + Conditions: defaultConditions, + }, Spec: arov1alpha1.ClusterSpec{ OperatorFlags: arov1alpha1.OperatorFlags{ controllerEnabled: "false", @@ -52,16 +64,27 
@@ func TestClusterReconciler(t *testing.T) { }, }, }, - mocks: func(mdh *mock_dynamichelper.MockInterface) {}, - request: ctrl.Request{}, - wantErrMsg: "", + mocks: func(mdh *mock_dynamichelper.MockInterface) {}, + request: ctrl.Request{}, + wantErrMsg: "", + wantConditions: defaultConditions, }, { name: "no MachineConfigPools does nothing", objects: []client.Object{ &arov1alpha1.Cluster{ ObjectMeta: metav1.ObjectMeta{Name: "cluster"}, - Status: arov1alpha1.ClusterStatus{}, + Status: arov1alpha1.ClusterStatus{ + Conditions: []operatorv1.OperatorCondition{ + defaultAvailable, + defaultProgressing, + { + Type: ClusterControllerName + "Controller" + operatorv1.OperatorStatusTypeDegraded, + Status: operatorv1.ConditionTrue, + LastTransitionTime: transitionTime, + }, + }, + }, Spec: arov1alpha1.ClusterSpec{ OperatorFlags: arov1alpha1.OperatorFlags{ controllerEnabled: "true", @@ -72,15 +95,18 @@ func TestClusterReconciler(t *testing.T) { mocks: func(mdh *mock_dynamichelper.MockInterface) { mdh.EXPECT().Ensure(gomock.Any()).Times(1) }, - request: ctrl.Request{}, - wantErrMsg: "", + request: ctrl.Request{}, + wantErrMsg: "", + wantConditions: defaultConditions, }, { name: "valid MachineConfigPool creates ARO DNS MachineConfig", objects: []client.Object{ &arov1alpha1.Cluster{ ObjectMeta: metav1.ObjectMeta{Name: "cluster"}, - Status: arov1alpha1.ClusterStatus{}, + Status: arov1alpha1.ClusterStatus{ + Conditions: defaultConditions, + }, Spec: arov1alpha1.ClusterSpec{ OperatorFlags: arov1alpha1.OperatorFlags{ controllerEnabled: "true", @@ -96,8 +122,9 @@ func TestClusterReconciler(t *testing.T) { mocks: func(mdh *mock_dynamichelper.MockInterface) { mdh.EXPECT().Ensure(gomock.Any(), gomock.AssignableToTypeOf(&mcv1.MachineConfig{})).Times(1) }, - request: ctrl.Request{}, - wantErrMsg: "", + request: ctrl.Request{}, + wantErrMsg: "", + wantConditions: defaultConditions, }, } @@ -118,10 +145,11 @@ func TestClusterReconciler(t *testing.T) { client, dh, ) - - _, err := r.Reconcile(context.Background(), tt.request) + ctx := context.Background() + _, err := r.Reconcile(ctx, tt.request) utilerror.AssertErrorMessage(t, err, tt.wantErrMsg) + utilconditions.AssertControllerConditions(t, ctx, client, tt.wantConditions) }) } } diff --git a/pkg/operator/controllers/dnsmasq/machineconfig_controller.go b/pkg/operator/controllers/dnsmasq/machineconfig_controller.go index c93a9f7ad69..45c5c576d90 100644 --- a/pkg/operator/controllers/dnsmasq/machineconfig_controller.go +++ b/pkg/operator/controllers/dnsmasq/machineconfig_controller.go @@ -15,7 +15,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" - arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1" + "github.com/Azure/ARO-RP/pkg/operator/controllers/base" "github.com/Azure/ARO-RP/pkg/util/dynamichelper" ) @@ -24,38 +24,38 @@ const ( ) type MachineConfigReconciler struct { - log *logrus.Entry + base.AROController dh dynamichelper.Interface - - client client.Client } var rxARODNS = regexp.MustCompile("^99-(.*)-aro-dns$") func NewMachineConfigReconciler(log *logrus.Entry, client client.Client, dh dynamichelper.Interface) *MachineConfigReconciler { return &MachineConfigReconciler{ - log: log, - dh: dh, - client: client, + AROController: base.AROController{ + Log: log, + Client: client, + Name: MachineConfigControllerName, + }, + dh: dh, } } // Reconcile watches ARO DNS MachineConfig objects, and if any changes, // reconciles it func (r *MachineConfigReconciler) Reconcile(ctx 
context.Context, request ctrl.Request) (ctrl.Result, error) { - instance := &arov1alpha1.Cluster{} - err := r.client.Get(ctx, types.NamespacedName{Name: arov1alpha1.SingletonClusterName}, instance) + instance, err := r.GetCluster(ctx) if err != nil { return reconcile.Result{}, err } if !instance.Spec.OperatorFlags.GetSimpleBoolean(controllerEnabled) { - r.log.Debug("controller is disabled") + r.Log.Debug("controller is disabled") return reconcile.Result{}, nil } - r.log.Debug("running") + r.Log.Debug("running") m := rxARODNS.FindStringSubmatch(request.Name) if m == nil { return reconcile.Result{}, nil @@ -63,12 +63,14 @@ func (r *MachineConfigReconciler) Reconcile(ctx context.Context, request ctrl.Re role := m[1] mcp := &mcv1.MachineConfigPool{} - err = r.client.Get(ctx, types.NamespacedName{Name: role}, mcp) + err = r.Client.Get(ctx, types.NamespacedName{Name: role}, mcp) if kerrors.IsNotFound(err) { + r.ClearDegraded(ctx) return reconcile.Result{}, nil } if err != nil { - r.log.Error(err) + r.Log.Error(err) + r.SetDegraded(ctx, err) return reconcile.Result{}, err } if mcp.GetDeletionTimestamp() != nil { @@ -77,10 +79,12 @@ func (r *MachineConfigReconciler) Reconcile(ctx context.Context, request ctrl.Re err = reconcileMachineConfigs(ctx, instance, r.dh, *mcp) if err != nil { - r.log.Error(err) + r.Log.Error(err) + r.SetDegraded(ctx, err) return reconcile.Result{}, err } + r.ClearConditions(ctx) return reconcile.Result{}, nil } diff --git a/pkg/operator/controllers/dnsmasq/machineconfig_controller_test.go b/pkg/operator/controllers/dnsmasq/machineconfig_controller_test.go index 6d45d5f203e..e47638d4f92 100644 --- a/pkg/operator/controllers/dnsmasq/machineconfig_controller_test.go +++ b/pkg/operator/controllers/dnsmasq/machineconfig_controller_test.go @@ -6,8 +6,10 @@ package dnsmasq import ( "context" "testing" + "time" "github.com/golang/mock/gomock" + operatorv1 "github.com/openshift/api/operator/v1" mcv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1" "github.com/sirupsen/logrus" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -18,20 +20,27 @@ import ( arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1" mock_dynamichelper "github.com/Azure/ARO-RP/pkg/util/mocks/dynamichelper" + utilconditions "github.com/Azure/ARO-RP/test/util/conditions" utilerror "github.com/Azure/ARO-RP/test/util/error" ) func TestMachineConfigReconciler(t *testing.T) { + transitionTime := metav1.Time{Time: time.Now()} + defaultAvailable := utilconditions.ControllerDefaultAvailable(MachineConfigControllerName) + defaultProgressing := utilconditions.ControllerDefaultProgressing(MachineConfigControllerName) + defaultDegraded := utilconditions.ControllerDefaultDegraded(MachineConfigControllerName) + defaultConditions := []operatorv1.OperatorCondition{defaultAvailable, defaultProgressing, defaultDegraded} fakeDh := func(controller *gomock.Controller) *mock_dynamichelper.MockInterface { return mock_dynamichelper.NewMockInterface(controller) } tests := []struct { - name string - objects []client.Object - mocks func(mdh *mock_dynamichelper.MockInterface) - request ctrl.Request - wantErrMsg string + name string + objects []client.Object + mocks func(mdh *mock_dynamichelper.MockInterface) + request ctrl.Request + wantErrMsg string + wantConditions []operatorv1.OperatorCondition }{ { name: "no cluster", @@ -45,7 +54,9 @@ func TestMachineConfigReconciler(t *testing.T) { objects: []client.Object{ &arov1alpha1.Cluster{ ObjectMeta: metav1.ObjectMeta{Name: 
"cluster"}, - Status: arov1alpha1.ClusterStatus{}, + Status: arov1alpha1.ClusterStatus{ + Conditions: defaultConditions, + }, Spec: arov1alpha1.ClusterSpec{ OperatorFlags: arov1alpha1.OperatorFlags{ controllerEnabled: "false", @@ -53,16 +64,27 @@ func TestMachineConfigReconciler(t *testing.T) { }, }, }, - mocks: func(mdh *mock_dynamichelper.MockInterface) {}, - request: ctrl.Request{}, - wantErrMsg: "", + mocks: func(mdh *mock_dynamichelper.MockInterface) {}, + request: ctrl.Request{}, + wantErrMsg: "", + wantConditions: defaultConditions, }, { name: "no MachineConfigPool for MachineConfig does nothing", objects: []client.Object{ &arov1alpha1.Cluster{ ObjectMeta: metav1.ObjectMeta{Name: "cluster"}, - Status: arov1alpha1.ClusterStatus{}, + Status: arov1alpha1.ClusterStatus{ + Conditions: []operatorv1.OperatorCondition{ + defaultAvailable, + defaultProgressing, + { + Type: MachineConfigControllerName + "Controller" + operatorv1.OperatorStatusTypeDegraded, + Status: operatorv1.ConditionTrue, + LastTransitionTime: transitionTime, + }, + }, + }, Spec: arov1alpha1.ClusterSpec{ OperatorFlags: arov1alpha1.OperatorFlags{ controllerEnabled: "true", @@ -77,14 +99,17 @@ func TestMachineConfigReconciler(t *testing.T) { Name: "99-custom-aro-dns", }, }, - wantErrMsg: "", + wantErrMsg: "", + wantConditions: defaultConditions, }, { name: "valid MachineConfigPool for MachineConfig reconciles MachineConfig", objects: []client.Object{ &arov1alpha1.Cluster{ ObjectMeta: metav1.ObjectMeta{Name: "cluster"}, - Status: arov1alpha1.ClusterStatus{}, + Status: arov1alpha1.ClusterStatus{ + Conditions: defaultConditions, + }, Spec: arov1alpha1.ClusterSpec{ OperatorFlags: arov1alpha1.OperatorFlags{ controllerEnabled: "true", @@ -106,7 +131,8 @@ func TestMachineConfigReconciler(t *testing.T) { Name: "99-custom-aro-dns", }, }, - wantErrMsg: "", + wantErrMsg: "", + wantConditions: defaultConditions, }, } @@ -126,10 +152,11 @@ func TestMachineConfigReconciler(t *testing.T) { client, dh, ) - - _, err := r.Reconcile(context.Background(), tt.request) + ctx := context.Background() + _, err := r.Reconcile(ctx, tt.request) utilerror.AssertErrorMessage(t, err, tt.wantErrMsg) + utilconditions.AssertControllerConditions(t, ctx, client, tt.wantConditions) }) } } diff --git a/pkg/operator/controllers/dnsmasq/machineconfigpool_controller.go b/pkg/operator/controllers/dnsmasq/machineconfigpool_controller.go index 5be6a108cba..cb255db94f1 100644 --- a/pkg/operator/controllers/dnsmasq/machineconfigpool_controller.go +++ b/pkg/operator/controllers/dnsmasq/machineconfigpool_controller.go @@ -14,7 +14,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" - arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1" + "github.com/Azure/ARO-RP/pkg/operator/controllers/base" "github.com/Azure/ARO-RP/pkg/util/dynamichelper" ) @@ -23,52 +23,56 @@ const ( ) type MachineConfigPoolReconciler struct { - log *logrus.Entry + base.AROController dh dynamichelper.Interface - - client client.Client } func NewMachineConfigPoolReconciler(log *logrus.Entry, client client.Client, dh dynamichelper.Interface) *MachineConfigPoolReconciler { return &MachineConfigPoolReconciler{ - log: log, - dh: dh, - client: client, + AROController: base.AROController{ + Log: log, + Client: client, + Name: MachineConfigPoolControllerName, + }, + dh: dh, } } // Reconcile watches MachineConfigPool objects, and if any changes, // reconciles the associated ARO DNS MachineConfig object func (r 
*MachineConfigPoolReconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl.Result, error) { - instance := &arov1alpha1.Cluster{} - err := r.client.Get(ctx, types.NamespacedName{Name: arov1alpha1.SingletonClusterName}, instance) + instance, err := r.GetCluster(ctx) if err != nil { return reconcile.Result{}, err } if !instance.Spec.OperatorFlags.GetSimpleBoolean(controllerEnabled) { - r.log.Debug("controller is disabled") + r.Log.Debug("controller is disabled") return reconcile.Result{}, nil } - r.log.Debug("running") + r.Log.Debug("running") mcp := &mcv1.MachineConfigPool{} - err = r.client.Get(ctx, types.NamespacedName{Name: request.Name}, mcp) + err = r.Client.Get(ctx, types.NamespacedName{Name: request.Name}, mcp) if kerrors.IsNotFound(err) { + r.ClearDegraded(ctx) return reconcile.Result{}, nil } if err != nil { - r.log.Error(err) + r.Log.Error(err) + r.SetDegraded(ctx, err) return reconcile.Result{}, err } err = reconcileMachineConfigs(ctx, instance, r.dh, *mcp) if err != nil { - r.log.Error(err) + r.Log.Error(err) + r.SetDegraded(ctx, err) return reconcile.Result{}, err } + r.ClearConditions(ctx) return reconcile.Result{}, nil } diff --git a/pkg/operator/controllers/dnsmasq/machineconfigpool_controller_test.go b/pkg/operator/controllers/dnsmasq/machineconfigpool_controller_test.go index d88efa00b6e..f6d0ee32cbb 100644 --- a/pkg/operator/controllers/dnsmasq/machineconfigpool_controller_test.go +++ b/pkg/operator/controllers/dnsmasq/machineconfigpool_controller_test.go @@ -6,8 +6,10 @@ package dnsmasq import ( "context" "testing" + "time" "github.com/golang/mock/gomock" + operatorv1 "github.com/openshift/api/operator/v1" mcv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1" "github.com/sirupsen/logrus" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -18,20 +20,28 @@ import ( arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1" mock_dynamichelper "github.com/Azure/ARO-RP/pkg/util/mocks/dynamichelper" + utilconditions "github.com/Azure/ARO-RP/test/util/conditions" utilerror "github.com/Azure/ARO-RP/test/util/error" ) func TestMachineConfigPoolReconciler(t *testing.T) { + transitionTime := metav1.Time{Time: time.Now()} + defaultAvailable := utilconditions.ControllerDefaultAvailable(MachineConfigPoolControllerName) + defaultProgressing := utilconditions.ControllerDefaultProgressing(MachineConfigPoolControllerName) + defaultDegraded := utilconditions.ControllerDefaultDegraded(MachineConfigPoolControllerName) + defaultConditions := []operatorv1.OperatorCondition{defaultAvailable, defaultProgressing, defaultDegraded} + fakeDh := func(controller *gomock.Controller) *mock_dynamichelper.MockInterface { return mock_dynamichelper.NewMockInterface(controller) } tests := []struct { - name string - objects []client.Object - mocks func(mdh *mock_dynamichelper.MockInterface) - request ctrl.Request - wantErrMsg string + name string + objects []client.Object + mocks func(mdh *mock_dynamichelper.MockInterface) + request ctrl.Request + wantErrMsg string + wantConditions []operatorv1.OperatorCondition }{ { name: "no cluster", @@ -45,7 +55,9 @@ func TestMachineConfigPoolReconciler(t *testing.T) { objects: []client.Object{ &arov1alpha1.Cluster{ ObjectMeta: metav1.ObjectMeta{Name: "cluster"}, - Status: arov1alpha1.ClusterStatus{}, + Status: arov1alpha1.ClusterStatus{ + Conditions: defaultConditions, + }, Spec: arov1alpha1.ClusterSpec{ OperatorFlags: arov1alpha1.OperatorFlags{ controllerEnabled: "false", @@ -62,7 +74,17 @@ 
func TestMachineConfigPoolReconciler(t *testing.T) { objects: []client.Object{ &arov1alpha1.Cluster{ ObjectMeta: metav1.ObjectMeta{Name: "cluster"}, - Status: arov1alpha1.ClusterStatus{}, + Status: arov1alpha1.ClusterStatus{ + Conditions: []operatorv1.OperatorCondition{ + defaultAvailable, + defaultProgressing, + { + Type: MachineConfigPoolControllerName + "Controller" + operatorv1.OperatorStatusTypeDegraded, + Status: operatorv1.ConditionTrue, + LastTransitionTime: transitionTime, + }, + }, + }, Spec: arov1alpha1.ClusterSpec{ OperatorFlags: arov1alpha1.OperatorFlags{ controllerEnabled: "true", @@ -77,14 +99,17 @@ func TestMachineConfigPoolReconciler(t *testing.T) { Name: "custom", }, }, - wantErrMsg: "", + wantErrMsg: "", + wantConditions: defaultConditions, }, { name: "MachineConfigPool reconciles ARO DNS MachineConfig", objects: []client.Object{ &arov1alpha1.Cluster{ ObjectMeta: metav1.ObjectMeta{Name: "cluster"}, - Status: arov1alpha1.ClusterStatus{}, + Status: arov1alpha1.ClusterStatus{ + Conditions: defaultConditions, + }, Spec: arov1alpha1.ClusterSpec{ OperatorFlags: arov1alpha1.OperatorFlags{ controllerEnabled: "true", @@ -109,7 +134,8 @@ func TestMachineConfigPoolReconciler(t *testing.T) { Name: "custom", }, }, - wantErrMsg: "", + wantErrMsg: "", + wantConditions: defaultConditions, }, } @@ -129,10 +155,11 @@ func TestMachineConfigPoolReconciler(t *testing.T) { client, dh, ) - - _, err := r.Reconcile(context.Background(), tt.request) + ctx := context.Background() + _, err := r.Reconcile(ctx, tt.request) utilerror.AssertErrorMessage(t, err, tt.wantErrMsg) + utilconditions.AssertControllerConditions(t, ctx, client, tt.wantConditions) }) } } diff --git a/pkg/operator/controllers/guardrails/staticresources/gk_audit_controller_deployment.yaml b/pkg/operator/controllers/guardrails/staticresources/gk_audit_controller_deployment.yaml index 01fa1aa0242..733afdd29c8 100644 --- a/pkg/operator/controllers/guardrails/staticresources/gk_audit_controller_deployment.yaml +++ b/pkg/operator/controllers/guardrails/staticresources/gk_audit_controller_deployment.yaml @@ -100,11 +100,9 @@ spec: drop: - ALL readOnlyRootFilesystem: true - runAsGroup: 999 runAsNonRoot: true - runAsUser: 1000 - # seccompProfile: - # type: RuntimeDefault + seccompProfile: + type: RuntimeDefault volumeMounts: - mountPath: /certs name: cert diff --git a/pkg/operator/controllers/guardrails/staticresources/gk_cluster_role.yaml b/pkg/operator/controllers/guardrails/staticresources/gk_cluster_role.yaml index 2c538e9dd13..5359564b5fd 100644 --- a/pkg/operator/controllers/guardrails/staticresources/gk_cluster_role.yaml +++ b/pkg/operator/controllers/guardrails/staticresources/gk_cluster_role.yaml @@ -157,11 +157,3 @@ rules: - patch - update - watch -- apiGroups: # https://open-policy-agent.github.io/gatekeeper/website/docs/vendor-specific/#running-on-openshift-4x - - security.openshift.io - resourceNames: - - anyuid - resources: - - securitycontextconstraints - verbs: - - use diff --git a/pkg/operator/controllers/guardrails/staticresources/gk_controller_manager_deployment.yaml b/pkg/operator/controllers/guardrails/staticresources/gk_controller_manager_deployment.yaml index c46784988b7..f3bb7beaa00 100644 --- a/pkg/operator/controllers/guardrails/staticresources/gk_controller_manager_deployment.yaml +++ b/pkg/operator/controllers/guardrails/staticresources/gk_controller_manager_deployment.yaml @@ -112,11 +112,9 @@ spec: drop: - ALL readOnlyRootFilesystem: true - runAsGroup: 999 runAsNonRoot: true - runAsUser: 1000 - # 
seccompProfile: - # type: RuntimeDefault + seccompProfile: + type: RuntimeDefault volumeMounts: - mountPath: /certs name: cert diff --git a/pkg/operator/controllers/guardrails/staticresources/gk_role.yaml b/pkg/operator/controllers/guardrails/staticresources/gk_role.yaml index 41c2bd5476e..967a3bb43ef 100644 --- a/pkg/operator/controllers/guardrails/staticresources/gk_role.yaml +++ b/pkg/operator/controllers/guardrails/staticresources/gk_role.yaml @@ -26,11 +26,3 @@ rules: - patch - update - watch -- apiGroups: # https://open-policy-agent.github.io/gatekeeper/website/docs/vendor-specific/#running-on-openshift-4x - - security.openshift.io - resourceNames: - - anyuid - resources: - - securitycontextconstraints - verbs: - - use diff --git a/pkg/util/mocks/adminactions/adminactions.go b/pkg/util/mocks/adminactions/adminactions.go index 6dcd15f5751..e7366bc9ebc 100644 --- a/pkg/util/mocks/adminactions/adminactions.go +++ b/pkg/util/mocks/adminactions/adminactions.go @@ -14,8 +14,10 @@ import ( features "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features" gomock "github.com/golang/mock/gomock" logrus "github.com/sirupsen/logrus" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" unstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" schema "k8s.io/apimachinery/pkg/runtime/schema" + watch "k8s.io/apimachinery/pkg/watch" ) // MockKubeActions is a mock of KubeActions interface. @@ -112,17 +114,17 @@ func (mr *MockKubeActionsMockRecorder) KubeCreateOrUpdate(arg0, arg1 interface{} } // KubeDelete mocks base method. -func (m *MockKubeActions) KubeDelete(arg0 context.Context, arg1, arg2, arg3 string, arg4 bool) error { +func (m *MockKubeActions) KubeDelete(arg0 context.Context, arg1, arg2, arg3 string, arg4 bool, arg5 *v1.DeletionPropagation) error { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "KubeDelete", arg0, arg1, arg2, arg3, arg4) + ret := m.ctrl.Call(m, "KubeDelete", arg0, arg1, arg2, arg3, arg4, arg5) ret0, _ := ret[0].(error) return ret0 } // KubeDelete indicates an expected call of KubeDelete. -func (mr *MockKubeActionsMockRecorder) KubeDelete(arg0, arg1, arg2, arg3, arg4 interface{}) *gomock.Call { +func (mr *MockKubeActionsMockRecorder) KubeDelete(arg0, arg1, arg2, arg3, arg4, arg5 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "KubeDelete", reflect.TypeOf((*MockKubeActions)(nil).KubeDelete), arg0, arg1, arg2, arg3, arg4) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "KubeDelete", reflect.TypeOf((*MockKubeActions)(nil).KubeDelete), arg0, arg1, arg2, arg3, arg4, arg5) } // KubeGet mocks base method. @@ -170,6 +172,21 @@ func (mr *MockKubeActionsMockRecorder) KubeList(arg0, arg1, arg2 interface{}) *g return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "KubeList", reflect.TypeOf((*MockKubeActions)(nil).KubeList), arg0, arg1, arg2) } +// KubeWatch mocks base method. +func (m *MockKubeActions) KubeWatch(arg0 context.Context, arg1 *unstructured.Unstructured, arg2 string) (watch.Interface, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "KubeWatch", arg0, arg1, arg2) + ret0, _ := ret[0].(watch.Interface) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// KubeWatch indicates an expected call of KubeWatch. 
+func (mr *MockKubeActionsMockRecorder) KubeWatch(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "KubeWatch", reflect.TypeOf((*MockKubeActions)(nil).KubeWatch), arg0, arg1, arg2) +} + // ResolveGVR mocks base method. func (m *MockKubeActions) ResolveGVR(arg0 string) (*schema.GroupVersionResource, error) { m.ctrl.T.Helper() diff --git a/pkg/util/mocks/samples/samples.go b/pkg/util/mocks/samples/samples.go new file mode 100644 index 00000000000..c14a29c2b60 --- /dev/null +++ b/pkg/util/mocks/samples/samples.go @@ -0,0 +1,230 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/openshift/client-go/samples/clientset/versioned/typed/samples/v1 (interfaces: SamplesV1Interface,ConfigInterface) + +// Package mock_v1 is a generated GoMock package. +package mock_v1 + +import ( + context "context" + reflect "reflect" + + gomock "github.com/golang/mock/gomock" + v1 "github.com/openshift/api/samples/v1" + v10 "github.com/openshift/client-go/samples/clientset/versioned/typed/samples/v1" + v11 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + rest "k8s.io/client-go/rest" +) + +// MockSamplesV1Interface is a mock of SamplesV1Interface interface. +type MockSamplesV1Interface struct { + ctrl *gomock.Controller + recorder *MockSamplesV1InterfaceMockRecorder +} + +// MockSamplesV1InterfaceMockRecorder is the mock recorder for MockSamplesV1Interface. +type MockSamplesV1InterfaceMockRecorder struct { + mock *MockSamplesV1Interface +} + +// NewMockSamplesV1Interface creates a new mock instance. +func NewMockSamplesV1Interface(ctrl *gomock.Controller) *MockSamplesV1Interface { + mock := &MockSamplesV1Interface{ctrl: ctrl} + mock.recorder = &MockSamplesV1InterfaceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockSamplesV1Interface) EXPECT() *MockSamplesV1InterfaceMockRecorder { + return m.recorder +} + +// Configs mocks base method. +func (m *MockSamplesV1Interface) Configs() v10.ConfigInterface { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Configs") + ret0, _ := ret[0].(v10.ConfigInterface) + return ret0 +} + +// Configs indicates an expected call of Configs. +func (mr *MockSamplesV1InterfaceMockRecorder) Configs() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Configs", reflect.TypeOf((*MockSamplesV1Interface)(nil).Configs)) +} + +// RESTClient mocks base method. +func (m *MockSamplesV1Interface) RESTClient() rest.Interface { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RESTClient") + ret0, _ := ret[0].(rest.Interface) + return ret0 +} + +// RESTClient indicates an expected call of RESTClient. +func (mr *MockSamplesV1InterfaceMockRecorder) RESTClient() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RESTClient", reflect.TypeOf((*MockSamplesV1Interface)(nil).RESTClient)) +} + +// MockConfigInterface is a mock of ConfigInterface interface. +type MockConfigInterface struct { + ctrl *gomock.Controller + recorder *MockConfigInterfaceMockRecorder +} + +// MockConfigInterfaceMockRecorder is the mock recorder for MockConfigInterface. +type MockConfigInterfaceMockRecorder struct { + mock *MockConfigInterface +} + +// NewMockConfigInterface creates a new mock instance. 
+func NewMockConfigInterface(ctrl *gomock.Controller) *MockConfigInterface { + mock := &MockConfigInterface{ctrl: ctrl} + mock.recorder = &MockConfigInterfaceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockConfigInterface) EXPECT() *MockConfigInterfaceMockRecorder { + return m.recorder +} + +// Create mocks base method. +func (m *MockConfigInterface) Create(arg0 context.Context, arg1 *v1.Config, arg2 v11.CreateOptions) (*v1.Config, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Create", arg0, arg1, arg2) + ret0, _ := ret[0].(*v1.Config) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Create indicates an expected call of Create. +func (mr *MockConfigInterfaceMockRecorder) Create(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Create", reflect.TypeOf((*MockConfigInterface)(nil).Create), arg0, arg1, arg2) +} + +// Delete mocks base method. +func (m *MockConfigInterface) Delete(arg0 context.Context, arg1 string, arg2 v11.DeleteOptions) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Delete", arg0, arg1, arg2) + ret0, _ := ret[0].(error) + return ret0 +} + +// Delete indicates an expected call of Delete. +func (mr *MockConfigInterfaceMockRecorder) Delete(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Delete", reflect.TypeOf((*MockConfigInterface)(nil).Delete), arg0, arg1, arg2) +} + +// DeleteCollection mocks base method. +func (m *MockConfigInterface) DeleteCollection(arg0 context.Context, arg1 v11.DeleteOptions, arg2 v11.ListOptions) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "DeleteCollection", arg0, arg1, arg2) + ret0, _ := ret[0].(error) + return ret0 +} + +// DeleteCollection indicates an expected call of DeleteCollection. +func (mr *MockConfigInterfaceMockRecorder) DeleteCollection(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DeleteCollection", reflect.TypeOf((*MockConfigInterface)(nil).DeleteCollection), arg0, arg1, arg2) +} + +// Get mocks base method. +func (m *MockConfigInterface) Get(arg0 context.Context, arg1 string, arg2 v11.GetOptions) (*v1.Config, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Get", arg0, arg1, arg2) + ret0, _ := ret[0].(*v1.Config) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Get indicates an expected call of Get. +func (mr *MockConfigInterfaceMockRecorder) Get(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Get", reflect.TypeOf((*MockConfigInterface)(nil).Get), arg0, arg1, arg2) +} + +// List mocks base method. +func (m *MockConfigInterface) List(arg0 context.Context, arg1 v11.ListOptions) (*v1.ConfigList, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "List", arg0, arg1) + ret0, _ := ret[0].(*v1.ConfigList) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// List indicates an expected call of List. +func (mr *MockConfigInterfaceMockRecorder) List(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "List", reflect.TypeOf((*MockConfigInterface)(nil).List), arg0, arg1) +} + +// Patch mocks base method. 
+func (m *MockConfigInterface) Patch(arg0 context.Context, arg1 string, arg2 types.PatchType, arg3 []byte, arg4 v11.PatchOptions, arg5 ...string) (*v1.Config, error) { + m.ctrl.T.Helper() + varargs := []interface{}{arg0, arg1, arg2, arg3, arg4} + for _, a := range arg5 { + varargs = append(varargs, a) + } + ret := m.ctrl.Call(m, "Patch", varargs...) + ret0, _ := ret[0].(*v1.Config) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Patch indicates an expected call of Patch. +func (mr *MockConfigInterfaceMockRecorder) Patch(arg0, arg1, arg2, arg3, arg4 interface{}, arg5 ...interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + varargs := append([]interface{}{arg0, arg1, arg2, arg3, arg4}, arg5...) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Patch", reflect.TypeOf((*MockConfigInterface)(nil).Patch), varargs...) +} + +// Update mocks base method. +func (m *MockConfigInterface) Update(arg0 context.Context, arg1 *v1.Config, arg2 v11.UpdateOptions) (*v1.Config, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Update", arg0, arg1, arg2) + ret0, _ := ret[0].(*v1.Config) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Update indicates an expected call of Update. +func (mr *MockConfigInterfaceMockRecorder) Update(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Update", reflect.TypeOf((*MockConfigInterface)(nil).Update), arg0, arg1, arg2) +} + +// UpdateStatus mocks base method. +func (m *MockConfigInterface) UpdateStatus(arg0 context.Context, arg1 *v1.Config, arg2 v11.UpdateOptions) (*v1.Config, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "UpdateStatus", arg0, arg1, arg2) + ret0, _ := ret[0].(*v1.Config) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// UpdateStatus indicates an expected call of UpdateStatus. +func (mr *MockConfigInterfaceMockRecorder) UpdateStatus(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateStatus", reflect.TypeOf((*MockConfigInterface)(nil).UpdateStatus), arg0, arg1, arg2) +} + +// Watch mocks base method. +func (m *MockConfigInterface) Watch(arg0 context.Context, arg1 v11.ListOptions) (watch.Interface, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Watch", arg0, arg1) + ret0, _ := ret[0].(watch.Interface) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Watch indicates an expected call of Watch. +func (mr *MockConfigInterfaceMockRecorder) Watch(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Watch", reflect.TypeOf((*MockConfigInterface)(nil).Watch), arg0, arg1) +} diff --git a/pkg/util/mocks/samplesclient/versioned.go b/pkg/util/mocks/samplesclient/versioned.go new file mode 100644 index 00000000000..cd7a8c6f0dc --- /dev/null +++ b/pkg/util/mocks/samplesclient/versioned.go @@ -0,0 +1,64 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/openshift/client-go/samples/clientset/versioned (interfaces: Interface) + +// Package mock_versioned is a generated GoMock package. +package mock_versioned + +import ( + reflect "reflect" + + gomock "github.com/golang/mock/gomock" + v1 "github.com/openshift/client-go/samples/clientset/versioned/typed/samples/v1" + discovery "k8s.io/client-go/discovery" +) + +// MockInterface is a mock of Interface interface. 
+type MockInterface struct { + ctrl *gomock.Controller + recorder *MockInterfaceMockRecorder +} + +// MockInterfaceMockRecorder is the mock recorder for MockInterface. +type MockInterfaceMockRecorder struct { + mock *MockInterface +} + +// NewMockInterface creates a new mock instance. +func NewMockInterface(ctrl *gomock.Controller) *MockInterface { + mock := &MockInterface{ctrl: ctrl} + mock.recorder = &MockInterfaceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockInterface) EXPECT() *MockInterfaceMockRecorder { + return m.recorder +} + +// Discovery mocks base method. +func (m *MockInterface) Discovery() discovery.DiscoveryInterface { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Discovery") + ret0, _ := ret[0].(discovery.DiscoveryInterface) + return ret0 +} + +// Discovery indicates an expected call of Discovery. +func (mr *MockInterfaceMockRecorder) Discovery() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Discovery", reflect.TypeOf((*MockInterface)(nil).Discovery)) +} + +// SamplesV1 mocks base method. +func (m *MockInterface) SamplesV1() v1.SamplesV1Interface { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SamplesV1") + ret0, _ := ret[0].(v1.SamplesV1Interface) + return ret0 +} + +// SamplesV1 indicates an expected call of SamplesV1. +func (mr *MockInterfaceMockRecorder) SamplesV1() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SamplesV1", reflect.TypeOf((*MockInterface)(nil).SamplesV1)) +} diff --git a/pkg/util/steps/condition.go b/pkg/util/steps/condition.go index adcc953441f..3a6f8e8a517 100644 --- a/pkg/util/steps/condition.go +++ b/pkg/util/steps/condition.go @@ -5,13 +5,35 @@ package steps import ( "context" + "errors" "fmt" + "net/http" + "strings" "time" "github.com/sirupsen/logrus" "k8s.io/apimachinery/pkg/util/wait" + + "github.com/Azure/ARO-RP/pkg/api" ) +// Functions that run as condition-steps return Error +// instead of InternalServerError. +// The aim is to surface specific, actionable failure cases to customers rather than generic Hive errors. +// When a condition times out, the map below supplies the customer-facing message, keyed by the condition function's name. +var timeoutConditionErrors = map[string]string{ + "apiServersReady": "Kube API has not initialized successfully and is unavailable.", + "minimumWorkerNodesReady": "Minimum number of worker nodes have not been successfully created.", + "operatorConsoleExists": "Console Cluster Operator has failed to initialize successfully.", + "operatorConsoleReady": "Console Cluster Operator has not started successfully.", + "clusterVersionReady": "Cluster Version is not reporting status as ready.", + "ingressControllerReady": "Ingress Cluster Operator has not started successfully.", + "aroDeploymentReady": "ARO Cluster Operator has failed to initialize successfully.", + "ensureAROOperatorRunningDesiredVersion": "ARO Cluster Operator is not running the desired version.", + "hiveClusterDeploymentReady": "Timed out waiting for a condition, cluster installation is unsuccessful.", + "hiveClusterInstallationComplete": "Timed out waiting for a condition, cluster installation is unsuccessful.", +} + // conditionFunction is a function that takes a context and returns whether the // condition has been met and an error.
// @@ -54,21 +76,49 @@ func (c conditionStep) run(ctx context.Context, log *logrus.Entry) error { // Run the condition function immediately, and then every // runner.pollInterval, until the condition returns true or timeoutCtx's - // timeout fires. Errors from `f` are returned directly. + // timeout fires. Errors from `f` are returned directly unless the error + // is ErrWaitTimeout. Internal ErrWaitTimeout errors are wrapped to avoid + // confusion with wait.PollImmediateUntil's own behavior of returning + // ErrWaitTimeout when the condition is not met. err := wait.PollImmediateUntil(pollInterval, func() (bool, error) { // We use the outer context, not the timeout context, as we do not want // to time out the condition function itself, only stop retrying once // timeoutCtx's timeout has fired. - return c.f(ctx) + cnd, cndErr := c.f(ctx) + if errors.Is(cndErr, wait.ErrWaitTimeout) { + return cnd, fmt.Errorf("condition encountered internal timeout: %w", cndErr) + } + + return cnd, cndErr }, timeoutCtx.Done()) if err != nil && !c.fail { log.Warnf("step %s failed but has configured 'fail=%t'. Continuing. Error: %s", c, c.fail, err.Error()) return nil } + // Enrich only the bare wait.ErrWaitTimeout returned by the poller itself; + // the wrapped internal timeouts above compare unequal and pass through unchanged. + if err == wait.ErrWaitTimeout { + return enrichConditionTimeoutError(c.f) + } return err } +// enrichConditionTimeoutError replaces the generic "timed out waiting for the condition" +// error with the actionable message from timeoutConditionErrors, keyed by the +// condition function's name. +func enrichConditionTimeoutError(f conditionFunction) error { + funcNameParts := strings.Split(FriendlyName(f), ".") + funcName := strings.TrimSuffix(funcNameParts[len(funcNameParts)-1], "-fm") + + message, exists := timeoutConditionErrors[funcName] + if !exists { + return errors.New("timed out waiting for the condition") + } + return api.NewCloudError( + http.StatusInternalServerError, + api.CloudErrorCodeDeploymentFailed, + "", message+" Please retry; if the issue persists, raise an Azure support ticket.", + ) +} + func (c conditionStep) String() string { return fmt.Sprintf("[Condition %s, timeout %s]", FriendlyName(c.f), c.timeout) } diff --git a/pkg/util/steps/condition_test.go b/pkg/util/steps/condition_test.go new file mode 100644 index 00000000000..cfecdaed7a1 --- /dev/null +++ b/pkg/util/steps/condition_test.go @@ -0,0 +1,95 @@ +package steps + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0.
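The tests that follow pin down this mapping. As context for them, the lookup works because a Go method value's qualified name carries a "-fm" suffix; the standalone sketch below (assumed names, not code from this change) shows the key derivation and why only the poller's own bare wait.ErrWaitTimeout triggers enrichment:

package main

import (
	"fmt"
	"strings"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	// FriendlyName is assumed to yield the fully qualified method-value name,
	// which the Go runtime suffixes with "-fm".
	name := "github.com/Azure/ARO-RP/pkg/cluster.(*manager).apiServersReady-fm"
	parts := strings.Split(name, ".")
	key := strings.TrimSuffix(parts[len(parts)-1], "-fm")
	fmt.Println(key) // apiServersReady, a timeoutConditionErrors key

	// A wrapped internal timeout is errors.Is-equal to wait.ErrWaitTimeout
	// but not ==-equal, so the plain equality check above distinguishes the
	// poller's timeout from a condition function's own timeout.
	wrapped := fmt.Errorf("condition encountered internal timeout: %w", wait.ErrWaitTimeout)
	fmt.Println(wrapped == wait.ErrWaitTimeout) // false
}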
+ +import ( + "context" + "testing" +) + +// Function names used as conditionFunctions below; they match +// the keys of the timeoutConditionErrors map. +func apiServersReady(context.Context) (bool, error) { return false, nil } +func minimumWorkerNodesReady(context.Context) (bool, error) { return false, nil } +func operatorConsoleExists(context.Context) (bool, error) { return false, nil } +func operatorConsoleReady(context.Context) (bool, error) { return false, nil } +func clusterVersionReady(context.Context) (bool, error) { return false, nil } +func ingressControllerReady(context.Context) (bool, error) { return false, nil } +func aroDeploymentReady(context.Context) (bool, error) { return false, nil } +func ensureAROOperatorRunningDesiredVersion(context.Context) (bool, error) { return false, nil } +func hiveClusterDeploymentReady(context.Context) (bool, error) { return false, nil } +func hiveClusterInstallationComplete(context.Context) (bool, error) { return false, nil } + +func TestEnrichConditionTimeoutError(t *testing.T) { + for _, tt := range []struct { + desc string + function conditionFunction + wantErr string + }{ + // Verify the enriched message for each function in timeoutConditionErrors, + // and the generic error for an unknown function. + { + // unknown function + desc: "test conditionfail for func - unknownFunc", + function: timingOutCondition, + wantErr: "timed out waiting for the condition", + }, + { + desc: "test conditionfail for func - apiServersReady", + function: apiServersReady, + wantErr: "500: DeploymentFailed: : Kube API has not initialized successfully and is unavailable. Please retry; if the issue persists, raise an Azure support ticket.", + }, + { + desc: "test conditionfail for func - minimumWorkerNodesReady", + function: minimumWorkerNodesReady, + wantErr: "500: DeploymentFailed: : Minimum number of worker nodes have not been successfully created. Please retry; if the issue persists, raise an Azure support ticket.", + }, + { + desc: "test conditionfail for func - operatorConsoleExists", + function: operatorConsoleExists, + wantErr: "500: DeploymentFailed: : Console Cluster Operator has failed to initialize successfully. Please retry; if the issue persists, raise an Azure support ticket.", + }, + { + desc: "test conditionfail for func - operatorConsoleReady", + function: operatorConsoleReady, + wantErr: "500: DeploymentFailed: : Console Cluster Operator has not started successfully. Please retry; if the issue persists, raise an Azure support ticket.", + }, + { + desc: "test conditionfail for func - clusterVersionReady", + function: clusterVersionReady, + wantErr: "500: DeploymentFailed: : Cluster Version is not reporting status as ready. Please retry; if the issue persists, raise an Azure support ticket.", + }, + { + desc: "test conditionfail for func - ingressControllerReady", + function: ingressControllerReady, + wantErr: "500: DeploymentFailed: : Ingress Cluster Operator has not started successfully. Please retry; if the issue persists, raise an Azure support ticket.", + }, + { + desc: "test conditionfail for func - aroDeploymentReady", + function: aroDeploymentReady, + wantErr: "500: DeploymentFailed: : ARO Cluster Operator has failed to initialize successfully. Please retry; if the issue persists, raise an Azure support ticket.", + }, + { + desc: "test conditionfail for func - ensureAROOperatorRunningDesiredVersion", + function: ensureAROOperatorRunningDesiredVersion, + wantErr: "500: DeploymentFailed: : ARO Cluster Operator is not running the desired version. Please retry; if the issue persists, raise an Azure support ticket.", + }, + { + desc:
"test conditionfail for func - hiveClusterDeploymentReady", + function: hiveClusterDeploymentReady, + wantErr: "500: DeploymentFailed: : Timed out waiting for a condition, cluster Installation is unsuccessful.Please retry, if issue persists: raise azure support ticket", + }, + { + desc: "test conditionfail for func - hiveClusterInstallationComplete", + function: hiveClusterInstallationComplete, + wantErr: "500: DeploymentFailed: : Timed out waiting for a condition, cluster Installation is unsuccessful.Please retry, if issue persists: raise azure support ticket", + }, + } { + t.Run(tt.desc, func(t *testing.T) { + if got := enrichConditionTimeoutError(tt.function); got.Error() != tt.wantErr { + t.Errorf("invlaid enrichConditionTimeoutError: %s, got: %s", tt.wantErr, got) + } + }) + } +} diff --git a/pkg/util/steps/runner_test.go b/pkg/util/steps/runner_test.go index 69d6a9e76c9..4b3cf30ec3f 100644 --- a/pkg/util/steps/runner_test.go +++ b/pkg/util/steps/runner_test.go @@ -13,6 +13,7 @@ import ( "github.com/onsi/gomega" "github.com/onsi/gomega/types" "github.com/sirupsen/logrus" + "k8s.io/apimachinery/pkg/util/wait" utilerror "github.com/Azure/ARO-RP/test/util/error" testlog "github.com/Azure/ARO-RP/test/util/log" @@ -26,6 +27,9 @@ func timingOutCondition(ctx context.Context) (bool, error) { time.Sleep(60 * time.Millisecond) return false, nil } +func internalTimeoutCondition(ctx context.Context) (bool, error) { + return false, wait.ErrWaitTimeout +} func currentTimeFunc() time.Time { return time.Now() @@ -169,6 +173,36 @@ func TestStepRunner(t *testing.T) { }, wantErr: "timed out waiting for the condition", }, + { + name: "A Condition that returns a timeout error causes a different failure from a timed out Condition", + steps: func(controller *gomock.Controller) []Step { + return []Step{ + Action(successfulFunc), + &conditionStep{ + f: internalTimeoutCondition, + fail: true, + pollInterval: 20 * time.Millisecond, + timeout: 50 * time.Millisecond, + }, + Action(successfulFunc), + } + }, + wantEntries: []map[string]types.GomegaMatcher{ + { + "msg": gomega.Equal("running step [Action github.com/Azure/ARO-RP/pkg/util/steps.successfulFunc]"), + "level": gomega.Equal(logrus.InfoLevel), + }, + { + "msg": gomega.Equal("running step [Condition github.com/Azure/ARO-RP/pkg/util/steps.internalTimeoutCondition, timeout 50ms]"), + "level": gomega.Equal(logrus.InfoLevel), + }, + { + "msg": gomega.Equal("step [Condition github.com/Azure/ARO-RP/pkg/util/steps.internalTimeoutCondition, timeout 50ms] encountered error: condition encountered internal timeout: timed out waiting for the condition"), + "level": gomega.Equal(logrus.ErrorLevel), + }, + }, + wantErr: "condition encountered internal timeout: timed out waiting for the condition", + }, { name: "A Condition that does not return true in the timeout time causes a failure", steps: func(controller *gomock.Controller) []Step { diff --git a/python/az/aro/azext_aro/_client_factory.py b/python/az/aro/azext_aro/_client_factory.py index c2402e2a9a2..775fba938d0 100644 --- a/python/az/aro/azext_aro/_client_factory.py +++ b/python/az/aro/azext_aro/_client_factory.py @@ -4,7 +4,7 @@ import urllib3 from azext_aro.custom import rp_mode_development -from azext_aro.vendored_sdks.azure.mgmt.redhatopenshift.v2022_09_04 import AzureRedHatOpenShiftClient +from azext_aro.vendored_sdks.azure.mgmt.redhatopenshift.v2023_04_01 import AzureRedHatOpenShiftClient from azure.cli.core.commands.client_factory import get_mgmt_service_client diff --git a/python/az/aro/azext_aro/_params.py 
b/python/az/aro/azext_aro/_params.py index a3e6a95a8c6..fcb56e536a0 100644 --- a/python/az/aro/azext_aro/_params.py +++ b/python/az/aro/azext_aro/_params.py @@ -16,6 +16,7 @@ from azext_aro._validators import validate_worker_vm_disk_size_gb from azext_aro._validators import validate_refresh_cluster_credentials from azext_aro._validators import validate_version_format +from azext_aro._validators import validate_outbound_type from azure.cli.core.commands.parameters import name_type from azure.cli.core.commands.parameters import get_enum_type, get_three_state_flag from azure.cli.core.commands.parameters import resource_group_name_type @@ -64,7 +65,9 @@ def load_arguments(self, _): c.argument('service_cidr', help='CIDR of service network. Must be a minimum of /18 or larger.', validator=validate_cidr('service_cidr')) - + c.argument('outbound_type', + help='Outbound type of cluster. Must be "Loadbalancer" (default) or "UserDefinedRouting".', + validator=validate_outbound_type) c.argument('disk_encryption_set', help='ResourceID of the DiskEncryptionSet to be used for master and worker VMs.', validator=validate_disk_encryption_set) diff --git a/python/az/aro/azext_aro/_validators.py b/python/az/aro/azext_aro/_validators.py index 5f18ed30e0c..06d9b5d9abb 100644 --- a/python/az/aro/azext_aro/_validators.py +++ b/python/az/aro/azext_aro/_validators.py @@ -118,6 +118,25 @@ def validate_pull_secret(namespace): raise InvalidArgumentValueError("Invalid --pull-secret.") from e +def validate_outbound_type(namespace): + outbound_type = getattr(namespace, 'outbound_type') + if outbound_type not in {'UserDefinedRouting', 'Loadbalancer', None}: + raise InvalidArgumentValueError('Invalid --outbound-type: must be "UserDefinedRouting" or "Loadbalancer"') + + ingress_visibility = getattr(namespace, 'ingress_visibility') + apiserver_visibility = getattr(namespace, 'apiserver_visibility') + + if (outbound_type == 'UserDefinedRouting' and + (is_visibility_public(ingress_visibility) or is_visibility_public(apiserver_visibility))): + raise InvalidArgumentValueError('Invalid --outbound-type: cannot use UserDefinedRouting when ' + + 'either --apiserver-visibility or --ingress-visibility is set ' + + 'to Public or not defined') + + +def is_visibility_public(visibility): + return visibility == 'Public' or visibility is None + + def validate_subnet(key): def _validate_subnet(cmd, namespace): subnet = getattr(namespace, key) diff --git a/python/az/aro/azext_aro/commands.py b/python/az/aro/azext_aro/commands.py index 247c62f8b34..e11171dcace 100644 --- a/python/az/aro/azext_aro/commands.py +++ b/python/az/aro/azext_aro/commands.py @@ -11,7 +11,7 @@ def load_command_table(self, _): aro_sdk = CliCommandType( - operations_tmpl='azext_aro.vendored_sdks.azure.mgmt.redhatopenshift.v2022_09_04.operations#OpenShiftClustersOperations.{}', # pylint: disable=line-too-long + operations_tmpl='azext_aro.vendored_sdks.azure.mgmt.redhatopenshift.v2023_04_01.operations#OpenShiftClustersOperations.{}', # pylint: disable=line-too-long client_factory=cf_aro) with self.command_group('aro', aro_sdk, client_factory=cf_aro) as g: diff --git a/python/az/aro/azext_aro/custom.py b/python/az/aro/azext_aro/custom.py index 6c2ec5520cc..459a5ec12ec 100644 --- a/python/az/aro/azext_aro/custom.py +++ b/python/az/aro/azext_aro/custom.py @@ -7,7 +7,7 @@ from base64 import b64decode import textwrap -import azext_aro.vendored_sdks.azure.mgmt.redhatopenshift.v2022_09_04.models as openshiftcluster +import 
azext_aro.vendored_sdks.azure.mgmt.redhatopenshift.v2023_04_01.models as openshiftcluster from azure.cli.command_modules.role import GraphError from azure.cli.core.commands.client_factory import get_mgmt_service_client @@ -53,6 +53,7 @@ def aro_create(cmd, # pylint: disable=too-many-locals client_secret=None, pod_cidr=None, service_cidr=None, + outbound_type=None, disk_encryption_set=None, master_encryption_at_host=False, master_vm_size=None, @@ -139,6 +140,7 @@ def aro_create(cmd, # pylint: disable=too-many-locals network_profile=openshiftcluster.NetworkProfile( pod_cidr=pod_cidr or '10.128.0.0/14', service_cidr=service_cidr or '172.30.0.0/16', + outbound_type=outbound_type or '', ), master_profile=openshiftcluster.MasterProfile( vm_size=master_vm_size or 'Standard_D8s_v3', diff --git a/python/az/aro/azext_aro/tests/latest/unit/test_validators.py b/python/az/aro/azext_aro/tests/latest/unit/test_validators.py index 4e3ce912416..bcae4a441af 100644 --- a/python/az/aro/azext_aro/tests/latest/unit/test_validators.py +++ b/python/az/aro/azext_aro/tests/latest/unit/test_validators.py @@ -3,7 +3,7 @@ from unittest.mock import Mock, patch from azext_aro._validators import ( - validate_cidr, validate_client_id, validate_client_secret, validate_cluster_resource_group, + validate_cidr, validate_client_id, validate_client_secret, validate_cluster_resource_group, validate_outbound_type, validate_disk_encryption_set, validate_domain, validate_pull_secret, validate_subnet, validate_subnets, validate_visibility, validate_vnet_resource_group_name, validate_worker_count, validate_worker_vm_disk_size_gb, validate_refresh_cluster_credentials ) @@ -775,3 +775,107 @@ def test_validate_refresh_cluster_credentials(test_description, namespace, expec else: with pytest.raises(expected_exception): validate_refresh_cluster_credentials(namespace) + + +test_validate_outbound_type_data = [ + ( + "Should not raise exception when key is Loadbalancer.", + Mock(outbound_type='Loadbalancer'), + None + ), + ( + "Should not raise exception when key is Loadbalancer and ingress visibility private", + Mock( + outbound_type='Loadbalancer', + apiserver_visibility="Public", + ingress_visibility="Private" + ), + None + ), + ( + "Should not raise exception when key is Loadbalancer and apiserver visibility private", + Mock( + outbound_type='Loadbalancer', + apiserver_visibility="Private", + ingress_visibility="Public" + ), + None + ), + ( + "Should not raise exception when key is Loadbalancer and ingress/apiserver visibility private", + Mock( + outbound_type='Loadbalancer', + apiserver_visibility="Private", + ingress_visibility="Private" + ), + None + ), + ( + "Should not raise exception when key is empty.", + Mock(outbound_type=None), + None + ), + ( + "Should not raise exception with UDR and ingress/apiserver visibility private", + Mock( + outbound_type="UserDefinedRouting", + apiserver_visibility="Private", + ingress_visibility="Private" + ), + None + ), + ( + "Should raise exception with UDR and ingress visibility is public", + Mock( + outbound_type="UserDefinedRouting", + apiserver_visibility="Private", + ingress_visibility="Public" + ), + InvalidArgumentValueError + ), + ( + "Should raise exception with UDR and apiserver visibility is public", + Mock( + outbound_type="UserDefinedRouting", + apiserver_visibility="Public", + ingress_visibility="Private" + ), + InvalidArgumentValueError + ), + ( + "Should raise exception with UDR and apiserver/ingress visibility is public", + Mock( + outbound_type="UserDefinedRouting", + 
apiserver_visibility="Public", + ingress_visibility="Public" + ), + InvalidArgumentValueError + ), + ( + "Should raise exception when key is UserDefinedRouting and apiserver/ingress visibilities are not defined.", + Mock( + outbound_type="UserDefinedRouting", + apiserver_visibility=None, + ingress_visibility=None + ), + InvalidArgumentValueError + ), + ( + "Should raise exception when key is a different value.", + Mock(outbound_type='testFail'), + InvalidArgumentValueError + ), +] + + +@pytest.mark.parametrize( + "test_description, namespace, expected_exception", + test_validate_outbound_type_data, + ids=[i[0] for i in test_validate_outbound_type_data] +) +def test_validate_outbound_type(test_description, namespace, expected_exception): + if expected_exception is None: + validate_outbound_type(namespace) + else: + with pytest.raises(expected_exception): + validate_outbound_type(namespace)
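Taken together, the validator and tests above mean --outbound-type UserDefinedRouting is only accepted on fully private clusters. Assuming the flag wiring shown in _params.py, a conforming invocation would look something like:

az aro create --resource-group myRG --name myCluster --vnet myVnet --master-subnet master --worker-subnet worker --apiserver-visibility Private --ingress-visibility Private --outbound-type UserDefinedRouting

Note that leaving either visibility flag unset counts as Public for this check (see is_visibility_public), so UserDefinedRouting must always be paired with both flags explicitly set to Private.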