Skip to content

Commit a53221e

Browse files
committed
test/cases/worker-executor-crash: verify error details on failure
If osbuild fails, its stdout should be captured and added to the error details, including when the ec2 executor is used. This test case makes sure that the error details contain the stack trace and the error message from osbuild.
1 parent ac2ae18 commit a53221e

File tree

2 files changed

+246
-0
lines changed

2 files changed

+246
-0
lines changed

.gitlab-ci.yml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -889,6 +889,19 @@ WorkerExecutor:
889889
RUNNER: aws/fedora-41-x86_64
890890
IAM_INSTANCE_PROFILE: worker-executor
891891

892+
# CI job: deploy the build under test, then run the worker-executor crash test
# on an AWS Fedora 41 runner using the "worker-executor" instance profile.
WorkerExecutorFailure:
  stage: test
  extends: .terraform
  rules:
    - !reference [.upstream_rules_all, rules]
    - !reference [.ga_rules_all, rules]
  script:
    - schutzbot/deploy.sh
    - /usr/libexec/tests/osbuild-composer/worker-executor-crash.sh
  variables:
    RUNNER: aws/fedora-41-x86_64
    IAM_INSTANCE_PROFILE: worker-executor
904+
892905
finish:
893906
stage: finish
894907
dependencies: []
Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
#!/bin/bash
#
# Starts a compose that is expected to fail while the worker uses the
# "aws.ec2" osbuild executor, then checks that the reported error details
# contain osbuild's error message and Python stack trace.

set -euo pipefail

source /usr/libexec/osbuild-composer-test/set-env-variables.sh
source /usr/libexec/tests/osbuild-composer/shared_lib.sh

# Container image used for cloud provider CLI tools
CONTAINER_IMAGE_CLOUD_TOOLS="quay.io/osbuild/cloud-tools:latest"

# Provision the software under test.
/usr/libexec/osbuild-composer-test/provision.sh none

# Scratch directory; every intermediate file of this run lives below it.
TEMPDIR=$(mktemp -d)
BLUEPRINT_FILE=${TEMPDIR}/blueprint.toml
COMPOSE_START=${TEMPDIR}/compose-start.json
COMPOSE_INFO=${TEMPDIR}/compose-info.json
DESCR_INST=${TEMPDIR}/descr-inst.json
AUTH_SG=${TEMPDIR}/auth-sgrule.json
DESCR_SGRULE=${TEMPDIR}/descr-sgrule.json
KEYPAIR=${TEMPDIR}/keypair.pem

# Instance id and private IPv4 address of this host, read from the EC2
# instance metadata endpoint. --fail makes curl exit non-zero on an HTTP error
# instead of handing the error page back as the value.
INSTANCE_ID=$(curl --fail -Ls http://169.254.169.254/latest/meta-data/instance-id)
WORKER_HOST=$(curl --fail -Ls http://169.254.169.254/latest/meta-data/local-ipv4)

# Both values are baked into AWS resource names and security checks below, so
# an empty answer must stop the test right here.
: "${INSTANCE_ID:?could not determine the EC2 instance id}"
: "${WORKER_HOST:?could not determine the local IPv4 address}"
26+
# Pick the container runtime: podman is preferred, docker is the fallback.
# The script exits with status 2 when neither one is available.
CONTAINER_RUNTIME=
for RUNTIME_CANDIDATE in podman docker; do
    if type -p "$RUNTIME_CANDIDATE" > /dev/null 2>&1; then
        CONTAINER_RUNTIME=$RUNTIME_CANDIDATE
        break
    fi
done

if [ -z "$CONTAINER_RUNTIME" ]; then
    echo "No container runtime found, install podman or docker."
    exit 2
fi
35+
36+
# Decide how the AWS CLI is invoked. AWS_CMD is kept as a plain string because
# the rest of the script expands it unquoted and relies on word splitting.
if hash aws; then
    echo "Using pre-installed 'aws' from the system"
    AWS_CMD="aws --region $AWS_REGION --output json --color on"
else
    echo "Using 'awscli' from a container"
    sudo "${CONTAINER_RUNTIME}" pull ${CONTAINER_IMAGE_CLOUD_TOOLS}

    # The scratch directory is mounted at the same path inside the container.
    AWS_CMD="sudo ${CONTAINER_RUNTIME} run --rm"
    AWS_CMD="${AWS_CMD} -v ${TEMPDIR}:${TEMPDIR}:Z"
    AWS_CMD="${AWS_CMD} ${CONTAINER_IMAGE_CLOUD_TOOLS} aws --region $AWS_REGION --output json --color on"
fi

# Print the CLI version; this also fails early if the CLI cannot be run.
$AWS_CMD --version
48+
49+
# PIDs of the background helpers started by this script; stopped by cleanup().
subprocessPIDs=()

# Release everything this test created: the EC2 key pair, the background
# helper processes and the scratch directory (which holds the private key).
# Installed as the EXIT trap, so it runs on success and on failure alike.
# Every step is best effort: with `set -e` active a single failing step would
# otherwise abort the trap and skip the remaining ones.
# Globals: AWS_CMD, KEYPAIR, INSTANCE_ID, TEMPDIR, subprocessPIDs (all read).
function cleanup() {
    # since this function can be called at any time, ensure that we don't expand unbound variables
    AWS_CMD="${AWS_CMD:-}"
    local p

    if [ -n "$AWS_CMD" ] && [ -f "${KEYPAIR:-}" ]; then
        $AWS_CMD ec2 delete-key-pair --key-name "key-for-${INSTANCE_ID:-}-executor" || true
    fi

    # The ${array[@]+...} form keeps an empty array from tripping `set -u` on
    # older bash versions.
    for p in ${subprocessPIDs[@]+"${subprocessPIDs[@]}"}; do
        # Children first (e.g. the journalctl started through sudo) ...
        sudo pkill -P "$p" || true
        # ... then the recorded process itself; a backgrounded ssh has no
        # children, so pkill -P alone would leave it running.
        sudo kill "$p" 2> /dev/null || true
    done

    if [ -n "${TEMPDIR:-}" ]; then
        rm -rf -- "$TEMPDIR" || true
    fi
}

trap cleanup EXIT
64+
65+
# Create a dedicated EC2 key pair for the executor, keep the private key in the
# scratch directory (readable by the owner only) and show the registered key.
EXECUTOR_KEY_NAME="key-for-$INSTANCE_ID-executor"
$AWS_CMD ec2 create-key-pair --key-name "$EXECUTOR_KEY_NAME" --query 'KeyMaterial' --output text > "$KEYPAIR"
chmod 400 "$KEYPAIR"
$AWS_CMD ec2 describe-key-pairs --key-names "$EXECUTOR_KEY_NAME"

# Configure the worker to use the "aws.ec2" executor with the key created above.
printf '%s\n' \
    '[osbuild_executor]' \
    'type = "aws.ec2"' \
    "key_name = \"$EXECUTOR_KEY_NAME\"" \
    | sudo tee "/etc/osbuild-worker/osbuild-worker.toml"
74+
75+
# Restart the worker so it picks up the executor configuration written above.
# NOTE(review): the unit name was mangled to "[email protected]" by e-mail
# obfuscation in the page this file was recovered from; "osbuild-worker@1.service"
# is the most likely original - confirm against the repository.
sudo systemctl restart osbuild-worker@1.service
76+
77+
# Blueprint for the image. It enables the service "blergh"; the checks at the
# end of this script expect the build to fail because that unit does not exist.
cat > "$BLUEPRINT_FILE" << 'EOF'
name = "bash"
description = "A base system"
version = "0.0.1"

[customizations]
[customizations.services]
enabled = ["blergh"]
EOF

sudo composer-cli blueprints push "$BLUEPRINT_FILE"

# Follow the worker's journal in the background so its log ends up in the
# output of this script.
WORKER_UNIT=$(sudo systemctl list-units | grep -oE "osbuild.*worker.*\.service")
sudo journalctl -af -n 1 -u "${WORKER_UNIT}" &
subprocessPIDs+=( $! )

# Start the compose and remember its id for the polling loop further down.
sudo composer-cli --json compose start bash container | tee "$COMPOSE_START"
COMPOSE_ID=$(get_build_info ".build_id" "$COMPOSE_START")
97+
# Wait for the executor instance to show up: poll up to 60 times, one minute
# apart, for an instance whose "parent" tag is this host's instance id.
# EXECUTOR_IP stays 0 when none appears in time.
EXECUTOR_IP=0
for _ in {1..60}; do
    $AWS_CMD ec2 describe-instances --filter "Name=tag:parent,Values=$INSTANCE_ID" > "$DESCR_INST"
    RESERVATIONS=$(jq -r '.Reservations | length' "$DESCR_INST")
    if [ "$RESERVATIONS" -gt 0 ]; then
        # "// empty" turns a missing/null address into an empty string, so the
        # literal text "null" is never mistaken for an IP; keep polling instead.
        CANDIDATE_IP=$(jq -r '.Reservations[0].Instances[0].PrivateIpAddress // empty' "$DESCR_INST")
        if [ -n "$CANDIDATE_IP" ]; then
            EXECUTOR_IP=$CANDIDATE_IP
            break
        fi
    fi

    echo "Reservation not ready yet, waiting..."
    sleep 60
done

if [ "$EXECUTOR_IP" = 0 ]; then
    redprint "Unable to find executor host"
    exit 1
fi
114+
115+
# Wait until ssh-keyscan succeeds against the executor: at most 61 attempts
# with a ten second pause after each failed one.
RDY=0
SSH_ATTEMPT=0
while (( SSH_ATTEMPT <= 60 )); do
    if ssh-keyscan "$EXECUTOR_IP" &> /dev/null; then
        RDY=1
        break
    fi
    sleep 10
    SSH_ATTEMPT=$(( SSH_ATTEMPT + 1 ))
done

if [[ "$RDY" == 0 ]]; then
    redprint "Unable to reach executor host $EXECUTOR_IP"
    exit 1
fi
128+
129+
greenprint "Setting up executor"

# Look up the executor's security group and its rules.
SGID=$(jq -r '.Reservations[0].Instances[0].SecurityGroups[0].GroupId' "$DESCR_INST")
$AWS_CMD ec2 describe-security-group-rules --filters "Name=group-id,Values=$SGID" > "$DESCR_SGRULE"

# the executor should be created with exactly one egress rule (allowing traffic to the worker host)
EGRESS_TARGET=$(jq -r '.SecurityGroupRules[] | select(.IsEgress).CidrIpv4' "$DESCR_SGRULE")
if [[ "$EGRESS_TARGET" != "$WORKER_HOST/32" ]]; then
    echo "executors $EGRESS_TARGET is not the expected $WORKER_HOST/32"
    exit 1
fi

# allow the executor to access the internet for the setup; the id of the new
# rule is kept so that it can be revoked again before the build starts
$AWS_CMD ec2 authorize-security-group-egress --group-id "$SGID" --protocol tcp --cidr 0.0.0.0/0 --port 1-65535 > "$AUTH_SG"
SGRULEID=$(jq -r '.SecurityGroupRules[0].SecurityGroupRuleId' "$AUTH_SG")
143+
144+
# Commits that select the package repositories: osbuild-composer comes from
# GIT_COMMIT (falling back to CI_COMMIT_SHA), osbuild from the commit pinned in
# the Schutzfile for this distribution.
GIT_COMMIT="${GIT_COMMIT:-${CI_COMMIT_SHA}}"
OSBUILD_GIT_COMMIT=$(jq -r --arg distro "${ID}-${VERSION_ID}" '.[$distro].dependencies.osbuild.commit' Schutzfile)

# Install the repository definitions on the executor. The heredoc is expanded
# locally on purpose, hence the shellcheck exemption.
# shellcheck disable=SC2087
ssh -oStrictHostKeyChecking=no -i "$KEYPAIR" "fedora@$EXECUTOR_IP" sudo tee "/etc/yum.repos.d/osbuild.repo" <<EOF
[osbuild-composer]
name=osbuild-composer
baseurl=http://osbuild-composer-repos.s3-website.us-east-2.amazonaws.com/osbuild-composer/${ID}-${VERSION_ID}/${ARCH}/${GIT_COMMIT}
enabled=1
gpgcheck=0
priority=10
[osbuild]
name=osbuild
baseurl=http://osbuild-composer-repos.s3-website.us-east-2.amazonaws.com/osbuild/${ID}-${VERSION_ID}/${ARCH}/${OSBUILD_GIT_COMMIT}
enabled=1
gpgcheck=0
priority=10
EOF

# Follow the executor's journal in the background. The target must be
# "$EXECUTOR_IP"; without the "$" ssh tries to resolve the literal host name
# "EXECUTOR_IP" and the executor log is never shown.
ssh -oStrictHostKeyChecking=no -i "$KEYPAIR" "fedora@$EXECUTOR_IP" sudo journalctl -f &
subprocessPIDs+=( $! )

ssh -oStrictHostKeyChecking=no -i "$KEYPAIR" "fedora@$EXECUTOR_IP" sudo dnf install -y osbuild-composer osbuild
166+
167+
# revoke internet access again during the build
$AWS_CMD ec2 revoke-security-group-egress --group-id "$SGID" --security-group-rule-ids "$SGRULEID"

# Re-read the rules of the group and make sure exactly two of them are left.
$AWS_CMD ec2 describe-security-group-rules --filters "Name=group-id,Values=$SGID" > "$DESCR_SGRULE"
SGRULES_LENGTH=$(jq -r '.SecurityGroupRules | length' "$DESCR_SGRULE")
if [[ "$SGRULES_LENGTH" != 2 ]]; then
    echo "Expected exactly 2 security group rules (got $SGRULES_LENGTH)"
    exit 1
fi
176+
177+
greenprint "🔥 opening worker-executor port on firewall"
# Open port 8001/tcp permanently, then reload the firewall. Both calls are
# allowed to fail. ssh joins its arguments with spaces, so handing over the
# remote command as one string runs the same command line on the executor.
for FIREWALL_ARGS in "--zone=public --add-port=8001/tcp --permanent" "--reload"; do
    ssh -oStrictHostKeyChecking=no -i "$KEYPAIR" "fedora@$EXECUTOR_IP" "sudo firewall-cmd $FIREWALL_ARGS" || true
done

greenprint "🚀 Starting worker executor"
# Runs in the background for the rest of the test; stopped by the EXIT trap.
ssh -oStrictHostKeyChecking=no -i "$KEYPAIR" "fedora@$EXECUTOR_IP" \
    sudo /usr/libexec/osbuild-composer/osbuild-worker-executor -host 0.0.0.0 &
subprocessPIDs+=( $! )
184+
185+
# wait for compose to complete
greenprint "⏱ Waiting for compose to finish: ${COMPOSE_ID}"
while :; do
    sudo composer-cli --json compose info "${COMPOSE_ID}" | tee "$COMPOSE_INFO" > /dev/null
    COMPOSE_STATUS=$(get_build_info ".queue_status" "$COMPOSE_INFO")

    # Keep polling every 30 seconds while the compose is queued or running;
    # any other status means it is finished.
    case "$COMPOSE_STATUS" in
        RUNNING | WAITING)
            sleep 30
            ;;
        *)
            break
            ;;
    esac
done
196+
197+
198+
# Fetch the compose list from the cloud API once: print it for the log and
# evaluate the same snapshot, so the output and the checks cannot disagree.
echo "COMPOSES"
COMPOSES_JSON=$(sudo curl --silent --show-error --unix-socket /run/cloudapi/api.socket http:///localhost/api/image-builder-composer/v2/composes/)
printf '%s\n' "$COMPOSES_JSON"

# The first entry of the list is the compose under test.
STATUS=$(jq -r '.[0]' <<< "$COMPOSES_JSON")
COMPOSE_STATUS=$(jq -r '.image_status.status' <<< "$STATUS")
COMPOSE_ERROR=$(jq -r '.image_status.error.reason' <<< "$STATUS")
COMPOSE_ERROR_DETAILS=$(jq -r '.image_status.error.details' <<< "$STATUS")

# Fail the test unless COMPOSE_ERROR_DETAILS contains a literal string.
# Arguments: $1 - text that must occur in the details (matched verbatim)
#            $2 - short form of $1 used in the failure message
# The details are fed to grep through a here-string: with `pipefail` active,
# `echo ... | grep -q` can report a SIGPIPE failure of echo when grep stops
# reading after an early match, which would be misread as "not found".
function expect_in_error_details() {
    local needle=$1
    local label=$2

    if ! grep -qF -- "$needle" <<< "$COMPOSE_ERROR_DETAILS"; then
        echo "details: $COMPOSE_ERROR_DETAILS"
        echo "error details do not contain \"$label\""
        exit 1
    fi
}

if [ "$COMPOSE_STATUS" != "failure" ]; then
    echo "expected build failure, got $STATUS"
    exit 1
fi

if [ "$COMPOSE_ERROR" != "osbuild build failed" ]; then
    echo "expected error reason 'osbuild build failed', got $STATUS"
    exit 1
fi

# Look for a clear error message and python stacktrace in the error details
expect_in_error_details \
    'Failed to enable unit: Unit blergh.service does not exist' \
    'Failed to enable unit...'

expect_in_error_details \
    'Traceback (most recent call last)' \
    'Traceback (most recent call last)'

expect_in_error_details \
    "subprocess.CalledProcessError: Command '['systemctl', '--root', '/run/osbuild/tree', 'enable', '--', 'blergh']' returned non-zero exit status 1." \
    'subprocess.CalledProcessError:...'

0 commit comments

Comments
 (0)