diff --git a/.github/workflows/ansible-lint.yml b/.github/workflows/ansible-lint.yml index 6a070ce47..201678dda 100644 --- a/.github/workflows/ansible-lint.yml +++ b/.github/workflows/ansible-lint.yml @@ -3,11 +3,8 @@ name: Run ansible-lint # Controls when the action will run. on: - # Triggers the workflow on push or pull request events but only for the main branch pull_request: - branches: [main] push: - branches: [main] schedule: - cron: '0 */8 * * *' # Allows you to run this workflow manually from the Actions tab diff --git a/config/ansible-lint.yml b/config/ansible-lint.yml index 9bb28edf0..5eea32934 100644 --- a/config/ansible-lint.yml +++ b/config/ansible-lint.yml @@ -14,11 +14,8 @@ skip_list: - 'command-instead-of-module' - 'command-instead-of-shell' - 'deprecated-local-action' - - 'key-order[task]' - - 'jinja[spacing]' - 'no-free-form' - 'chema[meta]' - - 'name[missing]' - 'var-naming[no-reserved]' - 'var-naming[no-role-prefix]' - 'var-naming[pattern]' @@ -29,15 +26,11 @@ skip_list: - 'yaml[indentation]' - 'yaml[key-duplicates]' - 'yaml[line-length]' - - 'yaml[new-line-at-end-of-file]' - 'yaml[octal-values]' - - 'yaml[trailing-spaces]' - 'yaml[truthy]' - 'name[template]' - 'name[casing]' - 'risky-file-permissions' - - 'risky-shell-pipe' - 'ignore-errors' - 'no-changed-when' - 'fqcn' - - 'args[module]' diff --git a/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml b/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml index 99b8991a5..1b12434c9 100644 --- a/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml +++ b/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml @@ -45,7 +45,8 @@ command: oc create -f "{{ artifact_extra_logs_dir }}/001_pod_run-mlperf-ssd.yml" -- block: +- name: Make sure the benchmark completes + block: - name: Wait for the benchmark completion command: oc get pod/{{ benchmarking_run_mlperf_ssd_name }} diff --git a/roles/cluster/cluster_capture_environment/tasks/main.yml b/roles/cluster/cluster_capture_environment/tasks/main.yml index 5e007541d..d0006390f 100644 --- a/roles/cluster/cluster_capture_environment/tasks/main.yml +++ b/roles/cluster/cluster_capture_environment/tasks/main.yml @@ -37,8 +37,6 @@ command: git describe HEAD --long --always register: git_version - args: - warn: false # don't warn about using git here - name: Store ci-artifact version from Git copy: @@ -50,8 +48,6 @@ command: git show --no-patch register: git_show - args: - warn: false # don't warn about using git here - name: Store ci-artifact last git commit copy: diff --git a/roles/cluster/cluster_create_osd/tasks/main.yml b/roles/cluster/cluster_create_osd/tasks/main.yml index 91233317d..af53a4adf 100644 --- a/roles/cluster/cluster_create_osd/tasks/main.yml +++ b/roles/cluster/cluster_create_osd/tasks/main.yml @@ -121,7 +121,7 @@ ocm edit machinepool {{ cluster_create_osd_machinepool_name }} --cluster={{ cluster_create_osd_cluster_name }} - --replicas={{ [2, cluster_create_osd_compute_nodes|int] |max }} + --replicas={{ [2, cluster_create_osd_compute_nodes | int] | max }} - name: Wait for the desired worker node count shell: | diff --git a/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml b/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml index 7ace0b7d3..026481868 100644 --- a/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml +++ b/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml @@ -96,11 +96,11 @@ - name: Populate the tags dict set_fact: - tags: "{{ tags | default({}) | combine ({ item.key : item.value }) }}" + tags: "{{ tags | default({}) | combine({item.key: item.value}) }}" with_items: - - { 'key': 'Name' , 'value': '{{ cluster_name_cmd.stdout }}'} - - { 'key': '{{ cluster_name_tag_cmd.stdout }}' , 'value': 'owned'} - - { 'key': 'Purpose', 'value': ''} + - {'key': 'Name' , 'value': '{{ cluster_name_cmd.stdout }}'} + - {'key': '{{ cluster_name_tag_cmd.stdout }}' , 'value': 'owned'} + - {'key': 'Purpose', 'value': ''} - name: Get the SecurityGroup content amazon.aws.ec2_group_info: diff --git a/roles/cluster/cluster_deploy_ldap/tasks/main.yml b/roles/cluster/cluster_deploy_ldap/tasks/main.yml index 2a20c6783..defe2ad16 100644 --- a/roles/cluster/cluster_deploy_ldap/tasks/main.yml +++ b/roles/cluster/cluster_deploy_ldap/tasks/main.yml @@ -145,6 +145,7 @@ - name: Get the cluster ID shell: + set -o pipefail; ocm describe cluster "{{ cluster_deploy_ldap_cluster_name }}" --json | jq .id -r register: cluster_id_cmd diff --git a/roles/cluster/cluster_deploy_operator/tasks/main.yml b/roles/cluster/cluster_deploy_operator/tasks/main.yml index f54d27053..af33ef870 100644 --- a/roles/cluster/cluster_deploy_operator/tasks/main.yml +++ b/roles/cluster/cluster_deploy_operator/tasks/main.yml @@ -231,7 +231,8 @@ - name: Instantiate the Subscription command: oc apply -f "{{ artifact_extra_logs_dir }}/src/002_sub.yml" -- block: +- name: Make sure the InstallPlan is deployed + block: - name: Find the operator InstallPlan command: oc get InstallPlan @@ -276,6 +277,20 @@ fail: msg="ClusterServiceVersion install not successful ({{ operator_csv_phase.stdout }})" when: operator_csv_phase.stdout != "Succeeded" + rescue: + - name: Capture the Catalog Operator logs (debug) + shell: + oc logs deployment.apps/catalog-operator + -n openshift-operator-lifecycle-manager + > {{ artifact_extra_logs_dir }}/catalog_operator.log + failed_when: false + + - name: Indicate where the Catalog-operator logs have been saved + debug: msg="The logs of Catalog Operator have been saved in {{ artifact_extra_logs_dir }}/catalog_operator.log" + + - name: Failed because the operator could not be installed from the CatalogSource + fail: msg="Failed because the operator could not be installed from the CatalogSource" + always: - name: Store the YAML of the operator CSV that was installed (debug) shell: @@ -283,25 +298,13 @@ -oyaml -n "{{ cluster_deploy_operator_namespace }}" > {{ artifact_extra_logs_dir }}/operator_csv.yml + - name: Store the YAML of the subscription shell: oc get -f "{{ artifact_extra_logs_dir }}/src/002_sub.yml" -oyaml -n "{{ cluster_deploy_operator_namespace }}" > {{ artifact_extra_logs_dir }}/operator_sub.yml - rescue: - - name: Capture the Catalog Operator logs (debug) - shell: - oc logs deployment.apps/catalog-operator - -n openshift-operator-lifecycle-manager - > {{ artifact_extra_logs_dir }}/catalog_operator.log - failed_when: false - - - name: Indicate where the Catalog-operator logs have been saved - debug: msg="The logs of Catalog Operator have been saved in {{ artifact_extra_logs_dir }}/catalog_operator.log" - - - name: Failed because the operator could not be installed from the CatalogSource - fail: msg="Failed because the operator could not be installed from the CatalogSource" - name: Deploy the operator CustomResource from its ClusterServiceVersion include_tasks: deploy_cr.yml diff --git a/roles/cluster/cluster_ensure_machineset/tasks/main.yml b/roles/cluster/cluster_ensure_machineset/tasks/main.yml index e83ef253e..05174434a 100644 --- a/roles/cluster/cluster_ensure_machineset/tasks/main.yml +++ b/roles/cluster/cluster_ensure_machineset/tasks/main.yml @@ -1,6 +1,7 @@ --- - name: "Check if the cluster already has a {{ machineset_instance_type }} machineset" shell: + set -o pipefail; oc get machineset -n openshift-machine-api {% if machineset_name | length > 0 %} -ojson | jq '.items[] | select(.spec.template.spec.providerSpec.value.instanceType=="{{ machineset_instance_type }}" and .metadata.name=="{{ machineset_name }}") | .metadata.name' -r diff --git a/roles/cluster/cluster_fill_workernodes/tasks/main.yml b/roles/cluster/cluster_fill_workernodes/tasks/main.yml index 13cd5d334..782d33e12 100644 --- a/roles/cluster/cluster_fill_workernodes/tasks/main.yml +++ b/roles/cluster/cluster_fill_workernodes/tasks/main.yml @@ -1,5 +1,6 @@ - name: Get the list of the worker nodes shell: + set -o pipefail; oc get nodes -l{{ cluster_fill_workernodes_label_selector }} -oname diff --git a/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml b/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml index 9c132023f..84291798e 100644 --- a/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml +++ b/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml @@ -55,8 +55,8 @@ echo "- deleting the project $test_project_name ..." oc delete ns "$test_project_name" >/dev/null echo "--> project annotation value: $project_annotation_value" - echo "==> expected value: "'{{ cluster_set_project_annotation_value or "null"}}' - [[ "$project_annotation_value" == '{{ cluster_set_project_annotation_value or "null"}}' ]] + echo "==> expected value: "'{{ cluster_set_project_annotation_value or "null" }}' + [[ "$project_annotation_value" == '{{ cluster_set_project_annotation_value or "null" }}' ]] retries: 120 delay: 5 register: wait_project_template_active diff --git a/roles/cluster/cluster_set_scale/tasks/main.yml b/roles/cluster/cluster_set_scale/tasks/main.yml index 23e08eabc..5302f0f3e 100644 --- a/roles/cluster/cluster_set_scale/tasks/main.yml +++ b/roles/cluster/cluster_set_scale/tasks/main.yml @@ -48,8 +48,8 @@ register: oc_get_machinesets failed_when: not oc_get_machinesets.stdout -- when: current_replicas_sum != scale - name: Change all {{ machineset_instance_type }} machinesets replicas to have sum {{ scale }} +- name: Change all {{ machineset_instance_type }} machinesets replicas to have sum {{ scale }} + when: current_replicas_sum != scale block: - name: Do not downscale any machinesets other than the first one, unless the user used force block: @@ -80,7 +80,8 @@ oc patch machineset -n openshift-machine-api {{ first_machineset }} --patch '{"spec": {"replicas": {{ scale }} }}' --type merge -- block: +- name: Make sure the machinesets are ready + block: - name: Wait for all machinesets with type {{ machineset_instance_type }} to be ready # This is done by verifying that at the availableReplicas @@ -92,6 +93,7 @@ # for more information. # 3. Perform some extra formatting for nicer logging shell: >- + set -o pipefail; oc get machinesets -n openshift-machine-api \ {% if machineset_name | length > 0 %} "{{ machineset_name }}" -ojson \ @@ -111,6 +113,10 @@ retries: 120 delay: 30 + rescue: + - name: Fail because the cluster machineset creation failed + fail: msg="Failing because cluster machineset creation failed" + always: # info about the 'machines' - name: Capture the description of the machines @@ -150,7 +156,4 @@ failed_when: false loop: "{{ oc_get_machinesets.stdout_lines }}" - rescue: - - name: Fail because the cluster machineset creation failed - fail: msg="Failing because cluster machineset creation failed" diff --git a/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml b/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml index c97c8deaa..c2cb53d74 100644 --- a/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml +++ b/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml @@ -1,10 +1,12 @@ - name: List all the AppWrappers in the namespace shell: | + set -o pipefail; oc get appwrappers -n {{ codeflare_cleanup_appwrappers_namespace }} -ojson | jq '.items[] | .metadata.name + " ==> "+ .status.state' -r > "{{ artifact_extra_logs_dir }}/appwrappers.status" oc get appwrappers -n {{ codeflare_cleanup_appwrappers_namespace }} -oyaml > "{{ artifact_extra_logs_dir }}/appwrappers.yaml" - name: Count the AppWrappers in the namespace shell: + set -o pipefail; oc get appwrappers -n {{ codeflare_cleanup_appwrappers_namespace }} -oname | wc -l - name: Create a configmap for the beginning of the test timestamp diff --git a/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml b/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml index a91ca5909..0c2305158 100644 --- a/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml +++ b/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml @@ -43,6 +43,7 @@ - name: Create the namespace if it does not exist shell: + set -o pipefail; oc create ns "{{ codeflare_generate_mcad_load_namespace }}" -oyaml --dry-run=client | tee "{{ artifact_extra_logs_dir }}/src/namespace.yaml" | oc apply -f- - name: Create a configmap for the beginning of the test timestamp diff --git a/roles/entitlement/entitlement_deploy/tasks/main.yml b/roles/entitlement/entitlement_deploy/tasks/main.yml index 2af07d59e..297299d99 100644 --- a/roles/entitlement/entitlement_deploy/tasks/main.yml +++ b/roles/entitlement/entitlement_deploy/tasks/main.yml @@ -31,20 +31,21 @@ - name: "Deploy RHSM from file '{{ entitlement_rhsm }}'" shell: set -o pipefail; - cat "{{ entitlement_mc_rhsm }}" - | python3 "{{ entitlement_py_apply }}" BASE64_ENCODED_RHSM_FILE @"{{ entitlement_rhsm }}" + cat "{{ entitlement_mc_rhsm }}" + | python3 "{{ entitlement_py_apply }}" BASE64_ENCODED_RHSM_FILE @"{{ entitlement_rhsm }}" | python3 "{{ entitlement_py_apply }}" MACHINE_CONFIG_ROLE "{{ machine_config_role }}" | oc apply -f- - name: "Deploy the pem and key-pem from file '{{ entitlement_pem }}'" shell: set -o pipefail; - cat "{{ entitlement_mc_pem }}" + cat "{{ entitlement_mc_pem }}" | python3 "{{ entitlement_py_apply }}" BASE64_ENCODED_PEM_FILE @"{{ entitlement_pem }}" | python3 "{{ entitlement_py_apply }}" MACHINE_CONFIG_ROLE "{{ machine_config_role }}" | oc apply -f- - name: "Deploy the repo CA from file '{{ entitlement_mc_rhsm_ca }}' if requested" + when: entitlement_repo_ca | default('', true) | trim != '' block: - name: Compute the md5sum of the CA file (debug) command: md5sum '{{ entitlement_repo_ca }}' @@ -52,8 +53,7 @@ - name: "Deploy the repo CA from file '{{ entitlement_mc_rhsm_ca }}'" shell: set -o pipefail; - cat "{{ entitlement_mc_rhsm_ca }}" + cat "{{ entitlement_mc_rhsm_ca }}" | python3 "{{ entitlement_py_apply }}" BASE64_ENCODED_RHSM_CA_FILE @"{{ entitlement_repo_ca }}" | python3 "{{ entitlement_py_apply }}" MACHINE_CONFIG_ROLE "{{ machine_config_role }}" | oc apply -f- - when: entitlement_repo_ca | default('', true) | trim != '' diff --git a/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml b/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml index 4f852a985..fd931988e 100644 --- a/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml +++ b/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml @@ -26,7 +26,8 @@ - name: Delete the entitlement tester Pod if it exists command: oc delete --ignore-not-found=true -f "{{ entitlement_tester_pod }}" -- block: +- name: Make sure the entitlement Pod is created + block: - name: Create the entitlement tester Pod command: oc create -f "{{ entitlement_tester_pod }}" diff --git a/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml b/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml index fa97dc17a..083d958a1 100644 --- a/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml +++ b/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml @@ -9,7 +9,8 @@ entitlement_retries: "{{ entitlement_nb_wait_retries }}" when: entitlement_test_and_wait == 'yes' -- block: +- name: Make sure the entitlement Pod is deployed + block: - name: Wait for the entitlement Pod to succeed shell: | set -o errexit; diff --git a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml index 049159fdf..536cbcfc7 100644 --- a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml +++ b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml @@ -10,7 +10,8 @@ retries: 15 delay: 30 -- block: +- name: Make sure the GPU Operator ClusterPolicy is ready + block: - name: Wait for the GPU Operator ClusterPolicy CRD to appear command: oc get crd clusterpolicies.nvidia.com register: has_clusterpolicy_crd diff --git a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml index 29c8446fc..282ef6230 100644 --- a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml +++ b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml @@ -113,7 +113,8 @@ - name: Instantiate the OperatorHub subscription command: oc create -f "{{ artifact_extra_logs_dir }}/gpu_operator_sub.yml" -- block: +- name: Install the GPU Operator + block: - name: Find the GPU Operator OperatorHub InstallPlan command: oc get InstallPlan diff --git a/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml b/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml index b0f62d319..4aa2d8154 100644 --- a/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml +++ b/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml @@ -9,11 +9,15 @@ - name: Get the gpu-operator subscription package name block: - name: Count gpu operator subscription candidates - shell: oc get subscription --all-namespaces -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))] | length' + shell: + set -o pipefail; + oc get subscription --all-namespaces -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))] | length' register: gpu_subscriptions failed_when: gpu_subscriptions.stdout != '1' - name: Read the package name from the first gpu-operator subscription - shell: oc get subscription -A -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))][0].spec.name' + shell: + set -o pipefail; + oc get subscription -A -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))][0].spec.name' register: gpu_operator_subscription_package_name - name: Ensure that there is a CSV for the GPU Operator command: diff --git a/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml b/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml index 3f9cce344..f6524087e 100644 --- a/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml +++ b/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml @@ -91,13 +91,16 @@ - name: Ensure that no GPU was faulty loop: "{{ gpu_burn_gpu_nodes.stdout_lines }}" shell: + set -o pipefail; oc logs pod/gpu-burn-{{ item }} -n {{ gpu_operator_run_gpu_burn_namespace }} | grep FAULTY register: gpu_burn_test_faulty failed_when: gpu_burn_test_faulty.rc == 0 always: - name: Save the logs of the GPU burn Pods - shell: oc logs pod/gpu-burn-{{ item }} -n {{ gpu_operator_run_gpu_burn_namespace }} | grep -o "[^$(printf '\r')]*$" + shell: + set -o pipefail; + oc logs pod/gpu-burn-{{ item }} -n {{ gpu_operator_run_gpu_burn_namespace }} | grep -o "[^$(printf '\r')]*$" with_items: "{{ gpu_burn_gpu_nodes.stdout_lines }}" failed_when: false @@ -124,4 +127,4 @@ - name: Delete the src ConfigMap command: oc --ignore-not-found=true delete configmap gpu-burn-src -n {{ gpu_operator_run_gpu_burn_namespace }} failed_when: false - when: not gpu_operator_run_gpu_burn_keep_resources \ No newline at end of file + when: not gpu_operator_run_gpu_burn_keep_resources diff --git a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml index b90de5a92..e5950f328 100644 --- a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml +++ b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml @@ -78,7 +78,8 @@ retries: 15 delay: 60 -- block: +- name: Wait for the nodes labeling + block: - name: Wait for the gpu-feature-discovery Pod to label the nodes command: oc get nodes -l nvidia.com/gpu.count -oname register: has_gpu_feature_discovery_labels diff --git a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml index 57e3ac7ac..3cf94ef2b 100644 --- a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml +++ b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml @@ -1,5 +1,6 @@ --- -- block: +- name: Create the GPU operator requirements + block: - name: Check if the GPU Operator namespace has the openshift.io/cluster-monitoring label shell: set -o pipefail; diff --git a/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml b/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml index 29c7ad151..6182b8ec9 100644 --- a/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml +++ b/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml @@ -5,7 +5,6 @@ - name: Ensure user monitoring enabled shell: set -o pipefail; - oc -n openshift-user-workload-monitoring get pod --no-headers | awk '{ print $3}' register: monitoring_enabled @@ -16,6 +15,7 @@ - name: Get monitoring secret name shell: + set -o pipefail; oc get secret -n openshift-user-workload-monitoring | grep prometheus-user-workload-token | head -n 1 @@ -27,9 +27,11 @@ - name: Get Thanos Endpoint shell: + set -o pipefail; oc get route thanos-querier -n openshift-monitoring -o json | jq -r '.spec.host' register: thanos_endpoint_cmd + - name: Format the Thanos Endpoint set_fact: thanos_endpoint: "https://{{ thanos_endpoint_cmd.stdout }}" @@ -75,11 +77,11 @@ echo "Route to thanos monitoring endpoint: {{ thanos_endpoint }}" export MONITORING_TOKEN=$(oc get secret {{ monitoring_secret.stdout }} -n openshift-user-workload-monitoring -o json | jq -r '.data.token' | base64 -d) cat {{ trimaran_setup_config }} | envsubst | oc apply -f - - register: deploy_trimaran - name: Ensure Trimaran is Running shell: + set -o pipefail; oc get pods -n trimaran | grep "trimaran-scheduler" | awk '{print $3}' @@ -96,6 +98,7 @@ - name: Ensure the Trimaran test pods completes shell: + set -o pipefail; oc get pod trimaran-test -n trimaran --no-headers | awk '{print $3}' register: trimaran_test_pod_state diff --git a/roles/load_aware/load_aware_scale_test/tasks/main.yml b/roles/load_aware/load_aware_scale_test/tasks/main.yml index 054bb3524..c2d4724ca 100644 --- a/roles/load_aware/load_aware_scale_test/tasks/main.yml +++ b/roles/load_aware/load_aware_scale_test/tasks/main.yml @@ -1,6 +1,6 @@ - name: Generate load timeline shell: - python3 {{ load_timeline_generator }} {{ load_aware_scale_test_distribution }} {{ load_aware_scale_test_duration}} {{ load_aware_scale_test_instances }} "{{ artifact_extra_logs_dir }}/schedule_plan.yaml" + python3 {{ load_timeline_generator }} {{ load_aware_scale_test_distribution }} {{ load_aware_scale_test_duration }} {{ load_aware_scale_test_instances }} "{{ artifact_extra_logs_dir }}/schedule_plan.yaml" - name: Run workload and dump stats block: @@ -10,6 +10,7 @@ - name: Wait for workloads to finish shell: + set -o pipefail; oc get pods -n {{ load_aware_scale_test_namespace }} --no-headers | awk '{ print $3 }' register: load_aware_workload delay: 60 diff --git a/roles/local_ci/local_ci_run_multi/tasks/main.yml b/roles/local_ci/local_ci_run_multi/tasks/main.yml index adfcd5272..aafdc6f5e 100644 --- a/roles/local_ci/local_ci_run_multi/tasks/main.yml +++ b/roles/local_ci/local_ci_run_multi/tasks/main.yml @@ -214,6 +214,7 @@ - name: Store the success count shell: + set -o pipefail; echo "{{ success_count_cmd.stdout_lines[0] }}/{{ local_ci_run_multi_user_count }}" | tee "{{ artifact_extra_logs_dir }}/success_count" diff --git a/roles/nfd/nfd_test_wait_gpu/tasks/main.yml b/roles/nfd/nfd_test_wait_gpu/tasks/main.yml index b38cafaa9..4a03ff750 100644 --- a/roles/nfd/nfd_test_wait_gpu/tasks/main.yml +++ b/roles/nfd/nfd_test_wait_gpu/tasks/main.yml @@ -9,11 +9,13 @@ nfd_wait_gpu_retries: "{{ nfd_wait_gpu_nb_retries }}" when: nfd_wait_gpu_nodes == 'yes' -- block: +- name: Coverge the GPU nodes creation + block: - name: Wait for the GPU nodes to appear # label list should be in sync with: # https://github.com/NVIDIA/gpu-operator/blob/master/pkg/controller/clusterpolicy/state_manager.go#L26 shell: + set -o pipefail; ( oc get nodes -oname --ignore-not-found=false -l feature.node.kubernetes.io/pci-10de.present || oc get nodes -oname --ignore-not-found=false -l feature.node.kubernetes.io/pci-0302_10de.present || oc get nodes -oname --ignore-not-found=false -l feature.node.kubernetes.io/pci-0300_10de.present @@ -25,7 +27,9 @@ rescue: - name: Get the labels of the worker nodes (debug) - shell: oc get nodes --show-labels --selector='!node-role.kubernetes.io/master' | tr , '\n' + shell: + set -o pipefail; + oc get nodes --show-labels --selector='!node-role.kubernetes.io/master' | tr , '\n' - name: Failing because no GPU node showed up fail: msg="Failed because no GPU node showed up" diff --git a/roles/ocm/ocm_deploy_addon/tasks/main.yml b/roles/ocm/ocm_deploy_addon/tasks/main.yml index 5ebe520c0..94d7b674c 100644 --- a/roles/ocm/ocm_deploy_addon/tasks/main.yml +++ b/roles/ocm/ocm_deploy_addon/tasks/main.yml @@ -26,7 +26,7 @@ - name: Check if addon is already installed shell: set -o pipefail; - ocm get /api/clusters_mgmt/v1/clusters/{{ cluster_id }}/addons/{{ ocm_deploy_addon_id }} |& jq -r '.kind' || true + ocm get /api/clusters_mgmt/v1/clusters/{{ cluster_id }}/addons/{{ ocm_deploy_addon_id }} |& jq -r '.kind' || true register: ocm_addon_precheck - name: Create ocm addon install payload @@ -36,11 +36,11 @@ mode: 0400 when: '"Error" in ocm_addon_precheck.stdout' -- name: "Install addon {{ ocm_deploy_addon_id }} via OCM API" +- name: Install addon {{ ocm_deploy_addon_id }} via OCM API shell: | set -o pipefail; url="/api/clusters_mgmt/v1/clusters/{{ cluster_id }}/addons" - body="{{ artifact_extra_logs_dir }}/addon_{{ ocm_deploy_addon_id }}.json" + body="{{ artifact_extra_logs_dir }}/addon_{{ ocm_deploy_addon_id }}.json" output=$(ocm post "$url" --body=$body 2>&1); echo "$output" >&2; # for observation echo "$output" | jq -r '.kind' diff --git a/roles/pipelines/pipelines_capture_state/tasks/main.yml b/roles/pipelines/pipelines_capture_state/tasks/main.yml index 08e39e2a2..b43f650fb 100644 --- a/roles/pipelines/pipelines_capture_state/tasks/main.yml +++ b/roles/pipelines/pipelines_capture_state/tasks/main.yml @@ -11,6 +11,7 @@ - name: Compute the DSP application name shell: + set -o pipefail; oc get dspa -oname -n "{{ namespace }}" | head -1 | cut -d/ -f2 register: dspa_name_cmd when: not pipelines_capture_state_dsp_application_name diff --git a/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml b/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml index 7893b694e..dbbdb87f8 100644 --- a/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml +++ b/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml @@ -107,6 +107,7 @@ - name: Compute the DSP application name shell: + set -o pipefail; oc get dspa -oname -n "{{ notebook_namespace }}" | head -1 | cut -d/ -f2 register: dspa_name_cmd when: not pipelines_run_kfp_notebook_dsp_application_name @@ -162,7 +163,7 @@ set -o pipefail; oc adm policy add-cluster-role-to-user "{{ sa_cluster_role }}" -n "{{ notebook_namespace }}" - -z "{{ sa_name}}" + -z "{{ sa_name }}" --dry-run=client -oyaml | tee "{{ artifact_extra_logs_dir }}/src/clusterrolebinding.yaml" | oc apply -f- @@ -193,7 +194,7 @@ - name: Create the secret token shell: oc create secret generic "{{ secret_token_name }}" - "--from-literal=token=$(oc create token '{{ sa_name}}' -n '{{ notebook_namespace }}')" + "--from-literal=token=$(oc create token '{{ sa_name }}' -n '{{ notebook_namespace }}')" -n "{{ notebook_namespace }}" register: create_secret_token_cmd failed_when: '"no token is currently in use for this session" in create_secret_token_cmd.stderr' diff --git a/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml b/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml index f317cdc22..50c783d74 100644 --- a/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml +++ b/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml @@ -40,7 +40,7 @@ - name: Save the imagestream tag set_fact: - image_tag: "{% if rhods_benchmark_notebook_performance_imagestream_tag | string %}{{ rhods_benchmark_notebook_performance_imagestream_tag }}{% else %}{{ image_tag_cmd.stdout }}{% endif %}" + image_tag: "{% if rhods_benchmark_notebook_performance_imagestream_tag | string %}{{ rhods_benchmark_notebook_performance_imagestream_tag }}{% else %}{{ image_tag_cmd.stdout }}{% endif %}" - name: Get the image address command: @@ -54,6 +54,7 @@ block: - name: Compute the imagestream filename shell: + set -o pipefail; echo -n "{{ rhods_benchmark_notebook_performance_imagestream }}" | sed 's/^s2i-//g' | sed 's/-notebook//g'; echo "-notebook-imagestream.yaml" register: imagestream_filename_cmd diff --git a/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml b/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml index 909225c43..466174384 100644 --- a/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml +++ b/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml @@ -21,7 +21,7 @@ - name: Update the Exclude tags if necessary set_fact: - rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags: "{% if rhods_notebook_ods_ci_scale_test_only_create_notebooks|bool %}JupyterLabORWait{% else %}{{ rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags }}{% endif %}" + rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags: "{% if rhods_notebook_ods_ci_scale_test_only_create_notebooks | bool %}JupyterLabORWait{% else %}{{ rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags }}{% endif %}" - name: Define the test environments set_fact: @@ -64,11 +64,13 @@ - name: Get the RHODS CSV name shell: + set -o pipefail; oc get csv -oname -n redhat-ods-operator | grep rhods-operator register: rhods_csv_cmd - name: Get the RHODS version shell: + set -o pipefail; oc get {{ rhods_csv_cmd.stdout }} -n redhat-ods-operator -oname | grep rhods-operator | cut -d/ -f2 | cut -d. -f2- register: rhods_version_cmd diff --git a/roles/watsonx_serving/watsonx_serving_deploy_model/tasks/main.yml b/roles/watsonx_serving/watsonx_serving_deploy_model/tasks/main.yml index a33c30dd4..39cea6921 100644 --- a/roles/watsonx_serving/watsonx_serving_deploy_model/tasks/main.yml +++ b/roles/watsonx_serving/watsonx_serving_deploy_model/tasks/main.yml @@ -37,6 +37,7 @@ - name: Check that the namespace is already registered when: patch_smmr_cmd.rc != 0 shell: + set -o pipefail; oc get smmr/default -n istio-system -ojsonpath={.spec.members} | jq .[] -r register: smmr_members_cmd failed_when: watsonx_serving_deploy_model_namespace not in smmr_members_cmd.stdout_lines diff --git a/roles/wisdom/wisdom_deploy_model/tasks/main.yml b/roles/wisdom/wisdom_deploy_model/tasks/main.yml index 060186c4f..01809de9d 100644 --- a/roles/wisdom/wisdom_deploy_model/tasks/main.yml +++ b/roles/wisdom/wisdom_deploy_model/tasks/main.yml @@ -104,8 +104,10 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_deploy_model_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_deploy_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_deploy_model_namespace }} --dry-run=client -oyaml | oc apply -f - - + shell: + set -o pipefail; + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_deploy_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_deploy_model_namespace }} --dry-run=client -oyaml | oc apply -f - + - name: Delete the entrypoint ConfigMap if it already exists command: oc delete --ignore-not-found configmap/wisdom-tester-entrypoint -n {{ wisdom_deploy_model_namespace }} @@ -129,17 +131,16 @@ command: oc create -f "{{ artifact_extra_logs_dir }}/src/003_wisdom_tester_pod.yml" -n {{ wisdom_deploy_model_namespace }} # Wait for model to respond without error -- block: - - name: Wait for the wisdom test Pod to terminate - command: - oc get pod/wisdom-tester - -n {{ wisdom_deploy_model_namespace }} - -o custom-columns=:.status.phase - --no-headers - register: tester_wait - until: tester_wait.stdout == "Succeeded" or tester_wait.stdout == "Error" or tester_wait.stdout == "Failed" - retries: 10 - delay: 30 +- name: Wait for the wisdom test Pod to terminate + command: + oc get pod/wisdom-tester + -n {{ wisdom_deploy_model_namespace }} + -o custom-columns=:.status.phase + --no-headers + register: tester_wait + until: tester_wait.stdout == "Succeeded" or tester_wait.stdout == "Error" or tester_wait.stdout == "Failed" + retries: 10 + delay: 30 - name: Save the wisdom test Pod logs shell: oc logs pod/wisdom-tester -n {{ wisdom_deploy_model_namespace }} > {{ artifact_extra_logs_dir }}/wisdom-tester-pod.log diff --git a/roles/wisdom/wisdom_llm_load_test/tasks/main.yml b/roles/wisdom/wisdom_llm_load_test/tasks/main.yml index 04ec4fe2b..bba245544 100644 --- a/roles/wisdom/wisdom_llm_load_test/tasks/main.yml +++ b/roles/wisdom/wisdom_llm_load_test/tasks/main.yml @@ -13,7 +13,9 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_llm_load_test_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_namespace }} --dry-run=client -oyaml | oc apply -f - + shell: + set -o pipefail; + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_namespace }} --dry-run=client -oyaml | oc apply -f - # The results need to be pushed to S3 after the run, put S3 credentials in a Secret - name: Delete the S3 push secret for results if it exists diff --git a/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml b/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml index d90196982..2afe15e5e 100644 --- a/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml +++ b/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml @@ -13,7 +13,9 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_llm_load_test_multiplexed_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_multiplexed_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_multiplexed_namespace }} --dry-run=client -oyaml | oc apply -f - + shell: + set -o pipefail; + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_multiplexed_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_multiplexed_namespace }} --dry-run=client -oyaml | oc apply -f - # The results need to be pushed to S3 after the run, put S3 credentials in a Secret - name: Delete the S3 push secret for results if it exists diff --git a/roles/wisdom/wisdom_warmup_model/tasks/main.yml b/roles/wisdom/wisdom_warmup_model/tasks/main.yml index b25016b0f..5cc06cd1a 100644 --- a/roles/wisdom/wisdom_warmup_model/tasks/main.yml +++ b/roles/wisdom/wisdom_warmup_model/tasks/main.yml @@ -20,8 +20,10 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_warmup_model_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_warmup_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_warmup_model_namespace }} --dry-run=client -oyaml | oc apply -f - - + shell: + set -o pipefail; + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_warmup_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_warmup_model_namespace }} --dry-run=client -oyaml | oc apply -f - + - name: Apply the warmup Pod template template: src: "{{ wisdom_warmup_pod_template }}"