From d43c7e2e40abf695220243343d89f3aa28923664 Mon Sep 17 00:00:00 2001 From: Carlos Camacho Date: Wed, 23 Aug 2023 14:54:30 +0200 Subject: [PATCH] testing: lints partial fixes This commit resolves the following lint rules: - 'key-order[task]' - 'jinja[spacing]' - 'name[missing]' - 'yaml[new-line-at-end-of-file]' - 'yaml[trailing-spaces]' - 'risky-shell-pipe' - 'args[module]' It also removes the branch constraint so that the Ansible lint checks can run from any fork. This is useful because the checks will now be executed in each contributor's fork when pushing specific named branches. A future improvement could be to run all the lint checks from a single environment like `tox -e linters` instead of consuming the GH action. Partially-solves: #10 --- .github/workflows/ansible-lint.yml | 3 -- config/ansible-lint.yml | 7 ----- .../tasks/main.yml | 3 +- .../tasks/main.yml | 4 --- .../cluster/cluster_create_osd/tasks/main.yml | 2 +- .../cluster_deploy_aws_efs/tasks/aws-efs.yaml | 8 ++--- .../cluster_deploy_ldap/tasks/main.yml | 1 + .../cluster_deploy_operator/tasks/main.yml | 31 ++++++++++--------- .../cluster_ensure_machineset/tasks/main.yml | 1 + .../cluster_fill_workernodes/tasks/main.yml | 1 + .../tasks/cluster_set.yaml | 4 +-- .../cluster/cluster_set_scale/tasks/main.yml | 15 +++++---- .../tasks/main.yml | 2 ++ .../tasks/main.yml | 1 + .../entitlement_deploy/tasks/main.yml | 10 +++--- .../tasks/main.yml | 3 +- .../tasks/main.yml | 3 +- .../tasks/deploy_clusterpolicy.yml | 3 +- .../tasks/deploy_from_catalog.yml | 3 +- .../tasks/main.yml | 8 +++-- .../gpu_operator_run_gpu_burn/tasks/main.yml | 7 +++-- .../tasks/main.yml | 3 +- .../tasks/metrics.yml | 3 +- .../load_aware_deploy_trimaran/tasks/main.yml | 7 +++-- .../load_aware_scale_test/tasks/main.yml | 3 +- .../local_ci_run_multi/tasks/main.yml | 1 + roles/nfd/nfd_test_wait_gpu/tasks/main.yml | 8 +++-- roles/ocm/ocm_deploy_addon/tasks/main.yml | 6 ++-- .../pipelines_capture_state/tasks/main.yml | 1 +
.../pipelines_run_kfp_notebook/tasks/main.yml | 5 +-- .../tasks/main.yml | 3 +- .../tasks/main.yml | 4 ++- .../tasks/main.yml | 1 + .../wisdom/wisdom_deploy_model/tasks/main.yml | 27 ++++++++-------- .../wisdom_llm_load_test/tasks/main.yml | 4 ++- .../tasks/main.yml | 4 ++- .../wisdom/wisdom_warmup_model/tasks/main.yml | 6 ++-- 37 files changed, 120 insertions(+), 86 deletions(-) diff --git a/.github/workflows/ansible-lint.yml b/.github/workflows/ansible-lint.yml index 6a070ce47..201678dda 100644 --- a/.github/workflows/ansible-lint.yml +++ b/.github/workflows/ansible-lint.yml @@ -3,11 +3,8 @@ name: Run ansible-lint # Controls when the action will run. on: - # Triggers the workflow on push or pull request events but only for the main branch pull_request: - branches: [main] push: - branches: [main] schedule: - cron: '0 */8 * * *' # Allows you to run this workflow manually from the Actions tab diff --git a/config/ansible-lint.yml b/config/ansible-lint.yml index 9bb28edf0..5eea32934 100644 --- a/config/ansible-lint.yml +++ b/config/ansible-lint.yml @@ -14,11 +14,8 @@ skip_list: - 'command-instead-of-module' - 'command-instead-of-shell' - 'deprecated-local-action' - - 'key-order[task]' - - 'jinja[spacing]' - 'no-free-form' - 'chema[meta]' - - 'name[missing]' - 'var-naming[no-reserved]' - 'var-naming[no-role-prefix]' - 'var-naming[pattern]' @@ -29,15 +26,11 @@ skip_list: - 'yaml[indentation]' - 'yaml[key-duplicates]' - 'yaml[line-length]' - - 'yaml[new-line-at-end-of-file]' - 'yaml[octal-values]' - - 'yaml[trailing-spaces]' - 'yaml[truthy]' - 'name[template]' - 'name[casing]' - 'risky-file-permissions' - - 'risky-shell-pipe' - 'ignore-errors' - 'no-changed-when' - 'fqcn' - - 'args[module]' diff --git a/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml b/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml index 99b8991a5..1b12434c9 100644 --- a/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml +++ 
b/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml @@ -45,7 +45,8 @@ command: oc create -f "{{ artifact_extra_logs_dir }}/001_pod_run-mlperf-ssd.yml" -- block: +- name: Make sure the benchmark completes + block: - name: Wait for the benchmark completion command: oc get pod/{{ benchmarking_run_mlperf_ssd_name }} diff --git a/roles/cluster/cluster_capture_environment/tasks/main.yml b/roles/cluster/cluster_capture_environment/tasks/main.yml index 5e007541d..d0006390f 100644 --- a/roles/cluster/cluster_capture_environment/tasks/main.yml +++ b/roles/cluster/cluster_capture_environment/tasks/main.yml @@ -37,8 +37,6 @@ command: git describe HEAD --long --always register: git_version - args: - warn: false # don't warn about using git here - name: Store ci-artifact version from Git copy: @@ -50,8 +48,6 @@ command: git show --no-patch register: git_show - args: - warn: false # don't warn about using git here - name: Store ci-artifact last git commit copy: diff --git a/roles/cluster/cluster_create_osd/tasks/main.yml b/roles/cluster/cluster_create_osd/tasks/main.yml index 91233317d..af53a4adf 100644 --- a/roles/cluster/cluster_create_osd/tasks/main.yml +++ b/roles/cluster/cluster_create_osd/tasks/main.yml @@ -121,7 +121,7 @@ ocm edit machinepool {{ cluster_create_osd_machinepool_name }} --cluster={{ cluster_create_osd_cluster_name }} - --replicas={{ [2, cluster_create_osd_compute_nodes|int] |max }} + --replicas={{ [2, cluster_create_osd_compute_nodes | int] | max }} - name: Wait for the desired worker node count shell: | diff --git a/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml b/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml index 7ace0b7d3..026481868 100644 --- a/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml +++ b/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml @@ -96,11 +96,11 @@ - name: Populate the tags dict set_fact: - tags: "{{ tags | default({}) | combine ({ item.key : item.value }) }}" + tags: "{{ tags | default({}) | 
combine({item.key: item.value}) }}" with_items: - - { 'key': 'Name' , 'value': '{{ cluster_name_cmd.stdout }}'} - - { 'key': '{{ cluster_name_tag_cmd.stdout }}' , 'value': 'owned'} - - { 'key': 'Purpose', 'value': ''} + - {'key': 'Name' , 'value': '{{ cluster_name_cmd.stdout }}'} + - {'key': '{{ cluster_name_tag_cmd.stdout }}' , 'value': 'owned'} + - {'key': 'Purpose', 'value': ''} - name: Get the SecurityGroup content amazon.aws.ec2_group_info: diff --git a/roles/cluster/cluster_deploy_ldap/tasks/main.yml b/roles/cluster/cluster_deploy_ldap/tasks/main.yml index 2a20c6783..defe2ad16 100644 --- a/roles/cluster/cluster_deploy_ldap/tasks/main.yml +++ b/roles/cluster/cluster_deploy_ldap/tasks/main.yml @@ -145,6 +145,7 @@ - name: Get the cluster ID shell: + set -o pipefail; ocm describe cluster "{{ cluster_deploy_ldap_cluster_name }}" --json | jq .id -r register: cluster_id_cmd diff --git a/roles/cluster/cluster_deploy_operator/tasks/main.yml b/roles/cluster/cluster_deploy_operator/tasks/main.yml index f54d27053..af33ef870 100644 --- a/roles/cluster/cluster_deploy_operator/tasks/main.yml +++ b/roles/cluster/cluster_deploy_operator/tasks/main.yml @@ -231,7 +231,8 @@ - name: Instantiate the Subscription command: oc apply -f "{{ artifact_extra_logs_dir }}/src/002_sub.yml" -- block: +- name: Make sure the InstallPlan is deployed + block: - name: Find the operator InstallPlan command: oc get InstallPlan @@ -276,6 +277,20 @@ fail: msg="ClusterServiceVersion install not successful ({{ operator_csv_phase.stdout }})" when: operator_csv_phase.stdout != "Succeeded" + rescue: + - name: Capture the Catalog Operator logs (debug) + shell: + oc logs deployment.apps/catalog-operator + -n openshift-operator-lifecycle-manager + > {{ artifact_extra_logs_dir }}/catalog_operator.log + failed_when: false + + - name: Indicate where the Catalog-operator logs have been saved + debug: msg="The logs of Catalog Operator have been saved in {{ artifact_extra_logs_dir }}/catalog_operator.log" + + - 
name: Failed because the operator could not be installed from the CatalogSource + fail: msg="Failed because the operator could not be installed from the CatalogSource" + always: - name: Store the YAML of the operator CSV that was installed (debug) shell: @@ -283,25 +298,13 @@ -oyaml -n "{{ cluster_deploy_operator_namespace }}" > {{ artifact_extra_logs_dir }}/operator_csv.yml + - name: Store the YAML of the subscription shell: oc get -f "{{ artifact_extra_logs_dir }}/src/002_sub.yml" -oyaml -n "{{ cluster_deploy_operator_namespace }}" > {{ artifact_extra_logs_dir }}/operator_sub.yml - rescue: - - name: Capture the Catalog Operator logs (debug) - shell: - oc logs deployment.apps/catalog-operator - -n openshift-operator-lifecycle-manager - > {{ artifact_extra_logs_dir }}/catalog_operator.log - failed_when: false - - - name: Indicate where the Catalog-operator logs have been saved - debug: msg="The logs of Catalog Operator have been saved in {{ artifact_extra_logs_dir }}/catalog_operator.log" - - - name: Failed because the operator could not be installed from the CatalogSource - fail: msg="Failed because the operator could not be installed from the CatalogSource" - name: Deploy the operator CustomResource from its ClusterServiceVersion include_tasks: deploy_cr.yml diff --git a/roles/cluster/cluster_ensure_machineset/tasks/main.yml b/roles/cluster/cluster_ensure_machineset/tasks/main.yml index e83ef253e..05174434a 100644 --- a/roles/cluster/cluster_ensure_machineset/tasks/main.yml +++ b/roles/cluster/cluster_ensure_machineset/tasks/main.yml @@ -1,6 +1,7 @@ --- - name: "Check if the cluster already has a {{ machineset_instance_type }} machineset" shell: + set -o pipefail; oc get machineset -n openshift-machine-api {% if machineset_name | length > 0 %} -ojson | jq '.items[] | select(.spec.template.spec.providerSpec.value.instanceType=="{{ machineset_instance_type }}" and .metadata.name=="{{ machineset_name }}") | .metadata.name' -r diff --git 
a/roles/cluster/cluster_fill_workernodes/tasks/main.yml b/roles/cluster/cluster_fill_workernodes/tasks/main.yml index 13cd5d334..782d33e12 100644 --- a/roles/cluster/cluster_fill_workernodes/tasks/main.yml +++ b/roles/cluster/cluster_fill_workernodes/tasks/main.yml @@ -1,5 +1,6 @@ - name: Get the list of the worker nodes shell: + set -o pipefail; oc get nodes -l{{ cluster_fill_workernodes_label_selector }} -oname diff --git a/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml b/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml index 9c132023f..84291798e 100644 --- a/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml +++ b/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml @@ -55,8 +55,8 @@ echo "- deleting the project $test_project_name ..." oc delete ns "$test_project_name" >/dev/null echo "--> project annotation value: $project_annotation_value" - echo "==> expected value: "'{{ cluster_set_project_annotation_value or "null"}}' - [[ "$project_annotation_value" == '{{ cluster_set_project_annotation_value or "null"}}' ]] + echo "==> expected value: "'{{ cluster_set_project_annotation_value or "null" }}' + [[ "$project_annotation_value" == '{{ cluster_set_project_annotation_value or "null" }}' ]] retries: 120 delay: 5 register: wait_project_template_active diff --git a/roles/cluster/cluster_set_scale/tasks/main.yml b/roles/cluster/cluster_set_scale/tasks/main.yml index 23e08eabc..5302f0f3e 100644 --- a/roles/cluster/cluster_set_scale/tasks/main.yml +++ b/roles/cluster/cluster_set_scale/tasks/main.yml @@ -48,8 +48,8 @@ register: oc_get_machinesets failed_when: not oc_get_machinesets.stdout -- when: current_replicas_sum != scale - name: Change all {{ machineset_instance_type }} machinesets replicas to have sum {{ scale }} +- name: Change all {{ machineset_instance_type }} machinesets replicas to have sum {{ scale }} + when: current_replicas_sum != scale block: - name: Do not downscale any machinesets 
other than the first one, unless the user used force block: @@ -80,7 +80,8 @@ oc patch machineset -n openshift-machine-api {{ first_machineset }} --patch '{"spec": {"replicas": {{ scale }} }}' --type merge -- block: +- name: Make sure the machinesets are ready + block: - name: Wait for all machinesets with type {{ machineset_instance_type }} to be ready # This is done by verifying that at the availableReplicas @@ -92,6 +93,7 @@ # for more information. # 3. Perform some extra formatting for nicer logging shell: >- + set -o pipefail; oc get machinesets -n openshift-machine-api \ {% if machineset_name | length > 0 %} "{{ machineset_name }}" -ojson \ @@ -111,6 +113,10 @@ retries: 120 delay: 30 + rescue: + - name: Fail because the cluster machineset creation failed + fail: msg="Failing because cluster machineset creation failed" + always: # info about the 'machines' - name: Capture the description of the machines @@ -150,7 +156,4 @@ failed_when: false loop: "{{ oc_get_machinesets.stdout_lines }}" - rescue: - - name: Fail because the cluster machineset creation failed - fail: msg="Failing because cluster machineset creation failed" diff --git a/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml b/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml index c97c8deaa..c2cb53d74 100644 --- a/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml +++ b/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml @@ -1,10 +1,12 @@ - name: List all the AppWrappers in the namespace shell: | + set -o pipefail; oc get appwrappers -n {{ codeflare_cleanup_appwrappers_namespace }} -ojson | jq '.items[] | .metadata.name + " ==> "+ .status.state' -r > "{{ artifact_extra_logs_dir }}/appwrappers.status" oc get appwrappers -n {{ codeflare_cleanup_appwrappers_namespace }} -oyaml > "{{ artifact_extra_logs_dir }}/appwrappers.yaml" - name: Count the AppWrappers in the namespace shell: + set -o pipefail; oc get appwrappers -n {{ codeflare_cleanup_appwrappers_namespace 
}} -oname | wc -l - name: Create a configmap for the beginning of the test timestamp diff --git a/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml b/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml index a91ca5909..0c2305158 100644 --- a/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml +++ b/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml @@ -43,6 +43,7 @@ - name: Create the namespace if it does not exist shell: + set -o pipefail; oc create ns "{{ codeflare_generate_mcad_load_namespace }}" -oyaml --dry-run=client | tee "{{ artifact_extra_logs_dir }}/src/namespace.yaml" | oc apply -f- - name: Create a configmap for the beginning of the test timestamp diff --git a/roles/entitlement/entitlement_deploy/tasks/main.yml b/roles/entitlement/entitlement_deploy/tasks/main.yml index 2af07d59e..297299d99 100644 --- a/roles/entitlement/entitlement_deploy/tasks/main.yml +++ b/roles/entitlement/entitlement_deploy/tasks/main.yml @@ -31,20 +31,21 @@ - name: "Deploy RHSM from file '{{ entitlement_rhsm }}'" shell: set -o pipefail; - cat "{{ entitlement_mc_rhsm }}" - | python3 "{{ entitlement_py_apply }}" BASE64_ENCODED_RHSM_FILE @"{{ entitlement_rhsm }}" + cat "{{ entitlement_mc_rhsm }}" + | python3 "{{ entitlement_py_apply }}" BASE64_ENCODED_RHSM_FILE @"{{ entitlement_rhsm }}" | python3 "{{ entitlement_py_apply }}" MACHINE_CONFIG_ROLE "{{ machine_config_role }}" | oc apply -f- - name: "Deploy the pem and key-pem from file '{{ entitlement_pem }}'" shell: set -o pipefail; - cat "{{ entitlement_mc_pem }}" + cat "{{ entitlement_mc_pem }}" | python3 "{{ entitlement_py_apply }}" BASE64_ENCODED_PEM_FILE @"{{ entitlement_pem }}" | python3 "{{ entitlement_py_apply }}" MACHINE_CONFIG_ROLE "{{ machine_config_role }}" | oc apply -f- - name: "Deploy the repo CA from file '{{ entitlement_mc_rhsm_ca }}' if requested" + when: entitlement_repo_ca | default('', true) | trim != '' block: - name: Compute the md5sum of the CA file (debug) command: md5sum '{{ 
entitlement_repo_ca }}' @@ -52,8 +53,7 @@ - name: "Deploy the repo CA from file '{{ entitlement_mc_rhsm_ca }}'" shell: set -o pipefail; - cat "{{ entitlement_mc_rhsm_ca }}" + cat "{{ entitlement_mc_rhsm_ca }}" | python3 "{{ entitlement_py_apply }}" BASE64_ENCODED_RHSM_CA_FILE @"{{ entitlement_repo_ca }}" | python3 "{{ entitlement_py_apply }}" MACHINE_CONFIG_ROLE "{{ machine_config_role }}" | oc apply -f- - when: entitlement_repo_ca | default('', true) | trim != '' diff --git a/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml b/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml index 4f852a985..fd931988e 100644 --- a/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml +++ b/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml @@ -26,7 +26,8 @@ - name: Delete the entitlement tester Pod if it exists command: oc delete --ignore-not-found=true -f "{{ entitlement_tester_pod }}" -- block: +- name: Make sure the entitlement Pod is created + block: - name: Create the entitlement tester Pod command: oc create -f "{{ entitlement_tester_pod }}" diff --git a/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml b/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml index fa97dc17a..083d958a1 100644 --- a/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml +++ b/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml @@ -9,7 +9,8 @@ entitlement_retries: "{{ entitlement_nb_wait_retries }}" when: entitlement_test_and_wait == 'yes' -- block: +- name: Make sure the entitlement Pod is deployed + block: - name: Wait for the entitlement Pod to succeed shell: | set -o errexit; diff --git a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml index 049159fdf..536cbcfc7 100644 --- a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml +++ 
b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml @@ -10,7 +10,8 @@ retries: 15 delay: 30 -- block: +- name: Make sure the GPU Operator ClusterPolicy is ready + block: - name: Wait for the GPU Operator ClusterPolicy CRD to appear command: oc get crd clusterpolicies.nvidia.com register: has_clusterpolicy_crd diff --git a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml index 29c8446fc..282ef6230 100644 --- a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml +++ b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml @@ -113,7 +113,8 @@ - name: Instantiate the OperatorHub subscription command: oc create -f "{{ artifact_extra_logs_dir }}/gpu_operator_sub.yml" -- block: +- name: Install the GPU Operator + block: - name: Find the GPU Operator OperatorHub InstallPlan command: oc get InstallPlan diff --git a/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml b/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml index b0f62d319..4aa2d8154 100644 --- a/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml +++ b/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml @@ -9,11 +9,15 @@ - name: Get the gpu-operator subscription package name block: - name: Count gpu operator subscription candidates - shell: oc get subscription --all-namespaces -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))] | length' + shell: + set -o pipefail; + oc get subscription --all-namespaces -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))] | length' register: gpu_subscriptions failed_when: gpu_subscriptions.stdout != '1' - name: Read the package name from the first gpu-operator subscription - shell: oc get subscription -A -ojson | jq '[.items[] | select(.spec.name | 
test("gpu-operator"))][0].spec.name' + shell: + set -o pipefail; + oc get subscription -A -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))][0].spec.name' register: gpu_operator_subscription_package_name - name: Ensure that there is a CSV for the GPU Operator command: diff --git a/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml b/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml index 3f9cce344..f6524087e 100644 --- a/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml +++ b/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml @@ -91,13 +91,16 @@ - name: Ensure that no GPU was faulty loop: "{{ gpu_burn_gpu_nodes.stdout_lines }}" shell: + set -o pipefail; oc logs pod/gpu-burn-{{ item }} -n {{ gpu_operator_run_gpu_burn_namespace }} | grep FAULTY register: gpu_burn_test_faulty failed_when: gpu_burn_test_faulty.rc == 0 always: - name: Save the logs of the GPU burn Pods - shell: oc logs pod/gpu-burn-{{ item }} -n {{ gpu_operator_run_gpu_burn_namespace }} | grep -o "[^$(printf '\r')]*$" + shell: + set -o pipefail; + oc logs pod/gpu-burn-{{ item }} -n {{ gpu_operator_run_gpu_burn_namespace }} | grep -o "[^$(printf '\r')]*$" with_items: "{{ gpu_burn_gpu_nodes.stdout_lines }}" failed_when: false @@ -124,4 +127,4 @@ - name: Delete the src ConfigMap command: oc --ignore-not-found=true delete configmap gpu-burn-src -n {{ gpu_operator_run_gpu_burn_namespace }} failed_when: false - when: not gpu_operator_run_gpu_burn_keep_resources \ No newline at end of file + when: not gpu_operator_run_gpu_burn_keep_resources diff --git a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml index b90de5a92..e5950f328 100644 --- a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml +++ b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml @@ -78,7 +78,8 @@ retries: 15 delay: 60 -- block: +- name: Wait for the nodes labeling + block: - name: Wait 
for the gpu-feature-discovery Pod to label the nodes command: oc get nodes -l nvidia.com/gpu.count -oname register: has_gpu_feature_discovery_labels diff --git a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml index 57e3ac7ac..3cf94ef2b 100644 --- a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml +++ b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml @@ -1,5 +1,6 @@ --- -- block: +- name: Create the GPU operator requirements + block: - name: Check if the GPU Operator namespace has the openshift.io/cluster-monitoring label shell: set -o pipefail; diff --git a/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml b/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml index 29c7ad151..6182b8ec9 100644 --- a/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml +++ b/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml @@ -5,7 +5,6 @@ - name: Ensure user monitoring enabled shell: set -o pipefail; - oc -n openshift-user-workload-monitoring get pod --no-headers | awk '{ print $3}' register: monitoring_enabled @@ -16,6 +15,7 @@ - name: Get monitoring secret name shell: + set -o pipefail; oc get secret -n openshift-user-workload-monitoring | grep prometheus-user-workload-token | head -n 1 @@ -27,9 +27,11 @@ - name: Get Thanos Endpoint shell: + set -o pipefail; oc get route thanos-querier -n openshift-monitoring -o json | jq -r '.spec.host' register: thanos_endpoint_cmd + - name: Format the Thanos Endpoint set_fact: thanos_endpoint: "https://{{ thanos_endpoint_cmd.stdout }}" @@ -75,11 +77,11 @@ echo "Route to thanos monitoring endpoint: {{ thanos_endpoint }}" export MONITORING_TOKEN=$(oc get secret {{ monitoring_secret.stdout }} -n openshift-user-workload-monitoring -o json | jq -r '.data.token' | base64 -d) cat {{ trimaran_setup_config }} | envsubst | oc apply -f - - register: deploy_trimaran - name: Ensure Trimaran is Running 
shell: + set -o pipefail; oc get pods -n trimaran | grep "trimaran-scheduler" | awk '{print $3}' @@ -96,6 +98,7 @@ - name: Ensure the Trimaran test pods completes shell: + set -o pipefail; oc get pod trimaran-test -n trimaran --no-headers | awk '{print $3}' register: trimaran_test_pod_state diff --git a/roles/load_aware/load_aware_scale_test/tasks/main.yml b/roles/load_aware/load_aware_scale_test/tasks/main.yml index 054bb3524..c2d4724ca 100644 --- a/roles/load_aware/load_aware_scale_test/tasks/main.yml +++ b/roles/load_aware/load_aware_scale_test/tasks/main.yml @@ -1,6 +1,6 @@ - name: Generate load timeline shell: - python3 {{ load_timeline_generator }} {{ load_aware_scale_test_distribution }} {{ load_aware_scale_test_duration}} {{ load_aware_scale_test_instances }} "{{ artifact_extra_logs_dir }}/schedule_plan.yaml" + python3 {{ load_timeline_generator }} {{ load_aware_scale_test_distribution }} {{ load_aware_scale_test_duration }} {{ load_aware_scale_test_instances }} "{{ artifact_extra_logs_dir }}/schedule_plan.yaml" - name: Run workload and dump stats block: @@ -10,6 +10,7 @@ - name: Wait for workloads to finish shell: + set -o pipefail; oc get pods -n {{ load_aware_scale_test_namespace }} --no-headers | awk '{ print $3 }' register: load_aware_workload delay: 60 diff --git a/roles/local_ci/local_ci_run_multi/tasks/main.yml b/roles/local_ci/local_ci_run_multi/tasks/main.yml index adfcd5272..aafdc6f5e 100644 --- a/roles/local_ci/local_ci_run_multi/tasks/main.yml +++ b/roles/local_ci/local_ci_run_multi/tasks/main.yml @@ -214,6 +214,7 @@ - name: Store the success count shell: + set -o pipefail; echo "{{ success_count_cmd.stdout_lines[0] }}/{{ local_ci_run_multi_user_count }}" | tee "{{ artifact_extra_logs_dir }}/success_count" diff --git a/roles/nfd/nfd_test_wait_gpu/tasks/main.yml b/roles/nfd/nfd_test_wait_gpu/tasks/main.yml index b38cafaa9..4a03ff750 100644 --- a/roles/nfd/nfd_test_wait_gpu/tasks/main.yml +++ b/roles/nfd/nfd_test_wait_gpu/tasks/main.yml @@ -9,11 
+9,13 @@ nfd_wait_gpu_retries: "{{ nfd_wait_gpu_nb_retries }}" when: nfd_wait_gpu_nodes == 'yes' -- block: +- name: Converge the GPU nodes creation + block: - name: Wait for the GPU nodes to appear # label list should be in sync with: # https://github.com/NVIDIA/gpu-operator/blob/master/pkg/controller/clusterpolicy/state_manager.go#L26 shell: + set -o pipefail; ( oc get nodes -oname --ignore-not-found=false -l feature.node.kubernetes.io/pci-10de.present || oc get nodes -oname --ignore-not-found=false -l feature.node.kubernetes.io/pci-0302_10de.present || oc get nodes -oname --ignore-not-found=false -l feature.node.kubernetes.io/pci-0300_10de.present @@ -25,7 +27,9 @@ rescue: - name: Get the labels of the worker nodes (debug) - shell: oc get nodes --show-labels --selector='!node-role.kubernetes.io/master' | tr , '\n' + shell: + set -o pipefail; + oc get nodes --show-labels --selector='!node-role.kubernetes.io/master' | tr , '\n' - name: Failing because no GPU node showed up fail: msg="Failed because no GPU node showed up" diff --git a/roles/ocm/ocm_deploy_addon/tasks/main.yml b/roles/ocm/ocm_deploy_addon/tasks/main.yml index 5ebe520c0..94d7b674c 100644 --- a/roles/ocm/ocm_deploy_addon/tasks/main.yml +++ b/roles/ocm/ocm_deploy_addon/tasks/main.yml @@ -26,7 +26,7 @@ - name: Check if addon is already installed shell: set -o pipefail; - ocm get /api/clusters_mgmt/v1/clusters/{{ cluster_id }}/addons/{{ ocm_deploy_addon_id }} |& jq -r '.kind' || true + ocm get /api/clusters_mgmt/v1/clusters/{{ cluster_id }}/addons/{{ ocm_deploy_addon_id }} |& jq -r '.kind' || true register: ocm_addon_precheck - name: Create ocm addon install payload @@ -36,11 +36,11 @@ mode: 0400 when: '"Error" in ocm_addon_precheck.stdout' -- name: "Install addon {{ ocm_deploy_addon_id }} via OCM API" +- name: Install addon {{ ocm_deploy_addon_id }} via OCM API shell: | set -o pipefail; url="/api/clusters_mgmt/v1/clusters/{{ cluster_id }}/addons" - body="{{ artifact_extra_logs_dir }}/addon_{{
ocm_deploy_addon_id }}.json" + body="{{ artifact_extra_logs_dir }}/addon_{{ ocm_deploy_addon_id }}.json" output=$(ocm post "$url" --body=$body 2>&1); echo "$output" >&2; # for observation echo "$output" | jq -r '.kind' diff --git a/roles/pipelines/pipelines_capture_state/tasks/main.yml b/roles/pipelines/pipelines_capture_state/tasks/main.yml index 08e39e2a2..b43f650fb 100644 --- a/roles/pipelines/pipelines_capture_state/tasks/main.yml +++ b/roles/pipelines/pipelines_capture_state/tasks/main.yml @@ -11,6 +11,7 @@ - name: Compute the DSP application name shell: + set -o pipefail; oc get dspa -oname -n "{{ namespace }}" | head -1 | cut -d/ -f2 register: dspa_name_cmd when: not pipelines_capture_state_dsp_application_name diff --git a/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml b/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml index 7893b694e..dbbdb87f8 100644 --- a/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml +++ b/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml @@ -107,6 +107,7 @@ - name: Compute the DSP application name shell: + set -o pipefail; oc get dspa -oname -n "{{ notebook_namespace }}" | head -1 | cut -d/ -f2 register: dspa_name_cmd when: not pipelines_run_kfp_notebook_dsp_application_name @@ -162,7 +163,7 @@ set -o pipefail; oc adm policy add-cluster-role-to-user "{{ sa_cluster_role }}" -n "{{ notebook_namespace }}" - -z "{{ sa_name}}" + -z "{{ sa_name }}" --dry-run=client -oyaml | tee "{{ artifact_extra_logs_dir }}/src/clusterrolebinding.yaml" | oc apply -f- @@ -193,7 +194,7 @@ - name: Create the secret token shell: oc create secret generic "{{ secret_token_name }}" - "--from-literal=token=$(oc create token '{{ sa_name}}' -n '{{ notebook_namespace }}')" + "--from-literal=token=$(oc create token '{{ sa_name }}' -n '{{ notebook_namespace }}')" -n "{{ notebook_namespace }}" register: create_secret_token_cmd failed_when: '"no token is currently in use for this session" in create_secret_token_cmd.stderr' diff --git 
a/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml b/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml index f317cdc22..50c783d74 100644 --- a/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml +++ b/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml @@ -40,7 +40,7 @@ - name: Save the imagestream tag set_fact: - image_tag: "{% if rhods_benchmark_notebook_performance_imagestream_tag | string %}{{ rhods_benchmark_notebook_performance_imagestream_tag }}{% else %}{{ image_tag_cmd.stdout }}{% endif %}" + image_tag: "{% if rhods_benchmark_notebook_performance_imagestream_tag | string %}{{ rhods_benchmark_notebook_performance_imagestream_tag }}{% else %}{{ image_tag_cmd.stdout }}{% endif %}" - name: Get the image address command: @@ -54,6 +54,7 @@ block: - name: Compute the imagestream filename shell: + set -o pipefail; echo -n "{{ rhods_benchmark_notebook_performance_imagestream }}" | sed 's/^s2i-//g' | sed 's/-notebook//g'; echo "-notebook-imagestream.yaml" register: imagestream_filename_cmd diff --git a/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml b/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml index 909225c43..466174384 100644 --- a/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml +++ b/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml @@ -21,7 +21,7 @@ - name: Update the Exclude tags if necessary set_fact: - rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags: "{% if rhods_notebook_ods_ci_scale_test_only_create_notebooks|bool %}JupyterLabORWait{% else %}{{ rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags }}{% endif %}" + rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags: "{% if rhods_notebook_ods_ci_scale_test_only_create_notebooks | bool %}JupyterLabORWait{% else %}{{ rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags }}{% endif %}" - name: Define the test environments set_fact: @@ -64,11 +64,13 @@ - name: Get the RHODS CSV name shell: + set -o 
pipefail; oc get csv -oname -n redhat-ods-operator | grep rhods-operator register: rhods_csv_cmd - name: Get the RHODS version shell: + set -o pipefail; oc get {{ rhods_csv_cmd.stdout }} -n redhat-ods-operator -oname | grep rhods-operator | cut -d/ -f2 | cut -d. -f2- register: rhods_version_cmd diff --git a/roles/watsonx_serving/watsonx_serving_deploy_model/tasks/main.yml b/roles/watsonx_serving/watsonx_serving_deploy_model/tasks/main.yml index a33c30dd4..39cea6921 100644 --- a/roles/watsonx_serving/watsonx_serving_deploy_model/tasks/main.yml +++ b/roles/watsonx_serving/watsonx_serving_deploy_model/tasks/main.yml @@ -37,6 +37,7 @@ - name: Check that the namespace is already registered when: patch_smmr_cmd.rc != 0 shell: + set -o pipefail; oc get smmr/default -n istio-system -ojsonpath={.spec.members} | jq .[] -r register: smmr_members_cmd failed_when: watsonx_serving_deploy_model_namespace not in smmr_members_cmd.stdout_lines diff --git a/roles/wisdom/wisdom_deploy_model/tasks/main.yml b/roles/wisdom/wisdom_deploy_model/tasks/main.yml index 060186c4f..01809de9d 100644 --- a/roles/wisdom/wisdom_deploy_model/tasks/main.yml +++ b/roles/wisdom/wisdom_deploy_model/tasks/main.yml @@ -104,8 +104,10 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_deploy_model_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_deploy_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_deploy_model_namespace }} --dry-run=client -oyaml | oc apply -f - - + shell: + set -o pipefail; + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_deploy_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_deploy_model_namespace }} --dry-run=client -oyaml | oc apply -f - + - name: Delete the entrypoint ConfigMap if it already exists command: oc delete --ignore-not-found configmap/wisdom-tester-entrypoint -n {{ wisdom_deploy_model_namespace }} @@ -129,17 +131,16 @@ command: oc create -f "{{ 
artifact_extra_logs_dir }}/src/003_wisdom_tester_pod.yml" -n {{ wisdom_deploy_model_namespace }} # Wait for model to respond without error -- block: - - name: Wait for the wisdom test Pod to terminate - command: - oc get pod/wisdom-tester - -n {{ wisdom_deploy_model_namespace }} - -o custom-columns=:.status.phase - --no-headers - register: tester_wait - until: tester_wait.stdout == "Succeeded" or tester_wait.stdout == "Error" or tester_wait.stdout == "Failed" - retries: 10 - delay: 30 +- name: Wait for the wisdom test Pod to terminate + command: + oc get pod/wisdom-tester + -n {{ wisdom_deploy_model_namespace }} + -o custom-columns=:.status.phase + --no-headers + register: tester_wait + until: tester_wait.stdout == "Succeeded" or tester_wait.stdout == "Error" or tester_wait.stdout == "Failed" + retries: 10 + delay: 30 - name: Save the wisdom test Pod logs shell: oc logs pod/wisdom-tester -n {{ wisdom_deploy_model_namespace }} > {{ artifact_extra_logs_dir }}/wisdom-tester-pod.log diff --git a/roles/wisdom/wisdom_llm_load_test/tasks/main.yml b/roles/wisdom/wisdom_llm_load_test/tasks/main.yml index 04ec4fe2b..bba245544 100644 --- a/roles/wisdom/wisdom_llm_load_test/tasks/main.yml +++ b/roles/wisdom/wisdom_llm_load_test/tasks/main.yml @@ -13,7 +13,9 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_llm_load_test_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_namespace }} --dry-run=client -oyaml | oc apply -f - + shell: + set -o pipefail; + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_namespace }} --dry-run=client -oyaml | oc apply -f - # The results need to be pushed to S3 after the run, put S3 credentials in a Secret - name: Delete the S3 push secret for results if it exists diff --git 
a/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml b/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml index d90196982..2afe15e5e 100644 --- a/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml +++ b/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml @@ -13,7 +13,9 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_llm_load_test_multiplexed_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_multiplexed_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_multiplexed_namespace }} --dry-run=client -oyaml | oc apply -f - + shell: + set -o pipefail; + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_multiplexed_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_multiplexed_namespace }} --dry-run=client -oyaml | oc apply -f - # The results need to be pushed to S3 after the run, put S3 credentials in a Secret - name: Delete the S3 push secret for results if it exists diff --git a/roles/wisdom/wisdom_warmup_model/tasks/main.yml b/roles/wisdom/wisdom_warmup_model/tasks/main.yml index b25016b0f..5cc06cd1a 100644 --- a/roles/wisdom/wisdom_warmup_model/tasks/main.yml +++ b/roles/wisdom/wisdom_warmup_model/tasks/main.yml @@ -20,8 +20,10 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_warmup_model_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_warmup_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_warmup_model_namespace }} --dry-run=client -oyaml | oc apply -f - - + shell: + set -o pipefail; + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_warmup_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_warmup_model_namespace }} --dry-run=client -oyaml | oc apply -f - + - name: Apply the warmup Pod template template: src: "{{ wisdom_warmup_pod_template }}"