From c7cfe86a226a63b44bd35055c994b7527793ede8 Mon Sep 17 00:00:00 2001 From: Carlos Camacho Date: Wed, 23 Aug 2023 14:54:30 +0200 Subject: [PATCH] testing: partial lint fixes This commit fixes violations of the following lint rules: - 'key-order[task]' - 'name[missing]' - 'yaml[trailing-spaces]' - 'risky-shell-pipe' - 'args[module]' It also removes the branch constraint, so the Ansible lint checks now run from any fork: pushing a branch in a contributor's fork triggers the checks there. A future improvement could be to run all the lint checks from a single environment, such as `tox -e linters`, instead of consuming the GH action. Partially-solves: #10 --- .github/workflows/ansible-lint.yml | 3 - config/ansible-lint.yml | 7 -- .../tasks/main.yml | 37 +++++----- .../tasks/main.yml | 30 ++++---- .../cluster/cluster_create_osd/tasks/main.yml | 10 +-- .../cluster_deploy_aws_efs/tasks/aws-efs.yaml | 8 +-- .../cluster_deploy_ldap/tasks/main.yml | 7 +- .../cluster_deploy_operator/tasks/main.yml | 68 +++++++++---------- .../cluster_ensure_machineset/tasks/main.yml | 5 +- .../cluster_fill_workernodes/tasks/main.yml | 19 +++--- .../tasks/cluster_set.yaml | 10 +-- .../cluster/cluster_set_scale/tasks/main.yml | 19 ++++-- .../tasks/main.yml | 8 ++- .../tasks/main.yml | 5 +- .../entitlement_deploy/tasks/main.yml | 14 ++-- .../tasks/main.yml | 3 +- .../tasks/main.yml | 3 +- .../tasks/deploy_clusterpolicy.yml | 9 +-- .../tasks/deploy_from_catalog.yml | 3 +- .../tasks/main.yml | 12 +++- .../gpu_operator_run_gpu_burn/tasks/main.yml | 13 +++- .../tasks/main.yml | 3 +- .../tasks/metrics.yml | 3 +- .../load_aware_deploy_trimaran/tasks/main.yml | 32 +++++++-- .../load_aware_scale_test/tasks/main.yml | 7 +- .../local_ci_run_multi/tasks/main.yml | 7 +- roles/nfd/nfd_test_wait_gpu/tasks/main.yml | 14 +++- roles/ocm/ocm_deploy_addon/tasks/main.yml | 8 +-- .../pipelines_capture_state/tasks/main.yml | 6 +- .../pipelines_run_kfp_notebook/tasks/main.yml | 17 +++-- .../tasks/main.yml | 7 +- .../tasks/main.yml | 12 +++- .../wisdom/wisdom_deploy_model/tasks/main.yml | 29 ++++---- .../wisdom_llm_load_test/tasks/main.yml | 6 +- .../tasks/main.yml | 6 +- .../wisdom/wisdom_warmup_model/tasks/main.yml | 8 ++- 36 files changed, 280 insertions(+), 178 deletions(-) diff --git a/.github/workflows/ansible-lint.yml b/.github/workflows/ansible-lint.yml index 6a070ce47..201678dda 100644 --- a/.github/workflows/ansible-lint.yml +++ b/.github/workflows/ansible-lint.yml @@ -3,11 +3,8 @@ name: Run ansible-lint # Controls when the action will run.
on: - # Triggers the workflow on push or pull request events but only for the main branch pull_request: - branches: [main] push: - branches: [main] schedule: - cron: '0 */8 * * *' # Allows you to run this workflow manually from the Actions tab diff --git a/config/ansible-lint.yml b/config/ansible-lint.yml index 9bb28edf0..5eea32934 100644 --- a/config/ansible-lint.yml +++ b/config/ansible-lint.yml @@ -14,11 +14,8 @@ skip_list: - 'command-instead-of-module' - 'command-instead-of-shell' - 'deprecated-local-action' - - 'key-order[task]' - - 'jinja[spacing]' - 'no-free-form' - 'chema[meta]' - - 'name[missing]' - 'var-naming[no-reserved]' - 'var-naming[no-role-prefix]' - 'var-naming[pattern]' @@ -29,15 +26,11 @@ skip_list: - 'yaml[indentation]' - 'yaml[key-duplicates]' - 'yaml[line-length]' - - 'yaml[new-line-at-end-of-file]' - 'yaml[octal-values]' - - 'yaml[trailing-spaces]' - 'yaml[truthy]' - 'name[template]' - 'name[casing]' - 'risky-file-permissions' - - 'risky-shell-pipe' - 'ignore-errors' - 'no-changed-when' - 'fqcn' - - 'args[module]' diff --git a/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml b/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml index 99b8991a5..82c1289ec 100644 --- a/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml +++ b/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml @@ -7,24 +7,24 @@ command: oc get nodes -l kubernetes.io/hostname={{ benchmarking_run_mlperf_ssd_node_hostname }} -oname - name: Ensure that the coco dataset PVC exists - command: - oc get pvc/{{ benchmarking_run_mlperf_ssd_dataset_pvc_name }} + shell: | + oc get pvc/{{ benchmarking_run_mlperf_ssd_dataset_pvc_name }} \ -n {{ benchmarking_run_mlperf_ssd_namespace }} - name: Fetch the coco dataset PVC definition (debug) - shell: - oc get pvc/{{ benchmarking_run_mlperf_ssd_pvc_name }} - -n {{ benchmarking_run_mlperf_ssd_namespace }} - -oyaml + shell: | + oc get pvc/{{ benchmarking_run_mlperf_ssd_pvc_name }} \ + -n {{ benchmarking_run_mlperf_ssd_namespace }} \ + -oyaml \ > {{ artifact_extra_logs_dir }}/pvc_coco-dataset.yml - name: Create the entrypoint ConfigMap file - shell: - oc create cm {{ benchmarking_run_mlperf_ssd_entrypoint_cm_name }} - --from-file="{{ benchmarking_mlperf_ssd_entrypoint }}" - -n {{ benchmarking_run_mlperf_ssd_namespace }} - --dry-run=client - -oyaml + shell: | + oc create cm {{ benchmarking_run_mlperf_ssd_entrypoint_cm_name }} \ + --from-file="{{ benchmarking_mlperf_ssd_entrypoint }}" \ + -n {{ benchmarking_run_mlperf_ssd_namespace }} \ + --dry-run=client \ + -oyaml \ > {{ artifact_extra_logs_dir }}/000_configmap_run-mlperf-ssd_entrypoint.yml - name: Create the entrypoint ConfigMap resource @@ -45,7 +45,8 @@ command: oc create -f "{{ artifact_extra_logs_dir }}/001_pod_run-mlperf-ssd.yml" -- block: +- name: Make sure the benchmark completes + block: - name: Wait for the benchmark completion command: oc get pod/{{ benchmarking_run_mlperf_ssd_name }} @@ -63,8 +64,8 @@ always: - name: Store the logs of benchmark execution (for post-processing) - shell: - oc logs pod/{{ benchmarking_run_mlperf_ssd_name }} -n {{ benchmarking_run_mlperf_ssd_namespace }} + shell: | + oc logs pod/{{ benchmarking_run_mlperf_ssd_name }} -n {{ benchmarking_run_mlperf_ssd_namespace }} \ > "{{ artifact_extra_logs_dir }}/pod_run-mlperf-ssd.log" failed_when: false @@ -73,13 +74,13 @@ echo "{{ wait_benchmark_pod_cmd.stdout }}" > "{{ artifact_extra_logs_dir }}/pod_run-mlperf-ssd.status" - name: Store the description of benchmark execution (debug) - shell: - oc 
describe pod/{{ benchmarking_run_mlperf_ssd_name }} -n {{ benchmarking_run_mlperf_ssd_namespace }} + shell: | + oc describe pod/{{ benchmarking_run_mlperf_ssd_name }} -n {{ benchmarking_run_mlperf_ssd_namespace }} \ > "{{ artifact_extra_logs_dir }}/pod_run-mlperf-ssd.descr" failed_when: false - name: Get average sample rate - shell: + shell: | set -o pipefail; cat "{{ artifact_extra_logs_dir }}/pod_run-mlperf-ssd.log" | grep avg. | tail -n1 | awk '{ print $NF " samples/sec" }' > "{{ artifact_dir }}/benchmarking_run_ssd_sample_rate.log"; cp {{ artifact_dir }}/benchmarking_run_ssd_sample_rate.log {{ artifact_extra_logs_dir }}/benchmarking_run_ssd_sample_rate.log diff --git a/roles/cluster/cluster_capture_environment/tasks/main.yml b/roles/cluster/cluster_capture_environment/tasks/main.yml index 5e007541d..e62ed54fd 100644 --- a/roles/cluster/cluster_capture_environment/tasks/main.yml +++ b/roles/cluster/cluster_capture_environment/tasks/main.yml @@ -1,34 +1,34 @@ - name: Store OpenShift version identifier - shell: + shell: | set -o pipefail; - oc version -o json - | jq --raw-output '.openshiftVersion' + oc version -o json \ + | jq --raw-output '.openshiftVersion' \ > {{ artifact_extra_logs_dir }}/ocp.version - name: Store OpenShift YAML version - shell: - oc version -oyaml + shell: | + oc version -oyaml \ > {{ artifact_extra_logs_dir }}/ocp_version.yml - name: Store OpenShift YAML clusterversion - shell: - oc get clusterversion/version -oyaml + shell: | + oc get clusterversion/version -oyaml \ > {{ artifact_extra_logs_dir }}/ocp_clusterversion.yml # --- - name: Store the OpenShift nodes - shell: - oc get nodes -owide + shell: | + oc get nodes -owide \ > {{ artifact_extra_logs_dir }}/nodes.status; - oc get nodes -oyaml + oc get nodes -oyaml \ > {{ artifact_extra_logs_dir }}/nodes.yaml; - name: Store the OpenShift machines - shell: - oc get machines -n openshift-machine-api -owide + shell: | + oc get machines -n openshift-machine-api -owide \ > {{ artifact_extra_logs_dir }}/machines.status; - oc get machines -n openshift-machine-api -oyaml + oc get machines -n openshift-machine-api -oyaml \ > {{ artifact_extra_logs_dir }}/machines.yaml; # --- @@ -37,8 +37,6 @@ command: git describe HEAD --long --always register: git_version - args: - warn: false # don't warn about using git here - name: Store ci-artifact version from Git copy: @@ -50,8 +48,6 @@ command: git show --no-patch register: git_show - args: - warn: false # don't warn about using git here - name: Store ci-artifact last git commit copy: diff --git a/roles/cluster/cluster_create_osd/tasks/main.yml b/roles/cluster/cluster_create_osd/tasks/main.yml index 91233317d..1e8cf8d79 100644 --- a/roles/cluster/cluster_create_osd/tasks/main.yml +++ b/roles/cluster/cluster_create_osd/tasks/main.yml @@ -117,11 +117,11 @@ oc get nodes > "{{ artifact_extra_logs_dir }}/nodes.status" - name: Set the desired worker node count - command: - ocm edit machinepool - {{ cluster_create_osd_machinepool_name }} - --cluster={{ cluster_create_osd_cluster_name }} - --replicas={{ [2, cluster_create_osd_compute_nodes|int] |max }} + shell: | + ocm edit machinepool \ + {{ cluster_create_osd_machinepool_name }} \ + --cluster={{ cluster_create_osd_cluster_name }} \ + --replicas={{ [2, cluster_create_osd_compute_nodes|int]|max }} - name: Wait for the desired worker node count shell: | diff --git a/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml b/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml index 7ace0b7d3..20d2e6409 100644 --- 
a/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml +++ b/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml @@ -96,11 +96,11 @@ - name: Populate the tags dict set_fact: - tags: "{{ tags | default({}) | combine ({ item.key : item.value }) }}" + tags: "{{ tags | default({}) | combine ({item.key: item.value}) }}" with_items: - - { 'key': 'Name' , 'value': '{{ cluster_name_cmd.stdout }}'} - - { 'key': '{{ cluster_name_tag_cmd.stdout }}' , 'value': 'owned'} - - { 'key': 'Purpose', 'value': ''} + - {'key': 'Name' , 'value': '{{ cluster_name_cmd.stdout }}'} + - {'key': '{{ cluster_name_tag_cmd.stdout }}' , 'value': 'owned'} + - {'key': 'Purpose', 'value': ''} - name: Get the SecurityGroup content amazon.aws.ec2_group_info: diff --git a/roles/cluster/cluster_deploy_ldap/tasks/main.yml b/roles/cluster/cluster_deploy_ldap/tasks/main.yml index 2a20c6783..a7222f2ca 100644 --- a/roles/cluster/cluster_deploy_ldap/tasks/main.yml +++ b/roles/cluster/cluster_deploy_ldap/tasks/main.yml @@ -144,9 +144,12 @@ # Workaround until `ocm` supports the --insecure flag - name: Get the cluster ID - shell: + shell: | + set -o pipefail ocm describe cluster "{{ cluster_deploy_ldap_cluster_name }}" --json | jq .id -r register: cluster_id_cmd + args: + executable: /bin/bash - name: Create the IDP resource manually shell: | @@ -157,6 +160,8 @@ url="https://api.openshift.com/api/clusters_mgmt/v1/clusters/{{ cluster_id_cmd.stdout }}/identity_providers"; cat "{{ cluster_deploy_ldap_ocm_idp }}" | envsubst > /tmp/idp.json ocm post "$url" --body /tmp/idp.json + args: + executable: /bin/bash - name: Get the API URL command: oc whoami --show-server diff --git a/roles/cluster/cluster_deploy_operator/tasks/main.yml b/roles/cluster/cluster_deploy_operator/tasks/main.yml index f54d27053..3012d54ea 100644 --- a/roles/cluster/cluster_deploy_operator/tasks/main.yml +++ b/roles/cluster/cluster_deploy_operator/tasks/main.yml @@ -45,10 +45,10 @@ -n {{ cluster_deploy_operator_catalog_namespace }} - name: Capture the state of the CatalogSource (debug) - shell: - oc get -oyaml CatalogSource/{{ cluster_deploy_operator_catalog }} - -n {{ cluster_deploy_operator_catalog_namespace }} - -oyaml + shell: | + oc get -oyaml CatalogSource/{{ cluster_deploy_operator_catalog }} \ + -n {{ cluster_deploy_operator_catalog_namespace }} \ + -oyaml \ > {{ artifact_extra_logs_dir }}/catalogsource.yml failed_when: false @@ -61,24 +61,24 @@ delay: 30 - name: Save the operator PackageManifest YAML (debug) - shell: - oc get packagemanifests/{{ cluster_deploy_operator_manifest_name }} - -n {{ cluster_deploy_operator_catalog_namespace }} - -oyaml + shell: | + oc get packagemanifests/{{ cluster_deploy_operator_manifest_name }} \ + -n {{ cluster_deploy_operator_catalog_namespace }} \ + -oyaml \ > {{ artifact_extra_logs_dir }}/operator_packagemanifest.yml - name: Store the operator PackageManifest JSON - shell: - oc get packagemanifests/{{ cluster_deploy_operator_manifest_name }} - -n {{ cluster_deploy_operator_catalog_namespace }} - -ojson + shell: | + oc get packagemanifests/{{ cluster_deploy_operator_manifest_name }} \ + -n {{ cluster_deploy_operator_catalog_namespace }} \ + -ojson \ > {{ artifact_extra_logs_dir }}/operator_packagemanifest.json rescue: - name: Capture the Catalog Operator logs (debug) - shell: - oc logs deployment.apps/catalog-operator - -n openshift-operator-lifecycle-manager + shell: | + oc logs deployment.apps/catalog-operator \ + -n openshift-operator-lifecycle-manager \ > {{ artifact_extra_logs_dir }}/catalog_operator.log failed_when: 
false @@ -86,8 +86,8 @@ debug: msg="The logs of Catalog Operator have been saved in {{ artifact_extra_logs_dir }}/catalog_operator.log" - name: Mark the failure as flake - shell: - echo "Failed because the {{ cluster_deploy_operator_manifest_name }} PackageManifest is not available" + shell: | + echo "Failed because the {{ cluster_deploy_operator_manifest_name }} PackageManifest is not available" \ > "{{ artifact_extra_logs_dir }}/FLAKE" - name: Failed because the operator could not be found in the CatalogSource @@ -231,7 +231,8 @@ - name: Instantiate the Subscription command: oc apply -f "{{ artifact_extra_logs_dir }}/src/002_sub.yml" -- block: +- name: Make sure the InstallPlan is deployed + block: - name: Find the operator InstallPlan command: oc get InstallPlan @@ -276,33 +277,32 @@ fail: msg="ClusterServiceVersion install not successful ({{ operator_csv_phase.stdout }})" when: operator_csv_phase.stdout != "Succeeded" - always: - - name: Store the YAML of the operator CSV that was installed (debug) - shell: - oc get ClusterServiceVersion/{{ operator_csv_name }} - -oyaml - -n "{{ cluster_deploy_operator_namespace }}" - > {{ artifact_extra_logs_dir }}/operator_csv.yml - - name: Store the YAML of the subscription - shell: - oc get -f "{{ artifact_extra_logs_dir }}/src/002_sub.yml" - -oyaml - -n "{{ cluster_deploy_operator_namespace }}" - > {{ artifact_extra_logs_dir }}/operator_sub.yml rescue: - name: Capture the Catalog Operator logs (debug) - shell: + shell: | oc logs deployment.apps/catalog-operator -n openshift-operator-lifecycle-manager > {{ artifact_extra_logs_dir }}/catalog_operator.log failed_when: false - - name: Indicate where the Catalog-operator logs have been saved debug: msg="The logs of Catalog Operator have been saved in {{ artifact_extra_logs_dir }}/catalog_operator.log" - - name: Failed because the operator could not be installed from the CatalogSource fail: msg="Failed because the operator could not be installed from the CatalogSource" + always: + - name: Store the YAML of the operator CSV that was installed (debug) + shell: | + oc get ClusterServiceVersion/{{ operator_csv_name }} \ + -oyaml \ + -n "{{ cluster_deploy_operator_namespace }}" \ + > {{ artifact_extra_logs_dir }}/operator_csv.yml + - name: Store the YAML of the subscription + shell: | + oc get -f "{{ artifact_extra_logs_dir }}/src/002_sub.yml" \ + -oyaml \ + -n "{{ cluster_deploy_operator_namespace }}" \ + > {{ artifact_extra_logs_dir }}/operator_sub.yml + - name: Deploy the operator CustomResource from its ClusterServiceVersion include_tasks: deploy_cr.yml when: cluster_deploy_operator_deploy_cr | bool diff --git a/roles/cluster/cluster_ensure_machineset/tasks/main.yml b/roles/cluster/cluster_ensure_machineset/tasks/main.yml index e83ef253e..b4976b39f 100644 --- a/roles/cluster/cluster_ensure_machineset/tasks/main.yml +++ b/roles/cluster/cluster_ensure_machineset/tasks/main.yml @@ -1,6 +1,7 @@ --- - name: "Check if the cluster already has a {{ machineset_instance_type }} machineset" - shell: + shell: | + set -o pipefail oc get machineset -n openshift-machine-api {% if machineset_name | length > 0 %} -ojson | jq '.items[] | select(.spec.template.spec.providerSpec.value.instanceType=="{{ machineset_instance_type }}" and .metadata.name=="{{ machineset_name }}") | .metadata.name' -r @@ -8,6 +9,8 @@ -o=jsonpath='{.items[?(@.spec.template.spec.providerSpec.value.instanceType=="{{ machineset_instance_type }}")].metadata.name}' {% endif %} register: cluster_has_machineset + args: + executable: /bin/bash - name: 
Delete the machineset if it is set but has the wrong instance type when: not cluster_has_machineset.stdout and machineset_name | length > 0 diff --git a/roles/cluster/cluster_fill_workernodes/tasks/main.yml b/roles/cluster/cluster_fill_workernodes/tasks/main.yml index 13cd5d334..9e31e92da 100644 --- a/roles/cluster/cluster_fill_workernodes/tasks/main.yml +++ b/roles/cluster/cluster_fill_workernodes/tasks/main.yml @@ -1,12 +1,15 @@ - name: Get the list of the worker nodes - shell: - oc get nodes - -l{{ cluster_fill_workernodes_label_selector }} - -oname + shell: | + set -o pipefail + oc get nodes \ + -l{{ cluster_fill_workernodes_label_selector }} \ + -oname \ | cut -d/ -f2 register: worker_node_names_cmd failed_when: not worker_node_names_cmd.stdout + args: + executable: /bin/bash - name: Create the src artifacts directory file: @@ -21,8 +24,8 @@ include_tasks: fill_node.yaml - name: Store the definition of the nodes - shell: - oc get nodes - -lnode-role.kubernetes.io/worker - -oyaml + shell: | + oc get nodes \ + -lnode-role.kubernetes.io/worker \ + -oyaml \ > "{{ artifact_extra_logs_dir }}/nodes.yaml" diff --git a/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml b/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml index 9c132023f..451c379d2 100644 --- a/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml +++ b/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml @@ -1,5 +1,5 @@ - name: Get the project template if it exists - shell: + shell: | set -o pipefail; oc adm create-bootstrap-project-template -ojson | oc get -n openshift-config -f- -ojson | jq .items[0] > "{{ artifact_extra_logs_dir }}/base_project_template.yaml" @@ -8,7 +8,7 @@ - name: Create the project template if it did not exist when: get_project_template_cmd.rc != 0 - shell: + shell: | oc adm create-bootstrap-project-template -ojson > "{{ artifact_extra_logs_dir }}/base_project_template.yaml" @@ -24,7 +24,7 @@ - name: Remove the annotation when: (cluster_set_project_annotation_value | default('') or '') | length == 0 - shell: + shell: | set -o pipefail; cat "{{ artifact_extra_logs_dir }}/base_project_template.yaml" | jq 'del(.objects[0].metadata.annotations["{{ cluster_set_project_annotation_key }}"])' @@ -55,8 +55,8 @@ echo "- deleting the project $test_project_name ..." 
oc delete ns "$test_project_name" >/dev/null echo "--> project annotation value: $project_annotation_value" - echo "==> expected value: "'{{ cluster_set_project_annotation_value or "null"}}' - [[ "$project_annotation_value" == '{{ cluster_set_project_annotation_value or "null"}}' ]] + echo "==> expected value: "'{{ cluster_set_project_annotation_value or "null" }}' + [[ "$project_annotation_value" == '{{ cluster_set_project_annotation_value or "null" }}' ]] retries: 120 delay: 5 register: wait_project_template_active diff --git a/roles/cluster/cluster_set_scale/tasks/main.yml b/roles/cluster/cluster_set_scale/tasks/main.yml index 23e08eabc..76ff43744 100644 --- a/roles/cluster/cluster_set_scale/tasks/main.yml +++ b/roles/cluster/cluster_set_scale/tasks/main.yml @@ -48,8 +48,8 @@ register: oc_get_machinesets failed_when: not oc_get_machinesets.stdout -- when: current_replicas_sum != scale - name: Change all {{ machineset_instance_type }} machinesets replicas to have sum {{ scale }} +- name: Change all {{ machineset_instance_type }} machinesets replicas to have sum {{ scale }} + when: current_replicas_sum != scale block: - name: Do not downscale any machinesets other than the first one, unless the user used force block: @@ -80,7 +80,8 @@ oc patch machineset -n openshift-machine-api {{ first_machineset }} --patch '{"spec": {"replicas": {{ scale }} }}' --type merge -- block: +- name: Make sure the machinesets are ready + block: - name: Wait for all machinesets with type {{ machineset_instance_type }} to be ready # This is done by verifying that at the availableReplicas @@ -91,7 +92,8 @@ # See https://docs.openshift.com/container-platform/4.7/rest_api/machine_apis/machineset-machine-openshift-io-v1beta1.html # for more information. # 3. Perform some extra formatting for nicer logging - shell: >- + shell: | + set -o pipefail oc get machinesets -n openshift-machine-api \ {% if machineset_name | length > 0 %} "{{ machineset_name }}" -ojson \ @@ -110,6 +112,12 @@ until: not non_ready_replicas.stdout_lines retries: 120 delay: 30 + args: + executable: /bin/bash + + rescue: + - name: Fail because the cluster machineset creation failed + fail: msg="Failing because cluster machineset creation failed" always: # info about the 'machines' @@ -150,7 +158,4 @@ failed_when: false loop: "{{ oc_get_machinesets.stdout_lines }}" - rescue: - - name: Fail because the cluster machineset creation failed - fail: msg="Failing because cluster machineset creation failed" diff --git a/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml b/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml index c97c8deaa..a7e058fa0 100644 --- a/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml +++ b/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml @@ -1,11 +1,17 @@ - name: List all the AppWrappers in the namespace shell: | + set -o pipefail oc get appwrappers -n {{ codeflare_cleanup_appwrappers_namespace }} -ojson | jq '.items[] | .metadata.name + " ==> "+ .status.state' -r > "{{ artifact_extra_logs_dir }}/appwrappers.status" oc get appwrappers -n {{ codeflare_cleanup_appwrappers_namespace }} -oyaml > "{{ artifact_extra_logs_dir }}/appwrappers.yaml" + args: + executable: /bin/bash - name: Count the AppWrappers in the namespace - shell: + shell: | + set -o pipefail oc get appwrappers -n {{ codeflare_cleanup_appwrappers_namespace }} -oname | wc -l + args: + executable: /bin/bash - name: Create a configmap for the beginning of the test timestamp shell: diff --git 
a/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml b/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml index a91ca5909..698b57dd3 100644 --- a/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml +++ b/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml @@ -42,8 +42,11 @@ mode: '0755' - name: Create the namespace if it does not exist - shell: + shell: | + set -o pipefail oc create ns "{{ codeflare_generate_mcad_load_namespace }}" -oyaml --dry-run=client | tee "{{ artifact_extra_logs_dir }}/src/namespace.yaml" | oc apply -f- + args: + executable: /bin/bash - name: Create a configmap for the beginning of the test timestamp shell: diff --git a/roles/entitlement/entitlement_deploy/tasks/main.yml b/roles/entitlement/entitlement_deploy/tasks/main.yml index 2af07d59e..362ba7653 100644 --- a/roles/entitlement/entitlement_deploy/tasks/main.yml +++ b/roles/entitlement/entitlement_deploy/tasks/main.yml @@ -29,31 +29,37 @@ machine_config_role: worker - name: "Deploy RHSM from file '{{ entitlement_rhsm }}'" - shell: + shell: | set -o pipefail; cat "{{ entitlement_mc_rhsm }}" | python3 "{{ entitlement_py_apply }}" BASE64_ENCODED_RHSM_FILE @"{{ entitlement_rhsm }}" | python3 "{{ entitlement_py_apply }}" MACHINE_CONFIG_ROLE "{{ machine_config_role }}" | oc apply -f- + args: + executable: /bin/bash - name: "Deploy the pem and key-pem from file '{{ entitlement_pem }}'" - shell: + shell: | set -o pipefail; cat "{{ entitlement_mc_pem }}" | python3 "{{ entitlement_py_apply }}" BASE64_ENCODED_PEM_FILE @"{{ entitlement_pem }}" | python3 "{{ entitlement_py_apply }}" MACHINE_CONFIG_ROLE "{{ machine_config_role }}" | oc apply -f- + args: + executable: /bin/bash - name: "Deploy the repo CA from file '{{ entitlement_mc_rhsm_ca }}' if requested" + when: entitlement_repo_ca | default('', true) | trim != '' block: - name: Compute the md5sum of the CA file (debug) command: md5sum '{{ entitlement_repo_ca }}' - name: "Deploy the repo CA from file '{{ entitlement_mc_rhsm_ca }}'" - shell: + shell: | set -o pipefail; cat "{{ entitlement_mc_rhsm_ca }}" | python3 "{{ entitlement_py_apply }}" BASE64_ENCODED_RHSM_CA_FILE @"{{ entitlement_repo_ca }}" | python3 "{{ entitlement_py_apply }}" MACHINE_CONFIG_ROLE "{{ machine_config_role }}" | oc apply -f- - when: entitlement_repo_ca | default('', true) | trim != '' + args: + executable: /bin/bash diff --git a/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml b/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml index 4f852a985..fd931988e 100644 --- a/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml +++ b/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml @@ -26,7 +26,8 @@ - name: Delete the entitlement tester Pod if it exists command: oc delete --ignore-not-found=true -f "{{ entitlement_tester_pod }}" -- block: +- name: Make sure the entitlement Pod is created + block: - name: Create the entitlement tester Pod command: oc create -f "{{ entitlement_tester_pod }}" diff --git a/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml b/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml index fa97dc17a..083d958a1 100644 --- a/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml +++ b/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml @@ -9,7 +9,8 @@ entitlement_retries: "{{ entitlement_nb_wait_retries }}" when: entitlement_test_and_wait == 'yes' -- block: +- name: Make sure the entitlement Pod is deployed + block: - name: Wait for the entitlement Pod to succeed 
shell: | set -o errexit; diff --git a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml index 049159fdf..5fc04be4d 100644 --- a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml +++ b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml @@ -10,7 +10,8 @@ retries: 15 delay: 30 -- block: +- name: Make sure the GPU Operator ClusterPolicy is ready + block: - name: Wait for the GPU Operator ClusterPolicy CRD to appear command: oc get crd clusterpolicies.nvidia.com register: has_clusterpolicy_crd @@ -20,14 +21,14 @@ rescue: - name: Inspect the Subscriptions status (debug) - shell: + shell: | (oc get subscriptions.operators.coreos.com -n "{{ deploy_bundle_namespace }}" && oc describe subscriptions.operators.coreos.com/gpu-operator-certified -n "{{ deploy_bundle_namespace }}") > {{ artifact_extra_logs_dir }}/gpu_operator_Subscription.log failed_when: false - name: Get the ClusterServiceVersion status (debug) - shell: + shell: | (oc get ClusterServiceVersion -A && oc describe "{{ gpu_operator_csv_name_cmd.stdout }}" -n "{{ deploy_bundle_namespace }}") > {{ artifact_extra_logs_dir }}/gpu_operator_ClusterServiceVersion.log @@ -37,7 +38,7 @@ fail: msg="Failed because the ClusterPolicy CR cannot be created" - name: Get the clusterpolicy of the GPU Operator from OperatorHub CSV - shell: + shell: | set -o pipefail; oc get "{{ gpu_operator_csv_name_cmd.stdout }}" -n "{{ deploy_bundle_namespace }}" diff --git a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml index 29c8446fc..282ef6230 100644 --- a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml +++ b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml @@ -113,7 +113,8 @@ - name: Instantiate the OperatorHub subscription command: oc create -f "{{ artifact_extra_logs_dir }}/gpu_operator_sub.yml" -- block: +- name: Install the GPU Operator + block: - name: Find the GPU Operator OperatorHub InstallPlan command: oc get InstallPlan diff --git a/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml b/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml index b0f62d319..db0a503a0 100644 --- a/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml +++ b/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml @@ -9,12 +9,20 @@ - name: Get the gpu-operator subscription package name block: - name: Count gpu operator subscription candidates - shell: oc get subscription --all-namespaces -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))] | length' + shell: | + set -o pipefail + oc get subscription --all-namespaces -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))] | length' register: gpu_subscriptions failed_when: gpu_subscriptions.stdout != '1' + args: + executable: /bin/bash - name: Read the package name from the first gpu-operator subscription - shell: oc get subscription -A -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))][0].spec.name' + shell: | + set -o pipefail + oc get subscription -A -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))][0].spec.name' register: gpu_operator_subscription_package_name + args: + executable: /bin/bash - name: Ensure that there is a CSV for the GPU 
Operator command: oc get csv diff --git a/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml b/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml index 3f9cce344..6bc2e9311 100644 --- a/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml +++ b/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml @@ -90,16 +90,23 @@ - name: Ensure that no GPU was faulty loop: "{{ gpu_burn_gpu_nodes.stdout_lines }}" - shell: + shell: | + set -o pipefail oc logs pod/gpu-burn-{{ item }} -n {{ gpu_operator_run_gpu_burn_namespace }} | grep FAULTY register: gpu_burn_test_faulty failed_when: gpu_burn_test_faulty.rc == 0 + args: + executable: /bin/bash always: - name: Save the logs of the GPU burn Pods - shell: oc logs pod/gpu-burn-{{ item }} -n {{ gpu_operator_run_gpu_burn_namespace }} | grep -o "[^$(printf '\r')]*$" + shell: | + set -o pipefail + oc logs pod/gpu-burn-{{ item }} -n {{ gpu_operator_run_gpu_burn_namespace }} | grep -o "[^$(printf '\r')]*$" with_items: "{{ gpu_burn_gpu_nodes.stdout_lines }}" failed_when: false + args: + executable: /bin/bash - name: Save the description of the GPU burn Pods shell: oc describe pod/gpu-burn-{{ item }} -n {{ gpu_operator_run_gpu_burn_namespace }} > {{ artifact_extra_logs_dir }}/gpu_burn.{{ item }}.description.txt @@ -124,4 +131,4 @@ - name: Delete the src ConfigMap command: oc --ignore-not-found=true delete configmap gpu-burn-src -n {{ gpu_operator_run_gpu_burn_namespace }} failed_when: false - when: not gpu_operator_run_gpu_burn_keep_resources \ No newline at end of file + when: not gpu_operator_run_gpu_burn_keep_resources diff --git a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml index b90de5a92..e5950f328 100644 --- a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml +++ b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml @@ -78,7 +78,8 @@ retries: 15 delay: 60 -- block: +- name: Wait for the nodes labeling + block: - name: Wait for the gpu-feature-discovery Pod to label the nodes command: oc get nodes -l nvidia.com/gpu.count -oname register: has_gpu_feature_discovery_labels diff --git a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml index 57e3ac7ac..3cf94ef2b 100644 --- a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml +++ b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml @@ -1,5 +1,6 @@ --- -- block: +- name: Create the GPU operator requirements + block: - name: Check if the GPU Operator namespace has the openshift.io/cluster-monitoring label shell: set -o pipefail; diff --git a/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml b/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml index 29c7ad151..d3669bb75 100644 --- a/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml +++ b/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml @@ -3,9 +3,8 @@ oc apply -f {{ user_applications_monitor_config }} - name: Ensure user monitoring enabled - shell: + shell: | set -o pipefail; - oc -n openshift-user-workload-monitoring get pod --no-headers | awk '{ print $3}' register: monitoring_enabled @@ -13,9 +12,12 @@ retries: 20 # until all of the pods are in the running state until: "'Completed' not in monitoring_enabled.stdout and 'Failed' not in monitoring_enabled.stdout and 'Pending' not in monitoring_enabled.stdout" + args: + executable: /bin/bash - name: Get monitoring secret name - shell: 
+ shell: | + set -o pipefail oc get secret -n openshift-user-workload-monitoring | grep prometheus-user-workload-token | head -n 1 @@ -24,18 +26,24 @@ delay: 3 retries: 20 until: "'prometheus-user-workload-token' in monitoring_secret.stdout" + args: + executable: /bin/bash - name: Get Thanos Endpoint - shell: + shell: | + set -o pipefail oc get route thanos-querier -n openshift-monitoring -o json | jq -r '.spec.host' register: thanos_endpoint_cmd + args: + executable: /bin/bash + - name: Format the Thanos Endpoint set_fact: thanos_endpoint: "https://{{ thanos_endpoint_cmd.stdout }}" - name: Checking monitoring token size - shell: + shell: | set -o errexit; set -o pipefail; set -o nounset; @@ -49,6 +57,8 @@ delay: 2 retries: 30 until: token_size.stdout | int > 1000 + args: + executable: /bin/bash - name: Create the src artifacts directory file: @@ -79,7 +89,8 @@ register: deploy_trimaran - name: Ensure Trimaran is Running - shell: + shell: | + set -o pipefail oc get pods -n trimaran | grep "trimaran-scheduler" | awk '{print $3}' @@ -87,6 +98,10 @@ delay: 3 retries: 20 until: trimaran_running.stdout == 'Running' + args: + executable: /bin/bash + + - name: Ensure a pod can be scheduled block: @@ -95,13 +110,16 @@ oc apply -f {{ trimaran_test_pod }} -n trimaran - name: Ensure the Trimaran test pods completes - shell: + shell: | + set -o pipefail oc get pod trimaran-test -n trimaran --no-headers | awk '{print $3}' register: trimaran_test_pod_state delay: 5 retries: 20 until: trimaran_test_pod_state.stdout == 'Completed' + args: + executable: /bin/bash always: - name: Dump trimaran info diff --git a/roles/load_aware/load_aware_scale_test/tasks/main.yml b/roles/load_aware/load_aware_scale_test/tasks/main.yml index 054bb3524..635e56eb2 100644 --- a/roles/load_aware/load_aware_scale_test/tasks/main.yml +++ b/roles/load_aware/load_aware_scale_test/tasks/main.yml @@ -1,6 +1,6 @@ - name: Generate load timeline shell: - python3 {{ load_timeline_generator }} {{ load_aware_scale_test_distribution }} {{ load_aware_scale_test_duration}} {{ load_aware_scale_test_instances }} "{{ artifact_extra_logs_dir }}/schedule_plan.yaml" + python3 {{ load_timeline_generator }} {{ load_aware_scale_test_distribution }} {{ load_aware_scale_test_duration }} {{ load_aware_scale_test_instances }} "{{ artifact_extra_logs_dir }}/schedule_plan.yaml" - name: Run workload and dump stats block: @@ -9,7 +9,8 @@ python3 {{ pod_start_scheduler }} {{ load_aware_scale_test_scheduler }} {{ load_aware_scale_test_namespace }} "{{ artifact_extra_logs_dir }}/schedule_plan.yaml" "{{ artifact_extra_logs_dir }}/schedule_execution.yaml" {{ load_aware_scale_test_sleep_duration }} - name: Wait for workloads to finish - shell: + shell: | + set -o pipefail oc get pods -n {{ load_aware_scale_test_namespace }} --no-headers | awk '{ print $3 }' register: load_aware_workload delay: 60 @@ -20,6 +21,8 @@ and 'Failed' not in load_aware_workload.stdout and 'ContainerCreating' not in load_aware_workload.stdout and 'ImagePullBackOff' not in load_aware_workload.stdout" + args: + executable: /bin/bash always: - name: Dump info about scale test resources diff --git a/roles/local_ci/local_ci_run_multi/tasks/main.yml b/roles/local_ci/local_ci_run_multi/tasks/main.yml index adfcd5272..0295ad158 100644 --- a/roles/local_ci/local_ci_run_multi/tasks/main.yml +++ b/roles/local_ci/local_ci_run_multi/tasks/main.yml @@ -213,9 +213,12 @@ register: success_count_cmd - name: Store the success count - shell: + shell: | + set -o pipefail echo "{{ 
success_count_cmd.stdout_lines[0] }}/{{ local_ci_run_multi_user_count }}" - | tee "{{ artifact_extra_logs_dir }}/success_count" + | tee "{{ artifact_extra_logs_dir }}/success_count" + args: + executable: /bin/bash # the tasks below will abort the execution in case of problems diff --git a/roles/nfd/nfd_test_wait_gpu/tasks/main.yml b/roles/nfd/nfd_test_wait_gpu/tasks/main.yml index b38cafaa9..1e5c95e9b 100644 --- a/roles/nfd/nfd_test_wait_gpu/tasks/main.yml +++ b/roles/nfd/nfd_test_wait_gpu/tasks/main.yml @@ -9,11 +9,13 @@ nfd_wait_gpu_retries: "{{ nfd_wait_gpu_nb_retries }}" when: nfd_wait_gpu_nodes == 'yes' -- block: +- name: Converge the GPU nodes creation + block: - name: Wait for the GPU nodes to appear # label list should be in sync with: # https://github.com/NVIDIA/gpu-operator/blob/master/pkg/controller/clusterpolicy/state_manager.go#L26 - shell: + shell: | + set -o pipefail ( oc get nodes -oname --ignore-not-found=false -l feature.node.kubernetes.io/pci-10de.present || oc get nodes -oname --ignore-not-found=false -l feature.node.kubernetes.io/pci-0302_10de.present || oc get nodes -oname --ignore-not-found=false -l feature.node.kubernetes.io/pci-0300_10de.present @@ -22,10 +24,16 @@ until: nfd_gpu_wait.rc == 0 retries: "{{ nfd_wait_gpu_retries }}" delay: 30 + args: + executable: /bin/bash rescue: - name: Get the labels of the worker nodes (debug) - shell: oc get nodes --show-labels --selector='!node-role.kubernetes.io/master' | tr , '\n' + shell: | + set -o pipefail + oc get nodes --show-labels --selector='!node-role.kubernetes.io/master' | tr , '\n' + args: + executable: /bin/bash - name: Failing because no GPU node showed up fail: msg="Failed because no GPU node showed up" diff --git a/roles/ocm/ocm_deploy_addon/tasks/main.yml b/roles/ocm/ocm_deploy_addon/tasks/main.yml index 5ebe520c0..c55375400 100644 --- a/roles/ocm/ocm_deploy_addon/tasks/main.yml +++ b/roles/ocm/ocm_deploy_addon/tasks/main.yml @@ -24,9 +24,9 @@ cluster_id: "{{ cluster_id_cmd.stdout }}" - name: Check if addon is already installed - shell: + shell: | set -o pipefail; - ocm get /api/clusters_mgmt/v1/clusters/{{ cluster_id }}/addons/{{ ocm_deploy_addon_id }} |& jq -r '.kind' || true + ocm get /api/clusters_mgmt/v1/clusters/{{ cluster_id }}/addons/{{ ocm_deploy_addon_id }} |& jq -r '.kind' || true register: ocm_addon_precheck - name: Create ocm addon install payload @@ -36,11 +36,11 @@ mode: 0400 when: '"Error" in ocm_addon_precheck.stdout' -- name: "Install addon {{ ocm_deploy_addon_id }} via OCM API" +- name: Install addon {{ ocm_deploy_addon_id }} via OCM API shell: | set -o pipefail; url="/api/clusters_mgmt/v1/clusters/{{ cluster_id }}/addons" - body="{{ artifact_extra_logs_dir }}/addon_{{ ocm_deploy_addon_id }}.json" + body="{{ artifact_extra_logs_dir }}/addon_{{ ocm_deploy_addon_id }}.json" output=$(ocm post "$url" --body=$body 2>&1); echo "$output" >&2; # for observation echo "$output" | jq -r '.kind' diff --git a/roles/pipelines/pipelines_capture_state/tasks/main.yml b/roles/pipelines/pipelines_capture_state/tasks/main.yml index 08e39e2a2..aabd30d2b 100644 --- a/roles/pipelines/pipelines_capture_state/tasks/main.yml +++ b/roles/pipelines/pipelines_capture_state/tasks/main.yml @@ -10,11 +10,15 @@ namespace: "{% if pipelines_capture_state_namespace | length > 0 %}{{ pipelines_capture_state_namespace }}{% else %}{{ project_name_cmd.stdout }}{% endif %}" - name: Compute the DSP application name - shell: + shell: | + set -o pipefail oc get dspa -oname -n "{{ namespace }}" | head -1 | cut -d/ -f2 register:
dspa_name_cmd when: not pipelines_capture_state_dsp_application_name failed_when: not dspa_name_cmd.stdout + args: + executable: /bin/bash + - name: Save the DSP application name set_fact: diff --git a/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml b/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml index 7893b694e..1fe017b1c 100644 --- a/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml +++ b/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml @@ -106,11 +106,14 @@ # but Task 'Store the listing of the notebook directory' will catch it earlier - name: Compute the DSP application name - shell: + shell: | + set -o pipefail oc get dspa -oname -n "{{ notebook_namespace }}" | head -1 | cut -d/ -f2 register: dspa_name_cmd when: not pipelines_run_kfp_notebook_dsp_application_name failed_when: not dspa_name_cmd.stdout + args: + executable: /bin/bash - name: Save the DSP application name set_fact: @@ -149,7 +152,7 @@ sa_cluster_role: cluster-admin - name: Create the service account that will be used in the Notebook - shell: + shell: | set -o pipefail; oc create serviceaccount "{{ sa_name }}" -n "{{ notebook_namespace }}" @@ -158,11 +161,11 @@ | oc apply -f- - name: Grant all the privileges to the service account - shell: + shell: | set -o pipefail; oc adm policy add-cluster-role-to-user "{{ sa_cluster_role }}" -n "{{ notebook_namespace }}" - -z "{{ sa_name}}" + -z "{{ sa_name }}" --dry-run=client -oyaml | tee "{{ artifact_extra_logs_dir }}/src/clusterrolebinding.yaml" | oc apply -f- @@ -191,9 +194,9 @@ - name: Run the test notebook block: - name: Create the secret token - shell: + shell: | oc create secret generic "{{ secret_token_name }}" - "--from-literal=token=$(oc create token '{{ sa_name}}' -n '{{ notebook_namespace }}')" + "--from-literal=token=$(oc create token '{{ sa_name }}' -n '{{ notebook_namespace }}')" -n "{{ notebook_namespace }}" register: create_secret_token_cmd failed_when: '"no token is currently in use for this session" in create_secret_token_cmd.stderr' @@ -205,7 +208,7 @@ -n "{{ notebook_namespace }}" - name: Wait for the Notebook Pod to start running - shell: + shell: | set -o pipefail; oc get pod {{ notebook_search_labels }} --ignore-not-found diff --git a/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml b/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml index f317cdc22..ee360ec15 100644 --- a/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml +++ b/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml @@ -40,7 +40,7 @@ - name: Save the imagestream tag set_fact: - image_tag: "{% if rhods_benchmark_notebook_performance_imagestream_tag | string %}{{ rhods_benchmark_notebook_performance_imagestream_tag }}{% else %}{{ image_tag_cmd.stdout }}{% endif %}" + image_tag: "{% if rhods_benchmark_notebook_performance_imagestream_tag | string %}{{ rhods_benchmark_notebook_performance_imagestream_tag }}{% else %}{{ image_tag_cmd.stdout }}{% endif %}" - name: Get the image address command: @@ -53,10 +53,13 @@ when: not rhods_benchmark_notebook_performance_use_rhods | bool block: - name: Compute the imagestream filename - shell: + shell: | + set -o pipefail echo -n "{{ rhods_benchmark_notebook_performance_imagestream }}" | sed 's/^s2i-//g' | sed 's/-notebook//g'; echo "-notebook-imagestream.yaml" register: imagestream_filename_cmd + args: + executable: /bin/bash - name: Fetch the imagestream definition from the source repository get_url: diff --git 
a/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml b/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml index 909225c43..a21647de2 100644 --- a/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml +++ b/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml @@ -21,7 +21,7 @@ - name: Update the Exclude tags if necessary set_fact: - rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags: "{% if rhods_notebook_ods_ci_scale_test_only_create_notebooks|bool %}JupyterLabORWait{% else %}{{ rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags }}{% endif %}" + rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags: "{% if rhods_notebook_ods_ci_scale_test_only_create_notebooks | bool %}JupyterLabORWait{% else %}{{ rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags }}{% endif %}" - name: Define the test environments set_fact: @@ -63,14 +63,20 @@ register: oc_api_url_cmd - name: Get the RHODS CSV name - shell: + shell: | + set -o pipefail oc get csv -oname -n redhat-ods-operator | grep rhods-operator register: rhods_csv_cmd + args: + executable: /bin/bash - name: Get the RHODS version - shell: + shell: | + set -o pipefail oc get {{ rhods_csv_cmd.stdout }} -n redhat-ods-operator -oname | grep rhods-operator | cut -d/ -f2 | cut -d. -f2- register: rhods_version_cmd + args: + executable: /bin/bash - name: Get the Dashboard Product name (to distinguish RHODS from ODH). Currently hardcoded to RHODS. # We'll have to find another way to distinguish RHODS from ODH, this doesn't work anymore: diff --git a/roles/wisdom/wisdom_deploy_model/tasks/main.yml b/roles/wisdom/wisdom_deploy_model/tasks/main.yml index 060186c4f..dd6591b3a 100644 --- a/roles/wisdom/wisdom_deploy_model/tasks/main.yml +++ b/roles/wisdom/wisdom_deploy_model/tasks/main.yml @@ -104,8 +104,12 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_deploy_model_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_deploy_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_deploy_model_namespace }} --dry-run=client -oyaml | oc apply -f - - + shell: | + set -o pipefail + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_deploy_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_deploy_model_namespace }} --dry-run=client -oyaml | oc apply -f - + args: + executable: /bin/bash + - name: Delete the entrypoint ConfigMap if it already exists command: oc delete --ignore-not-found configmap/wisdom-tester-entrypoint -n {{ wisdom_deploy_model_namespace }} @@ -129,17 +133,16 @@ command: oc create -f "{{ artifact_extra_logs_dir }}/src/003_wisdom_tester_pod.yml" -n {{ wisdom_deploy_model_namespace }} # Wait for model to respond without error -- block: - - name: Wait for the wisdom test Pod to terminate - command: - oc get pod/wisdom-tester - -n {{ wisdom_deploy_model_namespace }} - -o custom-columns=:.status.phase - --no-headers - register: tester_wait - until: tester_wait.stdout == "Succeeded" or tester_wait.stdout == "Error" or tester_wait.stdout == "Failed" - retries: 10 - delay: 30 +- name: Wait for the wisdom test Pod to terminate + command: + oc get pod/wisdom-tester + -n {{ wisdom_deploy_model_namespace }} + -o custom-columns=:.status.phase + --no-headers + register: tester_wait + until: tester_wait.stdout == "Succeeded" or tester_wait.stdout == "Error" or tester_wait.stdout == "Failed" + retries: 10 + delay: 30 - name: Save the wisdom test Pod logs shell: oc logs pod/wisdom-tester -n {{ wisdom_deploy_model_namespace }} > 
{{ artifact_extra_logs_dir }}/wisdom-tester-pod.log diff --git a/roles/wisdom/wisdom_llm_load_test/tasks/main.yml b/roles/wisdom/wisdom_llm_load_test/tasks/main.yml index 04ec4fe2b..7e44b6c8c 100644 --- a/roles/wisdom/wisdom_llm_load_test/tasks/main.yml +++ b/roles/wisdom/wisdom_llm_load_test/tasks/main.yml @@ -13,7 +13,11 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_llm_load_test_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_namespace }} --dry-run=client -oyaml | oc apply -f - + shell: | + set -o pipefail + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_namespace }} --dry-run=client -oyaml | oc apply -f - + args: + executable: /bin/bash # The results need to be pushed to S3 after the run, put S3 credentials in a Secret - name: Delete the S3 push secret for results if it exists diff --git a/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml b/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml index d90196982..b9c18ebf0 100644 --- a/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml +++ b/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml @@ -13,7 +13,11 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_llm_load_test_multiplexed_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_multiplexed_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_multiplexed_namespace }} --dry-run=client -oyaml | oc apply -f - + shell: | + set -o pipefail + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_multiplexed_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_multiplexed_namespace }} --dry-run=client -oyaml | oc apply -f - + args: + executable: /bin/bash # The results need to be pushed to S3 after the run, put S3 credentials in a Secret - name: Delete the S3 push secret for results if it exists diff --git a/roles/wisdom/wisdom_warmup_model/tasks/main.yml b/roles/wisdom/wisdom_warmup_model/tasks/main.yml index b25016b0f..28b35d8d4 100644 --- a/roles/wisdom/wisdom_warmup_model/tasks/main.yml +++ b/roles/wisdom/wisdom_warmup_model/tasks/main.yml @@ -20,8 +20,12 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_warmup_model_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_warmup_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_warmup_model_namespace }} --dry-run=client -oyaml | oc apply -f - - + shell: | + set -o pipefail + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_warmup_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_warmup_model_namespace }} --dry-run=client -oyaml | oc apply -f - + args: + executable: /bin/bash + - name: Apply the warmup Pod template template: src: "{{ wisdom_warmup_pod_template }}"
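
The recurring pattern behind the `risky-shell-pipe`, `name[missing]`, `key-order[task]`, and `args[module]` fixes above can be summed up in a single task. Below is a minimal sketch with a hypothetical task name and namespace (not taken from the repository): piped `shell` commands get `set -o pipefail` plus an explicit bash `executable` (pipefail is not POSIX sh), every task and block gets a `name` as its first key, and module options move under proper YAML keys instead of free-form arguments.

- name: Store the pod phases for debugging  # name first: fixes name[missing] and key-order[task]
  shell: |
    set -o pipefail
    oc get pods -n example-namespace --no-headers | awk '{print $3}' \
      > "{{ artifact_extra_logs_dir }}/pods.phase"
  args:
    executable: /bin/bash  # pipefail is a bashism, so point the shell module at bash

Without the `executable` override, `set -o pipefail` fails on hosts where `/bin/sh` is dash, which is why the patch pairs the two changes everywhere it touches a piped command.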