diff --git a/.github/workflows/ansible-lint.yml b/.github/workflows/ansible-lint.yml index 6a070ce47..201678dda 100644 --- a/.github/workflows/ansible-lint.yml +++ b/.github/workflows/ansible-lint.yml @@ -3,11 +3,8 @@ name: Run ansible-lint # Controls when the action will run. on: - # Triggers the workflow on push or pull request events but only for the main branch pull_request: - branches: [main] push: - branches: [main] schedule: - cron: '0 */8 * * *' # Allows you to run this workflow manually from the Actions tab diff --git a/config/ansible-lint.yml b/config/ansible-lint.yml index 9bb28edf0..5eea32934 100644 --- a/config/ansible-lint.yml +++ b/config/ansible-lint.yml @@ -14,11 +14,8 @@ skip_list: - 'command-instead-of-module' - 'command-instead-of-shell' - 'deprecated-local-action' - - 'key-order[task]' - - 'jinja[spacing]' - 'no-free-form' - 'chema[meta]' - - 'name[missing]' - 'var-naming[no-reserved]' - 'var-naming[no-role-prefix]' - 'var-naming[pattern]' @@ -29,15 +26,11 @@ skip_list: - 'yaml[indentation]' - 'yaml[key-duplicates]' - 'yaml[line-length]' - - 'yaml[new-line-at-end-of-file]' - 'yaml[octal-values]' - - 'yaml[trailing-spaces]' - 'yaml[truthy]' - 'name[template]' - 'name[casing]' - 'risky-file-permissions' - - 'risky-shell-pipe' - 'ignore-errors' - 'no-changed-when' - 'fqcn' - - 'args[module]' diff --git a/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml b/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml index 99b8991a5..82c1289ec 100644 --- a/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml +++ b/roles/benchmarking/benchmarking_run_mlperf_ssd/tasks/main.yml @@ -7,24 +7,24 @@ command: oc get nodes -l kubernetes.io/hostname={{ benchmarking_run_mlperf_ssd_node_hostname }} -oname - name: Ensure that the coco dataset PVC exists - command: - oc get pvc/{{ benchmarking_run_mlperf_ssd_dataset_pvc_name }} + shell: | + oc get pvc/{{ benchmarking_run_mlperf_ssd_dataset_pvc_name }} \ -n {{ benchmarking_run_mlperf_ssd_namespace }} - name: Fetch the coco dataset PVC definition (debug) - shell: - oc get pvc/{{ benchmarking_run_mlperf_ssd_pvc_name }} - -n {{ benchmarking_run_mlperf_ssd_namespace }} - -oyaml + shell: | + oc get pvc/{{ benchmarking_run_mlperf_ssd_pvc_name }} \ + -n {{ benchmarking_run_mlperf_ssd_namespace }} \ + -oyaml \ > {{ artifact_extra_logs_dir }}/pvc_coco-dataset.yml - name: Create the entrypoint ConfigMap file - shell: - oc create cm {{ benchmarking_run_mlperf_ssd_entrypoint_cm_name }} - --from-file="{{ benchmarking_mlperf_ssd_entrypoint }}" - -n {{ benchmarking_run_mlperf_ssd_namespace }} - --dry-run=client - -oyaml + shell: | + oc create cm {{ benchmarking_run_mlperf_ssd_entrypoint_cm_name }} \ + --from-file="{{ benchmarking_mlperf_ssd_entrypoint }}" \ + -n {{ benchmarking_run_mlperf_ssd_namespace }} \ + --dry-run=client \ + -oyaml \ > {{ artifact_extra_logs_dir }}/000_configmap_run-mlperf-ssd_entrypoint.yml - name: Create the entrypoint ConfigMap resource @@ -45,7 +45,8 @@ command: oc create -f "{{ artifact_extra_logs_dir }}/001_pod_run-mlperf-ssd.yml" -- block: +- name: Make sure the benchmark completes + block: - name: Wait for the benchmark completion command: oc get pod/{{ benchmarking_run_mlperf_ssd_name }} @@ -63,8 +64,8 @@ always: - name: Store the logs of benchmark execution (for post-processing) - shell: - oc logs pod/{{ benchmarking_run_mlperf_ssd_name }} -n {{ benchmarking_run_mlperf_ssd_namespace }} + shell: | + oc logs pod/{{ benchmarking_run_mlperf_ssd_name }} -n {{ benchmarking_run_mlperf_ssd_namespace }} \ > "{{ artifact_extra_logs_dir }}/pod_run-mlperf-ssd.log" failed_when: false @@ -73,13 +74,13 @@ echo "{{ wait_benchmark_pod_cmd.stdout }}" > "{{ artifact_extra_logs_dir }}/pod_run-mlperf-ssd.status" - name: Store the description of benchmark execution (debug) - shell: - oc describe pod/{{ benchmarking_run_mlperf_ssd_name }} -n {{ benchmarking_run_mlperf_ssd_namespace }} + shell: | + oc describe pod/{{ benchmarking_run_mlperf_ssd_name }} -n {{ benchmarking_run_mlperf_ssd_namespace }} \ > "{{ artifact_extra_logs_dir }}/pod_run-mlperf-ssd.descr" failed_when: false - name: Get average sample rate - shell: + shell: | set -o pipefail; cat "{{ artifact_extra_logs_dir }}/pod_run-mlperf-ssd.log" | grep avg. | tail -n1 | awk '{ print $NF " samples/sec" }' > "{{ artifact_dir }}/benchmarking_run_ssd_sample_rate.log"; cp {{ artifact_dir }}/benchmarking_run_ssd_sample_rate.log {{ artifact_extra_logs_dir }}/benchmarking_run_ssd_sample_rate.log diff --git a/roles/cluster/cluster_capture_environment/tasks/main.yml b/roles/cluster/cluster_capture_environment/tasks/main.yml index 5e007541d..e62ed54fd 100644 --- a/roles/cluster/cluster_capture_environment/tasks/main.yml +++ b/roles/cluster/cluster_capture_environment/tasks/main.yml @@ -1,34 +1,34 @@ - name: Store OpenShift version identifier - shell: + shell: | set -o pipefail; - oc version -o json - | jq --raw-output '.openshiftVersion' + oc version -o json \ + | jq --raw-output '.openshiftVersion' \ > {{ artifact_extra_logs_dir }}/ocp.version - name: Store OpenShift YAML version - shell: - oc version -oyaml + shell: | + oc version -oyaml \ > {{ artifact_extra_logs_dir }}/ocp_version.yml - name: Store OpenShift YAML clusterversion - shell: - oc get clusterversion/version -oyaml + shell: | + oc get clusterversion/version -oyaml \ > {{ artifact_extra_logs_dir }}/ocp_clusterversion.yml # --- - name: Store the OpenShift nodes - shell: - oc get nodes -owide + shell: | + oc get nodes -owide \ > {{ artifact_extra_logs_dir }}/nodes.status; - oc get nodes -oyaml + oc get nodes -oyaml \ > {{ artifact_extra_logs_dir }}/nodes.yaml; - name: Store the OpenShift machines - shell: - oc get machines -n openshift-machine-api -owide + shell: | + oc get machines -n openshift-machine-api -owide \ > {{ artifact_extra_logs_dir }}/machines.status; - oc get machines -n openshift-machine-api -oyaml + oc get machines -n openshift-machine-api -oyaml \ > {{ artifact_extra_logs_dir }}/machines.yaml; # --- @@ -37,8 +37,6 @@ command: git describe HEAD --long --always register: git_version - args: - warn: false # don't warn about using git here - name: Store ci-artifact version from Git copy: @@ -50,8 +48,6 @@ command: git show --no-patch register: git_show - args: - warn: false # don't warn about using git here - name: Store ci-artifact last git commit copy: diff --git a/roles/cluster/cluster_create_osd/tasks/main.yml b/roles/cluster/cluster_create_osd/tasks/main.yml index 91233317d..1e8cf8d79 100644 --- a/roles/cluster/cluster_create_osd/tasks/main.yml +++ b/roles/cluster/cluster_create_osd/tasks/main.yml @@ -117,11 +117,11 @@ oc get nodes > "{{ artifact_extra_logs_dir }}/nodes.status" - name: Set the desired worker node count - command: - ocm edit machinepool - {{ cluster_create_osd_machinepool_name }} - --cluster={{ cluster_create_osd_cluster_name }} - --replicas={{ [2, cluster_create_osd_compute_nodes|int] |max }} + shell: | + ocm edit machinepool \ + {{ cluster_create_osd_machinepool_name }} \ + --cluster={{ cluster_create_osd_cluster_name }} \ + --replicas={{ [2, cluster_create_osd_compute_nodes|int]|max }} - name: Wait for the desired worker node count shell: | diff --git a/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml b/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml index 7ace0b7d3..20d2e6409 100644 --- a/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml +++ b/roles/cluster/cluster_deploy_aws_efs/tasks/aws-efs.yaml @@ -96,11 +96,11 @@ - name: Populate the tags dict set_fact: - tags: "{{ tags | default({}) | combine ({ item.key : item.value }) }}" + tags: "{{ tags | default({}) | combine ({item.key: item.value}) }}" with_items: - - { 'key': 'Name' , 'value': '{{ cluster_name_cmd.stdout }}'} - - { 'key': '{{ cluster_name_tag_cmd.stdout }}' , 'value': 'owned'} - - { 'key': 'Purpose', 'value': ''} + - {'key': 'Name' , 'value': '{{ cluster_name_cmd.stdout }}'} + - {'key': '{{ cluster_name_tag_cmd.stdout }}' , 'value': 'owned'} + - {'key': 'Purpose', 'value': ''} - name: Get the SecurityGroup content amazon.aws.ec2_group_info: diff --git a/roles/cluster/cluster_deploy_ldap/tasks/main.yml b/roles/cluster/cluster_deploy_ldap/tasks/main.yml index 2a20c6783..a7222f2ca 100644 --- a/roles/cluster/cluster_deploy_ldap/tasks/main.yml +++ b/roles/cluster/cluster_deploy_ldap/tasks/main.yml @@ -144,9 +144,12 @@ # Workaround until `ocm` supports the --insecure flag - name: Get the cluster ID - shell: + shell: | + set -o pipefail ocm describe cluster "{{ cluster_deploy_ldap_cluster_name }}" --json | jq .id -r register: cluster_id_cmd + args: + executable: /bin/bash - name: Create the IDP resource manually shell: | @@ -157,6 +160,8 @@ url="https://api.openshift.com/api/clusters_mgmt/v1/clusters/{{ cluster_id_cmd.stdout }}/identity_providers"; cat "{{ cluster_deploy_ldap_ocm_idp }}" | envsubst > /tmp/idp.json ocm post "$url" --body /tmp/idp.json + args: + executable: /bin/bash - name: Get the API URL command: oc whoami --show-server diff --git a/roles/cluster/cluster_deploy_operator/tasks/main.yml b/roles/cluster/cluster_deploy_operator/tasks/main.yml index f54d27053..3012d54ea 100644 --- a/roles/cluster/cluster_deploy_operator/tasks/main.yml +++ b/roles/cluster/cluster_deploy_operator/tasks/main.yml @@ -45,10 +45,10 @@ -n {{ cluster_deploy_operator_catalog_namespace }} - name: Capture the state of the CatalogSource (debug) - shell: - oc get -oyaml CatalogSource/{{ cluster_deploy_operator_catalog }} - -n {{ cluster_deploy_operator_catalog_namespace }} - -oyaml + shell: | + oc get -oyaml CatalogSource/{{ cluster_deploy_operator_catalog }} \ + -n {{ cluster_deploy_operator_catalog_namespace }} \ + -oyaml \ > {{ artifact_extra_logs_dir }}/catalogsource.yml failed_when: false @@ -61,24 +61,24 @@ delay: 30 - name: Save the operator PackageManifest YAML (debug) - shell: - oc get packagemanifests/{{ cluster_deploy_operator_manifest_name }} - -n {{ cluster_deploy_operator_catalog_namespace }} - -oyaml + shell: | + oc get packagemanifests/{{ cluster_deploy_operator_manifest_name }} \ + -n {{ cluster_deploy_operator_catalog_namespace }} \ + -oyaml \ > {{ artifact_extra_logs_dir }}/operator_packagemanifest.yml - name: Store the operator PackageManifest JSON - shell: - oc get packagemanifests/{{ cluster_deploy_operator_manifest_name }} - -n {{ cluster_deploy_operator_catalog_namespace }} - -ojson + shell: | + oc get packagemanifests/{{ cluster_deploy_operator_manifest_name }} \ + -n {{ cluster_deploy_operator_catalog_namespace }} \ + -ojson \ > {{ artifact_extra_logs_dir }}/operator_packagemanifest.json rescue: - name: Capture the Catalog Operator logs (debug) - shell: - oc logs deployment.apps/catalog-operator - -n openshift-operator-lifecycle-manager + shell: | + oc logs deployment.apps/catalog-operator \ + -n openshift-operator-lifecycle-manager \ > {{ artifact_extra_logs_dir }}/catalog_operator.log failed_when: false @@ -86,8 +86,8 @@ debug: msg="The logs of Catalog Operator have been saved in {{ artifact_extra_logs_dir }}/catalog_operator.log" - name: Mark the failure as flake - shell: - echo "Failed because the {{ cluster_deploy_operator_manifest_name }} PackageManifest is not available" + shell: | + echo "Failed because the {{ cluster_deploy_operator_manifest_name }} PackageManifest is not available" \ > "{{ artifact_extra_logs_dir }}/FLAKE" - name: Failed because the operator could not be found in the CatalogSource @@ -231,7 +231,8 @@ - name: Instantiate the Subscription command: oc apply -f "{{ artifact_extra_logs_dir }}/src/002_sub.yml" -- block: +- name: Make sure the InstallPlan is deployed + block: - name: Find the operator InstallPlan command: oc get InstallPlan @@ -276,33 +277,32 @@ fail: msg="ClusterServiceVersion install not successful ({{ operator_csv_phase.stdout }})" when: operator_csv_phase.stdout != "Succeeded" - always: - - name: Store the YAML of the operator CSV that was installed (debug) - shell: - oc get ClusterServiceVersion/{{ operator_csv_name }} - -oyaml - -n "{{ cluster_deploy_operator_namespace }}" - > {{ artifact_extra_logs_dir }}/operator_csv.yml - - name: Store the YAML of the subscription - shell: - oc get -f "{{ artifact_extra_logs_dir }}/src/002_sub.yml" - -oyaml - -n "{{ cluster_deploy_operator_namespace }}" - > {{ artifact_extra_logs_dir }}/operator_sub.yml rescue: - name: Capture the Catalog Operator logs (debug) - shell: + shell: | oc logs deployment.apps/catalog-operator -n openshift-operator-lifecycle-manager > {{ artifact_extra_logs_dir }}/catalog_operator.log failed_when: false - - name: Indicate where the Catalog-operator logs have been saved debug: msg="The logs of Catalog Operator have been saved in {{ artifact_extra_logs_dir }}/catalog_operator.log" - - name: Failed because the operator could not be installed from the CatalogSource fail: msg="Failed because the operator could not be installed from the CatalogSource" + always: + - name: Store the YAML of the operator CSV that was installed (debug) + shell: | + oc get ClusterServiceVersion/{{ operator_csv_name }} \ + -oyaml \ + -n "{{ cluster_deploy_operator_namespace }}" \ + > {{ artifact_extra_logs_dir }}/operator_csv.yml + - name: Store the YAML of the subscription + shell: | + oc get -f "{{ artifact_extra_logs_dir }}/src/002_sub.yml" \ + -oyaml \ + -n "{{ cluster_deploy_operator_namespace }}" \ + > {{ artifact_extra_logs_dir }}/operator_sub.yml + - name: Deploy the operator CustomResource from its ClusterServiceVersion include_tasks: deploy_cr.yml when: cluster_deploy_operator_deploy_cr | bool diff --git a/roles/cluster/cluster_ensure_machineset/tasks/main.yml b/roles/cluster/cluster_ensure_machineset/tasks/main.yml index e83ef253e..b4976b39f 100644 --- a/roles/cluster/cluster_ensure_machineset/tasks/main.yml +++ b/roles/cluster/cluster_ensure_machineset/tasks/main.yml @@ -1,6 +1,7 @@ --- - name: "Check if the cluster already has a {{ machineset_instance_type }} machineset" - shell: + shell: | + set -o pipefail oc get machineset -n openshift-machine-api {% if machineset_name | length > 0 %} -ojson | jq '.items[] | select(.spec.template.spec.providerSpec.value.instanceType=="{{ machineset_instance_type }}" and .metadata.name=="{{ machineset_name }}") | .metadata.name' -r @@ -8,6 +9,8 @@ -o=jsonpath='{.items[?(@.spec.template.spec.providerSpec.value.instanceType=="{{ machineset_instance_type }}")].metadata.name}' {% endif %} register: cluster_has_machineset + args: + executable: /bin/bash - name: Delete the machineset if it is set but has the wrong instance type when: not cluster_has_machineset.stdout and machineset_name | length > 0 diff --git a/roles/cluster/cluster_fill_workernodes/tasks/main.yml b/roles/cluster/cluster_fill_workernodes/tasks/main.yml index 13cd5d334..9e31e92da 100644 --- a/roles/cluster/cluster_fill_workernodes/tasks/main.yml +++ b/roles/cluster/cluster_fill_workernodes/tasks/main.yml @@ -1,12 +1,15 @@ - name: Get the list of the worker nodes - shell: - oc get nodes - -l{{ cluster_fill_workernodes_label_selector }} - -oname + shell: | + set -o pipefail + oc get nodes \ + -l{{ cluster_fill_workernodes_label_selector }} \ + -oname \ | cut -d/ -f2 register: worker_node_names_cmd failed_when: not worker_node_names_cmd.stdout + args: + executable: /bin/bash - name: Create the src artifacts directory file: @@ -21,8 +24,8 @@ include_tasks: fill_node.yaml - name: Store the definition of the nodes - shell: - oc get nodes - -lnode-role.kubernetes.io/worker - -oyaml + shell: | + oc get nodes \ + -lnode-role.kubernetes.io/worker \ + -oyaml \ > "{{ artifact_extra_logs_dir }}/nodes.yaml" diff --git a/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml b/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml index 9c132023f..451c379d2 100644 --- a/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml +++ b/roles/cluster/cluster_set_project_annotation/tasks/cluster_set.yaml @@ -1,5 +1,5 @@ - name: Get the project template if it exists - shell: + shell: | set -o pipefail; oc adm create-bootstrap-project-template -ojson | oc get -n openshift-config -f- -ojson | jq .items[0] > "{{ artifact_extra_logs_dir }}/base_project_template.yaml" @@ -8,7 +8,7 @@ - name: Create the project template if it did not exist when: get_project_template_cmd.rc != 0 - shell: + shell: | oc adm create-bootstrap-project-template -ojson > "{{ artifact_extra_logs_dir }}/base_project_template.yaml" @@ -24,7 +24,7 @@ - name: Remove the annotation when: (cluster_set_project_annotation_value | default('') or '') | length == 0 - shell: + shell: | set -o pipefail; cat "{{ artifact_extra_logs_dir }}/base_project_template.yaml" | jq 'del(.objects[0].metadata.annotations["{{ cluster_set_project_annotation_key }}"])' @@ -55,8 +55,8 @@ echo "- deleting the project $test_project_name ..." oc delete ns "$test_project_name" >/dev/null echo "--> project annotation value: $project_annotation_value" - echo "==> expected value: "'{{ cluster_set_project_annotation_value or "null"}}' - [[ "$project_annotation_value" == '{{ cluster_set_project_annotation_value or "null"}}' ]] + echo "==> expected value: "'{{ cluster_set_project_annotation_value or "null" }}' + [[ "$project_annotation_value" == '{{ cluster_set_project_annotation_value or "null" }}' ]] retries: 120 delay: 5 register: wait_project_template_active diff --git a/roles/cluster/cluster_set_scale/tasks/main.yml b/roles/cluster/cluster_set_scale/tasks/main.yml index 23e08eabc..76ff43744 100644 --- a/roles/cluster/cluster_set_scale/tasks/main.yml +++ b/roles/cluster/cluster_set_scale/tasks/main.yml @@ -48,8 +48,8 @@ register: oc_get_machinesets failed_when: not oc_get_machinesets.stdout -- when: current_replicas_sum != scale - name: Change all {{ machineset_instance_type }} machinesets replicas to have sum {{ scale }} +- name: Change all {{ machineset_instance_type }} machinesets replicas to have sum {{ scale }} + when: current_replicas_sum != scale block: - name: Do not downscale any machinesets other than the first one, unless the user used force block: @@ -80,7 +80,8 @@ oc patch machineset -n openshift-machine-api {{ first_machineset }} --patch '{"spec": {"replicas": {{ scale }} }}' --type merge -- block: +- name: Make sure the machinesets are ready + block: - name: Wait for all machinesets with type {{ machineset_instance_type }} to be ready # This is done by verifying that at the availableReplicas @@ -91,7 +92,8 @@ # See https://docs.openshift.com/container-platform/4.7/rest_api/machine_apis/machineset-machine-openshift-io-v1beta1.html # for more information. # 3. Perform some extra formatting for nicer logging - shell: >- + shell: | + set -o pipefail oc get machinesets -n openshift-machine-api \ {% if machineset_name | length > 0 %} "{{ machineset_name }}" -ojson \ @@ -110,6 +112,12 @@ until: not non_ready_replicas.stdout_lines retries: 120 delay: 30 + args: + executable: /bin/bash + + rescue: + - name: Fail because the cluster machineset creation failed + fail: msg="Failing because cluster machineset creation failed" always: # info about the 'machines' @@ -150,7 +158,4 @@ failed_when: false loop: "{{ oc_get_machinesets.stdout_lines }}" - rescue: - - name: Fail because the cluster machineset creation failed - fail: msg="Failing because cluster machineset creation failed" diff --git a/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml b/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml index c97c8deaa..a7e058fa0 100644 --- a/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml +++ b/roles/codeflare/codeflare_cleanup_appwrappers/tasks/main.yml @@ -1,11 +1,17 @@ - name: List all the AppWrappers in the namespace shell: | + set -o pipefail oc get appwrappers -n {{ codeflare_cleanup_appwrappers_namespace }} -ojson | jq '.items[] | .metadata.name + " ==> "+ .status.state' -r > "{{ artifact_extra_logs_dir }}/appwrappers.status" oc get appwrappers -n {{ codeflare_cleanup_appwrappers_namespace }} -oyaml > "{{ artifact_extra_logs_dir }}/appwrappers.yaml" + args: + executable: /bin/bash - name: Count the AppWrappers in the namespace - shell: + shell: | + set -o pipefail oc get appwrappers -n {{ codeflare_cleanup_appwrappers_namespace }} -oname | wc -l + args: + executable: /bin/bash - name: Create a configmap for the beginning of the test timestamp shell: diff --git a/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml b/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml index a91ca5909..698b57dd3 100644 --- a/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml +++ b/roles/codeflare/codeflare_generate_mcad_load/tasks/main.yml @@ -42,8 +42,11 @@ mode: '0755' - name: Create the namespace if it does not exist - shell: + shell: | + set -o pipefail oc create ns "{{ codeflare_generate_mcad_load_namespace }}" -oyaml --dry-run=client | tee "{{ artifact_extra_logs_dir }}/src/namespace.yaml" | oc apply -f- + args: + executable: /bin/bash - name: Create a configmap for the beginning of the test timestamp shell: diff --git a/roles/entitlement/entitlement_deploy/tasks/main.yml b/roles/entitlement/entitlement_deploy/tasks/main.yml index 2af07d59e..362ba7653 100644 --- a/roles/entitlement/entitlement_deploy/tasks/main.yml +++ b/roles/entitlement/entitlement_deploy/tasks/main.yml @@ -29,31 +29,37 @@ machine_config_role: worker - name: "Deploy RHSM from file '{{ entitlement_rhsm }}'" - shell: + shell: | set -o pipefail; cat "{{ entitlement_mc_rhsm }}" | python3 "{{ entitlement_py_apply }}" BASE64_ENCODED_RHSM_FILE @"{{ entitlement_rhsm }}" | python3 "{{ entitlement_py_apply }}" MACHINE_CONFIG_ROLE "{{ machine_config_role }}" | oc apply -f- + args: + executable: /bin/bash - name: "Deploy the pem and key-pem from file '{{ entitlement_pem }}'" - shell: + shell: | set -o pipefail; cat "{{ entitlement_mc_pem }}" | python3 "{{ entitlement_py_apply }}" BASE64_ENCODED_PEM_FILE @"{{ entitlement_pem }}" | python3 "{{ entitlement_py_apply }}" MACHINE_CONFIG_ROLE "{{ machine_config_role }}" | oc apply -f- + args: + executable: /bin/bash - name: "Deploy the repo CA from file '{{ entitlement_mc_rhsm_ca }}' if requested" + when: entitlement_repo_ca | default('', true) | trim != '' block: - name: Compute the md5sum of the CA file (debug) command: md5sum '{{ entitlement_repo_ca }}' - name: "Deploy the repo CA from file '{{ entitlement_mc_rhsm_ca }}'" - shell: + shell: | set -o pipefail; cat "{{ entitlement_mc_rhsm_ca }}" | python3 "{{ entitlement_py_apply }}" BASE64_ENCODED_RHSM_CA_FILE @"{{ entitlement_repo_ca }}" | python3 "{{ entitlement_py_apply }}" MACHINE_CONFIG_ROLE "{{ machine_config_role }}" | oc apply -f- - when: entitlement_repo_ca | default('', true) | trim != '' + args: + executable: /bin/bash diff --git a/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml b/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml index 4f852a985..fd931988e 100644 --- a/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml +++ b/roles/entitlement/entitlement_test_in_cluster/tasks/main.yml @@ -26,7 +26,8 @@ - name: Delete the entitlement tester Pod if it exists command: oc delete --ignore-not-found=true -f "{{ entitlement_tester_pod }}" -- block: +- name: Make sure the entitlement Pod is created + block: - name: Create the entitlement tester Pod command: oc create -f "{{ entitlement_tester_pod }}" diff --git a/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml b/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml index fa97dc17a..083d958a1 100644 --- a/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml +++ b/roles/entitlement/entitlement_test_wait_deployment/tasks/main.yml @@ -9,7 +9,8 @@ entitlement_retries: "{{ entitlement_nb_wait_retries }}" when: entitlement_test_and_wait == 'yes' -- block: +- name: Make sure the entitlement Pod is deployed + block: - name: Wait for the entitlement Pod to succeed shell: | set -o errexit; diff --git a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml index 049159fdf..5fc04be4d 100644 --- a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml +++ b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_clusterpolicy.yml @@ -10,7 +10,8 @@ retries: 15 delay: 30 -- block: +- name: Make sure the GPU Operator ClusterPolicy is ready + block: - name: Wait for the GPU Operator ClusterPolicy CRD to appear command: oc get crd clusterpolicies.nvidia.com register: has_clusterpolicy_crd @@ -20,14 +21,14 @@ rescue: - name: Inspect the Subscriptions status (debug) - shell: + shell: | (oc get subscriptions.operators.coreos.com -n "{{ deploy_bundle_namespace }}" && oc describe subscriptions.operators.coreos.com/gpu-operator-certified -n "{{ deploy_bundle_namespace }}") > {{ artifact_extra_logs_dir }}/gpu_operator_Subscription.log failed_when: false - name: Get the ClusterServiceVersion status (debug) - shell: + shell: | (oc get ClusterServiceVersion -A && oc describe "{{ gpu_operator_csv_name_cmd.stdout }}" -n "{{ deploy_bundle_namespace }}") > {{ artifact_extra_logs_dir }}/gpu_operator_ClusterServiceVersion.log @@ -37,7 +38,7 @@ fail: msg="Failed because the ClusterPolicy CR cannot be created" - name: Get the clusterpolicy of the GPU Operator from OperatorHub CSV - shell: + shell: | set -o pipefail; oc get "{{ gpu_operator_csv_name_cmd.stdout }}" -n "{{ deploy_bundle_namespace }}" diff --git a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml index 29c8446fc..282ef6230 100644 --- a/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml +++ b/roles/gpu_operator/gpu_operator_deploy_from_operatorhub/tasks/deploy_from_catalog.yml @@ -113,7 +113,8 @@ - name: Instantiate the OperatorHub subscription command: oc create -f "{{ artifact_extra_logs_dir }}/gpu_operator_sub.yml" -- block: +- name: Install the GPU Operator + block: - name: Find the GPU Operator OperatorHub InstallPlan command: oc get InstallPlan diff --git a/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml b/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml index b0f62d319..db0a503a0 100644 --- a/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml +++ b/roles/gpu_operator/gpu_operator_get_csv_version/tasks/main.yml @@ -9,12 +9,20 @@ - name: Get the gpu-operator subscription package name block: - name: Count gpu operator subscription candidates - shell: oc get subscription --all-namespaces -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))] | length' + shell: | + set -o pipefail + oc get subscription --all-namespaces -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))] | length' register: gpu_subscriptions failed_when: gpu_subscriptions.stdout != '1' + args: + executable: /bin/bash - name: Read the package name from the first gpu-operator subscription - shell: oc get subscription -A -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))][0].spec.name' + shell: | + set -o pipefail + oc get subscription -A -ojson | jq '[.items[] | select(.spec.name | test("gpu-operator"))][0].spec.name' register: gpu_operator_subscription_package_name + args: + executable: /bin/bash - name: Ensure that there is a CSV for the GPU Operator command: oc get csv diff --git a/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml b/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml index 3f9cce344..6bc2e9311 100644 --- a/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml +++ b/roles/gpu_operator/gpu_operator_run_gpu_burn/tasks/main.yml @@ -90,16 +90,23 @@ - name: Ensure that no GPU was faulty loop: "{{ gpu_burn_gpu_nodes.stdout_lines }}" - shell: + shell: | + set -o pipefail oc logs pod/gpu-burn-{{ item }} -n {{ gpu_operator_run_gpu_burn_namespace }} | grep FAULTY register: gpu_burn_test_faulty failed_when: gpu_burn_test_faulty.rc == 0 + args: + executable: /bin/bash always: - name: Save the logs of the GPU burn Pods - shell: oc logs pod/gpu-burn-{{ item }} -n {{ gpu_operator_run_gpu_burn_namespace }} | grep -o "[^$(printf '\r')]*$" + shell: | + set -o pipefail + oc logs pod/gpu-burn-{{ item }} -n {{ gpu_operator_run_gpu_burn_namespace }} | grep -o "[^$(printf '\r')]*$" with_items: "{{ gpu_burn_gpu_nodes.stdout_lines }}" failed_when: false + args: + executable: /bin/bash - name: Save the description of the GPU burn Pods shell: oc describe pod/gpu-burn-{{ item }} -n {{ gpu_operator_run_gpu_burn_namespace }} > {{ artifact_extra_logs_dir }}/gpu_burn.{{ item }}.description.txt @@ -124,4 +131,4 @@ - name: Delete the src ConfigMap command: oc --ignore-not-found=true delete configmap gpu-burn-src -n {{ gpu_operator_run_gpu_burn_namespace }} failed_when: false - when: not gpu_operator_run_gpu_burn_keep_resources \ No newline at end of file + when: not gpu_operator_run_gpu_burn_keep_resources diff --git a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml index b90de5a92..e5950f328 100644 --- a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml +++ b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/main.yml @@ -78,7 +78,8 @@ retries: 15 delay: 60 -- block: +- name: Wait for the nodes labeling + block: - name: Wait for the gpu-feature-discovery Pod to label the nodes command: oc get nodes -l nvidia.com/gpu.count -oname register: has_gpu_feature_discovery_labels diff --git a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml index 57e3ac7ac..3cf94ef2b 100644 --- a/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml +++ b/roles/gpu_operator/gpu_operator_wait_deployment/tasks/metrics.yml @@ -1,5 +1,6 @@ --- -- block: +- name: Create the GPU operator requirements + block: - name: Check if the GPU Operator namespace has the openshift.io/cluster-monitoring label shell: set -o pipefail; diff --git a/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml b/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml index 29c7ad151..d3669bb75 100644 --- a/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml +++ b/roles/load_aware/load_aware_deploy_trimaran/tasks/main.yml @@ -3,9 +3,8 @@ oc apply -f {{ user_applications_monitor_config }} - name: Ensure user monitoring enabled - shell: + shell: | set -o pipefail; - oc -n openshift-user-workload-monitoring get pod --no-headers | awk '{ print $3}' register: monitoring_enabled @@ -13,9 +12,12 @@ retries: 20 # until all of the pods are in the running state until: "'Completed' not in monitoring_enabled.stdout and 'Failed' not in monitoring_enabled.stdout and 'Pending' not in monitoring_enabled.stdout" + args: + executable: /bin/bash - name: Get monitoring secret name - shell: + shell: | + set -o pipefail oc get secret -n openshift-user-workload-monitoring | grep prometheus-user-workload-token | head -n 1 @@ -24,18 +26,24 @@ delay: 3 retries: 20 until: "'prometheus-user-workload-token' in monitoring_secret.stdout" + args: + executable: /bin/bash - name: Get Thanos Endpoint - shell: + shell: | + set -o pipefail oc get route thanos-querier -n openshift-monitoring -o json | jq -r '.spec.host' register: thanos_endpoint_cmd + args: + executable: /bin/bash + - name: Format the Thanos Endpoint set_fact: thanos_endpoint: "https://{{ thanos_endpoint_cmd.stdout }}" - name: Checking monitoring token size - shell: + shell: | set -o errexit; set -o pipefail; set -o nounset; @@ -49,6 +57,8 @@ delay: 2 retries: 30 until: token_size.stdout | int > 1000 + args: + executable: /bin/bash - name: Create the src artifacts directory file: @@ -79,7 +89,8 @@ register: deploy_trimaran - name: Ensure Trimaran is Running - shell: + shell: | + set -o pipefail oc get pods -n trimaran | grep "trimaran-scheduler" | awk '{print $3}' @@ -87,6 +98,10 @@ delay: 3 retries: 20 until: trimaran_running.stdout == 'Running' + args: + executable: /bin/bash + + - name: Ensure a pod can be scheduled block: @@ -95,13 +110,16 @@ oc apply -f {{ trimaran_test_pod }} -n trimaran - name: Ensure the Trimaran test pods completes - shell: + shell: | + set -o pipefail oc get pod trimaran-test -n trimaran --no-headers | awk '{print $3}' register: trimaran_test_pod_state delay: 5 retries: 20 until: trimaran_test_pod_state.stdout == 'Completed' + args: + executable: /bin/bash always: - name: Dump trimaran info diff --git a/roles/load_aware/load_aware_scale_test/tasks/main.yml b/roles/load_aware/load_aware_scale_test/tasks/main.yml index 054bb3524..635e56eb2 100644 --- a/roles/load_aware/load_aware_scale_test/tasks/main.yml +++ b/roles/load_aware/load_aware_scale_test/tasks/main.yml @@ -1,6 +1,6 @@ - name: Generate load timeline shell: - python3 {{ load_timeline_generator }} {{ load_aware_scale_test_distribution }} {{ load_aware_scale_test_duration}} {{ load_aware_scale_test_instances }} "{{ artifact_extra_logs_dir }}/schedule_plan.yaml" + python3 {{ load_timeline_generator }} {{ load_aware_scale_test_distribution }} {{ load_aware_scale_test_duration }} {{ load_aware_scale_test_instances }} "{{ artifact_extra_logs_dir }}/schedule_plan.yaml" - name: Run workload and dump stats block: @@ -9,7 +9,8 @@ python3 {{ pod_start_scheduler }} {{ load_aware_scale_test_scheduler }} {{ load_aware_scale_test_namespace }} "{{ artifact_extra_logs_dir }}/schedule_plan.yaml" "{{ artifact_extra_logs_dir }}/schedule_execution.yaml" {{ load_aware_scale_test_sleep_duration }} - name: Wait for workloads to finish - shell: + shell: | + set -o pipefail oc get pods -n {{ load_aware_scale_test_namespace }} --no-headers | awk '{ print $3 }' register: load_aware_workload delay: 60 @@ -20,6 +21,8 @@ and 'Failed' not in load_aware_workload.stdout and 'ContainerCreating' not in load_aware_workload.stdout and 'ImagePullBackOff' not in load_aware_workload.stdout" + args: + executable: /bin/bash always: - name: Dump info about scale test resources diff --git a/roles/local_ci/local_ci_run_multi/tasks/main.yml b/roles/local_ci/local_ci_run_multi/tasks/main.yml index adfcd5272..0295ad158 100644 --- a/roles/local_ci/local_ci_run_multi/tasks/main.yml +++ b/roles/local_ci/local_ci_run_multi/tasks/main.yml @@ -213,9 +213,12 @@ register: success_count_cmd - name: Store the success count - shell: + shell: | + set -o pipefail echo "{{ success_count_cmd.stdout_lines[0] }}/{{ local_ci_run_multi_user_count }}" - | tee "{{ artifact_extra_logs_dir }}/success_count" + | tee "{{ artifact_extra_logs_dir }}/success_count" + args: + executable: /bin/bash # the tasks below will abort the execution in case of problems diff --git a/roles/nfd/nfd_test_wait_gpu/tasks/main.yml b/roles/nfd/nfd_test_wait_gpu/tasks/main.yml index b38cafaa9..1e5c95e9b 100644 --- a/roles/nfd/nfd_test_wait_gpu/tasks/main.yml +++ b/roles/nfd/nfd_test_wait_gpu/tasks/main.yml @@ -9,11 +9,13 @@ nfd_wait_gpu_retries: "{{ nfd_wait_gpu_nb_retries }}" when: nfd_wait_gpu_nodes == 'yes' -- block: +- name: Coverge the GPU nodes creation + block: - name: Wait for the GPU nodes to appear # label list should be in sync with: # https://github.com/NVIDIA/gpu-operator/blob/master/pkg/controller/clusterpolicy/state_manager.go#L26 - shell: + shell: | + set -o pipefail ( oc get nodes -oname --ignore-not-found=false -l feature.node.kubernetes.io/pci-10de.present || oc get nodes -oname --ignore-not-found=false -l feature.node.kubernetes.io/pci-0302_10de.present || oc get nodes -oname --ignore-not-found=false -l feature.node.kubernetes.io/pci-0300_10de.present @@ -22,10 +24,16 @@ until: nfd_gpu_wait.rc == 0 retries: "{{ nfd_wait_gpu_retries }}" delay: 30 + args: + executable: /bin/bash rescue: - name: Get the labels of the worker nodes (debug) - shell: oc get nodes --show-labels --selector='!node-role.kubernetes.io/master' | tr , '\n' + shell: | + set -o pipefail + oc get nodes --show-labels --selector='!node-role.kubernetes.io/master' | tr , '\n' + args: + executable: /bin/bash - name: Failing because no GPU node showed up fail: msg="Failed because no GPU node showed up" diff --git a/roles/ocm/ocm_deploy_addon/tasks/main.yml b/roles/ocm/ocm_deploy_addon/tasks/main.yml index 5ebe520c0..c55375400 100644 --- a/roles/ocm/ocm_deploy_addon/tasks/main.yml +++ b/roles/ocm/ocm_deploy_addon/tasks/main.yml @@ -24,9 +24,9 @@ cluster_id: "{{ cluster_id_cmd.stdout }}" - name: Check if addon is already installed - shell: + shell: | set -o pipefail; - ocm get /api/clusters_mgmt/v1/clusters/{{ cluster_id }}/addons/{{ ocm_deploy_addon_id }} |& jq -r '.kind' || true + ocm get /api/clusters_mgmt/v1/clusters/{{ cluster_id }}/addons/{{ ocm_deploy_addon_id }} |& jq -r '.kind' || true register: ocm_addon_precheck - name: Create ocm addon install payload @@ -36,11 +36,11 @@ mode: 0400 when: '"Error" in ocm_addon_precheck.stdout' -- name: "Install addon {{ ocm_deploy_addon_id }} via OCM API" +- name: Install addon {{ ocm_deploy_addon_id }} via OCM API shell: | set -o pipefail; url="/api/clusters_mgmt/v1/clusters/{{ cluster_id }}/addons" - body="{{ artifact_extra_logs_dir }}/addon_{{ ocm_deploy_addon_id }}.json" + body="{{ artifact_extra_logs_dir }}/addon_{{ ocm_deploy_addon_id }}.json" output=$(ocm post "$url" --body=$body 2>&1); echo "$output" >&2; # for observation echo "$output" | jq -r '.kind' diff --git a/roles/pipelines/pipelines_capture_state/tasks/main.yml b/roles/pipelines/pipelines_capture_state/tasks/main.yml index 08e39e2a2..aabd30d2b 100644 --- a/roles/pipelines/pipelines_capture_state/tasks/main.yml +++ b/roles/pipelines/pipelines_capture_state/tasks/main.yml @@ -10,11 +10,15 @@ namespace: "{% if pipelines_capture_state_namespace | length > 0 %}{{ pipelines_capture_state_namespace }}{% else %}{{ project_name_cmd.stdout }}{% endif %}" - name: Compute the DSP application name - shell: + shell: | + set -o pipefail oc get dspa -oname -n "{{ namespace }}" | head -1 | cut -d/ -f2 register: dspa_name_cmd when: not pipelines_capture_state_dsp_application_name failed_when: not dspa_name_cmd.stdout + args: + executable: /bin/bash + - name: Save the DSP application name set_fact: diff --git a/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml b/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml index 7893b694e..1fe017b1c 100644 --- a/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml +++ b/roles/pipelines/pipelines_run_kfp_notebook/tasks/main.yml @@ -106,11 +106,14 @@ # but Task 'Store the listing of the notebook directory' will catch it earlier - name: Compute the DSP application name - shell: + shell: | + set -o pipefail oc get dspa -oname -n "{{ notebook_namespace }}" | head -1 | cut -d/ -f2 register: dspa_name_cmd when: not pipelines_run_kfp_notebook_dsp_application_name failed_when: not dspa_name_cmd.stdout + args: + executable: /bin/bash - name: Save the DSP application name set_fact: @@ -149,7 +152,7 @@ sa_cluster_role: cluster-admin - name: Create the service account that will be used in the Notebook - shell: + shell: | set -o pipefail; oc create serviceaccount "{{ sa_name }}" -n "{{ notebook_namespace }}" @@ -158,11 +161,11 @@ | oc apply -f- - name: Grant all the privileges to the service account - shell: + shell: | set -o pipefail; oc adm policy add-cluster-role-to-user "{{ sa_cluster_role }}" -n "{{ notebook_namespace }}" - -z "{{ sa_name}}" + -z "{{ sa_name }}" --dry-run=client -oyaml | tee "{{ artifact_extra_logs_dir }}/src/clusterrolebinding.yaml" | oc apply -f- @@ -191,9 +194,9 @@ - name: Run the test notebook block: - name: Create the secret token - shell: + shell: | oc create secret generic "{{ secret_token_name }}" - "--from-literal=token=$(oc create token '{{ sa_name}}' -n '{{ notebook_namespace }}')" + "--from-literal=token=$(oc create token '{{ sa_name }}' -n '{{ notebook_namespace }}')" -n "{{ notebook_namespace }}" register: create_secret_token_cmd failed_when: '"no token is currently in use for this session" in create_secret_token_cmd.stderr' @@ -205,7 +208,7 @@ -n "{{ notebook_namespace }}" - name: Wait for the Notebook Pod to start running - shell: + shell: | set -o pipefail; oc get pod {{ notebook_search_labels }} --ignore-not-found diff --git a/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml b/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml index f317cdc22..ee360ec15 100644 --- a/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml +++ b/roles/rhods/rhods_benchmark_notebook_performance/tasks/main.yml @@ -40,7 +40,7 @@ - name: Save the imagestream tag set_fact: - image_tag: "{% if rhods_benchmark_notebook_performance_imagestream_tag | string %}{{ rhods_benchmark_notebook_performance_imagestream_tag }}{% else %}{{ image_tag_cmd.stdout }}{% endif %}" + image_tag: "{% if rhods_benchmark_notebook_performance_imagestream_tag | string %}{{ rhods_benchmark_notebook_performance_imagestream_tag }}{% else %}{{ image_tag_cmd.stdout }}{% endif %}" - name: Get the image address command: @@ -53,10 +53,13 @@ when: not rhods_benchmark_notebook_performance_use_rhods | bool block: - name: Compute the imagestream filename - shell: + shell: | + set -o pipefail echo -n "{{ rhods_benchmark_notebook_performance_imagestream }}" | sed 's/^s2i-//g' | sed 's/-notebook//g'; echo "-notebook-imagestream.yaml" register: imagestream_filename_cmd + args: + executable: /bin/bash - name: Fetch the imagestream definition from the source repository get_url: diff --git a/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml b/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml index 909225c43..a21647de2 100644 --- a/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml +++ b/roles/rhods/rhods_notebook_ods_ci_scale_test/tasks/main.yml @@ -21,7 +21,7 @@ - name: Update the Exclude tags if necessary set_fact: - rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags: "{% if rhods_notebook_ods_ci_scale_test_only_create_notebooks|bool %}JupyterLabORWait{% else %}{{ rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags }}{% endif %}" + rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags: "{% if rhods_notebook_ods_ci_scale_test_only_create_notebooks | bool %}JupyterLabORWait{% else %}{{ rhods_notebook_ods_ci_scale_test_ods_ci_exclude_tags }}{% endif %}" - name: Define the test environments set_fact: @@ -63,14 +63,20 @@ register: oc_api_url_cmd - name: Get the RHODS CSV name - shell: + shell: | + set -o pipefail oc get csv -oname -n redhat-ods-operator | grep rhods-operator register: rhods_csv_cmd + args: + executable: /bin/bash - name: Get the RHODS version - shell: + shell: | + set -o pipefail oc get {{ rhods_csv_cmd.stdout }} -n redhat-ods-operator -oname | grep rhods-operator | cut -d/ -f2 | cut -d. -f2- register: rhods_version_cmd + args: + executable: /bin/bash - name: Get the Dashboard Product name (to distinguish RHODS from ODH). Currently hardcoded to RHODS. # We'll have to find another way to distinguish RHODS from ODH, this doesn't work anymore: diff --git a/roles/wisdom/wisdom_deploy_model/tasks/main.yml b/roles/wisdom/wisdom_deploy_model/tasks/main.yml index 060186c4f..dd6591b3a 100644 --- a/roles/wisdom/wisdom_deploy_model/tasks/main.yml +++ b/roles/wisdom/wisdom_deploy_model/tasks/main.yml @@ -104,8 +104,12 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_deploy_model_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_deploy_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_deploy_model_namespace }} --dry-run=client -oyaml | oc apply -f - - + shell: | + set -o pipefail + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_deploy_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_deploy_model_namespace }} --dry-run=client -oyaml | oc apply -f - + args: + executable: /bin/bash + - name: Delete the entrypoint ConfigMap if it already exists command: oc delete --ignore-not-found configmap/wisdom-tester-entrypoint -n {{ wisdom_deploy_model_namespace }} @@ -129,17 +133,16 @@ command: oc create -f "{{ artifact_extra_logs_dir }}/src/003_wisdom_tester_pod.yml" -n {{ wisdom_deploy_model_namespace }} # Wait for model to respond without error -- block: - - name: Wait for the wisdom test Pod to terminate - command: - oc get pod/wisdom-tester - -n {{ wisdom_deploy_model_namespace }} - -o custom-columns=:.status.phase - --no-headers - register: tester_wait - until: tester_wait.stdout == "Succeeded" or tester_wait.stdout == "Error" or tester_wait.stdout == "Failed" - retries: 10 - delay: 30 +- name: Wait for the wisdom test Pod to terminate + command: + oc get pod/wisdom-tester + -n {{ wisdom_deploy_model_namespace }} + -o custom-columns=:.status.phase + --no-headers + register: tester_wait + until: tester_wait.stdout == "Succeeded" or tester_wait.stdout == "Error" or tester_wait.stdout == "Failed" + retries: 10 + delay: 30 - name: Save the wisdom test Pod logs shell: oc logs pod/wisdom-tester -n {{ wisdom_deploy_model_namespace }} > {{ artifact_extra_logs_dir }}/wisdom-tester-pod.log diff --git a/roles/wisdom/wisdom_llm_load_test/tasks/main.yml b/roles/wisdom/wisdom_llm_load_test/tasks/main.yml index 04ec4fe2b..7e44b6c8c 100644 --- a/roles/wisdom/wisdom_llm_load_test/tasks/main.yml +++ b/roles/wisdom/wisdom_llm_load_test/tasks/main.yml @@ -13,7 +13,11 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_llm_load_test_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_namespace }} --dry-run=client -oyaml | oc apply -f - + shell: | + set -o pipefail + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_namespace }} --dry-run=client -oyaml | oc apply -f - + args: + executable: /bin/bash # The results need to be pushed to S3 after the run, put S3 credentials in a Secret - name: Delete the S3 push secret for results if it exists diff --git a/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml b/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml index d90196982..b9c18ebf0 100644 --- a/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml +++ b/roles/wisdom/wisdom_llm_load_test_multiplexed/tasks/main.yml @@ -13,7 +13,11 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_llm_load_test_multiplexed_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_multiplexed_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_multiplexed_namespace }} --dry-run=client -oyaml | oc apply -f - + shell: | + set -o pipefail + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_llm_load_test_multiplexed_protos_path }}/* | tr ' ' ,) -n {{ wisdom_llm_load_test_multiplexed_namespace }} --dry-run=client -oyaml | oc apply -f - + args: + executable: /bin/bash # The results need to be pushed to S3 after the run, put S3 credentials in a Secret - name: Delete the S3 push secret for results if it exists diff --git a/roles/wisdom/wisdom_warmup_model/tasks/main.yml b/roles/wisdom/wisdom_warmup_model/tasks/main.yml index b25016b0f..28b35d8d4 100644 --- a/roles/wisdom/wisdom_warmup_model/tasks/main.yml +++ b/roles/wisdom/wisdom_warmup_model/tasks/main.yml @@ -20,8 +20,12 @@ command: oc delete secret/wisdom-protos --ignore-not-found -n {{ wisdom_warmup_model_namespace }} - name: Create the wisdom protos Secret - shell: oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_warmup_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_warmup_model_namespace }} --dry-run=client -oyaml | oc apply -f - - + shell: | + set -o pipefail + oc create secret generic wisdom-protos --from-file=$(echo {{ wisdom_warmup_model_protos_path }}/* | tr ' ' ,) -n {{ wisdom_warmup_model_namespace }} --dry-run=client -oyaml | oc apply -f - + args: + executable: /bin/bash + - name: Apply the warmup Pod template template: src: "{{ wisdom_warmup_pod_template }}"