Watsonx: keep working on the scale test (openshift-psap#19)
kpouget authored Aug 28, 2023
2 parents 56a37fe + d5ff011 commit 8d9085c
Showing 17 changed files with 240 additions and 186 deletions.
49 changes: 46 additions & 3 deletions roles/watsonx_serving/watsonx_serving_deploy_model/tasks/main.yml
@@ -94,24 +94,67 @@

- name: Prepare the InferenceService
block:
- name: Wait for the InferenceService Pod to appear
command:
oc get pod
-oname
-lserving.kserve.io/inferenceservice={{ watsonx_serving_deploy_model_inference_service_name }}
-n {{ watsonx_serving_deploy_model_namespace }}
register: inference_service_pod_name
# wait 5 minutes
retries: 30
delay: 10
until: inference_service_pod_name.stdout | length > 0

- name: Wait for the InferenceService Pod to be scheduled
command:
oc get pod
-ojsonpath={.items[0].spec.nodeName}
-lserving.kserve.io/inferenceservice={{ watsonx_serving_deploy_model_inference_service_name }}
-n {{ watsonx_serving_deploy_model_namespace }}
register: inference_service_pod_nodename
# wait 1 minute
retries: 6
delay: 10
until: inference_service_pod_nodename.stdout | length > 0

- name: Wait for the InferenceService to be loaded
shell:
set -o pipefail;
oc get -f "{{ artifact_extra_logs_dir }}/src/inference_service.yaml"
-ojsonpath={.status.modelStatus.states.targetModelState}
register: inference_service_state_cmd
# wait 20 minutes
retries: 240
delay: 5
# wait 15 minutes
retries: 90
delay: 10
until: inference_service_state_cmd.stdout == "Loaded"

- name: Capture the state of the InferenceService Pod resource
shell:
oc get pod
-lserving.kserve.io/inferenceservice={{ watsonx_serving_deploy_model_inference_service_name }}
-owide

- name: Save timestamp
shell: |
NAME=inference-service-loaded
oc create configmap $NAME -n {{ watsonx_serving_deploy_model_namespace }}
oc label cm/$NAME topsail.time-tracking=yes -n {{ watsonx_serving_deploy_model_namespace }}
always:
- name: Capture the state of the InferenceService Pod resource
shell:
oc get pod
-lserving.kserve.io/inferenceservice={{ watsonx_serving_deploy_model_inference_service_name }}
-owide
-n {{ watsonx_serving_deploy_model_namespace }}
> {{ artifact_extra_logs_dir }}/artifacts/pod.status;
oc describe pod
-lserving.kserve.io/inferenceservice={{ watsonx_serving_deploy_model_inference_service_name }}
-n {{ watsonx_serving_deploy_model_namespace }}
> {{ artifact_extra_logs_dir }}/artifacts/pod.desc
ignore_errors: true

- name: Capture the state of the InferenceService resource
shell:
oc get -f "{{ artifact_extra_logs_dir }}/src/inference_service.yaml"
@@ -0,0 +1,19 @@
# Auto-generated file, do not edit manually ...
# Toolbox generate command: repo generate_ansible_default_settings
# Source component: Watsonx_Serving.validate_model

# a list of names of the inference services to validate
# Mandatory value
watsonx_serving_validate_model_inference_service_names:

# the model-id to pass to the inference service
# Mandatory value
watsonx_serving_validate_model_model_id:

# the data to pass to the model query
# Mandatory value
watsonx_serving_validate_model_query_data:

# the namespace in which the Serving stack was deployed. If empty, use the current project.
watsonx_serving_validate_model_namespace:

Empty file.
@@ -0,0 +1,3 @@
---
dependencies:
- role: check_deps
@@ -0,0 +1,6 @@
---
- name: Validate the models
loop: "{{ watsonx_serving_validate_model_inference_service_names }}"
loop_control:
loop_var: watsonx_serving_validate_model_inference_service_name
include_tasks: validate_model.yaml
@@ -0,0 +1,59 @@
---
- name: Ensure that GRPCurl is available
shell:
which grpcurl

- name: Create the artifact directory
file:
path: "{{ artifact_extra_logs_dir }}/{{ watsonx_serving_validate_model_inference_service_name }}"
state: directory
mode: '0755'

- name: Get the name of the KSVC
shell:
set -o pipefail;
oc get ksvc
-lserving.kserve.io/inferenceservice={{ watsonx_serving_validate_model_inference_service_name }}
-n {{ watsonx_serving_validate_model_namespace }}
-ojsonpath='{.items[0].status.url}'
| sed 's|https://||'
register: ksvc_hostname_cmd


- name: Wait for the model to answer successfully
shell: |
set -o pipefail
GRPCURL_DATA=$(echo "{{ watsonx_serving_validate_model_query_data }}" | sed "s/'/\"/g")
grpcurl \
-insecure \
-d "$GRPCURL_DATA" \
-H "mm-model-id: {{ watsonx_serving_validate_model_model_id }}" \
{{ ksvc_hostname_cmd.stdout }}:443 \
caikit.runtime.Nlp.NlpService/TextGenerationTaskPredict \
> {{ artifact_extra_logs_dir }}/{{ watsonx_serving_validate_model_inference_service_name }}/TextGenerationTaskPredict.answer
register: grpc_working_cmd
until: grpc_working_cmd.rc == 0
retries: 600
delay: 1

- name: Save the number of attempts
local_action:
copy content={{ grpc_working_cmd }} dest={{ artifact_extra_logs_dir }}/{{ watsonx_serving_validate_model_inference_service_name }}/attempts.json

- name: Prettify the file
shell: |
set -o pipefail
content=$(cat "{{ artifact_extra_logs_dir }}/{{ watsonx_serving_validate_model_inference_service_name }}/attempts.json")
echo "$content" | jq > "{{ artifact_extra_logs_dir }}/{{ watsonx_serving_validate_model_inference_service_name }}/attempts.json"
- name: Query the model 2nd endpoint
shell: |
set -o pipefail
GRPCURL_DATA=$(echo "{{ watsonx_serving_validate_model_query_data }}" | sed "s/'/\"/g")
grpcurl \
-insecure \
-d "$GRPCURL_DATA" \
-H "mm-model-id: {{ watsonx_serving_validate_model_model_id }}" \
{{ ksvc_hostname_cmd.stdout }}:443 \
caikit.runtime.Nlp.NlpService/ServerStreamingTextGenerationTaskPredict \
> {{ artifact_extra_logs_dir }}/{{ watsonx_serving_validate_model_inference_service_name }}/ServerStreamingTextGenerationTaskPredict.answer
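The sed "s/'/\"/g" step above exists because the query payload travels through YAML and Jinja single-quoted, while grpcurl needs valid JSON, which requires double quotes. A sketch of the same transformation and call, assuming grpcurl is installed; the prompt, host, and model id below are placeholders, not values from this commit:

import json
import subprocess

query_data = "{'text': 'At what temperature does liquid nitrogen boil?'}"

# Swap single quotes for double quotes so the payload parses as JSON,
# mirroring the sed "s/'/\"/g" step in the tasks above.
grpcurl_data = query_data.replace("'", '"')
json.loads(grpcurl_data)  # fail fast if the swap did not produce valid JSON

subprocess.run([
    "grpcurl", "-insecure",
    "-d", grpcurl_data,
    "-H", "mm-model-id: example-model-id",  # placeholder model id
    "example-isvc.apps.example.com:443",    # placeholder ksvc hostname
    "caikit.runtime.Nlp.NlpService/TextGenerationTaskPredict",
], check=True)

Like the sed it mirrors, the quote swap breaks if the prompt itself contains an apostrophe, so the configured query data must avoid them.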
Empty file.
8 changes: 4 additions & 4 deletions testing/common/prepare_user_pods.py
@@ -59,19 +59,19 @@ def prepare_base_image_container(namespace):
run.run(f"./run_toolbox.py from_config utils build_push_image --prefix extended_image")


def compute_driver_node_requirement():
def compute_driver_node_requirement(user_count):
# must match 'roles/local_ci/local_ci_run_multi/templates/job.yaml.j2'
kwargs = dict(
cpu = 0.250,
memory = 2,
machine_type = config.ci_artifacts.get_config("clusters.driver.compute.machineset.type"),
user_count = config.ci_artifacts.get_config("tests.scale.user_count"),
user_count = user_count,
)

return sizing.main(**kwargs)


def prepare_user_pods(namespace):
def prepare_user_pods(namespace, user_count):
config.ci_artifacts.set_config("base_image.namespace", namespace)

service_account = config.ci_artifacts.get_config("base_image.user.service_account")
@@ -97,7 +97,7 @@ def prepare_user_pods(namespace):
nodes_count = config.ci_artifacts.get_config("clusters.driver.compute.machineset.count")
extra = ""
if nodes_count is None:
node_count = compute_driver_node_requirement()
node_count = compute_driver_node_requirement(user_count)

extra = f"--extra '{{scale: {node_count}}}'"

@@ -27,7 +27,7 @@ local_ci run/notebooks:
pr_number: null # set at runtime
pr_config: null # set at runtime if required

local_ci run/pipelines:
local_ci run/rhods:
namespace: "{{ base_image.namespace }}"
istag: "{{ base_image.imagestream }}:{{ base_image.repo.ref }}"
service_account: "{{ base_image.user.service_account }}"
@@ -44,36 +44,3 @@ local_ci run/pipelines:
ci_command: SET_AT_RUNTIME
pr_number: null # set at runtime
pr_config: null # set at runtime if required

local_ci run/codeflare:
namespace: "{{ base_image.namespace }}"
istag: "{{ base_image.imagestream }}:{{ base_image.repo.ref }}"
service_account: "{{ base_image.user.service_account }}"
test_name: SET_AT_RUNTIME
test_args: SET_AT_RUNTIME

secret_name: "psap-ods-secret"
secret_env_key: "{{ secrets['psap-ods-secret'].env_key }}"

export_test_run_identifier: SET_AT_RUNTIME
export_bucket_name: rhods-baremetal-results

ci_command: SET_AT_RUNTIME
pr_number: null # set at runtime
pr_config: null # set at runtime if required

local_ci run/load-aware:
namespace: "{{ base_image.namespace }}"
istag: "{{ base_image.imagestream }}:{{ base_image.repo.ref }}"
service_account: "{{ base_image.user.service_account }}"
test_name: SET_AT_RUNTIME
test_args: SET_AT_RUNTIME
secret_name: "psap-ods-secret"
secret_env_key: "{{ secrets['psap-ods-secret'].env_key }}"

export_test_run_identifier: SET_AT_RUNTIME
export_bucket_name: rhods-baremetal-results

ci_command: SET_AT_RUNTIME
pr_number: null # set at runtime
pr_config: null # set at runtime if required
76 changes: 10 additions & 66 deletions testing/utils/local-ci/config.yaml
@@ -13,24 +13,6 @@ secrets:
env_key: PSAP_ODS_SECRET_PATH

workloads:
notebook-burst:
command_group: notebooks
test_args:
- icelake
- notebook_test_burst
steps:
test:
command: notebooks test run_tests_and_plots

notebook-gating:
command_group: notebooks
test_args:
- icelake
- notebook_gating_test
steps:
test:
command: notebooks test run_tests_and_plots

notebook-scale-test:
command_group: notebooks
test_args:
@@ -44,53 +26,8 @@ workloads:
command: utils gather-extra
always_run: true

notebook-just-scale-test:
command_group: notebooks
test_args:
- icelake_notebook_test
steps:
test:
command: notebooks test run_tests_and_plots

notebook-scale-test-light:
command_group: notebooks
test_args:
- notebooks_light
- notebook_test
steps:
prepare:
command: notebooks test cleanup_rhods
test:
command: notebooks test run_tests_and_plots

notebook-scale-test-scaleup:
command_group: notebooks
test_args:
- icelake_notebook_test_scaleup
steps:
cleanup:
command: notebooks test cleanup_rhods
test:
command: notebooks test run_tests_and_plots
gather:
command: utils gather-extra
always_run: true

notebook-scale-test-burst:
command_group: notebooks
test_args:
- icelake_notebook_test_burst
steps:
cleanup:
command: notebooks test cleanup_rhods
test:
command: notebooks test run_tests_and_plots
gather:
command: utils gather-extra
always_run: true

pipelines:
command_group: pipelines
command_group: rhods
test_args:
- icelake
steps:
@@ -105,15 +42,22 @@
always_run: true

codeflare-light:
command_group: codeflare
command_group: rhods
test_args:
steps:
test:
command: codeflare test test_ci

load-aware-light:
command_group: load-aware
command_group: rhods
test_args:
steps:
test:
command: load-aware test test_ci

watsonx-serving:
command_group: rhods
test_args:
steps:
test:
command: watsonx-serving test test_ci
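Each workload above names a command_group and an ordered list of steps; a step flagged always_run: true still executes after an earlier step fails, which is how the gather steps collect logs from broken runs. A hedged sketch of that dispatch rule, not the actual local-ci runner:

def run_command(command):
    # Stand-in: the real runner shells out to the toolbox command here.
    print("would run:", command)
    return True

def run_workload(steps):
    # Run steps in order; always_run steps execute even after a failure.
    failed = False
    for name, step in steps.items():
        if failed and not step.get("always_run", False):
            continue  # skip ordinary steps once something has failed
        ok = run_command(step["command"])
        failed = failed or not ok

run_workload({
    "test": {"command": "watsonx-serving test test_ci"},
    "gather": {"command": "utils gather-extra", "always_run": True},
})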
11 changes: 9 additions & 2 deletions testing/watsonx-serving/command_args.yml.j2
@@ -114,7 +114,7 @@ cluster deploy_minio_s3_server:
#

local_ci run_multi/scale:
user_count: "{{ tests.scale.user_count }}"
user_count: "{{ tests.scale.namespace_count }}"
namespace: "{{ base_image.namespace }}"
istag: "{{ base_image.imagestream }}:{{ base_image.extend.tag }}"
service_account: "{{ base_image.user.service_account }}"
@@ -131,7 +131,7 @@ local_ci run_multi/scale:
minio_secret_key_key: s3_ldap.passwords

sleep_factor: {{ tests.scale.sleep_factor }}
user_batch_size: {{ tests.scale.user_batch_size }}
user_batch_size: 1

git_pull: null #refs/pull/716/merge
capture_prom_db: "{{ tests.capture_prom }}"
@@ -150,3 +150,10 @@ watsonx_serving deploy_model:

inference_service_name: {{ watsonx_serving.inference_service.name }}
storage_uri: {{ watsonx_serving.inference_service.storage_uri }}

watsonx_serving validate_model:
namespace: {{ tests.scale.namespace }}
inference_service_names: [{{ watsonx_serving.inference_service.name }}]

model_id: {{ watsonx_serving.model.id }}
query_data: {{ watsonx_serving.model.query_data }}
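command_args.yml.j2 is a Jinja2 template: at run time the tests.* and watsonx_serving.* settings are substituted to produce the final toolbox arguments. A minimal sketch of that rendering with the jinja2 package; the config values below are illustrative, not taken from this commit:

import jinja2

template = jinja2.Template("""\
watsonx_serving validate_model:
  namespace: {{ tests.scale.namespace }}
  inference_service_names: [{{ watsonx_serving.inference_service.name }}]

  model_id: {{ watsonx_serving.model.id }}
  query_data: {{ watsonx_serving.model.query_data }}
""")

settings = {  # illustrative values; the real ones live in the test's config.yaml
    "tests": {"scale": {"namespace": "watsonx-scale-test"}},
    "watsonx_serving": {
        "inference_service": {"name": "example-isvc"},
        "model": {
            "id": "example-model-id",
            "query_data": "{'text': 'Hello'}",
        },
    },
}
print(template.render(**settings))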