Watsonx: keep working on the scale test (openshift-psap#19)
kpouget authored Aug 28, 2023
2 parents 56a37fe + d5ff011 commit 8d9085c
Showing 17 changed files with 240 additions and 186 deletions.
49 changes: 46 additions & 3 deletions roles/watsonx_serving/watsonx_serving_deploy_model/tasks/main.yml
@@ -94,24 +94,67 @@

- name: Prepare the InferenceService
block:
- name: Wait for the InferenceService Pod to appear
command:
oc get pod
-oname
-lserving.kserve.io/inferenceservice={{ watsonx_serving_deploy_model_inference_service_name }}
-n {{ watsonx_serving_deploy_model_namespace }}
register: inference_service_pod_name
# wait 5 minutes
retries: 30
delay: 10
until: inference_service_pod_name.stdout | length > 0

- name: Wait for the InferenceService Pod to be scheduled
command:
oc get pod
-ojsonpath={.items[0].spec.nodeName}
-lserving.kserve.io/inferenceservice={{ watsonx_serving_deploy_model_inference_service_name }}
-n {{ watsonx_serving_deploy_model_namespace }}
register: inference_service_pod_nodename
# wait 1 minute
retries: 6
delay: 10
until: inference_service_pod_nodename.stdout | length > 0

- name: Wait for the InferenceService to be loaded
shell:
set -o pipefail;
oc get -f "{{ artifact_extra_logs_dir }}/src/inference_service.yaml"
-ojsonpath={.status.modelStatus.states.targetModelState}
register: inference_service_state_cmd
# wait 20 minutes
retries: 240
delay: 5
# wait 15 minutes
retries: 90
delay: 10
until: inference_service_state_cmd.stdout == "Loaded"

- name: Capture the state of the InferenceService Pod resource
shell:
oc get pod
-lserving.kserve.io/inferenceservice={{ watsonx_serving_deploy_model_inference_service_name }}
-owide

- name: Save timestamp
shell: |
NAME=inference-service-loaded
oc create configmap $NAME -n {{ watsonx_serving_deploy_model_namespace }}
oc label cm/$NAME topsail.time-tracking=yes -n {{ watsonx_serving_deploy_model_namespace }}
always:
- name: Capture the state of the InferenceService Pod resource
shell:
oc get pod
-lserving.kserve.io/inferenceservice={{ watsonx_serving_deploy_model_inference_service_name }}
-owide
-n {{ watsonx_serving_deploy_model_namespace }}
> {{ artifact_extra_logs_dir }}/artifacts/pod.status;
oc describe pod
-lserving.kserve.io/inferenceservice={{ watsonx_serving_deploy_model_inference_service_name }}
-n {{ watsonx_serving_deploy_model_namespace }}
> {{ artifact_extra_logs_dir }}/artifacts/pod.desc
ignore_errors: true

- name: Capture the state of the InferenceService resource
shell:
oc get -f "{{ artifact_extra_logs_dir }}/src/inference_service.yaml"
@@ -0,0 +1,19 @@
# Auto-generated file, do not edit manually ...
# Toolbox generate command: repo generate_ansible_default_settings
# Source component: Watsonx_Serving.validate_model

# a list of names of the inference services to validate
# Mandatory value
watsonx_serving_validate_model_inference_service_names:

# the model-id to pass to the inference service
# Mandatory value
watsonx_serving_validate_model_model_id:

# the data to pass to the model query
# Mandatory value
watsonx_serving_validate_model_query_data:

# the namespace in which the Serving stack was deployed. If empty, use the current project.
watsonx_serving_validate_model_namespace:

Empty file.
@@ -0,0 +1,3 @@
---
dependencies:
- role: check_deps
@@ -0,0 +1,6 @@
---
- name: Validate the models
loop: "{{ watsonx_serving_validate_model_inference_service_names }}"
loop_control:
loop_var: watsonx_serving_validate_model_inference_service_name
include_tasks: validate_model.yaml
@@ -0,0 +1,59 @@
---
- name: Ensure that GRPCurl is available
shell:
which grpcurl

- name: Create the artifact directory
file:
path: "{{ artifact_extra_logs_dir }}/{{ watsonx_serving_validate_model_inference_service_name }}"
state: directory
mode: '0755'

- name: Get the name of the KSVC
shell:
set -o pipefail;
oc get ksvc
-lserving.kserve.io/inferenceservice={{ watsonx_serving_validate_model_inference_service_name }}
-n {{ watsonx_serving_validate_model_namespace }}
-ojsonpath='{.items[0].status.url}'
| sed 's|https://||'
register: ksvc_hostname_cmd


- name: Wait for the model to answer successfully
shell: |
set -o pipefail
GRPCURL_DATA=$(echo "{{ watsonx_serving_validate_model_query_data }}" | sed "s/'/\"/g")
grpcurl \
-insecure \
-d "$GRPCURL_DATA" \
-H "mm-model-id: {{ watsonx_serving_validate_model_model_id }}" \
{{ ksvc_hostname_cmd.stdout }}:443 \
caikit.runtime.Nlp.NlpService/TextGenerationTaskPredict \
> {{ artifact_extra_logs_dir }}/{{ watsonx_serving_validate_model_inference_service_name }}/TextGenerationTaskPredict.answer
register: grpc_working_cmd
until: grpc_working_cmd.rc == 0
retries: 600
delay: 1

- name: Save the number of attempts
local_action:
copy content={{ grpc_working_cmd }} dest={{ artifact_extra_logs_dir }}/{{ watsonx_serving_validate_model_inference_service_name }}/attempts.json

- name: Prettify the file
shell: |
set -o pipefail
content=$(cat "{{ artifact_extra_logs_dir }}/{{ watsonx_serving_validate_model_inference_service_name }}/attempts.json")
echo "$content" | jq > "{{ artifact_extra_logs_dir }}/{{ watsonx_serving_validate_model_inference_service_name }}/attempts.json"
- name: Query the model 2nd endpoint
shell: |
set -o pipefail
GRPCURL_DATA=$(echo "{{ watsonx_serving_validate_model_query_data }}" | sed "s/'/\"/g")
grpcurl \
-insecure \
-d "$GRPCURL_DATA" \
-H "mm-model-id: {{ watsonx_serving_validate_model_model_id }}" \
{{ ksvc_hostname_cmd.stdout }}:443 \
caikit.runtime.Nlp.NlpService/ServerStreamingTextGenerationTaskPredict \
> {{ artifact_extra_logs_dir }}/{{ watsonx_serving_validate_model_inference_service_name }}/ServerStreamingTextGenerationTaskPredict.answer
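The sed "s/'/\"/g" step above exists because the query payload travels through YAML and Jinja single-quoted, while grpcurl needs valid JSON, which requires double quotes. A sketch of the same transformation and call, assuming grpcurl is installed; the prompt, host, and model id below are placeholders, not values from this commit:

import json
import subprocess

query_data = "{'text': 'At what temperature does liquid nitrogen boil?'}"

# Swap single quotes for double quotes so the payload parses as JSON,
# mirroring the sed "s/'/\"/g" step in the tasks above.
grpcurl_data = query_data.replace("'", '"')
json.loads(grpcurl_data)  # fail fast if the swap did not produce valid JSON

subprocess.run([
    "grpcurl", "-insecure",
    "-d", grpcurl_data,
    "-H", "mm-model-id: example-model-id",  # placeholder model id
    "example-isvc.apps.example.com:443",    # placeholder ksvc hostname
    "caikit.runtime.Nlp.NlpService/TextGenerationTaskPredict",
], check=True)

Like the sed it mirrors, the quote swap breaks if the prompt itself contains an apostrophe, so the configured query data must avoid them.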
Empty file.
8 changes: 4 additions & 4 deletions testing/common/prepare_user_pods.py
@@ -59,19 +59,19 @@ def prepare_base_image_container(namespace):
run.run(f"./run_toolbox.py from_config utils build_push_image --prefix extended_image")


def compute_driver_node_requirement():
def compute_driver_node_requirement(user_count):
# must match 'roles/local_ci/local_ci_run_multi/templates/job.yaml.j2'
kwargs = dict(
cpu = 0.250,
memory = 2,
machine_type = config.ci_artifacts.get_config("clusters.driver.compute.machineset.type"),
user_count = config.ci_artifacts.get_config("tests.scale.user_count"),
user_count = user_count,
)

return sizing.main(**kwargs)


def prepare_user_pods(namespace):
def prepare_user_pods(namespace, user_count):
config.ci_artifacts.set_config("base_image.namespace", namespace)

service_account = config.ci_artifacts.get_config("base_image.user.service_account")
@@ -97,7 +97,7 @@ def prepare_user_pods(namespace):
nodes_count = config.ci_artifacts.get_config("clusters.driver.compute.machineset.count")
extra = ""
if nodes_count is None:
node_count = compute_driver_node_requirement()
node_count = compute_driver_node_requirement(user_count)

extra = f"--extra '{{scale: {node_count}}}'"

@@ -27,7 +27,7 @@ local_ci run/notebooks:
pr_number: null # set at runtime
pr_config: null # set at runtime if required

local_ci run/pipelines:
local_ci run/rhods:
namespace: "{{ base_image.namespace }}"
istag: "{{ base_image.imagestream }}:{{ base_image.repo.ref }}"
service_account: "{{ base_image.user.service_account }}"
@@ -44,36 +44,3 @@ local_ci run/pipelines:
ci_command: SET_AT_RUNTIME
pr_number: null # set at runtime
pr_config: null # set at runtime if required

local_ci run/codeflare:
namespace: "{{ base_image.namespace }}"
istag: "{{ base_image.imagestream }}:{{ base_image.repo.ref }}"
service_account: "{{ base_image.user.service_account }}"
test_name: SET_AT_RUNTIME
test_args: SET_AT_RUNTIME

secret_name: "psap-ods-secret"
secret_env_key: "{{ secrets['psap-ods-secret'].env_key }}"

export_test_run_identifier: SET_AT_RUNTIME
export_bucket_name: rhods-baremetal-results

ci_command: SET_AT_RUNTIME
pr_number: null # set at runtime
pr_config: null # set at runtime if required

local_ci run/load-aware:
namespace: "{{ base_image.namespace }}"
istag: "{{ base_image.imagestream }}:{{ base_image.repo.ref }}"
service_account: "{{ base_image.user.service_account }}"
test_name: SET_AT_RUNTIME
test_args: SET_AT_RUNTIME
secret_name: "psap-ods-secret"
secret_env_key: "{{ secrets['psap-ods-secret'].env_key }}"

export_test_run_identifier: SET_AT_RUNTIME
export_bucket_name: rhods-baremetal-results

ci_command: SET_AT_RUNTIME
pr_number: null # set at runtime
pr_config: null # set at runtime if required
76 changes: 10 additions & 66 deletions testing/utils/local-ci/config.yaml
@@ -13,24 +13,6 @@ secrets:
env_key: PSAP_ODS_SECRET_PATH

workloads:
notebook-burst:
command_group: notebooks
test_args:
- icelake
- notebook_test_burst
steps:
test:
command: notebooks test run_tests_and_plots

notebook-gating:
command_group: notebooks
test_args:
- icelake
- notebook_gating_test
steps:
test:
command: notebooks test run_tests_and_plots

notebook-scale-test:
command_group: notebooks
test_args:
@@ -44,53 +26,8 @@ workloads:
command: utils gather-extra
always_run: true

notebook-just-scale-test:
command_group: notebooks
test_args:
- icelake_notebook_test
steps:
test:
command: notebooks test run_tests_and_plots

notebook-scale-test-light:
command_group: notebooks
test_args:
- notebooks_light
- notebook_test
steps:
prepare:
command: notebooks test cleanup_rhods
test:
command: notebooks test run_tests_and_plots

notebook-scale-test-scaleup:
command_group: notebooks
test_args:
- icelake_notebook_test_scaleup
steps:
cleanup:
command: notebooks test cleanup_rhods
test:
command: notebooks test run_tests_and_plots
gather:
command: utils gather-extra
always_run: true

notebook-scale-test-burst:
command_group: notebooks
test_args:
- icelake_notebook_test_burst
steps:
cleanup:
command: notebooks test cleanup_rhods
test:
command: notebooks test run_tests_and_plots
gather:
command: utils gather-extra
always_run: true

pipelines:
command_group: pipelines
command_group: rhods
test_args:
- icelake
steps:
@@ -105,15 +42,22 @@
always_run: true

codeflare-light:
command_group: codeflare
command_group: rhods
test_args:
steps:
test:
command: codeflare test test_ci

load-aware-light:
command_group: load-aware
command_group: rhods
test_args:
steps:
test:
command: load-aware test test_ci

watsonx-serving:
command_group: rhods
test_args:
steps:
test:
command: watsonx-serving test test_ci
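Each workload above names a command_group and an ordered list of steps; a step flagged always_run: true still executes after an earlier step fails, which is how the gather steps collect logs from broken runs. A hedged sketch of that dispatch rule, not the actual local-ci runner:

def run_command(command):
    # Stand-in: the real runner shells out to the toolbox command here.
    print("would run:", command)
    return True

def run_workload(steps):
    # Run steps in order; always_run steps execute even after a failure.
    failed = False
    for name, step in steps.items():
        if failed and not step.get("always_run", False):
            continue  # skip ordinary steps once something has failed
        ok = run_command(step["command"])
        failed = failed or not ok

run_workload({
    "test": {"command": "watsonx-serving test test_ci"},
    "gather": {"command": "utils gather-extra", "always_run": True},
})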
11 changes: 9 additions & 2 deletions testing/watsonx-serving/command_args.yml.j2
@@ -114,7 +114,7 @@ cluster deploy_minio_s3_server:
#

local_ci run_multi/scale:
user_count: "{{ tests.scale.user_count }}"
user_count: "{{ tests.scale.namespace_count }}"
namespace: "{{ base_image.namespace }}"
istag: "{{ base_image.imagestream }}:{{ base_image.extend.tag }}"
service_account: "{{ base_image.user.service_account }}"
@@ -131,7 +131,7 @@ local_ci run_multi/scale:
minio_secret_key_key: s3_ldap.passwords

sleep_factor: {{ tests.scale.sleep_factor }}
user_batch_size: {{ tests.scale.user_batch_size }}
user_batch_size: 1

git_pull: null #refs/pull/716/merge
capture_prom_db: "{{ tests.capture_prom }}"
@@ -150,3 +150,10 @@ watsonx_serving deploy_model:

inference_service_name: {{ watsonx_serving.inference_service.name }}
storage_uri: {{ watsonx_serving.inference_service.storage_uri }}

watsonx_serving validate_model:
namespace: {{ tests.scale.namespace }}
inference_service_names: [{{ watsonx_serving.inference_service.name }}]

model_id: {{ watsonx_serving.model.id }}
query_data: {{ watsonx_serving.model.query_data }}
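command_args.yml.j2 is a Jinja2 template: at run time the tests.* and watsonx_serving.* settings are substituted to produce the final toolbox arguments. A minimal sketch of that rendering with the jinja2 package; the config values below are illustrative, not taken from this commit:

import jinja2

template = jinja2.Template("""\
watsonx_serving validate_model:
  namespace: {{ tests.scale.namespace }}
  inference_service_names: [{{ watsonx_serving.inference_service.name }}]

  model_id: {{ watsonx_serving.model.id }}
  query_data: {{ watsonx_serving.model.query_data }}
""")

settings = {  # illustrative values; the real ones live in the test's config.yaml
    "tests": {"scale": {"namespace": "watsonx-scale-test"}},
    "watsonx_serving": {
        "inference_service": {"name": "example-isvc"},
        "model": {
            "id": "example-model-id",
            "query_data": "{'text': 'Hello'}",
        },
    },
}
print(template.render(**settings))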