Commit

WatsonX-Serving: keep working on the scale test (openshift-psap#13)

kpouget committed Aug 24, 2023
2 parents 5589752 + d95a992 commit 690108f
Showing 6 changed files with 126 additions and 59 deletions.
89 changes: 67 additions & 22 deletions roles/cluster/cluster_preload_image/tasks/main.yml
@@ -1,69 +1,114 @@
---
- name: Create the src directory
file:
path: "{{ artifact_extra_logs_dir }}/src"
state: directory
mode: '0755'

- name: Apply the DaemonSet template
template:
src: "{{ cluster_preload_image_ds_template }}"
dest: "{{ artifact_extra_logs_dir }}/preload_daemonset.yaml"
dest: "{{ artifact_extra_logs_dir }}/src/preload_daemonset.yaml"
mode: 0400

- name: Delete the DaemonSet, if it exists
command:
oc delete -f "{{ artifact_extra_logs_dir }}/preload_daemonset.yaml"
oc delete -f "{{ artifact_extra_logs_dir }}/src/preload_daemonset.yaml"
--ignore-not-found

- name: Create the DaemonSet
command:
oc create -f "{{ artifact_extra_logs_dir }}/preload_daemonset.yaml"
- name: Get the number of nodes matching the label
shell:
set -o pipefail;
oc get nodes -oname
-l{{ cluster_preload_image_node_selector_key }}={{ cluster_preload_image_node_selector_value }}
| wc -l
register: number_target_nodes

- name: Fail if there is no target node
fail: msg="The node selector '{{ cluster_preload_image_node_selector_key }}={{ cluster_preload_image_node_selector_value }}' does not match any node"
when: number_target_nodes.stdout == "0"

- name: Preload the image
block:
- name: Get the number of desired Pods
- name: Create the DaemonSet
command:
oc get ds/{{ cluster_preload_image_name }}
'-ojsonpath={.status.desiredNumberScheduled}'
-n {{ cluster_preload_image_namespace }}
register: desiredNumberScheduled_cmd
oc create -f "{{ artifact_extra_logs_dir }}/src/preload_daemonset.yaml"

# ---

- name: Wait for the pods to be scheduled
- name: Wait for the desired number to be populated
command:
oc get ds/{{ cluster_preload_image_name }}
'-ojsonpath={.status.currentNumberScheduled}'
'-ojsonpath={.status.desiredNumberScheduled}'
-n {{ cluster_preload_image_namespace }}
register: desiredNumberScheduled_cmd
retries: 3
delay: 15
until: desiredNumberScheduled_cmd.stdout | default("0") | int > 0

- name: Wait for the pods to be scheduled
shell:
set -o pipefail;
oc get ds/{{ cluster_preload_image_name }}
-ojson
-n {{ cluster_preload_image_namespace }}
register: currentNumberScheduled_cmd
| jq '.status.desiredNumberScheduled - .status.currentNumberScheduled'
register: currentNumberNotScheduled_cmd
retries: 3
delay: 15
until: currentNumberScheduled_cmd.stdout >= desiredNumberScheduled_cmd.stdout
until: currentNumberNotScheduled_cmd.stdout | int == 0

- name: Wait for the pods to be ready
command:
shell:
set -o pipefail;
oc get ds/{{ cluster_preload_image_name }}
'-ojsonpath={.status.numberReady}'
-ojson
-n {{ cluster_preload_image_namespace }}
register: numberReady_cmd
| jq '.status.desiredNumberScheduled - .status.numberReady'
register: currentNumberNotReady_cmd
retries: 20
delay: 30
until: numberReady_cmd.stdout == desiredNumberScheduled_cmd.stdout
until: currentNumberNotReady_cmd.stdout | int == 0

- name: Get the final desired number
command:
oc get ds/{{ cluster_preload_image_name }}
'-ojsonpath={.status.desiredNumberScheduled}'
-n {{ cluster_preload_image_namespace }}
register: finalDesiredNumberScheduled_cmd

- name: Fail if the image has been preloaded on 0 nodes
fail: msg="The node selector '{{ cluster_preload_image_node_selector_key }}={{ cluster_preload_image_node_selector_value }}' did not match any node"
when: desiredNumberScheduled_cmd.stdout == "0"
when: finalDesiredNumberScheduled_cmd.stdout | int == 0

always:
- name: Get the description of the preload Pods
shell:
oc describe pods -l name={{ cluster_preload_image_name }}
-n {{ cluster_preload_image_namespace }}
> "{{ artifact_extra_logs_dir }}/preload_pods.descr"
> "{{ artifact_extra_logs_dir }}/pods.descr"

- name: Get the status of the preload Pods
shell:
oc get pods -l name={{ cluster_preload_image_name }}
-owide
-n {{ cluster_preload_image_namespace }}
> "{{ artifact_extra_logs_dir }}/preload_pods.status"
> "{{ artifact_extra_logs_dir }}/pods.status"

- name: Get the yaml of the daemonset
shell:
oc get ds/{{ cluster_preload_image_name }}
-oyaml
-n {{ cluster_preload_image_namespace }}
> "{{ artifact_extra_logs_dir }}/daemonset.yaml"

- name: Get the status of the daemonset
shell:
oc get ds/{{ cluster_preload_image_name }}
-n {{ cluster_preload_image_namespace }}
> "{{ artifact_extra_logs_dir }}/daemonset.status"

- name: Delete the DaemonSet, if it exists
command:
oc delete -f "{{ artifact_extra_logs_dir }}/preload_daemonset.yaml"
oc delete -f "{{ artifact_extra_logs_dir }}/src/preload_daemonset.yaml" --ignore-not-found
failed_when: false
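
The reworked preload tasks follow a clear flow: count the nodes matching the selector and fail early if none match, create the DaemonSet from the src/ copy of the manifest, poll the DaemonSet status (via jq on the JSON output) until every desired Pod is scheduled and ready, and always collect the Pod and DaemonSet state before deleting the DaemonSet. Below is a minimal standalone shell sketch of that flow, outside Ansible; DS_NAME, NAMESPACE and LABEL are placeholders rather than values from the commit, and the loop counts mirror the retries/delay values above.

#! /bin/bash
# Hedged sketch of the preload flow; the real implementation is the Ansible
# playbook above, with retries/delay/until instead of explicit loops.
set -o errexit -o nounset -o pipefail

DS_NAME=preload-image                 # placeholder name
NAMESPACE=default                     # placeholder namespace
LABEL=example-key=example-value       # placeholder node selector

# Fail early when the node selector matches no node.
nb_nodes=$(oc get nodes -oname -l "$LABEL" | wc -l)
if [[ "$nb_nodes" -eq 0 ]]; then
    echo "ERROR: the node selector '$LABEL' does not match any node" >&2
    exit 1
fi

oc create -f preload_daemonset.yaml

# Wait for the DaemonSet controller to publish the desired Pod count.
for _ in $(seq 3); do
    desired=$(oc get "ds/$DS_NAME" -n "$NAMESPACE" \
                 -ojsonpath='{.status.desiredNumberScheduled}')
    if [[ "${desired:-0}" -gt 0 ]]; then break; fi
    sleep 15
done

# Then poll until every desired Pod is ready ('// 0' only guards against a
# not-yet-populated status field).
for _ in $(seq 20); do
    not_ready=$(oc get "ds/$DS_NAME" -n "$NAMESPACE" -ojson \
                  | jq '.status.desiredNumberScheduled - (.status.numberReady // 0)')
    if [[ "$not_ready" -eq 0 ]]; then break; fi
    sleep 30
done

if [[ "$not_ready" -ne 0 ]]; then
    echo "ERROR: $not_ready Pod(s) still not ready, the image was not preloaded everywhere" >&2
    exit 1
fi

Computing the difference in jq also sidesteps the old until condition, which compared the two jsonpath outputs as strings and therefore ordered numbers lexicographically.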
2 changes: 1 addition & 1 deletion testing/run
@@ -146,7 +146,7 @@ postchecks() {
if [ ! -e "$ARTIFACT_DIR/FAILURES" ]; then
echo "$(date) Test of '$@' succeeded $duration." | tee "$ARTIFACT_DIR/FINISHED"
else
echo "$(date) Test of '$@' failed after $duration." | tee "$ARTIFACT_DIR/FINISHED"
echo "$(date) Test of '$@' failed $duration." | tee "$ARTIFACT_DIR/FINISHED"
fi
fi
}
4 changes: 3 additions & 1 deletion testing/utils/ci_init_configure.sh
@@ -19,7 +19,9 @@ else
elif [[ ! -d "$PSAP_ODS_SECRET_PATH" ]]; then
_error "the PSAP_ODS_SECRET_PATH does not point to a valid directory"
fi
sha256sum "$PSAP_ODS_SECRET_PATH"/* > "$ARTIFACT_DIR/secrets.sha256sum"
if [[ "${ARTIFACT_DIR:-}" ]]; then
sha256sum "$PSAP_ODS_SECRET_PATH"/* > "$ARTIFACT_DIR/secrets.sha256sum"
fi
fi
if [[ "${CONFIG_DEST_DIR:-}" ]]; then
echo "Using CONFIG_DEST_DIR=$CONFIG_DEST_DIR ..."
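
The new guard in ci_init_configure.sh only writes the checksum file when ARTIFACT_DIR is defined. A minimal sketch of that guard in isolation is shown below; the helper name is made up for illustration, and the point is that "${ARTIFACT_DIR:-}" stays safe even under set -u, while an unset or empty value simply skips the step.

set -o nounset

save_secret_checksums() {   # hypothetical helper, not part of the repo
    local secret_dir=$1
    # "${ARTIFACT_DIR:-}" expands to an empty string instead of aborting under
    # `set -u` when the variable is unset, so the step is skipped gracefully.
    if [[ "${ARTIFACT_DIR:-}" ]]; then
        sha256sum "$secret_dir"/* > "$ARTIFACT_DIR/secrets.sha256sum"
    else
        echo "ARTIFACT_DIR is not set, skipping the secrets checksum artifact"
    fi
}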
70 changes: 43 additions & 27 deletions testing/utils/openshift_clusters/clusters.sh
@@ -9,6 +9,11 @@ set -x
TESTING_UTILS_OCP_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
TESTING_UTILS_DIR="${TESTING_UTILS_OCP_DIR}/.."

if [[ -z "${CI_ARTIFACTS_FROM_CONFIG_FILE:-}" ]]; then
echo "WARNING: CI_ARTIFACTS_FROM_CONFIG_FILE is not set. Commands relying on 'get_config' won't work ..."
export CI_ARTIFACTS_FROM_CONFIG_FILE="/CI_ARTIFACTS_FROM_CONFIG_FILE/not/set"
fi

source "$TESTING_UTILS_DIR/process_ctrl.sh"
source "$TESTING_UTILS_DIR/logging.sh"
source "$TESTING_UTILS_DIR/configure.sh"
@@ -116,41 +121,17 @@ create_clusters() {
local KUBECONFIG_DRIVER="${CONFIG_DEST_DIR}/driver_kubeconfig" # cluster driving the test
local KUBECONFIG_SUTEST="${CONFIG_DEST_DIR}/sutest_kubeconfig" # system under test

keep_cluster() {
local cluster_role=$1
local cluster_region=$2

echo "Keeping the $cluster_role cluster ..."
oc create cm keep-cluster -n default --from-literal=keep=true

local pr_author=$(echo "$JOB_SPEC" | jq -r .refs.pulls[0].author)
local keep_cluster_password_file="$PSAP_ODS_SECRET_PATH/$(get_config secrets.keep_cluster_password_file)"
./run_toolbox.py cluster create_htpasswd_adminuser "$pr_author" "$keep_cluster_password_file"

oc whoami --show-console > "$ARTIFACT_DIR/${cluster_role}_console.link"
cat <<EOF > "$ARTIFACT_DIR/${cluster_role}_oc-login.cmd"
source "\$PSAP_ODS_SECRET_PATH/get_cluster.password"
oc login $(oc whoami --show-server) --insecure-skip-tls-verify --username=$pr_author --password="\$password"
EOF

local cluster_tag=$(oc get machines -n openshift-machine-api -ojsonpath={.items[0].spec.providerSpec.value.tags[0].name} | cut -d/ -f3)

cat <<EOF > "$ARTIFACT_DIR/${cluster_role}_destroy_cluster.cmd"
./run_toolbox.py cluster destroy_ocp $cluster_region $cluster_tag
EOF
}

local ocp_region=$(get_config clusters.create.ocp.region)

# * 'osd' clusters already have their kubeadmin password
# populated during the cluster bring up
# * 'single' clusters already have been modified with the
# keep_cluster call of the sutest cluster.
if [[ "$cluster_type" == "single" ]]; then
KUBECONFIG=$KUBECONFIG_SUTEST keep_cluster sutest "$ocp_region"
KUBECONFIG=$KUBECONFIG_SUTEST keep_cluster sutest
elif [[ "$cluster_type" == "ocp" ]]; then
KUBECONFIG=$KUBECONFIG_SUTEST keep_cluster sutest "$ocp_region"
KUBECONFIG=$KUBECONFIG_DRIVER keep_cluster driver "$ocp_region"
KUBECONFIG=$KUBECONFIG_SUTEST keep_cluster sutest
KUBECONFIG=$KUBECONFIG_DRIVER keep_cluster driver
fi
fi
}
@@ -162,6 +143,30 @@ destroy_clusters() {
process_ctrl::wait_bg_processes
}

keep_cluster() {
local cluster_role=$1

echo "Keeping the $cluster_role cluster ..."
oc create cm keep-cluster -n default --from-literal=keep=true -oyaml --dry-run=client | oc apply -f-

local pr_author=$(echo "$JOB_SPEC" | jq -r .refs.pulls[0].author)
local keep_cluster_password_file="$PSAP_ODS_SECRET_PATH/$(get_config secrets.keep_cluster_password_file)"
./run_toolbox.py cluster create_htpasswd_adminuser "$pr_author" "$keep_cluster_password_file"

oc whoami --show-console > "$ARTIFACT_DIR/${cluster_role}_console.link"
cat <<EOF > "$ARTIFACT_DIR/${cluster_role}_oc-login.cmd"
source "\$PSAP_ODS_SECRET_PATH/get_cluster.password"
oc login $(oc whoami --show-server) --insecure-skip-tls-verify --username=$pr_author --password="\$password"
EOF

local cluster_region=$(oc get machines -n openshift-machine-api -ojsonpath={.items[0].spec.providerSpec.value.placement.region})
local cluster_tag=$(oc get machines -n openshift-machine-api -ojsonpath={.items[0].spec.providerSpec.value.tags[0].name} | cut -d/ -f3)

cat <<EOF > "$ARTIFACT_DIR/${cluster_role}_destroy_cluster.cmd"
./run_toolbox.py cluster destroy_ocp $cluster_region $cluster_tag
EOF
}

# ---

main() {
@@ -212,6 +217,17 @@ main() {
destroy_clusters
exit 0
;;
"keep")
if [[ -z "${PSAP_ODS_SECRET_PATH:-}" ]]; then
echo "PSAP_ODS_SECRET_PATH is not set, using the default value."
export PSAP_ODS_SECRET_PATH=/run/psap-ods-secret-1
fi
if ! [[ -d $PSAP_ODS_SECRET_PATH ]]; then
echo "ERROR: PSAP_ODS_SECRET_PATH=$PSAP_ODS_SECRET_PATH is not a directory ..."
exit 1
fi
keep_cluster cluster
;;
*)
_error "Unknown action '$action'"
;;
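
Two details of the relocated keep_cluster function stand out: the cluster region is now read from the first Machine object instead of being passed as an argument, and the keep-cluster ConfigMap is created idempotently by rendering it with a client-side dry-run and piping the manifest into oc apply. A minimal sketch of that idempotent-create pattern, using the names that appear in the diff:

# Plain `oc create cm` fails once the ConfigMap exists; rendering the manifest
# with --dry-run=client and applying it makes the step safe to re-run.
oc create configmap keep-cluster -n default \
    --from-literal=keep=true \
    --dry-run=client -oyaml \
  | oc apply -f-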
2 changes: 1 addition & 1 deletion testing/watsonx-serving/prepare_watsonx_serving.py
@@ -42,6 +42,6 @@ def prepare():
if config.ci_artifacts.get_config("clusters.sutest.compute.dedicated"):
# this is required to properly create the namespace used to preload the image
test_namespace = config.ci_artifacts.get_config("tests.scale.namespace")
test_scale.prepare_user_namespace(test_namespace)
test_scale.prepare_user_namespace(test_namespace, register_namespace_smmr=False)

run.run("./run_toolbox.py from_config cluster preload_image --prefix sutest --suffix watsonx-serving-runtime")
18 changes: 11 additions & 7 deletions testing/watsonx-serving/test_scale.py
@@ -59,14 +59,22 @@ def run_test(dry_mode):
run.run(f"./run_toolbox.py from_config local_ci run_multi --suffix scale")


def prepare_user_namespace(namespace):
def prepare_user_namespace(namespace, register_namespace_smmr=True):
if run.run(f'oc get project "{namespace}" -oname 2>/dev/null', check=False).returncode == 0:
logging.warning(f"Project {namespace} already exists.")
(env.ARTIFACT_DIR / "PROJECT_ALREADY_EXISTS").touch()
return

run.run(f'oc new-project "{namespace}" --skip-config-write >/dev/null')

dedicated = config.ci_artifacts.get_config("clusters.sutest.compute.dedicated")
if dedicated:
run.run("./run_toolbox.py from_config cluster set_project_annotation --prefix sutest --suffix scale_test_node_selector", capture_stdout=True)
run.run("./run_toolbox.py from_config cluster set_project_annotation --prefix sutest --suffix scale_test_toleration", capture_stdout=True)

if not register_namespace_smmr:
return

if run.run("""oc patch smmr/default -n istio-system --type=json -p="[{'op': 'add', 'path': '/spec/members/-', 'value': \""""+namespace+"""\"}]" """, check=False).returncode != 0:
smmr_members = run.run("oc get smmr/default -n istio-system -ojsonpath={.spec.members} | jq .[] -r", capture_stdout=True).stdout
if namespace not in smmr_members.split("\n"):
@@ -77,12 +85,8 @@ def prepare_user_namespace(namespace):
logging.warning(f"Namespace '{namespace}' was already in the SMMR members. Continuing.")


run.run(f"oc get smmr/default -n istio-system -oyaml > {env.ARTIFACT_DIR / 'istio-system_smmr-default.yaml'}")

dedicated = config.ci_artifacts.get_config("clusters.sutest.compute.dedicated")
if dedicated:
run.run("./run_toolbox.py from_config cluster set_project_annotation --prefix sutest --suffix scale_test_node_selector")
run.run("./run_toolbox.py from_config cluster set_project_annotation --prefix sutest --suffix scale_test_toleration")
(env.ARTIFACT_DIR / 'artifacts').mkdir(exist_ok=True)
run.run(f"oc get smmr/default -n istio-system -oyaml > {env.ARTIFACT_DIR / 'artifacts' / 'istio-system_smmr-default.yaml'}")


def save_and_create(name, content, namespace):
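
For reference, the SMMR registration that prepare_user_namespace now performs only when register_namespace_smmr is true amounts to: try to append the namespace to the member list, and tolerate a patch failure only if the namespace is already a member. A shell sketch of that logic follows; NAMESPACE and the output path are placeholders, and the Python code above drives the same oc commands through the repo's run helper.

NAMESPACE=scale-test            # placeholder namespace
if ! oc patch smmr/default -n istio-system --type=json \
        -p "[{\"op\": \"add\", \"path\": \"/spec/members/-\", \"value\": \"$NAMESPACE\"}]"; then
    # The patch may fail when the member list already contains the namespace;
    # only treat the failure as fatal if the namespace really is missing.
    members=$(oc get smmr/default -n istio-system -ojsonpath='{.spec.members}' | jq -r '.[]')
    if ! grep -qxF "$NAMESPACE" <<< "$members"; then
        echo "ERROR: failed to add $NAMESPACE to the SMMR members" >&2
        exit 1
    fi
    echo "WARNING: namespace $NAMESPACE was already in the SMMR members, continuing."
fi

# Keep a copy of the resulting SMMR next to the other artifacts.
oc get smmr/default -n istio-system -oyaml > artifacts/istio-system_smmr-default.yaml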
