Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[model server] add metrics smoke tests #83

Merged
merged 37 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
b362382
Create size-labeler.yml
rnetser Dec 18, 2024
3c6a875
Delete .github/workflows/size-labeler.yml
rnetser Dec 18, 2024
ccb63af
Merge branch 'main' of github.com:rnetser/opendatahub-tests
rnetser Dec 24, 2024
da0c898
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Dec 25, 2024
94a82ec
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Dec 26, 2024
c0c82dd
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Dec 27, 2024
5feb447
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Dec 30, 2024
19b9c56
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Dec 31, 2024
e22ac1a
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Dec 31, 2024
56ab9c5
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Dec 31, 2024
5a17f03
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Dec 31, 2024
ef5fe65
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Dec 31, 2024
1875a44
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Jan 1, 2025
840d442
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Jan 2, 2025
c0d4436
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Jan 2, 2025
ba7971a
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Jan 3, 2025
235bf96
add smoke metrics
rnetser Jan 3, 2025
9d22522
add smoke metrics
rnetser Jan 3, 2025
93d63db
add smoke metrics
rnetser Jan 3, 2025
c5c7d41
add smoke metrics
rnetser Jan 3, 2025
a3d3263
add smoke metrics
rnetser Jan 3, 2025
8bf2f58
add smoke metrics
rnetser Jan 3, 2025
fea4e6b
add smoke metrics
rnetser Jan 3, 2025
93e5fff
fix metric fixture and rename var
rnetser Jan 3, 2025
5c97867
fix metric fixture and rename var
rnetser Jan 3, 2025
df86301
fix model name
rnetser Jan 3, 2025
c17d560
fix model name
rnetser Jan 3, 2025
ccc963f
fix model name
rnetser Jan 3, 2025
8eccfbb
fix model name
rnetser Jan 3, 2025
ac4f8e2
add cpu
rnetser Jan 5, 2025
5ed5ae8
add bug id
rnetser Jan 6, 2025
dc2e7de
add clean metrics
rnetser Jan 6, 2025
82f8a40
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Jan 7, 2025
1d0e027
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Jan 7, 2025
bc38cd6
resolve conflicts
rnetser Jan 8, 2025
ae25de9
use new fixtures
rnetser Jan 8, 2025
8b3e71f
Merge branch 'main' of https://github.com/opendatahub-io/opendatahub-…
rnetser Jan 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
31 changes: 31 additions & 0 deletions tests/model_serving/model_server/metrics/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pytest
import requests
from kubernetes.dynamic import DynamicClient
from ocp_utilities.monitoring import Prometheus
from simple_logger.logger import get_logger

from utilities.infra import get_openshift_token


LOGGER = get_logger(name=__name__)


@pytest.fixture(scope="session")
def prometheus(admin_client: DynamicClient) -> Prometheus:
    """Session-scoped Prometheus client that queries the cluster's thanos-querier route."""
    bearer_token = get_openshift_token()
    return Prometheus(
        verify_ssl=False,
        bearer_token=bearer_token,
        client=admin_client,
        resource_name="thanos-querier",
    )


@pytest.fixture(scope="class")
def deleted_metrics(prometheus: Prometheus) -> None:
    """Delete TGI request-metric series so each test class starts from a clean slate.

    Calls the Prometheus TSDB admin ``delete_series`` endpoint for each metric.
    Requires the admin API to be enabled on the target instance.
    """
    # NOTE(review): Prometheus 2.x documents delete_series as POST/PUT only —
    # confirm a GET actually works against this endpoint.
    for metric in ("tgi_request_success", "tgi_request_count"):
        LOGGER.info(f"deleting {metric} metric")
        response = requests.get(
            f"{prometheus.api_url}/api/v1/admin/tsdb/delete_series?match[]={metric}",
            headers=prometheus.headers,
            verify=prometheus.verify_ssl,
        )
        # A silently-failed delete leaves stale series behind and skews the
        # request-count assertions in the tests; surface it in the logs.
        if not response.ok:
            LOGGER.warning(f"Failed to delete {metric} metric: {response.status_code} {response.text}")
91 changes: 91 additions & 0 deletions tests/model_serving/model_server/metrics/test_model_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import pytest

from tests.model_serving.model_server.metrics.utils import run_inference_multiple_times
from tests.model_serving.model_server.utils import verify_inference_response
from utilities.constants import (
KServeDeploymentType,
ModelFormat,
ModelInferenceRuntime,
ModelStoragePath,
Protocols,
RuntimeTemplates,
)
from utilities.inference_utils import Inference
from utilities.monitoring import get_metrics_value, validate_metrics_value

# Applied to every test in this module: require a deployed Serverless operator,
# valid AWS credentials, and freshly-deleted TGI metric series.
pytestmark = pytest.mark.usefixtures("skip_if_no_deployed_openshift_serverless", "valid_aws_config", "deleted_metrics")


@pytest.mark.serverless
@pytest.mark.jira("RHOAIENG-3236", run=False)
@pytest.mark.parametrize(
    "model_namespace, s3_models_storage_uri, serving_runtime_from_template, s3_models_inference_service",
    [
        pytest.param(
            {"name": "kserve-tgis-metrics"},
            {"model-dir": ModelStoragePath.FLAN_T5_SMALL},
            {
                "name": f"{Protocols.HTTP}-{ModelInferenceRuntime.CAIKIT_TGIS_RUNTIME}",
                "template-name": RuntimeTemplates.CAIKIT_TGIS_SERVING,
                "multi-model": False,
                "enable-http": True,
            },
            {"name": f"{Protocols.HTTP}-{ModelFormat.CAIKIT}", "deployment-mode": KServeDeploymentType.SERVERLESS},
        )
    ],
    indirect=True,
)
class TestModelMetrics:
    """Smoke tests for model-server metrics collected by OpenShift UserWorkloadMonitoring.

    The three tests are ordered via pytest dependencies: success count first,
    then total request count (which assumes the one request already made),
    then CPU utilization.
    """

    @pytest.mark.smoke
    @pytest.mark.polarion("ODS-2555")
    @pytest.mark.dependency(name="test_model_metrics_num_success_requests")
    def test_model_metrics_num_success_requests(self, s3_models_inference_service, prometheus):
        """Verify number of successful model requests in OpenShift monitoring system (UserWorkloadMonitoring) metrics"""
        verify_inference_response(
            inference_service=s3_models_inference_service,
            runtime=ModelInferenceRuntime.CAIKIT_TGIS_RUNTIME,
            inference_type=Inference.ALL_TOKENS,
            protocol=Protocols.HTTPS,
            model_name=ModelFormat.CAIKIT,
            use_default_query=True,
        )
        # Metrics were wiped by the deleted_metrics fixture, so exactly one
        # successful request is expected at this point.
        validate_metrics_value(
            prometheus=prometheus,
            metrics_query="tgi_request_success",
            expected_value="1",
        )

    @pytest.mark.smoke
    @pytest.mark.polarion("ODS-2555")
    @pytest.mark.dependency(
        name="test_model_metrics_num_total_requests",
        depends=["test_model_metrics_num_success_requests"],
    )
    def test_model_metrics_num_total_requests(self, s3_models_inference_service, prometheus):
        """Verify number of total model requests in OpenShift monitoring system (UserWorkloadMonitoring) metrics"""
        total_runs = 5

        run_inference_multiple_times(
            isvc=s3_models_inference_service,
            runtime=ModelInferenceRuntime.CAIKIT_TGIS_RUNTIME,
            inference_type=Inference.ALL_TOKENS,
            protocol=Protocols.HTTPS,
            model_name=ModelFormat.CAIKIT,
            iterations=total_runs,
            run_in_parallel=True,
        )
        # +1 accounts for the request made by the preceding (depended-on) test.
        validate_metrics_value(
            prometheus=prometheus,
            metrics_query="tgi_request_count",
            expected_value=str(total_runs + 1),
        )

    @pytest.mark.smoke
    @pytest.mark.polarion("ODS-2555")
    @pytest.mark.dependency(depends=["test_model_metrics_num_total_requests"])
    def test_model_metrics_cpu_utilization(self, s3_models_inference_service, prometheus):
        """Verify CPU utilization data in OpenShift monitoring system (UserWorkloadMonitoring) metrics"""
        # Bug fix: the original query had a stray "$" before the interpolated
        # namespace ("namespace='$<ns>'"), which matches no series in PromQL.
        assert get_metrics_value(
            prometheus=prometheus,
            metrics_query=f"pod:container_cpu_usage:sum{{namespace='{s3_models_inference_service.namespace}'}}",
        )
43 changes: 43 additions & 0 deletions tests/model_serving/model_server/metrics/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from concurrent.futures import ThreadPoolExecutor, as_completed

from ocp_resources.inference_service import InferenceService
from simple_logger.logger import get_logger

from tests.model_serving.model_server.utils import verify_inference_response


LOGGER = get_logger(name=__name__)


def run_inference_multiple_times(
    isvc: InferenceService,
    runtime: str,
    inference_type: str,
    protocol: str,
    model_name: str,
    iterations: int,
    run_in_parallel: bool = False,
) -> None:
    """Run `verify_inference_response` against *isvc* the requested number of times.

    When *run_in_parallel* is True, requests are submitted to a thread pool and
    any failures are logged (not raised); otherwise requests run sequentially
    and a failure propagates to the caller.
    """
    # The request parameters are identical for every iteration.
    request_kwargs = {
        "inference_service": isvc,
        "runtime": runtime,
        "inference_type": inference_type,
        "protocol": protocol,
        "model_name": model_name,
        "use_default_query": True,
    }

    pending = []
    with ThreadPoolExecutor() as executor:
        for _ in range(iterations):
            if run_in_parallel:
                pending.append(executor.submit(verify_inference_response, **request_kwargs))
            else:
                verify_inference_response(**request_kwargs)

    for finished in as_completed(pending):
        error = finished.exception()
        if error:
            LOGGER.error(f"Failed to run inference. Error: {error}")
5 changes: 2 additions & 3 deletions tests/trustyai/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import subprocess

import pytest
import yaml
from kubernetes.dynamic import DynamicClient
Expand All @@ -15,6 +13,7 @@
from tests.trustyai.constants import TRUSTYAI_SERVICE
from utilities.constants import MODELMESH_SERVING
from tests.trustyai.utils import update_configmap_data
from utilities.infra import get_openshift_token

MINIO: str = "minio"
OPENDATAHUB_IO: str = "opendatahub.io"
Expand Down Expand Up @@ -45,7 +44,7 @@ def trustyai_service_with_pvc_storage(

@pytest.fixture(scope="class")
def openshift_token(ns_with_modelmesh_enabled):
return subprocess.check_output(["oc", "whoami", "-t", ns_with_modelmesh_enabled.name]).decode().strip()
return get_openshift_token()


@pytest.fixture(scope="class")
Expand Down
4 changes: 4 additions & 0 deletions utilities/infra.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,3 +293,7 @@ def get_pods_by_isvc_label(client: DynamicClient, isvc: InferenceService) -> Lis
return pods

raise ResourceNotFoundError(f"{isvc.name} has no pods")


def get_openshift_token() -> str:
    """Return the API token of the currently logged-in user via `oc whoami -t`."""
    oc_output = run_command(command=shlex.split("oc whoami -t"))[1]
    return oc_output.strip()
35 changes: 35 additions & 0 deletions utilities/monitoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from typing import Any

from ocp_resources.prometheus import Prometheus
from simple_logger.logger import get_logger
from timeout_sampler import TimeoutExpiredError, TimeoutSampler

LOGGER = get_logger(name=__name__)


def validate_metrics_value(
    prometheus: Prometheus, metrics_query: str, expected_value: Any, timeout: int = 60 * 4
) -> None:
    """Poll Prometheus until *metrics_query* yields *expected_value* or *timeout* expires.

    Samples every 15 seconds via `get_metrics_value`; returns as soon as a
    sampled value equals the expected one. Re-raises TimeoutExpiredError when
    the expected value is never observed within *timeout* seconds.
    """
    current_value = None
    sampler = TimeoutSampler(
        wait_timeout=timeout,
        sleep=15,
        func=get_metrics_value,
        prometheus=prometheus,
        metrics_query=metrics_query,
    )
    try:
        for current_value in sampler:
            if not current_value:
                continue
            LOGGER.info(f"metric: {metrics_query} value is: {current_value}, the expected value is {expected_value}")
            if current_value == expected_value:
                LOGGER.info("Metrics value matches the expected value!")
                return
    except TimeoutExpiredError:
        LOGGER.info(f"Metrics value: {current_value}, expected: {expected_value}")
        raise


def get_metrics_value(prometheus: Prometheus, metrics_query: str) -> Any:
    """Return the current sample value for *metrics_query*, or None when nothing matched.

    Each query result's "value" field is a [timestamp, value] pair; the value
    of the first result is returned.
    """
    query_results = prometheus.query_sampler(query=metrics_query)
    flattened_samples = []
    for single_result in query_results:
        flattened_samples.extend(single_result.get("value"))
    if flattened_samples:
        # index 0 is the first sample's timestamp; index 1 is its value
        return flattened_samples[1]
Loading