Add a `--rest-core-health-freq` deployer option and a readiness-probe BDD test.

- deployer (Rust): `StartOptions` gains `rest_core_health_freq`; when set, the
  REST binary is started with `--core-health-freq <value>`.
- tests/bdd/common/deployer.py: `StartOptions` and `Deployer.start` gain a
  matching `rest_core_health_freq` field/keyword, forwarded to the deployer as
  `--rest-core-health-freq=<value>`.
- tests/bdd/features/health_probes: new feature file and step definitions that
  start the cluster with a 10s frequency, kill the `core` container and poll
  the REST `/ready` endpoint for the expected 200 -> 503 transition.

diff --git a/deployer/src/infra/rest.rs b/deployer/src/infra/rest.rs
index 4e3bce43a..c4ba70254 100644
--- a/deployer/src/infra/rest.rs
+++ b/deployer/src/infra/rest.rs
@@ -44,6 +44,10 @@ impl ComponentAction for Rest {
             }
         }
 
+        if let Some(core_health_freq) = &options.rest_core_health_freq {
+            binary = binary.with_args(vec!["--core-health-freq", core_health_freq]);
+        }
+
         if cfg.container_exists("jaeger") {
             let jaeger_config = format!("jaeger.{}:4317", cfg.get_name());
             binary = binary.with_args(vec!["--jaeger", &jaeger_config])
diff --git a/deployer/src/lib.rs b/deployer/src/lib.rs
index ba6ea0f73..c92a87589 100644
--- a/deployer/src/lib.rs
+++ b/deployer/src/lib.rs
@@ -136,6 +136,10 @@ pub struct StartOptions {
     #[clap(long, conflicts_with = "no_rest")]
     pub rest_jwk: Option<String>,
 
+    /// Set the rest-to-core health probe frequency on the rest.
+    #[arg(long)]
+    pub rest_core_health_freq: Option<String>,
+
     /// Use the following image pull policy when creating containers from images.
     #[clap(long, default_value = "ifnotpresent")]
     pub image_pull_policy: composer::ImagePullPolicy,
diff --git a/tests/bdd/common/deployer.py b/tests/bdd/common/deployer.py
index e278dd9d2..d129636b9 100644
--- a/tests/bdd/common/deployer.py
+++ b/tests/bdd/common/deployer.py
@@ -38,6 +38,7 @@ class StartOptions:
     no_min_timeouts: bool = False
     rust_log: str = None
     rust_log_silence: str = None
+    rest_core_health_freq: str = None
 
     def args(self):
         args = [
@@ -99,6 +100,9 @@ def args(self):
         if rust_log_silence is not None:
             args.append(f"--rust-log-silence={rust_log_silence}")
 
+        if self.rest_core_health_freq:
+            args.append(f"--rest-core-health-freq={self.rest_core_health_freq}")
+
         agent_arg = "--agents=Core"
         if self.ha_node_agent:
             agent_arg += ",HaNode"
@@ -106,6 +110,7 @@ def args(self):
             agent_arg += ",HaCluster"
             if self.ha_cluster_agent_fast is not None:
                 args.append(f"--cluster-fast-requeue={self.ha_cluster_agent_fast}")
+
         args.append(agent_arg)
         return args
 
@@ -139,6 +144,7 @@ def start(
         no_min_timeouts=False,
         rust_log: str = None,
         rust_log_silence: str = None,
+        rest_core_health_freq: str = None,
     ):
         options = StartOptions(
             io_engines,
@@ -165,6 +171,7 @@ def start(
             no_min_timeouts=no_min_timeouts,
             rust_log=rust_log,
             rust_log_silence=rust_log_silence,
+            rest_core_health_freq=rest_core_health_freq,
         )
         pytest.deployer_options = options
         Deployer.start_with_opts(options)
diff --git a/tests/bdd/features/health_probes/readiness_probe.feature b/tests/bdd/features/health_probes/readiness_probe.feature
new file mode 100644
index 000000000..b31361cc5
--- /dev/null
+++ b/tests/bdd/features/health_probes/readiness_probe.feature
@@ -0,0 +1,12 @@
+Feature: Readiness Probe
+
+  Background:
+    Given a running agent-core service
+    And a running REST service with "--core-health-freq" set to "10s"
+
+  Scenario: The REST API /ready service should not update its readiness status more than once in 10 seconds
+    Given agent-core service is available
+    And the REST service returns a 200 status code for an HTTP GET request to the /ready endpoint
+    When the agent-core service is brought down forcefully
+    Then the REST service returns 200 for /ready endpoint for 5 more seconds
+    And after a delay of 5 seconds the REST service returns 503 for /ready endpoint for the following 12 seconds
diff --git a/tests/bdd/features/health_probes/test_readiness_probe.py b/tests/bdd/features/health_probes/test_readiness_probe.py
new file mode 100644
index 000000000..661c25a19
--- /dev/null
+++ b/tests/bdd/features/health_probes/test_readiness_probe.py
@@ -0,0 +1,99 @@
+"""Readiness Probe feature tests."""
+
+import time
+
+import pytest
+from common.apiclient import REST_SERVER
+from common.deployer import Deployer
+from common.docker import Docker
+from pytest_bdd import given, scenario, then, when
+from requests import get as http_get
+from retrying import retry
+
+READINESS_API_ENDPOINT = REST_SERVER + "/ready"
+
+
+@pytest.fixture(scope="module")
+def setup():
+    Deployer.start(io_engines=1, rest_core_health_freq="10s")
+    yield
+    Deployer.stop()
+
+
+@scenario(
+    "readiness_probe.feature",
+    "The REST API /ready service should not update its readiness status more than once in 10 seconds",
+)
+def test_the_rest_api_ready_service_should_not_update_its_readiness_status_more_than_once_in_10_seconds():
+    """The REST API /ready service should not update its readiness status more than once in 10 seconds."""
+
+
+@given('a running REST service with "--core-health-freq" set to "10s"')
+def a_running_rest_service(setup):
+    """a running REST service with "--core-health-freq" set to "10s"."""
+
+
+@given("a running agent-core service")
+def a_running_agent_core_service(setup):
+    """a running agent-core service."""
+
+
+@given("agent-core service is available")
+def agent_core_service_is_available(setup):
+    """agent-core service is available."""
+
+
+@given(
+    "the REST service returns a 200 status code for an HTTP GET request to the /ready endpoint"
+)
+def the_rest_service_returns_a_200_status_code_for_an_http_get_request_to_the_ready_endpoint(
+    setup,
+):
+    """the REST service returns a 200 status code for an HTTP GET request to the /ready endpoint."""
+
+    # 5 minute retry.
+    @retry(
+        stop_max_attempt_number=1500,
+        wait_fixed=200,
+    )
+    def rest_is_ready():
+        response = http_get(READINESS_API_ENDPOINT)
+        assert response.status_code == 200
+
+    rest_is_ready()
+
+
+@when("the agent-core service is brought down forcefully")
+def the_agent_core_service_is_brought_down_forcefully(setup):
+    """the agent-core service is brought down forcefully."""
+    Docker.kill_container("core")
+
+
+@then("the REST service returns 200 for /ready endpoint for 5 more seconds")
+def the_rest_service_returns_200_for_ready_endpoint_for_5_more_seconds(setup):
+    """the REST service returns 200 for /ready endpoint for 5 more seconds."""
+    start_time = time.time()
+    while time.time() - start_time < 5:
+        response = http_get(READINESS_API_ENDPOINT)
+        if response.status_code != 200:
+            raise ValueError(
+                "Expected Readiness probe to return 200 for this duration of 5 seconds"
+            )
+
+
+@then(
+    "after a delay of 5 seconds the REST service returns 503 for /ready endpoint for the following 12 seconds"
+)
+def after_a_delay_of_5_seconds_the_rest_service_returns_503_for_ready_endpoint_for_the_following_12_seconds(
+    setup,
+):
+    """after a delay of 5 seconds the REST service returns 503 for /ready endpoint for the following 12 seconds."""
+    time.sleep(5)
+
+    start_time = time.time()
+    while time.time() - start_time < 12:
+        response = http_get(READINESS_API_ENDPOINT)
+        if response.status_code != 503:
+            raise ValueError(
+                "Expected Readiness probe to return 503 for this duration of 12 seconds"
+            )