-
Notifications
You must be signed in to change notification settings - Fork 38
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
chore(bors): merge pull request #894
894: feat(rest): add health API for readiness and liveness probes r=niladrih a=niladrih Co-authored-by: Niladri Halder <[email protected]>
- Loading branch information
Showing
9 changed files
with
243 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
use crate::v0::core_grpc; | ||
use grpc::operations::node::traits::NodeOperations; | ||
use std::{ | ||
sync::RwLock, | ||
time::{Duration, Instant}, | ||
}; | ||
|
||
/// Caches the liveness of the agent-core service so that health checks do not have to probe
/// it on every request.
///
/// This is meant to be wrapped inside an Arc and used across threads.
#[derive(Debug)]
pub struct CachedCoreState {
    /// Most recently recorded liveness, together with the moment it was recorded.
    state: RwLock<ServerState>,
    /// How old the recorded liveness may get before it is refreshed.
    cache_duration: Duration,
}

/// A remembered liveness value and the time at which it was last refreshed.
#[derive(Debug)]
struct ServerState {
    /// Whether the agent-core service was considered live at `last_updated`.
    is_live: bool,
    /// When `is_live` was last written.
    last_updated: Instant,
}
|
||
impl CachedCoreState { | ||
/// Create a new cache for serving readiness health checks based on agent-core health. | ||
pub async fn new(cache_duration: Duration) -> Self { | ||
let agent_core_is_live = core_grpc().node().probe(None).await.unwrap_or(false); | ||
|
||
CachedCoreState { | ||
state: RwLock::new(ServerState { | ||
is_live: agent_core_is_live, | ||
last_updated: Instant::now(), | ||
}), | ||
cache_duration, | ||
} | ||
} | ||
|
||
/// Get the cached state of the agent-core service, or assume it's unavailable if something | ||
/// went wrong. | ||
pub async fn get_or_assume_unavailable(&self) -> bool { | ||
let should_update = { | ||
let state = self.state.read().unwrap(); | ||
state.last_updated.elapsed() >= self.cache_duration | ||
}; | ||
|
||
if should_update { | ||
self.update_or_assume_unavailable().await; | ||
} | ||
|
||
self.state.read().unwrap().is_live | ||
} | ||
|
||
/// Update the state of the agent-core service, or assume it's unavailable if something | ||
/// went wrong. | ||
pub async fn update_or_assume_unavailable(&self) { | ||
let new_value = core_grpc().node().probe(None).await.unwrap_or(false); | ||
let mut state = self.state.write().unwrap(); | ||
state.is_live = new_value; | ||
state.last_updated = Instant::now(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
use crate::CachedCoreState; | ||
use actix_web::{get, web::Data, HttpResponse, Responder}; | ||
|
||
/// Liveness probe check. Failure will result in Pod restart. 200 on success. | ||
#[get("/live")] | ||
async fn liveness(_cached_core_state: Data<CachedCoreState>) -> impl Responder { | ||
HttpResponse::Ok() | ||
.content_type("text/plain; charset=utf-8") | ||
.insert_header(("X-Content-Type-Options", "nosniff")) | ||
.body("live") | ||
} | ||
|
||
/// Readiness probe check. Failure will result in removal of Container from Kubernetes service | ||
/// target pool. 200 on success, 503 on failure. | ||
#[get("/ready")] | ||
async fn readiness(cached_core_state: Data<CachedCoreState>) -> HttpResponse { | ||
if cached_core_state.get_or_assume_unavailable().await { | ||
return HttpResponse::Ok() | ||
.content_type("text/plain; charset=utf-8") | ||
.insert_header(("X-Content-Type-Options", "nosniff")) | ||
.body("ready"); | ||
} | ||
|
||
HttpResponse::ServiceUnavailable() | ||
.content_type("text/plain; charset=utf-8") | ||
.insert_header(("X-Content-Type-Options", "nosniff")) | ||
.body("not ready") | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
/// Has tools to collect the liveness state of the agent-core service. | ||
pub mod core_state; | ||
/// Actix request handlers for health checks. | ||
pub mod handlers; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
Feature: Readiness Probe | ||
|
||
Background: | ||
Given a running agent-core service | ||
And a running REST service with "--core-health-freq" set to "10ms" | ||
|
||
Scenario: The REST API /ready service should not update its readiness status more than once in 10 milliseconds | ||
Given agent-core service is available | ||
And the REST service returns a 200 status code for an HTTP GET request to the /ready endpoint | ||
When the agent-core service is brought down forcefully | ||
Then the REST service returns 200 for /ready endpoint for 5 more milliseconds | ||
And after a delay of 5 seconds the REST service returns 503 for /ready endpoint for the following 12 milliseconds |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
"""Readiness Probe feature tests.""" | ||
|
||
import time | ||
|
||
import pytest | ||
from common.deployer import Deployer | ||
from common.docker import Docker | ||
from pytest_bdd import given, scenario, then, when | ||
from requests import get as http_get | ||
from retrying import retry | ||
|
||
READINESS_API_ENDPOINT = "http://localhost:8081/ready" | ||
|
||
|
||
@pytest.fixture(scope="module")
def setup():
    """Start the deployment once per module and stop it after the module's tests."""
    deployer_options = {"io_engines": 1, "rest_core_health_freq": "10ms"}
    Deployer.start(**deployer_options)
    yield
    Deployer.stop()
|
||
|
||
@scenario(
    "readiness_probe.feature",
    "The REST API /ready service should not update its readiness status more than once in 10 milliseconds",
)
def test_the_rest_api_ready_service_should_not_update_its_readiness_status_more_than_once_in_10_milliseconds():
    """Bind the readiness-probe scenario from the feature file; the steps do the work."""
|
||
|
||
@given('a running REST service with "--core-health-freq" set to "10ms"')
def a_running_rest_service(setup):
    """No-op step: requesting the `setup` fixture is all this step does."""
|
||
|
||
@given("a running agent-core service")
def a_running_agent_core_service(setup):
    """No-op step: requesting the `setup` fixture is all this step does."""
|
||
|
||
@given("agent-core service is available")
def agent_core_service_is_available(setup):
    """No-op step: it requests the `setup` fixture and performs no check of its own."""
|
||
|
||
@given(
    "the REST service returns a 200 status code for an HTTP GET request to the /ready endpoint"
)
def the_rest_service_returns_a_200_status_code_for_an_http_get_request_to_the_ready_endpoint(
    setup,
):
    """Poll the /ready endpoint until it answers with HTTP 200."""

    # Up to 1500 attempts spaced 200 ms apart, i.e. about 5 minutes of waiting.
    @retry(wait_fixed=200, stop_max_attempt_number=1500)
    def wait_until_ready():
        status = http_get(READINESS_API_ENDPOINT).status_code
        assert status == 200

    wait_until_ready()
|
||
|
||
@when("the agent-core service is brought down forcefully")
def the_agent_core_service_is_brought_down_forcefully(setup):
    """Kill the container named "core"."""
    core_container = "core"
    Docker.kill_container(core_container)
|
||
|
||
@then("the REST service returns 200 for /ready endpoint for 5 more milliseconds")
def the_rest_service_returns_200_for_ready_endpoint_for_5_more_milliseconds(setup):
    """Keep requesting /ready for 5 milliseconds and require HTTP 200 every time.

    Raises:
        ValueError: if any response within the window has a status other than 200.
    """
    # The clock counts in seconds, so the 5 ms window is 0.005. The previous bound of
    # 5 polled for 5 seconds, far beyond the 10 ms cache window this step is about.
    window_seconds = 0.005

    # Monotonic clock: the elapsed time cannot be skewed by wall-clock adjustments.
    start_time = time.monotonic()
    while time.monotonic() - start_time < window_seconds:
        response = http_get(READINESS_API_ENDPOINT)
        if response.status_code != 200:
            raise ValueError(
                "Expected Readiness probe to return 200 for this duration of 5 milliseconds"
            )
|
||
|
||
@then(
    "after a delay of 5 seconds the REST service returns 503 for /ready endpoint for the following 12 milliseconds"
)
def after_a_delay_of_5_seconds_the_rest_service_returns_503_for_ready_endpoint_for_the_following_12_milliseconds(
    setup,
):
    """Wait 5 seconds, then require HTTP 503 from /ready for the next 12 milliseconds.

    Raises:
        ValueError: if any response within the window has a status other than 503.
    """
    time.sleep(5)

    # The clock counts in seconds, so the 12 ms window is 0.012. The previous bound of
    # 12 polled for 12 seconds instead of the 12 milliseconds the step describes.
    window_seconds = 0.012

    # Monotonic clock: the elapsed time cannot be skewed by wall-clock adjustments.
    start_time = time.monotonic()
    while time.monotonic() - start_time < window_seconds:
        response = http_get(READINESS_API_ENDPOINT)
        if response.status_code != 503:
            raise ValueError(
                "Expected Readiness probe to return 503 for this duration of 12 milliseconds"
            )