Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,14 @@ jobs:
--commit-sha ${{ github.sha }} \
--healthcheck-url https://kelvin.cs.vsb.cz/api/v2/health \
--url https://kelvin.cs.vsb.cz/deployment/

python3 deployment_service/deploy.py \
--service-name evaluator_scheduler \
--container-name kelvin_evaluator_scheduler \
--image ${{ steps.load_image.outputs.evaluator_image_tag }} \
--commit-sha ${{ github.sha }} \
--url https://kelvin.cs.vsb.cz/deployment/ \
--health-check-timeout 240
env:
WEBHOOK_SECRET: ${{ secrets.WEBHOOK_SECRET }}

Expand Down
1 change: 1 addition & 0 deletions deployment_service/app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class Settings(BaseSettings):
docker: Docker
debug: bool = False
log_level: str = "INFO"
health_check_timeout: int = 90

model_config = SettingsConfigDict(
extra="ignore",
Expand Down
62 changes: 54 additions & 8 deletions deployment_service/app/deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from app.config import get_settings
from app.models import ImageInfo

HEALTH_CHECK_TIMEOUT = 90 # seconds
HEALTH_CHECK_INTERVAL = 5 # seconds
IMAGE_PULL_TIMEOUT = 600 # 10 minutes

Expand Down Expand Up @@ -66,11 +65,13 @@ def __init__(
compose_path: Path,
compose_env_file: Path | None,
container_name: str,
healthcheck_url: str,
healthcheck_url: str | None,
health_check_timeout: int | None = None,
):
self.service_name = service_name
self.container_name = container_name
self.healthcheck_url = healthcheck_url
self.health_check_timeout = health_check_timeout
self.image_tag = image["tag"]
self.commit_sha = commit_sha
self.stable_compose_path = str(compose_path.resolve())
Expand Down Expand Up @@ -275,24 +276,69 @@ async def _swap_service(
)
return True

async def _health_check(self) -> bool:
"""Performs a health check by making HTTP requests to a specified URL."""
self.logger.info(f"Performing health check on {self.healthcheck_url}...")
end_time = time.time() + HEALTH_CHECK_TIMEOUT
async def _health_check_http(self, end_time: float, healthcheck_url: str) -> bool:
self.logger.info(f"Performing HTTP health check on {healthcheck_url}...")
async with httpx.AsyncClient(verify=not get_settings().debug) as client:
while time.time() < end_time:
try:
response = await client.get(self.healthcheck_url, timeout=2.0)
response = await client.get(healthcheck_url, timeout=2.0)
self.logger.info(f"Health check response status: {response.status_code}")
if response.status_code == 200:
self.logger.info("Health check passed.")
return True
except httpx.RequestError as exc:
self.logger.warning(f"Health check request failed: {exc}")
await asyncio.sleep(HEALTH_CHECK_INTERVAL)
self.logger.error("Health check timed out.")
self.logger.error("HTTP health check timed out.")
return False

async def _health_check_docker(self, end_time: float, container_name: str) -> bool:
    """Poll the container's Docker health status until it settles or time runs out.

    Args:
        end_time: Deadline expressed as a ``time.time()`` timestamp; polling
            stops once the current time reaches it.
        container_name: Name of the container to inspect.

    Returns:
        True as soon as the container reports the ``healthy`` status. False if
        the container reports ``unhealthy``, exposes no ``Health`` state (no
        HEALTHCHECK configured), or the deadline passes first.
    """
    self.logger.info(f"Performing Docker container health state check for {container_name}...")
    while time.time() < end_time:
        try:
            # The container lookup is a blocking call; run it in a worker thread
            # so polling does not stall the event loop. The lookup already returns
            # freshly inspected attrs, so no additional reload() is issued.
            container = await asyncio.to_thread(self.client.containers.get, container_name)

            state = container.attrs.get("State", {})
            if "Health" not in state:
                # Without a HEALTHCHECK the status can never become "healthy",
                # so waiting for the timeout would be pointless.
                self.logger.error(
                    "Container has no HEALTHCHECK configured. "
                    "Please add a HEALTHCHECK to your Dockerfile or provide a --healthcheck-url."
                )
                return False

            health_status = state.get("Health", {}).get("Status")

            self.logger.info(f"Container health status: {health_status}")

            if health_status == "healthy":
                self.logger.info("Docker health check passed.")
                return True
            if health_status == "unhealthy":
                self.logger.warning("Container is unhealthy.")
                return False
        except NotFound:
            # The container may not exist yet (e.g. still being created); retry.
            self.logger.warning("Container not found during health check.")
        except Exception as e:
            # Deliberately best-effort: any other error is logged and the poll is
            # retried until the deadline.
            self.logger.warning(f"Error checking container health: {e}")

        await asyncio.sleep(HEALTH_CHECK_INTERVAL)

    self.logger.error("Docker health check timed out.")
    return False

async def _health_check(self) -> bool:
    """Run the health check that matches this deployment's configuration.

    When ``healthcheck_url`` is set, the URL is polled over HTTP; otherwise the
    container's Docker health status is polled. The per-deployment
    ``health_check_timeout`` is used when it is set to a truthy value; in any
    other case the value from the settings is used.
    """
    budget = self.health_check_timeout
    if not budget:
        budget = get_settings().health_check_timeout
    deadline = time.time() + budget

    url = self.healthcheck_url
    if not url:
        return await self._health_check_docker(deadline, self.container_name)
    return await self._health_check_http(deadline, url)

def _cleanup(self, old_image_id: str) -> None:
"""Removes the old Docker image after a successful deployment."""
if not old_image_id or old_image_id == self.image_tag:
Expand Down
3 changes: 2 additions & 1 deletion deployment_service/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ async def deploy(request: DeploymentRequest, response: Response):
compose_path=get_settings().docker.compose_file_path,
compose_env_file=get_settings().docker.compose_env_file,
container_name=request.container_name,
healthcheck_url=str(request.healthcheck_url),
healthcheck_url=str(request.healthcheck_url) if request.healthcheck_url else None,
health_check_timeout=request.health_check_timeout,
)
try:
logs = await manager.run()
Expand Down
13 changes: 10 additions & 3 deletions deployment_service/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,22 @@ class DeploymentRequest(BaseModel):
healthcheck_url: Annotated[
AnyHttpUrl | None,
Field(
default="https://kelvin.cs.vsb.cz/api/v2/health",
description="The URL to check the health of the service.",
default=None,
description="The URL to check the health of the service. If not provided, the container's internal health status is checked.",
examples=["https://kelvin.cs.vsb.cz/api/v2/health"],
),
]
health_check_timeout: int | None = Field(
default=None,
description="Optional timeout for the health check in seconds. Defaults to global setting if not provided.",
examples=[120],
)

@field_validator("healthcheck_url", mode="after")
@classmethod
def validate_healthcheck_url(cls, value: AnyHttpUrl) -> AnyHttpUrl:
def validate_healthcheck_url(cls, value: AnyHttpUrl | None) -> AnyHttpUrl | None:
if value is None:
return value
host = value.host
if host not in get_settings().security.allowed_hosts:
raise ValueError(
Expand Down
19 changes: 13 additions & 6 deletions deployment_service/deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,18 @@
import urllib.request


def format_for_github_summary(status_code, response_json):
def format_for_github_summary(status_code, response_json, service_name):
logs = response_json.get("logs", [])
error_message = response_json.get("error")

if not (200 <= status_code < 300):
title = f"## ❌ Deployment Failed (Status: {status_code})"
title = f"## ❌ Deployment Failed: {service_name} (Status: {status_code})"
if not error_message:
summary_lines = [f"**Error:** `{response_json.get('detail', 'Unknown error')}`"]
else:
summary_lines = [f"**Error:** `{error_message}`"]
else:
title = f"## ✅ Deployment Succeeded (Status: {status_code})"
title = f"## ✅ Deployment Succeeded: {service_name} (Status: {status_code})"
summary_lines = ["The deployment process completed successfully."]

summary_lines.append("\n<details>\n\n<summary>View full deployment logs</summary>\n\n```text")
Expand Down Expand Up @@ -63,8 +63,14 @@ def main():
)
parser.add_argument(
"--healthcheck-url",
default="https://kelvin.cs.vsb.cz/api/v2/health",
help="The full URL for the application's health check endpoint. (e.g., 'https://nginx/api/v2/health')",
default=None,
help="The full URL for the application's health check endpoint. (e.g., 'https://nginx/api/v2/health'). If not provided, the container's health status will be checked.",
)
parser.add_argument(
"--health-check-timeout",
type=int,
default=None,
help="Optional timeout for the health check in seconds. Overrides the server-side default.",
)

parser.add_argument(
Expand Down Expand Up @@ -97,6 +103,7 @@ def main():
"image": args.image,
"commit_sha": args.commit_sha,
"healthcheck_url": args.healthcheck_url,
"health_check_timeout": args.health_check_timeout,
}
message_data = json.dumps(message_dict).encode("utf-8")
signature = hmac.new(secret.encode("utf-8"), message_data, hashlib.sha256).hexdigest()
Expand Down Expand Up @@ -128,7 +135,7 @@ def main():
"error": f"Invalid JSON response from server (Status: {status_code}).",
}

summary_content = format_for_github_summary(status_code, response_json)
summary_content = format_for_github_summary(status_code, response_json, args.service_name)
if is_github_env:
summary_file_path = str(os.getenv("GITHUB_STEP_SUMMARY"))
with open(summary_file_path, "a", encoding="utf-8") as f:
Expand Down
36 changes: 35 additions & 1 deletion deployment_service/tests/test_deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,44 @@ def fake_time():
result = await manager_instance._health_check()

assert result is False
assert "Health check timed out" in manager_instance.logs[-1]
assert "HTTP health check timed out." in manager_instance.logs[-1]
assert mock_sleep.called


@pytest.mark.asyncio
async def test_health_check_docker_healthy(manager_instance):
    """With no health-check URL, a container reporting ``healthy`` passes the check."""
    healthy_container = MagicMock()
    healthy_container.attrs = {"State": {"Health": {"Status": "healthy"}}}
    manager_instance.client.containers.get.return_value = healthy_container
    manager_instance.healthcheck_url = None

    outcome = await manager_instance._health_check()

    assert outcome is True
    assert "Docker health check passed." in manager_instance.logs[-1]


@pytest.mark.asyncio
@patch("asyncio.sleep", new_callable=AsyncMock)
async def test_health_check_docker_timeout(mock_sleep, manager_instance):
    """With no health-check URL, a container stuck in ``starting`` times out."""
    manager_instance.healthcheck_url = None
    # Fake clock that advances by one second on every call, so the deadline is
    # reached without any real waiting.
    current_time = 0

    def fake_time():
        nonlocal current_time
        current_time += 1
        return current_time

    with patch("time.time", side_effect=fake_time):
        mock_container = MagicMock()
        mock_container.attrs = {"State": {"Health": {"Status": "starting"}}}
        manager_instance.client.containers.get.return_value = mock_container

        result = await manager_instance._health_check()

    assert result is False
    assert "Docker health check timed out." in manager_instance.logs[-1]
    # Mirrors the HTTP timeout test: the poll loop must wait between attempts.
    assert mock_sleep.called


@pytest.mark.asyncio
async def test_swap_service_critical_error(manager_instance):
manager_instance._run_command = AsyncMock(side_effect=[True, False])
Expand Down
6 changes: 4 additions & 2 deletions docs/docs/02-developers-guide/02-deployment.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,10 @@ the specified commit_sha into it using git show. This isolates the new configura

### 4. Health Check

The manager performs an active health check by repeatedly sending HTTP GET requests to a specified `healthcheck_url`.
It continuously polls this endpoint until it receives a 200 OK status code, which indicates the service is ready. If the health check times out, it triggers a rollback.
The manager performs an active health check in one of two ways. If a `healthcheck_url` is specified, it repeatedly sends HTTP GET requests to that URL until it receives a 200 OK status code. If no `healthcheck_url` is specified, it polls the container's Docker healthcheck status instead until it changes to `healthy`.
Either result indicates the service is ready. In the Docker mode, the check fails immediately (without waiting for the timeout) if the container reports `unhealthy` or has no `HEALTHCHECK` configured.

The timeout for this check defaults to **90 seconds** (configurable via `health_check_timeout` in `config.py`). It can also be overridden per-request by providing a `health_check_timeout` in the deployment payload. If the health check times out, it triggers a rollback.

### 5. Rollback (on Failure)

Expand Down
Loading