Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,14 @@ jobs:
--commit-sha ${{ github.sha }} \
--healthcheck-url https://kelvin.cs.vsb.cz/api/v2/health \
--url https://kelvin.cs.vsb.cz/deployment/

python3 deployment_service/deploy.py \
--service-name evaluator_scheduler \
--container-name kelvin_evaluator_scheduler \
--image ${{ steps.load_image.outputs.evaluator_image_tag }} \
--commit-sha ${{ github.sha }} \
--url https://kelvin.cs.vsb.cz/deployment/ \
--health-check-timeout 240
env:
WEBHOOK_SECRET: ${{ secrets.WEBHOOK_SECRET }}

Expand Down
1 change: 1 addition & 0 deletions deployment_service/app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class Settings(BaseSettings):
docker: Docker
debug: bool = False
log_level: str = "INFO"
health_check_timeout: int = 90

model_config = SettingsConfigDict(
extra="ignore",
Expand Down
62 changes: 54 additions & 8 deletions deployment_service/app/deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from app.config import get_settings
from app.models import ImageInfo

HEALTH_CHECK_TIMEOUT = 90 # seconds
HEALTH_CHECK_INTERVAL = 5 # seconds
IMAGE_PULL_TIMEOUT = 600 # 10 minutes

Expand Down Expand Up @@ -66,11 +65,13 @@ def __init__(
compose_path: Path,
compose_env_file: Path | None,
container_name: str,
healthcheck_url: str,
healthcheck_url: str | None,
health_check_timeout: int | None = None,
):
self.service_name = service_name
self.container_name = container_name
self.healthcheck_url = healthcheck_url
self.health_check_timeout = health_check_timeout
self.image_tag = image["tag"]
self.commit_sha = commit_sha
self.stable_compose_path = str(compose_path.resolve())
Expand Down Expand Up @@ -275,24 +276,69 @@ async def _swap_service(
)
return True

async def _health_check(self) -> bool:
"""Performs a health check by making HTTP requests to a specified URL."""
self.logger.info(f"Performing health check on {self.healthcheck_url}...")
end_time = time.time() + HEALTH_CHECK_TIMEOUT
async def _health_check_http(self, end_time: float, healthcheck_url: str) -> bool:
self.logger.info(f"Performing HTTP health check on {healthcheck_url}...")
async with httpx.AsyncClient(verify=not get_settings().debug) as client:
while time.time() < end_time:
try:
response = await client.get(self.healthcheck_url, timeout=2.0)
response = await client.get(healthcheck_url, timeout=2.0)
self.logger.info(f"Health check response status: {response.status_code}")
if response.status_code == 200:
self.logger.info("Health check passed.")
return True
except httpx.RequestError as exc:
self.logger.warning(f"Health check request failed: {exc}")
await asyncio.sleep(HEALTH_CHECK_INTERVAL)
self.logger.error("Health check timed out.")
self.logger.error("HTTP health check timed out.")
return False

async def _health_check_docker(self, end_time: float, container_name: str) -> bool:
self.logger.info(f"Performing Docker container health state check for {container_name}...")
while time.time() < end_time:
try:
container = self.client.containers.get(container_name)
container.reload()

state = container.attrs.get("State", {})
if "Health" not in state:
self.logger.error(
"Container has no HEALTHCHECK configured. "
"Please add a HEALTHCHECK to your Dockerfile or provide a healthcheck_url."
)
return False

health_status = state.get("Health", {}).get("Status")

self.logger.info(f"Container health status: {health_status}")

if health_status == "healthy":
self.logger.info("Docker health check passed.")
return True
if health_status == "unhealthy":
self.logger.warning("Container is unhealthy.")
return False
except NotFound:
self.logger.warning("Container not found during health check.")
except Exception as e:
self.logger.warning(f"Error checking container health: {e}")

await asyncio.sleep(HEALTH_CHECK_INTERVAL)

self.logger.error("Docker health check timed out.")
return False

async def _health_check(self) -> bool:
"""Performs a health check.
If healthcheck_url is provided, makes HTTP requests to it.
If healthcheck_url is None, checks the container's Docker health status.
"""
timeout = self.health_check_timeout or get_settings().health_check_timeout
end_time = time.time() + timeout

if self.healthcheck_url:
return await self._health_check_http(end_time, self.healthcheck_url)
return await self._health_check_docker(end_time, self.container_name)

def _cleanup(self, old_image_id: str) -> None:
"""Removes the old Docker image after a successful deployment."""
if not old_image_id or old_image_id == self.image_tag:
Expand Down
3 changes: 2 additions & 1 deletion deployment_service/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ async def deploy(request: DeploymentRequest, response: Response):
compose_path=get_settings().docker.compose_file_path,
compose_env_file=get_settings().docker.compose_env_file,
container_name=request.container_name,
healthcheck_url=str(request.healthcheck_url),
healthcheck_url=str(request.healthcheck_url) if request.healthcheck_url else None,
health_check_timeout=request.health_check_timeout,
)
try:
logs = await manager.run()
Expand Down
13 changes: 10 additions & 3 deletions deployment_service/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,22 @@ class DeploymentRequest(BaseModel):
healthcheck_url: Annotated[
AnyHttpUrl | None,
Field(
default="https://kelvin.cs.vsb.cz/api/v2/health",
description="The URL to check the health of the service.",
default=None,
description="The URL to check the health of the service. If not provided, the container's internal health status is checked.",
examples=["https://kelvin.cs.vsb.cz/api/v2/health"],
),
]
health_check_timeout: int | None = Field(
default=None,
description="Optional timeout for the health check in seconds. Defaults to global setting if not provided.",
examples=[120],
)

@field_validator("healthcheck_url", mode="after")
@classmethod
def validate_healthcheck_url(cls, value: AnyHttpUrl) -> AnyHttpUrl:
def validate_healthcheck_url(cls, value: AnyHttpUrl | None) -> AnyHttpUrl | None:
if value is None:
return value
host = value.host
if host not in get_settings().security.allowed_hosts:
raise ValueError(
Expand Down
19 changes: 13 additions & 6 deletions deployment_service/deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,18 @@
import urllib.request


def format_for_github_summary(status_code, response_json):
def format_for_github_summary(status_code, response_json, service_name):
logs = response_json.get("logs", [])
error_message = response_json.get("error")

if not (200 <= status_code < 300):
title = f"## ❌ Deployment Failed (Status: {status_code})"
title = f"## ❌ Deployment Failed: {service_name} (Status: {status_code})"
if not error_message:
summary_lines = [f"**Error:** `{response_json.get('detail', 'Unknown error')}`"]
else:
summary_lines = [f"**Error:** `{error_message}`"]
else:
title = f"## ✅ Deployment Succeeded (Status: {status_code})"
title = f"## ✅ Deployment Succeeded: {service_name} (Status: {status_code})"
summary_lines = ["The deployment process completed successfully."]

summary_lines.append("\n<details>\n\n<summary>View full deployment logs</summary>\n\n```text")
Expand Down Expand Up @@ -63,8 +63,14 @@ def main():
)
parser.add_argument(
"--healthcheck-url",
default="https://kelvin.cs.vsb.cz/api/v2/health",
help="The full URL for the application's health check endpoint. (e.g., 'https://nginx/api/v2/health')",
default=None,
help="The full URL for the application's health check endpoint. (e.g., 'https://nginx/api/v2/health'). If not provided, the container's health status will be checked.",
)
parser.add_argument(
"--health-check-timeout",
type=int,
default=None,
help="Optional timeout for the health check in seconds. Overrides the server-side default.",
)

parser.add_argument(
Expand Down Expand Up @@ -97,6 +103,7 @@ def main():
"image": args.image,
"commit_sha": args.commit_sha,
"healthcheck_url": args.healthcheck_url,
"health_check_timeout": args.health_check_timeout,
}
message_data = json.dumps(message_dict).encode("utf-8")
signature = hmac.new(secret.encode("utf-8"), message_data, hashlib.sha256).hexdigest()
Expand Down Expand Up @@ -128,7 +135,7 @@ def main():
"error": f"Invalid JSON response from server (Status: {status_code}).",
}

summary_content = format_for_github_summary(status_code, response_json)
summary_content = format_for_github_summary(status_code, response_json, args.service_name)
if is_github_env:
summary_file_path = str(os.getenv("GITHUB_STEP_SUMMARY"))
with open(summary_file_path, "a", encoding="utf-8") as f:
Expand Down
14 changes: 13 additions & 1 deletion deployment_service/tests/test_deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,22 @@ def fake_time():
result = await manager_instance._health_check()

assert result is False
assert "Health check timed out" in manager_instance.logs[-1]
assert "HTTP health check timed out." in manager_instance.logs[-1]
assert mock_sleep.called


@pytest.mark.asyncio
async def test_health_check_docker_healthy(manager_instance):
manager_instance.healthcheck_url = None
mock_container = MagicMock()
mock_container.attrs = {"State": {"Health": {"Status": "healthy"}}}
manager_instance.client.containers.get.return_value = mock_container

result = await manager_instance._health_check()
assert result is True
assert "Docker health check passed." in manager_instance.logs[-1]


@pytest.mark.asyncio
async def test_swap_service_critical_error(manager_instance):
manager_instance._run_command = AsyncMock(side_effect=[True, False])
Expand Down
14 changes: 12 additions & 2 deletions docs/docs/02-developers-guide/02-deployment.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,18 @@ the specified commit_sha into it using git show. This isolates the new configura

### 4. Health Check

The manager performs an active health check by repeatedly sending HTTP GET requests to a specified `healthcheck_url`.
It continuously polls this endpoint until it receives a 200 OK status code, which indicates the service is ready. If the health check times out, it triggers a rollback.
The manager performs an active health check by repeatedly sending HTTP GET requests to a specified `healthcheck_url` or,
if no `healthcheck_url` is provided, by relying on the Docker `HEALTHCHECK` status of the container.
It continuously polls this endpoint until it receives a 200 OK status code or
the Docker container status changes to `healthy`, which indicates the service is ready.

The timeout for this check defaults to **90 seconds** (configurable via `health_check_timeout` in `config.py`).
It can also be overridden per-request by providing a `health_check_timeout` in the deployment payload.
If the health check times out, it triggers a rollback.

:::note
The Docker-healthcheck fallback only works if the container image defines a Docker `HEALTHCHECK`; if no `HEALTHCHECK` is configured and no `healthcheck_url` is provided, the deployment will fail immediately.
:::

### 5. Rollback (on Failure)

Expand Down
Loading