diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 451d2dd95..c678e7f39 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -302,6 +302,14 @@ jobs: --commit-sha ${{ github.sha }} \ --healthcheck-url https://kelvin.cs.vsb.cz/api/v2/health \ --url https://kelvin.cs.vsb.cz/deployment/ + + python3 deployment_service/deploy.py \ + --service-name evaluator_scheduler \ + --container-name kelvin_evaluator_scheduler \ + --image ${{ steps.load_image.outputs.evaluator_image_tag }} \ + --commit-sha ${{ github.sha }} \ + --url https://kelvin.cs.vsb.cz/deployment/ \ + --health-check-timeout 240 env: WEBHOOK_SECRET: ${{ secrets.WEBHOOK_SECRET }} diff --git a/deployment_service/app/config.py b/deployment_service/app/config.py index 8baac655a..979775124 100644 --- a/deployment_service/app/config.py +++ b/deployment_service/app/config.py @@ -41,6 +41,7 @@ class Settings(BaseSettings): docker: Docker debug: bool = False log_level: str = "INFO" + health_check_timeout: int = 90 model_config = SettingsConfigDict( extra="ignore", diff --git a/deployment_service/app/deployment.py b/deployment_service/app/deployment.py index c7541a535..a63f53a75 100644 --- a/deployment_service/app/deployment.py +++ b/deployment_service/app/deployment.py @@ -15,7 +15,6 @@ from app.config import get_settings from app.models import ImageInfo -HEALTH_CHECK_TIMEOUT = 90 # seconds HEALTH_CHECK_INTERVAL = 5 # seconds IMAGE_PULL_TIMEOUT = 600 # 10 minutes @@ -66,11 +65,13 @@ def __init__( compose_path: Path, compose_env_file: Path | None, container_name: str, - healthcheck_url: str, + healthcheck_url: str | None, + health_check_timeout: int | None = None, ): self.service_name = service_name self.container_name = container_name self.healthcheck_url = healthcheck_url + self.health_check_timeout = health_check_timeout self.image_tag = image["tag"] self.commit_sha = commit_sha self.stable_compose_path = str(compose_path.resolve()) @@ -275,14 +276,12 @@ async def _swap_service( ) return True - async def _health_check(self) -> bool: - """Performs a health check by making HTTP requests to a specified URL.""" - self.logger.info(f"Performing health check on {self.healthcheck_url}...") - end_time = time.time() + HEALTH_CHECK_TIMEOUT + async def _health_check_http(self, end_time: float, healthcheck_url: str) -> bool: + self.logger.info(f"Performing HTTP health check on {healthcheck_url}...") async with httpx.AsyncClient(verify=not get_settings().debug) as client: while time.time() < end_time: try: - response = await client.get(self.healthcheck_url, timeout=2.0) + response = await client.get(healthcheck_url, timeout=2.0) self.logger.info(f"Health check response status: {response.status_code}") if response.status_code == 200: self.logger.info("Health check passed.") @@ -290,9 +289,56 @@ async def _health_check(self) -> bool: except httpx.RequestError as exc: self.logger.warning(f"Health check request failed: {exc}") await asyncio.sleep(HEALTH_CHECK_INTERVAL) - self.logger.error("Health check timed out.") + self.logger.error("HTTP health check timed out.") + return False + + async def _health_check_docker(self, end_time: float, container_name: str) -> bool: + self.logger.info(f"Performing Docker container health state check for {container_name}...") + while time.time() < end_time: + try: + container = self.client.containers.get(container_name) + container.reload() + + state = container.attrs.get("State", {}) + if "Health" not in state: + self.logger.error( + "Container has no HEALTHCHECK configured. " + "Please add a HEALTHCHECK to your Dockerfile or provide a healthcheck_url." + ) + return False + + health_status = state.get("Health", {}).get("Status") + + self.logger.info(f"Container health status: {health_status}") + + if health_status == "healthy": + self.logger.info("Docker health check passed.") + return True + if health_status == "unhealthy": + self.logger.warning("Container is unhealthy.") + return False + except NotFound: + self.logger.warning("Container not found during health check.") + except Exception as e: + self.logger.warning(f"Error checking container health: {e}") + + await asyncio.sleep(HEALTH_CHECK_INTERVAL) + + self.logger.error("Docker health check timed out.") return False + async def _health_check(self) -> bool: + """Performs a health check. + If healthcheck_url is provided, makes HTTP requests to it. + If healthcheck_url is None, checks the container's Docker health status. + """ + timeout = self.health_check_timeout or get_settings().health_check_timeout + end_time = time.time() + timeout + + if self.healthcheck_url: + return await self._health_check_http(end_time, self.healthcheck_url) + return await self._health_check_docker(end_time, self.container_name) + def _cleanup(self, old_image_id: str) -> None: """Removes the old Docker image after a successful deployment.""" if not old_image_id or old_image_id == self.image_tag: diff --git a/deployment_service/app/main.py b/deployment_service/app/main.py index f3e59da9a..39ebb7d10 100644 --- a/deployment_service/app/main.py +++ b/deployment_service/app/main.py @@ -127,7 +127,8 @@ async def deploy(request: DeploymentRequest, response: Response): compose_path=get_settings().docker.compose_file_path, compose_env_file=get_settings().docker.compose_env_file, container_name=request.container_name, - healthcheck_url=str(request.healthcheck_url), + healthcheck_url=str(request.healthcheck_url) if request.healthcheck_url else None, + health_check_timeout=request.health_check_timeout, ) try: logs = await manager.run() diff --git a/deployment_service/app/models.py b/deployment_service/app/models.py index 25fc573ba..b586f457e 100644 --- a/deployment_service/app/models.py +++ b/deployment_service/app/models.py @@ -49,15 +49,22 @@ class DeploymentRequest(BaseModel): healthcheck_url: Annotated[ AnyHttpUrl | None, Field( - default="https://kelvin.cs.vsb.cz/api/v2/health", - description="The URL to check the health of the service.", + default=None, + description="The URL to check the health of the service. If not provided, the container's internal health status is checked.", examples=["https://kelvin.cs.vsb.cz/api/v2/health"], ), ] + health_check_timeout: int | None = Field( + default=None, + description="Optional timeout for the health check in seconds. Defaults to global setting if not provided.", + examples=[120], + ) @field_validator("healthcheck_url", mode="after") @classmethod - def validate_healthcheck_url(cls, value: AnyHttpUrl) -> AnyHttpUrl: + def validate_healthcheck_url(cls, value: AnyHttpUrl | None) -> AnyHttpUrl | None: + if value is None: + return value host = value.host if host not in get_settings().security.allowed_hosts: raise ValueError( diff --git a/deployment_service/deploy.py b/deployment_service/deploy.py index 29c969bfd..b4ee38a3e 100755 --- a/deployment_service/deploy.py +++ b/deployment_service/deploy.py @@ -10,18 +10,18 @@ import urllib.request -def format_for_github_summary(status_code, response_json): +def format_for_github_summary(status_code, response_json, service_name): logs = response_json.get("logs", []) error_message = response_json.get("error") if not (200 <= status_code < 300): - title = f"## ❌ Deployment Failed (Status: {status_code})" + title = f"## ❌ Deployment Failed: {service_name} (Status: {status_code})" if not error_message: summary_lines = [f"**Error:** `{response_json.get('detail', 'Unknown error')}`"] else: summary_lines = [f"**Error:** `{error_message}`"] else: - title = f"## ✅ Deployment Succeeded (Status: {status_code})" + title = f"## ✅ Deployment Succeeded: {service_name} (Status: {status_code})" summary_lines = ["The deployment process completed successfully."] summary_lines.append("\n
\n\nView full deployment logs\n\n```text") @@ -63,8 +63,14 @@ def main(): ) parser.add_argument( "--healthcheck-url", - default="https://kelvin.cs.vsb.cz/api/v2/health", - help="The full URL for the application's health check endpoint. (e.g., 'https://nginx/api/v2/health')", + default=None, + help="The full URL for the application's health check endpoint. (e.g., 'https://nginx/api/v2/health'). If not provided, the container's health status will be checked.", + ) + parser.add_argument( + "--health-check-timeout", + type=int, + default=None, + help="Optional timeout for the health check in seconds. Overrides the server-side default.", ) parser.add_argument( @@ -97,6 +103,7 @@ def main(): "image": args.image, "commit_sha": args.commit_sha, "healthcheck_url": args.healthcheck_url, + "health_check_timeout": args.health_check_timeout, } message_data = json.dumps(message_dict).encode("utf-8") signature = hmac.new(secret.encode("utf-8"), message_data, hashlib.sha256).hexdigest() @@ -128,7 +135,7 @@ def main(): "error": f"Invalid JSON response from server (Status: {status_code}).", } - summary_content = format_for_github_summary(status_code, response_json) + summary_content = format_for_github_summary(status_code, response_json, args.service_name) if is_github_env: summary_file_path = str(os.getenv("GITHUB_STEP_SUMMARY")) with open(summary_file_path, "a", encoding="utf-8") as f: diff --git a/deployment_service/tests/test_deployment.py b/deployment_service/tests/test_deployment.py index 7c930f987..3cbfb28c3 100644 --- a/deployment_service/tests/test_deployment.py +++ b/deployment_service/tests/test_deployment.py @@ -110,10 +110,22 @@ def fake_time(): result = await manager_instance._health_check() assert result is False - assert "Health check timed out" in manager_instance.logs[-1] + assert "HTTP health check timed out." in manager_instance.logs[-1] assert mock_sleep.called +@pytest.mark.asyncio +async def test_health_check_docker_healthy(manager_instance): + manager_instance.healthcheck_url = None + mock_container = MagicMock() + mock_container.attrs = {"State": {"Health": {"Status": "healthy"}}} + manager_instance.client.containers.get.return_value = mock_container + + result = await manager_instance._health_check() + assert result is True + assert "Docker health check passed." in manager_instance.logs[-1] + + @pytest.mark.asyncio async def test_swap_service_critical_error(manager_instance): manager_instance._run_command = AsyncMock(side_effect=[True, False]) diff --git a/docs/docs/02-developers-guide/02-deployment.mdx b/docs/docs/02-developers-guide/02-deployment.mdx index 940141d06..61919799c 100644 --- a/docs/docs/02-developers-guide/02-deployment.mdx +++ b/docs/docs/02-developers-guide/02-deployment.mdx @@ -129,8 +129,18 @@ the specified commit_sha into it using git show. This isolates the new configura ### 4. Health Check -The manager performs an active health check by repeatedly sending HTTP GET requests to a specified `healthcheck_url`. -It continuously polls this endpoint until it receives a 200 OK status code, which indicates the service is ready. If the health check times out, it triggers a rollback. +The manager performs an active health check by repeatedly sending HTTP GET requests to a specified `healthcheck_url` or, +if no `healthcheck_url` is provided, by relying on the Docker `HEALTHCHECK` status of the container. +It continuously polls this endpoint until it receives a 200 OK status code or +the Docker container status changes to `healthy`, which indicates the service is ready. + +The timeout for this check defaults to **90 seconds** (configurable via `health_check_timeout` in `config.py`). +It can also be overridden per-request by providing a `health_check_timeout` in the deployment payload. +If the health check times out, it triggers a rollback. + +:::note +The Docker-healthcheck fallback only works if the container image defines a Docker `HEALTHCHECK`; if no `HEALTHCHECK` is configured and no `healthcheck_url` is provided, the deployment will fail immediately. +::: ### 5. Rollback (on Failure)