Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
3ca5cbf
feat: add Solana RPC and GitHub API checks to health endpoint
LaphoqueRC Mar 23, 2026
79d7a72
fix: address CodeRabbit review feedback on health check endpoint
Mar 23, 2026
a352ece
fix: apply ruff format fixes to health check files
Mar 23, 2026
53f876a
fix: add latency_ms to timeout and HTTP error responses for consistency
LaphoqueRC Mar 23, 2026
aa5368d
test: add missing coverage for Redis unexpected error, Solana/GitHub …
LaphoqueRC Mar 23, 2026
9d65220
style: apply ruff formatter
LaphoqueRC Mar 23, 2026
5e6d9bd
style: apply ruff formatter to test file
LaphoqueRC Mar 23, 2026
9294e9e
fix: convert async tests to sync via run_async; fix malformed-respons…
LaphoqueRC Mar 23, 2026
e59d641
fix: address coderabbit review - ruff format, latency_ms consistency,…
LaphoqueRC Mar 23, 2026
21e6e7f
fix: ruff format, fix engine.connect mock, update health format asser…
Mar 23, 2026
3c690d5
fix(tests): fix test_health.py and test_logging_and_errors.py
LaphoqueRC Mar 23, 2026
34c2dcd
fix: remove unused engine import from test_health.py (ruff F401)
LaphoqueRC Mar 23, 2026
9d96cd6
fix: remove btree index on JSON column (PostgreSQL doesn't support it)
LaphoqueRC Mar 23, 2026
d94eabd
fix: add latency_ms to DB and Redis error responses for shape consist…
LaphoqueRC Mar 23, 2026
82fa9b9
fix: apply ruff formatter to health.py
LaphoqueRC Mar 23, 2026
6db6de4
fix: use strict key access in GitHub API shape validation
LaphoqueRC Mar 23, 2026
344245e
chore: retrigger review pipeline
LaphoqueRC Mar 23, 2026
6ea3635
fix: remove trailing blank line in health.py (ruff format)
LaphoqueRC Mar 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
271 changes: 245 additions & 26 deletions backend/app/api/health.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
"""Health check endpoint for uptime monitoring and load balancers."""
"""Health check endpoint for uptime monitoring and load balancers.

Checks four services:
- PostgreSQL database connectivity
- Redis connectivity
- Solana RPC endpoint responsiveness
- GitHub API rate limit availability
"""

import asyncio
import logging
import os
import time
from datetime import datetime, timezone

import httpx
from fastapi import APIRouter
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
Expand All @@ -17,50 +26,260 @@

router = APIRouter(tags=["health"])

# Timeout budget for the external service checks (Solana RPC, GitHub API).
# Declared in milliseconds for readability; the derived seconds value is what
# the checks pass as the ``timeout`` argument of ``httpx.AsyncClient``.
_EXTERNAL_TIMEOUT_MS = 200
_EXTERNAL_TIMEOUT_S = _EXTERNAL_TIMEOUT_MS / 1000


# ---------------------------------------------------------------------------
# Service check helpers
# ---------------------------------------------------------------------------

async def _check_database() -> dict:
    """Check PostgreSQL connectivity via a simple query.

    Runs ``SELECT 1`` on a connection obtained from the module's ``engine``.

    Returns:
        A dict that always contains ``status`` and ``latency_ms``:
        ``{"status": "healthy", "latency_ms": ...}`` on success, or
        ``{"status": "unavailable", "latency_ms": ..., "error": ...}`` where
        ``error`` is ``"connection_error"`` for SQLAlchemy errors and
        ``"unexpected_error"`` for anything else.
    """
    start = time.monotonic()
    try:
        async with engine.connect() as conn:
            await conn.execute(text("SELECT 1"))
        latency_ms = round((time.monotonic() - start) * 1000)
        return {"status": "healthy", "latency_ms": latency_ms}
    except SQLAlchemyError as exc:
        latency_ms = round((time.monotonic() - start) * 1000)
        logger.warning("Health check DB failure: %s", exc)
        return {
            "status": "unavailable",
            "latency_ms": latency_ms,
            "error": "connection_error",
        }
    except Exception as exc:
        # Deliberately broad: a health probe must report, never raise.
        latency_ms = round((time.monotonic() - start) * 1000)
        logger.warning("Health check DB failure: %s", exc)
        return {
            "status": "unavailable",
            "latency_ms": latency_ms,
            "error": "unexpected_error",
        }


async def _check_redis() -> dict:
    """Check Redis connectivity via PING.

    Connects to ``REDIS_URL`` (default ``redis://localhost:6379/0``) with a
    client created for this call, and sends a single ``PING``.

    Returns:
        A dict that always contains ``status`` and ``latency_ms``:
        ``{"status": "healthy", "latency_ms": ...}`` on success, or
        ``{"status": "unavailable", "latency_ms": ..., "error": ...}`` where
        ``error`` is ``"connection_error"`` for Redis errors and
        ``"unexpected_error"`` for anything else.
    """
    start = time.monotonic()
    try:
        redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/0")
        client = from_url(redis_url, decode_responses=True)
        async with client:
            await client.ping()
        latency_ms = round((time.monotonic() - start) * 1000)
        return {"status": "healthy", "latency_ms": latency_ms}
    except RedisError as exc:
        latency_ms = round((time.monotonic() - start) * 1000)
        logger.warning("Health check Redis failure: %s", exc)
        return {
            "status": "unavailable",
            "latency_ms": latency_ms,
            "error": "connection_error",
        }
    except Exception as exc:
        # Deliberately broad: a health probe must report, never raise.
        latency_ms = round((time.monotonic() - start) * 1000)
        logger.warning("Health check Redis failure: %s", exc)
        return {
            "status": "unavailable",
            "latency_ms": latency_ms,
            "error": "unexpected_error",
        }


async def _check_solana_rpc() -> dict:
    """Check Solana RPC by requesting the latest slot.

    Uses the configured SOLANA_RPC_URL or defaults to mainnet-beta.

    The whole request is bounded by an overall deadline of
    ``_EXTERNAL_TIMEOUT_S`` seconds. The httpx client timeout alone applies
    to each phase of the request (connect, write, read, pool) separately, so
    it cannot guarantee a total budget on its own.

    Returns:
        A dict that always contains ``status`` and ``latency_ms``:

        - ``healthy`` with ``slot`` when the RPC returned a result.
        - ``degraded`` with ``error`` set to ``"timeout"``,
          ``"http_<code>"``, ``"malformed_response"`` or
          ``"no_slot_in_response"``.
        - ``unavailable`` with ``error="connection_error"`` for any other
          failure.
    """
    rpc_url = os.getenv("SOLANA_RPC_URL", "https://api.mainnet-beta.solana.com")
    start = time.monotonic()

    async def _request_slot() -> httpx.Response:
        # Kept as its own coroutine so one deadline covers client setup,
        # connection and the full response read.
        async with httpx.AsyncClient(timeout=_EXTERNAL_TIMEOUT_S) as client:
            return await client.post(
                rpc_url,
                json={"jsonrpc": "2.0", "id": 1, "method": "getSlot"},
            )

    try:
        resp = await asyncio.wait_for(_request_slot(), timeout=_EXTERNAL_TIMEOUT_S)
        resp.raise_for_status()
        try:
            data = resp.json()
            if not isinstance(data, dict):
                raise ValueError(f"unexpected response type: {type(data)}")
            slot = data.get("result")
        except Exception as exc:
            logger.warning("Solana RPC malformed response: %s", exc)
            latency_ms = round((time.monotonic() - start) * 1000)
            return {
                "status": "degraded",
                "latency_ms": latency_ms,
                "error": "malformed_response",
            }
        latency_ms = round((time.monotonic() - start) * 1000)
        if slot is not None:
            return {"status": "healthy", "latency_ms": latency_ms, "slot": slot}
        return {
            "status": "degraded",
            "latency_ms": latency_ms,
            "error": "no_slot_in_response",
        }
    except (httpx.TimeoutException, asyncio.TimeoutError):
        # Either a single httpx phase or the overall deadline expired.
        latency_ms = round((time.monotonic() - start) * 1000)
        return {"status": "degraded", "latency_ms": latency_ms, "error": "timeout"}
    except httpx.HTTPStatusError as exc:
        latency_ms = round((time.monotonic() - start) * 1000)
        logger.warning("Solana RPC HTTP error: %s", exc.response.status_code)
        return {
            "status": "degraded",
            "latency_ms": latency_ms,
            "error": f"http_{exc.response.status_code}",
        }
    except Exception as exc:
        # Deliberately broad: a health probe must report, never raise.
        latency_ms = round((time.monotonic() - start) * 1000)
        logger.warning("Solana RPC check failed: %s", exc)
        return {
            "status": "unavailable",
            "latency_ms": latency_ms,
            "error": "connection_error",
        }


async def _check_github_api() -> dict:
    """Check GitHub API availability via the rate_limit endpoint.

    Uses GITHUB_TOKEN if available for authenticated rate limits.
    Reports remaining calls and reset time.

    The complete payload shape is validated before it is used:
    ``resources.core`` must be a mapping and its ``remaining``, ``limit`` and
    ``reset`` values (each defaulting to 0 when absent) must be numeric, with
    ``reset`` convertible to a timestamp. Any violation is reported as
    ``malformed_response`` rather than surfacing as a connection error.

    Returns:
        A dict that always contains ``status`` and ``latency_ms``:

        - ``healthy`` or ``degraded`` with a ``rate_limit`` mapping
          (``remaining``, ``limit``, ``reset_at``). The status is degraded
          when less than 10% of the limit remains or the limit is not
          positive.
        - ``degraded`` with ``error`` set to ``"timeout"``,
          ``"http_<code>"`` or ``"malformed_response"``.
        - ``unavailable`` with ``error="connection_error"`` for any other
          failure.
    """
    start = time.monotonic()
    headers: dict[str, str] = {"Accept": "application/vnd.github+json"}
    token = os.getenv("GITHUB_TOKEN", "")
    if token:
        headers["Authorization"] = f"Bearer {token}"
    try:
        async with httpx.AsyncClient(timeout=_EXTERNAL_TIMEOUT_S) as client:
            resp = await client.get(
                "https://api.github.com/rate_limit",
                headers=headers,
            )
            resp.raise_for_status()
            try:
                data = resp.json()
                if not isinstance(data, dict):
                    raise ValueError(f"unexpected response type: {type(data)}")
                # Strict key access: KeyError/TypeError here means the
                # expected ``resources.core`` section is missing.
                core = data["resources"]["core"]
                if not isinstance(core, dict):
                    raise ValueError(f"unexpected core type: {type(core)}")
                remaining = core.get("remaining", 0)
                limit = core.get("limit", 0)
                reset_at = core.get("reset", 0)
                for field, value in (
                    ("remaining", remaining),
                    ("limit", limit),
                    ("reset", reset_at),
                ):
                    # bool is an int subclass but is never a valid counter.
                    if isinstance(value, bool) or not isinstance(
                        value, (int, float)
                    ):
                        raise ValueError(f"non-numeric {field}: {value!r}")
                # Formatting can raise on out-of-range timestamps, so it is
                # part of validation too.
                reset_at_iso = (
                    datetime.fromtimestamp(reset_at, tz=timezone.utc).strftime(
                        "%Y-%m-%dT%H:%M:%SZ"
                    )
                    if reset_at
                    else None
                )
            except Exception as exc:
                logger.warning("GitHub API malformed response: %s", exc)
                latency_ms = round((time.monotonic() - start) * 1000)
                return {
                    "status": "degraded",
                    "latency_ms": latency_ms,
                    "error": "malformed_response",
                }
            latency_ms = round((time.monotonic() - start) * 1000)

            # Consider degraded if less than 10% of rate limit remaining.
            # When limit is 0 (unexpected), treat as degraded.
            if limit > 0:
                status = "healthy" if remaining >= limit * 0.1 else "degraded"
            else:
                status = "degraded"

            return {
                "status": status,
                "latency_ms": latency_ms,
                "rate_limit": {
                    "remaining": remaining,
                    "limit": limit,
                    "reset_at": reset_at_iso,
                },
            }
    except httpx.TimeoutException:
        latency_ms = round((time.monotonic() - start) * 1000)
        return {"status": "degraded", "latency_ms": latency_ms, "error": "timeout"}
    except httpx.HTTPStatusError as exc:
        latency_ms = round((time.monotonic() - start) * 1000)
        logger.warning("GitHub API HTTP error: %s", exc.response.status_code)
        return {
            "status": "degraded",
            "latency_ms": latency_ms,
            "error": f"http_{exc.response.status_code}",
        }
    except Exception as exc:
        # Deliberately broad: a health probe must report, never raise.
        latency_ms = round((time.monotonic() - start) * 1000)
        logger.warning("GitHub API check failed: %s", exc)
        return {
            "status": "unavailable",
            "latency_ms": latency_ms,
            "error": "connection_error",
        }


def _overall_status(services: dict) -> str:
    """Compute overall health from individual service statuses.

    Args:
        services: Mapping of service name to its check result. Each result
            is expected to be a dict with a ``status`` key; an entry that is
            not a dict, or has no ``status``, counts as ``"unavailable"``.
            The core services are looked up under ``"database"`` and
            ``"redis"``; a missing core entry counts as ``"unavailable"``.

    Returns:
        "healthy" — every service reports exactly ``"healthy"``
        "degraded" — at least one service is not healthy but no core
            service (db+redis) is unavailable
        "unavailable" — any core service unavailable

    Any status outside the known vocabulary is treated as not healthy, so an
    unexpected value can never be reported as a fully healthy system.
    """

    def _status_of(entry: object) -> str:
        # Fail safe: anything that is not a well-formed result is unavailable.
        if isinstance(entry, dict):
            return entry.get("status", "unavailable")
        return "unavailable"

    core_statuses = [_status_of(services.get(name)) for name in ("database", "redis")]
    if "unavailable" in core_statuses:
        return "unavailable"
    if any(_status_of(entry) != "healthy" for entry in services.values()):
        return "degraded"
    return "healthy"


# ---------------------------------------------------------------------------
# Endpoint
# ---------------------------------------------------------------------------


@router.get("/health", summary="Service health check")
async def health_check() -> dict:
    """Return service status including database, Redis, Solana RPC,
    and GitHub API connectivity.

    The four checks run concurrently and their results are reported under
    ``services``; the top-level ``status`` is derived from them by
    ``_overall_status``.

    Status vocabulary:
    - ``healthy``: service is fully operational
    - ``degraded``: service is reachable but impaired (slow, rate-limited)
    - ``unavailable``: service cannot be reached

    Returns:
        A dict with ``status``, ``version``, ``uptime_seconds``,
        ``timestamp`` (UTC, ``YYYY-MM-DDTHH:MM:SSZ``) and ``services``.
    """
    db, redis, solana, github = await asyncio.gather(
        _check_database(),
        _check_redis(),
        _check_solana_rpc(),
        _check_github_api(),
    )

    services = {
        "database": db,
        "redis": redis,
        "solana_rpc": solana,
        "github_api": github,
    }

    return {
        "status": _overall_status(services),
        "version": "1.0.0",
        # START_TIME is defined elsewhere in this module (not visible here);
        # presumably a time.monotonic() reading taken at startup — confirm.
        "uptime_seconds": round(time.monotonic() - START_TIME),
        "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "services": services,
    }
2 changes: 1 addition & 1 deletion backend/app/models/bounty_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,5 +69,5 @@ class BountyTable(Base):
Index("ix_bounties_reward", reward_amount),
Index("ix_bounties_deadline", deadline),
Index("ix_bounties_popularity", popularity),
Index("ix_bounties_skills", skills),
# ix_bounties_skills removed: JSON column cannot use btree index (PostgreSQL limitation)
)
Loading
Loading