diff --git a/.dockerignore b/.dockerignore index 8d02ebe4a..f58228b2a 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,5 +1,34 @@ +# Kelvin tasks/ submits/ submit_results/ +kelvin_data/ + +# Python .venv/ +__pycache__/ +*.py[cod] +*.pyd +*.pyo +*.so +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +htmlcov/ + +# Node node_modules/ +**/dist/ +**/.vite/ + +# VCS / tooling +.git/ + +# Logs +**/*.log + +# Editor +.vscode/ +.idea/ +.DS_Store diff --git a/.env.example b/.env.example index b0b85a140..1dcce7b0e 100644 --- a/.env.example +++ b/.env.example @@ -1,4 +1,5 @@ ### Kelvin +# ------------------------------------------------------------------------------ # !!! IMPORTANT: For Production deployments using Deployment Service, all file paths must be specified as absolute due to use of DooD (Docker out of Docker) @@ -12,6 +13,13 @@ KELVIN__TASKS_PATH=./tasks KELVIN__SUBMITS_PATH=./submits # Path where submit results will be stored KELVIN__SUBMIT_RESULTS_PATH=./submit_results +# (Optional) Internal base URL used by the evaluator to contact the app. +# Required for local Docker development, where the request Host is 'localhost' +# (unreachable from other containers). Set to 'https://nginx' so the evaluator +# reaches the app through the internal nginx container. +# In production, leave unset — the DNS alias on the nginx service routes the +# real public hostname (e.g. kelvin.cs.vsb.cz) to nginx inside Docker. +# EVALUATION_LINK_BASEURL=https://nginx ### Postgres DATABASE__HOST=127.0.0.1 @@ -40,9 +48,21 @@ OPENAI__API_KEY=your_openai_api_key_here OPENAI__API_URL=http://localhost:8080/v1 OPENAI__MODEL=openai/gpt-oss-120b +### Evaluator Workers +# ------------------------------------------------------------------------------ +# Number of worker processes +EVALUATOR_CPU_REPLICAS=32 +EVALUATOR_CUDA_REPLICAS=32 + +# Redis Connection for Evaluators +# - If running LOCALLY (same machine as app): Leave these commented out or set to 'redis' and '6379'. 
+# - If running DISTRIBUTED (on a different machine): Set these to the IP/Host and Port of the main server's Redis. +# EVALUATOR_REDIS__HOST=redis +# EVALUATOR_REDIS__PORT=6379 + + ### Deployment Service -# ID of the docker group on the host machine (get it via `getent group docker | cut -d: -f3`) -DOCKER_GROUP_ID=999 +# ------------------------------------------------------------------------------ SECURITY__WEBHOOK_SECRET=yoursecretvalue SECURITY__ALLOWED_HOSTS=["localhost", "127.0.0.1", "nginx", "kelvin.cs.vsb.cz"] diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 37448ec83..79ae46d6e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -175,14 +175,25 @@ jobs: uses: docker/build-push-action@v6 with: context: "{{defaultContext}}:deployment_service" - cache-from: type=registry,ref=ghcr.io/mrlvsb/deployment-ci-cache - # Only write the cache in the master branch or workflow_dispatch builds + cache-from: type=gha + # Only write the cache in the merge_group or workflow_dispatch builds # https://github.com/docker/build-push-action/issues/845#issuecomment-1512619265 - cache-to: ${{ (github.event_name == 'merge_group' || github.event_name == 'workflow_dispatch') && 'type=registry,ref=ghcr.io/mrlvsb/deployment-ci-cache,compression=zstd' || '' }} + cache-to: ${{ (github.event_name == 'merge_group' || github.event_name == 'workflow_dispatch') && 'type=gha,mode=max' || '' }} tags: ghcr.io/mrlvsb/deployment:latest,ghcr.io/mrlvsb/deployment:${{ github.sha }} outputs: type=docker,dest=${{ runner.temp }}/deployment.tar - - name: Share Kelvin built image + - name: Build Kelvin-Evaluator Docker image + uses: docker/build-push-action@v6 + with: + target: evaluator + cache-from: type=gha + # Only write the cache in the merge_group or workflow_dispatch builds + # https://github.com/docker/build-push-action/issues/845#issuecomment-1512619265 + cache-to: ${{ (github.event_name == 'merge_group' || github.event_name == 'workflow_dispatch') && 
'type=gha,mode=max' || '' }} + tags: ghcr.io/mrlvsb/kelvin-evaluator:latest,ghcr.io/mrlvsb/kelvin-evaluator:${{ github.sha }} + outputs: type=docker,dest=${{ runner.temp }}/kelvin-evaluator.tar + + - name: Share built image uses: actions/upload-artifact@v6 with: name: kelvin @@ -196,6 +207,13 @@ jobs: path: ${{ runner.temp }}/deployment.tar retention-days: 1 + - name: Share Kelvin-Evaluator image + uses: actions/upload-artifact@v6 + with: + name: kelvin-evaluator + path: ${{ runner.temp }}/kelvin-evaluator.tar + retention-days: 1 + build-docs: runs-on: ubuntu-latest steps: @@ -266,6 +284,12 @@ jobs: name: deployment path: ${{ runner.temp }} + - name: Download Kelvin-Evaluator image + uses: actions/download-artifact@v6 + with: + name: kelvin-evaluator + path: ${{ runner.temp }} + - name: Load image id: load_image run: | @@ -276,6 +300,12 @@ jobs: echo "$LOADED" SHA_TAG=$(echo "$LOADED" | grep -v ':latest' | awk '{print $3}') echo "app_image_tag=$SHA_TAG" >> $GITHUB_OUTPUT + + LOADED_EVAL=$(docker load --input ${{ runner.temp }}/kelvin-evaluator.tar) + echo "$LOADED_EVAL" + SHA_TAG_EVAL=$(echo "$LOADED_EVAL" | grep -v ':latest' | awk '{print $3}') + echo "evaluator_image_tag=$SHA_TAG_EVAL" >> $GITHUB_OUTPUT + if [ "${{ steps.changed-files-deployment.outputs.any_changed }}" = "true" ]; then docker load --input ${{ runner.temp }}/deployment.tar fi @@ -291,6 +321,9 @@ jobs: - name: Push Docker image with SHA tag run: docker push ${{ steps.load_image.outputs.app_image_tag }} + - name: Push Kelvin-Evaluator Docker image with SHA tag + run: docker push ${{ steps.load_image.outputs.evaluator_image_tag }} + - name: Trigger on-prem deployment run: | python3 deployment_service/deploy.py \ @@ -306,6 +339,9 @@ jobs: - name: Push Kelvin Docker image with latest tag run: docker push ghcr.io/mrlvsb/kelvin:latest + - name: Push Kelvin Evaluator Docker image with latest tag + run: docker push ghcr.io/mrlvsb/kelvin-evaluator:latest + - name: Push Deployment_service Docker image with 
all tags if: steps.changed-files-deployment.outputs.any_changed == 'true' run: docker push --all-tags ghcr.io/mrlvsb/deployment @@ -323,6 +359,12 @@ jobs: package-type: 'container' min-versions-to-keep: 15 + - uses: actions/delete-package-versions@v5 + with: + package-name: 'kelvin-evaluator' + package-type: 'container' + min-versions-to-keep: 15 + deploy-docs: runs-on: ubuntu-latest needs: [ build-docs ] diff --git a/Dockerfile b/Dockerfile index 88d6313d7..25453f74f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,13 @@ -FROM ghcr.io/astral-sh/uv:python3.12-bookworm AS build-backend +FROM python:3.12-slim-bookworm AS build-backend + +COPY --from=ghcr.io/astral-sh/uv:0.10.0 /uv /usr/local/bin/uv RUN export DEBIAN_FRONTEND=noninteractive && \ apt-get update && \ apt-get install -y \ -o APT::Install-Recommends=false \ -o APT::Install-Suggests=false \ + build-essential \ libsasl2-dev \ libgraphviz-dev @@ -26,14 +29,15 @@ RUN npm ci RUN npm run build -FROM python:3.12-bookworm AS runtime +FROM python:3.12-slim-bookworm AS runtime RUN export DEBIAN_FRONTEND=noninteractive && \ apt-get update && \ apt-get install -y \ -o APT::Install-Recommends=false \ -o APT::Install-Suggests=false \ - graphviz && \ + graphviz \ + libmagic1 && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* @@ -43,6 +47,8 @@ WORKDIR /app # We want to use ID 1000, to have the same ID as the default outside user # And we also want group 101, to provide share access to the Unix uWSGI # socket with the nginx image. +RUN getent group 101 >/dev/null || groupadd -g 101 webserver + RUN useradd --uid 1000 --gid 101 --shell /bin/false --system webserver RUN chown -R webserver . 
@@ -72,3 +78,44 @@ COPY --chown=webserver deploy/entrypoint.sh ./ STOPSIGNAL SIGINT ENTRYPOINT ["/app/entrypoint.sh"] + +FROM runtime AS evaluator + +# Temporarily switch to the root user to install the Docker CLI and other system dependencies +USER root + +RUN export DEBIAN_FRONTEND=noninteractive && \ + apt-get update && \ + apt-get install -y \ + -o APT::Install-Recommends=false \ + -o APT::Install-Suggests=false \ + ca-certificates \ + curl \ + procps && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +RUN mkdir -p /etc/apt/keyrings && \ + curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc +RUN chmod a+r /etc/apt/keyrings/docker.asc + +RUN echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null + +RUN export DEBIAN_FRONTEND=noninteractive && \ + apt-get update && \ + apt-get install -y \ + -o APT::Install-Recommends=false \ + -o APT::Install-Suggests=false \ + docker-ce docker-ce-cli containerd.io docker-compose-plugin && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +USER webserver + +ENTRYPOINT [] +CMD ["python", "manage.py", "rqworker", "default", "evaluator", "--with-scheduler"] +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD pgrep -f "rqworker" || exit 1 diff --git a/common/ai_review/job.py b/common/ai_review/job.py index 68b49746f..70f2efc3f 100644 --- a/common/ai_review/job.py +++ b/common/ai_review/job.py @@ -5,6 +5,7 @@ import django_rq import requests +from django.conf import settings from serde import from_dict from serde.json import to_json @@ -32,6 +33,17 @@ def detect_language(filename: str) -> Optional[str]: def upload_result(submit_url: str, result: AIReviewResult) -> None: session = requests.Session() + # Disable SSL verification in DEBUG mode 
(local Docker development environment). + # + # EXPLANATION: + # In the local Docker development environment (DEBUG=True), the services communicate + # via internal Docker network names (e.g. 'https://nginx'). + # The Nginx service uses self-signed certificates for HTTPS. + # Since these certificates are not issued by a trusted Certificate Authority (CA), + # requests would fail with an SSL error. Disabling verification allows + # the evaluator to download submissions and upload results in this dev environment. + if settings.DEBUG: + session.verify = False json_body = to_json(result, indent=2) logging.debug("Result JSON body: \n%s", json_body) diff --git a/common/ai_review/processor.py b/common/ai_review/processor.py index 1805e034c..6a2d3cbda 100644 --- a/common/ai_review/processor.py +++ b/common/ai_review/processor.py @@ -14,7 +14,7 @@ ) from common.ai_review.job import review_job from common.models import SuggestedComment, Submit -from common.utils import build_absolute_uri +from common.utils import build_evaluation_download_uri AI_REVIEW_COMMENT_TYPE: str = "ai-review" AI_REVIEW_COMMENT_AUTHOR: str = "LLM" @@ -30,7 +30,7 @@ def enqueue_llm_review_job( if not llm_config.enabled: return None - review_upload_url = build_absolute_uri( + review_upload_url = build_evaluation_download_uri( request, reverse( "v2:upload_submit_llm_review_result", @@ -40,7 +40,7 @@ def enqueue_llm_review_job( ), ) - review_prompt_url = build_absolute_uri( + review_prompt_url = build_evaluation_download_uri( request, reverse( "v2:retrieve_llm_review_prompt", diff --git a/common/evaluate.py b/common/evaluate.py index 484420444..e43d457d3 100644 --- a/common/evaluate.py +++ b/common/evaluate.py @@ -8,12 +8,13 @@ import django_rq import requests import yaml +from django.conf import settings from django.core import signing from django.urls import reverse from django.utils import timezone from common.ai_review.processor import enqueue_llm_review_job -from common.utils import is_teacher, 
build_absolute_uri +from common.utils import is_teacher, build_evaluation_download_uri from evaluator.evaluator import Evaluation from evaluator.testsets import TestSet from kelvin.settings import BASE_DIR @@ -39,7 +40,7 @@ def load_task_config(task_path: str) -> Optional[dict]: def evaluate_submit(request, submit, meta=None): - submit_url = build_absolute_uri( + submit_url = build_evaluation_download_uri( request, reverse( "task_detail", @@ -51,7 +52,7 @@ def evaluate_submit(request, submit, meta=None): ), ) - task_url = build_absolute_uri( + task_url = build_evaluation_download_uri( request, reverse( "teacher_task_tar", @@ -102,6 +103,18 @@ def evaluate_job(submit_url, task_url, token, meta): logging.basicConfig(level=logging.DEBUG) s = requests.Session() + # Disable SSL verification in DEBUG mode (local Docker development environment). + # + # EXPLANATION: + # In the local Docker development environment (DEBUG=True), the services communicate + # via internal Docker network names (e.g. 'https://nginx'). + # The Nginx service uses self-signed certificates for HTTPS. + # Since these certificates are not issued by a trusted Certificate Authority (CA), + # requests would fail with an SSL error. Disabling verification allows + # the evaluator to download submissions and upload results in this dev environment. 
+ if settings.DEBUG: + s.verify = False + logging.info(f"Evaluating {submit_url}") with tempfile.TemporaryDirectory() as workdir: diff --git a/common/utils.py b/common/utils.py index 8c05f0e47..2d92f3844 100644 --- a/common/utils.py +++ b/common/utils.py @@ -1,5 +1,4 @@ import io -import os import re import tarfile from datetime import timedelta @@ -8,6 +7,7 @@ import django.contrib.auth.models import requests +from django.conf import settings from django.http import HttpRequest from ipware import get_client_ip @@ -98,6 +98,18 @@ def download_source_to_path(source_url: str, destination_path: str) -> None: """ session = requests.Session() + # Disable SSL verification in DEBUG mode (local Docker development environment). + # + # EXPLANATION: + # In the local Docker development environment (DEBUG=True), the services communicate + # via internal Docker network names (e.g. 'https://nginx'). + # The Nginx service uses self-signed certificates for HTTPS. + # Since these certificates are not issued by a trusted Certificate Authority (CA), + # requests would fail with an SSL error. Disabling verification allows + # the evaluator to download submissions and upload results in this dev environment. 
+ if settings.DEBUG: + session.verify = False + response = session.get(source_url) if response.status_code != 200: @@ -107,8 +119,7 @@ def download_source_to_path(source_url: str, destination_path: str) -> None: tar.extractall(destination_path) -def build_absolute_uri(request, location): - base_uri = os.getenv("API_INTERNAL_BASEURL", None) - if base_uri: - return "".join([base_uri, location]) +def build_evaluation_download_uri(request, location): + if settings.EVALUATION_LINK_BASEURL: + return settings.EVALUATION_LINK_BASEURL + location return request.build_absolute_uri(location) diff --git a/docker-compose.yml b/docker-compose.yml index 931f61675..06c4ad6fc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -26,6 +26,9 @@ services: - DATABASE__USERNAME=${DATABASE__USERNAME} - DATABASE__PASSWORD=${DATABASE__PASSWORD} - KELVIN__HOST_URL=${KELVIN__HOST_URL} + # Only needed for local Docker development (see .env.example for explanation). + # Leave unset in production. + - EVALUATION_LINK_BASEURL=${EVALUATION_LINK_BASEURL:-} volumes: - app_static:/app/static - app_socket:/socket @@ -75,9 +78,109 @@ services: - ${NGINX__LOGS_PATH}:/var/log/nginx - app_static:/app/static:ro - app_socket:/socket + networks: + default: + aliases: + # Allows containers (e.g. evaluators) to reach nginx using the public hostname internally. + # Safe to use KELVIN__HOST_URL directly: if set to an IP (e.g. 127.0.0.1 for local development), + # docker registers it as an alias but it is never DNS-queried (IPs bypass DNS resolution entirely). + - ${KELVIN__HOST_URL:-} + + evaluator_scheduler: + container_name: kelvin_evaluator_scheduler + depends_on: + - redis + - app + - docker_proxy + profiles: [ prod ] + build: + context: . 
+ dockerfile: Dockerfile + target: evaluator + image: "ghcr.io/mrlvsb/kelvin-evaluator:${EVALUATOR_IMAGE_TAG:-latest}" + pull_policy: always + restart: unless-stopped + command: "python manage.py rqworker default --with-scheduler" + environment: + - REDIS__HOST=redis + - REDIS__PORT=6379 + # For local development set debug to True in .env (e.g. disable TLS verification for HTTP client) + - DEBUG=${DEBUG:-false} + - DOCKER_HOST=docker_proxy:2375 + volumes: + # Mount /tmp so Docker-in-Docker can access temporary evaluation directories + - /tmp:/tmp + networks: + - default + - docker_proxy_net + + evaluator_cpu: + container_name: kelvin_evaluator_cpu + depends_on: + - docker_proxy + profiles: [ evaluator_cpu ] + build: + context: . + dockerfile: Dockerfile + target: evaluator + image: "ghcr.io/mrlvsb/kelvin-evaluator:${EVALUATOR_IMAGE_TAG:-latest}" + pull_policy: always + restart: unless-stopped + command: "sh -c 'python /app/evaluator/images/build.py && python manage.py rqworker-pool evaluator --num-workers ${EVALUATOR_CPU_REPLICAS:-32}'" + environment: + # Option to specify Redis host and port to connect from other machines where evaluator runs, + # falls back to the 'redis' hostname and default port, which means same machine + - REDIS__HOST=${EVALUATOR_REDIS__HOST:-redis} + - REDIS__PORT=${EVALUATOR_REDIS__PORT:-6379} + # For local development set debug to True in .env (e.g. disable TLS verification for HTTP client) + - DEBUG=${DEBUG:-false} + - DOCKER_HOST=docker_proxy:2375 + volumes: + # Mount /tmp so Docker-in-Docker can access temporary evaluation directories + - /tmp:/tmp + networks: + - default + - docker_proxy_net + + evaluator_cuda: + container_name: kelvin_evaluator_cuda + depends_on: + - docker_proxy + profiles: [ evaluator_cuda ] + build: + context: . 
+ dockerfile: Dockerfile + target: evaluator + image: "ghcr.io/mrlvsb/kelvin-evaluator:${EVALUATOR_IMAGE_TAG:-latest}" + pull_policy: always + restart: unless-stopped + command: "sh -c 'python /app/evaluator/images/build.py && python manage.py rqworker-pool cuda --num-workers ${EVALUATOR_CUDA_REPLICAS:-32}'" + environment: + # Option to specify Redis host and port to connect from other machines where evaluator runs, + # falls back to the 'redis' hostname and default port, which means same machine + - REDIS__HOST=${EVALUATOR_REDIS__HOST:-redis} + - REDIS__PORT=${EVALUATOR_REDIS__PORT:-6379} + # For local development set debug to True in .env (e.g. disable TLS verification for HTTP client) + - DEBUG=${DEBUG:-false} + - DOCKER_HOST=docker_proxy:2375 + volumes: + # Mount /tmp so Docker-in-Docker can access temporary evaluation directories + - /tmp:/tmp + networks: + - default + - docker_proxy_net + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [ gpu ] deployment: container_name: kelvin_deployment_service + depends_on: + - docker_proxy profiles: [ prod ] build: context: deployment_service @@ -86,8 +189,6 @@ services: image: "ghcr.io/mrlvsb/deployment:latest" pull_policy: always restart: unless-stopped - group_add: - - "${DOCKER_GROUP_ID:?DOCKER_GROUP_ID is not set}" environment: # Hardcode the docker-compose.yml path inside the container to match the repository volume mount path (must be in sync with volume that mounts the repo) - DOCKER__COMPOSE_FILE_PATH=/kelvin/docker-compose.yml @@ -95,11 +196,33 @@ services: - DEBUG=${DEBUG:-false} - SECURITY__WEBHOOK_SECRET=${SECURITY__WEBHOOK_SECRET} - SECURITY__ALLOWED_HOSTS=${SECURITY__ALLOWED_HOSTS} + - DOCKER_HOST=docker_proxy:2375 volumes: # Mount the Kelvin repo to /kelvin inside the container,must be in sync with DOCKER__COMPOSE_FILE_PATH - ${REPO__DIRECTORY_PATH}:/kelvin + networks: + - default + - docker_proxy_net + + # proxy from TCP to unix socket, so we don't need to run web 
as root with different uid + # or solve issues with socket permissions + docker_proxy: + container_name: kelvin_docker_proxy + profiles: [ prod,evaluator_cpu,evaluator_cuda ] + image: alpine/socat + restart: unless-stopped + command: tcp-listen:2375,fork,reuseaddr unix-connect:/var/run/docker.sock + user: root + networks: + - docker_proxy_net + volumes: - /var/run/docker.sock:/var/run/docker.sock volumes: app_static: app_socket: + +networks: + docker_proxy_net: + # Internal network: no external internet access, only reachable by attached services + internal: true diff --git a/kelvin/settings.py b/kelvin/settings.py index 7414c80d5..0c28d99ae 100644 --- a/kelvin/settings.py +++ b/kelvin/settings.py @@ -17,7 +17,7 @@ # Build paths inside the project like this: os.path.join(BASE_DIR, ...) BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -# Load environment variables from an .env file +# Load environment variables from an '.env' file dotenv_file = os.path.join(BASE_DIR, ".env") if os.path.isfile(dotenv_file): dotenv.load_dotenv(dotenv_file) @@ -26,6 +26,13 @@ PUBLIC_URL = f"https://{KELVIN_ROOT_HOST}" +# Base URL used internally by the evaluator to contact the app. +# Needed in local Docker development where the request Host header is 'localhost' +# (unreachable from other containers). Set to e.g. 'https://nginx' in '.env' . +# In production, leave unset — the DNS alias on the nginx service makes the +# real hostname resolve to nginx inside the Docker network. +EVALUATION_LINK_BASEURL: str | None = os.getenv("EVALUATION_LINK_BASEURL", None) + # Quick-start development settings - unsuitable for production # See https://docs.djangoproject.com/en/dev/howto/deployment/checklist/ @@ -34,7 +41,7 @@ SECRET_KEY = "***REMOVED***" # SECURITY WARNING: don't run with debug turned on in production! -DEBUG = True +DEBUG: bool = os.getenv("DEBUG", "true").lower() in ("1", "true", "yes") ALLOWED_HOSTS = ["127.0.0.1", "localhost", "app", "nginx", KELVIN_ROOT_HOST]