Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion backend/app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,22 @@ def log_response(response):
app.register_blueprint(simulation_bp, url_prefix='/api/simulation')
app.register_blueprint(report_bp, url_prefix='/api/report')

# Health check
# Register ops/health blueprints (Prometheus monitoring layer)
from .routes import health_bp, ops_bp
app.register_blueprint(health_bp)
app.register_blueprint(ops_bp)

# Legacy simple health route (keep for backward compatibility)
@app.route('/health')
def health():
return {'status': 'ok', 'service': 'MiroFish-Offline Backend'}

# Run startup preflight checks (logs warnings; never blocks startup)
from .preflight import run_preflight_checks
with app.app_context():
preflight_results = run_preflight_checks()
app.extensions['preflight'] = preflight_results

if should_log_startup:
logger.info("MiroFish-Offline Backend startup complete")

Expand Down
17 changes: 14 additions & 3 deletions backend/app/api/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
import os
import traceback
import threading
from flask import request, jsonify, send_file
from flask import request, jsonify, send_file, current_app

from . import report_bp
from ..config import Config
from ..services.report_agent import ReportAgent, ReportManager, ReportStatus
from ..services.graph_tools import GraphToolsService
from ..services.simulation_manager import SimulationManager
from ..models.project import ProjectManager
from ..models.task import TaskManager, TaskStatus
Expand Down Expand Up @@ -109,6 +110,14 @@ def generate_report():
import uuid
report_id = f"report_{uuid.uuid4().hex[:12]}"

# Capture GraphStorage before background thread starts
storage = current_app.extensions.get('neo4j_storage')
if not storage:
return jsonify({
"success": False,
"error": "GraphStorage not initialized - check Neo4j connection"
}), 500

# Create async task
task_manager = TaskManager()
task_id = task_manager.create_task(
Expand All @@ -130,11 +139,13 @@ def run_generate():
message="Initializing Report Agent..."
)

# Create Report Agent
# Create Report Agent with injected graph tools
tools = GraphToolsService(storage=storage)
agent = ReportAgent(
graph_id=graph_id,
simulation_id=simulation_id,
simulation_requirement=simulation_requirement
simulation_requirement=simulation_requirement,
graph_tools=tools
)

# Progress callback
Expand Down
1 change: 1 addition & 0 deletions backend/app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class Config:
LLM_API_KEY = os.environ.get('LLM_API_KEY')
LLM_BASE_URL = os.environ.get('LLM_BASE_URL', 'http://localhost:11434/v1')
LLM_MODEL_NAME = os.environ.get('LLM_MODEL_NAME', 'qwen2.5:32b')
NER_MODEL = os.environ.get('NER_MODEL', 'minimax-m2.7:cloud')

# Neo4j configuration
NEO4J_URI = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
Expand Down
147 changes: 147 additions & 0 deletions backend/app/preflight.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""
MiroFish Startup Preflight Checks
Run on app startup to detect missing services/config early.
Logs warnings but never blocks startup.
"""

import os
import time
import requests
import logging

from .config import Config

logger = logging.getLogger('mirofish.preflight')

# Required environment variable names (as strings, checked against os.environ)
_REQUIRED_ENV_VARS = [
'NEO4J_URI',
'NEO4J_PASSWORD',
'LLM_BASE_URL',
'LLM_MODEL_NAME',
]
Comment on lines +16 to +22
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Make env_vars check the environment, not the defaulted config.

This currently falls back to backend/app/config.py::Config, so NEO4J_PASSWORD, LLM_BASE_URL, and LLM_MODEL_NAME read as “present” even when nothing is set in os.environ. LLM_API_KEY is also missing from _REQUIRED_ENV_VARS, so the preflight can report success while backend/app/config.py::Config.validate() would still fail.

🛠️ Suggested fix
 _REQUIRED_ENV_VARS = [
+    'LLM_API_KEY',
     'NEO4J_URI',
     'NEO4J_PASSWORD',
     'LLM_BASE_URL',
     'LLM_MODEL_NAME',
 ]
@@
 def _check_required_env_vars() -> dict:
     """Check that required environment variables are set."""
-    missing = []
-    for var in _REQUIRED_ENV_VARS:
-        # Accept values set either via os.environ or via Config class attributes
-        env_val = os.environ.get(var)
-        config_attr = var.replace('LLM_BASE_URL', 'LLM_BASE_URL') \
-                        .replace('LLM_MODEL_NAME', 'LLM_MODEL_NAME')
-        config_val = getattr(Config, config_attr, None)
-        if not env_val and not config_val:
-            missing.append(var)
+    missing = [var for var in _REQUIRED_ENV_VARS if not os.environ.get(var)]
     return {"ok": len(missing) == 0, "missing": missing}

Also applies to: 91-102

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/preflight.py` around lines 16 - 22, The preflight currently
treats config defaults as "present" so update the check to read from the actual
environment: change logic that builds env_vars to inspect os.environ (e.g., use
k in os.environ or os.environ.get(k)) instead of importing values from
backend.app.config.Config; add 'LLM_API_KEY' to _REQUIRED_ENV_VARS so it is
validated too; apply the same fix to the later duplicate check around lines
91-102 (ensure both places reference os.environ directly and not Config) and
keep using the same unique symbols _REQUIRED_ENV_VARS and env_vars so the
preflight reports true missing env vars.



# ---------------------------------------------------------------------------
# Individual checks
# ---------------------------------------------------------------------------

def _check_neo4j() -> dict:
    """Verify Neo4j HTTP endpoint is reachable."""
    bolt_uri = Config.NEO4J_URI or 'bolt://localhost:7687'
    # The browser/HTTP API listens on 7474; derive its URL from the bolt URI.
    browser_uri = bolt_uri.replace('bolt://', 'http://').replace(':7687', ':7474')
    try:
        resp = requests.get(browser_uri, timeout=5)
    except Exception as e:
        return {"ok": False, "error": str(e)}
    if resp.status_code != 200:
        return {"ok": False, "url": browser_uri, "error": f"HTTP {resp.status_code}"}
    return {"ok": True, "url": browser_uri}


def _check_ollama() -> dict:
    """Verify Ollama /api/tags endpoint is reachable."""
    try:
        root = (Config.LLM_BASE_URL or 'http://localhost:11434/v1').rstrip('/')
        # Normalise docker-internal hostname when running outside a container.
        root = root.replace('host.docker.internal', 'localhost')
        # The OpenAI-compat '/v1' prefix is not part of Ollama's native API.
        if root.endswith('/v1'):
            root = root[:-3]
        tags_url = f"{root}/api/tags"
        resp = requests.get(tags_url, timeout=5)
        if resp.status_code != 200:
            return {"ok": False, "url": tags_url, "error": f"HTTP {resp.status_code}"}
        return {"ok": True, "url": tags_url}
    except Exception as e:
        return {"ok": False, "error": str(e)}


def _check_required_model() -> dict:
    """Verify the configured LLM model is present in Ollama's local registry.

    Returns a dict with ``ok`` and ``required``; when the model is missing,
    also includes ``available`` (the model names Ollama reported).
    """
    try:
        base = (Config.LLM_BASE_URL or 'http://localhost:11434/v1').rstrip('/')
        base = base.replace('host.docker.internal', 'localhost')
        if base.endswith('/v1'):
            base = base[:-3]
        r = requests.get(f"{base}/api/tags", timeout=5)
        # Fail fast on a non-200 instead of letting r.json() raise a
        # confusing decode error on an HTML/error response body.
        if r.status_code != 200:
            return {"ok": False, "error": f"HTTP {r.status_code} from Ollama /api/tags"}
        models = [m['name'] for m in r.json().get('models', [])]
        required = Config.LLM_MODEL_NAME
        ok = required in models
        result = {"ok": ok, "required": required}
        if not ok:
            result["available"] = models
        return result
    except Exception as e:
        return {"ok": False, "error": str(e)}


def _check_uploads_dir() -> dict:
    """Verify uploads directory exists and is writable.

    Uses a uniquely-named temporary file for the write probe so that
    concurrent checks cannot race on creating/removing a shared fixed path.
    """
    folder = os.path.abspath(Config.UPLOAD_FOLDER)
    try:
        os.makedirs(folder, exist_ok=True)
        # delete=True removes the probe file automatically on context exit.
        with tempfile.NamedTemporaryFile(mode='w', dir=folder,
                                         prefix='.preflight_', delete=True) as f:
            f.write('ok')
        return {"ok": True, "path": folder}
    except Exception as e:
        return {"ok": False, "path": folder, "error": str(e)}


def _check_required_env_vars() -> dict:
    """Check that required environment variables are set.

    Reads os.environ directly: Config supplies fallback defaults for several
    of these settings, so consulting Config would mask genuinely missing
    variables and report success where Config.validate() would still fail.
    """
    missing = [var for var in _REQUIRED_ENV_VARS if not os.environ.get(var)]
    return {"ok": not missing, "missing": missing}


# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------

def run_preflight_checks() -> dict:
    """
    Run all startup preflight checks.

    Logs warnings for any failures but does NOT prevent app startup.
    Returns a structured dict that the app can store and expose via /api/health.

    Example return value::

        {
            "neo4j": {"ok": True, "url": "http://localhost:7474"},
            "ollama": {"ok": False, "error": "Connection refused"},
            "required_model": {"ok": True, "required": "qwen2.5:32b"},
            "uploads_dir": {"ok": True, "path": "/app/uploads"},
            "env_vars": {"ok": True, "missing": []},
        }
    """
    logger.info("Running startup preflight checks...")

    # Dict literal preserves insertion order, so checks run and report
    # in this fixed sequence.
    checks = {
        'neo4j': _check_neo4j,
        'ollama': _check_ollama,
        'required_model': _check_required_model,
        'uploads_dir': _check_uploads_dir,
        'env_vars': _check_required_env_vars,
    }
    results: dict = {name: check() for name, check in checks.items()}

    failures = [name for name, outcome in results.items()
                if not outcome.get('ok', False)]

    if not failures:
        logger.info("✅ Preflight checks passed — all systems nominal")
    else:
        logger.warning("⚠️ Preflight warnings: %s", failures)
        for name in failures:
            outcome = results[name]
            detail = outcome.get('error') or outcome.get('missing') or 'check failed'
            logger.warning(" %s: %s", name, detail)

    return results
9 changes: 9 additions & 0 deletions backend/app/routes/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
MiroFish Routes Package
Ops and health endpoints for Prometheus monitoring.
"""

from .health import health_bp
from .ops import ops_bp

__all__ = ['health_bp', 'ops_bp']
154 changes: 154 additions & 0 deletions backend/app/routes/health.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
"""
MiroFish Health Endpoints
Prometheus-ready health checks for Neo4j, Ollama, and uploads directory.
"""

import os
import time
import requests
from datetime import datetime
from flask import Blueprint, jsonify, current_app

from ..config import Config
from ..utils.logger import get_logger

health_bp = Blueprint('health', __name__)
logger = get_logger('mirofish.health')


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

def _check_neo4j() -> tuple:
    """Ping the Neo4j HTTP browser endpoint. Returns (ok: bool, latency_ms: int)."""
    # bolt://host:7687 → http://host:7474 (the browser/HTTP API port)
    bolt_uri = Config.NEO4J_URI or 'bolt://localhost:7687'
    http_uri = bolt_uri.replace('bolt://', 'http://').replace(':7687', ':7474')
    try:
        started = time.monotonic()
        resp = requests.get(http_uri, timeout=5)
        elapsed_ms = int((time.monotonic() - started) * 1000)
    except Exception as e:
        logger.debug(f"Neo4j health check failed: {e}")
        return False, -1
    return resp.status_code == 200, elapsed_ms


def _check_ollama() -> tuple:
    """Ping Ollama /api/tags. Returns (ok: bool, latency_ms: int)."""
    try:
        base = (Config.LLM_BASE_URL or 'http://localhost:11434/v1').rstrip('/')
        # Normalise docker-internal hostname when running outside container
        base = base.replace('host.docker.internal', 'localhost')
        # NOTE: str.rstrip('/v1') strips *characters* ('/', 'v', '1') and can
        # mangle URLs such as "http://host:1111/v1" down to "http://host";
        # strip the literal '/v1' suffix instead.
        if base.endswith('/v1'):
            base = base[:-3]
        start = time.monotonic()
        r = requests.get(f"{base}/api/tags", timeout=5)
        ms = int((time.monotonic() - start) * 1000)
        return r.status_code == 200, ms
    except Exception as e:
        logger.debug(f"Ollama health check failed: {e}")
        return False, -1


def _check_uploads_dir() -> bool:
    """Verify the uploads directory exists and is writable.

    Writes a uniquely-named temporary probe file so that concurrent health
    probes cannot race each other removing a shared fixed-name file (which
    would flap the endpoint with false-negative failures).
    """
    try:
        folder = os.path.abspath(Config.UPLOAD_FOLDER)
        os.makedirs(folder, exist_ok=True)
        # delete=True removes the probe automatically on context exit.
        with tempfile.NamedTemporaryFile(mode='w', dir=folder,
                                         prefix='.health_check_', delete=True) as f:
            f.write('ok')
        return True
    except Exception as e:
        logger.debug(f"Uploads dir check failed: {e}")
        return False
Comment on lines +56 to +68
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Use a unique temp file for the writability probe.

Every request touches the same .health_check path. Two concurrent /api/health or /api/health/ready calls can race on os.remove() and flap the endpoint to false-negative failures.

🛠️ Suggested fix
-        test_path = os.path.join(folder, '.health_check')
-        with open(test_path, 'w') as f:
-            f.write('ok')
-        os.remove(test_path)
+        with tempfile.NamedTemporaryFile(
+            mode='w',
+            dir=folder,
+            prefix='.health_check_',
+            delete=True,
+        ) as f:
+            f.write('ok')
         return True

Also add import tempfile at the top of the file.

🧰 Tools
🪛 Ruff (0.15.6)

[warning] 66-66: Do not catch blind exception: Exception

(BLE001)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/routes/health.py` around lines 56 - 68, _check_uploads_dir
currently uses a fixed ".health_check" filename which causes races on os.remove;
change the probe to create a unique temp file inside the uploads folder (use
tempfile.NamedTemporaryFile or tempfile.mkstemp with dir=folder) write/read as
needed, close and unlink that unique path in a finally block to ensure cleanup,
and keep returning True/False on success/exception; also add import tempfile at
the top of the file.



# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------

@health_bp.route('/api/health', methods=['GET'])
def health_check():
    """
    Core health check — returns status of all stack components.

    Response schema::

        {
            "status": "healthy" | "degraded" | "unhealthy",
            "timestamp": "<ISO8601>",
            "components": {
                "backend": {"status": "up", "latency_ms": 0},
                "neo4j": {"status": "up", "latency_ms": 42},
                "ollama": {"status": "down", "latency_ms": -1},
                "uploads_dir": {"status": "ok", "writable": true}
            }
        }

    HTTP 200 when healthy/degraded, 503 when unhealthy.
    """
    components = {}
    overall = "healthy"

    # Neo4j — critical; down → unhealthy
    neo4j_ok, neo4j_ms = _check_neo4j()
    components["neo4j"] = {"status": "up" if neo4j_ok else "down", "latency_ms": neo4j_ms}
    if not neo4j_ok:
        overall = "unhealthy"

    # Ollama — important but not fatal; down → degraded (unless already unhealthy)
    ollama_ok, ollama_ms = _check_ollama()
    components["ollama"] = {"status": "up" if ollama_ok else "down", "latency_ms": ollama_ms}
    if not ollama_ok and overall == "healthy":
        overall = "degraded"

    # Uploads dir — needed for artifacts; unavailable → degraded
    uploads_ok = _check_uploads_dir()
    components["uploads_dir"] = {"status": "ok" if uploads_ok else "error", "writable": uploads_ok}
    if not uploads_ok and overall == "healthy":
        overall = "degraded"

    # Backend is trivially up (we got here)
    components["backend"] = {"status": "up", "latency_ms": 0}

    body = {
        "status": overall,
        "timestamp": datetime.now().isoformat(),
        "components": components,
    }

    # Surface only coarse ok/fail flags from stored preflight results: the raw
    # payload contains internal URLs, filesystem paths, model inventory, and
    # missing env var names, which must not leak through an unauthenticated
    # health endpoint.
    preflight = current_app.extensions.get('preflight')
    if preflight is not None:
        body["preflight"] = {
            name: {"ok": detail.get("ok", False)}
            for name, detail in preflight.items()
        }

    http_code = 503 if overall == "unhealthy" else 200
    return jsonify(body), http_code


@health_bp.route('/api/health/ready', methods=['GET'])
def readiness_check():
    """
    Readiness check — are ALL components up and ready for simulation?

    Prometheus should call this before starting any simulation run.
    Returns HTTP 200 when ready, 503 when not.
    """
    neo4j_ok, _ = _check_neo4j()
    ollama_ok, _ = _check_ollama()
    uploads_ok = _check_uploads_dir()

    all_ready = all((neo4j_ok, ollama_ok, uploads_ok))

    payload = {
        "ready": all_ready,
        "timestamp": datetime.now().isoformat(),
        "neo4j": neo4j_ok,
        "ollama": ollama_ok,
        "uploads": uploads_ok,
    }
    status_code = 200 if all_ready else 503
    return jsonify(payload), status_code
Loading