Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion backend/app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,22 @@ def log_response(response):
app.register_blueprint(simulation_bp, url_prefix='/api/simulation')
app.register_blueprint(report_bp, url_prefix='/api/report')

# Health check
# Register ops/health blueprints (Prometheus monitoring layer)
from .routes import health_bp, ops_bp
app.register_blueprint(health_bp)
app.register_blueprint(ops_bp)

# Legacy simple health route (keep for backward compatibility)
@app.route('/health')
def health():
return {'status': 'ok', 'service': 'MiroFish-Offline Backend'}

# Run startup preflight checks (logs warnings; never blocks startup)
from .preflight import run_preflight_checks
with app.app_context():
preflight_results = run_preflight_checks()
app.extensions['preflight'] = preflight_results

if should_log_startup:
logger.info("MiroFish-Offline Backend startup complete")

Expand Down
17 changes: 14 additions & 3 deletions backend/app/api/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
import os
import traceback
import threading
from flask import request, jsonify, send_file
from flask import request, jsonify, send_file, current_app

from . import report_bp
from ..config import Config
from ..services.report_agent import ReportAgent, ReportManager, ReportStatus
from ..services.graph_tools import GraphToolsService
from ..services.simulation_manager import SimulationManager
from ..models.project import ProjectManager
from ..models.task import TaskManager, TaskStatus
Expand Down Expand Up @@ -109,6 +110,14 @@ def generate_report():
import uuid
report_id = f"report_{uuid.uuid4().hex[:12]}"

# Capture GraphStorage before background thread starts
storage = current_app.extensions.get('neo4j_storage')
if not storage:
return jsonify({
"success": False,
"error": "GraphStorage not initialized - check Neo4j connection"
}), 500

# Create async task
task_manager = TaskManager()
task_id = task_manager.create_task(
Expand All @@ -130,11 +139,13 @@ def run_generate():
message="Initializing Report Agent..."
)

# Create Report Agent
# Create Report Agent with injected graph tools
tools = GraphToolsService(storage=storage)
agent = ReportAgent(
graph_id=graph_id,
simulation_id=simulation_id,
simulation_requirement=simulation_requirement
simulation_requirement=simulation_requirement,
graph_tools=tools
)

# Progress callback
Expand Down
1 change: 1 addition & 0 deletions backend/app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class Config:
LLM_API_KEY = os.environ.get('LLM_API_KEY')
LLM_BASE_URL = os.environ.get('LLM_BASE_URL', 'http://localhost:11434/v1')
LLM_MODEL_NAME = os.environ.get('LLM_MODEL_NAME', 'qwen2.5:32b')
NER_MODEL = os.environ.get('NER_MODEL', 'minimax-m2.7:cloud')

# Neo4j configuration
NEO4J_URI = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
Expand Down
147 changes: 147 additions & 0 deletions backend/app/preflight.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""
MiroFish Startup Preflight Checks
Run on app startup to detect missing services/config early.
Logs warnings but never blocks startup.
"""

import os
import time
import requests
import logging

from .config import Config

logger = logging.getLogger('mirofish.preflight')

# Required environment variable names (as strings, checked against os.environ)
_REQUIRED_ENV_VARS = [
'NEO4J_URI',
'NEO4J_PASSWORD',
'LLM_BASE_URL',
'LLM_MODEL_NAME',
]
Comment on lines +16 to +22
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Make env_vars check the environment, not the defaulted config.

This currently falls back to backend/app/config.py::Config, so NEO4J_PASSWORD, LLM_BASE_URL, and LLM_MODEL_NAME read as “present” even when nothing is set in os.environ. LLM_API_KEY is also missing from _REQUIRED_ENV_VARS, so the preflight can report success while backend/app/config.py::Config.validate() would still fail.

🛠️ Suggested fix
 _REQUIRED_ENV_VARS = [
+    'LLM_API_KEY',
     'NEO4J_URI',
     'NEO4J_PASSWORD',
     'LLM_BASE_URL',
     'LLM_MODEL_NAME',
 ]
@@
 def _check_required_env_vars() -> dict:
     """Check that required environment variables are set."""
-    missing = []
-    for var in _REQUIRED_ENV_VARS:
-        # Accept values set either via os.environ or via Config class attributes
-        env_val = os.environ.get(var)
-        config_attr = var.replace('LLM_BASE_URL', 'LLM_BASE_URL') \
-                        .replace('LLM_MODEL_NAME', 'LLM_MODEL_NAME')
-        config_val = getattr(Config, config_attr, None)
-        if not env_val and not config_val:
-            missing.append(var)
+    missing = [var for var in _REQUIRED_ENV_VARS if not os.environ.get(var)]
     return {"ok": len(missing) == 0, "missing": missing}

Also applies to: 91-102

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/preflight.py` around lines 16 - 22, The preflight currently
treats config defaults as "present" so update the check to read from the actual
environment: change logic that builds env_vars to inspect os.environ (e.g., use
k in os.environ or os.environ.get(k)) instead of importing values from
backend.app.config.Config; add 'LLM_API_KEY' to _REQUIRED_ENV_VARS so it is
validated too; apply the same fix to the later duplicate check around lines
91-102 (ensure both places reference os.environ directly and not Config) and
keep using the same unique symbols _REQUIRED_ENV_VARS and env_vars so the
preflight reports true missing env vars.



# ---------------------------------------------------------------------------
# Individual checks
# ---------------------------------------------------------------------------

def _check_neo4j() -> dict:
    """Verify Neo4j HTTP endpoint is reachable."""
    bolt_uri = Config.NEO4J_URI or 'bolt://localhost:7687'
    # The browser/HTTP API listens on 7474; derive its URL from the bolt URI.
    browser_uri = bolt_uri.replace('bolt://', 'http://').replace(':7687', ':7474')
    try:
        resp = requests.get(browser_uri, timeout=5)
    except Exception as e:
        return {"ok": False, "error": str(e)}
    if resp.status_code != 200:
        return {"ok": False, "url": browser_uri, "error": f"HTTP {resp.status_code}"}
    return {"ok": True, "url": browser_uri}


def _check_ollama() -> dict:
    """Verify Ollama /api/tags endpoint is reachable."""
    try:
        root = (Config.LLM_BASE_URL or 'http://localhost:11434/v1').rstrip('/')
        # Normalise docker-internal hostname when running outside a container.
        root = root.replace('host.docker.internal', 'localhost')
        # The OpenAI-compat '/v1' prefix is not part of Ollama's native API.
        if root.endswith('/v1'):
            root = root[:-3]
        tags_url = f"{root}/api/tags"
        resp = requests.get(tags_url, timeout=5)
        if resp.status_code != 200:
            return {"ok": False, "url": tags_url, "error": f"HTTP {resp.status_code}"}
        return {"ok": True, "url": tags_url}
    except Exception as e:
        return {"ok": False, "error": str(e)}


def _check_required_model() -> dict:
    """Verify the configured LLM model is present in Ollama's local registry.

    Returns a dict with ``ok`` and ``required``; when the model is missing,
    also includes ``available`` (the model names Ollama reported).
    """
    try:
        base = (Config.LLM_BASE_URL or 'http://localhost:11434/v1').rstrip('/')
        base = base.replace('host.docker.internal', 'localhost')
        if base.endswith('/v1'):
            base = base[:-3]
        r = requests.get(f"{base}/api/tags", timeout=5)
        # Fail fast on a non-200 instead of letting r.json() raise a
        # confusing decode error on an HTML/error response body.
        if r.status_code != 200:
            return {"ok": False, "error": f"HTTP {r.status_code} from Ollama /api/tags"}
        models = [m['name'] for m in r.json().get('models', [])]
        required = Config.LLM_MODEL_NAME
        ok = required in models
        result = {"ok": ok, "required": required}
        if not ok:
            result["available"] = models
        return result
    except Exception as e:
        return {"ok": False, "error": str(e)}


def _check_uploads_dir() -> dict:
    """Verify uploads directory exists and is writable.

    Uses a uniquely-named temporary file for the write probe so that
    concurrent checks cannot race on creating/removing a shared fixed path.
    """
    folder = os.path.abspath(Config.UPLOAD_FOLDER)
    try:
        os.makedirs(folder, exist_ok=True)
        # delete=True removes the probe file automatically on context exit.
        with tempfile.NamedTemporaryFile(mode='w', dir=folder,
                                         prefix='.preflight_', delete=True) as f:
            f.write('ok')
        return {"ok": True, "path": folder}
    except Exception as e:
        return {"ok": False, "path": folder, "error": str(e)}


def _check_required_env_vars() -> dict:
    """Check that required environment variables are set.

    Reads os.environ directly: Config supplies fallback defaults for several
    of these settings, so consulting Config would mask genuinely missing
    variables and report success where Config.validate() would still fail.
    """
    missing = [var for var in _REQUIRED_ENV_VARS if not os.environ.get(var)]
    return {"ok": not missing, "missing": missing}


# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------

def run_preflight_checks() -> dict:
    """
    Run all startup preflight checks.

    Logs warnings for any failures but does NOT prevent app startup.
    Returns a structured dict that the app can store and expose via /api/health.

    Example return value::

        {
            "neo4j": {"ok": True, "url": "http://localhost:7474"},
            "ollama": {"ok": False, "error": "Connection refused"},
            "required_model": {"ok": True, "required": "qwen2.5:32b"},
            "uploads_dir": {"ok": True, "path": "/app/uploads"},
            "env_vars": {"ok": True, "missing": []},
        }
    """
    logger.info("Running startup preflight checks...")

    # Dict literal preserves insertion order, so checks run and report
    # in this fixed sequence.
    checks = {
        'neo4j': _check_neo4j,
        'ollama': _check_ollama,
        'required_model': _check_required_model,
        'uploads_dir': _check_uploads_dir,
        'env_vars': _check_required_env_vars,
    }
    results: dict = {name: check() for name, check in checks.items()}

    failures = [name for name, outcome in results.items()
                if not outcome.get('ok', False)]

    if not failures:
        logger.info("✅ Preflight checks passed — all systems nominal")
    else:
        logger.warning("⚠️ Preflight warnings: %s", failures)
        for name in failures:
            outcome = results[name]
            detail = outcome.get('error') or outcome.get('missing') or 'check failed'
            logger.warning(" %s: %s", name, detail)

    return results
9 changes: 9 additions & 0 deletions backend/app/routes/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
MiroFish Routes Package
Ops and health endpoints for Prometheus monitoring.
"""

from .health import health_bp
from .ops import ops_bp

__all__ = ['health_bp', 'ops_bp']
154 changes: 154 additions & 0 deletions backend/app/routes/health.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
"""
MiroFish Health Endpoints
Prometheus-ready health checks for Neo4j, Ollama, and uploads directory.
"""

import os
import time
import requests
from datetime import datetime
from flask import Blueprint, jsonify, current_app

from ..config import Config
from ..utils.logger import get_logger

health_bp = Blueprint('health', __name__)
logger = get_logger('mirofish.health')


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

def _check_neo4j() -> tuple:
    """Ping the Neo4j HTTP browser endpoint. Returns (ok: bool, latency_ms: int)."""
    # bolt://host:7687 → http://host:7474 (the browser/HTTP API port)
    bolt_uri = Config.NEO4J_URI or 'bolt://localhost:7687'
    http_uri = bolt_uri.replace('bolt://', 'http://').replace(':7687', ':7474')
    try:
        started = time.monotonic()
        resp = requests.get(http_uri, timeout=5)
        elapsed_ms = int((time.monotonic() - started) * 1000)
    except Exception as e:
        logger.debug(f"Neo4j health check failed: {e}")
        return False, -1
    return resp.status_code == 200, elapsed_ms


def _check_ollama() -> tuple:
    """Ping Ollama /api/tags. Returns (ok: bool, latency_ms: int)."""
    try:
        base = (Config.LLM_BASE_URL or 'http://localhost:11434/v1').rstrip('/')
        # Normalise docker-internal hostname when running outside container
        base = base.replace('host.docker.internal', 'localhost')
        # NOTE: str.rstrip('/v1') strips *characters* ('/', 'v', '1') and can
        # mangle URLs such as "http://host:1111/v1" down to "http://host";
        # strip the literal '/v1' suffix instead.
        if base.endswith('/v1'):
            base = base[:-3]
        start = time.monotonic()
        r = requests.get(f"{base}/api/tags", timeout=5)
        ms = int((time.monotonic() - start) * 1000)
        return r.status_code == 200, ms
    except Exception as e:
        logger.debug(f"Ollama health check failed: {e}")
        return False, -1


def _check_uploads_dir() -> bool:
    """Verify the uploads directory exists and is writable.

    Writes a uniquely-named temporary probe file so that concurrent health
    probes cannot race each other removing a shared fixed-name file (which
    would flap the endpoint with false-negative failures).
    """
    try:
        folder = os.path.abspath(Config.UPLOAD_FOLDER)
        os.makedirs(folder, exist_ok=True)
        # delete=True removes the probe automatically on context exit.
        with tempfile.NamedTemporaryFile(mode='w', dir=folder,
                                         prefix='.health_check_', delete=True) as f:
            f.write('ok')
        return True
    except Exception as e:
        logger.debug(f"Uploads dir check failed: {e}")
        return False
Comment on lines +56 to +68
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Use a unique temp file for the writability probe.

Every request touches the same .health_check path. Two concurrent /api/health or /api/health/ready calls can race on os.remove() and flap the endpoint to false-negative failures.

🛠️ Suggested fix
-        test_path = os.path.join(folder, '.health_check')
-        with open(test_path, 'w') as f:
-            f.write('ok')
-        os.remove(test_path)
+        with tempfile.NamedTemporaryFile(
+            mode='w',
+            dir=folder,
+            prefix='.health_check_',
+            delete=True,
+        ) as f:
+            f.write('ok')
         return True

Also add import tempfile at the top of the file.

🧰 Tools
🪛 Ruff (0.15.6)

[warning] 66-66: Do not catch blind exception: Exception

(BLE001)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/routes/health.py` around lines 56 - 68, _check_uploads_dir
currently uses a fixed ".health_check" filename which causes races on os.remove;
change the probe to create a unique temp file inside the uploads folder (use
tempfile.NamedTemporaryFile or tempfile.mkstemp with dir=folder) write/read as
needed, close and unlink that unique path in a finally block to ensure cleanup,
and keep returning True/False on success/exception; also add import tempfile at
the top of the file.



# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------

@health_bp.route('/api/health', methods=['GET'])
def health_check():
    """
    Core health check — returns status of all stack components.

    Response schema::

        {
            "status": "healthy" | "degraded" | "unhealthy",
            "timestamp": "<ISO8601>",
            "components": {
                "backend": {"status": "up", "latency_ms": 0},
                "neo4j": {"status": "up", "latency_ms": 42},
                "ollama": {"status": "down", "latency_ms": -1},
                "uploads_dir": {"status": "ok", "writable": true}
            }
        }

    HTTP 200 when healthy/degraded, 503 when unhealthy.
    """
    components = {}
    overall = "healthy"

    # Neo4j — critical; down → unhealthy
    neo4j_ok, neo4j_ms = _check_neo4j()
    components["neo4j"] = {"status": "up" if neo4j_ok else "down", "latency_ms": neo4j_ms}
    if not neo4j_ok:
        overall = "unhealthy"

    # Ollama — important but not fatal; down → degraded (unless already unhealthy)
    ollama_ok, ollama_ms = _check_ollama()
    components["ollama"] = {"status": "up" if ollama_ok else "down", "latency_ms": ollama_ms}
    if not ollama_ok and overall == "healthy":
        overall = "degraded"

    # Uploads dir — needed for artifacts; unavailable → degraded
    uploads_ok = _check_uploads_dir()
    components["uploads_dir"] = {"status": "ok" if uploads_ok else "error", "writable": uploads_ok}
    if not uploads_ok and overall == "healthy":
        overall = "degraded"

    # Backend is trivially up (we got here)
    components["backend"] = {"status": "up", "latency_ms": 0}

    body = {
        "status": overall,
        "timestamp": datetime.now().isoformat(),
        "components": components,
    }

    # Surface only coarse ok/fail flags from stored preflight results: the raw
    # payload contains internal URLs, filesystem paths, model inventory, and
    # missing env var names, which must not leak through an unauthenticated
    # health endpoint.
    preflight = current_app.extensions.get('preflight')
    if preflight is not None:
        body["preflight"] = {
            name: {"ok": detail.get("ok", False)}
            for name, detail in preflight.items()
        }

    http_code = 503 if overall == "unhealthy" else 200
    return jsonify(body), http_code


@health_bp.route('/api/health/ready', methods=['GET'])
def readiness_check():
    """
    Readiness check — are ALL components up and ready for simulation?

    Prometheus should call this before starting any simulation run.
    Returns HTTP 200 when ready, 503 when not.
    """
    neo4j_ok, _ = _check_neo4j()
    ollama_ok, _ = _check_ollama()
    uploads_ok = _check_uploads_dir()

    all_ready = all((neo4j_ok, ollama_ok, uploads_ok))

    payload = {
        "ready": all_ready,
        "timestamp": datetime.now().isoformat(),
        "neo4j": neo4j_ok,
        "ollama": ollama_ok,
        "uploads": uploads_ok,
    }
    status_code = 200 if all_ready else 503
    return jsonify(payload), status_code
Loading