Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 14 additions & 8 deletions modules/llm/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,14 +392,8 @@ async def admin_set_llm_backend(
"""Переключить LLM backend с горячей перезагрузкой сервиса"""
container = get_container()

# Stop bridge if currently running and switching away from it
current_service = container.llm_service
if current_service and getattr(current_service, "provider_type", None) == "claude_bridge":
from bridge_manager import bridge_manager

if bridge_manager.is_running:
logger.info("🛑 Stopping bridge (switching backend)...")
await bridge_manager.stop()
# NOTE: bridge is stopped AFTER successful switch (not before),
# so if the target backend fails, bridge remains running.

# Auto-convert "gemini" to default cloud Gemini provider
if request.backend == "gemini":
Expand Down Expand Up @@ -513,6 +507,18 @@ async def check_vllm_health() -> bool:
except ImportError:
raise HTTPException(status_code=503, detail="VLLMLLMService не доступен")

# Stop bridge only after vLLM is confirmed working
current_service = container.llm_service
if (
current_service
and getattr(current_service, "provider_type", None) == "claude_bridge"
):
from bridge_manager import bridge_manager

if bridge_manager.is_running:
logger.info("🛑 Stopping bridge (switching to vLLM)...")
await bridge_manager.stop()

container.llm_service = new_service
os.environ["LLM_BACKEND"] = "vllm"

Expand Down
26 changes: 26 additions & 0 deletions modules/llm/startup.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,3 +255,29 @@ async def auto_start_bridge() -> None:
logger.warning(f"🌉 Bridge auto-start failed: {result.get('error', 'unknown')}")
except Exception as e:
logger.error(f"🌉 Error during bridge auto-start: {e}")


async def bridge_health_check() -> None:
    """Restart the Claude bridge when it should be running but is not.

    Meant to be invoked periodically. The check is a no-op when no enabled
    ``claude_bridge`` provider is configured, or when the bridge manager
    already reports the bridge as running. Otherwise the bridge manager is
    asked to start the bridge and the outcome is logged.

    Any ``Exception`` raised while checking or restarting is logged and not
    propagated to the caller.
    """
    from bridge_manager import bridge_manager
    from db.integration import async_cloud_provider_manager

    try:
        enabled_bridges = await async_cloud_provider_manager.get_by_type(
            "claude_bridge", enabled_only=True
        )
        # Nothing to do: no bridge provider is enabled, or the bridge is up.
        # Short-circuit keeps `is_running` unread when no provider is enabled.
        if not enabled_bridges or bridge_manager.is_running:
            return

        # A bridge provider is enabled but the process is down: bring it back.
        logger.warning("🌉 Bridge not running, auto-restarting...")
        result = await bridge_manager.start()
        if result.get("status") != "ok":
            logger.warning(f"🌉 Bridge auto-restart failed: {result.get('error', 'unknown')}")
            return
        logger.info(f"🌉 Bridge auto-restarted on port {result.get('port', 8787)}")
    except Exception as e:
        # Log and swallow so a failed check never propagates to the caller.
        logger.error(f"🌉 Bridge health check error: {e}")
5 changes: 4 additions & 1 deletion orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ async def startup_event():
# Auto-start bots and bridge
from modules.channels.telegram.startup import auto_start_bots as auto_start_telegram
from modules.channels.whatsapp.startup import auto_start_bots as auto_start_whatsapp
from modules.llm.startup import auto_start_bridge
from modules.llm.startup import auto_start_bridge, bridge_health_check

await auto_start_telegram()
await auto_start_whatsapp()
Expand All @@ -260,6 +260,9 @@ async def startup_event():
from modules.kanban.tasks import sync_kanban_issues

task_registry.register("session-cleanup", cleanup_expired_sessions, interval=3600)
task_registry.register(
"bridge-health-check", bridge_health_check, interval=60, initial_delay=30
)
task_registry.register(
"periodic-vacuum", periodic_vacuum, interval=7 * 24 * 3600, initial_delay=24 * 3600
)
Expand Down
Loading