From 8f8ab15782497549506f9ceb9b71ec57aa8583d2 Mon Sep 17 00:00:00 2001
From: ShaerWare
Date: Tue, 31 Mar 2026 21:57:50 +0500
Subject: [PATCH] fix: prevent bridge from dying on failed backend switch + add auto-restart

Bridge was killed BEFORE verifying the target backend (vLLM) is available.
If vLLM failed to start, bridge stayed dead with no auto-recovery.
Now bridge is stopped only AFTER successful switch, and a periodic
health check (60s) auto-restarts it if it crashes.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 modules/llm/router.py  | 22 ++++++++++++++--------
 modules/llm/startup.py | 26 ++++++++++++++++++++++++++
 orchestrator.py        |  5 ++++-
 3 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/modules/llm/router.py b/modules/llm/router.py
index 697af37..ace4a45 100644
--- a/modules/llm/router.py
+++ b/modules/llm/router.py
@@ -392,14 +392,8 @@ async def admin_set_llm_backend(
     """Переключить LLM backend с горячей перезагрузкой сервиса"""
     container = get_container()
 
-    # Stop bridge if currently running and switching away from it
-    current_service = container.llm_service
-    if current_service and getattr(current_service, "provider_type", None) == "claude_bridge":
-        from bridge_manager import bridge_manager
-
-        if bridge_manager.is_running:
-            logger.info("🛑 Stopping bridge (switching backend)...")
-            await bridge_manager.stop()
+    # NOTE: bridge is stopped AFTER successful switch (not before),
+    # so if the target backend fails, bridge remains running.
 
     # Auto-convert "gemini" to default cloud Gemini provider
     if request.backend == "gemini":
@@ -513,6 +507,18 @@ async def check_vllm_health() -> bool:
             except ImportError:
                 raise HTTPException(status_code=503, detail="VLLMLLMService не доступен")
 
+            # Stop bridge only after vLLM is confirmed working
+            current_service = container.llm_service
+            if (
+                current_service
+                and getattr(current_service, "provider_type", None) == "claude_bridge"
+            ):
+                from bridge_manager import bridge_manager
+
+                if bridge_manager.is_running:
+                    logger.info("🛑 Stopping bridge (switching to vLLM)...")
+                    await bridge_manager.stop()
+
             container.llm_service = new_service
             os.environ["LLM_BACKEND"] = "vllm"
 
diff --git a/modules/llm/startup.py b/modules/llm/startup.py
index ef71033..5675f28 100644
--- a/modules/llm/startup.py
+++ b/modules/llm/startup.py
@@ -255,3 +255,29 @@ async def auto_start_bridge() -> None:
             logger.warning(f"🌉 Bridge auto-start failed: {result.get('error', 'unknown')}")
     except Exception as e:
         logger.error(f"🌉 Error during bridge auto-start: {e}")
+
+
+async def bridge_health_check() -> None:
+    """Periodic health check for bridge — auto-restart if crashed."""
+    from bridge_manager import bridge_manager
+    from db.integration import async_cloud_provider_manager
+
+    try:
+        bridge_providers = await async_cloud_provider_manager.get_by_type(
+            "claude_bridge", enabled_only=True
+        )
+        if not bridge_providers:
+            return
+
+        if bridge_manager.is_running:
+            return
+
+        # Bridge should be running but isn't — restart it
+        logger.warning("🌉 Bridge not running, auto-restarting...")
+        result = await bridge_manager.start()
+        if result.get("status") == "ok":
+            logger.info(f"🌉 Bridge auto-restarted on port {result.get('port', 8787)}")
+        else:
+            logger.warning(f"🌉 Bridge auto-restart failed: {result.get('error', 'unknown')}")
+    except Exception as e:
+        logger.error(f"🌉 Bridge health check error: {e}")
diff --git a/orchestrator.py b/orchestrator.py
index 48e4567..e74ac76 100644
--- a/orchestrator.py
+++ b/orchestrator.py
@@ -247,7 +247,7 @@ async def startup_event():
     # Auto-start bots and bridge
     from modules.channels.telegram.startup import auto_start_bots as auto_start_telegram
     from modules.channels.whatsapp.startup import auto_start_bots as auto_start_whatsapp
-    from modules.llm.startup import auto_start_bridge
+    from modules.llm.startup import auto_start_bridge, bridge_health_check
 
     await auto_start_telegram()
     await auto_start_whatsapp()
@@ -260,6 +260,9 @@ async def startup_event():
     from modules.kanban.tasks import sync_kanban_issues
 
     task_registry.register("session-cleanup", cleanup_expired_sessions, interval=3600)
+    task_registry.register(
+        "bridge-health-check", bridge_health_check, interval=60, initial_delay=30
+    )
     task_registry.register(
         "periodic-vacuum", periodic_vacuum, interval=7 * 24 * 3600, initial_delay=24 * 3600
     )