|
3 | 3 | """ |
4 | 4 |
|
5 | 5 | from api.src.backend.entities import AgentStatus, MinerAgent |
6 | | -from api.src.backend.queries.agents import set_agent_status |
7 | | -from api.src.backend.queries.evaluations import get_evaluation_by_evaluation_id, get_running_evaluations, get_stuck_evaluations, get_waiting_evaluations, cancel_dangling_evaluation_runs, reset_evaluation_to_waiting |
| 6 | +from api.src.backend.queries.agents import get_top_agent, set_agent_status |
| 7 | +from api.src.backend.queries.evaluations import get_running_evaluations, get_stuck_evaluations, get_waiting_evaluations, cancel_dangling_evaluation_runs, reset_evaluation_to_waiting, update_evaluation_to_error |
8 | 8 | from api.src.backend.queries.agents import agent_startup_recovery |
9 | | -from api.src.utils.config import SCREENING_1_THRESHOLD, SCREENING_2_THRESHOLD |
| 9 | +from api.src.endpoints.screener import atomically_update_agent_status, finish_evaluation, prune_queue |
10 | 10 | from loggers.logging_utils import get_logger |
11 | 11 |
|
12 | 12 | logger = get_logger(__name__) |
@@ -38,44 +38,51 @@ async def startup_recovery(): |
38 | 38 | # Reset running evaluations |
39 | 39 | running_evals = await get_running_evaluations() |
40 | 40 | for eval_row in running_evals: |
41 | | - evaluation_id = eval_row["evaluation_id"] |
42 | | - evaluation = await get_evaluation_by_evaluation_id(evaluation_id) |
43 | | - if evaluation: |
44 | | - if evaluation.is_screening: |
45 | | - await evaluation.error("Disconnected from screener (error code 2)") |
46 | | - else: |
47 | | - # await evaluation.reset_to_waiting() |
48 | | - # set evaluation to waiting, and its runs to cancelled |
49 | | - await reset_evaluation_to_waiting(evaluation_id) |
50 | | - # set agent status to waiting |
51 | | - agent_version_id = evaluation.version_id |
52 | | - await set_agent_status( |
53 | | - version_id=agent_version_id, |
54 | | - status=AgentStatus.waiting.value |
55 | | - ) |
| 41 | + evaluation_id = eval_row.evaluation_id |
| 42 | + agent_version_id = eval_row.version_id |
| 43 | + from api.src.models.screener import Screener |
| 44 | + is_screening = Screener.get_stage(eval_row.validator_hotkey) is not None |
| 45 | + if is_screening: |
| 46 | + await update_evaluation_to_error(evaluation_id, "Disconnected from screener (error code 2)") |
| 47 | + await atomically_update_agent_status(version_id=agent_version_id) |
| 48 | + else: |
| 49 | + # set evaluation to waiting, and its runs to cancelled |
| 50 | + await reset_evaluation_to_waiting(evaluation_id) |
| 51 | + # set agent status to waiting |
| 52 | + await set_agent_status( |
| 53 | + version_id=agent_version_id, |
| 54 | + status=AgentStatus.waiting.value |
| 55 | + ) |
56 | 56 |
|
57 | 57 | # Check for running evaluations that should be auto-completed |
58 | 58 | stuck_evaluations = await get_stuck_evaluations() |
59 | 59 |
|
60 | 60 | for stuck_eval in stuck_evaluations: |
61 | | - evaluation = await get_evaluation_by_evaluation_id(stuck_eval.evaluation_id) |
62 | | - if evaluation: |
63 | | - logger.info(f"Auto-completing stuck evaluation {evaluation.evaluation_id} during startup recovery") |
64 | | - # During startup recovery, don't trigger notifications |
65 | | - _ = await evaluation.finish() |
| 61 | + evaluation_id = stuck_eval.evaluation_id |
| 62 | + # evaluation = await get_evaluation_by_evaluation_id(evaluation_id) |
| 63 | + validator_hotkey = stuck_eval.validator_hotkey |
| 64 | + |
| 65 | + logger.info(f"Auto-completing stuck evaluation {evaluation_id} during startup recovery") |
| 66 | + # During startup recovery, don't trigger notifications |
| 67 | + _ = await finish_evaluation(evaluation_id, validator_hotkey, errored=True, reason="Platform restarted") |
66 | 68 |
|
67 | 69 | # Cancel waiting screenings for all screener types |
68 | 70 | waiting_screenings = await get_waiting_evaluations() |
69 | 71 | for screening_row in waiting_screenings: |
70 | | - evaluation = await get_evaluation_by_evaluation_id(screening_row.evaluation_id) |
71 | | - if evaluation: |
72 | | - await evaluation.error("Disconnected from screener (error code 3)") |
| 72 | + evaluation_id = screening_row.evaluation_id |
| 73 | + evaluation_version_id = screening_row.version_id |
| 74 | + |
| 75 | + # await evaluation.error("Disconnected from screener (error code 3)") |
| 76 | + await update_evaluation_to_error(evaluation_id, "Disconnected from screener (error code 3)") |
| 77 | + await atomically_update_agent_status(version_id=evaluation_version_id) |
73 | 78 |
|
74 | 79 | # Cancel dangling evaluation runs |
75 | 80 | await cancel_dangling_evaluation_runs() |
76 | 81 |
|
77 | 82 | # Prune low-scoring evaluations that should not continue waiting |
78 | | - # await Evaluation.prune_low_waiting(conn) |
| 83 | + top_agent = await get_top_agent() |
| 84 | + if top_agent: |
| 85 | + await prune_queue(top_agent) |
79 | 86 |
|
80 | 87 | logger.info("Application startup recovery completed with multi-stage screening support") |
81 | 88 |
|
0 commit comments