Skip to content

Commit e983950

Browse files
committed
fix: hook up startup_recovery with new functions
1 parent 86dcd6f commit e983950

File tree

3 files changed

+40
-28
lines changed

3 files changed

+40
-28
lines changed

api/src/backend/queries/agents.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,3 +183,7 @@ async def agent_startup_recovery(conn: asyncpg.Connection):
183183
# Legacy status recovery for backward compatibility
184184
await conn.execute("UPDATE miner_agents SET status = 'awaiting_screening_1' WHERE status = 'screening'")
185185
await conn.execute("UPDATE miner_agents SET status = 'waiting' WHERE status = 'evaluation'") # Legacy alias
186+
187+
@db_operation
async def set_agent_status_by_version_id(conn: asyncpg.Connection, version_id: str, status: str):
    """Set the ``status`` column of ``miner_agents`` rows matching ``version_id``.

    Args:
        conn: Database connection the UPDATE is executed on.
            NOTE(review): presumably injected by ``@db_operation`` -- the
            matching ``.pyi`` stub declares this function without ``conn``;
            confirm against the decorator.
        version_id: Value matched against the ``version_id`` column.
        status: Value written to the ``status`` column. It is passed through
            as-is; no validation happens here.
    """
    # Both values are bound as positional query parameters ($1, $2) rather
    # than interpolated into the SQL text.
    update_sql = "UPDATE miner_agents SET status = $1 WHERE version_id = $2"
    await conn.execute(update_sql, status, version_id)

api/src/backend/queries/agents.pyi

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,5 @@ async def set_approved_agents_to_awaiting_screening() -> List[MinerAgent]: ...
1515
async def get_all_approved_version_ids() -> List[str]: ...
1616
async def set_agent_status(version_id: str, status: str): ...
1717
async def upload_miner_agent(version_id: str, miner_hotkey: str, agent_name: str, version_num: int, ip_address: str): ...
18-
async def agent_startup_recovery() -> None: ...
18+
async def agent_startup_recovery() -> None: ...
19+
async def set_agent_status_by_version_id(version_id: str, status: str): ...

api/src/endpoints/model_replacers.py

Lines changed: 34 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
"""
44

55
from api.src.backend.entities import AgentStatus, MinerAgent
6-
from api.src.backend.queries.agents import set_agent_status
7-
from api.src.backend.queries.evaluations import get_evaluation_by_evaluation_id, get_running_evaluations, get_stuck_evaluations, get_waiting_evaluations, cancel_dangling_evaluation_runs, reset_evaluation_to_waiting
6+
from api.src.backend.queries.agents import get_top_agent, set_agent_status
7+
from api.src.backend.queries.evaluations import get_running_evaluations, get_stuck_evaluations, get_waiting_evaluations, cancel_dangling_evaluation_runs, reset_evaluation_to_waiting, update_evaluation_to_error
88
from api.src.backend.queries.agents import agent_startup_recovery
9-
from api.src.utils.config import SCREENING_1_THRESHOLD, SCREENING_2_THRESHOLD
9+
from api.src.endpoints.screener import atomically_update_agent_status, finish_evaluation, prune_queue
1010
from loggers.logging_utils import get_logger
1111

1212
logger = get_logger(__name__)
@@ -38,44 +38,51 @@ async def startup_recovery():
3838
# Reset running evaluations
3939
running_evals = await get_running_evaluations()
4040
for eval_row in running_evals:
41-
evaluation_id = eval_row["evaluation_id"]
42-
evaluation = await get_evaluation_by_evaluation_id(evaluation_id)
43-
if evaluation:
44-
if evaluation.is_screening:
45-
await evaluation.error("Disconnected from screener (error code 2)")
46-
else:
47-
# await evaluation.reset_to_waiting()
48-
# set evaluation to waiting, and its runs to cancelled
49-
await reset_evaluation_to_waiting(evaluation_id)
50-
# set agent status to waiting
51-
agent_version_id = evaluation.version_id
52-
await set_agent_status(
53-
version_id=agent_version_id,
54-
status=AgentStatus.waiting.value
55-
)
41+
evaluation_id = eval_row.evaluation_id
42+
agent_version_id = eval_row.version_id
43+
from api.src.models.screener import Screener
44+
is_screening = Screener.get_stage(eval_row.validator_hotkey) is not None
45+
if is_screening:
46+
await update_evaluation_to_error(evaluation_id, "Disconnected from screener (error code 2)")
47+
await atomically_update_agent_status(version_id=agent_version_id)
48+
else:
49+
# set evaluation to waiting, and its runs to cancelled
50+
await reset_evaluation_to_waiting(evaluation_id)
51+
# set agent status to waiting
52+
await set_agent_status(
53+
version_id=agent_version_id,
54+
status=AgentStatus.waiting.value
55+
)
5656

5757
# Check for running evaluations that should be auto-completed
5858
stuck_evaluations = await get_stuck_evaluations()
5959

6060
for stuck_eval in stuck_evaluations:
61-
evaluation = await get_evaluation_by_evaluation_id(stuck_eval.evaluation_id)
62-
if evaluation:
63-
logger.info(f"Auto-completing stuck evaluation {evaluation.evaluation_id} during startup recovery")
64-
# During startup recovery, don't trigger notifications
65-
_ = await evaluation.finish()
61+
evaluation_id = stuck_eval.evaluation_id
62+
# evaluation = await get_evaluation_by_evaluation_id(evaluation_id)
63+
validator_hotkey = stuck_eval.validator_hotkey
64+
65+
logger.info(f"Auto-completing stuck evaluation {evaluation_id} during startup recovery")
66+
# During startup recovery, don't trigger notifications
67+
_ = await finish_evaluation(evaluation_id, validator_hotkey, errored=True, reason="Platform restarted")
6668

6769
# Cancel waiting screenings for all screener types
6870
waiting_screenings = await get_waiting_evaluations()
6971
for screening_row in waiting_screenings:
70-
evaluation = await get_evaluation_by_evaluation_id(screening_row.evaluation_id)
71-
if evaluation:
72-
await evaluation.error("Disconnected from screener (error code 3)")
72+
evaluation_id = screening_row.evaluation_id
73+
evaluation_version_id = screening_row.version_id
74+
75+
# await evaluation.error("Disconnected from screener (error code 3)")
76+
await update_evaluation_to_error(evaluation_id, "Disconnected from screener (error code 3)")
77+
await atomically_update_agent_status(version_id=evaluation_version_id)
7378

7479
# Cancel dangling evaluation runs
7580
await cancel_dangling_evaluation_runs()
7681

7782
# Prune low-scoring evaluations that should not continue waiting
78-
# await Evaluation.prune_low_waiting(conn)
83+
top_agent = await get_top_agent()
84+
if top_agent:
85+
await prune_queue(top_agent)
7986

8087
logger.info("Application startup recovery completed with multi-stage screening support")
8188

0 commit comments

Comments
 (0)