@@ -106,16 +106,32 @@ async def start(self, conn: asyncpg.Connection) -> List[EvaluationRun]:
106106 async def finish (self , conn : asyncpg .Connection ):
107107 """Finish evaluation, but retry if >=50% of inferences failed and any run errored"""
108108
109- # DEBUG: Check if this evaluation already has a terminated_reason set
109+ # Check if evaluation is already completed - prevent duplicate finish calls
110+ current_status = await conn .fetchval (
111+ "SELECT status FROM evaluations WHERE evaluation_id = $1" ,
112+ self .evaluation_id
113+ )
114+
115+ if current_status == 'completed' :
116+ logger .warning (
117+ f"finish() called on already completed evaluation { self .evaluation_id } "
118+ f"(version: { self .version_id } , validator: { self .validator_hotkey } ). "
119+ f"Skipping to prevent agent status overwrites."
120+ )
121+ return None
122+
123+ # Use the evaluation lock to prevent race conditions with disconnection handling
124+ # async with Evaluation.get_lock():
125+ # DEBUG: Check if this evaluation already has a terminated_reason set
110126 current_terminated_reason = await conn .fetchval (
111127 "SELECT terminated_reason FROM evaluations WHERE evaluation_id = $1" ,
112128 self .evaluation_id
113129 )
114130 if current_terminated_reason :
115- current_status = await conn .fetchval (
116- "SELECT status FROM evaluations WHERE evaluation_id = $1" ,
117- self .evaluation_id
118- )
131+ # current_status = await conn.fetchval(
132+ # "SELECT status FROM evaluations WHERE evaluation_id = $1",
133+ # self.evaluation_id
134+ # )
119135
120136 # Print very noticeable debug information
121137 print ("=" * 80 )
@@ -484,19 +500,25 @@ async def create_screening_and_send(conn: asyncpg.Connection, agent: 'MinerAgent
484500 """Create screening evaluation"""
485501 from api .src .socket .websocket_manager import WebSocketManager
486502
487- # Safety check: Ensure screener doesn't already have a running evaluation
488- existing_evaluation = await conn .fetchrow (
489- """
490- SELECT evaluation_id, status FROM evaluations
491- WHERE validator_hotkey = $1 AND status = 'running'
492- LIMIT 1
493- """ ,
494- screener .hotkey
495- )
503+ # # Additional safety check: Ensure this agent doesn't already have a running screening at the same stage (lowk useless)
504+ # screener_stage = screener.stage
505+ # agent_running_screening = await conn.fetchval(
506+ # """
507+ # SELECT COUNT(*) FROM evaluations e
508+ # JOIN miner_agents ma ON e.version_id = ma.version_id
509+ # WHERE ma.version_id = $1
510+ # AND (
511+ # (e.validator_hotkey LIKE 'screener-1-%' OR e.validator_hotkey LIKE 'i-0%')
512+ # OR e.validator_hotkey LIKE 'screener-2-%'
513+ # )
514+ # AND e.status = 'running'
515+ # """,
516+ # agent.version_id
517+ # )
496518
497- if existing_evaluation :
498- logger .error (f"CRITICAL: Screener { screener . hotkey } already has running evaluation { existing_evaluation [ 'evaluation_id' ] } - refusing to create duplicate screening" )
499- return "" , False
519+ # if agent_running_screening > 0 :
520+ # logger.error(f"CRITICAL: Agent {agent.version_id } already has running screening - refusing to create duplicate screening")
521+ # return "", False
500522
501523 ws = WebSocketManager .get_instance ()
502524
@@ -575,7 +597,7 @@ async def screen_next_awaiting_agent(screener: "Screener"):
575597 # Log the agents for debugging
576598 awaiting_agents = await conn .fetch (
577599 """
578- SELECT version_id, miner_hotkey, agent_name, created_at FROM miner_agents
600+ SELECT version_id, miner_hotkey, agent_name, created_at, version_num, created_at FROM miner_agents
579601 WHERE status = $1
580602 AND miner_hotkey NOT IN (SELECT miner_hotkey from banned_hotkeys)
581603 ORDER BY created_at ASC
@@ -585,45 +607,20 @@ async def screen_next_awaiting_agent(screener: "Screener"):
585607 for agent in awaiting_agents [:3 ]: # Log first 3
586608 logger .info (f"Awaiting stage { screener .stage } agent: { agent ['agent_name' ]} ({ agent ['version_id' ]} ) from { agent ['miner_hotkey' ]} " )
587609
588- # Atomically claim the next awaiting agent for this stage using CTE with FOR UPDATE SKIP LOCKED
589- logger .debug (f"Stage { screener .stage } screener { screener .hotkey } attempting to claim agent with status '{ target_status } '" )
590- try :
591- claimed_agent = await conn .fetchrow (
592- """
593- WITH next_agent AS (
594- SELECT version_id FROM miner_agents
595- WHERE status = $1
596- AND miner_hotkey NOT IN (SELECT miner_hotkey from banned_hotkeys)
597- ORDER BY created_at ASC
598- FOR UPDATE SKIP LOCKED
599- LIMIT 1
600- )
601- UPDATE miner_agents
602- SET status = $2
603- FROM next_agent
604- WHERE miner_agents.version_id = next_agent.version_id
605- RETURNING miner_agents.version_id, miner_hotkey, agent_name, version_num, created_at
606- """ ,
607- target_status ,
608- target_screening_status
609- )
610- except Exception as e :
611- logger .warning (f"Database error while claiming agent for screener { screener .hotkey } : { e } " )
612- claimed_agent = None
613610
614- if not claimed_agent :
611+ else :
615612 screener .set_available () # Ensure available state is set
616613 logger .info (f"No stage { screener .stage } agents claimed by screener { screener .hotkey } despite { awaiting_count } awaiting" )
617614 return
618615
619- logger .info (f"Stage { screener .stage } screener { screener .hotkey } claimed agent { claimed_agent [ 'agent_name' ]} ({ claimed_agent ['version_id' ]} )" )
616+ logger .info (f"Stage { screener .stage } screener { screener .hotkey } claimed agent { awaiting_agents [ 0 ][ 'agent_name' ]} ({ awaiting_agents [ 0 ] ['version_id' ]} )" )
620617
621618 agent = MinerAgent (
622- version_id = claimed_agent ["version_id" ],
623- miner_hotkey = claimed_agent ["miner_hotkey" ],
624- agent_name = claimed_agent ["agent_name" ],
625- version_num = claimed_agent ["version_num" ],
626- created_at = claimed_agent ["created_at" ],
619+ version_id = awaiting_agents [ 0 ] ["version_id" ],
620+ miner_hotkey = awaiting_agents [ 0 ] ["miner_hotkey" ],
621+ agent_name = awaiting_agents [ 0 ] ["agent_name" ],
622+ version_num = awaiting_agents [ 0 ] ["version_num" ],
623+ created_at = awaiting_agents [ 0 ] ["created_at" ],
627624 status = target_screening_status , # Already set to correct status in query
628625 )
629626
@@ -769,6 +766,7 @@ async def handle_validator_disconnection(validator_hotkey: str):
769766 @staticmethod
770767 async def handle_screener_disconnection (screener_hotkey : str ):
771768 """Atomically handle screener disconnection: error active evaluations and reset agents"""
769+ # async with Evaluation.get_lock():
772770 async with get_transaction () as conn :
773771 # Get active screening evaluations for all screener types
774772 active_screenings = await conn .fetch (
0 commit comments