diff --git a/api/endpoints/statistics.py b/api/endpoints/statistics.py
index 99d0f0a2..397af9be 100644
--- a/api/endpoints/statistics.py
+++ b/api/endpoints/statistics.py
@@ -2,6 +2,7 @@
 import datetime
 
 from pydantic import BaseModel
+from queries.statistics import ValidatorStats, get_validator_stats
 from utils.ttl import ttl_cache
 from typing import List, Optional
 from fastapi import APIRouter, HTTPException
@@ -51,3 +52,9 @@ async def problem_statistics(set_id: Optional[int] = None) -> ProblemStatisticsR
         problem_set_id=set_id,
         problem_set_created_at=problem_set_created_at
     )
+
+# /statistics/validator-stats
+@router.get("/validator-stats")
+@ttl_cache(ttl_seconds=15 * 60)  # 15 mins
+async def validator_stats() -> List[ValidatorStats]:
+    return await get_validator_stats()
diff --git a/queries/statistics.py b/queries/statistics.py
index ec5b2e2a..6c6292c4 100644
--- a/queries/statistics.py
+++ b/queries/statistics.py
@@ -1,8 +1,9 @@
+import json
 import api.config as config
 
 from datetime import datetime
 from pydantic import BaseModel
-from typing import Dict, Optional
+from typing import List, Dict, Optional
 
 from models.evaluation_set import EvaluationSetGroup
 from utils.database import db_operation, DatabaseConnection
@@ -278,3 +279,184 @@ async def get_problem_set_creation_times(conn: DatabaseConnection) -> list[Probl
     )
 
     return [ProblemSetCreationTime(**row) for row in rows]
+
+
+
+class ValidatorStatsErrorCodeInfo(BaseModel):
+    error_code: int
+    count: int
+
+class ValidatorStats(BaseModel):
+    validator_hotkey: str
+
+    num_evals: int
+
+    num_eval_runs: int
+    num_success_eval_runs: int
+    num_pass_eval_runs: int
+    num_fail_eval_runs: int
+    num_error_eval_runs: int
+    error_code_distribution: List[ValidatorStatsErrorCodeInfo]
+
+    num_inferences: int
+    num_embeddings: int
+
+    runtime_min: Optional[float] = None
+    runtime_q1: Optional[float] = None
+    runtime_median: Optional[float] = None
+    runtime_q3: Optional[float] = None
+    runtime_max: Optional[float] = None
+    runtime_mean: Optional[float] = None
+
+    score_min: Optional[float] = None
+    score_q1: Optional[float] = None
+    score_median: Optional[float] = None
+    score_q3: Optional[float] = None
+    score_max: Optional[float] = None
+    score_mean: Optional[float] = None
+
+    def __init__(self, **data):
+        if isinstance(data.get("error_code_distribution"), str):  # the DB driver returns the json_agg column as a string
+            data["error_code_distribution"] = [ValidatorStatsErrorCodeInfo(**item) for item in json.loads(data["error_code_distribution"])]
+
+        super().__init__(**data)
+
+@db_operation
+async def get_validator_stats(conn: DatabaseConnection) -> List[ValidatorStats]:
+    rows = await conn.fetch(
+        """
+        WITH current_set AS (
+            SELECT MAX(set_id) as set_id FROM evaluation_sets
+        ),
+        validator_eval_runs AS (
+            SELECT
+                e.validator_hotkey,
+                erh.evaluation_id,
+                erh.evaluation_run_id,
+                erh.status,
+                erh.solved,
+                erh.error_code,
+                erh.started_initializing_agent_at,
+                erh.finished_or_errored_at,
+                EXTRACT(EPOCH FROM (erh.finished_or_errored_at - erh.started_initializing_agent_at)) as runtime_seconds,
+                COUNT(DISTINCT i.inference_id) as num_inferences,
+                COUNT(DISTINCT em.embedding_id) as num_embeddings
+            FROM evaluations e
+            JOIN evaluation_runs_hydrated erh ON e.evaluation_id = erh.evaluation_id
+            JOIN agents a ON e.agent_id = a.agent_id
+            LEFT JOIN inferences i ON erh.evaluation_run_id = i.evaluation_run_id
+            LEFT JOIN embeddings em ON erh.evaluation_run_id = em.evaluation_run_id
+            WHERE e.set_id = (SELECT set_id FROM current_set)
+                AND e.evaluation_set_group = 'validator'::EvaluationSetGroup
+                AND a.miner_hotkey NOT IN (SELECT miner_hotkey FROM banned_hotkeys)
+                AND e.agent_id NOT IN (SELECT agent_id FROM unapproved_agent_ids)
+                AND e.agent_id NOT IN (SELECT agent_id FROM benchmark_agent_ids)
+            GROUP BY
+                e.validator_hotkey,
+                erh.evaluation_run_id,
+                erh.evaluation_id,
+                erh.status,
+                erh.error_code,
+                erh.finished_or_errored_at,
+                erh.started_initializing_agent_at,
+                erh.solved
+        ),
+        validator_stats AS (
+            SELECT
+                validator_hotkey,
+                COUNT(DISTINCT evaluation_id) as num_evals,
+                COUNT(evaluation_run_id) as num_eval_runs,
+                COUNT(*) FILTER (WHERE status = 'finished') as num_success_eval_runs,
+                COUNT(*) FILTER (WHERE status = 'finished' AND solved) as num_pass_eval_runs,
+                COUNT(*) FILTER (WHERE status = 'finished' AND NOT solved) as num_fail_eval_runs,
+                COUNT(*) FILTER (WHERE status = 'error') as num_error_eval_runs,
+                COALESCE(SUM(num_inferences), 0) as num_inferences,
+                COALESCE(SUM(num_embeddings), 0) as num_embeddings
+            FROM validator_eval_runs
+            GROUP BY validator_hotkey
+        ),
+        error_code_distribution AS (
+            SELECT
+                validator_hotkey,
+                COALESCE(
+                    json_agg(
+                        jsonb_build_object('error_code', error_code, 'count', error_count)
+                    ),
+                    '[]'::json
+                ) as error_code_distribution
+            FROM (
+                SELECT
+                    validator_hotkey,
+                    error_code,
+                    COUNT(*) as error_count
+                FROM validator_eval_runs
+                WHERE status = 'error' AND error_code IS NOT NULL
+                GROUP BY validator_hotkey, error_code
+            ) AS error_counts  -- subqueries in FROM require an alias in Postgres
+            GROUP BY validator_hotkey
+        ),
+        runtime_quartiles AS (
+            SELECT
+                validator_hotkey,
+                PERCENTILE_CONT(0.00) WITHIN GROUP (ORDER BY runtime_seconds) as runtime_min,
+                PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY runtime_seconds) as runtime_q1,
+                PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY runtime_seconds) as runtime_median,
+                PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY runtime_seconds) as runtime_q3,
+                PERCENTILE_CONT(1.00) WITHIN GROUP (ORDER BY runtime_seconds) as runtime_max,
+                AVG(runtime_seconds) as runtime_mean
+            FROM validator_eval_runs
+            WHERE runtime_seconds IS NOT NULL
+            GROUP BY validator_hotkey
+        ),
+        score_quartiles AS (
+            SELECT
+                validator_hotkey,
+                PERCENTILE_CONT(0.00) WITHIN GROUP (ORDER BY score) as score_min,
+                PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY score) as score_q1,
+                PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY score) as score_median,
+                PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY score) as score_q3,
+                PERCENTILE_CONT(1.00) WITHIN GROUP (ORDER BY score) as score_max,
+                AVG(score) as score_mean
+            FROM (
+                SELECT
+                    validator_hotkey,
+                    evaluation_id,
+                    COUNT(*) FILTER (WHERE solved = true)::float / NULLIF(COUNT(*) FILTER (WHERE status = 'finished'), 0) AS score
+                FROM validator_eval_runs
+                WHERE status IN ('finished', 'error')
+                GROUP BY validator_hotkey, evaluation_id
+                HAVING COUNT(*) FILTER (WHERE status = 'finished') > 0
+            ) AS eval_scores  -- subqueries in FROM require an alias in Postgres
+            GROUP BY validator_hotkey
+        )
+        SELECT
+            vs.validator_hotkey,
+            vs.num_evals,
+            vs.num_eval_runs,
+            vs.num_success_eval_runs,
+            vs.num_pass_eval_runs,
+            vs.num_fail_eval_runs,
+            vs.num_error_eval_runs,
+            COALESCE(ecd.error_code_distribution, '[]'::json) as error_code_distribution,  -- LEFT JOIN yields NULL for validators with no error runs
+            vs.num_inferences,
+            vs.num_embeddings,
+            rq.runtime_min,
+            rq.runtime_q1,
+            rq.runtime_median,
+            rq.runtime_q3,
+            rq.runtime_max,
+            rq.runtime_mean,
+            sq.score_min,
+            sq.score_q1,
+            sq.score_median,
+            sq.score_q3,
+            sq.score_max,
+            sq.score_mean
+        FROM validator_stats vs
+        LEFT JOIN error_code_distribution ecd ON vs.validator_hotkey = ecd.validator_hotkey
+        LEFT JOIN runtime_quartiles rq ON vs.validator_hotkey = rq.validator_hotkey
+        LEFT JOIN score_quartiles sq ON vs.validator_hotkey = sq.validator_hotkey
+        ORDER BY vs.validator_hotkey;
+        """
+    )
+    return [ValidatorStats(**row) for row in rows]
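
Reviewer note: the __init__ override on ValidatorStats works, but if the project is on pydantic v2, a mode="before" field validator is the more idiomatic way to decode the JSON column before validation. A minimal sketch, assuming pydantic v2; ValidatorStatsSketch is a hypothetical stand-in with the model body elided:

import json
from typing import List
from pydantic import BaseModel, field_validator

class ValidatorStatsSketch(BaseModel):  # hypothetical stand-in for ValidatorStats
    error_code_distribution: List[dict]  # element type simplified for the sketch

    @field_validator("error_code_distribution", mode="before")
    @classmethod
    def decode_json_column(cls, v):
        # json_agg comes back from the driver as a string; decode it and let
        # pydantic validate the resulting list of dicts as usual.
        return json.loads(v) if isinstance(v, str) else v

This also drops the manual super().__init__ plumbing and keeps validation behavior identical for callers that already pass a parsed list.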
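A quick smoke check for the new route once the API is running; host and port are placeholders, not taken from the repo config, and because of the ttl_cache decorator repeated calls within 15 minutes return the same payload:

import httpx

resp = httpx.get("http://localhost:8000/statistics/validator-stats")
resp.raise_for_status()
for row in resp.json():
    # Every row should satisfy the ValidatorStats shape defined above.
    print(row["validator_hotkey"], row["num_eval_runs"], row["num_error_eval_runs"])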