7 changes: 7 additions & 0 deletions api/endpoints/statistics.py
@@ -2,6 +2,7 @@
import datetime

from pydantic import BaseModel
from queries.statistics import ValidatorStats, get_validator_stats
from utils.ttl import ttl_cache
from typing import List, Optional
from fastapi import APIRouter, HTTPException
@@ -51,3 +52,9 @@ async def problem_statistics(set_id: Optional[int] = None) -> ProblemStatisticsR
problem_set_id=set_id,
problem_set_created_at=problem_set_created_at
)

# /statistics/validator-stats
@router.get("/validator-stats")
@ttl_cache(ttl_seconds=15*60) # 15 mins
async def validator_stats() -> List[ValidatorStats]:
return await get_validator_stats()
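Responses are cached for 15 minutes by ttl_cache. A minimal sketch of exercising the new endpoint, assuming the router is mounted under /statistics (as the comment above suggests) and the API is served locally on port 8000 (both are assumptions, not part of this change):

import httpx

# Fetch per-validator statistics; the base URL is a placeholder for the real deployment.
resp = httpx.get("http://localhost:8000/statistics/validator-stats")
resp.raise_for_status()
for v in resp.json():
    print(v["validator_hotkey"], v["num_eval_runs"], v["runtime_median"])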
184 changes: 183 additions & 1 deletion queries/statistics.py
@@ -1,8 +1,9 @@
import json
import api.config as config

from datetime import datetime
from pydantic import BaseModel
from typing import Dict, Optional
from typing import List, Dict, Optional
from models.evaluation_set import EvaluationSetGroup
from utils.database import db_operation, DatabaseConnection

@@ -278,3 +279,184 @@ async def get_problem_set_creation_times(conn: DatabaseConnection) -> list[Probl
)

return [ProblemSetCreationTime(**row) for row in rows]



class ValidatorStatsErrorCodeInfo(BaseModel):
error_code: int
count: int

class ValidatorStats(BaseModel):
validator_hotkey: str

num_evals: int

num_eval_runs: int
num_success_eval_runs: int
num_pass_eval_runs: int
num_fail_eval_runs: int
num_error_eval_runs: int
error_code_distribution: List[ValidatorStatsErrorCodeInfo]

num_inferences: int
num_embeddings: int

runtime_min: Optional[float] = None
runtime_q1: Optional[float] = None
runtime_median: Optional[float] = None
runtime_q3: Optional[float] = None
runtime_max: Optional[float] = None
runtime_mean: Optional[float] = None

score_min: Optional[float] = None
score_q1: Optional[float] = None
score_median: Optional[float] = None
score_q3: Optional[float] = None
score_max: Optional[float] = None
score_mean: Optional[float] = None

def __init__(self, **data):
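# json_agg in the query below returns error_code_distribution as a JSON string;
# decode it into ValidatorStatsErrorCodeInfo objects before pydantic validation.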
if "error_code_distribution" in data:
data["error_code_distribution"] = [ValidatorStatsErrorCodeInfo(**item) for item in json.loads(data["error_code_distribution"])]

super().__init__(**data)

@db_operation
async def get_validator_stats(conn: DatabaseConnection) -> List[ValidatorStats]:
rows = await conn.fetch(
"""
WITH current_set AS (
SELECT MAX(set_id) as set_id FROM evaluation_sets
),
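-- One row per evaluation run in the current set, with its runtime and per-run
-- inference/embedding counts; banned, unapproved, and benchmark agents are excluded.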
validator_eval_runs AS (
SELECT
e.validator_hotkey,
erh.evaluation_id,
erh.evaluation_run_id,
erh.status,
erh.solved,
erh.error_code,
erh.started_initializing_agent_at,
erh.finished_or_errored_at,
EXTRACT(EPOCH FROM (erh.finished_or_errored_at - erh.started_initializing_agent_at)) as runtime_seconds,
COUNT(DISTINCT i.inference_id) as num_inferences,
COUNT(DISTINCT em.embedding_id) as num_embeddings
FROM evaluations e
JOIN evaluation_runs_hydrated erh ON e.evaluation_id = erh.evaluation_id
JOIN agents a ON e.agent_id = a.agent_id
LEFT JOIN inferences i ON erh.evaluation_run_id = i.evaluation_run_id
LEFT JOIN embeddings em ON erh.evaluation_run_id = em.evaluation_run_id
WHERE e.set_id = (SELECT set_id FROM current_set)
AND e.evaluation_set_group = 'validator'::EvaluationSetGroup
AND a.miner_hotkey NOT IN (SELECT miner_hotkey FROM banned_hotkeys)
AND e.agent_id NOT IN (SELECT agent_id FROM unapproved_agent_ids)
AND e.agent_id NOT IN (SELECT agent_id FROM benchmark_agent_ids)
GROUP BY
e.validator_hotkey,
erh.evaluation_run_id,
erh.evaluation_id,
erh.status,
erh.error_code,
erh.finished_or_errored_at,
erh.started_initializing_agent_at,
erh.solved
),
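-- Per-validator run counts, split by status and solved/unsolved.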
validator_stats AS (
SELECT
validator_hotkey,
COUNT(DISTINCT evaluation_id) as num_evals,
COUNT(evaluation_run_id) as num_eval_runs,
COUNT(*) FILTER (WHERE status = 'finished') as num_success_eval_runs,
COUNT(*) FILTER (WHERE status = 'finished' AND solved) as num_pass_eval_runs,
COUNT(*) FILTER (WHERE status = 'finished' AND NOT solved) as num_fail_eval_runs,
COUNT(*) FILTER (WHERE status = 'error') as num_error_eval_runs,
COALESCE(SUM(num_inferences), 0) as num_inferences,
COALESCE(SUM(num_embeddings), 0) as num_embeddings
FROM validator_eval_runs
GROUP BY validator_hotkey
),
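-- Per-validator distribution of error codes across errored runs, aggregated as a JSON array.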
error_code_distribution AS (
SELECT
validator_hotkey,
COALESCE(
json_agg(
jsonb_build_object('error_code', error_code, 'count', error_count)
),
'[]'::json
) as error_code_distribution
FROM (
SELECT
validator_hotkey,
error_code,
COUNT(*) as error_count
FROM validator_eval_runs
WHERE status = 'error' AND error_code IS NOT NULL
GROUP BY validator_hotkey, error_code
) AS error_counts
GROUP BY validator_hotkey
),
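-- Five-number summary and mean of per-run runtimes for each validator.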
runtime_quartiles AS (
SELECT
validator_hotkey,
PERCENTILE_CONT(0.00) WITHIN GROUP (ORDER BY runtime_seconds) as runtime_min,
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY runtime_seconds) as runtime_q1,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY runtime_seconds) as runtime_median,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY runtime_seconds) as runtime_q3,
PERCENTILE_CONT(1.00) WITHIN GROUP (ORDER BY runtime_seconds) as runtime_max,
AVG(runtime_seconds) as runtime_mean
FROM validator_eval_runs
WHERE runtime_seconds IS NOT NULL
GROUP BY validator_hotkey
),
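-- Per-evaluation pass rate (solved runs / finished runs), then its
-- five-number summary and mean per validator.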
score_quartiles AS (
SELECT
validator_hotkey,
PERCENTILE_CONT(0.00) WITHIN GROUP (ORDER BY score) as score_min,
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY score) as score_q1,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY score) as score_median,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY score) as score_q3,
PERCENTILE_CONT(1.00) WITHIN GROUP (ORDER BY score) as score_max,
AVG(score) as score_mean
FROM (
SELECT
validator_hotkey,
evaluation_id,
COUNT(*) FILTER (WHERE solved = true)::float / NULLIF(COUNT(*) FILTER (WHERE status = 'finished'), 0) AS score
FROM validator_eval_runs
WHERE status IN ('finished', 'error')
GROUP BY validator_hotkey, evaluation_id
HAVING COUNT(*) FILTER (WHERE status = 'finished') > 0
) AS eval_scores
GROUP BY validator_hotkey
)
SELECT
vs.validator_hotkey,
vs.num_evals,
vs.num_eval_runs,
vs.num_success_eval_runs,
vs.num_pass_eval_runs,
vs.num_fail_eval_runs,
vs.num_error_eval_runs,
ecd.error_code_distribution,
vs.num_inferences,
vs.num_embeddings,
rq.runtime_min,
rq.runtime_q1,
rq.runtime_median,
rq.runtime_q3,
rq.runtime_max,
rq.runtime_mean,
sq.score_min,
sq.score_q1,
sq.score_median,
sq.score_q3,
sq.score_max,
sq.score_mean
FROM validator_stats vs
LEFT JOIN error_code_distribution ecd ON vs.validator_hotkey = ecd.validator_hotkey
LEFT JOIN runtime_quartiles rq ON vs.validator_hotkey = rq.validator_hotkey
LEFT JOIN score_quartiles sq ON vs.validator_hotkey = sq.validator_hotkey
ORDER BY vs.validator_hotkey;
"""
)
return [ValidatorStats(**row) for row in rows]
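A minimal sketch of how the model handles a database row, assuming the json_agg output shape produced by the query above (all literal values are illustrative placeholders):

# Hypothetical row as returned by conn.fetch; error_code_distribution is a JSON string.
row = {
    "validator_hotkey": "validator-1",
    "num_evals": 3,
    "num_eval_runs": 12,
    "num_success_eval_runs": 10,
    "num_pass_eval_runs": 7,
    "num_fail_eval_runs": 3,
    "num_error_eval_runs": 2,
    "error_code_distribution": '[{"error_code": 408, "count": 2}]',
    "num_inferences": 140,
    "num_embeddings": 25,
}

stats = ValidatorStats(**row)
assert stats.error_code_distribution[0].error_code == 408
# The runtime_* and score_* fields default to None when absent from the row.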