
Commit ffe9c94

makes requested changes

1 parent d96e395 commit ffe9c94

1 file changed: scripts/evaluate_best_checkpoint.py (+70 -76 lines)
@@ -10,24 +10,59 @@
 # Standard
 from pathlib import Path
 from typing import Optional
-from typing_extensions import Annotated
 import json
 
 # Third Party
 from rich import print
+from typing_extensions import Annotated
 import typer
 
 app = typer.Typer()
 
 
+def print_metrics(result: dict, checkpoint_name: str = None, prefix: str = ""):
+    """
+    Print formatted metrics for a checkpoint result.
+
+    Args:
+        result: The evaluation result dictionary
+        checkpoint_name: Optional checkpoint name to display
+        prefix: Optional prefix for each line
+    """
+    if checkpoint_name:
+        print(f"{prefix}[bold]Leaderboard results[/bold]: {checkpoint_name}")
+    print(f"{prefix}Overall: {result['overall_score'] * 100:.2f}%")
+    if "leaderboard_bbh" in result:
+        print(f"{prefix}BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
+    if "leaderboard_gpqa" in result:
+        print(f"{prefix}GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
+    if "leaderboard_ifeval" in result:
+        print(f"{prefix}IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
+    if "leaderboard_math_hard" in result:
+        print(
+            f"{prefix}MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%"
+        )
+    if "leaderboard_mmlu_pro" in result:
+        print(f"{prefix}MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%")
+    if "leaderboard_musr" in result:
+        print(f"{prefix}MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")
+
+
 @app.command()
 def best_checkpoint(
     input_dir: Path = typer.Argument(..., help="Input directory to process"),
     output_file: Optional[Path] = typer.Option(None, help="Optional output file path"),
-    tasks: Annotated[Optional[list[str]], typer.Option()] = None,
+    tasks: Annotated[
+        Optional[list[str]],
+        typer.Option(
+            help="Specific tasks to evaluate (e.g., 'leaderboard_bbh', 'leaderboard_gpqa')"
+        ),
+    ] = None,
+    num_gpus: int = typer.Option(8, help="Number of GPUs to use for evaluation"),
 ):
     """
-    Process files in the input directory and optionally save results to an output file.
+    Find the best checkpoint by evaluating all checkpoints in the input directory.
+    Processes all checkpoint subdirectories and ranks them by overall score.
     """
     if not input_dir.exists():
         typer.echo(f"Error: Input directory '{input_dir}' does not exist")
@@ -55,7 +90,7 @@ def best_checkpoint(
         typer.echo(f"Processing checkpoint: {checkpoint}")
         ckpt_output_file = checkpoint / "leaderboard_results.json"
         evaluator = LeaderboardV2Evaluator(
-            model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8
+            model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=num_gpus
         )
         if tasks:
             evaluator.tasks = tasks
@@ -72,28 +107,12 @@ def best_checkpoint(
         typer.echo(f"{'=' * 100}")
         # Add [BEST CHECKPOINT] label for the first checkpoint
         if i == 0:
-            typer.echo(
-                f"[bold]Leaderboard results[/bold]: {checkpoint_name} [bold green][BEST CHECKPOINT][/bold green]"
+            checkpoint_display = (
+                f"{checkpoint_name} [bold green][BEST CHECKPOINT][/bold green]"
             )
         else:
-            typer.echo(f"[bold]Leaderboard results[/bold]: {checkpoint_name}")
-        typer.echo(f"Overall: {result['overall_score'] * 100:.2f}%")
-        if "leaderboard_bbh" in result:
-            typer.echo(f"BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
-        if "leaderboard_gpqa" in result:
-            typer.echo(f"GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
-        if "leaderboard_ifeval" in result:
-            typer.echo(f"IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
-        if "leaderboard_math_hard" in result:
-            typer.echo(
-                f"MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%"
-            )
-        if "leaderboard_mmlu_pro" in result:
-            typer.echo(
-                f"MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
-            )
-        if "leaderboard_musr" in result:
-            typer.echo(f"MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")
+            checkpoint_display = checkpoint_name
+        print_metrics(result, checkpoint_display)
 
         typer.echo(f"{'=' * 100}")
         typer.echo(
@@ -113,10 +132,20 @@ def best_checkpoint(
 @app.command()
 def evaluate(
     input_dir: Path = typer.Argument(..., help="Input directory to process"),
-    tasks: Annotated[Optional[list[str]], typer.Option()] = None,
+    tasks: Annotated[
+        Optional[list[str]],
+        typer.Option(
+            help="Specific tasks to evaluate (e.g., 'leaderboard_bbh', 'leaderboard_gpqa')"
+        ),
+    ] = None,
+    num_gpus: int = typer.Option(8, help="Number of GPUs to use for evaluation"),
+    output_file: Optional[Path] = typer.Option(
+        None,
+        help="Custom output file path (default: input_dir/leaderboard_results.json)",
+    ),
 ):
     """
-    Process files in the input directory and optionally save results to an output file.
+    Evaluate a single checkpoint directory and save results to JSON file.
     """
     if not input_dir.exists():
         typer.echo(f"Error: Input directory '{input_dir}' does not exist")
@@ -133,30 +162,27 @@ def evaluate(
     typer.echo("done")
 
     evaluator = LeaderboardV2Evaluator(
-        model_path=str(input_dir), num_gpus=8, eval_config={"batch_size": "auto"}
+        model_path=str(input_dir), num_gpus=num_gpus, eval_config={"batch_size": "auto"}
     )
     if tasks:
         evaluator.tasks = tasks
     result = evaluator.run()
 
     # now just print out the checkpoint results
-    print(f"[bold]Leaderboard results[/bold]: {input_dir}")
-    print(f"Overall: {result['overall_score'] * 100:.2f}%")
-    if "leaderboard_bbh" in result:
-        print(f"BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
-    if "leaderboard_gpqa" in result:
-        print(f"GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
-    if "leaderboard_ifeval" in result:
-        print(f"IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
-    if "leaderboard_math_hard" in result:
-        print(f"MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%")
-    if "leaderboard_mmlu_pro" in result:
-        print(f"MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%")
-    if "leaderboard_musr" in result:
-        print(f"MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")
+    print_metrics(result, str(input_dir))
+
+    # Determine output file path
+    if output_file is None:
+        output_file = input_dir / "leaderboard_results.json"
+
+    # Check if file exists and warn user
+    if output_file.exists():
+        typer.echo(
+            f"Warning: Output file '{output_file}' already exists and will be overwritten"
+        )
 
-    output_file = input_dir / "leaderboard_results.json"
     output_file.write_text(json.dumps(result, indent=2))
+    typer.echo(f"Results saved to: {output_file}")
 
 
 @app.command()
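
Since results are now always persisted, a small follow-up sketch of reading the saved JSON back. The keys mirror what the command writes, but the checkpoint directory name is illustrative and assumes the default output location.

    import json
    from pathlib import Path

    # Default location when --output-file is not given (directory name is illustrative).
    saved = json.loads(
        Path("checkpoints/samples_1234/leaderboard_results.json").read_text()
    )
    print(f"Overall: {saved['overall_score'] * 100:.2f}%")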
@@ -218,43 +244,11 @@ def find_best(
         is_best = checkpoint == best_checkpoint
         prefix = "→ " if is_best else "  "
         print(f"\n{prefix}Checkpoint: {checkpoint}")
-        print(f"  Overall score: {score * 100:.2f}%")
-        if "leaderboard_bbh" in results:
-            print(f"  BBH: {results['leaderboard_bbh']['score'] * 100:.2f}%")
-        if "leaderboard_gpqa" in results:
-            print(f"  GPQA: {results['leaderboard_gpqa']['score'] * 100:.2f}%")
-        if "leaderboard_ifeval" in results:
-            print(f"  IFEval: {results['leaderboard_ifeval']['score'] * 100:.2f}%")
-        if "leaderboard_math_hard" in results:
-            print(
-                f"  MATH-Hard: {results['leaderboard_math_hard']['score'] * 100:.2f}%"
-            )
-        if "leaderboard_mmlu_pro" in results:
-            print(
-                f"  MMLU-Pro: {results['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
-            )
-        if "leaderboard_musr" in results:
-            print(f"  MUSR: {results['leaderboard_musr']['score'] * 100:.2f}%")
+        print_metrics(results, prefix="  ")
     else:
         # Print only best results
         print(f"\n[bold]Best checkpoint found[/bold]: {best_checkpoint}")
-        print(f"Overall score: {best_score * 100:.2f}%")
-        if "leaderboard_bbh" in best_results:
-            print(f"BBH: {best_results['leaderboard_bbh']['score'] * 100:.2f}%")
-        if "leaderboard_gpqa" in best_results:
-            print(f"GPQA: {best_results['leaderboard_gpqa']['score'] * 100:.2f}%")
-        if "leaderboard_ifeval" in best_results:
-            print(f"IFEval: {best_results['leaderboard_ifeval']['score'] * 100:.2f}%")
-        if "leaderboard_math_hard" in best_results:
-            print(
-                f"MATH-Hard: {best_results['leaderboard_math_hard']['score'] * 100:.2f}%"
-            )
-        if "leaderboard_mmlu_pro" in best_results:
-            print(
-                f"MMLU-Pro: {best_results['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
-            )
-        if "leaderboard_musr" in best_results:
-            print(f"MUSR: {best_results['leaderboard_musr']['score'] * 100:.2f}%")
+        print_metrics(best_results)
 
 
 if __name__ == "__main__":

0 commit comments

Comments
 (0)