# Standard Library
import json
from pathlib import Path
from typing import Optional

# Third Party
from rich import print
from typing_extensions import Annotated
import typer

app = typer.Typer()


def print_metrics(result: dict, checkpoint_name: Optional[str] = None, prefix: str = ""):
    """
    Print formatted metrics for a checkpoint result.

    Args:
        result: The evaluation result dictionary
        checkpoint_name: Optional checkpoint name to display
        prefix: Optional prefix for each line
    """
    if checkpoint_name:
        print(f"{prefix}[bold]Leaderboard results[/bold]: {checkpoint_name}")
    print(f"{prefix}Overall: {result['overall_score'] * 100:.2f}%")
    if "leaderboard_bbh" in result:
        print(f"{prefix}BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
    if "leaderboard_gpqa" in result:
        print(f"{prefix}GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
    if "leaderboard_ifeval" in result:
        print(f"{prefix}IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
    if "leaderboard_math_hard" in result:
        print(f"{prefix}MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%")
    if "leaderboard_mmlu_pro" in result:
        print(f"{prefix}MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%")
    if "leaderboard_musr" in result:
        print(f"{prefix}MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")

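# A minimal usage sketch for print_metrics (the values and checkpoint name
# below are made up; the keys mirror the ones the helper checks for):
#
#   print_metrics(
#       {"overall_score": 0.4321, "leaderboard_bbh": {"score": 0.50}},
#       checkpoint_name="samples_12000",
#   )
#
# This would print, via rich, a bolded header followed by "Overall: 43.21%"
# and "BBH: 50.00%".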

@app.command()
def best_checkpoint(
    input_dir: Path = typer.Argument(..., help="Input directory to process"),
    output_file: Optional[Path] = typer.Option(None, help="Optional output file path"),
    tasks: Annotated[
        Optional[list[str]],
        typer.Option(
            help="Specific tasks to evaluate (e.g., 'leaderboard_bbh', 'leaderboard_gpqa')"
        ),
    ] = None,
    num_gpus: int = typer.Option(8, help="Number of GPUs to use for evaluation"),
):
2663 """
27- Process files in the input directory and optionally save results to an output file.
64+ Find the best checkpoint by evaluating all checkpoints in the input directory.
65+ Processes all checkpoint subdirectories and ranks them by overall score.
2866 """
    if not input_dir.exists():
        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
        raise typer.Exit(1)

    # ... (not shown in the source diff: checkpoint discovery and the start of
    # the per-checkpoint loop) ...
        typer.echo(f"Processing checkpoint: {checkpoint}")
        ckpt_output_file = checkpoint / "leaderboard_results.json"
        evaluator = LeaderboardV2Evaluator(
            model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=num_gpus
        )
        if tasks:
            evaluator.tasks = tasks
        result = evaluator.run()
        checkpoint_results[checkpoint.name] = result
        typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}")

    # Sort checkpoints by overall score, best first
    sorted_checkpoints = sorted(
        checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True
    )
    typer.echo("Sorted checkpoints by score:")
    for i, (checkpoint_name, result) in enumerate(sorted_checkpoints):
        typer.echo(f"{'=' * 100}")
        # Add a [BEST CHECKPOINT] label for the first (highest-scoring) checkpoint
        if i == 0:
            checkpoint_display = (
                f"{checkpoint_name} [bold green][BEST CHECKPOINT][/bold green]"
            )
        else:
            checkpoint_display = checkpoint_name
        print_metrics(result, checkpoint_display)

    typer.echo(f"{'=' * 100}")
    # Use rich's print here so the [bold green] markup is rendered rather than
    # printed literally (typer.echo does not interpret rich markup)
    print(
        f"Best checkpoint: {sorted_checkpoints[0][0]} [bold green][BEST CHECKPOINT][/bold green]"
    )

    if output_file:
        typer.echo(f"Output will be saved to: {output_file}")
        # ... (not shown in the source diff) ...
    typer.echo("Processing complete!")

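# Example invocation (hypothetical paths; assumes this module is saved as
# leaderboard_cli.py). Typer exposes the function under its dashed name, and
# list-valued options are passed by repeating the flag:
#
#   python leaderboard_cli.py best-checkpoint ./checkpoints \
#       --num-gpus 4 --tasks leaderboard_bbh --tasks leaderboard_ifeval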

@app.command()
def evaluate(
    input_dir: Path = typer.Argument(..., help="Input directory to process"),
    tasks: Annotated[
        Optional[list[str]],
        typer.Option(
            help="Specific tasks to evaluate (e.g., 'leaderboard_bbh', 'leaderboard_gpqa')"
        ),
    ] = None,
    num_gpus: int = typer.Option(8, help="Number of GPUs to use for evaluation"),
    output_file: Optional[Path] = typer.Option(
        None,
        help="Custom output file path (default: input_dir/leaderboard_results.json)",
    ),
):
147+ """
148+ Evaluate a single checkpoint directory and save results to JSON file.
149+ """
    if not input_dir.exists():
        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
        raise typer.Exit(1)

    if not input_dir.is_dir():
        typer.echo(f"Error: '{input_dir}' is not a directory")
        raise typer.Exit(1)

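    # The import below is deferred deliberately: the evaluation stack behind
    # LeaderboardV2Evaluator is presumably heavy to load (hence the "this may
    # take a while" notice), so deferring it keeps `--help` and argument
    # validation fast.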
    typer.echo("importing LeaderboardV2Evaluator, this may take a while...")
    # First Party
    from instructlab.eval.leaderboard import LeaderboardV2Evaluator

    typer.echo("done")

    evaluator = LeaderboardV2Evaluator(
        model_path=str(input_dir),
        num_gpus=num_gpus,
        eval_config={"batch_size": "auto"},
    )
    if tasks:
        evaluator.tasks = tasks
    result = evaluator.run()

    # Now just print out the checkpoint results
    print_metrics(result, str(input_dir))

    # Determine the output file path
    if output_file is None:
        output_file = input_dir / "leaderboard_results.json"

    # Check whether the file exists and warn the user
    if output_file.exists():
        typer.echo(
            f"Warning: Output file '{output_file}' already exists and will be overwritten"
        )

    output_file.write_text(json.dumps(result, indent=2))
    typer.echo(f"Results saved to: {output_file}")

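# Example invocation (hypothetical path): evaluate a single checkpoint and
# write the scores next to it unless --output-file is given:
#
#   python leaderboard_cli.py evaluate ./checkpoints/samples_12000 --num-gpus 4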

@app.command()
def find_best(
    input_dir: Path = typer.Argument(..., help="Input directory to process"),
    show_all: bool = typer.Option(
        False, "--show-all", help="Show scores for all checkpoints"
    ),
):
195+ """
196+ Find the best checkpoint by looking through leaderboard_results.json files.
197+ """
    if not input_dir.exists():
        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
        raise typer.Exit(1)

    if not input_dir.is_dir():
        typer.echo(f"Error: '{input_dir}' is not a directory")
        raise typer.Exit(1)

    # Find all leaderboard_results.json files
    result_files = list(input_dir.glob("**/leaderboard_results.json"))

    if not result_files:
        typer.echo("No leaderboard results found in any subdirectories")
        raise typer.Exit(1)

    # Load and compare results
    best_score = -1
    best_checkpoint = None
    best_results = None
    all_results = []

    for result_file in result_files:
        try:
            results = json.loads(result_file.read_text())
            score = results.get("overall_score", -1)
            all_results.append((result_file.parent, score, results))

            if score > best_score:
                best_score = score
                best_checkpoint = result_file.parent
                best_results = results
        except Exception as e:
            typer.echo(f"Error reading {result_file}: {e}")
            continue

    if best_checkpoint is None:
        typer.echo("No valid results found")
        raise typer.Exit(1)

    # Sort all results by score
    all_results.sort(key=lambda x: x[1], reverse=True)

    # Print all results if requested
    if show_all:
        print("\n[bold]All checkpoint results:[/bold]")
        for checkpoint, score, results in all_results:
            is_best = checkpoint == best_checkpoint
            prefix = "→ " if is_best else "  "
            print(f"\n{prefix}Checkpoint: {checkpoint}")
            print_metrics(results, prefix="  ")
    else:
        # Print only the best results
        print(f"\n[bold]Best checkpoint found[/bold]: {best_checkpoint}")
        print_metrics(best_results)

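# Example invocation (hypothetical path): reuse previously saved
# leaderboard_results.json files instead of re-running any evaluation:
#
#   python leaderboard_cli.py find-best ./checkpoints --show-all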

if __name__ == "__main__":
    app()