 # Standard
 from pathlib import Path
 from typing import Optional
+from typing_extensions import Annotated
 import json

 # Third Party
+from rich import print
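+# rich's print understands the [bold]/[bold green] markup used in the output below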
 import typer

 app = typer.Typer()


 @app.command()
-def main(
+def best_checkpoint(
     input_dir: Path = typer.Argument(..., help="Input directory to process"),
     output_file: Optional[Path] = typer.Option(None, help="Optional output file path"),
+    tasks: Annotated[Optional[list[str]], typer.Option()] = None,
 ):
     """
     Process files in the input directory and optionally save results to an output file.
@@ -54,6 +57,8 @@ def main(
         evaluator = LeaderboardV2Evaluator(
             model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8
         )
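+        # Optionally restrict the evaluator to a subset of leaderboard tasks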
+        if tasks:
+            evaluator.tasks = tasks
         result = evaluator.run()
         checkpoint_results[checkpoint.name] = result
         typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}")
@@ -63,12 +68,37 @@ def main(
         checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True
     )
     typer.echo("Sorted checkpoints by score:")
-    for checkpoint_name, result in sorted_checkpoints:
+    for i, (checkpoint_name, result) in enumerate(sorted_checkpoints):
         typer.echo(f"{'=' * 100}")
-        typer.echo(json.dumps(result, indent=2))
+        # Add [BEST CHECKPOINT] label for the first checkpoint
+        if i == 0:
+            typer.echo(
+                f"[bold]Leaderboard results[/bold]: {checkpoint_name} [bold green][BEST CHECKPOINT][/bold green]"
+            )
+        else:
+            typer.echo(f"[bold]Leaderboard results[/bold]: {checkpoint_name}")
+        typer.echo(f"Overall: {result['overall_score'] * 100:.2f}%")
+        if "leaderboard_bbh" in result:
+            typer.echo(f"BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
+        if "leaderboard_gpqa" in result:
+            typer.echo(f"GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
+        if "leaderboard_ifeval" in result:
+            typer.echo(f"IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
+        if "leaderboard_math_hard" in result:
+            typer.echo(
+                f"MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%"
+            )
+        if "leaderboard_mmlu_pro" in result:
+            typer.echo(
+                f"MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
+            )
+        if "leaderboard_musr" in result:
+            typer.echo(f"MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")

     typer.echo(f"{'=' * 100}")
-    typer.echo(f"Best checkpoint: {sorted_checkpoints[0][0]}")
+    typer.echo(
+        f"Best checkpoint: {sorted_checkpoints[0][0]} [bold green][BEST CHECKPOINT][/bold green]"
+    )

     if output_file:
         typer.echo(f"Output will be saved to: {output_file}")
@@ -80,5 +110,152 @@ def main(
     typer.echo("Processing complete!")


+@app.command()
+def evaluate(
+    input_dir: Path = typer.Argument(..., help="Input directory to process"),
+    tasks: Annotated[Optional[list[str]], typer.Option()] = None,
+):
+    """
+    Run the LeaderboardV2 evaluation on a single model directory and save the results alongside it.
+    """
+    if not input_dir.exists():
+        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
+        raise typer.Exit(1)
+
+    if not input_dir.is_dir():
+        typer.echo(f"Error: '{input_dir}' is not a directory")
+        raise typer.Exit(1)
+
+    typer.echo("importing LeaderboardV2Evaluator, this may take a while...")
+    # First Party
+    from instructlab.eval.leaderboard import LeaderboardV2Evaluator
+
+    typer.echo("done")
+
+    evaluator = LeaderboardV2Evaluator(
+        model_path=str(input_dir), num_gpus=8, eval_config={"batch_size": "auto"}
+    )
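+    # Narrow the evaluation to the requested tasks, if any were given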
+    if tasks:
+        evaluator.tasks = tasks
+    result = evaluator.run()
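+    # The result dict is expected to hold an "overall_score" plus one entry per leaderboard task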
+
+    # now just print out the checkpoint results
+    print(f"[bold]Leaderboard results[/bold]: {input_dir}")
+    print(f"Overall: {result['overall_score'] * 100:.2f}%")
+    if "leaderboard_bbh" in result:
+        print(f"BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
+    if "leaderboard_gpqa" in result:
+        print(f"GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
+    if "leaderboard_ifeval" in result:
+        print(f"IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
+    if "leaderboard_math_hard" in result:
+        print(f"MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%")
+    if "leaderboard_mmlu_pro" in result:
+        print(f"MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%")
+    if "leaderboard_musr" in result:
+        print(f"MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")
+
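+    # Write the raw results next to the model so the find-best command can discover them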
+    output_file = input_dir / "leaderboard_results.json"
+    output_file.write_text(json.dumps(result, indent=2))
+
+
+@app.command()
+def find_best(
+    input_dir: Path = typer.Argument(..., help="Input directory to process"),
+    show_all: bool = typer.Option(
+        False, "--show-all", help="Show scores for all checkpoints"
+    ),
+):
+    """
+    Find the best checkpoint by looking through leaderboard_results.json files.
+    """
+    if not input_dir.exists():
+        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
+        raise typer.Exit(1)
+
+    if not input_dir.is_dir():
+        typer.echo(f"Error: '{input_dir}' is not a directory")
+        raise typer.Exit(1)
+
+    # Find all leaderboard_results.json files
+    result_files = list(input_dir.glob("**/leaderboard_results.json"))
+
+    if not result_files:
+        typer.echo("No leaderboard results found in any subdirectories")
+        raise typer.Exit(1)
+
+    # Load and compare results
+    best_score = -1
+    best_checkpoint = None
+    best_results = None
+    all_results = []
+
+    for result_file in result_files:
+        try:
+            results = json.loads(result_file.read_text())
+            score = results.get("overall_score", -1)
+            all_results.append((result_file.parent, score, results))
+
+            if score > best_score:
+                best_score = score
+                best_checkpoint = result_file.parent
+                best_results = results
+        except Exception as e:
+            typer.echo(f"Error reading {result_file}: {e}")
+            continue
+
+    if best_checkpoint is None:
+        typer.echo("No valid results found")
+        raise typer.Exit(1)
+
+    # Sort all results by score
+    all_results.sort(key=lambda x: x[1], reverse=True)
+
+    # Print all results if requested
+    if show_all:
+        print("\n[bold]All checkpoint results:[/bold]")
+        for checkpoint, score, results in all_results:
+            is_best = checkpoint == best_checkpoint
+            prefix = "→ " if is_best else "  "
+            print(f"\n{prefix}Checkpoint: {checkpoint}")
+            print(f"  Overall score: {score * 100:.2f}%")
+            if "leaderboard_bbh" in results:
+                print(f"  BBH: {results['leaderboard_bbh']['score'] * 100:.2f}%")
+            if "leaderboard_gpqa" in results:
+                print(f"  GPQA: {results['leaderboard_gpqa']['score'] * 100:.2f}%")
+            if "leaderboard_ifeval" in results:
+                print(f"  IFEval: {results['leaderboard_ifeval']['score'] * 100:.2f}%")
+            if "leaderboard_math_hard" in results:
+                print(
+                    f"  MATH-Hard: {results['leaderboard_math_hard']['score'] * 100:.2f}%"
+                )
+            if "leaderboard_mmlu_pro" in results:
+                print(
+                    f"  MMLU-Pro: {results['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
+                )
+            if "leaderboard_musr" in results:
+                print(f"  MUSR: {results['leaderboard_musr']['score'] * 100:.2f}%")
+    else:
+        # Print only best results
+        print(f"\n[bold]Best checkpoint found[/bold]: {best_checkpoint}")
+        print(f"Overall score: {best_score * 100:.2f}%")
+        if "leaderboard_bbh" in best_results:
+            print(f"BBH: {best_results['leaderboard_bbh']['score'] * 100:.2f}%")
+        if "leaderboard_gpqa" in best_results:
+            print(f"GPQA: {best_results['leaderboard_gpqa']['score'] * 100:.2f}%")
+        if "leaderboard_ifeval" in best_results:
+            print(f"IFEval: {best_results['leaderboard_ifeval']['score'] * 100:.2f}%")
+        if "leaderboard_math_hard" in best_results:
+            print(
+                f"MATH-Hard: {best_results['leaderboard_math_hard']['score'] * 100:.2f}%"
+            )
+        if "leaderboard_mmlu_pro" in best_results:
+            print(
+                f"MMLU-Pro: {best_results['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
+            )
+        if "leaderboard_musr" in best_results:
+            print(f"MUSR: {best_results['leaderboard_musr']['score'] * 100:.2f}%")
+
+
 if __name__ == "__main__":
     app()
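
A quick usage sketch, assuming the script above is saved as leaderboard_cli.py (a hypothetical name): Typer turns each underscored function name into a hyphenated subcommand, and a list-valued option like --tasks is built up by repeating the flag.

    python leaderboard_cli.py best-checkpoint /path/to/checkpoints --tasks leaderboard_bbh --tasks leaderboard_musr
    python leaderboard_cli.py evaluate /path/to/model
    python leaderboard_cli.py find-best /path/to/checkpoints --show-all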