3
3
import os
4
4
5
5
import pandas as pd
6
- import wandb
7
6
from common import (
8
7
JUDGEMENT_DIR ,
9
- load_judgements ,
10
- filter_single_judgements ,
8
+ PREDICTION_DIR ,
11
9
filter_pairwise_judgements ,
10
+ filter_single_judgements ,
11
+ load_judgements ,
12
+ load_model_config ,
12
13
)
13
14
from show_result import calculate_win_rate
14
15
16
+ import wandb
17
+
15
18
# Module-level logger, named after this module.
logger = logging .getLogger (__name__ )
16
19
17
20
21
def get_run_config_from_result(mode: str, result: dict) -> dict:
    """Assemble the run config dict for one judgement result.

    The returned dict is what the caller passes as ``config`` when
    starting a W&B run.

    Args:
        mode: Evaluation mode. ``"single"`` selects the single-model layout;
            every other value is handled as a two-model comparison.
        result: One judgement record. Must contain ``"judge_model"`` plus
            ``"model"`` (single mode) or ``"model_1"`` and ``"model_2"``
            (any other mode).

    Returns:
        A dict holding the mode, the judge model name, the evaluated model
        name(s), and the config returned by ``load_model_config`` for each
        model, looked up under ``PREDICTION_DIR / <model name>``.

    Raises:
        KeyError: If ``result`` lacks one of the keys required for ``mode``.
    """
    if mode != "single":
        # Two-model comparison: load each side's config in order.
        first_name = result["model_1"]
        first_config = load_model_config(PREDICTION_DIR / first_name)
        second_name = result["model_2"]
        second_config = load_model_config(PREDICTION_DIR / second_name)
        return dict(
            mode=mode,
            judge_model=result["judge_model"],
            model_1=first_name,
            model_2=second_name,
            model_1_config=first_config,
            model_2_config=second_config,
        )

    # Single-model grading: only one model config to load.
    name = result["model"]
    loaded_config = load_model_config(PREDICTION_DIR / name)
    return dict(
        mode=mode,
        judge_model=result["judge_model"],
        model=name,
        model_config=loaded_config,
    )
46
+
47
+
18
48
def upload_results (
19
49
mode : str ,
20
50
result_id : str ,
@@ -30,7 +60,11 @@ def upload_results(
30
60
baseline_model: Baseline model name. Only used in `pairwise-baseline` mode.
31
61
"""
32
62
project = os .getenv ("WANDB_PROJECT" , "ja-vicuna-qa-benchmark" )
33
- run = wandb .init (project = project , name = result_id , reinit = True )
63
+ if len (results ) == 0 :
64
+ logger .warning (f"No results found for { result_id } " )
65
+ return
66
+ config = get_run_config_from_result (mode , results [0 ])
67
+ run = wandb .init (project = project , name = result_id , config = config , reinit = True )
34
68
35
69
table_prefix = mode
36
70
if mode == "pairwise-baseline" :
0 commit comments