1
- from typing import Dict , Union
2
1
import argparse
3
- import re
4
2
import json
3
+ import re
5
4
import time
6
- from datetime import datetime
7
- from threading import Lock
8
5
from concurrent .futures import ThreadPoolExecutor
6
+ from datetime import datetime
9
7
from pathlib import Path
8
+ from threading import Lock
9
+ from typing import Dict , Union
10
+
10
11
import pandas as pd
11
12
from openai import OpenAI
13
+
12
14
from templates import JUDGE_TEMPLATE
13
15
16
+
14
17
# Constants
15
18
TIME_START = datetime .now ().strftime ("%Y%m%d_%H%M%S" )
16
19
LOCK = Lock ()
17
20
21
+
18
22
def get_args():
    """Build and parse the command-line options for the evaluation run.

    Returns:
        argparse.Namespace with ``model_output_dir``, ``openai_api_key``,
        ``judge_model`` and ``threads`` attributes.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-o", "--model-output-dir", help="Model Output Directory", required=True)
    arg_parser.add_argument("-k", "--openai-api-key", help="OpenAI API Key", required=True)
    arg_parser.add_argument("-j", "--judge-model", help="Judge Model", default="gpt-4-1106-preview")
    arg_parser.add_argument("-t", "--threads", help="Thread count", default=42, type=int)
    return arg_parser.parse_args()
25
33
34
+
26
35
def create_azure_client(api_key: str):
    """Construct an OpenAI chat client authenticated with *api_key*.

    NOTE(review): despite the name, this returns a plain ``OpenAI`` client,
    not an Azure-specific one — confirm whether Azure endpoints were intended.
    """
    client = OpenAI(api_key=api_key)
    return client
37
+
30
38
31
- def create_answers (client , model_output , judge_model , is_multi_turn : bool = False , i = 0 ) -> Dict [str , Union [str , float ]]:
32
- model_questions = model_output ['questions' ]
33
- model_outputs = model_output ['outputs' ]
34
- model_references = model_output ['references' ]
39
+ def create_answers (
40
+ client , model_output , judge_model , is_multi_turn : bool = False , i = 0
41
+ ) -> Dict [str , Union [str , float ]]:
42
+ model_questions = model_output ["questions" ]
43
+ model_outputs = model_output ["outputs" ]
44
+ model_references = model_output ["references" ]
35
45
36
46
prompt = (
37
47
f"아래의 내용을 주어진 평가 기준들을 충실히 반영하여 평가해라. 특히 모델 답변이 언어 요구사항을 준수하는지 반드시 확인해야 한다.\n \n "
38
48
f"**Question**\n { model_questions [0 ]} "
39
49
)
40
-
50
+
41
51
if model_references and model_references [0 ]:
42
52
prompt += f"\n \n **Additional Reference**\n { model_references [0 ]} "
43
-
53
+
44
54
prompt += f"\n \n **Model's Response**\n { model_outputs [0 ]} "
45
-
55
+
46
56
if is_multi_turn :
47
57
prompt += f"\n \n **Follow-up Question.**\n { model_questions [1 ]} "
48
58
if model_references and model_references [1 ]:
49
59
prompt += f"\n \n **Additional Reference**\n { model_references [1 ]} "
50
60
prompt += f"\n \n **Model's Response**\n { model_outputs [1 ]} "
51
-
61
+
52
62
prompt += "\n \n [[대화 종료. 평가 시작.]]"
53
63
54
64
try :
@@ -57,24 +67,34 @@ def create_answers(client, model_output, judge_model, is_multi_turn: bool = Fals
57
67
temperature = 0.0 ,
58
68
n = 1 ,
59
69
messages = [
60
- {"role" : "system" , "content" : JUDGE_TEMPLATE ['multi_turn' if is_multi_turn else 'single_turn' ]},
61
- {"role" : "user" , "content" : prompt }
62
- ]
70
+ {
71
+ "role" : "system" ,
72
+ "content" : JUDGE_TEMPLATE [
73
+ "multi_turn" if is_multi_turn else "single_turn"
74
+ ],
75
+ },
76
+ {"role" : "user" , "content" : prompt },
77
+ ],
63
78
)
64
79
65
80
content = response .choices [0 ].message .content
66
- judge_message_match = re .search (r"평가:(.*?)점수:" , content .replace ("*" , '' ), re .DOTALL )
67
- judge_message = judge_message_match .group (1 ).strip () if judge_message_match else "No judge message found"
68
- judge_score_match = re .search (r"점수:\s*(\d+(\.\d+)?)" , content .replace ("*" , '' ))
81
+ judge_message_match = re .search (
82
+ r"평가:(.*?)점수:" , content .replace ("*" , "" ), re .DOTALL
83
+ )
84
+ judge_message = (
85
+ judge_message_match .group (1 ).strip ()
86
+ if judge_message_match
87
+ else "No judge message found"
88
+ )
89
+ judge_score_match = re .search (
90
+ r"점수:\s*(\d+(\.\d+)?)" , content .replace ("*" , "" )
91
+ )
69
92
if judge_score_match :
70
93
judge_score = float (judge_score_match .group (1 ))
71
94
else :
72
95
raise ValueError ("No score found in response" )
73
96
74
- return {
75
- 'judge_message' : judge_message ,
76
- 'judge_score' : judge_score
77
- }
97
+ return {"judge_message" : judge_message , "judge_score" : judge_score }
78
98
79
99
except Exception as e :
80
100
print ("Error. Retrying after 20 sec" , e )
@@ -84,26 +104,30 @@ def create_answers(client, model_output, judge_model, is_multi_turn: bool = Fals
84
104
if i > 3 :
85
105
print ("Impossible prompt, aborting..!" )
86
106
return {
87
- ' judge_message' : "Impossible to judge due to repetition." ,
88
- ' judge_score' : 0.0
107
+ " judge_message" : "Impossible to judge due to repetition." ,
108
+ " judge_score" : 0.0 ,
89
109
}
90
110
i += 1
91
111
return create_answers (client , model_output , judge_model , is_multi_turn , i )
92
112
113
+
93
114
def process_item(client, row, judge_model, output_file):
    """Judge one model-output row and append the result as a JSONL line.

    Runs the judge twice — once single-turn, once multi-turn — stores both
    verdicts on the row, and appends the row to ``output_file``.

    Args:
        client: OpenAI client passed through to ``create_answers``.
        row: pandas Series holding one model-output record.
        judge_model: name of the judge model to query.
        output_file: Path of the JSONL file results are appended to.
    """
    query_single = create_answers(client, row, judge_model)
    query_multi = create_answers(client, row, judge_model, is_multi_turn=True)

    row["query_single"] = query_single
    row["query_multi"] = query_multi
    record = row.to_dict()

    with LOCK:
        # FIX: use "utf-8", not "utf-8-sig". The utf-8-sig encoder emits a
        # BOM on the first write of EVERY open, so append-mode reopens here
        # (one per item) would inject a BOM before every JSONL line after
        # the first, breaking downstream json parsing of those lines.
        with output_file.open("a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False))
            f.write("\n")
105
126
106
- def process_file (client , file_path : Path , output_dir : Path , judge_model , threads : int , args ):
127
+
128
+ def process_file (
129
+ client , file_path : Path , output_dir : Path , judge_model , threads : int , args
130
+ ):
107
131
print (f"- 현재 Processing : { file_path } " )
108
132
df_model_outputs = pd .read_json (file_path , lines = True )
109
133
@@ -114,26 +138,31 @@ def process_file(client, file_path: Path, output_dir: Path, judge_model, threads
114
138
for row in df_model_outputs .iterrows ():
115
139
executor .submit (process_item , client , row [1 ], judge_model , output_file )
116
140
141
+
117
142
def is_hidden(filepath: Path) -> bool:
    """Return True if any component of *filepath* is dot-prefixed (hidden)."""
    for component in filepath.parts:
        if component.startswith("."):
            return True
    return False
144
+
119
145
120
146
def main():
    """Judge every non-hidden .jsonl file under the model-output directory.

    Results are written under ``./evaluated`` mirroring the input layout;
    files whose output already exists are skipped.
    """
    args = get_args()
    client = create_azure_client(args.openai_api_key)

    input_dir = Path(args.model_output_dir)
    output_dir = Path("./evaluated")

    # Filter out hidden files
    candidates = [f for f in input_dir.rglob("*.jsonl") if not is_hidden(f)]

    for candidate in candidates:
        target = output_dir / candidate.relative_to(input_dir)
        if target.exists():
            print(f"이미 평가 완료.. : {candidate}")
            continue
        process_file(client, candidate, output_dir, args.judge_model, args.threads, args)
        time.sleep(20)  # to handle ratelimit!
165
+
137
166
138
167
# Run the evaluation pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments