This repository was archived by the owner on Oct 17, 2024. It is now read-only.

Commit a61d06f

format w/ ruff
1 parent f3f087c commit a61d06f

4 files changed (+172 -90 lines)


Diff for: evaluator.py

+71 -42
@@ -1,54 +1,64 @@
-from typing import Dict, Union
 import argparse
-import re
 import json
+import re
 import time
-from datetime import datetime
-from threading import Lock
 from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
 from pathlib import Path
+from threading import Lock
+from typing import Dict, Union
+
 import pandas as pd
 from openai import OpenAI
+
 from templates import JUDGE_TEMPLATE
 
+
 # Constants
 TIME_START = datetime.now().strftime("%Y%m%d_%H%M%S")
 LOCK = Lock()
 
+
 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--model-output-dir', help='Model Output Directory', required=True)
-    parser.add_argument('-k', '--openai-api-key', help='OpenAI API Key', required=True)
-    parser.add_argument('-j', '--judge-model', help='Judge Model', default='gpt-4-1106-preview')
-    parser.add_argument('-t', '--threads', help='Thread count', default=42, type=int)
+    parser.add_argument(
+        "-o", "--model-output-dir", help="Model Output Directory", required=True
+    )
+    parser.add_argument("-k", "--openai-api-key", help="OpenAI API Key", required=True)
+    parser.add_argument(
+        "-j", "--judge-model", help="Judge Model", default="gpt-4-1106-preview"
+    )
+    parser.add_argument("-t", "--threads", help="Thread count", default=42, type=int)
    return parser.parse_args()
 
+
 def create_azure_client(api_key: str):
-    return OpenAI(
-        api_key=api_key
-    )
+    return OpenAI(api_key=api_key)
+
 
-def create_answers(client, model_output, judge_model, is_multi_turn: bool = False, i=0) -> Dict[str, Union[str, float]]:
-    model_questions = model_output['questions']
-    model_outputs = model_output['outputs']
-    model_references = model_output['references']
+def create_answers(
+    client, model_output, judge_model, is_multi_turn: bool = False, i=0
+) -> Dict[str, Union[str, float]]:
+    model_questions = model_output["questions"]
+    model_outputs = model_output["outputs"]
+    model_references = model_output["references"]
 
     prompt = (
         f"아래의 내용을 주어진 평가 기준들을 충실히 반영하여 평가해라. 특히 모델 답변이 언어 요구사항을 준수하는지 반드시 확인해야 한다.\n\n"
         f"**Question**\n{model_questions[0]}"
     )
-
+
     if model_references and model_references[0]:
         prompt += f"\n\n**Additional Reference**\n{model_references[0]}"
-
+
     prompt += f"\n\n**Model's Response**\n{model_outputs[0]}"
-
+
     if is_multi_turn:
         prompt += f"\n\n**Follow-up Question.**\n{model_questions[1]}"
         if model_references and model_references[1]:
             prompt += f"\n\n**Additional Reference**\n{model_references[1]}"
         prompt += f"\n\n**Model's Response**\n{model_outputs[1]}"
-
+
     prompt += "\n\n[[대화 종료. 평가 시작.]]"
 
     try:
@@ -57,24 +67,34 @@ def create_answers(client, model_output, judge_model, is_multi_turn: bool = Fals
             temperature=0.0,
             n=1,
             messages=[
-                {"role": "system", "content": JUDGE_TEMPLATE['multi_turn' if is_multi_turn else 'single_turn']},
-                {"role": "user", "content": prompt}
-            ]
+                {
+                    "role": "system",
+                    "content": JUDGE_TEMPLATE[
+                        "multi_turn" if is_multi_turn else "single_turn"
+                    ],
+                },
+                {"role": "user", "content": prompt},
+            ],
         )
 
         content = response.choices[0].message.content
-        judge_message_match = re.search(r"평가:(.*?)점수:", content.replace("*", ''), re.DOTALL)
-        judge_message = judge_message_match.group(1).strip() if judge_message_match else "No judge message found"
-        judge_score_match = re.search(r"점수:\s*(\d+(\.\d+)?)", content.replace("*", ''))
+        judge_message_match = re.search(
+            r"평가:(.*?)점수:", content.replace("*", ""), re.DOTALL
+        )
+        judge_message = (
+            judge_message_match.group(1).strip()
+            if judge_message_match
+            else "No judge message found"
+        )
+        judge_score_match = re.search(
+            r"점수:\s*(\d+(\.\d+)?)", content.replace("*", "")
+        )
         if judge_score_match:
             judge_score = float(judge_score_match.group(1))
         else:
             raise ValueError("No score found in response")
 
-        return {
-            'judge_message': judge_message,
-            'judge_score': judge_score
-        }
+        return {"judge_message": judge_message, "judge_score": judge_score}
 
     except Exception as e:
         print("Error. Retrying after 20 sec", e)
@@ -84,26 +104,30 @@ def create_answers(client, model_output, judge_model, is_multi_turn: bool = Fals
         if i > 3:
             print("Impossible prompt, aborting..!")
             return {
-                'judge_message': "Impossible to judge due to repetition.",
-                'judge_score': 0.0
+                "judge_message": "Impossible to judge due to repetition.",
+                "judge_score": 0.0,
             }
         i += 1
         return create_answers(client, model_output, judge_model, is_multi_turn, i)
 
+
 def process_item(client, row, judge_model, output_file):
     query_single = create_answers(client, row, judge_model)
     query_multi = create_answers(client, row, judge_model, is_multi_turn=True)
 
-    row['query_single'] = query_single
-    row['query_multi'] = query_multi
+    row["query_single"] = query_single
+    row["query_multi"] = query_multi
     row = row.to_dict()
 
     with LOCK:
-        with output_file.open('a', encoding='utf-8-sig') as f:
+        with output_file.open("a", encoding="utf-8-sig") as f:
             f.write(json.dumps(row, ensure_ascii=False))
-            f.write('\n')
+            f.write("\n")
 
-def process_file(client, file_path: Path, output_dir: Path, judge_model, threads: int, args):
+
+def process_file(
+    client, file_path: Path, output_dir: Path, judge_model, threads: int, args
+):
     print(f"- 현재 Processing : {file_path}")
     df_model_outputs = pd.read_json(file_path, lines=True)
 
@@ -114,26 +138,31 @@ def process_file(client, file_path: Path, output_dir: Path, judge_model, threads
         for row in df_model_outputs.iterrows():
             executor.submit(process_item, client, row[1], judge_model, output_file)
 
+
 def is_hidden(filepath: Path) -> bool:
-    return any(part.startswith('.') for part in filepath.parts)
+    return any(part.startswith(".") for part in filepath.parts)
+
 
 def main():
     args = get_args()
     client = create_azure_client(args.openai_api_key)
 
     input_dir = Path(args.model_output_dir)
-    output_dir = Path('./evaluated')
+    output_dir = Path("./evaluated")
 
     # Filter out hidden files
-    json_files = [file for file in input_dir.rglob('*.jsonl') if not is_hidden(file)]
+    json_files = [file for file in input_dir.rglob("*.jsonl") if not is_hidden(file)]
 
     for file_path in json_files:
         output_file_path = output_dir / file_path.relative_to(input_dir)
         if output_file_path.exists():
             print(f"이미 평가 완료.. : {file_path}")
             continue
-        process_file(client, file_path, output_dir, args.judge_model, args.threads, args)
-        time.sleep(20) # to handle ratelimit!
+        process_file(
+            client, file_path, output_dir, args.judge_model, args.threads, args
+        )
+        time.sleep(20)  # to handle ratelimit!
+
 
 if __name__ == "__main__":
-    main()
+    main()
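
Note (not part of the commit): the reformatted create_answers only changes in layout, not behavior. As a minimal standalone sketch of the parsing it performs on a judge reply, using a hypothetical response string in the expected "평가: … 점수: …" format:

import re

# Hypothetical judge reply: an "평가:" (evaluation) section followed by a "점수:" (score) section.
content = "평가: 답변이 정확하고 언어 요구사항을 준수한다.\n점수: 8.5"

judge_message_match = re.search(
    r"평가:(.*?)점수:", content.replace("*", ""), re.DOTALL
)
judge_message = (
    judge_message_match.group(1).strip()
    if judge_message_match
    else "No judge message found"
)
judge_score_match = re.search(r"점수:\s*(\d+(\.\d+)?)", content.replace("*", ""))
judge_score = float(judge_score_match.group(1)) if judge_score_match else 0.0

print(judge_message)  # 답변이 정확하고 언어 요구사항을 준수한다.
print(judge_score)  # 8.5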

Diff for: generator.py

+73 -33
@@ -1,73 +1,113 @@
 import argparse
-import pandas as pd
 import os
+
+import pandas as pd
+
 from templates import PROMPT_STRATEGY
 
+
 # Use aphrodite-engine or vLLM
 try:
     from aphrodite import LLM, SamplingParams
+
     print("- Using aphrodite-engine")
 
 except ImportError:
     from vllm import LLM, SamplingParams
+
     print("- Using vLLM")
 
 parser = argparse.ArgumentParser()
-parser.add_argument('-g' ,'--gpu_devices', help=' : CUDA_VISIBLE_DEVICES', default='0')
-parser.add_argument('-m', '--model', help=' : Model to evaluate', default='yanolja/EEVE-Korean-Instruct-2.8B-v1.0')
-parser.add_argument('-ml', '--model_len', help=' : Maximum Model Length', default=4096, type=int)
+parser.add_argument("-g", "--gpu_devices", help=" : CUDA_VISIBLE_DEVICES", default="0")
+parser.add_argument(
+    "-m",
+    "--model",
+    help=" : Model to evaluate",
+    default="yanolja/EEVE-Korean-Instruct-2.8B-v1.0",
+)
+parser.add_argument(
+    "-ml", "--model_len", help=" : Maximum Model Length", default=4096, type=int
+)
 args = parser.parse_args()
 
 print(f"Args - {args}")
 
 os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_devices
-gpu_counts = len(args.gpu_devices.split(','))
+gpu_counts = len(args.gpu_devices.split(","))
 
 llm = LLM(
     model=args.model,
     tensor_parallel_size=gpu_counts,
     max_model_len=args.model_len,
     gpu_memory_utilization=0.8,
-    trust_remote_code=True # !
-    )
+    trust_remote_code=True,  # !
+)
 
 sampling_params = SamplingParams(
     temperature=0,
     skip_special_tokens=True,
     max_tokens=args.model_len,
-    stop=[
-        '<|endoftext|>',
-        '[INST]',
-        '[/INST]',
-        '<|im_end|>',
-        '<|end|>',
-        '<|eot_id|>'
-    ]
-    )
+    stop=["<|endoftext|>", "[INST]", "[/INST]", "<|im_end|>", "<|end|>", "<|eot_id|>"],
+)
 
 df_questions = pd.read_json(
-    'questions.jsonl',
-    orient='records',
-    encoding="utf-8-sig",
-    lines=True
-    )
+    "questions.jsonl", orient="records", encoding="utf-8-sig", lines=True
+)
 
 if not os.path.exists("./generated/" + args.model):
     os.makedirs("./generated/" + args.model)
 
 for strategy_name, prompts in PROMPT_STRATEGY.items():
+
     def format_single_turn_question(question):
-        return llm.llm_engine.tokenizer.tokenizer.apply_chat_template(prompts + [{"role": "user", "content": question[0]}], tokenize=False, add_generation_prompt=True)
-
-    single_turn_questions = df_questions['questions'].map(format_single_turn_question)
+        return llm.llm_engine.tokenizer.tokenizer.apply_chat_template(
+            prompts + [{"role": "user", "content": question[0]}],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+
+    single_turn_questions = df_questions["questions"].map(format_single_turn_question)
     print(single_turn_questions.iloc[0])
-    single_turn_outputs = [output.outputs[0].text.strip() for output in llm.generate(single_turn_questions, sampling_params)]
-
+    single_turn_outputs = [
+        output.outputs[0].text.strip()
+        for output in llm.generate(single_turn_questions, sampling_params)
+    ]
+
     def format_double_turn_question(question, single_turn_output):
-        return llm.llm_engine.tokenizer.tokenizer.apply_chat_template(prompts + [{"role": "user", "content": question[0]}, {"role": "assistant", "content": single_turn_output}, {"role": "user", "content": question[1]}], tokenize=False, add_generation_prompt=True)
-
-    multi_turn_questions = df_questions[['questions', 'id']].apply(lambda x: format_double_turn_question(x['questions'], single_turn_outputs[x['id']-1]), axis=1)
-    multi_turn_outputs = [output.outputs[0].text.strip() for output in llm.generate(multi_turn_questions, sampling_params)]
-
-    df_output = pd.DataFrame({'id': df_questions['id'], 'category': df_questions['category'], 'questions': df_questions['questions'], 'outputs': list(zip(single_turn_outputs, multi_turn_outputs)), "references": df_questions['references']})
-    df_output.to_json('./generated/' + os.path.join(args.model, f'{strategy_name}.jsonl'), orient='records', lines=True, force_ascii=False)
+        return llm.llm_engine.tokenizer.tokenizer.apply_chat_template(
+            prompts
+            + [
+                {"role": "user", "content": question[0]},
+                {"role": "assistant", "content": single_turn_output},
+                {"role": "user", "content": question[1]},
+            ],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+
+    multi_turn_questions = df_questions[["questions", "id"]].apply(
+        lambda x: format_double_turn_question(
+            x["questions"], single_turn_outputs[x["id"] - 1]
+        ),
+        axis=1,
+    )
+    multi_turn_outputs = [
+        output.outputs[0].text.strip()
+        for output in llm.generate(multi_turn_questions, sampling_params)
+    ]
+
+    df_output = pd.DataFrame(
+        {
+            "id": df_questions["id"],
+            "category": df_questions["category"],
+            "questions": df_questions["questions"],
+            "outputs": list(zip(single_turn_outputs, multi_turn_outputs)),
+            "references": df_questions["references"],
+        }
+    )
+    df_output.to_json(
+        "./generated/" + os.path.join(args.model, f"{strategy_name}.jsonl"),
+        orient="records",
+        lines=True,
+        force_ascii=False,
+    )
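
Note (not part of the commit): the loop above treats each entry of templates.PROMPT_STRATEGY as a list of chat messages prepended to the question turns. A minimal sketch of that usage with placeholder strategy names, questions, and outputs (the real values live in templates.py and questions.jsonl and are assumptions here):

# Hypothetical stand-in for templates.PROMPT_STRATEGY; the real dict is defined
# in templates.py and may differ.
PROMPT_STRATEGY = {
    "default": [],
    "one_shot": [
        {"role": "user", "content": "예시 질문"},
        {"role": "assistant", "content": "예시 답변"},
    ],
}

# Hypothetical two-turn question, mirroring one row of questions.jsonl.
question = ["첫 번째 질문", "이어지는 두 번째 질문"]
single_turn_output = "모델의 첫 번째 답변"

for strategy_name, prompts in PROMPT_STRATEGY.items():
    # Single turn: strategy messages followed by the first question.
    single_turn_messages = prompts + [{"role": "user", "content": question[0]}]
    # Multi turn: the first exchange is replayed before the follow-up question,
    # matching how format_double_turn_question builds its message list above.
    multi_turn_messages = prompts + [
        {"role": "user", "content": question[0]},
        {"role": "assistant", "content": single_turn_output},
        {"role": "user", "content": question[1]},
    ]
    print(strategy_name, len(single_turn_messages), len(multi_turn_messages))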

Diff for: pyproject.toml

+1 -2
@@ -8,5 +8,4 @@ ignore = ["C408", "C901", "E501", "E731", "E741", "W605"]
 select = ["C", "E", "F", "I", "W"]
 
 [tool.ruff.lint.isort]
-lines-after-imports = 2
-
+lines-after-imports = 2
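
Note (not part of the commit): lines-after-imports = 2 is the isort setting that produces the two blank lines between the import block and the first top-level statement visible in the evaluator.py and generator.py diffs above. A minimal illustration of the layout it enforces:

import json
import re


# With lines-after-imports = 2, exactly two blank lines separate the last
# import from the first top-level statement.
PATTERN = re.compile(r"\d+")
print(json.dumps({"match": bool(PATTERN.search("점수: 8"))}, ensure_ascii=False))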
