Skip to content

Commit 5f17047

Browse files
authored
Merge pull request #49 from ku-nlp/v2.0.3
v2.0.3
2 parents 909bd7c + 3c60e86 commit 5f17047

8 files changed

+128
-51
lines changed

data/jp_bench/model_judgment/pairwise/gpt-4/pairwise:cyberagent--calm2-7b-chat_openai--text-davinci-003.jsonl

Lines changed: 8 additions & 8 deletions
Large diffs are not rendered by default.

data/jp_bench/model_judgment/pairwise/gpt-4/pairwise:llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0_openai--text-davinci-003.jsonl

Lines changed: 5 additions & 5 deletions
Large diffs are not rendered by default.

data/jp_bench/model_judgment/pairwise/gpt-4/pairwise:llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0_openai--text-davinci-003.jsonl

Lines changed: 7 additions & 7 deletions
Large diffs are not rendered by default.

data/jp_bench/model_judgment/pairwise/gpt-4/pairwise:openai--text-davinci-003_rinna--japanese-gpt-neox-3.6b-instruction-ppo.jsonl

Lines changed: 5 additions & 5 deletions
Large diffs are not rendered by default.

data/jp_bench/model_judgment/pairwise/gpt-4/pairwise:openai--text-davinci-003_rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.jsonl

Lines changed: 5 additions & 5 deletions
Large diffs are not rendered by default.

data/jp_bench/model_judgment/pairwise/gpt-4/pairwise:openai--text-davinci-003_tokyotech-llm--Swallow-70b-instruct-hf.jsonl

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

llm_judge/common.py

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -167,24 +167,27 @@ def play(answer_a, answer_b):
167167
}
168168
if self.ref_answer is not None:
169169
kwargs["ref_answer_1"] = self.ref_answer["choices"][0]["turns"][0]
170-
judgment = self.judge.judge(**kwargs)
171-
172-
if "[[A]]" in judgment:
173-
winner = "A"
174-
elif "[[B]]" in judgment:
175-
winner = "B"
176-
elif "[[C]]" in judgment:
177-
winner = "tie"
178-
else:
179-
winner = "error"
180-
181-
return winner, judgment
182-
183-
g1_winner, g1_judgment = play(self.answer_1, self.answer_2)
184-
g1_winner = "model_1" if g1_winner == "A" else "model_2"
185-
186-
g2_winner, g2_judgment = play(self.answer_2, self.answer_1)
187-
g2_winner = "model_2" if g2_winner == "A" else "model_1"
170+
return self.judge.judge(**kwargs)
171+
172+
g1_judgment = play(self.answer_1, self.answer_2)
173+
if "[[A]]" in g1_judgment:
174+
g1_winner = "model_1"
175+
elif "[[B]]" in g1_judgment:
176+
g1_winner = "model_2"
177+
elif "[[C]]" in g1_judgment:
178+
g1_winner = "tie"
179+
else:
180+
g1_winner = "error"
181+
182+
g2_judgment = play(self.answer_2, self.answer_1)
183+
if "[[A]]" in g2_judgment:
184+
g2_winner = "model_2"
185+
elif "[[B]]" in g2_judgment:
186+
g2_winner = "model_1"
187+
elif "[[C]]" in g2_judgment:
188+
g2_winner = "tie"
189+
else:
190+
g2_winner = "error"
188191

189192
result = {
190193
"model_1": self.model_1,
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import argparse
2+
import json
3+
import logging
4+
5+
from common import JUDGEMENT_DIR, load_judgements
6+
7+
logger = logging.getLogger(__name__)
8+
9+
10+
def _parse_winner(judgment: str, winner_if_a: str, winner_if_b: str) -> str:
    """Translate the verdict tag contained in a judgment text into a winner.

    Tags are checked in the fixed order ``[[A]]``, ``[[B]]``, ``[[C]]``; the
    first one found decides the outcome.

    Args:
        judgment: Raw judgment text produced by the judge.
        winner_if_a: Label to return when ``[[A]]`` is present.
        winner_if_b: Label to return when ``[[B]]`` is present.

    Returns:
        ``winner_if_a``, ``winner_if_b``, ``"tie"`` (for ``[[C]]``), or
        ``"error"`` when none of the tags occurs in the text.
    """
    if "[[A]]" in judgment:
        return winner_if_a
    if "[[B]]" in judgment:
        return winner_if_b
    if "[[C]]" in judgment:
        return "tie"
    return "error"


def reparse_result_pairwise(result: dict) -> dict:
    """Reparse the result to determine the winner.

    The winners of both games are recomputed from the stored judgment texts
    and written to a shallow copy of ``result``; the input dict itself is not
    modified.

    Args:
        result: A result. Must contain the keys ``"g1_judgment"`` and
            ``"g2_judgment"``.

    Returns:
        A copy of ``result`` whose ``"g1_winner"`` and ``"g2_winner"`` entries
        are each one of ``"model_1"``, ``"model_2"``, ``"tie"`` or ``"error"``.

    Raises:
        KeyError: If either judgment key is missing from ``result``.
    """
    reparsed_result = result.copy()

    # Game 1: verdict "A" means model_1 wins, "B" means model_2 wins.
    reparsed_result["g1_winner"] = _parse_winner(
        result["g1_judgment"], "model_1", "model_2"
    )

    # Game 2: the A/B mapping is reversed (presumably because the answers were
    # shown to the judge in swapped order -- confirm against the caller).
    reparsed_result["g2_winner"] = _parse_winner(
        result["g2_judgment"], "model_2", "model_1"
    )

    return reparsed_result
41+
42+
43+
if __name__ == "__main__":
    # Command-line entry point: re-derive the winners of every stored pairwise
    # judgement and rewrite only the files whose contents actually change.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--verbose", "-v", action="count", default=0, help="Verbosity level"
    )
    args = parser.parse_args()

    # Any number of -v flags switches logging from INFO to DEBUG.
    level = logging.INFO if args.verbose == 0 else logging.DEBUG
    logging.basicConfig(
        format="| %(asctime)s | %(levelname)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=level,
    )

    logger.info("Load judgements")
    for judgement_dir in (JUDGEMENT_DIR / "pairwise").iterdir():
        result_id_results_map = load_judgements(judgement_dir)
        for result_id, results in result_id_results_map.items():
            reparsed_results = [reparse_result_pairwise(result) for result in results]
            # Leave the file untouched when reparsing changes nothing.
            if any(
                result != reparsed_result
                for result, reparsed_result in zip(results, reparsed_results)
            ):
                output_file = judgement_dir / f"{result_id}.jsonl"
                # The records are dumped with ensure_ascii=False, so non-ASCII
                # text is written verbatim; pin the encoding instead of
                # relying on the platform default.
                with open(output_file, "w", encoding="utf-8") as f:
                    for result in reparsed_results:
                        f.write(json.dumps(result, ensure_ascii=False) + "\n")
                logger.info("Fixed %s", output_file)
    logger.info("Done")

0 commit comments

Comments
 (0)