add more columns to the results.xlsx
- auto-marked correctness
- human rating scores
semio committed Jun 7, 2024
1 parent 1c49fb8 commit cb13f5a
Showing 1 changed file with 46 additions and 0 deletions.
automation-api/yival_experiments/scripts/generate_result.py (46 additions, 0 deletions)
@@ -3,6 +3,7 @@
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
from yival.experiment.experiment_runner import Experiment

@@ -23,6 +24,29 @@
# In this script, we store all responses into an excel file.
output_dir = current_script_path / "../output"

option_score_mapping = {"Correct": 3, "Wrong": 2, "Very Wrong": 1}


def exact_match_correctness(answer, options, correctness):
    # Auto-mark an answer by exact (case-insensitive) substring matching:
    # if exactly one option text appears in the answer, return the score of
    # that option's correctness label; otherwise return 0 (cannot auto-mark).
    option_occurrence = [0, 0, 0]
    scores = [option_score_mapping[x] for x in correctness]
    for i, o in enumerate(options):
        if o.lower() in answer.lower():
            option_occurrence[i] = 1
    if sum(option_occurrence) == 1:
        score = scores[option_occurrence.index(1)]
    else:
        score = 0

    return score


def extract_correct_answer(options, correctness):
    # Return the option text whose correctness label is "Correct".
    for t, c in zip(options, correctness):
        if c == "Correct":
            return t


if __name__ == "__main__":
    output_list = []

@@ -33,21 +57,43 @@
        data: Experiment = pickle.load(open(fp, "rb"))
        for group_results in data.group_experiment_results:
            for result in group_results.experiment_results:
                row = result.input_data.content
                answer = result.raw_output.text_output
                option_a = row["option_a"]
                option_b = row["option_b"]
                option_c = row["option_c"]
                option_a_correctness = row["option_a_correctness"]
                option_b_correctness = row["option_b_correctness"]
                option_c_correctness = row["option_c_correctness"]
                options = [option_a, option_b, option_c]
                correctness = [
                    option_a_correctness,
                    option_b_correctness,
                    option_c_correctness,
                ]
                auto_mark_correctness = exact_match_correctness(
                    answer, options, correctness
                )
                correct_answer = extract_correct_answer(options, correctness)
                result_dict = dict(
                    experiment_date=expr_date,
                    question_id=str(result.input_data.content["question_id"]),
                    model_id=result.combination["model_config"]["model_id"],
                    model_params=str(result.combination["model_config"]["params"]),
                    prompt_template=result.combination["prompt_template"],
                    question=result.input_data.content["question_text"],
                    correct_answer=correct_answer,
                    raw_output=result.raw_output.text_output,
                    auto_mark_correctness=auto_mark_correctness,
                )
                for eval_output in result.evaluator_outputs:
                    result_dict[eval_output.display_name] = eval_output.result

                output_list.append(result_dict)

    output_df = pd.DataFrame.from_records(output_list)
    # add a human rating column
    output_df["human_rating_score"] = np.nan
    output_df.to_excel(osp.join(output_dir, "results.xlsx"), index=False)

    print("done")
