notebooks update
semio committed Jun 10, 2024
1 parent e7c32d7 commit e5ad78d
Showing 4 changed files with 139 additions and 9 deletions.
124 changes: 124 additions & 0 deletions automation-api/yival_experiments/notebooks/human_rating.py
@@ -0,0 +1,124 @@
# ---
# jupyter:
# jupytext:
# formats: py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.16.2
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---

# %% [markdown]
# # Usage
#
# Use this notebook to generate a file of the results whose auto-marked correctness differs from the LLM agent evaluator's rating, score those rows by hand, and then merge the ratings back into the results file.

# %% [markdown]
# ## Generate file

# %%
# query the dataframes with DuckDB through the SQL cell magic
# %load_ext sql

# %%
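# connect the SQL magic to an in-memory DuckDB database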
# %sql duckdb://

# %%
import os.path as osp
import pandas as pd

output_dir = '../output/archives/20240401'

# %%
result_file = osp.join(output_dir, 'results.xlsx')

result_df = pd.read_excel(result_file)

# %% magic_args="--save result_to_check_1 " language="sql"
# select *
# from result_df
# where auto_mark_correctness != correctness

# %% magic_args="--save result_to_check_2" language="sql"
# select
# *,
# case
# when auto_mark_correctness = 0 and correctness = 3 and not contains(lower(raw_output), lower(correct_answer)) then 1
# when (correctness = 1 or correctness = 2) and contains(lower(raw_output), lower(correct_answer)) then 1
# when auto_mark_correctness = 0 and correctness = 0 then 1
# when auto_mark_correctness = 1 OR auto_mark_correctness = 2 OR auto_mark_correctness = 3 then 1
# else 0
# end as need_to_check
# from result_to_check_1
# where need_to_check = 1
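
# %% [markdown]
# For reference, a rough pandas equivalent of the need_to_check logic above,
# applied to the disagreement rows (a sketch; it assumes these columns hold
# plain strings and integer scores; the notebook itself uses the SQL version):

# %%
def needs_check(row):
    # mirrors the CASE expression in the SQL cell above
    answer_found = str(row['correct_answer']).lower() in str(row['raw_output']).lower()
    if row['auto_mark_correctness'] == 0 and row['correctness'] == 3 and not answer_found:
        return True
    if row['correctness'] in (1, 2) and answer_found:
        return True
    if row['auto_mark_correctness'] == 0 and row['correctness'] == 0:
        return True
    return row['auto_mark_correctness'] in (1, 2, 3)

# usage sketch:
# disagreements = result_df[result_df.auto_mark_correctness != result_df.correctness]
# disagreements[disagreements.apply(needs_check, axis=1)]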

# %%
# result_to_check = %sql select * exclude (need_to_check) from result_to_check_2

# %%
result_to_check_df = result_to_check.DataFrame()

# %%
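# write the rows that need a human check to an Excel file for manual scoring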
result_to_check_df.to_excel(osp.join(output_dir, 'human_rating.xlsx'), index=False)

# %%

# %%

# %%
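# stop "Run All" here: edit human_rating.xlsx by hand before running the cells below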
raise Exception("Please edit the human_rating file.")

# %% [markdown]
# ## Edit the file (fill in the human_rating_score column), then run the cells below to merge back

# %%
rating_file = osp.join(output_dir, 'human_rating.xlsx')

# %%
human_ratings = pd.read_excel(rating_file)

# %%
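# inspect the rows that received a human rating score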
human_ratings[~pd.isnull(human_ratings.human_rating_score)]

# %%
result_df_copy = result_df.copy()

# %%
result_df_copy = result_df_copy.reset_index(drop=True)  # drop=True: a new 'index' column would break the shape assertion below

# %% magic_args="merged_results << " language="sql"
# select
# r.* exclude (human_rating_score),
# l.human_rating_score
# from
# result_df_copy r left join human_ratings l
# on r.experiment_date = l.experiment_date
# and r.question_id = l.question_id
# and r.model_id = l.model_id
# and r.model_params = l.model_params
# and r.prompt_template = l.prompt_template

# %%
merged_results_df = merged_results.DataFrame()

# %%
merged_results_df

# %%
result_df_copy

# %%
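# sanity check: the merge must neither add nor drop rows or columns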
assert merged_results_df.shape == result_df.shape

# %%
merged_results_df[~pd.isnull(merged_results_df.human_rating_score)]

# %%
merged_results_df.to_excel(osp.join(output_dir, 'results.xlsx'), index=False)

# %%
@@ -53,7 +53,7 @@
# select * from result_to_analyze
# where
# model_configuration_id = 'mc030'
-# OR model_configuration_id = 'mc031'
+# OR model_configuration_id = 'mc035'
# OR model_configuration_id = 'mc032'
# OR model_configuration_id = 'mc033'
# OR model_configuration_id = 'mc034'
@@ -67,7 +67,7 @@
# select * from result_chn_prompt_renamed
# where
# model_configuration_id = 'mc030'
-# OR model_configuration_id = 'mc031'
+# OR model_configuration_id = 'mc035'
# OR model_configuration_id = 'mc032'
# OR model_configuration_id = 'mc033'
# OR model_configuration_id = 'mc034'
@@ -205,7 +205,8 @@
# + magic_args="by_prompt_family <<" language="sql"
# select
# p.prompt_family as prompt_family,
-# count(DISTINCT p.variation_id) / 2 as number_of_prompts, -- treat chinese prompt and english prompt the same.
+# count(DISTINCT p.variation_id) as number_of_prompts,
+# -- count(DISTINCT p.variation_id) / 2 as number_of_prompts, -- uncomment to treat the Chinese and English versions of a prompt as one prompt.
# count(*) as total_count,
# count(*) filter (result != 'fail') as total_count_exclude_indecisive,
# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate,
@@ -172,6 +172,9 @@ def suggest_language(q_text):
result['model_conf_id'] = [model_id_params_to_model_config_mapping[
(row['model_id'], row['model_params'])] for _, row in result.iterrows()]

+# update the correctness column: use the human rating when present, otherwise fall back to the auto-marked correctness
+result['final_score'] = result['human_rating_score'].fillna(result['correctness'])
+
# counting
# let's use polars from now
result = pl.DataFrame(result)
@@ -192,14 +195,16 @@
).agg(pl.all().first())


+
+
result_counts = result.group_by(
['question_id', 'language', 'prompt_variant_id', 'model_conf_id', 'experiment_date']
).agg(
-pl.col('correctness').filter(pl.col('correctness') == 0).count().alias('fail'),
-pl.col('correctness').filter(pl.col('correctness') == 1).count().alias('very_wrong'),
-pl.col('correctness').filter(pl.col('correctness') == 2).count().alias('wrong'),
-pl.col('correctness').filter(pl.col('correctness') == 3).count().alias('correct'),
-pl.col('correctness').count().alias('rounds')
+pl.col('final_score').filter(pl.col('final_score') == 0).count().alias('fail'),
+pl.col('final_score').filter(pl.col('final_score') == 1).count().alias('very_wrong'),
+pl.col('final_score').filter(pl.col('final_score') == 2).count().alias('wrong'),
+pl.col('final_score').filter(pl.col('final_score') == 3).count().alias('correct'),
+pl.col('final_score').count().alias('rounds')
)

result_counts
@@ -31,7 +31,7 @@ def exact_match_correctness(answer, options, correctness):
option_occurance = [0, 0, 0]
scores = [option_score_mapping[x] for x in correctness]
for i, o in zip(range(3), options):
-if o.lower() in answer.lower():
+if o.strip().lower() in answer.strip().lower():
option_occurance[i] = 1
if sum(option_occurance) == 1:
score = scores[option_occurance.index(1)]
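# Note on the change above (hypothetical values): an option stored as '80% '
# with a trailing space never matched an answer like 'the answer is 80%' under
# the old check; after .strip() the needle becomes '80%' and the substring
# test succeeds.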
