diff --git a/automation-api/yival_experiments/notebooks/human_rating.py b/automation-api/yival_experiments/notebooks/human_rating.py
new file mode 100644
index 0000000..fbcee3e
--- /dev/null
+++ b/automation-api/yival_experiments/notebooks/human_rating.py
@@ -0,0 +1,124 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.16.2
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Usage
+#
+# Use this notebook to generate a file containing the results whose auto-marked correctness differs from the LLM agent evaluator's rating, rate those rows by hand, and then merge the ratings back.
+
+# %% [markdown]
+# ## Generate file
+
+# %%
+# use duckdb through the %sql magic
+# %load_ext sql
+
+# %%
+# %sql duckdb://
+
+# %%
+import os.path as osp
+import pandas as pd
+
+output_dir = '../output/archives/20240401'
+
+# %%
+result_file = osp.join(output_dir, 'results.xlsx')
+
+result_df = pd.read_excel(result_file)
+
+# %% magic_args="--save result_to_check_1 " language="sql"
+# select *
+# from result_df
+# where auto_mark_correctness != correctness
+
+# %% magic_args="--save result_to_check_2" language="sql"
+# select
+#     *,
+#     case
+#         when auto_mark_correctness = 0 and correctness = 3 and not contains(lower(raw_output), lower(correct_answer)) then 1
+#         when (correctness = 1 or correctness = 2) and contains(lower(raw_output), lower(correct_answer)) then 1
+#         when auto_mark_correctness = 0 and correctness = 0 then 1
+#         when auto_mark_correctness = 1 OR auto_mark_correctness = 2 OR auto_mark_correctness = 3 then 1
+#         else 0
+#     end as need_to_check
+# from result_to_check_1
+# where need_to_check = 1
+
+# %%
+# result_to_check = %sql select * exclude (need_to_check) from result_to_check_2
+
+# %%
+result_to_check_df = result_to_check.DataFrame()
+
+# %%
+result_to_check_df.to_excel(osp.join(output_dir, 'human_rating.xlsx'), index=False)
+
+# %%
+
+# %%
+
+# %%
+raise Exception("Please edit the human_rating file.")
+
+# %% [markdown]
+# ## Edit the file, then run the cells below to merge it back
+
+# %%
+rating_file = osp.join(output_dir, 'human_rating.xlsx')
+
+# %%
+human_ratings = pd.read_excel(rating_file)
+
+# %%
+human_ratings[~pd.isnull(human_ratings.human_rating_score)]
+
+# %%
+result_df_copy = result_df.copy()
+
+# %%
+result_df_copy = result_df_copy.reset_index()
+
+# %% magic_args="merged_results << " language="sql"
+# select
+#     r.* exclude (human_rating_score),
+#     l.human_rating_score
+# from
+#     result_df_copy r left join human_ratings l
+#     on r.experiment_date = l.experiment_date
+#     and r.question_id = l.question_id
+#     and r.model_id = l.model_id
+#     and r.model_params = l.model_params
+#     and r.prompt_template = l.prompt_template
+
+# %%
+merged_results_df = merged_results.DataFrame()
+
+# %%
+merged_results_df
+
+# %%
+result_df_copy
+
+# %%
+assert merged_results_df.shape == result_df.shape
+
+# %%
+merged_results_df[~pd.isnull(merged_results_df.human_rating_score)]
+
+# %%
+merged_results_df.to_excel(osp.join(output_dir, 'results.xlsx'), index=False)
+
+# %%
diff --git a/automation-api/yival_experiments/notebooks/result_data_analysis.py b/automation-api/yival_experiments/notebooks/result_data_analysis.py
index 29ba865..ee4fcd4 100644
--- a/automation-api/yival_experiments/notebooks/result_data_analysis.py
+++ b/automation-api/yival_experiments/notebooks/result_data_analysis.py
@@ -53,7 +53,7 @@
 # select * from result_to_analyze
 # where
 # model_configuration_id = 'mc030'
-# OR model_configuration_id = 'mc031'
+# OR model_configuration_id = 'mc035'
 # OR model_configuration_id = 'mc032'
 # OR model_configuration_id = 'mc033'
 # OR model_configuration_id = 'mc034'
@@ -67,7 +67,7 @@
 # select * from result_chn_prompt_renamed
 # where
 # model_configuration_id = 'mc030'
-# OR model_configuration_id = 'mc031'
+# OR model_configuration_id = 'mc035'
 # OR model_configuration_id = 'mc032'
 # OR model_configuration_id = 'mc033'
 # OR model_configuration_id = 'mc034'
@@ -205,7 +205,8 @@
 # + magic_args="by_prompt_family <<" language="sql"
 # select
 # p.prompt_family as prompt_family,
-# count(DISTINCT p.variation_id) / 2 as number_of_prompts, -- treat chinese prompt and english prompt the same.
+# count(DISTINCT p.variation_id) as number_of_prompts,
+# -- count(DISTINCT p.variation_id) / 2 as number_of_prompts, -- uncomment to treat the Chinese and English prompts as the same
 # count(*) as total_count,
 # count(*) filter (result != 'fail') as total_count_exclude_indecisive,
 # count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate,
diff --git a/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py b/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py
index 286b19a..d3453e5 100644
--- a/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py
+++ b/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py
@@ -172,6 +172,9 @@ def suggest_language(q_text):
 result['model_conf_id'] = [model_id_params_to_model_config_mapping[
     (row['model_id'], row['model_params'])] for _, row in result.iterrows()]
 
+# final_score: use the human rating where present, otherwise the auto-marked correctness
+result['final_score'] = result['human_rating_score'].fillna(result['correctness'])
+
 # counting
 # let's use polars from now
 result = pl.DataFrame(result)
@@ -192,14 +195,16 @@ def suggest_language(q_text):
 ).agg(pl.all().first())
 
 
+
+
 result_counts = result.group_by(
     ['question_id', 'language', 'prompt_variant_id', 'model_conf_id', 'experiment_date']
 ).agg(
-    pl.col('correctness').filter(pl.col('correctness') == 0).count().alias('fail'),
-    pl.col('correctness').filter(pl.col('correctness') == 1).count().alias('very_wrong'),
-    pl.col('correctness').filter(pl.col('correctness') == 2).count().alias('wrong'),
-    pl.col('correctness').filter(pl.col('correctness') == 3).count().alias('correct'),
-    pl.col('correctness').count().alias('rounds')
+    pl.col('final_score').filter(pl.col('final_score') == 0).count().alias('fail'),
+    pl.col('final_score').filter(pl.col('final_score') == 1).count().alias('very_wrong'),
+    pl.col('final_score').filter(pl.col('final_score') == 2).count().alias('wrong'),
+    pl.col('final_score').filter(pl.col('final_score') == 3).count().alias('correct'),
+    pl.col('final_score').count().alias('rounds')
 )
 
 result_counts
diff --git a/automation-api/yival_experiments/scripts/generate_result.py b/automation-api/yival_experiments/scripts/generate_result.py
index 35270ed..a8c0cf1 100644
--- a/automation-api/yival_experiments/scripts/generate_result.py
+++ b/automation-api/yival_experiments/scripts/generate_result.py
@@ -31,7 +31,7 @@ def exact_match_correctness(answer, options, correctness):
     option_occurance = [0, 0, 0]
     scores = [option_score_mapping[x] for x in correctness]
     for i, o in zip(range(3), options):
-        if o.lower() in answer.lower():
+        if o.strip().lower() in answer.strip().lower():
             option_occurance[i] = 1
     if sum(option_occurance) == 1:
         score = scores[option_occurance.index(1)]
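
Illustrative sketch (not part of the patch): the override rule added in upload_to_ai_eval_sheet.py keeps the human rating where one was entered in human_rating.xlsx and falls back to the auto/LLM-marked correctness otherwise. Column names follow the patch; the toy data below is hypothetical.

import pandas as pd

# Hypothetical rows mirroring the columns used in the patch.
result = pd.DataFrame({
    "question_id": ["q1", "q2", "q3"],
    "correctness": [3, 1, 2],               # auto / LLM evaluator score
    "human_rating_score": [None, 3, None],  # filled in by hand in human_rating.xlsx
})

# Same rule as the patch: prefer the human rating, fall back to correctness.
result["final_score"] = result["human_rating_score"].fillna(result["correctness"])
# final_score -> 3.0, 3.0, 2.0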