diff --git a/automation-api/yival_experiments/notebooks/human_rating.py b/automation-api/yival_experiments/notebooks/human_rating.py
new file mode 100644
index 0000000..fbcee3e
--- /dev/null
+++ b/automation-api/yival_experiments/notebooks/human_rating.py
@@ -0,0 +1,124 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.16.2
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Usage
+#
+# Use this notebook to generate a file containing the results whose auto-marked correctness differs from the LLM agent evaluator's rating, rate those rows by hand, and then merge the ratings back.
+
+# %% [markdown]
+# ## Generate file
+
+# %%
+# use duckdb through the %sql magic
+# %load_ext sql
+
+# %%
+# %sql duckdb://
+
+# %%
+import os.path as osp
+import pandas as pd
+
+output_dir = '../output/archives/20240401'
+
+# %%
+result_file = osp.join(output_dir, 'results.xlsx')
+
+result_df = pd.read_excel(result_file)
+
+# %% magic_args="--save result_to_check_1 " language="sql"
+# select *
+# from result_df
+# where auto_mark_correctness != correctness
+
+# %% magic_args="--save result_to_check_2" language="sql"
+# select
+#     *,
+#     case
+#         when auto_mark_correctness = 0 and correctness = 3 and not contains(lower(raw_output), lower(correct_answer)) then 1
+#         when (correctness = 1 or correctness = 2) and contains(lower(raw_output), lower(correct_answer)) then 1
+#         when auto_mark_correctness = 0 and correctness = 0 then 1
+#         when auto_mark_correctness = 1 OR auto_mark_correctness = 2 OR auto_mark_correctness = 3 then 1
+#         else 0
+#     end as need_to_check
+# from result_to_check_1
+# where need_to_check = 1
+
+# %%
+# result_to_check = %sql select * exclude (need_to_check) from result_to_check_2
+
+# %%
+result_to_check_df = result_to_check.DataFrame()
+
+# %%
+result_to_check_df.to_excel(osp.join(output_dir, 'human_rating.xlsx'), index=False)
+
+# %%
+
+# %%
+
+# %%
+raise Exception("Please edit the human_rating file.")
+
+# %% [markdown]
+# ## Edit the file, then run the cells below to merge it back
+
+# %%
+rating_file = osp.join(output_dir, 'human_rating.xlsx')
+
+# %%
+human_ratings = pd.read_excel(rating_file)
+
+# %%
+human_ratings[~pd.isnull(human_ratings.human_rating_score)]
+
+# %%
+result_df_copy = result_df.copy()
+
+# %%
+result_df_copy = result_df_copy.reset_index()
+
+# %% magic_args="merged_results << " language="sql"
+# select
+#     r.* exclude (human_rating_score),
+#     l.human_rating_score
+# from
+#     result_df_copy r left join human_ratings l
+#     on r.experiment_date = l.experiment_date
+#     and r.question_id = l.question_id
+#     and r.model_id = l.model_id
+#     and r.model_params = l.model_params
+#     and r.prompt_template = l.prompt_template
+
+# %%
+merged_results_df = merged_results.DataFrame()
+
+# %%
+merged_results_df
+
+# %%
+result_df_copy
+
+# %%
+assert merged_results_df.shape == result_df.shape
+
+# %%
+merged_results_df[~pd.isnull(merged_results_df.human_rating_score)]
+
+# %%
+merged_results_df.to_excel(osp.join(output_dir, 'results.xlsx'), index=False)
+
+# %%
diff --git a/automation-api/yival_experiments/notebooks/result_data_analysis.py b/automation-api/yival_experiments/notebooks/result_data_analysis.py
index 29ba865..ee4fcd4 100644
--- a/automation-api/yival_experiments/notebooks/result_data_analysis.py
+++ b/automation-api/yival_experiments/notebooks/result_data_analysis.py
@@ -53,7 +53,7 @@
 # select * from result_to_analyze
 # where
 # model_configuration_id = 'mc030'
-# OR model_configuration_id = 'mc031'
+# OR model_configuration_id = 'mc035'
 # OR model_configuration_id = 'mc032'
 # OR model_configuration_id = 'mc033'
 # OR model_configuration_id = 'mc034'
@@ -67,7 +67,7 @@
 # select * from result_chn_prompt_renamed
 # where
 # model_configuration_id = 'mc030'
-# OR model_configuration_id = 'mc031'
+# OR model_configuration_id = 'mc035'
 # OR model_configuration_id = 'mc032'
 # OR model_configuration_id = 'mc033'
 # OR model_configuration_id = 'mc034'
@@ -205,7 +205,8 @@
 # + magic_args="by_prompt_family <<" language="sql"
 # select
 # p.prompt_family as prompt_family,
-# count(DISTINCT p.variation_id) / 2 as number_of_prompts, -- treat chinese prompt and english prompt the same.
+# count(DISTINCT p.variation_id) as number_of_prompts,
+# -- count(DISTINCT p.variation_id) / 2 as number_of_prompts, -- uncomment to treat the Chinese and English prompts as the same
 # count(*) as total_count,
 # count(*) filter (result != 'fail') as total_count_exclude_indecisive,
 # count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate,
diff --git a/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py b/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py
index 286b19a..d3453e5 100644
--- a/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py
+++ b/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py
@@ -172,6 +172,9 @@ def suggest_language(q_text):
 result['model_conf_id'] = [model_id_params_to_model_config_mapping[
     (row['model_id'], row['model_params'])] for _, row in result.iterrows()]
 
+# final_score: use the human rating where present, otherwise the auto-marked correctness
+result['final_score'] = result['human_rating_score'].fillna(result['correctness'])
+
 # counting
 # let's use polars from now
 result = pl.DataFrame(result)
@@ -192,14 +195,16 @@ def suggest_language(q_text):
 ).agg(pl.all().first())
 
 
+
+
 result_counts = result.group_by(
     ['question_id', 'language', 'prompt_variant_id', 'model_conf_id', 'experiment_date']
 ).agg(
-    pl.col('correctness').filter(pl.col('correctness') == 0).count().alias('fail'),
-    pl.col('correctness').filter(pl.col('correctness') == 1).count().alias('very_wrong'),
-    pl.col('correctness').filter(pl.col('correctness') == 2).count().alias('wrong'),
-    pl.col('correctness').filter(pl.col('correctness') == 3).count().alias('correct'),
-    pl.col('correctness').count().alias('rounds')
+    pl.col('final_score').filter(pl.col('final_score') == 0).count().alias('fail'),
+    pl.col('final_score').filter(pl.col('final_score') == 1).count().alias('very_wrong'),
+    pl.col('final_score').filter(pl.col('final_score') == 2).count().alias('wrong'),
+    pl.col('final_score').filter(pl.col('final_score') == 3).count().alias('correct'),
+    pl.col('final_score').count().alias('rounds')
 )
 
 result_counts
diff --git a/automation-api/yival_experiments/scripts/generate_result.py b/automation-api/yival_experiments/scripts/generate_result.py
index 35270ed..a8c0cf1 100644
--- a/automation-api/yival_experiments/scripts/generate_result.py
+++ b/automation-api/yival_experiments/scripts/generate_result.py
@@ -31,7 +31,7 @@ def exact_match_correctness(answer, options, correctness):
     option_occurance = [0, 0, 0]
     scores = [option_score_mapping[x] for x in correctness]
     for i, o in zip(range(3), options):
-        if o.lower() in answer.lower():
+        if o.strip().lower() in answer.strip().lower():
             option_occurance[i] = 1
     if sum(option_occurance) == 1:
         score = scores[option_occurance.index(1)]
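
Illustrative sketch (not part of the patch): the override rule added in upload_to_ai_eval_sheet.py keeps the human rating where one was entered in human_rating.xlsx and falls back to the auto/LLM-marked correctness otherwise. Column names follow the patch; the toy data below is hypothetical.

import pandas as pd

# Hypothetical rows mirroring the columns used in the patch.
result = pd.DataFrame({
    "question_id": ["q1", "q2", "q3"],
    "correctness": [3, 1, 2],               # auto / LLM evaluator score
    "human_rating_score": [None, 3, None],  # filled in by hand in human_rating.xlsx
})

# Same rule as the patch: prefer the human rating, fall back to correctness.
result["final_score"] = result["human_rating_score"].fillna(result["correctness"])
# final_score -> 3.0, 3.0, 2.0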