diff --git a/analysis/avg_agreement_final.py b/analysis/avg_agreement_final.py
index b1bd0b1..2353eb2 100644
--- a/analysis/avg_agreement_final.py
+++ b/analysis/avg_agreement_final.py
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import seaborn as sns
+import pandas as pd
 
 FONT_SIZES = {"small": 12, "medium": 16, "large": 18}
 COLORS = {"green": "#355145", "purple": "#d8a6e5", "orange": "#fe7759"}
@@ -21,57 +22,85 @@ plt.rcParams.update(PLOT_PARAMS)
 
+# data = {
+#     "LlaMa 3.1 8B": [0.3533086666014079, 0.052422082615756406],
+#     "Aya 23 35B": [0.43767196047824003, 0.026040919354464294],
+#     # "Aya 23 8B": [0.013483014909052663, 0.03363706833599835],
+#     "Command R": [0.374457668650282, 0.02926089754079793],
+#     "Command R+": [0.3830841816733316, 0.020185255968455686],
+#     "Gemma 1.1 7B": [0.5190375637539242, 0.027757722654111305],
+#     "Gemma 2 9B": [0.5181663123111222, 0.031090119385244894],
+#     "LlaMa 3 70B": [0.5685224105896568, 0.04853344616275034],
+#     "LlaMa 3 8B": [0.37936948540837095, 0.032172769265151994],
+#     "LlaMa 3.1 70B": [0.603536768244583, 0.027191895488989915],
+#     "Mistal 7B v0.2": [0.4071166722276529, 0.04577594028555328],
+#     "Mistral 7B v0.3": [0.41195018984687265, 0.056184679972755454],
+#     "GPT-4 Turbo": [0.6106943361444249, 0.02932446842558468],
+#     "GPT-4o": [0.5833874065757011, 0.023695391445384514],
+# }
+
 data = {
-    "LlaMa 3.1 8B": [0.3533086666014079, 0.052422082615756406],
-    "Aya 23 35B": [0.43767196047824003, 0.026040919354464294],
-    # "Aya 23 8B": [0.013483014909052663, 0.03363706833599835],
-    "Command R": [0.374457668650282, 0.02926089754079793],
-    "Command R+": [0.3830841816733316, 0.020185255968455686],
-    "Gemma 1.1 7B": [0.5190375637539242, 0.027757722654111305],
-    "Gemma 2 9B": [0.5181663123111222, 0.031090119385244894],
-    "LlaMa 3 70B": [0.5685224105896568, 0.04853344616275034],
-    "LlaMa 3 8B": [0.37936948540837095, 0.032172769265151994],
-    "LlaMa 3.1 70B": [0.603536768244583, 0.027191895488989915],
-    "Mistal 7B v0.2": [0.4071166722276529, 0.04577594028555328],
-    "Mistral 7B v0.3": [0.41195018984687265, 0.056184679972755454],
-    "GPT-4 Turbo": [0.6106943361444249, 0.02932446842558468],
-    "GPT-4o": [0.5833874065757011, 0.023695391445384514],
+    "Mistral 7B v0.2": [0.41964902527302483, 0.041728704319417186, "Generative RM"],
+    "Aya 23 35B": [0.4366594509037704, 0.02590083631166214, "Generative RM"],
+    "Aya 23 8B": [0.38482902327857127, 0.02093522000984476, "Generative RM"],
+    "Command R": [0.370172816882575, 0.02977439059146716, "Generative RM"],
+    "Command R+": [0.38117473236836474, 0.020413901190603385, "Generative RM"],
+    "Gemma 1.1 7B": [0.5121848983276365, 0.02775593676763153, "Generative RM"],
+    "Gemma 2 9B": [0.5239388151608217, 0.029070955636084302, "Generative RM"],
+    "Llama 3 70B": [0.5738032949863474, 0.04813697578838559, "Generative RM"],
+    "Llama 3 8B": [0.3426278270154337, 0.028673093628218196, "Generative RM"],
+    "Llama 3.1 70B": [0.6074197074501972, 0.028414614724563008, "Generative RM"],
+    "Llama 3.1 8B": [0.34965468089191665, 0.056407978898463204, "Generative RM"],
+    "Mistral 7B v0.3": [0.4166882337797498, 0.05085550655767351, "Generative RM"],
+    "GPT-4 Turbo": [0.6096953791655624, 0.028784709595173846, "Generative RM"],
+    "GPT-4o": [0.5833907047087866, 0.023692522150173454, "Generative RM"],
+    "Tulu 2 DPO 13B": [0.3416546214690787, 0.1304713944811808, "Implicit RM"],
+    "BTRM Qwen 2 7B": [0.4893276344968342, 0.07031889836622843, "Classifier RM"],
+    "Eurus RM 7B": [0.3586485854871021, 0.09638527344174744, "Classifier RM"],
+    "Zephyr 7B Beta": [0.35011426942621166, 0.176041224588175, "Implicit RM"],
+    "Mistral 7B DPO": [0.1902062108486662, 0.08462799373351747, "Implicit RM"],
+    "Qwen1.5 4B": [0.38751934608609767, 0.055096683780610285, "Implicit RM"],
+    "StableLM Zephyr 3B": [0.1708047069636795, 0.06315971482897487, "Implicit RM"],
+    "Tulu 2.5 13B RM": [0.3038059897554214, 0.1147333149007323, "Classifier RM"],
+    "URM LLaMa 3.1 8B": [0.3969881479982245, 0.07787037973169045, "Classifier RM"],
 }
+
 sorted_data = dict(sorted(data.items(), key=lambda item: item[1][0]))
 labels_sorted = list(sorted_data.keys())
 means_sorted = [v[0] for v in sorted_data.values()]
 std_devs_sorted = [v[1] for v in sorted_data.values()]
+model_type = [v[2] for v in sorted_data.values()]
+
+df = pd.DataFrame({"means": means_sorted, "std": std_devs_sorted, "model_type": model_type})
 
-# sns.set(style="whitegrid")
-# palette = sns.color_palette("coolwarm", len(labels_sorted))
-plt.figure(figsize=(7, 7))
+plt.figure(figsize=(12, 7))
 x_pos_sorted = np.arange(len(labels_sorted))
 
 ax1 = sns.barplot(
-    x=x_pos_sorted,
-    y=means_sorted,
+    x=df.index,
+    y="means",
+    data=df,
     errorbar=None,
-    color=COLORS.get("orange"),
-    edgecolor=COLORS.get("green"),
+    hue="model_type",
+    hue_order=["Classifier RM", "Generative RM", "Implicit RM"],
+    palette=[COLORS.get("green"), COLORS.get("purple"), COLORS.get("orange")],
+    # color=COLORS.get("orange"),
+    # edgecolor=COLORS.get("green"),
 )
 plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt="none", c="black", capsize=5)
-# ax1.spines["top"].set_color("black")
-# ax1.spines["right"].set_color("black")
-# ax1.spines["left"].set_color("black")
-# ax1.spines["bottom"].set_color("black")
-# for spine in ax1.spines.values():
-#     spine.set_linewidth(2)  # Make the border thicker
 plt.grid(color="gray", axis="y", alpha=0.2)
 plt.ylim(0, 0.8)
 plt.gca().set_axisbelow(True)
+plt.legend(frameon=False)
+plt.xlabel("")
 plt.xticks(x_pos_sorted, labels_sorted, rotation=45, ha="right")
 plt.ylabel("Cohen's Kappa")
-plt.title("Average Inner-Model Agreement Across Languages")
+plt.title("Average Inner-Model Agreement Across Languages", fontsize=18)
 plt.tight_layout()
-plt.savefig("plots/innermodel_agreement_green_oracle.pdf", bbox_inches="tight")
+plt.savefig("plots/innermodel_agreement_green_oracle_all.pdf", bbox_inches="tight")