Use Yival to run Experiment (#25)
* add yival to dependencies

* migrate to pydantic v2

because yival requires v2

* langchain still uses pydantic v1

* update pandera version and make the code work

* experiment files so far

* add readme

* some more files

* update readme

* update readme

* add metrics data sheet

* update experiment structure

* minor changes

* have to set a callback to make PaLM work

* add replicate key

* remove the hack

I thought it was just my network issue

* add rounds in model variation

and update data

* update dependencies

use my fork of yival now

* Don't evaluate formulas

I think we won't use formulas in these sheets, and if formula evaluation
is enabled, some text gets evaluated to incorrect values. For example,
"-6C" results in "#ERROR".

* Update github workflow

* update model compare function

- use Redis cache
- use litellm directly instead of llm_completion from yival.
  llm_completion is just the same as litellm.completion with some custom
  name mappings, which we don't use. (A sketch of the litellm + Redis
  pattern follows the change summary below.)

* strip question text

* update generate result script

* Latest results

* update example

* update dependencies

* add todo

* set model name for evaluator

* latest experiment yaml

* questions

* update README

* Add readme about Redis cache

* latest experiment results

* scripts and notebooks

* add options

* add langdetect as dependency

* add gitignore

* add experiment archive

* questions

* create a custom evaluator for GPT-4-based evaluation

Not using the one in Yival because it's moving fast.

* take care of some possible errors when loading data

* update Readme

* remove some unneeded files

* rename script
semio authored Jan 25, 2024
1 parent e9da232 commit 7e789cc
Showing 41 changed files with 6,730 additions and 1,139 deletions.
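The "update model compare function" bullet above replaces Yival's llm_completion helper with direct litellm calls backed by a Redis cache. Below is a minimal sketch of that pattern, assuming litellm's built-in Redis cache support; the model name, host, port, and prompt are illustrative placeholders, not values from this commit.

import litellm
from litellm import completion
from litellm.caching import Cache

# Point litellm at a Redis instance so identical calls are answered from
# cache instead of hitting the provider again (host/port are placeholders).
litellm.cache = Cache(type="redis", host="localhost", port="6379")


def ask_model(model_id: str, question: str) -> str:
    # litellm.completion already speaks the OpenAI chat format for many
    # providers, so no extra model-name mapping is needed here.
    response = completion(
        model=model_id,
        messages=[{"role": "user", "content": question.strip()}],
        caching=True,  # opt this call into the configured Redis cache
    )
    return response.choices[0].message.content


if __name__ == "__main__":
    print(ask_model("gpt-3.5-turbo", "What share of the world lives in extreme poverty?"))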
6 changes: 3 additions & 3 deletions .github/workflows/automation-api.yaml
@@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python: [ 3.9 ]
python: [ 3.11 ]

steps:
- name: Checkout code
@@ -40,7 +40,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python: [ 3.9 ]
python: [ 3.11 ]

steps:
- name: Checkout code
@@ -60,7 +60,7 @@ jobs:
- name: Install Poetry
uses: snok/install-poetry@v1
with:
version: 1.4.2
version: 1.6.1
virtualenvs-create: true
virtualenvs-in-project: true

4 changes: 3 additions & 1 deletion automation-api/.env.example
@@ -8,13 +8,15 @@ OPENAI_ORG_ID=""
## for Huggingface Hub
HUGGINGFACEHUB_API_TOKEN=""
## for PALM
GOOGLE_API_KEY=""
PALM_API_KEY=""
## for iFlytek
IFLYTEK_API_KEY=""
IFLYTEK_API_SECRET=""
IFLYTEK_APPID=""
## for Alibaba
DASHSCOPE_API_KEY=""
# for Replicate
REPLICATE_API_KEY=""

# For local development
SERVICE_ACCOUNT_CREDENTIALS=""
38 changes: 33 additions & 5 deletions automation-api/lib/ai_eval_spreadsheet/schemas.py
@@ -5,26 +5,28 @@
# for more info
# Note that most types are str since spreadsheet columns can be formulas

from datetime import datetime
from typing import Optional

import pandas as pd
import pandera as pa
from pandera.engines.pandas_engine import PydanticModel
from pydantic import BaseModel, Field, validator
from pydantic import BaseModel, ConfigDict, Field, field_validator


class Question(BaseModel):
model_config = ConfigDict(coerce_numbers_to_str=True)

include_in_next_evaluation: Optional[bool] = Field(
None, title="Include in next evaluation"
None, title="Include in next evaluation", validate_default=True
)
question_id: Optional[str] = Field(None, title="Question ID")
language: Optional[str] = Field(None, title="Language")
published_version_of_question: Optional[str] = Field(
None, title="Published version of question"
)

@validator("include_in_next_evaluation", pre=True, always=True)
@field_validator("include_in_next_evaluation", mode="before")
@classmethod
def default_if_nan(cls, v): # noqa: N805
return False if pd.isna(v) else v

@@ -36,6 +38,8 @@ class Config:


class QuestionOption(BaseModel):
model_config = ConfigDict(coerce_numbers_to_str=True)

question_option_id: Optional[str] = Field(None, title="Question Option ID")
question_id: Optional[str] = Field(None, title="Question ID")
language: Optional[str] = Field(None, title="Language")
@@ -53,6 +57,8 @@ class Config:


class PromptVariation(BaseModel):
model_config = ConfigDict(coerce_numbers_to_str=True)

include_in_next_evaluation: Optional[bool] = Field(
None, title="Include in next evaluation"
)
@@ -78,6 +84,8 @@ class Config:


class GenAiModel(BaseModel):
model_config = ConfigDict(coerce_numbers_to_str=True, protected_namespaces=())

model_id: Optional[str] = Field(None, title="Model ID")
vendor: Optional[str] = Field(None, title="Vendor")
model_name: Optional[str] = Field(None, title="Model name")
@@ -90,6 +98,8 @@ class Config:


class GenAiModelConfig(BaseModel):
model_config = ConfigDict(coerce_numbers_to_str=True, protected_namespaces=())

include_in_next_evaluation: Optional[bool] = Field(
None, title="Include in next evaluation"
)
@@ -107,12 +117,28 @@ class Config:
coerce = True


class Metric(BaseModel):
name: Optional[str] = Field(None, title="Name")
description: Optional[str] = Field(None, title="Description")
prompt: Optional[str] = Field(None, title="Prompt")
choices: Optional[str] = Field(None, title="Choices")
choice_scores: Optional[str] = Field(None, title="Choice Scores")


class MetricsDf(pa.DataFrameModel):
class Config:
dtype = PydanticModel(Metric)
coerce = True


class EvalResult(BaseModel):
model_config = ConfigDict(coerce_numbers_to_str=True, protected_namespaces=())

question_id: Optional[str] = Field(None, title="Question ID")
language: Optional[str] = Field(None, title="Language")
prompt_variation_id: Optional[str] = Field(None, title="Prompt variation ID")
model_configuration_id: Optional[str] = Field(None, title="Model Configuration ID")
last_evaluation_datetime: Optional[datetime] = Field(None, title="Last Evaluation")
last_evaluation_datetime: Optional[str] = Field(None, title="Last Evaluation")
percent_correct: Optional[float] = Field(None, title="Percent Correct")
percent_wrong: Optional[float] = Field(None, title="Percent Wrong")
percent_very_wrong: Optional[float] = Field(None, title="Percent Very Wrong")
@@ -128,6 +154,8 @@ class Config:


class SessionResult(BaseModel):
model_config = ConfigDict(coerce_numbers_to_str=True, protected_namespaces=())

session_id: Optional[str] = Field(None, title="Session ID")
session_time: Optional[str] = Field(None, title="Session Time")
prompt_variation_id: Optional[str] = Field(None, title="Prompt Variation ID")
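The Metric and MetricsDf additions above follow the same pattern as the other sheet schemas: a pydantic row model wrapped in a pandera DataFrameModel via the PydanticModel engine, so every row of the new Metrics worksheet is coerced into a Metric. Below is a small self-contained sketch of how that validation behaves; the sample row is made up for illustration and is not taken from the actual Metrics sheet.

from typing import Optional

import pandas as pd
import pandera as pa
from pandera.engines.pandas_engine import PydanticModel
from pydantic import BaseModel, Field


class Metric(BaseModel):
    name: Optional[str] = Field(None, title="Name")
    description: Optional[str] = Field(None, title="Description")
    prompt: Optional[str] = Field(None, title="Prompt")
    choices: Optional[str] = Field(None, title="Choices")
    choice_scores: Optional[str] = Field(None, title="Choice Scores")


class MetricsDf(pa.DataFrameModel):
    class Config:
        # Each row must validate as a Metric instance.
        dtype = PydanticModel(Metric)
        coerce = True


# Illustrative data only.
df = pd.DataFrame(
    [
        {
            "name": "correctness",
            "description": "Is the answer factually right?",
            "prompt": "Grade the answer from A to C.",
            "choices": "A, B, C",
            "choice_scores": "1, 0.5, 0",
        }
    ]
)

print(MetricsDf.validate(df))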
24 changes: 19 additions & 5 deletions automation-api/lib/ai_eval_spreadsheet/wrapper.py
@@ -10,6 +10,8 @@
GenAiModelConfig,
GenAiModelConfigsDf,
GenAiModelsDf,
Metric,
MetricsDf,
PromptVariation,
PromptVariationsDf,
Question,
@@ -36,6 +38,7 @@ class AiEvalData:
gen_ai_model_configs: Optional[
GsheetsWorksheetEditor[GenAiModelConfigsDf, GenAiModelConfig]
] = None
metrics: Optional[GsheetsWorksheetEditor[MetricsDf, Metric]] = None
evaluation_results: Optional[
GsheetsWorksheetEditor[EvalResult, EvalResultsDf]
] = None
@@ -50,6 +53,7 @@ class AiEvalData:
"prompt_variations": "Prompt variations",
"gen_ai_models": "Models",
"gen_ai_model_configs": "Model configurations",
"metrics": "Metrics",
"evaluation_results": "Latest Results",
"session_results": "Sessions",
}
@@ -71,7 +75,7 @@ def read_ai_eval_data(
row_schema=Question,
worksheet_name=sheet_names["questions"],
header_row_number=0,
evaluate_formulas=True,
evaluate_formulas=False,
)

question_options = GsheetsWorksheetEditor(
@@ -80,7 +84,7 @@
row_schema=QuestionOption,
worksheet_name=sheet_names["question_options"],
header_row_number=0,
evaluate_formulas=True,
evaluate_formulas=False,
)

prompt_variations = GsheetsWorksheetEditor(
@@ -89,7 +93,7 @@
row_schema=PromptVariation,
worksheet_name=sheet_names["prompt_variations"],
header_row_number=0,
evaluate_formulas=True,
evaluate_formulas=False,
)

gen_ai_models = GsheetsWorksheetEditor(
@@ -98,7 +102,7 @@
row_schema=GenAiModel,
worksheet_name=sheet_names["gen_ai_models"],
header_row_number=0,
evaluate_formulas=True,
evaluate_formulas=False,
)

gen_ai_model_configs = GsheetsWorksheetEditor(
@@ -107,7 +111,16 @@
row_schema=GenAiModelConfig,
worksheet_name=sheet_names["gen_ai_model_configs"],
header_row_number=0,
evaluate_formulas=True,
evaluate_formulas=False,
)

metrics = GsheetsWorksheetEditor(
sh=ai_eval_spreadsheet,
df_schema=MetricsDf,
row_schema=Metric,
worksheet_name=sheet_names["metrics"],
header_row_number=0,
evaluate_formulas=False,
)

evaluation_results = GsheetsWorksheetEditor(
@@ -134,6 +147,7 @@ def read_ai_eval_data(
prompt_variations=prompt_variations,
gen_ai_models=gen_ai_models,
gen_ai_model_configs=gen_ai_model_configs,
metrics=metrics,
evaluation_results=evaluation_results,
session_results=session_results,
)
3 changes: 2 additions & 1 deletion automation-api/lib/config.py
@@ -24,11 +24,12 @@ def read_config() -> dict[str, str]:
"AI_EVAL_SPREADSHEET_ID",
"AI_EVAL_DEV_SPREADSHEET_ID",
"HUGGINGFACEHUB_API_TOKEN",
"GOOGLE_API_KEY",
"PALM_API_KEY",
"IFLYTEK_APPID",
"IFLYTEK_API_KEY",
"IFLYTEK_API_SECRET",
"DASHSCOPE_API_KEY",
"REPLICATE_API_KEY",
]:
config[key] = os.getenv(key=key, default="")
return config
2 changes: 1 addition & 1 deletion automation-api/lib/gsheets/gsheets_utils.py
@@ -3,7 +3,7 @@
import pandas as pd
from gspread import Spreadsheet, Worksheet, WorksheetNotFound
from gspread_dataframe import set_with_dataframe
from pydantic.main import BaseModel
from pydantic import BaseModel

from lib.app_singleton import app_logger

4 changes: 3 additions & 1 deletion automation-api/lib/gsheets/gsheets_worksheet_data.py
@@ -4,7 +4,7 @@
import numpy as np
import pandera as pa
from pandera import DataFrameModel
from pydantic.main import BaseModel
from pydantic import BaseModel

from lib.gsheets.gsheets_utils import get_pydantic_model_field_titles

@@ -36,11 +36,13 @@ def __init__(
self.df_schema = df_schema
self.row_schema = row_schema
self.header_row_number = header_row_number

self.attributes_to_columns_map = get_pydantic_model_field_titles(
self.row_schema
)
df = df.rename(columns=inv_dict(self.attributes_to_columns_map))
df = self.replace_current_row_numbers_in_formulas(df)
# import ipdb; ipdb.set_trace()
self.df = df_schema(df)

def replace_current_row_numbers_in_formulas(
2 changes: 1 addition & 1 deletion automation-api/lib/gsheets/gsheets_worksheet_editor.py
@@ -5,7 +5,7 @@
from gspread import Spreadsheet, Worksheet
from gspread.utils import rowcol_to_a1
from pandera import DataFrameModel
from pydantic.main import BaseModel
from pydantic import BaseModel

from lib.gsheets.gsheets_utils import get_worksheet
from lib.gsheets.gsheets_worksheet_data import GsheetsWorksheetData
4 changes: 2 additions & 2 deletions automation-api/lib/llms/alibaba.py
@@ -6,7 +6,7 @@
from dashscope import Generation
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from pydantic import root_validator
from langchain.pydantic_v1 import root_validator
from tenacity import (
retry,
retry_if_exception_type,
@@ -71,7 +71,7 @@ def validate_environment(cls, values: Dict) -> Dict: # noqa: N805
dashscope.api_key = dashscope_api_key

if values["top_p"] is not None and not 0.0 <= values["top_p"] <= 1.0:
raise ValueError("max_output_tokens must be between 0 and 1")
raise ValueError("top_p must be between 0 and 1")

if values["top_k"] is not None and not 1 <= values["top_k"] <= 100:
raise ValueError("top_k must be between 1 and 100")
2 changes: 1 addition & 1 deletion automation-api/lib/llms/spark.py
@@ -5,7 +5,7 @@

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from pydantic import root_validator
from langchain.pydantic_v1 import root_validator
from tenacity import (
retry,
retry_if_exception_type,
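The alibaba.py and spark.py changes above import root_validator from langchain.pydantic_v1 rather than from pydantic, because LangChain's LLM base class is still a pydantic v1 model even though the rest of the project has moved to v2. Below is a stripped-down sketch of a custom LLM written against that shim; the EchoLLM class and its validation logic are illustrative only, not the repository's actual vendor wrappers.

from typing import Any, Dict, List, Optional

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from langchain.pydantic_v1 import root_validator  # v1 shim: LLM subclasses are pydantic v1 models


class EchoLLM(LLM):
    """Toy LLM that only demonstrates the pydantic v1 validator pattern."""

    top_p: Optional[float] = 0.8

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:  # noqa: N805
        top_p = values.get("top_p")
        if top_p is not None and not 0.0 <= top_p <= 1.0:
            raise ValueError("top_p must be between 0 and 1")
        return values

    @property
    def _llm_type(self) -> str:
        return "echo"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        # A real wrapper would call the vendor SDK here (dashscope, Spark, ...).
        return prompt


if __name__ == "__main__":
    print(EchoLLM(top_p=0.9).invoke("hello"))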
15 changes: 13 additions & 2 deletions automation-api/lib/pilot/helpers.py
@@ -13,6 +13,7 @@
from lib.ai_eval_spreadsheet.schemas import (
GenAiModel,
GenAiModelConfig,
Metric,
PromptVariation,
Question,
QuestionOption,
@@ -219,9 +220,14 @@ def get_prompt_variants(
return res


def get_model_configs(sheet: AiEvalData) -> List[ModelAndConfig]:
def get_model_configs(
sheet: AiEvalData, include_all: bool = False
) -> List[ModelAndConfig]:
models_df = sheet.gen_ai_models.data.df
model_configs_df = filter_included_rows(sheet.gen_ai_model_configs.data.df)
if include_all:
model_configs_df = sheet.gen_ai_model_configs.data.df
else:
model_configs_df = filter_included_rows(sheet.gen_ai_model_configs.data.df)

model_configs = class_objects_from_df(model_configs_df, GenAiModelConfig)
result = []
@@ -232,6 +238,11 @@ def get_model_configs(
return result


def get_metrics(sheet: AiEvalData) -> List[Metric]:
res = class_objects_from_df(sheet.metrics.data.df, Metric)
return res


def get_survey_hash(questions: List[QuestionAndOptions]) -> str:
joined = ",".join([q[0].question_id for q in questions])
return hash_dn(joined, "")
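The new include_all flag on get_model_configs and the get_metrics helper above read directly from the AI eval spreadsheet wrapper. A hedged usage sketch follows, assuming `sheet` is an AiEvalData instance already loaded via read_ai_eval_data (spreadsheet authentication and the exact loading call are omitted here).

from lib.ai_eval_spreadsheet.wrapper import AiEvalData
from lib.pilot.helpers import get_metrics, get_model_configs


def summarize(sheet: AiEvalData) -> None:
    # include_all=True ignores the "include in next evaluation" filter
    # and returns every configured model.
    all_configs = get_model_configs(sheet, include_all=True)
    enabled_configs = get_model_configs(sheet)
    metrics = get_metrics(sheet)

    print(f"{len(enabled_configs)} of {len(all_configs)} model configurations enabled")
    for metric in metrics:
        print(f"metric: {metric.name}")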