Use Yival to run Experiment (#25)
* add yival to dependencies

* migrate to pydantic v2

because yival requires v2

* langchain still uses pydantic v1

* update pandera version and make the code work

* experiment files so far

* add readme

* some more files

* update readme

* update readme

* add metrics data sheet

* update experiment structure

* minor changes

* have to set a callback to make PaLM work

* add replicate key

* remove the hack

I thought it was just my network issue

* add rounds in model variation

and update data

* update dependencies

use my fork of yival now

* Don't evaluate formulas

I think we won't use formulas in these sheets, and if formula evaluation
is enabled, some text gets evaluated to incorrect values. For example,
"-6C" results in "#ERROR".

* Update github workflow

* update model compare function

- use Redis cache
- use litellm directly instead of llm_completion from yival.
  llm_completion is just the same as litellm.completion with some custom
  name mappings, which we don't use. (A sketch of the litellm + Redis
  pattern follows the change summary below.)

* strip question text

* update generate result script

* Latest results

* update example

* update dependencies

* add todo

* set model name for evaluator

* latest experiment yaml

* questions

* update README

* Add readme about Redis cache

* latest experiment results

* scripts and notebooks

* add options

* add langdetect as dependency

* add gitignore

* add experiment archive

* questions

* create a custom evaluator for GPT-4-based evaluation

Not using the one in Yival because it's moving fast.

* take care of some possible errors when loading data

* update Readme

* remove some unneeded files

* rename script
semio authored Jan 25, 2024
1 parent e9da232 commit 7e789cc
Showing 41 changed files with 6,730 additions and 1,139 deletions.
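The "update model compare function" bullet above replaces Yival's llm_completion helper with direct litellm calls backed by a Redis cache. Below is a minimal sketch of that pattern, assuming litellm's built-in Redis cache support; the model name, host, port, and prompt are illustrative placeholders, not values from this commit.

import litellm
from litellm import completion
from litellm.caching import Cache

# Point litellm at a Redis instance so identical calls are answered from
# cache instead of hitting the provider again (host/port are placeholders).
litellm.cache = Cache(type="redis", host="localhost", port="6379")


def ask_model(model_id: str, question: str) -> str:
    # litellm.completion already speaks the OpenAI chat format for many
    # providers, so no extra model-name mapping is needed here.
    response = completion(
        model=model_id,
        messages=[{"role": "user", "content": question.strip()}],
        caching=True,  # opt this call into the configured Redis cache
    )
    return response.choices[0].message.content


if __name__ == "__main__":
    print(ask_model("gpt-3.5-turbo", "What share of the world lives in extreme poverty?"))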
6 changes: 3 additions & 3 deletions .github/workflows/automation-api.yaml
@@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python: [ 3.9 ]
python: [ 3.11 ]

steps:
- name: Checkout code
@@ -40,7 +40,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python: [ 3.9 ]
python: [ 3.11 ]

steps:
- name: Checkout code
@@ -60,7 +60,7 @@ jobs:
- name: Install Poetry
uses: snok/install-poetry@v1
with:
version: 1.4.2
version: 1.6.1
virtualenvs-create: true
virtualenvs-in-project: true

4 changes: 3 additions & 1 deletion automation-api/.env.example
@@ -8,13 +8,15 @@ OPENAI_ORG_ID=""
## for Huggingface Hub
HUGGINGFACEHUB_API_TOKEN=""
## for PALM
GOOGLE_API_KEY=""
PALM_API_KEY=""
## for iFlytek
IFLYTEK_API_KEY=""
IFLYTEK_API_SECRET=""
IFLYTEK_APPID=""
## for Alibaba
DASHSCOPE_API_KEY=""
# for Replicate
REPLICATE_API_KEY=""

# For local development
SERVICE_ACCOUNT_CREDENTIALS=""
38 changes: 33 additions & 5 deletions automation-api/lib/ai_eval_spreadsheet/schemas.py
@@ -5,26 +5,28 @@
# for more info
# Note that most types are str since spreadsheet columns can be formulas

from datetime import datetime
from typing import Optional

import pandas as pd
import pandera as pa
from pandera.engines.pandas_engine import PydanticModel
from pydantic import BaseModel, Field, validator
from pydantic import BaseModel, ConfigDict, Field, field_validator


class Question(BaseModel):
model_config = ConfigDict(coerce_numbers_to_str=True)

include_in_next_evaluation: Optional[bool] = Field(
None, title="Include in next evaluation"
None, title="Include in next evaluation", validate_default=True
)
question_id: Optional[str] = Field(None, title="Question ID")
language: Optional[str] = Field(None, title="Language")
published_version_of_question: Optional[str] = Field(
None, title="Published version of question"
)

@validator("include_in_next_evaluation", pre=True, always=True)
@field_validator("include_in_next_evaluation", mode="before")
@classmethod
def default_if_nan(cls, v): # noqa: N805
return False if pd.isna(v) else v

@@ -36,6 +38,8 @@ class Config:


class QuestionOption(BaseModel):
model_config = ConfigDict(coerce_numbers_to_str=True)

question_option_id: Optional[str] = Field(None, title="Question Option ID")
question_id: Optional[str] = Field(None, title="Question ID")
language: Optional[str] = Field(None, title="Language")
@@ -53,6 +57,8 @@ class Config:


class PromptVariation(BaseModel):
model_config = ConfigDict(coerce_numbers_to_str=True)

include_in_next_evaluation: Optional[bool] = Field(
None, title="Include in next evaluation"
)
@@ -78,6 +84,8 @@ class Config:


class GenAiModel(BaseModel):
model_config = ConfigDict(coerce_numbers_to_str=True, protected_namespaces=())

model_id: Optional[str] = Field(None, title="Model ID")
vendor: Optional[str] = Field(None, title="Vendor")
model_name: Optional[str] = Field(None, title="Model name")
@@ -90,6 +98,8 @@ class Config:


class GenAiModelConfig(BaseModel):
model_config = ConfigDict(coerce_numbers_to_str=True, protected_namespaces=())

include_in_next_evaluation: Optional[bool] = Field(
None, title="Include in next evaluation"
)
@@ -107,12 +117,28 @@ class Config:
coerce = True


class Metric(BaseModel):
name: Optional[str] = Field(None, title="Name")
description: Optional[str] = Field(None, title="Description")
prompt: Optional[str] = Field(None, title="Prompt")
choices: Optional[str] = Field(None, title="Choices")
choice_scores: Optional[str] = Field(None, title="Choice Scores")


class MetricsDf(pa.DataFrameModel):
class Config:
dtype = PydanticModel(Metric)
coerce = True


class EvalResult(BaseModel):
model_config = ConfigDict(coerce_numbers_to_str=True, protected_namespaces=())

question_id: Optional[str] = Field(None, title="Question ID")
language: Optional[str] = Field(None, title="Language")
prompt_variation_id: Optional[str] = Field(None, title="Prompt variation ID")
model_configuration_id: Optional[str] = Field(None, title="Model Configuration ID")
last_evaluation_datetime: Optional[datetime] = Field(None, title="Last Evaluation")
last_evaluation_datetime: Optional[str] = Field(None, title="Last Evaluation")
percent_correct: Optional[float] = Field(None, title="Percent Correct")
percent_wrong: Optional[float] = Field(None, title="Percent Wrong")
percent_very_wrong: Optional[float] = Field(None, title="Percent Very Wrong")
@@ -128,6 +154,8 @@ class Config:


class SessionResult(BaseModel):
model_config = ConfigDict(coerce_numbers_to_str=True, protected_namespaces=())

session_id: Optional[str] = Field(None, title="Session ID")
session_time: Optional[str] = Field(None, title="Session Time")
prompt_variation_id: Optional[str] = Field(None, title="Prompt Variation ID")
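The Metric and MetricsDf additions above follow the same pattern as the other sheet schemas: a pydantic row model wrapped in a pandera DataFrameModel via the PydanticModel engine, so every row of the new Metrics worksheet is coerced into a Metric. Below is a small self-contained sketch of how that validation behaves; the sample row is made up for illustration and is not taken from the actual Metrics sheet.

from typing import Optional

import pandas as pd
import pandera as pa
from pandera.engines.pandas_engine import PydanticModel
from pydantic import BaseModel, Field


class Metric(BaseModel):
    name: Optional[str] = Field(None, title="Name")
    description: Optional[str] = Field(None, title="Description")
    prompt: Optional[str] = Field(None, title="Prompt")
    choices: Optional[str] = Field(None, title="Choices")
    choice_scores: Optional[str] = Field(None, title="Choice Scores")


class MetricsDf(pa.DataFrameModel):
    class Config:
        # Each row must validate as a Metric instance.
        dtype = PydanticModel(Metric)
        coerce = True


# Illustrative data only.
df = pd.DataFrame(
    [
        {
            "name": "correctness",
            "description": "Is the answer factually right?",
            "prompt": "Grade the answer from A to C.",
            "choices": "A, B, C",
            "choice_scores": "1, 0.5, 0",
        }
    ]
)

print(MetricsDf.validate(df))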
24 changes: 19 additions & 5 deletions automation-api/lib/ai_eval_spreadsheet/wrapper.py
@@ -10,6 +10,8 @@
GenAiModelConfig,
GenAiModelConfigsDf,
GenAiModelsDf,
Metric,
MetricsDf,
PromptVariation,
PromptVariationsDf,
Question,
@@ -36,6 +38,7 @@ class AiEvalData:
gen_ai_model_configs: Optional[
GsheetsWorksheetEditor[GenAiModelConfigsDf, GenAiModelConfig]
] = None
metrics: Optional[GsheetsWorksheetEditor[MetricsDf, Metric]] = None
evaluation_results: Optional[
GsheetsWorksheetEditor[EvalResult, EvalResultsDf]
] = None
@@ -50,6 +53,7 @@ class AiEvalData:
"prompt_variations": "Prompt variations",
"gen_ai_models": "Models",
"gen_ai_model_configs": "Model configurations",
"metrics": "Metrics",
"evaluation_results": "Latest Results",
"session_results": "Sessions",
}
@@ -71,7 +75,7 @@ def read_ai_eval_data(
row_schema=Question,
worksheet_name=sheet_names["questions"],
header_row_number=0,
evaluate_formulas=True,
evaluate_formulas=False,
)

question_options = GsheetsWorksheetEditor(
@@ -80,7 +84,7 @@
row_schema=QuestionOption,
worksheet_name=sheet_names["question_options"],
header_row_number=0,
evaluate_formulas=True,
evaluate_formulas=False,
)

prompt_variations = GsheetsWorksheetEditor(
@@ -89,7 +93,7 @@
row_schema=PromptVariation,
worksheet_name=sheet_names["prompt_variations"],
header_row_number=0,
evaluate_formulas=True,
evaluate_formulas=False,
)

gen_ai_models = GsheetsWorksheetEditor(
@@ -98,7 +102,7 @@
row_schema=GenAiModel,
worksheet_name=sheet_names["gen_ai_models"],
header_row_number=0,
evaluate_formulas=True,
evaluate_formulas=False,
)

gen_ai_model_configs = GsheetsWorksheetEditor(
@@ -107,7 +111,16 @@
row_schema=GenAiModelConfig,
worksheet_name=sheet_names["gen_ai_model_configs"],
header_row_number=0,
evaluate_formulas=True,
evaluate_formulas=False,
)

metrics = GsheetsWorksheetEditor(
sh=ai_eval_spreadsheet,
df_schema=MetricsDf,
row_schema=Metric,
worksheet_name=sheet_names["metrics"],
header_row_number=0,
evaluate_formulas=False,
)

evaluation_results = GsheetsWorksheetEditor(
@@ -134,6 +147,7 @@ def read_ai_eval_data(
prompt_variations=prompt_variations,
gen_ai_models=gen_ai_models,
gen_ai_model_configs=gen_ai_model_configs,
metrics=metrics,
evaluation_results=evaluation_results,
session_results=session_results,
)
3 changes: 2 additions & 1 deletion automation-api/lib/config.py
@@ -24,11 +24,12 @@ def read_config() -> dict[str, str]:
"AI_EVAL_SPREADSHEET_ID",
"AI_EVAL_DEV_SPREADSHEET_ID",
"HUGGINGFACEHUB_API_TOKEN",
"GOOGLE_API_KEY",
"PALM_API_KEY",
"IFLYTEK_APPID",
"IFLYTEK_API_KEY",
"IFLYTEK_API_SECRET",
"DASHSCOPE_API_KEY",
"REPLICATE_API_KEY",
]:
config[key] = os.getenv(key=key, default="")
return config
2 changes: 1 addition & 1 deletion automation-api/lib/gsheets/gsheets_utils.py
@@ -3,7 +3,7 @@
import pandas as pd
from gspread import Spreadsheet, Worksheet, WorksheetNotFound
from gspread_dataframe import set_with_dataframe
from pydantic.main import BaseModel
from pydantic import BaseModel

from lib.app_singleton import app_logger

4 changes: 3 additions & 1 deletion automation-api/lib/gsheets/gsheets_worksheet_data.py
@@ -4,7 +4,7 @@
import numpy as np
import pandera as pa
from pandera import DataFrameModel
from pydantic.main import BaseModel
from pydantic import BaseModel

from lib.gsheets.gsheets_utils import get_pydantic_model_field_titles

@@ -36,11 +36,13 @@ def __init__(
self.df_schema = df_schema
self.row_schema = row_schema
self.header_row_number = header_row_number

self.attributes_to_columns_map = get_pydantic_model_field_titles(
self.row_schema
)
df = df.rename(columns=inv_dict(self.attributes_to_columns_map))
df = self.replace_current_row_numbers_in_formulas(df)
# import ipdb; ipdb.set_trace()
self.df = df_schema(df)

def replace_current_row_numbers_in_formulas(
2 changes: 1 addition & 1 deletion automation-api/lib/gsheets/gsheets_worksheet_editor.py
@@ -5,7 +5,7 @@
from gspread import Spreadsheet, Worksheet
from gspread.utils import rowcol_to_a1
from pandera import DataFrameModel
from pydantic.main import BaseModel
from pydantic import BaseModel

from lib.gsheets.gsheets_utils import get_worksheet
from lib.gsheets.gsheets_worksheet_data import GsheetsWorksheetData
4 changes: 2 additions & 2 deletions automation-api/lib/llms/alibaba.py
@@ -6,7 +6,7 @@
from dashscope import Generation
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from pydantic import root_validator
from langchain.pydantic_v1 import root_validator
from tenacity import (
retry,
retry_if_exception_type,
@@ -71,7 +71,7 @@ def validate_environment(cls, values: Dict) -> Dict: # noqa: N805
dashscope.api_key = dashscope_api_key

if values["top_p"] is not None and not 0.0 <= values["top_p"] <= 1.0:
raise ValueError("max_output_tokens must be between 0 and 1")
raise ValueError("top_p must be between 0 and 1")

if values["top_k"] is not None and not 1 <= values["top_k"] <= 100:
raise ValueError("top_k must be between 1 and 100")
2 changes: 1 addition & 1 deletion automation-api/lib/llms/spark.py
@@ -5,7 +5,7 @@

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from pydantic import root_validator
from langchain.pydantic_v1 import root_validator
from tenacity import (
retry,
retry_if_exception_type,
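The alibaba.py and spark.py changes above import root_validator from langchain.pydantic_v1 rather than from pydantic, because LangChain's LLM base class is still a pydantic v1 model even though the rest of the project has moved to v2. Below is a stripped-down sketch of a custom LLM written against that shim; the EchoLLM class and its validation logic are illustrative only, not the repository's actual vendor wrappers.

from typing import Any, Dict, List, Optional

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from langchain.pydantic_v1 import root_validator  # v1 shim: LLM subclasses are pydantic v1 models


class EchoLLM(LLM):
    """Toy LLM that only demonstrates the pydantic v1 validator pattern."""

    top_p: Optional[float] = 0.8

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:  # noqa: N805
        top_p = values.get("top_p")
        if top_p is not None and not 0.0 <= top_p <= 1.0:
            raise ValueError("top_p must be between 0 and 1")
        return values

    @property
    def _llm_type(self) -> str:
        return "echo"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        # A real wrapper would call the vendor SDK here (dashscope, Spark, ...).
        return prompt


if __name__ == "__main__":
    print(EchoLLM(top_p=0.9).invoke("hello"))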
15 changes: 13 additions & 2 deletions automation-api/lib/pilot/helpers.py
@@ -13,6 +13,7 @@
from lib.ai_eval_spreadsheet.schemas import (
GenAiModel,
GenAiModelConfig,
Metric,
PromptVariation,
Question,
QuestionOption,
@@ -219,9 +220,14 @@ def get_prompt_variants(
return res


def get_model_configs(sheet: AiEvalData) -> List[ModelAndConfig]:
def get_model_configs(
sheet: AiEvalData, include_all: bool = False
) -> List[ModelAndConfig]:
models_df = sheet.gen_ai_models.data.df
model_configs_df = filter_included_rows(sheet.gen_ai_model_configs.data.df)
if include_all:
model_configs_df = sheet.gen_ai_model_configs.data.df
else:
model_configs_df = filter_included_rows(sheet.gen_ai_model_configs.data.df)

model_configs = class_objects_from_df(model_configs_df, GenAiModelConfig)
result = []
@@ -232,6 +238,11 @@ def get_model_configs(
return result


def get_metrics(sheet: AiEvalData) -> List[Metric]:
res = class_objects_from_df(sheet.metrics.data.df, Metric)
return res


def get_survey_hash(questions: List[QuestionAndOptions]) -> str:
joined = ",".join([q[0].question_id for q in questions])
return hash_dn(joined, "")
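The new include_all flag on get_model_configs and the get_metrics helper above read directly from the AI eval spreadsheet wrapper. A hedged usage sketch follows, assuming `sheet` is an AiEvalData instance already loaded via read_ai_eval_data (spreadsheet authentication and the exact loading call are omitted here).

from lib.ai_eval_spreadsheet.wrapper import AiEvalData
from lib.pilot.helpers import get_metrics, get_model_configs


def summarize(sheet: AiEvalData) -> None:
    # include_all=True ignores the "include in next evaluation" filter
    # and returns every configured model.
    all_configs = get_model_configs(sheet, include_all=True)
    enabled_configs = get_model_configs(sheet)
    metrics = get_metrics(sheet)

    print(f"{len(enabled_configs)} of {len(all_configs)} model configurations enabled")
    for metric in metrics:
        print(f"metric: {metric.name}")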