# 1_solve_with_baseline.py
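"""Run the baseline agent on every exam with every model.

For each (model, exam) pair, the script answers each problem in the exam,
writes the full dialog to ../data/dialogs, records a per-problem details row,
builds an aggregate Result using the model's pricing, appends it to
../data/results/results.csv, and logs each episode under ../data/logs.
"""
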
# Import packages
import os
from datetime import datetime
import pandas as pd
from models.model_factory import ModelFactory
from agents.agent_factory import AgentFactory
from problems.exam_reader import ExamReader
from experiments.experiment import Experiment
from experiments.result import Result
from experiments.experiments_file import ExperimentsFile
from details.details_writer import DetailsWriter
from details.details_row import DetailsRow
from dialogs.dialog_writer import DialogWriter
from models.pricing import get_pricing
from logs.log import Log
from logs.log_level import LogLevel

# Set the models
model_names = [
    "gpt-35-turbo",
    "gpt-4",
    "llama-2-7b-chat",
    "llama-2-70b-chat",
    "mistral-large",
    "cohere-command-r-plus",
    "gemini-1.0-pro",
    "gemini-1.5-pro-preview-0409",
    "claude-3-opus-20240229"
]

# Set the agent
agent_name = "baseline"

# Set the exams
exam_names = [
    "comprehensive-100",
    "aqua-rat-100",
    "logiqa-en-100",
    "lsat-ar-100",
    "lsat-lr-100",
    "lsat-rc-100",
    "sat-en-100",
    "sat-math-100",
    "arc-challenge-100",
    "hella-swag-100",
    "med-mcqa-100"
]

# Set the attempt id
attempt_id = 1

# Set the logging level
log_level = LogLevel.DEBUG

# Create the components
model_factory = ModelFactory()
agent_factory = AgentFactory()
exam_reader = ExamReader()
dialog_writer = DialogWriter()
for model_name in model_names:
# Loop through each exam
for exam_name in exam_names:
# Set the experiment parameters
start_time = pd.Timestamp.now()
experiment = Experiment(model_name, agent_name, exam_name, attempt_id)
experiment.start(start_time)
# Set file and folder paths
exam_path = f"../data/exams/{exam_name}.jsonl"
dialogs_folder_path = f"../data/dialogs/{experiment.name}"
details_file_path = f"../data/details/{experiment.name}.csv"
results_file_path = f"../data/results/results.csv"
log_name_prefix = start_time.strftime("%Y-%m-%d %H-%M-%S")
log_folder_path = f"../data/logs/{log_name_prefix} - {experiment.name}"
# Create the folders
os.makedirs(dialogs_folder_path, exist_ok=True)
os.makedirs(os.path.dirname(details_file_path), exist_ok=True)
os.makedirs(os.path.dirname(results_file_path), exist_ok=True)
os.makedirs(log_folder_path, exist_ok=True)
# Create the details table
details = []
# Load the exam
exam = exam_reader.read(exam_path)
# Loop through each exam problem
for i, problem in enumerate(exam.problems):
problem_id = i + 1
# # DEBUG: Answer only the first n problems
# if i >= 10:
# break
# Create the log file
log_file_path = f"{log_folder_path}/Problem {problem_id}.txt"
log = Log(log_level)
log.open(log_file_path)
log.head(f"Model: {experiment.model_name} | Agent: {experiment.agent_name} | Exam: {experiment.exam_name} | Problem {i + 1} of {len(exam.problems)}")
# Create the details row
details_row = DetailsRow()
episode_start_time = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
details_row.create(problem_id, episode_start_time, experiment, problem)
# Create the agent
model = model_factory.create(experiment.model_name)
agent = agent_factory.create(experiment.agent_name, model, problem.topic)
# Create the dialog
log.subhead("System:")
agent.create_dialog()
log.info(agent.dialog.get_all()[0].content)
log.subhead("User 1:")
log.info(agent.dialog.get_all()[1].content)
log.subhead("Assistant 1:")
log.info(agent.dialog.get_all()[2].content)
# Set the problem
log.subhead("User 2:")
agent.set_problem(problem)
log.info(agent.dialog.get_all()[3].content)
# Get the agent's answer
log.subhead("Assistant 2:")
answer_response = agent.get_answer()
answer = agent.get_answer_choice(answer_response.text)
log.info(agent.dialog.get_all()[4].content)
# Log the agent's answer
log.subhead("Result:")
is_correct = answer == problem.answer
score = 1 if is_correct else 0
details_row.update_answer(answer_response, answer, score)
log.info(f"Agent Answer: {answer}")
log.info(f"Correct Answer: {problem.answer}")
log.info(f"Score: {score}")
# Save the dialog
dialog_file_path = f"{dialogs_folder_path}/Problem {problem_id}.json"
dialog_writer.write(dialog_file_path, agent.dialog)
details.append(details_row)
# End the experiment
experiment.end(datetime.now())
details_table = pd.DataFrame(details)
pricing = get_pricing(experiment.model_name)
results = Result(experiment, details_table, pricing)
# Record the experiment details
details_writer = DetailsWriter()
details_writer.write(details, details_file_path)
# Record the experiment results
experiments = ExperimentsFile()
experiments.load(results_file_path)
experiments.add_row(experiment, results)
experiments.save(results_file_path)

        # Log the results
        log_file_path = f"{log_folder_path}/Results.txt"
        log = Log(LogLevel.INFO)
        log.open(log_file_path)
        log.head("Results")
        log.object(results)
        log.close()