This is a modified version of the https://github.com/chigkim/Ollama-MMLU-Pro benchmark that runs via the OpenAI Chat Completion API. It works great with VLLM or the Aphrodite engine for very fast batched inference.
The testing and scoring method is exactly the same as the original script from TIGER-LAB; only a few features were added to simplify running the test and displaying the results. To see the exact changes, compare the mmlu-pro branch against main with git diff:
git diff mmlu-pro..main -- run_openai.py
This fork adds support for Indonesian MMLU-Pro testing (including the dataset) along with some small fixes:
- Prevent errors when there are zero random guesses.
- Ignore words inside the brackets of an answer and take only the letter as the answer.
- Accept square brackets [ ] in the answer format as well as parentheses ( ) (see the extraction sketch after this list).
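As a rough sketch of what the extraction fixes above do (illustrative only, not the exact code in run_openai.py), the answer letter is pulled out of either bracket style and any extra words inside the brackets are ignored:

import re

def extract_answer_letter(response):
    # Illustrative only: match "The answer is (X ...)" or "The answer is [X ...]"
    # and keep just the leading option letter, dropping any trailing words.
    match = re.search(r"answer is\s*[\(\[]\s*([A-J])[^\)\]]*[\)\]]", response, re.IGNORECASE)
    return match.group(1).upper() if match else None

print(extract_answer_letter("The answer is (B) because ..."))        # -> B
print(extract_answer_letter("The answer is [c, the third option]"))  # -> C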
Change the config.toml according to your setup.
pip install -r requirements.txt
python run_openai.py # Or run_openai_id.py for Indonesian.
You can also override settings from the configuration file with command-line flags such as --model, --category, etc. For example, if you specify --model phi3, all the settings from the configuration file will be loaded except the model. See python run_openai.py -h for more info.
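For instance, to test a different model on a single category (flag names here are assumed to mirror the configuration keys; run python run_openai.py -h to confirm the exact spelling):

python run_openai.py --model phi3 --category math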
- If an answer cannot be extracted from the model's response, the script randomly assigns an answer, the same way the original script does.
- The total score is the number of correct answers out of the total number of attempts, including random-guess attempts. This is the score from the original script.
- "Random Guess Attempts" indicates the total number of random guesses out of the total attempts.
- "Correct Random Guesses" shows how many of those random guesses happened to be correct.
- "Adjusted Score Without Random Guesses" removes all random guesses from both the correct answers and the total answers (see the sketch after this list).
- The last overall score in the table is calculated as: the total number of correct answers across all categories / the total number of attempts across all categories * 100.
- All percentage scores are rounded.
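To make the arithmetic above concrete, here is a minimal sketch (the numbers are illustrative, and "removing random guesses" is taken to mean dropping the guessed attempts from both the numerator and the denominator):

correct = 412          # answers that matched the reference
total = 600            # all attempts, including random guesses
random_guesses = 30    # attempts where the answer had to be guessed
correct_random = 3     # random guesses that happened to be correct

total_score = correct / total * 100                                            # 68.67
adjusted_score = (correct - correct_random) / (total - random_guesses) * 100   # 71.75

print(f"Total score: {total_score:.2f}%")
print(f"Adjusted score without random guesses: {adjusted_score:.2f}%")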
[server]
url = "http://localhost:8000/v1"
api_key = "api key"
model = "llama3"
model_note = "LORA"
timeout = 600.0
[inference]
# Settings below are from evaluate_from_local.py for VLLM on TIGER-AI-Lab/MMLU-Pro
temperature = 0.0
top_p = 1.0 # not specified but default for VLLM
max_tokens = 1024
stop = "<|eot_id|>" # Change to stop token of model being tested
# The placeholder {subject} will be replaced with the appropriate value at runtime.
system_prompt = "You are an expert that knows everything. You are tasked with answering a multiple-choice question. The following is a multiple choice question (with answers) about {subject}. Give your final answer in the format of `The answer is (chosen answer)`."
# Indonesian system prompt
# system_prompt = "Anda adalah seseorang yang pintar dan mengetahui segalanya. Anda diberi perintah untuk menjawab pertanyaan pilihan ganda. Berikut adalah pertanyaan pilihan ganda tentang {subject}. Jawab dengan format: 'Jawabannya adalah (pilihan jawaban)'."
# Japanese system prompt
# system_prompt = "あなたはすべてを知っている専門家です。多肢選択式の質問に回答することが求められます。次は、{subject} に関する多肢選択式の質問です。最終的な回答を「答えは (選択した回答) です」という形式で入力してください。"
# Korean system prompt
# system_prompt = "당신은 모든 것을 아는 전문가입니다. 당신은 객관식 질문에 답하는 임무를 맡고 있습니다. 다음은 {subject}에 관한 객관식 질문(답변 포함)입니다. 최종 답변은 '대답은 (선택된 답변)' 형식으로 작성해 주세요."
# Chinese system prompt
# system_prompt = "你是一位无所不知的专家。你的任务是回答一道多项选择题。以下是一道关于{subject}的多项选择题(附答案)。请以“答案是(选定答案)”的格式给出你的最终答案。"
# "single_chat" inserts all the COT examples and question into a single message. Default style for GPT-4O script, but raises a lot of format issues especially for small models.
# "multi_chat" inserts COT examples into multi-turn messages. Use for instruct/chat models.
# "no_chat" uses v1/completion api. Use for non-instruct/chat model.
style = "multi_chat"
[test]
categories = ['biology', 'business', 'chemistry', 'computer science', 'economics', 'engineering', 'health', 'history', 'law', 'math', 'philosophy', 'physics', 'psychology', 'other']
parallel = 16
# Use chain-of-thought prompting or not.
cot = true
# Supported languages = en, id, ja, ko, zh
language = "en"
[log]
# Verbosity between 0-2
verbosity = 0
# If true, logs exact prompt sent to the model in the test result files.
log_prompt = true
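As a rough illustration of how the three prompt styles differ (a simplified sketch, not the exact payloads run_openai.py builds):

def build_prompt(style, system_prompt, cot_examples, question):
    # Illustrative only: cot_examples is a list of (example_question, example_answer) pairs.
    if style == "single_chat":
        # Everything packed into one user turn; small models often break the expected format here.
        body = "\n\n".join(q + "\n" + a for q, a in cot_examples) + "\n\n" + question
        return [{"role": "system", "content": system_prompt},
                {"role": "user", "content": body}]
    if style == "multi_chat":
        # Each CoT example becomes its own user/assistant exchange.
        messages = [{"role": "system", "content": system_prompt}]
        for q, a in cot_examples:
            messages.append({"role": "user", "content": q})
            messages.append({"role": "assistant", "content": a})
        messages.append({"role": "user", "content": question})
        return messages
    # "no_chat": one plain-text prompt for the v1/completions endpoint.
    return system_prompt + "\n\n" + "\n\n".join(q + "\n" + a for q, a in cot_examples) + "\n\n" + question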