Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Support MMLU-CF Benchmark #1775

Open
wants to merge 29 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d1a9db5
[Feature] Support MMLU-CF Benchmark
fistyee Dec 24, 2024
0c48407
[Feature] Support MMLU-CF Benchmark
fistyee Dec 24, 2024
17af07e
[Feature] Support MMLU-CF Benchmark
fistyee Dec 24, 2024
772b9a7
[Feature] Support MMLU-CF Benchmark
fistyee Dec 24, 2024
a32a1ee
[Feature] Support MMLU-CF Benchmark
fistyee Dec 24, 2024
531945d
[Feature] Support MMLU-CF Benchmark
fistyee Dec 24, 2024
3d769a9
[Feature] Support MMLU-CF Benchmark
fistyee Dec 24, 2024
21c8a98
[Feature] Support MMLU-CF Benchmark
fistyee Dec 25, 2024
113b564
[Feature] Support MMLU-CF Benchmark
fistyee Dec 25, 2024
5044516
[Feature] Support MMLU-CF Benchmark
fistyee Dec 26, 2024
6a57af5
[Feature] Support MMLU-CF Benchmark
fistyee Dec 27, 2024
706108d
[Feature] Support MMLU-CF Benchmark
fistyee Dec 27, 2024
2e15038
Merge branch 'open-compass:main' into main
fistyee Dec 27, 2024
5ab6362
[Feature] Support MMLU-CF Benchmark
fistyee Dec 30, 2024
4de7b20
Merge branch 'main' of https://github.com/fistyee/opencompass
fistyee Dec 30, 2024
ddd5583
[Feature] Support MMLU-CF Benchmark
fistyee Dec 30, 2024
956fe45
[Feature] Support MMLU-CF Benchmark
fistyee Dec 30, 2024
a222713
[Feature] Support MMLU-CF Benchmark
fistyee Jan 8, 2025
d5f756e
[Feature] Support MMLU-CF Benchmark
fistyee Jan 8, 2025
2329a5f
[Feature] Support MMLU-CF Benchmark
fistyee Jan 8, 2025
93c4411
[Feature] Support MMLU-CF Benchmark
fistyee Jan 8, 2025
77df499
Update mmlu-cf
liushz Jan 8, 2025
e428a7e
Update mmlu-cf
liushz Jan 8, 2025
ce3ee2d
Update mmlu-cf
liushz Jan 8, 2025
d061100
[Feature] Support MMLU-CF Benchmark
fistyee Jan 8, 2025
c5722b9
[Feature] Support MMLU-CF Benchmark
fistyee Jan 8, 2025
8439245
[Feature] Support MMLU-CF Benchmark
fistyee Jan 9, 2025
89929df
Remove outside configs
liushz Jan 9, 2025
e7149dd
Remove outside configs
liushz Jan 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions configs/eval_mmlu_cf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from mmengine.config import read_base

with read_base():
from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen_040615 import mmlu_cf_datasets

from opencompass.configs.models.qwen2_5.hf_qwen2_5_7b_instruct import models as hf_qwen2_5_7b_instruct_model
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model

from opencompass.configs.summarizers.mmlu_cf import summarizer


# Flatten every `*_datasets` list that is in scope (plus any existing
# `datasets`) into a single list. The comprehension form is deliberate:
# `locals()` is evaluated once, before any loop variable is bound.
datasets = sum(
    [
        found
        for key, found in locals().items()
        if key == 'datasets' or key.endswith('_datasets')
    ],
    [],
)
# Same idea for every name ending in `_model`.
models = sum(
    [found for key, found in locals().items() if key.endswith('_model')],
    [],
)


from opencompass.runners import LocalRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

# Settings for the stage that runs OpenICLInferTask: partitioned with
# NumWorkerPartitioner and executed through LocalRunner.
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=8,
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=8,
        task=dict(type=OpenICLInferTask),
    ),
)

# Settings for the stage that runs OpenICLEvalTask: partitioned with
# NaivePartitioner and executed through LocalRunner.
eval = dict(
    partitioner=dict(
        type=NaivePartitioner,
        n=10,
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=256,
        task=dict(type=OpenICLEvalTask),
    ),
)

# Output directory configured for this run.
work_dir = 'outputs/debug/mmlu_cf'
5 changes: 5 additions & 0 deletions configs/summarizers/groups/mmlu_cf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Subject names used to derive the per-subject dataset abbreviations below.
categories = [
    'Math', 'Physics', 'Chemistry', 'Law', 'Engineering', 'Other',
    'Economics', 'Health', 'Psychology', 'Business', 'Biology',
    'Philosophy', 'Computer_Science', 'History',
]

# A single group named 'mmlu_cf' whose subsets are all `mmlu_cf_<subject>`
# abbreviations (spaces in a subject name become underscores).
mmlu_cf_summary_groups = [
    dict(
        name='mmlu_cf',
        subsets=[
            f"mmlu_cf_{subject.replace(' ', '_')}" for subject in categories
        ],
    ),
]
25 changes: 25 additions & 0 deletions configs/summarizers/mmlu_cf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from mmengine.config import read_base

with read_base():
from .groups.mmlu_cf import mmlu_cf_summary_groups

summarizer = dict(
    # Abbreviations listed for the report: the individual subjects first,
    # the aggregate `mmlu_cf` entry last.
    dataset_abbrs=[
        'mmlu_cf_Biology',
        'mmlu_cf_Business',
        'mmlu_cf_Chemistry',
        'mmlu_cf_Computer_Science',
        'mmlu_cf_Economics',
        'mmlu_cf_Engineering',
        'mmlu_cf_Health',
        'mmlu_cf_History',
        'mmlu_cf_Law',
        'mmlu_cf_Math',
        'mmlu_cf_Philosophy',
        'mmlu_cf_Physics',
        'mmlu_cf_Psychology',
        'mmlu_cf_Other',
        'mmlu_cf',
    ],
    # Concatenate every `*_summary_groups` list that is currently in scope.
    summary_groups=sum(
        [
            groups
            for key, groups in locals().items()
            if key.endswith('_summary_groups')
        ],
        [],
    ),
)
16 changes: 16 additions & 0 deletions opencompass/configs/datasets/mmlu_cf/mmlu_cf_categories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Subject names iterated by the MMLU-CF dataset configs; the order below is
# the order in which the per-subject datasets are generated.
categories = [
    'Math', 'Physics', 'Chemistry', 'Law', 'Engineering',
    'Other', 'Economics', 'Health', 'Psychology', 'Business',
    'Biology', 'Philosophy', 'Computer_Science', 'History',
]
64 changes: 64 additions & 0 deletions opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMLUCFDataset
from opencompass.utils.text_postprocessors import first_option_postprocess

with read_base():
from .mmlu_cf_categories import categories

# Reader settings: the question text and the four options are the input
# columns, `target` is the output column, and `dev` is the train split.
mmlu_cf_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev',
)

# One dataset entry per subject; every entry uses the same prompt layout and
# a FixKRetriever pinned to example ids 0-4.
mmlu_cf_datasets = []
for _subject in categories:
    # Instruction sentence placed in front of every question.
    _hint = 'There is a single choice question (with answers). Answer the question by replying A, B, C or D.'
    # Question layout used by both templates; {input} and {A}..{D} stay as
    # literal placeholders in the resulting string.
    _question = f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '

    mmlu_cf_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt=_question),
                    dict(role='BOT', prompt='{target}\n'),
                ],
            ),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt=_question),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    # Predictions go through first_option_postprocess (options 'ABCD')
    # before AccwithDetailsEvaluator scores them.
    mmlu_cf_eval_cfg = dict(
        evaluator=dict(type=AccwithDetailsEvaluator),
        pred_postprocessor=dict(
            type=first_option_postprocess,
            options='ABCD',
        ),
    )

    mmlu_cf_datasets.append(
        dict(
            abbr=f'mmlu_cf_{_subject}',
            type=MMLUCFDataset,
            path='microsoft/MMLU-CF',
            name=_subject,
            reader_cfg=mmlu_cf_reader_cfg,
            infer_cfg=mmlu_cf_infer_cfg,
            eval_cfg=mmlu_cf_eval_cfg,
        )
    )

# Remove the loop temporaries so they do not remain as module-level names.
del _subject, _hint, _question
4 changes: 4 additions & 0 deletions opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
from .mmlu_cf_gen_040615 import mmlu_cf_datasets # noqa: F401, F403
64 changes: 64 additions & 0 deletions opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMLUCFDataset
from opencompass.utils.text_postprocessors import first_option_postprocess

with read_base():
from .mmlu_cf_categories import categories

# Reader settings: the question text and the four options are the input
# columns, `target` is the output column, and `dev` is the train split.
mmlu_cf_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev',
)

# One dataset entry per subject; every entry uses the same prompt layout and
# a FixKRetriever pinned to example ids 0-4.
mmlu_cf_datasets = []
for _subject in categories:
    # Instruction sentence placed in front of every question.
    _hint = 'There is a single choice question. Answer the question by replying A, B, C or D.'
    # Question layout used by both templates; {input} and {A}..{D} stay as
    # literal placeholders in the resulting string.
    _question = f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '

    mmlu_cf_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt=_question),
                    dict(role='BOT', prompt='{target}\n'),
                ],
            ),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt=_question),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    # Predictions go through first_option_postprocess (options 'ABCD')
    # before AccwithDetailsEvaluator scores them.
    mmlu_cf_eval_cfg = dict(
        evaluator=dict(type=AccwithDetailsEvaluator),
        pred_postprocessor=dict(
            type=first_option_postprocess,
            options='ABCD',
        ),
    )

    mmlu_cf_datasets.append(
        dict(
            abbr=f'mmlu_cf_{_subject}',
            type=MMLUCFDataset,
            path='microsoft/MMLU-CF',
            name=_subject,
            reader_cfg=mmlu_cf_reader_cfg,
            infer_cfg=mmlu_cf_infer_cfg,
            eval_cfg=mmlu_cf_eval_cfg,
        )
    )

# Remove the loop temporaries so they do not remain as module-level names.
del _subject, _hint, _question
64 changes: 64 additions & 0 deletions opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMLUCFDataset
from opencompass.utils.text_postprocessors import first_option_postprocess

with read_base():
from .mmlu_cf_categories import categories

# Reader settings: the question text and the four options are the input
# columns, `target` is the output column, and `dev` is the train split.
mmlu_cf_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev',
)

# One dataset entry per subject; every entry uses the same prompt layout and
# a ZeroRetriever.
mmlu_cf_datasets = []
for _subject in categories:
    # Instruction sentence placed in front of every question.
    _hint = 'There is a single choice question (with answers). Answer the question by replying A, B, C or D.'
    # Question layout used by both templates; {input} and {A}..{D} stay as
    # literal placeholders in the resulting string.
    _question = f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '

    mmlu_cf_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt=_question),
                    dict(role='BOT', prompt='{target}\n'),
                ],
            ),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt=_question),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    # Predictions go through first_option_postprocess (options 'ABCD')
    # before AccwithDetailsEvaluator scores them.
    mmlu_cf_eval_cfg = dict(
        evaluator=dict(type=AccwithDetailsEvaluator),
        pred_postprocessor=dict(
            type=first_option_postprocess,
            options='ABCD',
        ),
    )

    mmlu_cf_datasets.append(
        dict(
            abbr=f'mmlu_cf_{_subject}',
            type=MMLUCFDataset,
            path='microsoft/MMLU-CF',
            name=_subject,
            reader_cfg=mmlu_cf_reader_cfg,
            infer_cfg=mmlu_cf_infer_cfg,
            eval_cfg=mmlu_cf_eval_cfg,
        )
    )

# Remove the loop temporaries so they do not remain as module-level names.
del _subject, _hint, _question
5 changes: 5 additions & 0 deletions opencompass/configs/summarizers/groups/mmlu_cf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Subject names used to derive the per-subject dataset abbreviations below.
categories = [
    'Math', 'Physics', 'Chemistry', 'Law', 'Engineering', 'Other',
    'Economics', 'Health', 'Psychology', 'Business', 'Biology',
    'Philosophy', 'Computer_Science', 'History',
]

# A single group named 'mmlu_cf' whose subsets are all `mmlu_cf_<subject>`
# abbreviations (spaces in a subject name become underscores).
mmlu_cf_summary_groups = [
    dict(
        name='mmlu_cf',
        subsets=[
            f"mmlu_cf_{subject.replace(' ', '_')}" for subject in categories
        ],
    ),
]
25 changes: 25 additions & 0 deletions opencompass/configs/summarizers/mmlu_cf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from mmengine.config import read_base

with read_base():
from .groups.mmlu_cf import mmlu_cf_summary_groups

summarizer = dict(
    # Abbreviations listed for the report: the individual subjects first,
    # the aggregate `mmlu_cf` entry last.
    dataset_abbrs=[
        'mmlu_cf_Biology',
        'mmlu_cf_Business',
        'mmlu_cf_Chemistry',
        'mmlu_cf_Computer_Science',
        'mmlu_cf_Economics',
        'mmlu_cf_Engineering',
        'mmlu_cf_Health',
        'mmlu_cf_History',
        'mmlu_cf_Law',
        'mmlu_cf_Math',
        'mmlu_cf_Philosophy',
        'mmlu_cf_Physics',
        'mmlu_cf_Psychology',
        'mmlu_cf_Other',
        'mmlu_cf',
    ],
    # Concatenate every `*_summary_groups` list that is currently in scope.
    summary_groups=sum(
        [
            groups
            for key, groups in locals().items()
            if key.endswith('_summary_groups')
        ],
        [],
    ),
)
1 change: 1 addition & 0 deletions opencompass/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@
from .medbench import * # noqa: F401, F403
from .mgsm import * # noqa: F401, F403
from .mmlu import * # noqa: F401, F403
from .mmlu_cf import * # noqa: F401, F403
from .mmlu_pro import * # noqa: F401, F403
from .MMLUArabic import * # noqa: F401, F403
from .mmmlu import * # noqa: F401, F403
Expand Down
41 changes: 41 additions & 0 deletions opencompass/datasets/mmlu_cf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from datasets import DatasetDict, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class MMLUCFDataset(BaseDataset):
    """Loader that exposes one MMLU-CF subject as ``dev``/``test`` splits."""

    @staticmethod
    def load(path: str, name: str) -> DatasetDict:
        """Load one subject of the dataset via HuggingFace ``load_dataset``.

        Args:
            path (str): Dataset identifier handed to ``load_dataset``.
            name (str): Subject name. The splits ``{name}_dev`` and
                ``{name}_val`` are read from the loaded dataset.

        Returns:
            DatasetDict: ``dev`` is built from ``{name}_dev`` and ``test``
            from ``{name}_val``. Each row keeps its original columns and
            gains ``input`` (copy of ``Question``) and ``target`` (copy of
            ``Answer``).

        Note:
            A missing ``{name}_dev`` / ``{name}_val`` split is not handled
            here; presumably the split lookup raises ``KeyError`` in that
            case -- confirm against the ``datasets`` version in use.
        """
        # NOTE(review): a collaborator suggested making HuggingFace the
        # default loading method and removing this dataset's information
        # from datasets_info.py.
        hf_dataset = load_dataset(path)

        def _add_prompt_columns(example):
            # ``map`` merges the returned keys into the row, so only the two
            # new columns have to be produced; A-D are already present.
            return {
                'input': example['Question'],
                'target': example['Answer'],
            }

        # Only the two splits of the requested subject are processed. The
        # previous extra ``map`` over the whole DatasetDict returned existing
        # columns unchanged, i.e. it did nothing except iterate every split.
        return DatasetDict({
            'dev': hf_dataset[f'{name}_dev'].map(_add_prompt_columns),
            # The 'val' split is used as 'test'.
            'test': hf_dataset[f'{name}_val'].map(_add_prompt_columns),
        })
Loading