[Benchmark] Add VL-RewardBench (#703)
* update vlrewardbench
* pre-commit fix
* formatter
* [Improvement] Better `AUTO_SPLIT` and model split for InternVL2
* [Minor] Improve CC-OCR Import
* [Model] Support QVQ
* [Model] Update Molmo Eval to Match Official Implementation (#648)
* add molmo prompts
* fix lint format
* [Fix] Refine Qwen-VL2 device assignment
* [Fix] Fix RealWorldQA md5
* update MMMU_DEV_VAL tsv
* [Fix] Fix confusing image width&height (#704)
* Update llama_vision.py (#705)
* [Fix] Fix Lint

---------

Co-authored-by: kennymckormick <[email protected]>
Co-authored-by: jamespark3922 <[email protected]>
Co-authored-by: CMeteor <[email protected]>
Co-authored-by: Yuan Ye <[email protected]>
Co-authored-by: Guowei Xu <[email protected]>
1 parent ae10ca4 · commit 276d90a · 6 changed files with 207 additions and 41 deletions.
@@ -0,0 +1,174 @@
from ast import literal_eval

from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
from ..utils import track_progress_rich


LLM_PARSE_ANSWER_PROMPT = '''
You are given a pairwise judgement for two responses. Please return the better response according to the judgement.
Return the Answer X ONLY. e.g., Answer 1 or Answer 2.
Judgement: {judgement}
'''
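
# Editorial note (not part of the original commit): LLM_PARSE_ANSWER_PROMPT asks the
# judge LLM to distil the VLM's free-form pairwise judgement down to the literal token
# 'Answer 1' or 'Answer 2', which get_score() below then matches case-insensitively.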


PROMPT_TEMPLATE = '''\
You are a highly capable multimodal AI assistant tasked with evaluating answers to visual questions.
Please analyze the following image and question, then determine which of the two provided answers is better.
Question: {query}
Answer 1: {answer_0}
Answer 2: {answer_1}
Please evaluate both answers based on the following criteria:
1. Accuracy: How well does the answer align with the visual information in the image?
2. Completeness: Does the answer fully address all aspects of the question?
3. Clarity: Is the answer easy to understand and well-articulated?
4. Relevance: Does the answer directly relate to the question and the image?
After your evaluation, please:
1. Explain your reasoning for each criterion.
2. Provide an overall judgment on which answer is better (Answer 1 or Answer 2). \
For example: Overall Judgment: Answer X is better.
Your response should be structured and detailed, \
demonstrating your understanding of both the visual and textual elements of the task.'''


def get_score(line, parsed_response, random_number):
    # Map the ground-truth preference onto the (possibly swapped) answer order:
    # human_ranking marks the preferred response with 0, and random_number records
    # whether build_prompt reversed the two responses before showing them.
    gt_ans = line['human_ranking'].index(0 if random_number == 0 else 1) + 1
    if 'Answer 1'.lower() in parsed_response.lower():
        pred = 1
    elif 'Answer 2'.lower() in parsed_response.lower():
        pred = 2
    else:  # the judge response could not be parsed
        pred = 'None'

    if pred == gt_ans:
        return 1.0
    else:
        return 0.0
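
# Worked example (editorial sketch, not part of the original commit): with
# response = ['cat', 'a dog'] the combined length is 8, so random_number == 0 and
# build_prompt keeps the original order. With human_ranking == [0, 1] the preferred
# response is the first one, gt_ans == 1, and a judgement parsed as 'Answer 1'
# scores 1.0 while 'Answer 2' scores 0.0. If the combined length were odd, both
# build_prompt and get_score would flip consistently, so the score is unchanged.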


def VLRewardBench_eval_answer(model, line):
    response = toliststr(line['response'])
    # Re-derive the same length-parity bit that build_prompt used to decide
    # whether the two responses were swapped before being shown to the VLM.
    random_number = sum(len(res) for res in response) % 2

    prompt = LLM_PARSE_ANSWER_PROMPT.format(judgement=line['prediction'])
    messages = [dict(type='text', value=prompt)]

    resp = model.generate(messages)
    score = get_score(line, resp, random_number)

    if score is None:
        return 'Unknown'
    return score


class VLRewardBench(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {
        'VL-RewardBench': 'https://huggingface.co/datasets/MMInstruction/VL-RewardBench/resolve/main/vl_rewardbench.tsv'
    }
    DATASET_MD5 = {'VL-RewardBench': '1d2676f4ab4a5f755019ec0af2b28189'}

    # Given one data record, return the built prompt (a multi-modal message); can be overridden
    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]
        tgt_path = self.dump_image(line)  # save image(s) to local disk
        question = line['question']
        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]

        response = toliststr(line['response'])
        # Deterministically swap the two candidate responses based on the parity of
        # their combined length, so the preferred answer is not always in the same slot.
        random_number = sum(len(res) for res in response) % 2
        if random_number == 1:
            response = response[::-1]
        query_prompt = PROMPT_TEMPLATE.format(
            query=question, answer_0=response[0], answer_1=response[1]
        )
        msgs = msgs + [dict(type='text', value=query_prompt)]
        return msgs
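
    # For reference (editorial sketch, not part of the original commit): a built
    # prompt is a list of message dicts such as
    #   [dict(type='image', value='/path/to/img.jpg'),
    #    dict(type='text', value=PROMPT_TEMPLATE.format(...))],
    # i.e. the interleaved image/text format the VLM wrappers consume directly.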

    # Returns a DataFrame with per-category accuracy, macro accuracy, and overall consistency
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        suffix = eval_file.split('.')[-1]
        model = judge_kwargs['model']
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            raw_data = VLRewardBench('VL-RewardBench').data
            data = load(eval_file)
            data['prediction'] = [str(x) for x in data['prediction']]
            data['human_ranking'] = [literal_eval(x) for x in raw_data['answer']]

            judge_kwargs['temperature'] = 0
            judge_kwargs['timeout'] = 60
            model = build_judge(max_tokens=128, **judge_kwargs)

            assert model.working(), (
                'VLRewardBench evaluation requires a working OPENAI API\n'
                + DEBUG_MESSAGE
            )

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            # Resume from the temporary pickle if a previous run was interrupted
            ans = load(tmp_file) if osp.exists(tmp_file) else {}
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    VLRewardBench_eval_answer,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                for k, v in zip(indices, new_results):
                    ans[k] = v

            data['score'] = [ans[idx] for idx in data['index']]
            dump(data, storage)

        data = load(storage)
        lt = len(data)

        category_scores = defaultdict(lambda: 0)
        category_cnt = defaultdict(lambda: 0)
        scores = defaultdict(lambda: 0)
        for i in range(lt):
            item = data.iloc[i]
            category_scores[item['category']] += item['score']
            category_cnt[item['category']] += 1
        # average score for each category
        for k, v in category_scores.items():
            scores[k] = v / category_cnt[k]
        # macro accuracy: unweighted mean of the per-category accuracies
        scores['Macro Accuracy'] = sum(scores.values()) / len(scores)
        # overall consistency: micro average over all samples
        scores['Overall Consistency'] = sum(category_scores.values()) / lt

        scores = {k: [v] for k, v in scores.items()}
        scores = pd.DataFrame(scores)
        dump(scores, score_file)
        return scores
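
A minimal sketch of driving the new dataset programmatically (editorial addition, not part of the commit; the import path, prediction file name, and judge model name below are assumptions, and inference itself is run separately by the VLMEvalKit runner):

# Hypothetical usage sketch: score an existing prediction file with an LLM judge.
from vlmeval.dataset import VLRewardBench  # import path assumed

scores = VLRewardBench.evaluate(
    'VL-RewardBench_MyVLM.xlsx',   # hypothetical prediction file with a 'prediction' column
    model='gpt-4o-mini',           # hypothetical judge model name, passed through judge_kwargs
    nproc=4,
)
print(scores)  # one-row DataFrame: per-category accuracy, Macro Accuracy, Overall Consistency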