From dbda46ab65aa97ed152d472a1f382ba17365882d Mon Sep 17 00:00:00 2001
From: Myhs_phz
Date: Thu, 19 Dec 2024 18:09:42 +0800
Subject: [PATCH] [benchmark] add new benchmark: CMMMU (#666)

* new branch: cmmmu

* feat

* feat

* feat

* feat

* feat build_prompt

* feat evaluate

* fix

* update CMMMU_TEST md5

* fix

* fix build prompt

* fix build prompt
---
 vlmeval/dataset/__init__.py |   4 +-
 vlmeval/dataset/cmmmu.py    | 354 ++++++++++++++++++++++++++++++++++++
 2 files changed, 357 insertions(+), 1 deletion(-)
 create mode 100644 vlmeval/dataset/cmmmu.py

diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py
index 66a3908f6..665943ebf 100644
--- a/vlmeval/dataset/__init__.py
+++ b/vlmeval/dataset/__init__.py
@@ -30,6 +30,7 @@
 from .mmgenbench import MMGenBench
 from .miabench import MIABench
+from .cmmmu import CMMMU
 from .wildvision import WildVision
 from .mmmath import MMMath
 from .dynamath import Dynamath
@@ -128,7 +129,8 @@ def evaluate(self, eval_file, **judge_kwargs):
     MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
     MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset, GMAIMMBenchDataset,
     MMERealWorld, HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset,
-    MIABench, OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH
+    MIABench, OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH,
+    CMMMU
 ]
 
 VIDEO_DATASET = [
diff --git a/vlmeval/dataset/cmmmu.py b/vlmeval/dataset/cmmmu.py
new file mode 100644
index 000000000..12c583f29
--- /dev/null
+++ b/vlmeval/dataset/cmmmu.py
@@ -0,0 +1,354 @@
+from .image_base import ImageBaseDataset
+import random
+from collections import Counter
+import os
+import re
+import tempfile
+from ..smp import *
+
+
+def get_multi_choice_prediction(response, all_choices, index2ans):
+    for char in [',', '.', '!', '?', ';', ':', "'"]:
+        response = response.strip(char)
+    response = " " + response + " "  # add space to avoid partial match
+
+    candidates = []
+
+    for choice in all_choices:  # (A) (B) (C) (D)
+        # Add the choice to candidates each time it appears in the response
+        candidates.extend([choice for _ in range(response.count(f'({choice})'))])
+
+    if len(candidates) == 0:
+        for choice in all_choices:  # A B C D
+            # Similarly, add the choice for each occurrence
+            candidates.extend([choice for _ in range(response.count(f'{choice}'))])
+
+    if len(candidates) == 0 and len(response.split()) >= 1:
+        for index, ans in index2ans.items():
+            # Add index for each occurrence of ans in response
+            candidates.extend([index for _ in range(response.count(ans))])
+
+    # if the above still finds nothing, check whether an option's content appears anywhere in the response
+    if len(candidates) == 0 and len(response.split()) >= 1:
+        for index, ans in index2ans.items():
+            if ans in response:
+                candidates.append(index)
+                # index_ans = False  # it's content ans.
+
+    if len(candidates) == 0:  # still no answer, randomly choose one.
+        return random.choice(all_choices)
+        # return ''
+    else:
+        # Count the occurrence of each candidate
+        candidate_counts = Counter(candidates)
+
+        # Select the most frequent candidates
+        max_count = max(candidate_counts.values())
+        most_frequent_candidates = [c for c in all_choices if candidate_counts.get(c, 0) == max_count]
+
+        # Combine the most frequent candidates in ABCD order
+        return ''.join(most_frequent_candidates)
+
+
+def extract_numbers(string):
+    # Pattern for numbers with commas as thousands separators
+    pattern_commas = r'-?\d{1,3}(?:,\d{3})+'
+    # Pattern for scientific notation
+    pattern_scientific = r'-?\d+(?:\.\d+)?[eE][+-]?\d+'
+    # Pattern for simple numbers without commas
+    pattern_simple = r'-?(?:\d+\.\d+|\.\d+|\d+)(?![eE][+-]?\d+)(?!,\d)'
+
+    # Extract numbers with commas
+    numbers_with_commas = re.findall(pattern_commas, string)
+    # Extract numbers in scientific notation
+    numbers_scientific = re.findall(pattern_scientific, string)
+    # Extract simple numbers without commas
+    numbers_simple = re.findall(pattern_simple, string)
+
+    # Combine all extracted numbers
+    all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
+    return all_numbers
+
+
+def check_is_number(string):
+    try:
+        float(string.replace(',', ''))
+        return True
+    except ValueError:
+        # not a valid number even after removing commas
+        return False
+
+
+def count_letters(string):
+    return sum((c.isalpha() and 'a' <= c <= 'z') or ('A' <= c <= 'Z') for c in string)
+
+
+def normalize_str(string, answer):
+    # normalize a predicted string: numbers become rounded floats, overly long spurious strings are dropped
+
+    # if number, numerize it.
+    if string is None:
+        return [string]
+    string = string.strip()
+
+    is_number = check_is_number(string)
+
+    if is_number:
+        string = string.replace(',', '')
+        string = float(string)
+        # leave 2 decimal
+        string = round(string, 2)
+        return [string]
+    else:  # it's likely to be a string
+        if len(string) > len(answer) + 20 or count_letters(string) > count_letters(answer) + 2:
+            return []
+        return [string]
+
+
+def get_fill_blank_prediction(response, answer):
+    """get the prediction from the generated response,
+    return a list of predicted strings or numbers"""
+
+    def get_key_subresponses(response):
+        response = response.strip("。").strip()
+        sub_responses = re.split(r'。|\n', response)
+        indicators_of_keys = ['是', '为', '所以', '等于', '方案', '选择',
+                              '正确答案', '因此', '最后', '答案', '结果']
+        key_responses = []
+        for index, resp in enumerate(sub_responses):
+            # if it is the last sub-response, also accept an equation (the entire response may be one equation)
+            if index == len(sub_responses) - 1:
+                indicators_of_keys.extend(['='])
+            shortest_key_response = None
+            # the shortest response that may contain the answer (tail part of the response)
+            for indicator in indicators_of_keys:
+                if indicator in resp:
+                    if not shortest_key_response:
+                        shortest_key_response = resp.split(indicator)[-1].strip()
+                    else:
+                        if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
+                            shortest_key_response = resp.split(indicator)[-1].strip()
+
+            if shortest_key_response:
+                # and it's not trivial
+                if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
+                    key_responses.append(shortest_key_response)
+        if len(key_responses) == 0:  # did not find any
+            return [response]
+        return key_responses
+
+    key_responses = get_key_subresponses(response)
+
+    pred_list = key_responses.copy()  # keep the original string response
+    for resp in key_responses:
+        pred_list.extend(extract_numbers(resp))
+
+    tmp_pred_list = []
+    for i in range(len(pred_list)):
+        tmp_pred_list.extend(normalize_str(pred_list[i], answer))
+    pred_list = tmp_pred_list
+
+    # remove duplicates
+    pred_list = list(set(pred_list))
+
+    return pred_list
+
+
+def get_TF_prediction(response):
+    """get the prediction from the generated response,
+    return a list of predicted strings or numbers"""
+
+    def get_key_subresponses(response):
+        response = response.strip("。").strip()
+        sub_responses = re.split(r'。|\n', response)
+        indicators_of_keys = ['是', '为', '所以', '判断',
+                              '陈述', '说法', '表达', '答案', '结果']
+        key_responses = []
+        for index, resp in enumerate(sub_responses):
+            shortest_key_response = None
+            # the shortest response that may contain the answer (tail part of the response)
+            for indicator in indicators_of_keys:
+                if indicator in resp:
+                    if not shortest_key_response:
+                        shortest_key_response = resp.split(indicator)[-1].strip()
+                    else:
+                        if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
+                            shortest_key_response = resp.split(indicator)[-1].strip()
+
+            if shortest_key_response:
+                # and it's not trivial
+                if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
+                    key_responses.append(shortest_key_response)
+        if len(key_responses) == 0:  # did not find any
+            return [response]
+        return key_responses
+
+    key_responses = get_key_subresponses(response)
+
+    pred_list = key_responses.copy()  # keep the original string response
+    # remove duplicates
+    pred_list = list(set(pred_list))
+
+    return pred_list
+
+
+class CMMMU(ImageBaseDataset):
+    TYPE = 'VQA'
+
+    DATASET_URL = {
+        'CMMMU_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/CMMMU_VAL.tsv'
+    }
+
+    DATASET_MD5 = {
+        'CMMMU_VAL': 'b4727e2fce2415bf646379e60c11a726'
+    }
+
+    def dump_image(self, line):
+        os.makedirs(self.img_root, exist_ok=True)
+
+        tgt_path_z = []
+        if isinstance(line['image'], list):
+            for i in range(len(line['image'])):
+                tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg")
+                if not read_ok(tgt_path):
+                    decode_base64_to_image_file(line['image'][i], tgt_path)
+                tgt_path_z.append(tgt_path)
+        else:
+            tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
+            if not read_ok(tgt_path):
+                decode_base64_to_image_file(line['image'], tgt_path)
+            tgt_path_z.append(tgt_path)
+        return tgt_path_z
+
+    @classmethod
+    def evaluate(cls, eval_file, **judge_kwargs):
+
+        suffix = eval_file.split('.')[-1]
+        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+
+        if not osp.exists(result_file):
+            data = load(eval_file)
+            assert 'answer' in data and 'prediction' in data
+            data['prediction'] = [str(x) for x in data['prediction']]
+            data['answer'] = [str(x) for x in data['answer']]
+
+            correct_count = 0
+            correct_category = {
+                '技术与工程': [0, 0],
+                '科学': [0, 0],
+                '健康与医学': [0, 0],
+                '商业': [0, 0],
+                '艺术与设计': [0, 0],
+                '人文社会科学': [0, 0],
+            }
+
+            for i in tqdm(data.iterrows()):
+                line = i[1]
+                correct_category[line['category']][0] += 1
+
+                # Options
+                if line['type'] == '选择':
+                    index2ans = {
+                        'A': line['option1'],
+                        'B': line['option2'],
+                        'C': line['option3'],
+                        'D': line['option4']
+                    }
+                    fact_option = get_multi_choice_prediction(line['prediction'], ['A', 'B', 'C', 'D'], index2ans)
+                    if fact_option == line['answer']:
+                        correct_count += 1
+                        correct_category[line['category']][1] += 1
+
+                # Binary
+                elif line['type'] == '判断':
+                    positive_keywords = ['正确', '对', '准确', '肯定', '对的']
+                    negative_keywords = ['不对', '错误', '不正确', '不准确', '不合适', '否定', '错的', '错']
+                    ambiguous_keywords = ['对错', '是否正确', '否正确', '或者', '是否', '正确性', '对不']
+
+                    def judge_similarity(pred_list, positive_keywords,
+                                         negative_keywords):
+                        positive_count = 0
+                        negative_count = 0
+
+                        for pred in pred_list:
+                            if any(pos_word in pred for pos_word in positive_keywords):
+                                positive_count += 1
+                            elif any(neg_word in pred for neg_word in negative_keywords):
+                                negative_count += 1
+
+                        if positive_count > negative_count:
+                            return "对"
+                        elif negative_count > positive_count:
+                            return "错"
+                        else:
+                            return random.choice(['对', '错'])
+
+                    answer = get_TF_prediction(line['prediction'])
+                    answer = [word for word in answer if not any(ambiguous in word for ambiguous in ambiguous_keywords)]
+                    fact_answer = judge_similarity(answer, positive_keywords, negative_keywords)
+                    if fact_answer == line['answer']:
+                        correct_count += 1
+                        correct_category[line['category']][1] += 1
+
+                # Fill in the blank
+                else:
+                    norm_answers = normalize_str(line['answer'], line['answer'])
+                    predicted_answer = get_fill_blank_prediction(line['prediction'], line['answer'])
+
+                    matched = False
+                    for pred in predicted_answer:
+                        # pred is already normalized
+                        if isinstance(pred, str):  # if it's a string, check whether the answer appears in it
+                            for norm_ans in norm_answers:
+                                # only check whether the string answer appears in the string pred
+                                if isinstance(norm_ans, str) and norm_ans in pred:
+                                    matched = True
+                        else:  # it's a number
+                            if pred in norm_answers:
+                                matched = True
+                    # count each question at most once, even if several extracted predictions match
+                    if matched:
+                        correct_count += 1
+                        correct_category[line['category']][1] += 1
+
+            accuracyz = {}
+            accuracyz['总准确率'] = correct_count / len(data)
+            for i in correct_category.keys():
+                accuracyz[i] = correct_category[i][1] / correct_category[i][0]
+
+            accuracyz = d2df(accuracyz)
+            accuracyz = accuracyz.round(10)
+            dump(accuracyz, result_file)
+
+        result = pd.read_csv(result_file)
+        return result
+
+    def build_prompt(self, line):
+        if line['type'] == '选择':
+            tgt_path = self.dump_image(line)
+            question = line['question']
+            options_prompt = 'Options:\n'
+
+            for i in [['A', '1'], ['B', '2'], ['C', '3'], ['D', '4']]:
+                options_prompt += i[0] + '. ' + line['option' + i[1]] + '\n'
+
+            prompt = (f'问题: {question}\n' + options_prompt
+                      + '请回答上述多项选择题,并选出正确选项。这些题目可能包括单选和多选题型。如果所提供的信息不足以确定一个明确的答案,那么请根据可用的数据和你的判断来选择最可能正确的选项。')
+
+            msgs = []
+            if isinstance(tgt_path, list):
+                msgs.extend([dict(type='image', value=p) for p in tgt_path])
+            else:
+                msgs = [dict(type='image', value=tgt_path)]
+            msgs.append(dict(type='text', value=prompt))
+
+            return msgs
+
+        elif line['type'] == '判断':
+            msgs = super().build_prompt(line)
+            assert msgs[-1]['type'] == 'text'
+            msgs[-1]['value'] += '\n请回答上述判断题,并根据题目描述和所给的信息来判断问题中陈述的对错。如果信息不完整或不足以作出绝对判断,请运用你的逻辑推理和现有信息来做出最可能的判断。'
+            return msgs
+
+        else:
+            msgs = super().build_prompt(line)
+            assert msgs[-1]['type'] == 'text'
+            msgs[-1]['value'] += '\n请回答上述填空题,并根据题目的要求和所提供的信息来给出最恰当的答案。如果信息不足以确切回答,那么请依据现有的数据和你的推理能力来填写最合理的答案。'
+            return msgs
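
A quick, informal sanity check of the answer-extraction helpers added in vlmeval/dataset/cmmmu.py (this snippet is not part of the patch, and the sample model responses are invented purely for illustration):

    # Informal sanity check for the CMMMU parsing helpers; sample responses are made up.
    from vlmeval.dataset.cmmmu import (
        get_multi_choice_prediction, extract_numbers, normalize_str, get_TF_prediction)

    index2ans = {'A': '红色', 'B': '绿色', 'C': '蓝色', 'D': '黄色'}
    # Bracketed option letters are counted first, then bare letters, then option contents.
    assert get_multi_choice_prediction('经过分析,答案是(B)', ['A', 'B', 'C', 'D'], index2ans) == 'B'

    # Plain, comma-grouped and scientific-notation numbers are pulled out of free text.
    assert extract_numbers('答案是 42') == ['42']

    # Numeric strings are converted to floats rounded to two decimals.
    assert normalize_str('3.14159', '3.14') == [3.14]

    # Returns the tail segments following indicator words such as '是' / '答案'.
    print(get_TF_prediction('该说法是正确的。'))  # -> ['正确的']

The benchmark itself would then be run through VLMEvalKit's usual entry point, e.g. python run.py --data CMMMU_VAL --model <model_name>, assuming the standard run.py flags.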