diff --git a/run.py b/run.py
index 26930a22..bee2c23b 100644
--- a/run.py
+++ b/run.py
@@ -322,7 +322,7 @@ def main():
                 judge_kwargs['model'] = 'chatgpt-0125'
             elif listinstr(['MMVet', 'LLaVABench', 'MMBench-Video'], dataset_name):
                 judge_kwargs['model'] = 'gpt-4-turbo'
-            elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath'], dataset_name):
+            elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath', 'VL-RewardBench'], dataset_name):  # noqa: E501
                 judge_kwargs['model'] = 'gpt-4o-mini'
             elif listinstr(['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench', 'WildVision'], dataset_name):  # noqa: E501
                 judge_kwargs['model'] = 'gpt-4o'
diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py
index ca3b3984..a1bab65c 100644
--- a/vlmeval/dataset/__init__.py
+++ b/vlmeval/dataset/__init__.py
@@ -20,6 +20,7 @@
 from .mmlongbench import MMLongBench
 from .dude import DUDE
 from .slidevqa import SlideVQA
+from .vl_rewardbench import VLRewardBench

 from .mmbench_video import MMBenchVideo
 from .videomme import VideoMME
@@ -132,7 +133,7 @@ def evaluate(self, eval_file, **judge_kwargs):
     MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset, CCOCRDataset,
     GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset,
     MIABench, OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH,
-    CMMMU
+    CMMMU, VLRewardBench
 ]

 VIDEO_DATASET = [
diff --git a/vlmeval/dataset/utils/ccocr_evaluator/doc_parsing_evaluator.py b/vlmeval/dataset/utils/ccocr_evaluator/doc_parsing_evaluator.py
index 382c4be8..d059adc0 100644
--- a/vlmeval/dataset/utils/ccocr_evaluator/doc_parsing_evaluator.py
+++ b/vlmeval/dataset/utils/ccocr_evaluator/doc_parsing_evaluator.py
@@ -241,7 +241,7 @@ def eval_formula(self, response_info, gt_info, op_name='formula'):
             pred = response_info[img_name]

             if op_name == 'formula':
-                pred = pred.replace("\n", " ").replace("```latex", "").replace("```", "").replace("\t", " ").replace(" ", "")
+                pred = pred.replace("\n", " ").replace("```latex", "").replace("```", "").replace("\t", " ").replace(" ", "")  # noqa: E501
                 gt = gt.replace(" ", "")
             elif op_name == 'molecular':
                 pred = pred.replace("\n", "").replace(" ", "").replace("", "").replace("", "")
diff --git a/vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py b/vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py
index 2613a338..797d4244 100644
--- a/vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py
+++ b/vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py
@@ -66,11 +66,11 @@ def update_cost(node1: Node, node2: Node):
     label2 = node2.label
     label1_leaf = "<leaf>" in label1
     label2_leaf = "<leaf>" in label2
-    if label1_leaf == True and label2_leaf == True:
+    if label1_leaf and label2_leaf:
         return edit_distance(label1.replace("<leaf>", ""), label2.replace("<leaf>", ""))
-    elif label1_leaf == False and label2_leaf == True:
+    elif not label1_leaf and label2_leaf:
         return 1 + len(label2.replace("<leaf>", ""))
-    elif label1_leaf == True and label2_leaf == False:
+    elif label1_leaf and not label2_leaf:
         return 1 + len(label1.replace("<leaf>", ""))
     else:
         return int(label1 != label2)
@@ -121,7 +121,8 @@ def normalize_dict(data: Union[Dict, List, Any]):

 def cal_f1_all(preds, answers):
     """
-    Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives, false negatives and false positives
+    Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives,
+    false negatives and false positives
     """
     metric_info, error_info = {}, {}
     total_tp, total_fn_or_fp = 0, 0
@@ -233,35 +234,28 @@ def cal_acc(pred: dict, answer: dict):
     """
     pred = construct_tree_from_dict(normalize_dict(pred))
     answer = construct_tree_from_dict(normalize_dict(answer))
-    return max(
-        0,
-        1
-        - (
-            zss.distance(
-                pred,
-                answer,
-                get_children=zss.Node.get_children,
-                insert_cost=insert_and_remove_cost,
-                remove_cost=insert_and_remove_cost,
-                update_cost=update_cost,
-                return_operations=False,
-            )
-            / zss.distance(
-                construct_tree_from_dict(normalize_dict({})),
-                answer,
-                get_children=zss.Node.get_children,
-                insert_cost=insert_and_remove_cost,
-                remove_cost=insert_and_remove_cost,
-                update_cost=update_cost,
-                return_operations=False,
-            )
-        ),
+    val1 = zss.distance(
+        pred,
+        answer,
+        get_children=zss.Node.get_children,
+        insert_cost=insert_and_remove_cost,
+        remove_cost=insert_and_remove_cost,
+        update_cost=update_cost,
+        return_operations=False,
+    )
+    val2 = zss.distance(
+        construct_tree_from_dict(normalize_dict({})),
+        answer,
+        get_children=zss.Node.get_children,
+        insert_cost=insert_and_remove_cost,
+        remove_cost=insert_and_remove_cost,
+        update_cost=update_cost,
+        return_operations=False,
     )
+    return max(0, 1 - val1 / val2)


 def cal_acc_all(pred_info, answer_info):
-    """
-    """
     acc_info, error_info = {}, {}
     for file_name, answer in answer_info.items():
         # if file_name not in pred_info:
@@ -303,13 +297,11 @@ def eval_donut(pdt_info, gt_info, normalize_func=None, data_name=None):
     acc_average, acc_error_info = cal_acc_all(pdt_info, gt_info)
     eval_info = {"f1_score": f1_score, "acc": acc_average, "class_f1_score": class_eval_info,
                  "f1_error_info": error_info, "acc_error_info": acc_error_info}
-    print(data_name, "f1_score", f1_score, "acc", acc_average) 
+    print(data_name, "f1_score", f1_score, "acc", acc_average)
     return eval_info


 def post_process_to_json(qwen_info_str, file_name=None):
-    """
-    """
     try:
         if "```json" in qwen_info_str:
             if "```" not in qwen_info_str:
@@ -320,10 +312,7 @@
             json_str = qwen_info_str.strip().replace("\n", "")
         json_data = json.loads(json_str)
         return json_data
-    except Exception as e:
-        # print("--> post error: {}, file_name: {}".format(e, file_name))
-        # print("json_raw", qwen_info_str)
-        # print("json_str", json_str)
+    except Exception as err:  # noqa: F841
         return None


diff --git a/vlmeval/dataset/utils/ccocr_evaluator/ocr_evaluator.py b/vlmeval/dataset/utils/ccocr_evaluator/ocr_evaluator.py
index e33b95c6..5c82abcc 100644
--- a/vlmeval/dataset/utils/ccocr_evaluator/ocr_evaluator.py
+++ b/vlmeval/dataset/utils/ccocr_evaluator/ocr_evaluator.py
@@ -92,8 +92,10 @@ def evaluate(self, response_info, gt_info, **kwargs):
         image_pdt_info, image_gt_info = {}, {}
         for file_name, gt_src in gt_info.items():
             pred_src = response_info.get(file_name, "")
-            pdt_token_list = text_normalize_and_tokenize(str(pred_src).strip(), is_word_level, is_lower, is_alphanum_only)
-            gt_token_list = text_normalize_and_tokenize(str(gt_src).strip(), is_word_level, is_lower, is_alphanum_only)
+            pdt_token_list = text_normalize_and_tokenize(
+                str(pred_src).strip(), is_word_level, is_lower, is_alphanum_only)
+            gt_token_list = text_normalize_and_tokenize(
+                str(gt_src).strip(), is_word_level, is_lower, is_alphanum_only)
             image_pdt_info[file_name] = pdt_token_list
             image_gt_info[file_name] = gt_token_list
         eval_result = calculate_metrics(image_pdt_info, image_gt_info, is_verbose=False)
diff --git a/vlmeval/dataset/vl_rewardbench.py b/vlmeval/dataset/vl_rewardbench.py
new file mode 100644
index 00000000..d8dad738
--- /dev/null
+++ b/vlmeval/dataset/vl_rewardbench.py
@@ -0,0 +1,174 @@
+from ast import literal_eval
+
+from .image_base import ImageBaseDataset
+from .utils import build_judge, DEBUG_MESSAGE
+from ..smp import *
+from ..utils import track_progress_rich
+
+
+LLM_PARSE_ANSWER_PROMPT = '''
+You are given a pairwise judgement for two responses. Please return the better response according to the judgement.
+Return the Answer X ONLY. e.g., Answer 1 or Answer 2.
+
+Judgement: {judgement}
+'''
+
+
+PROMPT_TEMPLATE = '''\
+You are a highly capable multimodal AI assistant tasked with evaluating answers to visual questions.
+Please analyze the following image and question, then determine which of the two provided answers is better.
+
+Question: {query}
+
+Answer 1: {answer_0}
+
+Answer 2: {answer_1}
+
+Please evaluate both answers based on the following criteria:
+1. Accuracy: How well does the answer align with the visual information in the image?
+2. Completeness: Does the answer fully address all aspects of the question?
+3. Clarity: Is the answer easy to understand and well-articulated?
+4. Relevance: Does the answer directly relate to the question and the image?
+
+After your evaluation, please:
+1. Explain your reasoning for each criterion.
+2. Provide an overall judgment on which answer is better (Answer 1 or Answer 2).\
+For example: Overall Judgment: Answer X is better.
+
+Your response should be structured and detailed, \
+demonstrating your understanding of both the visual and textual elements of the task.'''
+
+
+def get_score(line, parsed_response, random_number):
+    gt_ans = line['human_ranking'].index(0 if random_number == 0 else 1) + 1
+    if 'Answer 1'.lower() in parsed_response.lower():
+        pred = 1
+    elif 'Answer 2'.lower() in parsed_response.lower():
+        pred = 2
+    else:  # failed
+        pred = 'None'  # random.choice([1, 2])
+
+    if pred == gt_ans:
+        return 1.0
+    else:
+        return 0.0
+
+
+def VLRewardBench_eval_answer(model, line):
+    response = toliststr(line['response'])
+    random_number = sum(len(res) for res in response) % 2
+
+    prompt = LLM_PARSE_ANSWER_PROMPT.format(judgement=line['prediction'])
+    messages = [dict(type='text', value=prompt)]
+
+    resp = model.generate(messages)
+    score = get_score(line, resp, random_number)
+
+    if score is None:
+        return 'Unknown'
+    return score
+
+
+class VLRewardBench(ImageBaseDataset):
+    TYPE = 'VQA'
+    DATASET_URL = {
+        'VL-RewardBench': 'https://huggingface.co/datasets/MMInstruction/VL-RewardBench/resolve/main/vl_rewardbench.tsv'
+    }
+    DATASET_MD5 = {'VL-RewardBench': '1d2676f4ab4a5f755019ec0af2b28189'}
+
+    # Given one data record, return the built prompt (a multi-modal message), can override
+    def build_prompt(self, line):
+        if isinstance(line, int):
+            line = self.data.iloc[line]
+        tgt_path = self.dump_image(line)  # save image to local
+        question = line['question']
+        msgs = []
+        if isinstance(tgt_path, list):
+            msgs.extend([dict(type='image', value=p) for p in tgt_path])
+        else:
+            msgs = [dict(type='image', value=tgt_path)]
+
+        response = toliststr(line['response'])
+        random_number = sum(len(res) for res in response) % 2
+        if random_number == 1:
+            # randomly shuffle the order of the responses
+            response = response[::-1]
+        query_prompt = PROMPT_TEMPLATE.format(
+            query=question, answer_0=response[0], answer_1=response[1]
+        )
+        msgs = msgs + [dict(type='text', value=query_prompt)]
+        return msgs
+
+    # It returns a DataFrame
+    @classmethod
+    def evaluate(self, eval_file, **judge_kwargs):
+        suffix = eval_file.split('.')[-1]
+        model = judge_kwargs['model']
+        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
+        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
+        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+        nproc = judge_kwargs.pop('nproc', 4)
+
+        if not osp.exists(storage):
+            raw_data = VLRewardBench('VL-RewardBench').data
+            data = load(eval_file)
+            data['prediction'] = [str(x) for x in data['prediction']]
+            data['human_ranking'] = [literal_eval(x) for x in raw_data['answer']]
+
+            judge_kwargs['temperature'] = 0
+            judge_kwargs['timeout'] = 60
+            model = build_judge(max_tokens=128, **judge_kwargs)
+
+            assert model.working(), (
+                'VLRewardBench evaluation requires a working OPENAI API\n'
+                + DEBUG_MESSAGE
+            )
+
+            lt = len(data)
+            lines = [data.iloc[i] for i in range(lt)]
+            tups = [(model, line) for line in lines]
+            indices = [line['index'] for line in lines]
+
+            ans = load(tmp_file) if osp.exists(tmp_file) else {}
+            tups = [x for x, i in zip(tups, indices) if i not in ans]
+            indices = [i for i in indices if i not in ans]
+
+            if len(indices):
+                new_results = track_progress_rich(
+                    VLRewardBench_eval_answer,
+                    tups,
+                    nproc=nproc,
+                    chunksize=nproc,
+                    keys=indices,
+                    save=tmp_file,
+                )
+                ans = load(tmp_file)
+                for k, v in zip(indices, new_results):
+                    ans[k] = v
+
+            data['score'] = [ans[idx] for idx in data['index']]
+            # data.pop('image')
+            dump(data, storage)
+
+        data = load(storage)
+        lt = len(data)
+
+        category_scores = defaultdict(lambda: 0)
+        category_cnt = defaultdict(lambda: 0)
+        scores = defaultdict(lambda: 0)
+        for i in range(lt):
+            item = data.iloc[i]
+            category_scores[item['category']] += item['score']
+            category_cnt[item['category']] += 1
+        # calculate the average score for each category
+        for k, v in category_scores.items():
+            scores[k] = v / category_cnt[k]
+        # calculate category macro accuracy (average across categories)
+        scores['Macro Accuracy'] = sum(scores.values()) / len(scores)
+        # calculate the total average score
+        scores['Overall Consistency'] = sum(category_scores.values()) / lt
+
+        scores = {k: [v] for k, v in scores.items()}
+        scores = pd.DataFrame(scores)
+        dump(scores, score_file)
+        return scores
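
Reviewer note: the "shuffle" in build_prompt is deterministic, not random. The two candidate answers are swapped whenever the total character length of the responses is odd, and get_score recomputes the same parity to map the judge's "Answer 1" / "Answer 2" verdict back onto human_ranking. A minimal sanity-check sketch, assuming the module above is importable; the record below is made up for illustration:

    from vlmeval.dataset.vl_rewardbench import get_score

    # Hypothetical record: human_ranking marks response[0] as preferred (rank 0 first).
    line = {
        'response': ['short answer', 'a longer answer'],
        'human_ranking': [0, 1],
    }
    # Same parity computation as build_prompt / VLRewardBench_eval_answer.
    random_number = sum(len(res) for res in line['response']) % 2  # 27 chars total -> 1
    # With parity 1 the order is flipped, so the preferred response is shown as "Answer 2";
    # a judgement that picks Answer 2 scores 1.0, picking Answer 1 scores 0.0.
    assert get_score(line, 'Answer 2 is better', random_number) == 1.0
    assert get_score(line, 'Answer 1 is better', random_number) == 0.0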