From 80a329ce5ad92d63ffafcedbcad0e926ba54c4f6 Mon Sep 17 00:00:00 2001 From: Lei Li Date: Mon, 30 Dec 2024 10:47:03 +0000 Subject: [PATCH 01/15] update vlrewardbench --- run.py | 2 +- vlmeval/dataset/__init__.py | 3 +- vlmeval/dataset/vl_rewardbench.py | 173 ++++++++++++++++++++++++++++++ 3 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 vlmeval/dataset/vl_rewardbench.py diff --git a/run.py b/run.py index 26930a22..53a90735 100644 --- a/run.py +++ b/run.py @@ -322,7 +322,7 @@ def main(): judge_kwargs['model'] = 'chatgpt-0125' elif listinstr(['MMVet', 'LLaVABench', 'MMBench-Video'], dataset_name): judge_kwargs['model'] = 'gpt-4-turbo' - elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath'], dataset_name): + elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath', 'VL-RewardBench'], dataset_name): judge_kwargs['model'] = 'gpt-4o-mini' elif listinstr(['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench', 'WildVision'], dataset_name): # noqa: E501 judge_kwargs['model'] = 'gpt-4o' diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index ca3b3984..a1bab65c 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -20,6 +20,7 @@ from .mmlongbench import MMLongBench from .dude import DUDE from .slidevqa import SlideVQA +from .vl_rewardbench import VLRewardBench from .mmbench_video import MMBenchVideo from .videomme import VideoMME @@ -132,7 +133,7 @@ def evaluate(self, eval_file, **judge_kwargs): MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset, CCOCRDataset, GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset, MIABench, OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH, - CMMMU + CMMMU, VLRewardBench ] VIDEO_DATASET = [ diff --git a/vlmeval/dataset/vl_rewardbench.py b/vlmeval/dataset/vl_rewardbench.py new file mode 100644 index 00000000..0da3f218 --- /dev/null +++ b/vlmeval/dataset/vl_rewardbench.py @@ -0,0 +1,173 @@ +import re +from functools import partial +import random +from ast import literal_eval + +from .image_base import ImageBaseDataset +from .utils import build_judge, DEBUG_MESSAGE +from ..smp import * +from ..utils import track_progress_rich + + +LLM_PARSE_ANSWER_PROMPT = """ +You are given a pairwise judgement for two responses. Please return the better response according to the judgement. +Return the Answer X ONLY. e.g., Answer 1 or Answer 2. + +Judgement: {judgement} +""" + + +PROMPT_TEMPLATE = """\ +You are a highly capable multimodal AI assistant tasked with evaluating answers to visual questions. Please analyze the following image and question, then determine which of the two provided answers is better. + +Question: {query} + +Answer 1: {answer_0} + +Answer 2: {answer_1} + +Please evaluate both answers based on the following criteria: +1. Accuracy: How well does the answer align with the visual information in the image? +2. Completeness: Does the answer fully address all aspects of the question? +3. Clarity: Is the answer easy to understand and well-articulated? +4. Relevance: Does the answer directly relate to the question and the image? + +After your evaluation, please: +1. Explain your reasoning for each criterion. +2. Provide an overall judgment on which answer is better (Answer 1 or Answer 2). For example: Overall Judgment: Answer X is better. 
+ +Your response should be structured and detailed, demonstrating your understanding of both the visual and textual elements of the task.""" + + +def get_score(line, parsed_response, random_number): + gt_ans = line["human_ranking"].index(0 if random_number == 0 else 1) + 1 + if "Answer 1".lower() in parsed_response.lower(): + pred = 1 + elif "Answer 2".lower() in parsed_response.lower(): + pred = 2 + else: # failed + pred = "None" # random.choice([1, 2]) + + if pred == gt_ans: + return 1.0 + else: + return 0.0 + + +def VLRewardBench_eval_answer(model, line): + response = toliststr(line["response"]) + random_number = sum(len(res) for res in response) % 2 + + prompt = LLM_PARSE_ANSWER_PROMPT.format(judgement=line["prediction"]) + messages = [dict(type="text", value=prompt)] + + resp = model.generate(messages) + score = get_score(line, resp, random_number) + + if score is None: + return "Unknown" + return score + + +class VLRewardBench(ImageBaseDataset): + TYPE = "VQA" + DATASET_URL = { + "VL-RewardBench": "https://huggingface.co/datasets/MMInstruction/VL-RewardBench/resolve/main/vl_rewardbench.tsv" + } + DATASET_MD5 = {"VL-RewardBench": "1d2676f4ab4a5f755019ec0af2b28189"} + + # Given one data record, return the built prompt (a multi-modal message), can override + def build_prompt(self, line): + if isinstance(line, int): + line = self.data.iloc[line] + tgt_path = self.dump_image(line) # save image to local + question = line["question"] + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + + response = toliststr(line["response"]) + random_number = sum(len(res) for res in response) % 2 + if random_number == 1: + response = response[::-1] # randomly shuffle the order of the responses + query_prompt = PROMPT_TEMPLATE.format( + query=question, answer_0=response[0], answer_1=response[1] + ) + msgs = msgs + [dict(type="text", value=query_prompt)] + return msgs + + # It returns a DataFrame + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + suffix = eval_file.split(".")[-1] + model = judge_kwargs["model"] + storage = eval_file.replace(f".{suffix}", f"_{model}.xlsx") + score_file = eval_file.replace(f".{suffix}", f"_{model}_score.csv") + tmp_file = eval_file.replace(f".{suffix}", f"_{model}.pkl") + nproc = judge_kwargs.pop("nproc", 4) + + if not osp.exists(storage): + raw_data = VLRewardBench("VL-RewardBench").data + data = load(eval_file) + data["prediction"] = [str(x) for x in data["prediction"]] + data["human_ranking"] = [literal_eval(x) for x in raw_data["answer"]] + + judge_kwargs["temperature"] = 0 + judge_kwargs["timeout"] = 60 + model = build_judge(max_tokens=128, **judge_kwargs) + + assert model.working(), ( + "VLRewardBench evaluation requires a working OPENAI API\n" + + DEBUG_MESSAGE + ) + + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line["index"] for line in lines] + + ans = load(tmp_file) if osp.exists(tmp_file) else {} + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + VLRewardBench_eval_answer, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file, + ) + ans = load(tmp_file) + for k, v in zip(indices, new_results): + ans[k] = v + + data["score"] = [ans[idx] for idx in data["index"]] + # data.pop("image") + dump(data, storage) + + data = load(storage) + lt = 
len(data) + + category_scores = defaultdict(lambda: 0) + category_cnt = defaultdict(lambda: 0) + scores = defaultdict(lambda: 0) + for i in range(lt): + item = data.iloc[i] + category_scores[item["category"]] += item["score"] + category_cnt[item["category"]] += 1 + # calculate the average score for each category + for k, v in category_scores.items(): + scores[k] = v / category_cnt[k] + # calculate category macro accuracy (average across categories) + scores["Macro Accuracy"] = sum(scores.values()) / len(scores) + # calculate the total average score + scores["Overall Consistency"] = sum(category_scores.values()) / lt + + scores = {k: [v] for k, v in scores.items()} + scores = pd.DataFrame(scores) + dump(scores, score_file) + return scores From 72262a830d33a1ec226e046e64a8f971a62f5840 Mon Sep 17 00:00:00 2001 From: Lei Li Date: Mon, 30 Dec 2024 12:14:07 +0000 Subject: [PATCH 02/15] pre-commit fix --- vlmeval/dataset/vl_rewardbench.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/vlmeval/dataset/vl_rewardbench.py b/vlmeval/dataset/vl_rewardbench.py index 0da3f218..72e483c4 100644 --- a/vlmeval/dataset/vl_rewardbench.py +++ b/vlmeval/dataset/vl_rewardbench.py @@ -1,6 +1,3 @@ -import re -from functools import partial -import random from ast import literal_eval from .image_base import ImageBaseDataset @@ -18,7 +15,8 @@ PROMPT_TEMPLATE = """\ -You are a highly capable multimodal AI assistant tasked with evaluating answers to visual questions. Please analyze the following image and question, then determine which of the two provided answers is better. +You are a highly capable multimodal AI assistant tasked with evaluating answers to visual questions. +Please analyze the following image and question, then determine which of the two provided answers is better. Question: {query} @@ -34,9 +32,11 @@ After your evaluation, please: 1. Explain your reasoning for each criterion. -2. Provide an overall judgment on which answer is better (Answer 1 or Answer 2). For example: Overall Judgment: Answer X is better. +2. Provide an overall judgment on which answer is better (Answer 1 or Answer 2).\ +For example: Overall Judgment: Answer X is better. 
-Your response should be structured and detailed, demonstrating your understanding of both the visual and textual elements of the task.""" +Your response should be structured and detailed, \ +demonstrating your understanding of both the visual and textual elements of the task.""" def get_score(line, parsed_response, random_number): @@ -46,7 +46,7 @@ def get_score(line, parsed_response, random_number): elif "Answer 2".lower() in parsed_response.lower(): pred = 2 else: # failed - pred = "None" # random.choice([1, 2]) + pred = "None" # random.choice([1, 2]) if pred == gt_ans: return 1.0 @@ -84,14 +84,15 @@ def build_prompt(self, line): question = line["question"] msgs = [] if isinstance(tgt_path, list): - msgs.extend([dict(type='image', value=p) for p in tgt_path]) + msgs.extend([dict(type="image", value=p) for p in tgt_path]) else: - msgs = [dict(type='image', value=tgt_path)] + msgs = [dict(type="image", value=tgt_path)] response = toliststr(line["response"]) random_number = sum(len(res) for res in response) % 2 if random_number == 1: - response = response[::-1] # randomly shuffle the order of the responses + # randomly shuffle the order of the responses + response = response[::-1] query_prompt = PROMPT_TEMPLATE.format( query=question, answer_0=response[0], answer_1=response[1] ) From 86c2a57e3e9edecd31987a927f3618e1125070a3 Mon Sep 17 00:00:00 2001 From: Lei Li Date: Mon, 30 Dec 2024 12:22:29 +0000 Subject: [PATCH 03/15] formatter --- vlmeval/dataset/vl_rewardbench.py | 78 +++++++++++++++---------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/vlmeval/dataset/vl_rewardbench.py b/vlmeval/dataset/vl_rewardbench.py index 72e483c4..d8dad738 100644 --- a/vlmeval/dataset/vl_rewardbench.py +++ b/vlmeval/dataset/vl_rewardbench.py @@ -6,15 +6,15 @@ from ..utils import track_progress_rich -LLM_PARSE_ANSWER_PROMPT = """ +LLM_PARSE_ANSWER_PROMPT = ''' You are given a pairwise judgement for two responses. Please return the better response according to the judgement. Return the Answer X ONLY. e.g., Answer 1 or Answer 2. Judgement: {judgement} -""" +''' -PROMPT_TEMPLATE = """\ +PROMPT_TEMPLATE = '''\ You are a highly capable multimodal AI assistant tasked with evaluating answers to visual questions. Please analyze the following image and question, then determine which of the two provided answers is better. @@ -36,17 +36,17 @@ For example: Overall Judgment: Answer X is better. 
Your response should be structured and detailed, \ -demonstrating your understanding of both the visual and textual elements of the task.""" +demonstrating your understanding of both the visual and textual elements of the task.''' def get_score(line, parsed_response, random_number): - gt_ans = line["human_ranking"].index(0 if random_number == 0 else 1) + 1 - if "Answer 1".lower() in parsed_response.lower(): + gt_ans = line['human_ranking'].index(0 if random_number == 0 else 1) + 1 + if 'Answer 1'.lower() in parsed_response.lower(): pred = 1 - elif "Answer 2".lower() in parsed_response.lower(): + elif 'Answer 2'.lower() in parsed_response.lower(): pred = 2 else: # failed - pred = "None" # random.choice([1, 2]) + pred = 'None' # random.choice([1, 2]) if pred == gt_ans: return 1.0 @@ -55,40 +55,40 @@ def get_score(line, parsed_response, random_number): def VLRewardBench_eval_answer(model, line): - response = toliststr(line["response"]) + response = toliststr(line['response']) random_number = sum(len(res) for res in response) % 2 - prompt = LLM_PARSE_ANSWER_PROMPT.format(judgement=line["prediction"]) - messages = [dict(type="text", value=prompt)] + prompt = LLM_PARSE_ANSWER_PROMPT.format(judgement=line['prediction']) + messages = [dict(type='text', value=prompt)] resp = model.generate(messages) score = get_score(line, resp, random_number) if score is None: - return "Unknown" + return 'Unknown' return score class VLRewardBench(ImageBaseDataset): - TYPE = "VQA" + TYPE = 'VQA' DATASET_URL = { - "VL-RewardBench": "https://huggingface.co/datasets/MMInstruction/VL-RewardBench/resolve/main/vl_rewardbench.tsv" + 'VL-RewardBench': 'https://huggingface.co/datasets/MMInstruction/VL-RewardBench/resolve/main/vl_rewardbench.tsv' } - DATASET_MD5 = {"VL-RewardBench": "1d2676f4ab4a5f755019ec0af2b28189"} + DATASET_MD5 = {'VL-RewardBench': '1d2676f4ab4a5f755019ec0af2b28189'} # Given one data record, return the built prompt (a multi-modal message), can override def build_prompt(self, line): if isinstance(line, int): line = self.data.iloc[line] tgt_path = self.dump_image(line) # save image to local - question = line["question"] + question = line['question'] msgs = [] if isinstance(tgt_path, list): - msgs.extend([dict(type="image", value=p) for p in tgt_path]) + msgs.extend([dict(type='image', value=p) for p in tgt_path]) else: - msgs = [dict(type="image", value=tgt_path)] + msgs = [dict(type='image', value=tgt_path)] - response = toliststr(line["response"]) + response = toliststr(line['response']) random_number = sum(len(res) for res in response) % 2 if random_number == 1: # randomly shuffle the order of the responses @@ -96,38 +96,38 @@ def build_prompt(self, line): query_prompt = PROMPT_TEMPLATE.format( query=question, answer_0=response[0], answer_1=response[1] ) - msgs = msgs + [dict(type="text", value=query_prompt)] + msgs = msgs + [dict(type='text', value=query_prompt)] return msgs # It returns a DataFrame @classmethod def evaluate(self, eval_file, **judge_kwargs): - suffix = eval_file.split(".")[-1] - model = judge_kwargs["model"] - storage = eval_file.replace(f".{suffix}", f"_{model}.xlsx") - score_file = eval_file.replace(f".{suffix}", f"_{model}_score.csv") - tmp_file = eval_file.replace(f".{suffix}", f"_{model}.pkl") - nproc = judge_kwargs.pop("nproc", 4) + suffix = eval_file.split('.')[-1] + model = judge_kwargs['model'] + storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') + score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv') + tmp_file = eval_file.replace(f'.{suffix}', 
f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): - raw_data = VLRewardBench("VL-RewardBench").data + raw_data = VLRewardBench('VL-RewardBench').data data = load(eval_file) - data["prediction"] = [str(x) for x in data["prediction"]] - data["human_ranking"] = [literal_eval(x) for x in raw_data["answer"]] + data['prediction'] = [str(x) for x in data['prediction']] + data['human_ranking'] = [literal_eval(x) for x in raw_data['answer']] - judge_kwargs["temperature"] = 0 - judge_kwargs["timeout"] = 60 + judge_kwargs['temperature'] = 0 + judge_kwargs['timeout'] = 60 model = build_judge(max_tokens=128, **judge_kwargs) assert model.working(), ( - "VLRewardBench evaluation requires a working OPENAI API\n" + 'VLRewardBench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE ) lt = len(data) lines = [data.iloc[i] for i in range(lt)] tups = [(model, line) for line in lines] - indices = [line["index"] for line in lines] + indices = [line['index'] for line in lines] ans = load(tmp_file) if osp.exists(tmp_file) else {} tups = [x for x, i in zip(tups, indices) if i not in ans] @@ -146,8 +146,8 @@ def evaluate(self, eval_file, **judge_kwargs): for k, v in zip(indices, new_results): ans[k] = v - data["score"] = [ans[idx] for idx in data["index"]] - # data.pop("image") + data['score'] = [ans[idx] for idx in data['index']] + # data.pop('image') dump(data, storage) data = load(storage) @@ -158,15 +158,15 @@ def evaluate(self, eval_file, **judge_kwargs): scores = defaultdict(lambda: 0) for i in range(lt): item = data.iloc[i] - category_scores[item["category"]] += item["score"] - category_cnt[item["category"]] += 1 + category_scores[item['category']] += item['score'] + category_cnt[item['category']] += 1 # calculate the average score for each category for k, v in category_scores.items(): scores[k] = v / category_cnt[k] # calculate category macro accuracy (average across categories) - scores["Macro Accuracy"] = sum(scores.values()) / len(scores) + scores['Macro Accuracy'] = sum(scores.values()) / len(scores) # calculate the total average score - scores["Overall Consistency"] = sum(category_scores.values()) / lt + scores['Overall Consistency'] = sum(category_scores.values()) / lt scores = {k: [v] for k, v in scores.items()} scores = pd.DataFrame(scores) From ac535c52b11f0d70d00e0a4f6b58a0d3a056a532 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Mon, 30 Dec 2024 04:05:53 +0000 Subject: [PATCH 04/15] [Improvement] Better `AUTO_SPLIT` and model split for InternVL2 --- vlmeval/smp/misc.py | 13 ++++++++++++- vlmeval/vlm/internvl/utils.py | 5 +++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/vlmeval/smp/misc.py b/vlmeval/smp/misc.py index 81b1ae5c..d0d9e909 100644 --- a/vlmeval/smp/misc.py +++ b/vlmeval/smp/misc.py @@ -277,4 +277,15 @@ def get_gpu_memory(): def auto_split_flag(): flag = os.environ.get('AUTO_SPLIT', '0') - return flag == '1' + if flag == '1': + return True + _, world_size = get_rank_and_world_size() + try: + import torch + device_count = torch.cuda.device_count() + if device_count > world_size and device_count % world_size == 0: + return True + else: + return False + except: + return False diff --git a/vlmeval/vlm/internvl/utils.py b/vlmeval/vlm/internvl/utils.py index a4e16c8d..7e3a917f 100644 --- a/vlmeval/vlm/internvl/utils.py +++ b/vlmeval/vlm/internvl/utils.py @@ -119,7 +119,7 @@ def get_local_rank_and_local_world_size(): def split_model(model_path): - num_gpus_per_node = 8 + num_gpus_per_node = torch.cuda.device_count() rank, 
world_size = get_rank_and_world_size() try: local_rank, local_world_size = get_local_rank_and_local_world_size() @@ -130,7 +130,7 @@ def split_model(model_path): gpus_per_process = int(os.environ['GPUS_PER_PROCESS']) else: gpus_per_process = 8 # default to use 8 GPUs for one model - + gpus_per_process = min(gpus_per_process, num_gpus_per_node // local_world_size) start_gpu = local_rank * gpus_per_process end_gpu = start_gpu + gpus_per_process @@ -159,6 +159,7 @@ def split_model(model_path): device_map['language_model.model.embed_tokens'] = visible_devices[0] device_map['language_model.output'] = visible_devices[0] device_map['language_model.model.norm'] = visible_devices[0] + device_map['language_model.model.rotary_emb'] = visible_devices[0] device_map['language_model.lm_head'] = visible_devices[0] device_map[f'language_model.model.layers.{num_layers - 1}'] = visible_devices[0] From fe3b2522eb5b10b49f28dc485dba06a8487839a3 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Mon, 30 Dec 2024 04:13:54 +0000 Subject: [PATCH 05/15] [Minor] Improve CC-OCR Import --- vlmeval/config.py | 2 +- vlmeval/dataset/image_ccocr.py | 12 ++++++++++-- vlmeval/dataset/utils/__init__.py | 3 +-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/vlmeval/config.py b/vlmeval/config.py index 6eb09f79..80bf14dd 100644 --- a/vlmeval/config.py +++ b/vlmeval/config.py @@ -13,7 +13,7 @@ VideoChatGPT_ROOT = None PLLaVA_ROOT = None RBDash_ROOT = None -VITA_ROOT = None +VITA_ROOT = '/fs-computility/mllm1/shared/dhd/VITA' LLAVA_V1_7B_MODEL_PTH = 'Please set your local path to LLaVA-7B-v1.1 here, the model weight is obtained by merging LLaVA delta weight based on vicuna-7b-v1.1 in https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md with vicuna-7b-v1.1. ' video_models = { diff --git a/vlmeval/dataset/image_ccocr.py b/vlmeval/dataset/image_ccocr.py index 0af2f077..290bfcee 100644 --- a/vlmeval/dataset/image_ccocr.py +++ b/vlmeval/dataset/image_ccocr.py @@ -1,10 +1,18 @@ +# flake8: noqa + import os import re import tempfile from functools import partial import pandas as pd -from .utils import ccocr_evaluator_map +try: + from .utils.ccocr_evaluator import evaluator_map_info as ccocr_evaluator_map +except ImportError as err: + import warnings + warnings.warn('The dependency of CCOCR evaluator is not properly installed') + warnings.warn(f'{type(err)}: {err}') + from .image_base import ImageBaseDataset from ..smp import * @@ -157,7 +165,7 @@ def evaluate(self, eval_file, **judge_kwargs): for data_info in dict_list: image_name = data_info['image_name'] gt_info[image_name] = data_info['answer'] - + # warning the FAIL samples if data_info['prediction'] != FAIL_MSG: ptd_info[image_name] = data_info['prediction'] diff --git a/vlmeval/dataset/utils/__init__.py b/vlmeval/dataset/utils/__init__.py index c9cb7cbc..ecad3127 100644 --- a/vlmeval/dataset/utils/__init__.py +++ b/vlmeval/dataset/utils/__init__.py @@ -1,10 +1,9 @@ from .judge_util import build_judge, DEBUG_MESSAGE from .multiple_choice import extract_answer_from_item, prefetch_answer from .vqa_eval import levenshtein_distance -from .ccocr_evaluator import evaluator_map_info as ccocr_evaluator_map __all__ = [ - 'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'ccocr_evaluator_map', + 'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'levenshtein_distance', 'DEBUG_MESSAGE', ] From c29628d26eca19843daa2fa924fd7dab09d0aa3c Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Mon, 30 Dec 2024 06:31:49 +0000 Subject: [PATCH 06/15] 
[Model] Support QVQ --- vlmeval/config.py | 1 + vlmeval/vlm/qwen2_vl/model.py | 20 ++++++++++++++++++++ vlmeval/vlm/vita.py | 4 ++-- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/vlmeval/config.py b/vlmeval/config.py index 80bf14dd..8545b045 100644 --- a/vlmeval/config.py +++ b/vlmeval/config.py @@ -332,6 +332,7 @@ qwen2vl_series = { 'Qwen-VL-Max-0809': partial(Qwen2VLAPI, model='qwen-vl-max-0809', min_pixels=1280*28*28, max_pixels=16384*28*28), 'Qwen-VL-Plus-0809': partial(Qwen2VLAPI, model='qwen-vl-plus-0809', min_pixels=1280*28*28, max_pixels=16384*28*28), + 'QVQ-72B-Preview': partial(Qwen2VLChat, model_path='Qwen/QVQ-72B-Preview', min_pixels=1280*28*28, max_pixels=16384*28*28, system_prompt='You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.', max_new_tokens=8192, post_process=False), 'Qwen2-VL-72B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-72B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28), 'Qwen2-VL-7B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28), 'Qwen2-VL-7B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-AWQ', min_pixels=1280*28*28, max_pixels=16384*28*28), diff --git a/vlmeval/vlm/qwen2_vl/model.py b/vlmeval/vlm/qwen2_vl/model.py index 65629b0a..163581ed 100644 --- a/vlmeval/vlm/qwen2_vl/model.py +++ b/vlmeval/vlm/qwen2_vl/model.py @@ -76,6 +76,7 @@ def __init__( repetition_penalty=1.0, use_custom_prompt: bool = True, system_prompt: str | None = None, + post_process: bool = False, # if True, will try to only extract stuff in the last \boxed{}. verbose: bool = False, ): super().__init__(use_custom_prompt=use_custom_prompt) @@ -90,6 +91,7 @@ def __init__( ) self.system_prompt = system_prompt self.verbose = verbose + self.post_process = post_process self.fps = 2.0 self.nframe = 64 self.FRAME_FACTOR = 2 @@ -195,6 +197,24 @@ def generate_inner(self, message, dataset=None): generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False ) response = out[0] + if self.post_process: + resp = response.split('\\boxed{')[-1] + lt = len(resp) + counter, end = 1, None + for i in range(lt): + if resp[i] == '{': + counter += 1 + elif resp[i] == '}': + counter -= 1 + if counter == 0: + end = i + break + elif i == lt - 1: + end = lt + break + if end is not None: + response = resp[:end] + if self.verbose: print(f'\033[32m{response}\033[0m') return response diff --git a/vlmeval/vlm/vita.py b/vlmeval/vlm/vita.py index db0b31d0..e7604cee 100644 --- a/vlmeval/vlm/vita.py +++ b/vlmeval/vlm/vita.py @@ -28,9 +28,9 @@ def __init__(self, model_path='VITA/vita', root=None, **kwargs): model_name = get_model_name_from_path(model_path) tokenizer, model, image_processor, _ = load_pretrained_model( - model_path, None, model_name, model_type='qwen2p5_instruct', device_map='auto' + model_path, None, model_name, model_type='qwen2p5_instruct', device_map='cuda' ) - # model.cuda().eval() + model = model.eval() # model.tie_weights() audio_encoder = model.get_audio_encoder() From 40bbc75ca102acef26a725bcdff2a95075df003d Mon Sep 17 00:00:00 2001 From: jamespark3922 Date: Sun, 29 Dec 2024 23:06:12 -0800 Subject: [PATCH 07/15] [Model] Update Molmo Eval to Match Official Implementation (#648) * add molmo prompts * fix lint format --- vlmeval/vlm/molmo.py | 146 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 141 insertions(+), 5 deletions(-) diff --git a/vlmeval/vlm/molmo.py b/vlmeval/vlm/molmo.py index 
fe111a06..1227995f 100644 --- a/vlmeval/vlm/molmo.py +++ b/vlmeval/vlm/molmo.py @@ -1,11 +1,33 @@ import torch from PIL import Image -import os.path as osp -import sys from .base import BaseModel from ..smp import * from ..dataset import DATASET_TYPE +TYPE_PROMPTS = { + 'Y/N':'vqa2:', + 'VQA':'vqa2:', + 'MCQ':'a_okvqa_mc:', +} + +DATASET_PROMPTS = { + 'AI2D_TEST':'ai2_diagram:', + 'AI2D_TEST_NO_MASK':'ai2_diagram:', + 'COCO_VAL':'coco_captioning:', + 'ChartQA_TEST':'chart_qa:', + 'ChartQA_VAL':'chart_qa:', + 'DocVQA_VAL':'doc_qa:', + 'DocVQA_TEST':'doc_qa:', + 'InfoVQA_TEST':'info_qa:', + 'InfoVQA_VAL':'info_qa:', + 'OCRVQA_TEST':'ocr_vqa:', + 'OCRVQA_TESTCORE':'ocr_vqa:', + 'ScienceQA_VAL':'science_qa:', + 'ScienceQA_TEST':'science_qa:', + 'TableVQABench':'tabwmp_da:', + 'TextVQA_VAL':'text_vqa:' +} + class molmo(BaseModel): @@ -36,6 +58,106 @@ def __init__(self, model_path='allenai/Molmo-7B-D-0924', **kwargs): self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) self.kwargs = kwargs self.model_name = model_path + # set default maximum number of crops to 36 + self.max_crops = kwargs.get('max_crops', 36) + + def use_custom_prompt(self, dataset): + if DATASET_TYPE(dataset) in ['Y/N', 'MCQ', 'VQA']: + return True + return False + + def build_prompt(self, line, dataset=None): + assert self.use_custom_prompt(dataset) + assert dataset is None or isinstance(dataset, str) + tgt_path = self.dump_image(line, dataset) + prefix = None + if dataset in ['MMMU_DEV_VAL', 'MMMU_TEST']: + prompt = self.build_prompt_mcq_vqa(line) + elif dataset in ['MathVista_MINI']: + prompt = self.build_prompt_mathvista(line) + elif dataset in ['AI2D_TEST', 'AI2D_TEST_NO_MASK']: + prompt = self.build_prompt_ai2d(line) + elif dataset is not None and listinstr(list(DATASET_PROMPTS.keys()), dataset): + prefix = DATASET_PROMPTS[dataset] # rest of supervised datasets are in VQA format + prompt = self.build_prompt_vqa(line, prefix) + elif dataset is not None and listinstr(['MCQ'], DATASET_TYPE(dataset)): + prompt = self.build_prompt_multiple_choice(line) + else: + prompt = self.build_prompt_vqa(line) + + message = [dict(type='text', value=prompt)] + message.extend([dict(type='image', value=s) for s in tgt_path]) + + # interleave dataset + if dataset.startswith('MMMU_'): + from .. 
import MMMUDataset + message = MMMUDataset.split_MMMU(message) + return message + + def build_prompt_mathvista(self, line): + if line['question_type'] == 'multi_choice': + prompt = self.build_prompt_multiple_choice(line) + else: + prompt = self.build_prompt_vqa(line) + return prompt + + def build_prompt_ai2d(self, line): + def option_is_abc(line): + for cand in string.ascii_uppercase: + if cand in line and not pd.isna(line[cand]): + # check if option is single letter + if not line[cand].strip().isalpha() or len(line[cand].strip()) > 1: + return False + return True + + if line['abcLabel'] and option_is_abc(line): + prompt = line['question'] + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + for key, item in options.items(): + prompt += f'\n{item}' + prompt = f"ai2_diagram_no_letter: {prompt}" + # prompt = self.build_prompt_multiple_choice(line, prefix='ai2_diagram_no_letter:') + else: + prompt = self.build_prompt_multiple_choice(line, prefix='ai2_diagram:') + return prompt + + def build_prompt_mcq_vqa(self, line): + if line['question_type'] == 'multiple-choice': + prompt = self.build_prompt_multiple_choice(line) + else: + prompt = self.build_prompt_vqa(line) + return prompt + + def build_prompt_multiple_choice(self, line, prefix=None): + question = line['question'] + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + question = hint + '\n' + question + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + for key, item in options.items(): + question += f'\n{key}: {item}' + if prefix is None: + prompt = f"{TYPE_PROMPTS['MCQ']} {question}" + else: + prompt = f"{prefix} {question}" + + return prompt + + def build_prompt_vqa(self, line, prefix=None): + question = line['question'] + if prefix is None: + prompt = f"{TYPE_PROMPTS['VQA']} {question}" + else: + prompt = f"{prefix} {question}" + return prompt def generate_inner(self, message, dataset=None): from transformers import GenerationConfig @@ -44,10 +166,15 @@ def generate_inner(self, message, dataset=None): image = Image.open(image_path) if image.mode != "RGB": image = image.convert("RGB") + # process the image and text + max_crops = self.max_crops inputs = self.processor.process( images=[image], - text=prompt + text=prompt, + images_kwargs={ + "max_crops": max_crops + } ) # move inputs to the correct device and make a batch of size 1 @@ -63,7 +190,16 @@ def generate_inner(self, message, dataset=None): # only get generated tokens; decode them to text generated_tokens = output[0, inputs['input_ids'].size(1):] - generated_text = self.processor.tokenizer.decode(generated_tokens, skip_special_tokens=True) + generated_text = self.processor.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip() + + # AI2D: map direct answer to letter option + if dataset in ['AI2D_TEST', 'AI2D_TEST_NO_MASK']: + # 'ai2_diagram_no_letter: Which of the following is the magma chamber?\nK\nB\nC\nH' + if 'ai2_diagram_no_letter' in prompt: + options = prompt.split('\n')[1:] + answer = options.index(generated_text) + generated_text = chr(answer + ord('A')) + + # print(dataset, prompt, generated_text, inputs['images'].size()) # uncomment to debug - # print the generated text return generated_text From 8c6ee87fb66f7b8fb93129dfd86830e28e8443f1 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Mon, 30 Dec 2024 07:07:47 +0000 Subject: [PATCH 08/15] [Fix] Refine Qwen-VL2 
device assignment --- vlmeval/vlm/qwen2_vl/model.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vlmeval/vlm/qwen2_vl/model.py b/vlmeval/vlm/qwen2_vl/model.py index 163581ed..0c4f2886 100644 --- a/vlmeval/vlm/qwen2_vl/model.py +++ b/vlmeval/vlm/qwen2_vl/model.py @@ -108,22 +108,22 @@ def __init__( assert max_gpu_mem > 0 # If only one process and GPU memory is less than 40GB - if auto_split_flag(): + if '72b' in self.model_path.lower(): + self.model = Qwen2VLForConditionalGeneration.from_pretrained( + model_path, torch_dtype='auto', device_map=split_model(), attn_implementation='flash_attention_2' + ) + self.model.eval() + elif auto_split_flag(): assert world_size == 1, 'Only support world_size == 1 when AUTO_SPLIT is set for non-72B Qwen2-VL' # Will Use All GPUs to run one model self.model = Qwen2VLForConditionalGeneration.from_pretrained( model_path, torch_dtype='auto', device_map='auto', attn_implementation='flash_attention_2' ) - elif '72b' not in self.model_path.lower(): + else: self.model = Qwen2VLForConditionalGeneration.from_pretrained( model_path, torch_dtype='auto', device_map='cpu', attn_implementation='flash_attention_2' ) self.model.cuda().eval() - else: - self.model = Qwen2VLForConditionalGeneration.from_pretrained( - model_path, torch_dtype='auto', device_map=split_model(), attn_implementation='flash_attention_2' - ) - self.model.eval() torch.cuda.empty_cache() From 624c12714711eca157a652cf80df0f619dd0a06f Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Mon, 30 Dec 2024 07:40:01 +0000 Subject: [PATCH 09/15] [Fix] Fix RealWorldQA md5 --- vlmeval/dataset/image_mcq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py index 1a70304a..0b4de206 100644 --- a/vlmeval/dataset/image_mcq.py +++ b/vlmeval/dataset/image_mcq.py @@ -143,7 +143,7 @@ class ImageMCQDataset(ImageBaseDataset): 'AI2D_TEST': '0f593e0d1c7df9a3d69bf1f947e71975', 'AI2D_TEST_NO_MASK': 'fd8f463634d4fe9fbd23b876e8eea5be', 'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e', - 'RealWorldQA': '92321028d2bc29040284b6674721e48f', + 'RealWorldQA': '4de008f55dc4fd008ca9e15321dc44b7', 'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f', 'BLINK': '3b6649b6a662184ea046908e5506260e', 'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889', From b66d47fafbaffc55a960a588b0c02a6b87d1dbae Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Mon, 30 Dec 2024 08:14:12 +0000 Subject: [PATCH 10/15] update MMMU_DEV_VAL tsv --- vlmeval/dataset/image_mcq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py index 0b4de206..145209dd 100644 --- a/vlmeval/dataset/image_mcq.py +++ b/vlmeval/dataset/image_mcq.py @@ -286,7 +286,7 @@ class MMMUDataset(ImageMCQDataset): } DATASET_MD5 = { - 'MMMU_DEV_VAL': '521afc0f3bf341e6654327792781644d', + 'MMMU_DEV_VAL': '585e8ad75e73f75dcad265dfd0417d64', 'MMMU_TEST': 'c19875d11a2d348d07e5eb4bdf33166d', } From e5409529d34facc0f758440905a3be2b357a4a1e Mon Sep 17 00:00:00 2001 From: CMeteor Date: Tue, 31 Dec 2024 11:50:27 +0800 Subject: [PATCH 11/15] [Fix] Fix confusing image width&height (#704) Co-authored-by: Yuan Ye --- vlmeval/api/siliconflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vlmeval/api/siliconflow.py b/vlmeval/api/siliconflow.py index 6fea69e2..eeba793f 100644 --- a/vlmeval/api/siliconflow.py +++ b/vlmeval/api/siliconflow.py @@ -26,7 +26,7 @@ def resize_image(image: Image.Image, 
max_height: int, max_width: int) -> Image.I def encode_image(path: str, max_height: int = 1024, max_width: int = 1024) -> str: image = Image.open(path).convert("RGB") image = resize_image(image, max_height, max_width) - height, width = image.size + width, height = image.size if min(height, width) < 50: scale = 50 / min(width, height) image = image.resize((int(width * scale), int(height * scale))) From 369169800de7f4694453b1c75e8e96d5c7097b8b Mon Sep 17 00:00:00 2001 From: Guowei Xu <113534787+XuGW-Kevin@users.noreply.github.com> Date: Tue, 31 Dec 2024 11:51:05 +0800 Subject: [PATCH 12/15] Update llama_vision.py (#705) --- vlmeval/vlm/llama_vision.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vlmeval/vlm/llama_vision.py b/vlmeval/vlm/llama_vision.py index 9abbcfd9..649f963f 100644 --- a/vlmeval/vlm/llama_vision.py +++ b/vlmeval/vlm/llama_vision.py @@ -79,7 +79,7 @@ def __init__(self, model_path='meta-llama/Llama-3.2-11B-Vision-Instruct', **kwar self.device = 'cuda' self.processor = AutoProcessor.from_pretrained(model_path) - if 'Instruct' in model_path: + if 'Instruct' in model_path or 'cot' in model_path or 'CoT' in model_path: kwargs_default = dict(do_sample=True, temperature=0.6, top_p=0.9) else: kwargs_default = dict(do_sample=False, max_new_tokens=512, temperature=0.0, top_p=None, num_beams=1) @@ -200,5 +200,7 @@ def generate_inner(self, message, dataset=None): self.kwargs['max_new_tokens'] = 128 else: self.kwargs['max_new_tokens'] = 512 + if "cot" in self.model_name or "CoT" in self.model_name: + self.kwargs['max_new_tokens'] = 2048 output = self.model.generate(**inputs, **self.kwargs) return self.processor.decode(output[0][inputs['input_ids'].shape[1]:]).replace('<|eot_id|>', '') From 3b5d93f944ed7b33abd9f9c90353a00b9f58112f Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Tue, 31 Dec 2024 03:58:45 +0000 Subject: [PATCH 13/15] [Fix] Fix Lint --- vlmeval/dataset/utils/ccocr_evaluator/README.md | 8 ++++---- vlmeval/dataset/utils/ccocr_evaluator/__init__.py | 2 +- vlmeval/dataset/utils/ccocr_evaluator/common.py | 12 ++++++------ vlmeval/vlm/valley/valley_eagle_chat.py | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/vlmeval/dataset/utils/ccocr_evaluator/README.md b/vlmeval/dataset/utils/ccocr_evaluator/README.md index a636aa4a..99572ef5 100644 --- a/vlmeval/dataset/utils/ccocr_evaluator/README.md +++ b/vlmeval/dataset/utils/ccocr_evaluator/README.md @@ -1,12 +1,12 @@ # CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy ## Introduction - + Please refer to our [GitHub](https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/Benchmarks/CC-OCR) for more information. ## Running Scripts -Once the environment is ready, execute the following script from the root directory of VLMEvalKit +Once the environment is ready, execute the following script from the root directory of VLMEvalKit to perform inference and evaluation tasks in batch. ```shell @@ -44,13 +44,13 @@ If you find our work helpful, feel free to give us a cite. 
``` @misc{yang2024ccocr, - title={CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy}, + title={CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy}, author={Zhibo Yang and Jun Tang and Zhaohai Li and Pengfei Wang and Jianqiang Wan and Humen Zhong and Xuejing Liu and Mingkun Yang and Peng Wang and Shuai Bai and LianWen Jin and Junyang Lin}, year={2024}, eprint={2412.02210}, archivePrefix={arXiv}, primaryClass={cs.CV}, - url={https://arxiv.org/abs/2412.02210}, + url={https://arxiv.org/abs/2412.02210}, } ``` diff --git a/vlmeval/dataset/utils/ccocr_evaluator/__init__.py b/vlmeval/dataset/utils/ccocr_evaluator/__init__.py index cd3ff5ac..d89f6f6b 100644 --- a/vlmeval/dataset/utils/ccocr_evaluator/__init__.py +++ b/vlmeval/dataset/utils/ccocr_evaluator/__init__.py @@ -9,4 +9,4 @@ "doc_parsing": ParsingEvaluator("doc_parsing"), "multi_lan_ocr": OcrEvaluator("multi_lan_ocr"), "multi_scene_ocr": OcrEvaluator("multi_scene_ocr") -} \ No newline at end of file +} diff --git a/vlmeval/dataset/utils/ccocr_evaluator/common.py b/vlmeval/dataset/utils/ccocr_evaluator/common.py index a3e80349..6ce9bcb5 100644 --- a/vlmeval/dataset/utils/ccocr_evaluator/common.py +++ b/vlmeval/dataset/utils/ccocr_evaluator/common.py @@ -26,7 +26,7 @@ def pick_response_text(json_path): response_text = None if model_name.startswith("gpt") or model_name.startswith("o1"): - response_text = model_response.get("data", {}).get("response", {}).get("choices", [{}])[0].get("message", {}).get("content", None) + response_text = model_response.get("data", {}).get("response", {}).get("choices", [{}])[0].get("message", {}).get("content", None) # noqa: E501 elif model_name.startswith("local_"): response_text = model_response else: @@ -35,7 +35,7 @@ def pick_response_text(json_path): elif model_name.startswith("gemini"): content_list = model_response.get("candidates", [{}])[0].get("content", {}).get("parts", None) elif model_name.startswith("qwen"): - content_list = model_response.get("output", {}).get("choices", [{}])[0].get("message", {}).get("content", None) + content_list = model_response.get("output", {}).get("choices", [{}])[0].get("message", {}).get("content", None) # noqa: E501 else: raise NotImplementedError("The pick_response_text NOT implemented for model: {}".format(model_name)) @@ -115,7 +115,7 @@ def __call__(self, pdt_res_dir, gt_info, with_response_ratio=True, **kwargs): # add response_success_ratio if "summary" in eval_info and with_response_ratio: success_ratio = (len(response_info) + len(post_error_list)) / (len(gt_info) + 1e-9) - eval_info["summary"].update({"response_success_ratio": success_ratio }) + eval_info["summary"].update({"response_success_ratio": success_ratio}) return meta_info, eval_info @@ -149,9 +149,9 @@ def summary_multi_exp(exp_dir_base, dataset_list=None, is_weighted_sum=False): data_status_info = json.load(f) all_dataset_name.extend(data_status_info.keys()) dataset_list = sorted(set(all_dataset_name)) - + # summary main code - all_evaluate_info, line_index = {}, 0 + all_evaluate_info, _ = {}, 0 for exp_name in os.listdir(exp_dir_base): dir_status_path = os.path.join(exp_dir_base, exp_name, "status.json") if not os.path.exists(dir_status_path): @@ -219,4 +219,4 @@ def summary_multi_exp(exp_dir_base, dataset_list=None, is_weighted_sum=False): summary_path = summary_multi_exp(exp_base_dir, dataset_list=None, is_weighted_sum=False) print("--> info: summary saved at : {}".format(summary_path)) - 
print("happy coding.") \ No newline at end of file + print("happy coding.") diff --git a/vlmeval/vlm/valley/valley_eagle_chat.py b/vlmeval/vlm/valley/valley_eagle_chat.py index 4c82ccc9..374fad56 100644 --- a/vlmeval/vlm/valley/valley_eagle_chat.py +++ b/vlmeval/vlm/valley/valley_eagle_chat.py @@ -124,7 +124,7 @@ def __init__(self, from .valley_eagle.util.mm_utils import process_anyres_image from .valley_eagle import conversation as conversation_lib from .valley_eagle.util.data_util import dynamic_preprocess, preprocess - + torch_dtype = torch.float16 padding_side = 'left' use_fast = True From 1bce5c7fe6ac79de940071bf5bf3c6d1ce5ea9ad Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Wed, 1 Jan 2025 14:21:45 +0000 Subject: [PATCH 14/15] Fix Lint --- .../ccocr_evaluator/doc_parsing_evaluator.py | 2 +- .../utils/ccocr_evaluator/kie_evaluator.py | 61 ++++++++----------- 2 files changed, 26 insertions(+), 37 deletions(-) diff --git a/vlmeval/dataset/utils/ccocr_evaluator/doc_parsing_evaluator.py b/vlmeval/dataset/utils/ccocr_evaluator/doc_parsing_evaluator.py index 382c4be8..d059adc0 100644 --- a/vlmeval/dataset/utils/ccocr_evaluator/doc_parsing_evaluator.py +++ b/vlmeval/dataset/utils/ccocr_evaluator/doc_parsing_evaluator.py @@ -241,7 +241,7 @@ def eval_formula(self, response_info, gt_info, op_name='formula'): pred = response_info[img_name] if op_name == 'formula': - pred = pred.replace("\n", " ").replace("```latex", "").replace("```", "").replace("\t", " ").replace(" ", "") + pred = pred.replace("\n", " ").replace("```latex", "").replace("```", "").replace("\t", " ").replace(" ", "") # noqa: E501 gt = gt.replace(" ", "") elif op_name == 'molecular': pred = pred.replace("\n", "").replace(" ", "").replace("", "").replace("", "") diff --git a/vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py b/vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py index 2613a338..797d4244 100644 --- a/vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py +++ b/vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py @@ -66,11 +66,11 @@ def update_cost(node1: Node, node2: Node): label2 = node2.label label1_leaf = "" in label1 label2_leaf = "" in label2 - if label1_leaf == True and label2_leaf == True: + if label1_leaf and label2_leaf: return edit_distance(label1.replace("", ""), label2.replace("", "")) - elif label1_leaf == False and label2_leaf == True: + elif not label1_leaf and label2_leaf: return 1 + len(label2.replace("", "")) - elif label1_leaf == True and label2_leaf == False: + elif label1_leaf and not label2_leaf: return 1 + len(label1.replace("", "")) else: return int(label1 != label2) @@ -121,7 +121,8 @@ def normalize_dict(data: Union[Dict, List, Any]): def cal_f1_all(preds, answers): """ - Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives, false negatives and false positives + Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives, + false negatives and false positives """ metric_info, error_info = {}, {} total_tp, total_fn_or_fp = 0, 0 @@ -233,35 +234,28 @@ def cal_acc(pred: dict, answer: dict): """ pred = construct_tree_from_dict(normalize_dict(pred)) answer = construct_tree_from_dict(normalize_dict(answer)) - return max( - 0, - 1 - - ( - zss.distance( - pred, - answer, - get_children=zss.Node.get_children, - insert_cost=insert_and_remove_cost, - remove_cost=insert_and_remove_cost, - update_cost=update_cost, - return_operations=False, - ) - / zss.distance( - construct_tree_from_dict(normalize_dict({})), - 
answer, - get_children=zss.Node.get_children, - insert_cost=insert_and_remove_cost, - remove_cost=insert_and_remove_cost, - update_cost=update_cost, - return_operations=False, - ) - ), + val1 = zss.distance( + pred, + answer, + get_children=zss.Node.get_children, + insert_cost=insert_and_remove_cost, + remove_cost=insert_and_remove_cost, + update_cost=update_cost, + return_operations=False, + ) + val2 = zss.distance( + construct_tree_from_dict(normalize_dict({})), + answer, + get_children=zss.Node.get_children, + insert_cost=insert_and_remove_cost, + remove_cost=insert_and_remove_cost, + update_cost=update_cost, + return_operations=False, ) + return max(0, 1 - val1 / val2) def cal_acc_all(pred_info, answer_info): - """ - """ acc_info, error_info = {}, {} for file_name, answer in answer_info.items(): # if file_name not in pred_info: @@ -303,13 +297,11 @@ def eval_donut(pdt_info, gt_info, normalize_func=None, data_name=None): acc_average, acc_error_info = cal_acc_all(pdt_info, gt_info) eval_info = {"f1_score": f1_score, "acc": acc_average, "class_f1_score": class_eval_info, "f1_error_info": error_info, "acc_error_info": acc_error_info} - print(data_name, "f1_score", f1_score, "acc", acc_average) + print(data_name, "f1_score", f1_score, "acc", acc_average) return eval_info def post_process_to_json(qwen_info_str, file_name=None): - """ - """ try: if "```json" in qwen_info_str: if "```" not in qwen_info_str: @@ -320,10 +312,7 @@ def post_process_to_json(qwen_info_str, file_name=None): json_str = qwen_info_str.strip().replace("\n", "") json_data = json.loads(json_str) return json_data - except Exception as e: - # print("--> post error: {}, file_name: {}".format(e, file_name)) - # print("json_raw", qwen_info_str) - # print("json_str", json_str) + except Exception as err: # noqa: F841 return None From c222e2fef80df7279cf31f9431cc373cb05580f8 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Wed, 1 Jan 2025 14:24:14 +0000 Subject: [PATCH 15/15] Fix Lint --- run.py | 2 +- vlmeval/dataset/utils/ccocr_evaluator/ocr_evaluator.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/run.py b/run.py index 53a90735..bee2c23b 100644 --- a/run.py +++ b/run.py @@ -322,7 +322,7 @@ def main(): judge_kwargs['model'] = 'chatgpt-0125' elif listinstr(['MMVet', 'LLaVABench', 'MMBench-Video'], dataset_name): judge_kwargs['model'] = 'gpt-4-turbo' - elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath', 'VL-RewardBench'], dataset_name): + elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath', 'VL-RewardBench'], dataset_name): # noqa: E501 judge_kwargs['model'] = 'gpt-4o-mini' elif listinstr(['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench', 'WildVision'], dataset_name): # noqa: E501 judge_kwargs['model'] = 'gpt-4o' diff --git a/vlmeval/dataset/utils/ccocr_evaluator/ocr_evaluator.py b/vlmeval/dataset/utils/ccocr_evaluator/ocr_evaluator.py index e33b95c6..5c82abcc 100644 --- a/vlmeval/dataset/utils/ccocr_evaluator/ocr_evaluator.py +++ b/vlmeval/dataset/utils/ccocr_evaluator/ocr_evaluator.py @@ -92,8 +92,10 @@ def evaluate(self, response_info, gt_info, **kwargs): image_pdt_info, image_gt_info = {}, {} for file_name, gt_src in gt_info.items(): pred_src = response_info.get(file_name, "") - pdt_token_list = text_normalize_and_tokenize(str(pred_src).strip(), is_word_level, is_lower, is_alphanum_only) - gt_token_list = text_normalize_and_tokenize(str(gt_src).strip(), is_word_level, is_lower, is_alphanum_only) + pdt_token_list = text_normalize_and_tokenize( + 
str(pred_src).strip(), is_word_level, is_lower, is_alphanum_only) + gt_token_list = text_normalize_and_tokenize( + str(gt_src).strip(), is_word_level, is_lower, is_alphanum_only) image_pdt_info[file_name] = pdt_token_list image_gt_info[file_name] = gt_token_list eval_result = calculate_metrics(image_pdt_info, image_gt_info, is_verbose=False)
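
---

Closing note (not part of the patch series above): a brief, hedged sketch of how the `VLRewardBench` dataset class introduced in PATCH 01 is expected to be driven. In normal use `run.py --data VL-RewardBench --model <vlm>` runs inference first and then calls `evaluate` with the judge selected in `run.py` (`gpt-4o-mini`), which needs a working OpenAI-compatible API key; the prediction-file path and VLM name below are placeholders, not files produced by this patch. Note also that `build_prompt` swaps the order of the two candidate answers based on the parity of their combined length (`sum(len(res)) % 2`), a deterministic stand-in for random shuffling that mitigates position bias while keeping runs reproducible.

```python
# Illustrative sketch only; the output path and 'MyVLM' tag are hypothetical.
from vlmeval.dataset import VLRewardBench

# Load the benchmark (downloads the TSV listed in DATASET_URL on first use).
dataset = VLRewardBench('VL-RewardBench')

# Build the multimodal judging prompt for the first record:
# image message(s) + PROMPT_TEMPLATE filled with the two candidate answers.
msgs = dataset.build_prompt(0)

# After a VLM has written its pairwise judgements to an eval file,
# score them with an LLM judge (extra kwargs are forwarded to build_judge).
scores = VLRewardBench.evaluate(
    'outputs/MyVLM/MyVLM_VL-RewardBench.xlsx',  # hypothetical prediction file
    model='gpt-4o-mini',                        # judge model, as wired up in run.py
    nproc=4,
)
print(scores)  # per-category accuracy, 'Macro Accuracy', 'Overall Consistency'
```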