From 5316c5f1b9ee3cec99a9ebcbf7f9684e9487b83b Mon Sep 17 00:00:00 2001
From: BaiqiLi <165443786+Baiqi-Li@users.noreply.github.com>
Date: Tue, 17 Dec 2024 14:24:32 +0800
Subject: [PATCH] [Added/FIxed] added info of NaturalBench in README.md and
 fixed the metric names for NaturalBench Dataset (#660)

* [Fixed] the metric name in NaturalBench

* [Added] info of NaturalBench in README.md

---------

Co-authored-by: Haodong Duan
---
 README.md                             |  4 +--
 vlmeval/dataset/utils/naturalbench.py | 40 +++++++++++++--------------
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 15ba1bcd7..3086d562c 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ English | [简体中文](/docs/zh-CN/README_zh-CN.md) | [日本語](/docs/ja/REA

 > We have presented a [**comprehensive survey**](https://arxiv.org/pdf/2411.15296) on the evaluation of large multi-modality models, jointly with [**MME Team**](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models) and [**LMMs-Lab**](https://lmms-lab.github.io) 🔥🔥🔥

+- **[2024-12-11]** Supported **[NaturalBench](https://huggingface.co/datasets/BaiqiL/NaturalBench)**, a vision-centric VQA benchmark (NeurIPS'24) that challenges vision-language models with simple questions about natural imagery.
 - **[2024-12-02]** Supported [VisOnlyQA](https://github.com/psunlpgroup/VisOnlyQA/), a benchmark for evaluating the visual perception capabilities 🔥🔥🔥
 - **[2024-11-26]** Supported [Ovis1.6-Gemma2-27B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-27B), thanks to **[runninglsy](https://github.com/runninglsy)** 🔥🔥🔥
 - **[2024-11-25]** Create a new flag `VLMEVALKIT_USE_MODELSCOPE`. By setting this environment variable, you can download the video benchmarks supported from **[modelscope](https://www.modelscope.cn)** 🔥🔥🔥
@@ -37,9 +38,6 @@ English | [简体中文](/docs/zh-CN/README_zh-CN.md) | [日本語](/docs/ja/REA
 - **[2024-11-21]** Integrated a new config system to enable more flexible evaluation settings. Check the [Document](/docs/en/ConfigSystem.md) or run `python run.py --help` for more details 🔥🔥🔥
 - **[2024-11-21]** Supported **[QSpatial](https://andrewliao11.github.io/spatial_prompt/)**, a multimodal benchmark for Quantitative Spatial Reasoning (determine the size / distance, e.g.), thanks **[andrewliao11](https://github.com/andrewliao11)** for providing the official support 🔥🔥🔥
 - **[2024-11-21]** Supported **[MM-Math](https://github.com/kge-sun/mm-math)**, a new multimodal math benchmark comprising of ~6K middle school multi-modal reasoning math problems. GPT-4o-20240806 achieces 22.5% accuracy on this benchmark 🔥🔥🔥
-- **[2024-11-16]** Supported **[OlympiadBench](https://github.com/OpenBMB/OlympiadBench)**, a new multimodal benchmark comprising olympiad-level math and physics questions 🔥🔥🔥
-- **[2024-11-16]** Supported **[WildVision](https://huggingface.co/datasets/WildVision/wildvision-bench)**, a new subjective multimodal benchmark derived from multi-modal arena data 🔥🔥🔥
-- **[2024-11-13]** Supported **[MIA-Bench](https://arxiv.org/abs/2407.01509)**, a multimodal instruction-following benchmark 🔥🔥🔥

 ## 🏗️ QuickStart

diff --git a/vlmeval/dataset/utils/naturalbench.py b/vlmeval/dataset/utils/naturalbench.py
index bd34cab61..ed9a9576d 100644
--- a/vlmeval/dataset/utils/naturalbench.py
+++ b/vlmeval/dataset/utils/naturalbench.py
@@ -60,15 +60,15 @@ def get_scores(scores):

     Returns:
         dict: A dictionary containing the calculated scores:
-            - 'question_score': Average question score
-            - 'image_score': Average image score
-            - 'binary_score': Average binary VQA score
-            - 'group_score': Average group score
+            - 'Q_Acc': Average question score
+            - 'I_Acc': Average image score
+            - 'Acc': Average binary VQA score
+            - 'G_Acc': Average group score
     """
-    question_score = 0.0
-    image_score = 0.0
-    binary_score = 0.0
-    group = 0.0
+    Q_Acc = 0.0
+    I_Acc = 0.0
+    Acc = 0.0
+    G_Acc = 0.0

     num_samples = len(scores)

@@ -124,22 +124,22 @@ def calculate_group(result):

     if isinstance(scores, dict):
         for _, result in scores.items():
-            question_score += calculate_question_score(result)
-            image_score += calculate_image_score(result)
-            binary_score += calculate_binary_score(result)
-            group += calculate_group(result)
+            Q_Acc += calculate_question_score(result)
+            I_Acc += calculate_image_score(result)
+            Acc += calculate_binary_score(result)
+            G_Acc += calculate_group(result)
     else:
         for result in scores:
-            question_score += calculate_question_score(result)
-            image_score += calculate_image_score(result)
-            binary_score += calculate_binary_score(result)
-            group += calculate_group(result)
+            Q_Acc += calculate_question_score(result)
+            I_Acc += calculate_image_score(result)
+            Acc += calculate_binary_score(result)
+            G_Acc += calculate_group(result)

     results = {
-        'question_score': question_score / float(num_samples * 2),
-        'image_score': image_score / float(num_samples * 2),
-        'binary_score': binary_score / float(num_samples * 4),
-        'group_score': group / num_samples
+        'Q_Acc': Q_Acc / float(num_samples * 2),
+        'I_Acc': I_Acc / float(num_samples * 2),
+        'Acc': Acc / float(num_samples * 4),
+        'G_Acc': G_Acc / num_samples
     }

     return results
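For reference, the renamed keys follow the NaturalBench grouping in which each sample pairs two questions with two images (four question-image pairs), which is why `Q_Acc` and `I_Acc` are averaged over `2 * num_samples`, `Acc` over `4 * num_samples`, and `G_Acc` over `num_samples`. The sketch below is not the repository's `get_scores` (whose `calculate_*` helpers are outside this patch); it assumes each sample has already been reduced to a hypothetical 2×2 boolean matrix `correct[q][i]` and only illustrates how the four accuracies relate under the standard NaturalBench definitions.

```python
from typing import List


def naturalbench_metrics(samples: List[List[List[bool]]]) -> dict:
    """Compute the four NaturalBench-style accuracies from 2x2 correctness matrices."""
    q_acc = i_acc = acc = g_acc = 0.0
    for correct in samples:  # correct[q][i]: question q answered correctly on image i
        # Q_Acc: a question only counts if it is answered correctly on both images
        q_acc += sum(correct[q][0] and correct[q][1] for q in range(2))
        # I_Acc: an image only counts if both questions are answered correctly on it
        i_acc += sum(correct[0][i] and correct[1][i] for i in range(2))
        # Acc: plain accuracy over the four (question, image) pairs
        acc += sum(correct[q][i] for q in range(2) for i in range(2))
        # G_Acc: the whole group only counts if all four answers are correct
        g_acc += all(correct[q][i] for q in range(2) for i in range(2))
    n = len(samples)
    return {
        'Q_Acc': q_acc / (n * 2),
        'I_Acc': i_acc / (n * 2),
        'Acc': acc / (n * 4),
        'G_Acc': g_acc / n,
    }


# One fully correct sample and one sample with a single wrong pair:
print(naturalbench_metrics([
    [[True, True], [True, True]],
    [[True, True], [True, False]],
]))
# {'Q_Acc': 0.75, 'I_Acc': 0.75, 'Acc': 0.875, 'G_Acc': 0.5}
```

The example shows why `G_Acc` is the strictest metric: a single wrong answer zeroes the whole group, while `Acc` only loses one of four points.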