From 5316c5f1b9ee3cec99a9ebcbf7f9684e9487b83b Mon Sep 17 00:00:00 2001
From: BaiqiLi <165443786+Baiqi-Li@users.noreply.github.com>
Date: Tue, 17 Dec 2024 14:24:32 +0800
Subject: [PATCH] [Added/FIxed] added info of NaturalBench in README.md and
 fixed the metric names for NaturalBench Dataset (#660)

* [Fixed] the metric name in NaturalBench

* [Added] info of NaturalBench in README.md

---------

Co-authored-by: Haodong Duan
---
 README.md                             |  4 +--
 vlmeval/dataset/utils/naturalbench.py | 40 +++++++++++++--------------
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 15ba1bcd7..3086d562c 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ English | [简体中文](/docs/zh-CN/README_zh-CN.md) | [日本語](/docs/ja/REA

 > We have presented a [**comprehensive survey**](https://arxiv.org/pdf/2411.15296) on the evaluation of large multi-modality models, jointly with [**MME Team**](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models) and [**LMMs-Lab**](https://lmms-lab.github.io) 🔥🔥🔥

+- **[2024-12-11]** Supported **[NaturalBench](https://huggingface.co/datasets/BaiqiL/NaturalBench)**, a vision-centric VQA benchmark (NeurIPS'24) that challenges vision-language models with simple questions about natural imagery.
 - **[2024-12-02]** Supported [VisOnlyQA](https://github.com/psunlpgroup/VisOnlyQA/), a benchmark for evaluating the visual perception capabilities 🔥🔥🔥
 - **[2024-11-26]** Supported [Ovis1.6-Gemma2-27B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-27B), thanks to **[runninglsy](https://github.com/runninglsy)** 🔥🔥🔥
 - **[2024-11-25]** Create a new flag `VLMEVALKIT_USE_MODELSCOPE`. By setting this environment variable, you can download the video benchmarks supported from **[modelscope](https://www.modelscope.cn)** 🔥🔥🔥
@@ -37,9 +38,6 @@ English | [简体中文](/docs/zh-CN/README_zh-CN.md) | [日本語](/docs/ja/REA
 - **[2024-11-21]** Integrated a new config system to enable more flexible evaluation settings. Check the [Document](/docs/en/ConfigSystem.md) or run `python run.py --help` for more details 🔥🔥🔥
 - **[2024-11-21]** Supported **[QSpatial](https://andrewliao11.github.io/spatial_prompt/)**, a multimodal benchmark for Quantitative Spatial Reasoning (determine the size / distance, e.g.), thanks **[andrewliao11](https://github.com/andrewliao11)** for providing the official support 🔥🔥🔥
 - **[2024-11-21]** Supported **[MM-Math](https://github.com/kge-sun/mm-math)**, a new multimodal math benchmark comprising of ~6K middle school multi-modal reasoning math problems. GPT-4o-20240806 achieces 22.5% accuracy on this benchmark 🔥🔥🔥
-- **[2024-11-16]** Supported **[OlympiadBench](https://github.com/OpenBMB/OlympiadBench)**, a new multimodal benchmark comprising olympiad-level math and physics questions 🔥🔥🔥
-- **[2024-11-16]** Supported **[WildVision](https://huggingface.co/datasets/WildVision/wildvision-bench)**, a new subjective multimodal benchmark derived from multi-modal arena data 🔥🔥🔥
-- **[2024-11-13]** Supported **[MIA-Bench](https://arxiv.org/abs/2407.01509)**, a multimodal instruction-following benchmark 🔥🔥🔥

 ## 🏗️ QuickStart

diff --git a/vlmeval/dataset/utils/naturalbench.py b/vlmeval/dataset/utils/naturalbench.py
index bd34cab61..ed9a9576d 100644
--- a/vlmeval/dataset/utils/naturalbench.py
+++ b/vlmeval/dataset/utils/naturalbench.py
@@ -60,15 +60,15 @@ def get_scores(scores):

     Returns:
         dict: A dictionary containing the calculated scores:
-            - 'question_score': Average question score
-            - 'image_score': Average image score
-            - 'binary_score': Average binary VQA score
-            - 'group_score': Average group score
+            - 'Q_Acc': Average question score
+            - 'I_Acc': Average image score
+            - 'Acc': Average binary VQA score
+            - 'G_Acc': Average group score
     """
-    question_score = 0.0
-    image_score = 0.0
-    binary_score = 0.0
-    group = 0.0
+    Q_Acc = 0.0
+    I_Acc = 0.0
+    Acc = 0.0
+    G_Acc = 0.0

     num_samples = len(scores)

@@ -124,22 +124,22 @@ def calculate_group(result):

     if isinstance(scores, dict):
         for _, result in scores.items():
-            question_score += calculate_question_score(result)
-            image_score += calculate_image_score(result)
-            binary_score += calculate_binary_score(result)
-            group += calculate_group(result)
+            Q_Acc += calculate_question_score(result)
+            I_Acc += calculate_image_score(result)
+            Acc += calculate_binary_score(result)
+            G_Acc += calculate_group(result)
     else:
         for result in scores:
-            question_score += calculate_question_score(result)
-            image_score += calculate_image_score(result)
-            binary_score += calculate_binary_score(result)
-            group += calculate_group(result)
+            Q_Acc += calculate_question_score(result)
+            I_Acc += calculate_image_score(result)
+            Acc += calculate_binary_score(result)
+            G_Acc += calculate_group(result)

     results = {
-        'question_score': question_score / float(num_samples * 2),
-        'image_score': image_score / float(num_samples * 2),
-        'binary_score': binary_score / float(num_samples * 4),
-        'group_score': group / num_samples
+        'Q_Acc': Q_Acc / float(num_samples * 2),
+        'I_Acc': I_Acc / float(num_samples * 2),
+        'Acc': Acc / float(num_samples * 4),
+        'G_Acc': G_Acc / num_samples
     }

     return results
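For reference, the renamed keys follow the NaturalBench grouping in which each sample pairs two questions with two images (four question-image pairs), which is why `Q_Acc` and `I_Acc` are averaged over `2 * num_samples`, `Acc` over `4 * num_samples`, and `G_Acc` over `num_samples`. The sketch below is not the repository's `get_scores` (whose `calculate_*` helpers are outside this patch); it assumes each sample has already been reduced to a hypothetical 2×2 boolean matrix `correct[q][i]` and only illustrates how the four accuracies relate under the standard NaturalBench definitions.

```python
from typing import List


def naturalbench_metrics(samples: List[List[List[bool]]]) -> dict:
    """Compute the four NaturalBench-style accuracies from 2x2 correctness matrices."""
    q_acc = i_acc = acc = g_acc = 0.0
    for correct in samples:  # correct[q][i]: question q answered correctly on image i
        # Q_Acc: a question only counts if it is answered correctly on both images
        q_acc += sum(correct[q][0] and correct[q][1] for q in range(2))
        # I_Acc: an image only counts if both questions are answered correctly on it
        i_acc += sum(correct[0][i] and correct[1][i] for i in range(2))
        # Acc: plain accuracy over the four (question, image) pairs
        acc += sum(correct[q][i] for q in range(2) for i in range(2))
        # G_Acc: the whole group only counts if all four answers are correct
        g_acc += all(correct[q][i] for q in range(2) for i in range(2))
    n = len(samples)
    return {
        'Q_Acc': q_acc / (n * 2),
        'I_Acc': i_acc / (n * 2),
        'Acc': acc / (n * 4),
        'G_Acc': g_acc / n,
    }


# One fully correct sample and one sample with a single wrong pair:
print(naturalbench_metrics([
    [[True, True], [True, True]],
    [[True, True], [True, False]],
]))
# {'Q_Acc': 0.75, 'I_Acc': 0.75, 'Acc': 0.875, 'G_Acc': 0.5}
```

The example shows why `G_Acc` is the strictest metric: a single wrong answer zeroes the whole group, while `Acc` only loses one of four points.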