
Commit

[Benchmark] The CC-OCR benchmark applies to merge into the official VLMEvalKit (#698)

* add kie task config.

* add ccocr_evaluator

* add readme for ccocr_evaluator

* update url

* update README.md

* update readme

* update hf url

---------

Co-authored-by: 松灵 <[email protected]>
wulipc and 松灵 authored Dec 29, 2024
1 parent 4bea02b commit 14385c5
Showing 10 changed files with 1,245 additions and 4 deletions.
2 changes: 1 addition & 1 deletion run.py
@@ -398,7 +398,7 @@ def main():

# Create the symbolic links for the prediction files
files = os.listdir(pred_root)
files = [x for x in files if f'{model_name}_{dataset_name}' in x]
files = [x for x in files if (f'{model_name}_{dataset_name}' in x or "status.json" in x)]
for f in files:
cwd = os.getcwd()
file_addr = osp.join(cwd, pred_root, f)
3 changes: 2 additions & 1 deletion vlmeval/dataset/__init__.py
@@ -13,6 +13,7 @@
    CustomVQADataset, CRPE, MathVerse, OlympiadBench, QSpatial, VizWiz, MMNIAH
)

from .image_ccocr import CCOCRDataset
from .text_mcq import CustomTextMCQDataset, TextMCQDataset

from .vcr import VCRDataset
@@ -128,7 +129,7 @@ def evaluate(self, eval_file, **judge_kwargs):
IMAGE_DATASET = [
    ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset, MathVision,
    MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
    MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset,
    MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset, CCOCRDataset,
    GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset,
    MIABench, OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH,
    CMMMU
190 changes: 190 additions & 0 deletions vlmeval/dataset/image_ccocr.py

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions vlmeval/dataset/utils/__init__.py
@@ -1,9 +1,10 @@
from .judge_util import build_judge, DEBUG_MESSAGE
from .multiple_choice import extract_answer_from_item, prefetch_answer
from .vqa_eval import levenshtein_distance
from .ccocr_evaluator import evaluator_map_info as ccocr_evaluator_map


__all__ = [
    'build_judge', 'extract_answer_from_item', 'prefetch_answer',
    'levenshtein_distance', 'DEBUG_MESSAGE'
    'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'ccocr_evaluator_map',
    'levenshtein_distance', 'DEBUG_MESSAGE',
]
59 changes: 59 additions & 0 deletions vlmeval/dataset/utils/ccocr_evaluator/README.md
@@ -0,0 +1,59 @@
# CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy

## Introduction

Please refer to our [GitHub](https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/Benchmarks/CC-OCR) for more information.

## Running Scripts

Once the environment is ready, execute the following script from the root directory of VLMEvalKit
to perform inference and evaluation tasks in batch.

```shell
MODEL_NAME="QwenVLMax"
OUTPUT_DIR="/your/path/to/output_dir"

SUB_OUTPUT_DIR=${OUTPUT_DIR}/multi_scene_ocr
python run.py --data CCOCR_MultiSceneOcr_Cord CCOCR_MultiSceneOcr_Funsd CCOCR_MultiSceneOcr_Iam CCOCR_MultiSceneOcr_ZhDoc CCOCR_MultiSceneOcr_ZhHandwriting CCOCR_MultiSceneOcr_Hieragent CCOCR_MultiSceneOcr_Ic15 CCOCR_MultiSceneOcr_Inversetext CCOCR_MultiSceneOcr_Totaltext CCOCR_MultiSceneOcr_ZhScene CCOCR_MultiSceneOcr_UgcLaion CCOCR_MultiSceneOcr_ZhDense CCOCR_MultiSceneOcr_ZhVertical --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}

SUB_OUTPUT_DIR=${OUTPUT_DIR}/multi_lan_ocr
python run.py --data CCOCR_MultiLanOcr_Arabic CCOCR_MultiLanOcr_French CCOCR_MultiLanOcr_German CCOCR_MultiLanOcr_Italian CCOCR_MultiLanOcr_Japanese CCOCR_MultiLanOcr_Korean CCOCR_MultiLanOcr_Portuguese CCOCR_MultiLanOcr_Russian CCOCR_MultiLanOcr_Spanish CCOCR_MultiLanOcr_Vietnamese --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}

SUB_OUTPUT_DIR=${OUTPUT_DIR}/doc_parsing
python run.py --data CCOCR_DocParsing_DocPhotoChn CCOCR_DocParsing_DocPhotoEng CCOCR_DocParsing_DocScanChn CCOCR_DocParsing_DocScanEng CCOCR_DocParsing_TablePhotoChn CCOCR_DocParsing_TablePhotoEng CCOCR_DocParsing_TableScanChn CCOCR_DocParsing_TableScanEng CCOCR_DocParsing_MolecularHandwriting CCOCR_DocParsing_FormulaHandwriting --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}

SUB_OUTPUT_DIR=${OUTPUT_DIR}/kie
python run.py --data CCOCR_Kie_Sroie2019Word CCOCR_Kie_Cord CCOCR_Kie_EphoieScut CCOCR_Kie_Poie CCOCR_Kie_ColdSibr CCOCR_Kie_ColdCell --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}
```

## Example Output
The evaluation results will be saved in `${SUB_OUTPUT_DIR}/summary.md`. For example, for the KIE subset,
the output is as follows:

| exp_name(f1_score) | COLD_CELL | COLD_SIBR | CORD | EPHOIE_SCUT | POIE | sroie2019_word | summary |
|:-------------------|------------:|------------:|-------:|--------------:|-------:|-----------------:|----------:|
| QwenVLMax | 81.01 | 72.46 | 69.33 | 71.2 | 60.85 | 76.37 | 71.87 |
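
As implemented in `summary_multi_exp` (see `vlmeval/dataset/utils/ccocr_evaluator/common.py` in this commit), the `summary` column is the unweighted mean of the per-dataset scores by default; with `is_weighted_sum=True`, each score is instead weighted by the dataset's sample count. A quick check for the row above:

```python
# The "summary" column is the unweighted mean of the per-dataset scores.
scores = [81.01, 72.46, 69.33, 71.20, 60.85, 76.37]
print("{:.2f}".format(sum(scores) / len(scores)))  # 71.87
```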


## Citation
If you find our work helpful, feel free to cite it.

```
@misc{yang2024ccocr,
title={CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy},
author={Zhibo Yang and Jun Tang and Zhaohai Li and Pengfei Wang and Jianqiang Wan and Humen Zhong and Xuejing Liu and Mingkun Yang and Peng Wang and Shuai Bai and LianWen Jin and Junyang Lin},
year={2024},
eprint={2412.02210},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2412.02210},
}
```

## Contact Us

If you have any questions, feel free to send an email to: [email protected] or [email protected]
12 changes: 12 additions & 0 deletions vlmeval/dataset/utils/ccocr_evaluator/__init__.py
@@ -0,0 +1,12 @@
from .kie_evaluator import KieEvaluator
from .doc_parsing_evaluator import ParsingEvaluator
from .ocr_evaluator import OcrEvaluator
from .common import summary


evaluator_map_info = {
"kie": KieEvaluator("kie"),
"doc_parsing": ParsingEvaluator("doc_parsing"),
"multi_lan_ocr": OcrEvaluator("multi_lan_ocr"),
"multi_scene_ocr": OcrEvaluator("multi_scene_ocr")
}
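
For reference, each entry in `evaluator_map_info` is a callable `BaseMetric` instance (see `common.py` below): it takes either a directory of per-sample prediction JSON files or a `{sample_name: response_text}` dict, plus a ground-truth dict, and returns a `(meta_info, eval_info)` pair. A minimal usage sketch follows; the sample names and texts are hypothetical placeholders, and the exact prediction/ground-truth formats are defined by each task-specific evaluator, which is not shown in this diff.

```python
# Minimal usage sketch (hypothetical data; real formats depend on the task evaluator).
from vlmeval.dataset.utils.ccocr_evaluator import evaluator_map_info

predictions = {"sample_001": "predicted text ..."}    # or a directory of prediction JSON files
ground_truth = {"sample_001": "ground-truth text ..."}

kie_evaluator = evaluator_map_info["kie"]
meta_info, eval_info = kie_evaluator(predictions, ground_truth)
print(meta_info)                 # gt/pdt counts plus post/response error lists
print(eval_info.get("summary"))  # metric dict, e.g. f1_score and response_success_ratio
```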
222 changes: 222 additions & 0 deletions vlmeval/dataset/utils/ccocr_evaluator/common.py
@@ -0,0 +1,222 @@
import os
import json
import time
import sys
from abc import abstractmethod
from tabulate import tabulate


def pick_response_text(json_path):
    """Extract the model response text from a single prediction JSON file."""
    try:
        with open(json_path, "r") as f:
            json_data = json.load(f)
    except Exception as e:
        print("--> file error: msg: {}, path: {}".format(e, json_path))
        return None

    for required_key in ["model_name", "response"]:
        if required_key not in json_data:
            print("--> required key not exists, name: {}, path: {}".format(required_key, json_path))
            return None

    model_name = json_data["model_name"]
    model_response = json_data["response"]

    response_text = None
    if model_name.startswith("gpt") or model_name.startswith("o1"):
        response_text = model_response.get("data", {}).get("response", {}).get("choices", [{}])[0].get("message", {}).get("content", None)
    elif model_name.startswith("local_"):
        response_text = model_response
    else:
        if model_name.startswith("claude"):
            content_list = model_response.get("content", None)
        elif model_name.startswith("gemini"):
            content_list = model_response.get("candidates", [{}])[0].get("content", {}).get("parts", None)
        elif model_name.startswith("qwen"):
            content_list = model_response.get("output", {}).get("choices", [{}])[0].get("message", {}).get("content", None)
        else:
            raise NotImplementedError("The pick_response_text NOT implemented for model: {}".format(model_name))

        if isinstance(content_list, list) and len(content_list) > 0:
            response_text = content_list[0].get("text", None)

    if response_text is None:
        print("--> [error][{}] text pick error, path: {}".format(model_name, json_path))
    return response_text


def load_response_from_dir(res_dir):
    """Load all per-sample response texts from a directory of prediction JSON files."""
    response_info = {}
    for file_name in os.listdir(res_dir):
        file_path = os.path.abspath(os.path.join(res_dir, file_name))
        if not file_name.endswith(".json"):
            print("--> skip: result file should be a json: but got: {}".format(file_path))
            continue

        response_text = pick_response_text(file_path)
        if response_text is None:
            continue

        file_name_wo_ext, ext = os.path.splitext(file_name)
        response_info[file_name_wo_ext] = response_text
    return response_info


class BaseMetric(object):
    """ BaseMetric """
    def __init__(self, group_name, **kwargs):
        self.group_name = group_name
        self.kwargs = kwargs

    def response_post_func(self, response_text, **kwargs):
        return response_text

    @abstractmethod
    def evaluate(self, response_info, gt_info, normalize_func=None, **kwargs):
        """
        Given the predictions and ground truth, return the evaluation results as a dictionary.
        The results must contain a 'summary' key, for example:
        {
            "summary": {
                "f1-score": 99.99,
                "metric_name": "metric_value"  # used for summary; only metric info may be placed in this dict.
            },
            "your other info": "xxx"
        }
        """
        pass

    def __call__(self, pdt_res_dir, gt_info, with_response_ratio=True, **kwargs):
        if isinstance(pdt_res_dir, dict):
            raw_response_info = pdt_res_dir
        elif os.path.exists(pdt_res_dir) and os.path.isdir(pdt_res_dir):
            raw_response_info = load_response_from_dir(pdt_res_dir)
        else:
            raise ValueError("invalid input: response dict or folder are required, but got {}".format(pdt_res_dir))

        post_error_list, response_info = [], {}
        response_error_list = list(gt_info.keys() - raw_response_info.keys())
        for file_name, single_pdt_str in raw_response_info.items():
            single_pdt_str = self.response_post_func(single_pdt_str, **kwargs)
            if single_pdt_str is None:
                post_error_list.append(file_name)
                continue
            response_info[file_name] = single_pdt_str

        meta_info = {
            "gt_total_num": len(gt_info), "pdt_total_num": len(response_info),
            "post_error_list": post_error_list, "response_error_list": response_error_list,
        }
        eval_info = self.evaluate(response_info, gt_info, **kwargs)

        # add response_success_ratio
        if "summary" in eval_info and with_response_ratio:
            success_ratio = (len(response_info) + len(post_error_list)) / (len(gt_info) + 1e-9)
            eval_info["summary"].update({"response_success_ratio": success_ratio})
        return meta_info, eval_info


def summary(index_path, exp_dir_base, is_weighted_sum=False):
    """Summarize all released datasets listed in the index file under exp_dir_base."""
    with open(index_path, "r") as f:
        data_list = json.load(f)

    all_data_info = {}
    for data_info_item in data_list:
        data_name = data_info_item["dataset"]
        if not data_info_item.get("release", True):
            continue
        all_data_info[data_name] = data_info_item
    dataset_list = list(all_data_info.keys())
    summary_path = summary_multi_exp(exp_dir_base, dataset_list, is_weighted_sum=is_weighted_sum)
    return summary_path


def summary_multi_exp(exp_dir_base, dataset_list=None, is_weighted_sum=False):
    """Collect status.json results for every experiment under exp_dir_base and write a markdown summary."""
    if dataset_list is None:
        all_dataset_name = []
        for exp_name in os.listdir(exp_dir_base):
            dir_status_path = os.path.join(exp_dir_base, exp_name, "status.json")
            if not os.path.exists(dir_status_path):
                continue
            with open(dir_status_path, "r") as f:
                data_status_info = json.load(f)
            all_dataset_name.extend(data_status_info.keys())
        dataset_list = sorted(set(all_dataset_name))

    # summary main code
    all_evaluate_info, line_index = {}, 0
    for exp_name in os.listdir(exp_dir_base):
        dir_status_path = os.path.join(exp_dir_base, exp_name, "status.json")
        if not os.path.exists(dir_status_path):
            print("--> skip: status.json not exist: {}".format(dir_status_path))
            continue

        with open(dir_status_path, "r") as f:
            all_status_info = json.load(f)

        for data_name in dataset_list:
            total_num = all_status_info.get(data_name, {}).get("config", {}).get("num", "-1")
            summary_info = all_status_info.get(data_name, {}).get("evaluation", {}).get("summary", {})
            for metric_name, metric_value in summary_info.items():
                if metric_name not in all_evaluate_info:
                    all_evaluate_info[metric_name] = {}
                if exp_name not in all_evaluate_info[metric_name]:
                    all_evaluate_info[metric_name][exp_name] = {}
                all_evaluate_info[metric_name][exp_name][data_name] = (metric_value, total_num)

    all_table_md = []
    for metric_name, metric_info in all_evaluate_info.items():
        formatted_time = time.strftime("%Y-%m-%d %H:%M", time.localtime(time.time()))
        summary_line_list = []
        summary_key_name = "summary(weighted)" if is_weighted_sum else "summary"
        summary_head = [f"exp_name({metric_name}_{formatted_time})"] + dataset_list + [summary_key_name]
        for exp_name, data_eval_info in metric_info.items():
            summary_line = [exp_name, ]

            all_metric_value = 0
            is_summary_valid, all_total_num, all_weighted_metric = True, 0, 0
            for data_name in dataset_list:
                metric_value, total_num = data_eval_info.get(data_name, ("-1", "-1"))
                summary_line.append("{:.2f}".format(float(metric_value) * 100))
                if str(metric_value) == "-1" or str(total_num) == "-1":
                    is_summary_valid = False
                    continue

                all_total_num += float(total_num)
                all_weighted_metric += float(total_num) * float(metric_value)
                all_metric_value += float(metric_value)

            summary_value_valid = ((all_weighted_metric / (all_total_num + 1e-9)) * 100) if is_weighted_sum \
                else (all_metric_value / (len(dataset_list) + 1e-9) * 100)
            summary_value = "-" if not is_summary_valid else "{:.2f}".format(summary_value_valid)
            summary_line.append(summary_value)
            summary_line_list.append(summary_line)

        md_table_info = tabulate(summary_line_list, headers=summary_head, tablefmt='pipe')
        all_table_md.append(md_table_info)

    print("\n\n".join(all_table_md))
    summary_path = os.path.abspath(os.path.join(exp_dir_base, "summary.md"))
    with open(summary_path, "w") as f:
        f.write("\n\n".join(all_table_md))
    return summary_path


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: python {} exp_base_dir".format(__file__))
        sys.exit(-1)
    else:
        print('--> info: {}'.format(sys.argv))
        exp_base_dir = sys.argv[1]

    summary_path = summary_multi_exp(exp_base_dir, dataset_list=None, is_weighted_sum=False)
    print("--> info: summary saved at : {}".format(summary_path))
    print("happy coding.")
