
Commit 7709083

Merge branch 'main' into lyccnb/main
2 parents 61cee42 + ae10ca4 · commit 7709083

26 files changed: +1883, -35 lines

run.py (+1, -1)

@@ -398,7 +398,7 @@ def main():

     # Create the symbolic links for the prediction files
     files = os.listdir(pred_root)
-    files = [x for x in files if f'{model_name}_{dataset_name}' in x]
+    files = [x for x in files if (f'{model_name}_{dataset_name}' in x or "status.json" in x)]
     for f in files:
         cwd = os.getcwd()
         file_addr = osp.join(cwd, pred_root, f)

vlmeval/api/siliconflow.py (+1, -1)

@@ -26,7 +26,7 @@ def resize_image(image: Image.Image, max_height: int, max_width: int) -> Image.I
 def encode_image(path: str, max_height: int = 1024, max_width: int = 1024) -> str:
     image = Image.open(path).convert("RGB")
     image = resize_image(image, max_height, max_width)
-    height, width = image.size
+    width, height = image.size
     if min(height, width) < 50:
         scale = 50 / min(width, height)
         image = image.resize((int(width * scale), int(height * scale)))
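For context on this one-line fix: Pillow's `Image.size` is `(width, height)`, so the old unpacking assigned each dimension to the wrong name. The symmetric `min()` guard still fired correctly, but the follow-up `resize` received a transposed size and distorted small images. A minimal sketch of the corrected path, using an in-memory 30×200 image instead of a file on disk:

```python
# Minimal sketch of the corrected unpacking; not the full encode_image helper.
from PIL import Image

image = Image.new("RGB", (30, 200))   # width=30, height=200
width, height = image.size            # Pillow returns (width, height)
if min(height, width) < 50:           # shorter side is below the 50 px floor
    scale = 50 / min(width, height)
    image = image.resize((int(width * scale), int(height * scale)))
print(image.size)                     # (50, 333): aspect ratio preserved
```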

vlmeval/config.py (+8, -1)

@@ -13,6 +13,7 @@
 VideoChatGPT_ROOT = None
 PLLaVA_ROOT = None
 RBDash_ROOT = None
+VITA_ROOT = '/fs-computility/mllm1/shared/dhd/VITA'
 LLAVA_V1_7B_MODEL_PTH = 'Please set your local path to LLaVA-7B-v1.1 here, the model weight is obtained by merging LLaVA delta weight based on vicuna-7b-v1.1 in https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md with vicuna-7b-v1.1. '

 video_models = {

@@ -172,6 +173,11 @@
     'varco-vision-hf':partial(LLaVA_OneVision_HF, model_path='NCSOFT/VARCO-VISION-14B-HF'),
 }

+vita_series = {
+    'vita': partial(VITA, model_path='VITA-MLLM/VITA', root=VITA_ROOT),
+    'vita_qwen2': partial(VITAQwen2, model_path='VITA-MLLM/VITA-1.5', root=VITA_ROOT),
+}
+
 internvl_series = {
     'InternVL-Chat-V1-1': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-1', version='V1.1'),
     'InternVL-Chat-V1-2': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-2', version='V1.2'),

@@ -326,6 +332,7 @@
 qwen2vl_series = {
     'Qwen-VL-Max-0809': partial(Qwen2VLAPI, model='qwen-vl-max-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
     'Qwen-VL-Plus-0809': partial(Qwen2VLAPI, model='qwen-vl-plus-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
+    'QVQ-72B-Preview': partial(Qwen2VLChat, model_path='Qwen/QVQ-72B-Preview', min_pixels=1280*28*28, max_pixels=16384*28*28, system_prompt='You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.', max_new_tokens=8192, post_process=False),
     'Qwen2-VL-72B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-72B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
     'Qwen2-VL-7B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
     'Qwen2-VL-7B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-AWQ', min_pixels=1280*28*28, max_pixels=16384*28*28),

@@ -416,7 +423,7 @@
     mantis_series, mmalaya_series, phi3_series, xgen_mm_series, qwen2vl_series,
     slime_series, eagle_series, moondream_series, llama_series, molmo_series,
     kosmos_series, points_series, nvlm_series, vintern_series, h2ovl_series, aria_series,
-    smolvlm_series, sail_series, valley_series
+    smolvlm_series, sail_series, valley_series, vita_series
 ]

 for grp in model_groups:
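The new `vita_series` dict only takes effect because of the `model_groups` loop at the end of the last hunk, which merges every `*_series` dict into the kit's model registry (`supported_VLM` in `vlmeval/config.py`; the registry name itself is not visible in this hunk). A rough sketch of how the new entry would then be used, assuming that registry name:

```python
# Rough sketch: the model_groups loop merges every *_series dict into the
# supported_VLM registry, so run.py can build a model from its string name.
# VITA_ROOT must point at a local checkout of the VITA repo (see the config change above).
from vlmeval.config import supported_VLM

builder = supported_VLM['vita_qwen2']   # functools.partial binding model_path and root
model = builder()                       # instantiates VITAQwen2 and loads the weights
```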

vlmeval/dataset/__init__.py (+2, -1)

@@ -13,6 +13,7 @@
     CustomVQADataset, CRPE, MathVerse, OlympiadBench, QSpatial, VizWiz, MMNIAH
 )

+from .image_ccocr import CCOCRDataset
 from .text_mcq import CustomTextMCQDataset, TextMCQDataset

 from .vcr import VCRDataset

@@ -129,7 +130,7 @@ def evaluate(self, eval_file, **judge_kwargs):
 IMAGE_DATASET = [
     ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset, MathVision,
     MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
-    MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset,
+    MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset, CCOCRDataset,
     GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset,
     MIABench, OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH,
     CMMMU
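Registering `CCOCRDataset` in `IMAGE_DATASET` is what lets `run.py --data CCOCR_...` resolve the new benchmark names to a dataset class. A rough illustration of that lookup pattern follows; the membership check is a stand-in, since the exact helper in `vlmeval/dataset/__init__.py` is not shown in this hunk:

```python
# Rough illustration only: walk the IMAGE_DATASET registry and return the first
# class that claims to support the requested benchmark name. The real lookup in
# vlmeval/dataset/__init__.py may use a different predicate.
from vlmeval.dataset import IMAGE_DATASET

def find_dataset_class(name: str):
    for cls in IMAGE_DATASET:
        supported = getattr(cls, 'supported_datasets', lambda: [])()
        if name in supported:
            return cls
    return None
```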

vlmeval/dataset/image_ccocr.py (+198, new file)

Large diff not rendered by default.

vlmeval/dataset/image_mcq.py (+2, -2)

@@ -143,7 +143,7 @@ class ImageMCQDataset(ImageBaseDataset):
     'AI2D_TEST': '0f593e0d1c7df9a3d69bf1f947e71975',
     'AI2D_TEST_NO_MASK': 'fd8f463634d4fe9fbd23b876e8eea5be',
     'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e',
-    'RealWorldQA': '92321028d2bc29040284b6674721e48f',
+    'RealWorldQA': '4de008f55dc4fd008ca9e15321dc44b7',
     'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f',
     'BLINK': '3b6649b6a662184ea046908e5506260e',
     'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889',

@@ -286,7 +286,7 @@ class MMMUDataset(ImageMCQDataset):
 }

 DATASET_MD5 = {
-    'MMMU_DEV_VAL': '521afc0f3bf341e6654327792781644d',
+    'MMMU_DEV_VAL': '585e8ad75e73f75dcad265dfd0417d64',
     'MMMU_TEST': 'c19875d11a2d348d07e5eb4bdf33166d',
 }

vlmeval/dataset/image_vqa.py (+1, -1)

@@ -35,7 +35,7 @@ class ImageVQADataset(ImageBaseDataset):
     'InfoVQA_VAL': '2342e9c225222f0ef4dec545ebb126fe',
     'InfoVQA_TEST': 'df535bf51b88dc9718252c34131a6227',
     'ChartQA_TEST': 'c902e0aa9be5582a7aad6dcf52734b42',
-    'GQA_TestDev_Balanced': 'fead7df22befc1ed3ca2b62ea26fa17b',
+    'GQA_TestDev_Balanced': '99b62f22e224d9b2f32dcbe41359d1c9',
 }

 def build_prompt(self, line):
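The digest changes in `image_mcq.py` and `image_vqa.py` track updated benchmark TSVs for RealWorldQA, MMMU_DEV_VAL, and GQA_TestDev_Balanced: the `DATASET_MD5` tables are used to validate the downloaded file, so a refreshed benchmark needs a refreshed checksum. A minimal sketch of such a check, with a placeholder local path (VLMEvalKit's own download/verify helper may differ):

```python
# Minimal sketch of validating a benchmark TSV against its DATASET_MD5 entry.
# 'LMUData/MMMU_DEV_VAL.tsv' is a placeholder path; the kit's own helper may differ.
import hashlib

def file_md5(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.md5()
    with open(path, 'rb') as fin:
        for chunk in iter(lambda: fin.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

expected = '585e8ad75e73f75dcad265dfd0417d64'   # new MMMU_DEV_VAL digest from this commit
if file_md5('LMUData/MMMU_DEV_VAL.tsv') != expected:
    print('Checksum mismatch: the cached TSV is stale, re-download the dataset.')
```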

vlmeval/dataset/utils/__init__.py (+1, -1)

@@ -5,5 +5,5 @@

 __all__ = [
     'build_judge', 'extract_answer_from_item', 'prefetch_answer',
-    'levenshtein_distance', 'DEBUG_MESSAGE'
+    'levenshtein_distance', 'DEBUG_MESSAGE',
 ]

New file (+59 lines)

# CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy

## Introduction

Please refer to our [GitHub](https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/Benchmarks/CC-OCR) for more information.

## Running Scripts

Once the environment is ready, execute the following script from the root directory of VLMEvalKit to perform inference and evaluation tasks in batch.

```shell
MODEL_NAME="QwenVLMax"
OUTPUT_DIR="/your/path/to/output_dir"

SUB_OUTPUT_DIR=${OUTPUT_DIR}/multi_scene_ocr
python run.py --data CCOCR_MultiSceneOcr_Cord CCOCR_MultiSceneOcr_Funsd CCOCR_MultiSceneOcr_Iam CCOCR_MultiSceneOcr_ZhDoc CCOCR_MultiSceneOcr_ZhHandwriting CCOCR_MultiSceneOcr_Hieragent CCOCR_MultiSceneOcr_Ic15 CCOCR_MultiSceneOcr_Inversetext CCOCR_MultiSceneOcr_Totaltext CCOCR_MultiSceneOcr_ZhScene CCOCR_MultiSceneOcr_UgcLaion CCOCR_MultiSceneOcr_ZhDense CCOCR_MultiSceneOcr_ZhVertical --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}

SUB_OUTPUT_DIR=${OUTPUT_DIR}/multi_lan_ocr
python run.py --data CCOCR_MultiLanOcr_Arabic CCOCR_MultiLanOcr_French CCOCR_MultiLanOcr_German CCOCR_MultiLanOcr_Italian CCOCR_MultiLanOcr_Japanese CCOCR_MultiLanOcr_Korean CCOCR_MultiLanOcr_Portuguese CCOCR_MultiLanOcr_Russian CCOCR_MultiLanOcr_Spanish CCOCR_MultiLanOcr_Vietnamese --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}

SUB_OUTPUT_DIR=${OUTPUT_DIR}/doc_parsing
python run.py --data CCOCR_DocParsing_DocPhotoChn CCOCR_DocParsing_DocPhotoEng CCOCR_DocParsing_DocScanChn CCOCR_DocParsing_DocScanEng CCOCR_DocParsing_TablePhotoChn CCOCR_DocParsing_TablePhotoEng CCOCR_DocParsing_TableScanChn CCOCR_DocParsing_TableScanEng CCOCR_DocParsing_MolecularHandwriting CCOCR_DocParsing_FormulaHandwriting --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}

SUB_OUTPUT_DIR=${OUTPUT_DIR}/kie
python run.py --data CCOCR_Kie_Sroie2019Word CCOCR_Kie_Cord CCOCR_Kie_EphoieScut CCOCR_Kie_Poie CCOCR_Kie_ColdSibr CCOCR_Kie_ColdCell --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}
```

## Example Output

The evaluation results will be saved in `${SUB_OUTPUT_DIR}/summary.md`. For example, for the KIE subset, the output is as follows:

| exp_name(f1_score) | COLD_CELL | COLD_SIBR | CORD  | EPHOIE_SCUT | POIE  | sroie2019_word | summary |
|:-------------------|----------:|----------:|------:|------------:|------:|---------------:|--------:|
| QwenVLMax          |     81.01 |     72.46 | 69.33 |        71.2 | 60.85 |          76.37 |   71.87 |

## Citation

If you find our work helpful, please consider citing:

```
@misc{yang2024ccocr,
      title={CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy},
      author={Zhibo Yang and Jun Tang and Zhaohai Li and Pengfei Wang and Jianqiang Wan and Humen Zhong and Xuejing Liu and Mingkun Yang and Peng Wang and Shuai Bai and LianWen Jin and Junyang Lin},
      year={2024},
      eprint={2412.02210},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2412.02210},
}
```

## Contact Us

If you have any questions, feel free to send an email to [email protected] or [email protected].

New file (+12 lines)

from .kie_evaluator import KieEvaluator
from .doc_parsing_evaluator import ParsingEvaluator
from .ocr_evaluator import OcrEvaluator
from .common import summary


evaluator_map_info = {
    "kie": KieEvaluator("kie"),
    "doc_parsing": ParsingEvaluator("doc_parsing"),
    "multi_lan_ocr": OcrEvaluator("multi_lan_ocr"),
    "multi_scene_ocr": OcrEvaluator("multi_scene_ocr")
}
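The keys of `evaluator_map_info` line up with the four `SUB_OUTPUT_DIR` group names used in the CC-OCR README above, so results can be routed to the right evaluator by directory name. A rough sketch of that dispatch; how an evaluator actually consumes prediction files is defined in the sibling modules and is not shown in this diff:

```python
# Rough dispatch sketch: choose an evaluator from the group name ("kie",
# "doc_parsing", "multi_lan_ocr", "multi_scene_ocr"). The commented call below
# is a placeholder: the real interface lives in kie_evaluator.py and friends.
import os
from vlmeval.dataset.utils.ccocr_evaluator import evaluator_map_info

sub_output_dir = "/your/path/to/output_dir/kie"   # same layout as the README script
group = os.path.basename(sub_output_dir.rstrip("/"))
evaluator = evaluator_map_info[group]             # -> KieEvaluator("kie")
# scores = evaluator(pred_file, gt_file)          # placeholder call, interface not in this diff
```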
