
Commit 9cc14cd

PhoenixZ810, OliverLeeXZ, and kennymckormick authored
Modify tsv, add split_thinking (#1229)
* [major] add tsv, xlsx, json for PRED_FORMAT and csv, json for EVAL_FORMAT
* [minor] remove useless func
* Fix Lint
* [Major] refine V2 for format control
* modify save file format, add split_thinking
* modify import in sft, omnidoc, fix build_prompt in internvl
* fix lint, modify split thinking prompt

---------

Co-authored-by: OliverLeeXZ <[email protected]>
Co-authored-by: Haodong Duan <[email protected]>
Co-authored-by: kennymckormick <[email protected]>
1 parent 3df1d16 commit 9cc14cd


64 files changed (+710, -647 lines)
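
The diff replaces hard-coded `.xlsx` filenames with two helpers from vlmeval/smp/file.py: get_pred_file_format() picks the extension used for prediction files (tsv/xlsx/json, matching the PRED_FORMAT option named in the commit message), and get_intermediate_file_path() derives rating/score/tmp files from the prediction file whatever its extension. The sketch below is a rough reconstruction from the call sites in this diff, not the actual implementation; the environment-variable names and the fallback behaviour when no extension is passed are assumptions.

import os
import os.path as osp


def get_pred_file_format(default='xlsx'):
    # Extension used for prediction files: tsv, xlsx, or json.
    # Reading a PRED_FORMAT environment variable is an assumption made for this sketch.
    fmt = os.environ.get('PRED_FORMAT', default).lower()
    assert fmt in ('tsv', 'xlsx', 'json'), f'unsupported prediction format: {fmt}'
    return fmt


def get_file_extension(path):
    # 'outputs/model_dataset.tsv' -> 'tsv'
    return osp.splitext(path)[1].lstrip('.')


def get_intermediate_file_path(eval_file, suffix, ext=None):
    # Derive an intermediate file next to eval_file, e.g.
    #   ('pred.tsv', '_rating', 'json') -> 'pred_rating.json'
    #   ('pred.tsv', '_score')          -> 'pred_score.tsv'
    # Keeping the original extension when ext is omitted is an assumption;
    # the real helper may consult an EVAL_FORMAT setting instead.
    base, old_ext = osp.splitext(eval_file)
    return f'{base}{suffix}.{ext or old_ext.lstrip(".")}'
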

run.py

Lines changed: 2 additions & 4 deletions

@@ -271,7 +271,8 @@ def main():
  dist.barrier()

  try:
- result_file_base = f'{model_name}_{dataset_name}.xlsx'
+ pred_format = get_pred_file_format()
+ result_file_base = f'{model_name}_{dataset_name}.{pred_format}'

  if use_config:
  if WORLD_SIZE > 1:
@@ -299,9 +300,6 @@ def main():
  continue

  # Handling Multi-Turn Dataset
- if dataset.TYPE == 'MT':
- result_file_base = result_file_base.replace('.xlsx', '.tsv')
-
  result_file = osp.join(pred_root, result_file_base)
  # Reuse the previous prediction file if exists
  if RANK == 0 and len(prev_pred_roots):
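
With the format no longer hard-coded, run.py builds the prediction filename once for every dataset type, so the multi-turn special case that rewrote .xlsx to .tsv is dropped. Roughly, with the model and dataset names below being example values only:

model_name, dataset_name = 'InternVL2-8B', 'MMBench_DEV_EN'  # example values only
pred_format = 'tsv'  # whatever get_pred_file_format() returns for this run
result_file_base = f'{model_name}_{dataset_name}.{pred_format}'
print(result_file_base)  # InternVL2-8B_MMBench_DEV_EN.tsv -- the same rule now applies to MT datasets
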

scripts/apires_scan.py

Lines changed: 3 additions & 1 deletion

@@ -10,7 +10,9 @@
  model_name = root.split('/')[-1]

  for d in SUPPORTED_DATASETS:
- fname = f'{model_name}_{d}.xlsx'
+ from vlmeval.smp import get_pred_file_format
+ pred_format = get_pred_file_format()
+ fname = f'{model_name}_{d}.{pred_format}'
  pth = osp.join(root, fname)
  if osp.exists(pth):
  data = load(pth)

scripts/auto_run.py

Lines changed: 3 additions & 1 deletion

@@ -26,7 +26,9 @@ def is_large(x):
  models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)]

  for m in models:
- unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')]
+ from vlmeval.smp import get_pred_file_format
+ pred_format = get_pred_file_format()
+ unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.{pred_format}')]
  if len(unknown_datasets) == 0:
  continue
  dataset_str = ' '.join(unknown_datasets)

vlmeval/dataset/CGAVCounting/cg_av_counting.py

Lines changed: 4 additions & 3 deletions

@@ -359,10 +359,11 @@ def save_video_frames(self, video, uid, num_frames=8, fps=-1):

  def evaluate(self, eval_file, **judge_kwargs):

- assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \
+     'data file should be an supported format (xlsx/json/tsv) file'

- tgt_file = eval_file.replace(".xlsx", "_rating.json")
- score_file = eval_file.replace(".xlsx", "_score.xlsx")
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score', 'csv')

  data = load(eval_file)

vlmeval/dataset/EgoExoBench/egoexobench.py

Lines changed: 5 additions & 4 deletions

@@ -244,11 +244,12 @@ def build_prompt(self, line, video_llm):
  def evaluate(self, eval_file, **judge_kwargs):
  from .utils import get_dimension_rating, extract_characters_regex, extract_option

- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \
+     'data file should be an supported format (xlsx/json/tsv) file'

- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', '_rating.json')
- score_file = eval_file.replace('.xlsx', '_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score', 'csv')

  if not osp.exists(score_file):
  model = judge_kwargs.get('model', 'exact_matching')
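
The same renaming pattern repeats across the video benchmarks: every path derived from eval_file now goes through get_intermediate_file_path, and the per-question score table moves from xlsx to csv. Assuming the helper behaves as sketched above (import path assumed, file paths hypothetical), a tsv prediction file would yield:

from vlmeval.smp import get_intermediate_file_path  # assumed import path

eval_file = 'outputs/InternVL2-8B/InternVL2-8B_EgoExoBench_MCQ.tsv'   # hypothetical path
tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')       # ..._tmp.pkl
tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')   # ..._rating.json
score_file = get_intermediate_file_path(eval_file, '_score', 'csv')   # ..._score.csv
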

vlmeval/dataset/GUI/screenspot.py

Lines changed: 2 additions & 2 deletions

@@ -324,7 +324,7 @@ def evaluate_rectangle(self, eval_file, **judge_kwargs):
  results_dict[key] = str(0)
  else:
  results_dict[key] = str(sum(results_dict[key]) / len(results_dict[key]))
- score_pth = eval_file.replace(".xlsx", "_score.json")
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
  dump(results_dict, score_pth)

  failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)
@@ -437,7 +437,7 @@ def make_safe(value):
  sub_stats = itertools.chain(*sub_stats)
  final_score_dict[c + '_Accuracy'] = np.mean([x > 0 for x in sub_stats]) * 100

- score_pth = eval_file.replace(".xlsx", "_score.json")
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
  dump(final_score_dict, score_pth)

  failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)

vlmeval/dataset/GUI/screenspot_pro.py

Lines changed: 2 additions & 2 deletions

@@ -312,7 +312,7 @@ def evaluate_rectangle(self, eval_file, **judge_kwargs):
  results_dict[key] = str(0)
  else:
  results_dict[key] = str(sum(results_dict[key]) / len(results_dict[key]))
- score_pth = eval_file.replace(".xlsx", "_score.json")
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
  dump(results_dict, score_pth)

  failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)
@@ -422,7 +422,7 @@ def make_safe(value):
  sub_stats = itertools.chain(*sub_stats)
  final_score_dict[c + '_Accuracy'] = np.mean([x > 0 for x in sub_stats]) * 100

- score_pth = eval_file.replace(".xlsx", "_score.json")
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
  dump(final_score_dict, score_pth)

  failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)

vlmeval/dataset/OmniDocBench/omnidocbench.py

Lines changed: 26 additions & 29 deletions

@@ -4,10 +4,12 @@
  import pandas as pd
  import tempfile
  import base64
+ import numpy as np
  from tqdm import tqdm
  import torch.distributed as dist
  from ..image_base import ImageBaseDataset
  from ...smp import *
+ # from ..utils import get_intermediate_file_path, load, dump


  class OmniDocBench(ImageBaseDataset):
@@ -28,7 +30,7 @@ class OmniDocBench(ImageBaseDataset):

  2. Mathematical Formula Processing:
  - Convert all mathematical formulas to LaTeX format.
- - Enclose inline formulas with \( \). For example: This is an inline formula \( E = mc^2 \)
+ # - Enclose inline formulas with \( \). For example: This is an inline formula \( E = mc^2 \)
  - Enclose block formulas with \\[ \\]. For example: \[ \frac{-b \pm \sqrt{b^2 - 4ac}}{2a} \]

  3. Table Processing:
@@ -75,9 +77,6 @@ def __init__(self,
  tsv_path,
  match_method:str='quick_match',
  filter_types:dict=None):
- self.result_foler='../../../outputs/OmniDocBench'
- if not os.path.exists(self.result_foler):
- os.makedirs(self.result_foler)
  self.eval_file=eval_file
  self.match_method=match_method
  self.references=[]
@@ -374,17 +373,18 @@ def process_generated_metric_results(self,samples,save_name:str='end2end_quick_m
  'group':group_result,
  'page':page_result
  }
- if not os.path.exists('./output/OmniDocBench'):
- os.makedirs('./output/OmniDocBench')
  if isinstance(cur_samples,list):
  saved_samples=cur_samples
  else:
  saved_samples=cur_samples.samples
- with open(os.path.join(self.result_foler,f'{save_name}_result.josn'),'w',encoding='utf-8') as f:
- json.dump(saved_samples,f,indent=4,ensure_ascii=False)
+ # NOTE: The original code has a bug here, it will overwrite the result file in each iteration.
+ # I will fix it by adding element to the filename.
+ # NOTE: Fixed typo .josn -> .json
+ result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_{element}_result', 'json')
+ dump(saved_samples, result_file)

- with open(os.path.join(self.result_foler,f'{save_name}_metric_result.json'),'w',encoding='utf-8') as f:
- json.dump(result_all,f,indent=4,ensure_ascii=False)
+ metric_result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_metric_result', 'json')
+ dump(result_all, metric_result_file)

  dict_list = []
  save_dict={}
@@ -409,20 +409,20 @@ def process_generated_metric_results(self,samples,save_name:str='end2end_quick_m
  dict_list.append(save_dict)
  df = pd.DataFrame(dict_list,index=['end2end',]).round(3)

- with open(os.path.join(self.result_foler,'End2End_Evaluation.json'),'w',encoding='utf-8') as f:
- json.dump(result_all,f,indent=4,ensure_ascii=False)
- df.to_csv(os.path.join(self.result_foler,'overall.csv'))
- over_all_path=os.path.join(self.result_foler,'End2End_Evaluation.json')
- print(f"The save path of overall.csv is :{over_all_path}")
+ e2e_eval_file = get_intermediate_file_path(self.eval_file, '_End2End_Evaluation', 'json')
+ dump(result_all, e2e_eval_file)
+
+ overall_file = get_intermediate_file_path(self.eval_file, '_overall')
+ dump(df, overall_file)
+
+ print(f"The save path of End2End_Evaluation is: {e2e_eval_file}")
+ print(f"The save path of overall metrics is: {overall_file}")
  return df


  class table_evalutor():
  def __init__(self,eval_file,tsv_path):
-
- self.result_foler='../../../outputs/OmniDocBench'
- if not os.path.exists(self.result_foler):
- os.makedirs(self.result_foler)
+ self.eval_file = eval_file
  gt_key='html'
  pred_key='pred'
  self.category_filter='table'
@@ -434,8 +434,8 @@ def load_data(self,eval_file,gt_file,pred_key,gt_key):
  from .data_preprocess import clean_string, normalized_formula, textblock2unicode, normalized_table
  samples=[]
  preds=[]
- predictions=pd.read_excel(eval_file)['prediction'].tolist()
- gt_samples=pd.read_csv(gt_file,sep='\t')['answer'].tolist()
+ predictions=load(eval_file)['prediction'].tolist()
+ gt_samples=load(gt_file)['answer'].tolist()
  load_success,load_fail=0,0
  for i,gt_sample in tqdm(enumerate(gt_samples),desc='Loading data'):
  try:
@@ -533,8 +533,8 @@ def process_generated_metric_results(self,save_name:str='OmniDocBench_table'):
  'page':page_result
  }

- with open(os.path.join(self.result_foler,f'{save_name}_metric_result.json'),'w',encoding='utf-8') as f:
- json.dump(result_all,f,indent=4,ensure_ascii=False)
+ metric_result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_metric_result', 'json')
+ dump(result_all, metric_result_file)

  dict_list=[]
  dict_list.append(result_all["group"]["TEDS"])
@@ -545,10 +545,7 @@ def process_generated_metric_results(self,save_name:str='OmniDocBench_table'):
  selected_columns = df4[["language: table_en", "language: table_simplified_chinese", "language: table_en_ch_mixed", "line: full_line", "line: less_line", "line: fewer_line", "line: wireless_line",
  "with_span: True", "with_span: False", "include_equation: True", "include_equation: False", "include_background: True", "include_background: False", "table_layout: vertical", "table_layout: horizontal"]]

- selected_columns.to_csv(os.path.join(self.result_foler,'table_attribute.csv'))
- table_attribute_path=os.path.join(self.result_foler,'table_attribute.csv')
- print(f'The save path of table_attribute.csv is :{table_attribute_path}')
- selected_columns
-
-
+ table_attr_file = get_intermediate_file_path(self.eval_file, '_table_attribute')
+ dump(selected_columns, table_attr_file)
+ print(f'The save path of table_attribute is :{table_attr_file}')
  return selected_columns
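
OmniDocBench drops the fixed result folder and the pandas-specific readers in favour of the package-level load/dump helpers, which appear to dispatch on the file extension, so the same code path handles xlsx, tsv, csv, json, or pkl prediction files. A minimal usage sketch, with assumed import path and hypothetical file paths:

from vlmeval.smp import load, dump  # assumed import path

df = load('outputs/OmniDocBench/model_OmniDocBench.tsv')         # hypothetical path; returns a DataFrame
dump(df, 'outputs/OmniDocBench/model_OmniDocBench_overall.csv')  # output format chosen from the extension
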

vlmeval/dataset/__init__.py

Lines changed: 2 additions & 3 deletions

@@ -151,7 +151,6 @@ def supported_datasets(cls):
  return list(cls.DATASET_SETS)

  def evaluate(self, eval_file, **judge_kwargs):
- suffix = eval_file.split('.')[-1]
  # First, split the eval_file by dataset
  data_all = load(eval_file)
  for dname in self.datasets:
@@ -179,11 +178,11 @@ def evaluate(self, eval_file, **judge_kwargs):

  if len(df_all):
  result = pd.concat(df_all)
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
  dump(result, score_file)
  return result
  else:
- score_file = eval_file.replace(f'.{suffix}', '_score.json')
+ score_file = get_intermediate_file_path(eval_file, '_score', 'json')
  dump(dict_all, score_file)
  return dict_all

vlmeval/dataset/cgbench.py

Lines changed: 20 additions & 19 deletions

@@ -1,5 +1,6 @@
  from huggingface_hub import snapshot_download
  from ..smp import *
+ from ..smp.file import get_intermediate_file_path, get_file_extension
  from .video_base import VideoBaseDataset
  from .utils import build_judge, DEBUG_MESSAGE
  from .utils.cgbench import *
@@ -432,10 +433,10 @@ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-

  def evaluate(self, eval_file, **judge_kwargs):

- assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"

- tgt_file = eval_file.replace(".xlsx", "_rating.json")
- score_file = eval_file.replace(".xlsx", "_score.xlsx")
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')

  data = load(eval_file)

@@ -760,12 +761,12 @@ def evaluate(self, eval_file, **judge_kwargs):

  from .utils.cgbench import get_dimention_rating_open_ended, post_process_open

- assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"

- tgt_file = eval_file.replace(".xlsx", "_rating.json")
- score_file = eval_file.replace(".xlsx", "_score.xlsx")
- step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl")
- step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl")
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
+ step_1_tmp_file = get_intermediate_file_path(eval_file, '_step_1', 'pkl')
+ step_2_tmp_file = get_intermediate_file_path(eval_file, '_step_2', 'pkl')

  data = load(eval_file)

@@ -784,13 +785,13 @@ def evaluate(self, eval_file, **judge_kwargs):
  axis=1,
  )

- data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1]
- data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]
-
  if judge_kwargs.get("model", None) != "gpt-4o-0806":
  judge_kwargs["model"] = "gpt-4o-0806"
  print("The judge model in cg-bench is gpt-4o-0806!")

+ data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1]
+ data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]
+
  model_step_1 = build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs)
  nproc = judge_kwargs.pop("nproc", 32)

@@ -1314,10 +1315,10 @@ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-

  def evaluate(self, eval_file, **judge_kwargs):

- assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"

- tgt_file = eval_file.replace(".xlsx", "_rating.json")
- score_file = eval_file.replace(".xlsx", "_score.xlsx")
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')

  data = load(eval_file)

@@ -1641,12 +1642,12 @@ def evaluate(self, eval_file, **judge_kwargs):

  from .utils.cgbench import get_dimention_rating_open_ended, post_process_open

- assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"

- tgt_file = eval_file.replace(".xlsx", "_rating.json")
- score_file = eval_file.replace(".xlsx", "_score.xlsx")
- step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl")
- step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl")
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
+ step_1_tmp_file = get_intermediate_file_path(eval_file, '_step_1', 'pkl')
+ step_2_tmp_file = get_intermediate_file_path(eval_file, '_step_2', 'pkl')

  data = load(eval_file)
