diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index a1bab65c..ce98d248 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -30,6 +30,7 @@ from .longvideobench import LongVideoBench from .video_concat_dataset import ConcatVideoDataset from .mmgenbench import MMGenBench +from .cgbench import CGBench_MCQ_Grounding_Mini, CGBench_OpenEnded_Mini, CGBench_MCQ_Grounding, CGBench_OpenEnded from .miabench import MIABench from .cmmmu import CMMMU @@ -139,7 +140,8 @@ def evaluate(self, eval_file, **judge_kwargs): VIDEO_DATASET = [ MMBenchVideo, VideoMME, MVBench, MVBench_MP4, LongVideoBench, MLVU, MLVU_MCQ, MLVU_OpenEnded, - TempCompass, TempCompass_MCQ, TempCompass_Captioning, TempCompass_YorN + TempCompass, TempCompass_MCQ, TempCompass_Captioning, TempCompass_YorN, + CGBench_MCQ_Grounding_Mini, CGBench_OpenEnded_Mini, CGBench_MCQ_Grounding, CGBench_OpenEnded ] TEXT_DATASET = [ diff --git a/vlmeval/dataset/cgbench.py b/vlmeval/dataset/cgbench.py new file mode 100644 index 00000000..172cdbb3 --- /dev/null +++ b/vlmeval/dataset/cgbench.py @@ -0,0 +1,1760 @@ +from huggingface_hub import snapshot_download +from ..smp import * +from .video_base import VideoBaseDataset +from .utils import build_judge, DEBUG_MESSAGE +from .utils.cgbench import * +from ..utils import track_progress_rich + + +class CGBench_MCQ_Grounding_Mini(VideoBaseDataset): + + dataset = "CG-Bench_MCQ_Grounding_Mini" + + TYPE = "Video-MCQ-Grounding" + + MD5 = "54ed3e90a51a6fb375c92b319a715f72" + + SYS = { + "long_acc": ( + "You will be provided with sampled frames from a video, along with a " + "multiple-choice question that includes a question and several answer options.\n" + "Your task is to analyze the provided frames, infer the most plausible " + "answer based on the visual information.\n" + "If the video does not provide enough information, infer the answer based " + "on the options available and still provide a result. " + "Therefore, In all cases, an answer must be given.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": "option"}\n```\n\n' + 'The "option" is the uppercase letter corresponding to your answer.\n\n' + ), + "clue_acc": ( + "You will be provided with sampled frames from a video, along with a " + "multiple-choice question that includes a question and several answer options.\n" + "Your task is to analyze the provided frames, infer the most plausible " + "answer based on the visual information.\n" + "If the video does not provide enough information, infer the answer based " + "on the options available and still provide a result. 
" + "Therefore, In all cases, an answer must be given.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": "option"}\n```\n\n' + "The 'option' is the uppercase letter corresponding to your answer.\n\n" + ), + "miou": ( + "You will be provided with uniformly sampled frames from a video and their " + "timestamps, along with a multiple-choice question that includes a question " + "and several answer options.\n" + "Your task is to determine in which intervals the 'clue intervals' exist " + "that contain visual information needed to answer the question.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n' + "In this output format, each 'start' and 'end' represents the beginning and " + "end of an interval in seconds where relevant clues can be found.\n" + "You must provide at least one interval and at most five intervals. " + "Intervals exceeding five will NOT be considered valid.\n" + ), + "miou_wo_frame_time": ( + "You will be provided with uniformly sampled frames from a video, along " + "with a multiple-choice question that includes a question and several " + "answer options.\n" + "Your task is to determine in which intervals the 'clue intervals' exist " + "that contain visual information needed to answer the question.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n' + 'In this output format, each "start" and "end" represents the start and ' + "end of the video where the relevant clue can be found in the form of a " + "floating point number between 0 and 1, where 0 represents the start time " + "of the video and 1 represents the end time of the video.\n" + "You must provide at least one interval and at most five intervals. 
" + "Intervals exceeding five will NOT be considered valid.\n" + ), + } + + def __init__( + self, + dataset="CG-Bench_MCQ_Grounding_Mini", + use_subtitle=False, + use_subtitle_time=False, + use_frame_time=False, + nframe=0, + fps=-1, + ): + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + self.use_subtitle = use_subtitle + self.use_subtitle_time = use_subtitle_time + self.use_frame_time = use_frame_time + self.dataset_name = dataset + lmu_root = LMUDataRoot() + self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset) + + @classmethod + def supported_datasets(cls): + return ["CG-Bench_MCQ_Grounding_Mini"] + + def clue_frame_paths(self, qid, num_frames=8): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + def clue_frame_paths_fps(self, qid, num_frames=8, fps=-1): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl_fps.format(i, num_frames, fps)) for i in range(1, num_frames + 1)] + + def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False): + + subtitles = [] + + srt_path = osp.join(self.data_root, subtitle_path) + assert osp.exists(srt_path) + import pysubs2 + + subs = pysubs2.load(srt_path, encoding="utf-8") + if not frame_indices: + for sub in subs: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + else: + for selected_frame_id in frame_indices: + cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id) + for sub in subs: + if sub.start < cur_time and sub.end > cur_time: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + + if subtitles: + subtitles_str = '\n'.join(subtitles) + return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n" + else: + return "" + + def prepare_dataset(self, dataset_name="CG-Bench_MCQ_Grounding_Mini", repo_id="CG-Bench/CG-Bench"): + + def check_integrity(pth): + data_file = osp.join(pth, f"{dataset_name}.tsv") + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + data = load(data_file) + for video_pth in data["video"]: + if not osp.exists(osp.join(pth, video_pth)): + return False + + return True + + cache_path = get_cache_path(repo_id) + + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + + def generate_tsv(pth): + + tsv_file = osp.join(pth, f"{dataset_name}.tsv") + + task_modes = ["long_acc", "clue_acc", "miou"] + all_data = [] + for task_mode in task_modes: + with open(osp.join(pth, "cgbench_mini.json"), "r") as f: + data_file = pd.DataFrame(json.load(f)) + + data_file = data_file.assign(index=range(len(data_file))) + data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4") + data_file["subtitle_path"] = data_file["video_uid"].apply( + lambda x: ( + f"cg_subtitles/{x}.srt" + if osp.exists(osp.join(dataset_path, f"cg_subtitles/{x}.srt")) + else "" + ) + ) + + 
data_file["clue_video_path"] = "" + + if task_mode in ["clue_acc"]: + data_file["clue_video_path"] = data_file["clue_video_path"] = data_file.apply( + lambda row: f"cg_clue_videos/{row['qid']}.mp4", axis=1 + ) + + data_file["task_mode"] = task_mode + + if task_mode in ["clue_acc", "long_acc"]: + data_file["answer"] = data_file["right_answer"] + + if task_mode == "miou": + data_file["answer"] = data_file["clue_intervals"] + + if task_mode in ["long_acc", "miou"]: + data_file["clue_intervals"] = "" + + data_file = data_file[ + [ + "index", + "video_uid", + "video", + "duration", + "domain", + "choices", + "sub_category", + "subtitle_path", + "question", + "answer", + "task_mode", + "clue_intervals", + "qid", + "clue_video_path", + ] + ] + + all_data.append(data_file) + + final_data = pd.concat(all_data, ignore_index=True) + final_data["index"] = range(len(final_data)) + final_data.to_csv(tsv_file, sep="\t", index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset") + + unzip_hf_zip(dataset_path) + generate_tsv(dataset_path) + + tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv") + + return dict(data_file=tsv_file, root=dataset_path) + + def build_prompt(self, line, video_llm): + + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + task_mode = line["task_mode"] + + message = [] + + origin_use_subtitle_time = self.use_subtitle_time + + try: + if task_mode in ["long_acc", "clue_acc"]: + system_prompt = self.SYS[task_mode] + elif task_mode == "miou": + if self.use_frame_time and not video_llm: + system_prompt = self.SYS[task_mode] + else: + system_prompt = self.SYS["miou_wo_frame_time"] + if self.use_subtitle_time is True: + self.use_subtitle_time = False + + user_prompt = "" + + if task_mode in ["long_acc", "miou"]: + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + elif task_mode == "clue_acc": + clue_video_path = line["clue_video_path"] + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, clue_video_path))) + print(message) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], 
num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + if self.nframe > 32: + self.nframe = 32 + print("The maximum number of frames is 32 when evaluating clue-based mcq in CG-Bench !") + + clue_intervals = eval(line["clue_intervals"]) + + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["qid"], clue_intervals=clue_intervals, num_frames=self.nframe, fps=self.fps + ) + + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + question = line["question"] + user_prompt += f"Question: {question}\n\n" + + choices = eval(line["choices"]) + labels = [chr(ord("A") + i) for i in range(len(choices))] + user_prompt += "\n".join([f"{label}:{value}" for label, value in zip(labels, choices)]) + "\n\n" + + message.append(dict(type="text", value=system_prompt + user_prompt)) + + return message + + finally: + # Ensure that `use_subtitle_time` is always restored to its original value + self.use_subtitle_time = origin_use_subtitle_time + + def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is not str: + uid = str(uid) + + vid_path = osp.join(self.data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + n_frames = len(vid) + + if clue_intervals is not None: + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = self.clue_frame_paths(uid, len(indices)) + + elif fps > 0: + frame_indices = [] + for start, end in merged_intervals: + start_frame = int(start * vid_fps) + end_frame = int(end * vid_fps) + step = vid_fps / fps + interval_indices = [ + int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step)) + ] + frame_indices.extend(interval_indices) + + if len(frame_indices) < 32: + indices = sample_frames_clue_average(merged_intervals, 32, vid_fps) + else: + indices = frame_indices + frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps) + + else: + if num_frames > 0 and fps < 0: + step_size = len(vid) / (num_frames + 1) + indices = [int(i * step_size) for i in range(1, num_frames + 1)] + + frame_paths = self.frame_paths(uid) + elif fps > 0: + total_duration = n_frames / vid_fps + required_frames = int(total_duration * fps) + step_size = vid_fps / fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(uid, len(indices)) + + # Save and validate frames + valid_paths = [] + valid_indices = [] + + if not np.all([osp.exists(p) for p in frame_paths]): + images = [vid[i].asnumpy() for i in indices] + for i, (img_array, path) in enumerate(zip(images, frame_paths)): + if osp.exists(path): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + try: + img = Image.fromarray(img_array) + img.save(path) + img.verify() + valid_paths.append(path) + 
valid_indices.append(indices[i])
+                    except Exception:
+                        continue
+        else:
+            for i, path in enumerate(frame_paths):
+                try:
+                    with Image.open(path) as img:
+                        img.verify()
+                        valid_paths.append(path)
+                        valid_indices.append(indices[i])
+                except Exception:
+                    continue
+
+        return valid_paths, valid_indices, vid_fps
+
+    def evaluate(self, eval_file, **judge_kwargs):
+
+        assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
+
+        tgt_file = eval_file.replace(".xlsx", "_rating.json")
+        score_file = eval_file.replace(".xlsx", "_score.xlsx")
+
+        data = load(eval_file)
+
+        data_un = data[~pd.isna(data["prediction"])]
+        data_pred_na = data[pd.isna(data["prediction"])]
+
+        data_pred_na["score"] = -1
+
+        data_un["score"] = data_un.apply(
+            lambda row: post_process(
+                response=row["prediction"],
+                right_answer=row["answer"],
+                task_mode=row["task_mode"],
+                duration=row["duration"],
+            ),
+            axis=1,
+        )
+
+        data = pd.concat([data_pred_na, data_un])
+
+        rejected_count = (data["score"] == -1).sum()
+
+        print(
+            f"Among {len(data)} questions, "
+            f"failed to obtain prediction for {len(data_pred_na)} questions, "
+            f"failed to obtain the score for {rejected_count - len(data_pred_na)} questions. "
+            f"Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating."
+        )
+
+        dump(data, score_file)
+
+        rating = get_dimention_rating_mcq_grouding(score_file)
+
+        dump(rating, tgt_file)
+
+        return rating
+
+
+# During evaluation, step 2 only needs [prompt] + image_paths.
+class CGBench_OpenEnded_Mini(VideoBaseDataset):
+
+    TYPE = "Video-OpenEnded"
+
+    dataset = "CG-Bench_OpenEnded_Mini"
+
+    MD5 = "9175791b11afdfa305fdb3e525b7a4ee"
+
+    SYS = (
+        "You will be provided with sampled frames from a video, along with a "
+        "question.\n"
+        "Your task is to analyze the provided frames and infer the most plausible "
+        "answer based on the visual information.\n"
+        "If the visual information is ambiguous or insufficient, use the available "
+        "context to reason your answer.\n"
+        "Only output the answer in the following format:\n\n"
+        '```json\n{"result": "answer"}\n```\n\n'
+        'The "answer" can be a word, phrase, or sentence that directly responds to '
+        "the question.\n\n"
+    )
+
+    def __init__(
+        self,
+        dataset="CG-Bench_OpenEnded_Mini",
+        use_subtitle=False,
+        use_subtitle_time=False,
+        use_frame_time=False,
+        nframe=0,
+        fps=-1,
+    ):
+        super().__init__(dataset=dataset, nframe=nframe, fps=fps)
+        self.use_subtitle = use_subtitle
+        self.use_subtitle_time = use_subtitle_time
+        self.use_frame_time = use_frame_time
+        self.dataset_name = dataset
+        lmu_root = LMUDataRoot()
+        self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset)
+
+    @classmethod
+    def supported_datasets(cls):
+        return ["CG-Bench_OpenEnded_Mini"]
+
+    def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False):
+
+        subtitles = []
+
+        srt_path = osp.join(self.data_root, subtitle_path)
+        assert osp.exists(srt_path)
+        import pysubs2
+
+        subs = pysubs2.load(srt_path, encoding="utf-8")
+        if not frame_indices:
+            for sub in subs:
+                sub_text = sub.text.replace("\\N", " ")
+                if sub_time:
+                    start_time = milliseconds_to_seconds(sub.start)
+                    end_time = milliseconds_to_seconds(sub.end)
+                    sub_text = f"[{start_time}, {end_time}] {sub_text}"
+                if sub_text.strip() and sub_text not in subtitles:
+                    subtitles.append(sub_text)
+        else:
+            for selected_frame_id in frame_indices:
+                cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id)
+                for sub in subs:
+                    if sub.start < cur_time and sub.end > cur_time:
+                        sub_text =
sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + + if subtitles: + subtitles_str = '\n'.join(subtitles) + return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n" + else: + return "" + + def prepare_dataset(self, dataset_name="CG-Bench_OpenEnded_Mini", repo_id="CG-Bench/CG-Bench"): + + def check_integrity(pth): + data_file = osp.join(pth, f"{dataset_name}.tsv") + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + data = load(data_file) + for video_pth in data["video"]: + if not osp.exists(osp.join(pth, video_pth)): + return False + + return True + + cache_path = get_cache_path(repo_id) + + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + + def generate_tsv(pth): + + tsv_file = osp.join(pth, f"{dataset_name}.tsv") + + with open(osp.join(pth, "cgbench_mini.json"), "r") as f: + data_file = pd.DataFrame(json.load(f)) + + data_file = data_file.assign(index=range(len(data_file))) + data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4") + data_file["subtitle_path"] = data_file["video_uid"].apply( + lambda x: f"cg_subtitles/{x}.srt" if osp.exists(osp.join(pth, f"cg_subtitles/{x}.srt")) else "" + ) + + data_file = data_file[ + [ + "index", + "video_uid", + "video", + "duration", + "domain", + "sub_category", + "subtitle_path", + "question", + "answer", + "clue_intervals", + "qid", + ] + ] + + data_file.to_csv(tsv_file, sep="\t", index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset") + + unzip_hf_zip(dataset_path) + generate_tsv(dataset_path) + + tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv") + + return dict(data_file=tsv_file, root=dataset_path) + + def build_prompt(self, line, video_llm): + + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + message = [] + + sys_prompt = self.SYS + + user_prompt = "" + + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + question = line["question"] + user_prompt += f"Question: 
{question}\n\n" + + message.append(dict(type="text", value=sys_prompt + user_prompt)) + + return message + + def clue_frame_paths(self, qid, num_frames=8): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is not str: + uid = str(uid) + + vid_path = osp.join(self.data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + n_frames = len(vid) + + if clue_intervals is not None: + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = self.clue_frame_paths(uid, len(indices)) + + elif fps > 0: + frame_indices = [] + for start, end in merged_intervals: + start_frame = int(start * vid_fps) + end_frame = int(end * vid_fps) + step = vid_fps / fps + interval_indices = [ + int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step)) + ] + frame_indices.extend(interval_indices) + + if len(frame_indices) < 32: + indices = sample_frames_clue_average(merged_intervals, 32, vid_fps) + else: + indices = frame_indices + frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps) + + else: + if num_frames > 0 and fps < 0: + step_size = len(vid) / (num_frames + 1) + indices = [int(i * step_size) for i in range(1, num_frames + 1)] + frame_paths = self.frame_paths(uid) + elif fps > 0: + total_duration = n_frames / vid_fps + required_frames = int(total_duration * fps) + step_size = vid_fps / fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(uid, len(indices)) + + valid_paths = [] + valid_indices = [] + + if not np.all([osp.exists(p) for p in frame_paths]): + images = [vid[i].asnumpy() for i in indices] + for i, (img_array, path) in enumerate(zip(images, frame_paths)): + if osp.exists(path): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + try: + img = Image.fromarray(img_array) + img.save(path) + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + for i, path in enumerate(frame_paths): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + + return valid_paths, valid_indices, vid_fps + + def evaluate(self, eval_file, **judge_kwargs): + + from .utils.cgbench import get_dimention_rating_open_ended, post_process_open + + assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + + tgt_file = eval_file.replace(".xlsx", "_rating.json") + score_file = eval_file.replace(".xlsx", "_score.xlsx") + step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl") + step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl") + + data = load(eval_file) + + data_pred_no_na = data[~pd.isna(data["prediction"])] + data_pred_na = data[pd.isna(data["prediction"])] + + data_pred_na["model_result"] = -1 + data_pred_na["step_1_result"] = -1 + data_pred_na["step_2_result"] = -1 + data_pred_na["score"] = -1 + + data_pred_no_na["model_result"] = data_pred_no_na.apply( + lambda row: post_process_open( + response=row["prediction"], + ), + axis=1, + ) + + data_no_model_result = 
data_pred_no_na[data_pred_no_na["model_result"] == -1]
+        data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]
+
+        if judge_kwargs.get("model", None) != "gpt-4o-0806":
+            judge_kwargs["model"] = "gpt-4o-0806"
+            print("The judge model in cg-bench is gpt-4o-0806!")
+
+        model_step_1 = build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs)
+        nproc = judge_kwargs.pop("nproc", 32)
+
+        lines_step_1 = data_step_1.to_dict("records")
+        tups_step_1 = [(model_step_1, line) for line in lines_step_1]
+
+        keys_step_1 = [line["qid"] for line in lines_step_1]
+
+        ans = {}
+        if osp.exists(step_1_tmp_file):
+            ans = load(step_1_tmp_file)
+            tups_step_1 = [x for x, i in zip(tups_step_1, keys_step_1) if i not in ans]
+            keys_step_1 = [i for i in keys_step_1 if i not in ans]
+
+        _ = track_progress_rich(
+            eval_open_first,
+            tups_step_1,
+            nproc=nproc,
+            keys=keys_step_1,
+            save=step_1_tmp_file,
+        )
+
+        step_1_results = load(step_1_tmp_file)
+        data_step_1 = save_step_1_steps(data_step_1, step_1_results)  # -1, 0, 1, 2
+
+        data_no_step_1_results = data_step_1[data_step_1["step_1_result"] == -1]
+        data_step_1_over = data_step_1[data_step_1["step_1_result"].isin([0, 1])]
+        data_step_2 = data_step_1[data_step_1["step_1_result"] == 2]
+
+        model_step_2 = build_judge(system_prompt=sys_prompt_open_eval_step_2, **judge_kwargs)
+
+        lines_step_2 = data_step_2.to_dict("records")
+
+        tups_step_2 = []
+
+        for line in tqdm(lines_step_2):
+            clue_intervals = eval(line["clue_intervals"])
+            lmu_root = LMUDataRoot()
+            clue_frame_root = osp.join(lmu_root, "clue_images", self.dataset)
+            data_root = self.data_root
+            frame_paths, _, _ = save_clue_video_frames(
+                data_root,
+                clue_frame_root,
+                video=line["video"],
+                uid=line["qid"],
+                clue_intervals=clue_intervals,
+                num_frames=32,
+            )
+            tups_step_2.append((model_step_2, line, frame_paths))
+
+        keys_step_2 = [line["qid"] for line in lines_step_2]
+
+        ans = {}
+        if osp.exists(step_2_tmp_file):
+            ans = load(step_2_tmp_file)
+            tups_step_2 = [x for x, i in zip(tups_step_2, keys_step_2) if i not in ans]
+            keys_step_2 = [i for i in keys_step_2 if i not in ans]
+
+        _ = track_progress_rich(
+            eval_open_second,
+            tups_step_2,
+            nproc=nproc,
+            keys=keys_step_2,
+            save=step_2_tmp_file,
+        )
+
+        step_2_results = load(step_2_tmp_file)
+        data_step_2 = save_step_2_steps(data_step_2, step_2_results)
+
+        data_no_step_2_results = data_step_2[data_step_2["score"] == -1]
+        data_step_2_over = data_step_2[data_step_2["score"].isin([0, 1])]
+
+        data = pd.concat(
+            [
+                data_pred_na,
+                data_no_model_result,
+                data_no_step_1_results,
+                data_step_1_over,
+                data_no_step_2_results,
+                data_step_2_over,
+            ]
+        )
+
+        dump(data, score_file)
+
+        rating = get_dimention_rating_open_ended(score_file)
+
+        dump(rating, tgt_file)
+
+        return rating
+
+
+class CGBench_MCQ_Grounding(VideoBaseDataset):
+
+    TYPE = "Video-MCQ-Grounding"
+
+    MD5 = "eaead3d978a689269fefce4ae29c86df"
+
+    SYS = {
+        "long_acc": (
+            "You will be provided with sampled frames from a video, along with a "
+            "multiple-choice question that includes a question and several answer options.\n"
+            "Your task is to analyze the provided frames, infer the most plausible "
+            "answer based on the visual information.\n"
+            "If the video does not provide enough information, infer the answer based "
+            "on the options available and still provide a result. 
" + "Therefore, In all cases, an answer must be given.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": "option"}\n```\n\n' + 'The "option" is the uppercase letter corresponding to your answer.\n\n' + ), + "clue_acc": ( + "You will be provided with sampled frames from a video, along with a " + "multiple-choice question that includes a question and several answer options.\n" + "Your task is to analyze the provided frames, infer the most plausible " + "answer based on the visual information.\n" + "If the video does not provide enough information, infer the answer based " + "on the options available and still provide a result. " + "Therefore, In all cases, an answer must be given.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": "option"}\n```\n\n' + "The 'option' is the uppercase letter corresponding to your answer.\n\n" + ), + "miou": ( + "You will be provided with uniformly sampled frames from a video and their " + "timestamps, along with a multiple-choice question that includes a question " + "and several answer options.\n" + "Your task is to determine in which intervals the 'clue intervals' exist " + "that contain visual information needed to answer the question.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n' + "In this output format, each 'start' and 'end' represents the beginning and " + "end of an interval in seconds where relevant clues can be found.\n" + "You must provide at least one interval and at most five intervals. " + "Intervals exceeding five will NOT be considered valid.\n" + ), + "miou_wo_frame_time": ( + "You will be provided with uniformly sampled frames from a video, along " + "with a multiple-choice question that includes a question and several " + "answer options.\n" + "Your task is to determine in which intervals the 'clue intervals' exist " + "that contain visual information needed to answer the question.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n' + 'In this output format, each "start" and "end" represents the start and ' + "end of the video where the relevant clue can be found in the form of a " + "floating point number between 0 and 1, where 0 represents the start time " + "of the video and 1 represents the end time of the video.\n" + "You must provide at least one interval and at most five intervals. 
" + "Intervals exceeding five will NOT be considered valid.\n" + ), + } + + def __init__( + self, + dataset="CG-Bench_MCQ_Grounding", + use_subtitle=False, + use_subtitle_time=False, + use_frame_time=False, + nframe=0, + fps=-1, + ): + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + self.use_subtitle = use_subtitle + self.use_subtitle_time = use_subtitle_time + self.use_frame_time = use_frame_time + self.dataset_name = dataset + lmu_root = LMUDataRoot() + self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset) + + @classmethod + def supported_datasets(cls): + return ["CG-Bench_MCQ_Grounding"] + + def clue_frame_paths(self, qid, num_frames=8): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + def clue_frame_paths_fps(self, qid, num_frames=8, fps=-1): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl_fps.format(i, num_frames, fps)) for i in range(1, num_frames + 1)] + + def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False): + + subtitles = [] + + srt_path = osp.join(self.data_root, subtitle_path) + assert osp.exists(srt_path) + import pysubs2 + + subs = pysubs2.load(srt_path, encoding="utf-8") + if not frame_indices: + for sub in subs: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + else: + for selected_frame_id in frame_indices: + cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id) + for sub in subs: + if sub.start < cur_time and sub.end > cur_time: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + + if subtitles: + subtitles_str = '\n'.join(subtitles) + return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n" + else: + return "" + + def prepare_dataset(self, dataset_name="CG-Bench_MCQ_Grounding", repo_id="CG-Bench/CG-Bench"): + + def check_integrity(pth): + data_file = osp.join(pth, f"{dataset_name}.tsv") + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + data = load(data_file) + for video_pth in data["video"]: + if not osp.exists(osp.join(pth, video_pth)): + return False + + for clue_video_pth in data["clue_video_path"]: + if clue_video_pth and not (isinstance(clue_video_pth, float) and np.isnan(clue_video_pth)): + if not osp.exists(osp.join(pth, clue_video_pth)): + return False + + return True + + cache_path = get_cache_path(repo_id) + + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + + def generate_tsv(pth): + + tsv_file = osp.join(pth, f"{dataset_name}.tsv") + + task_modes = ["long_acc", "clue_acc", "miou"] + all_data = [] + for task_mode in task_modes: + with open(osp.join(pth, "cgbench.json"), "r") as f: + data_file = pd.DataFrame(json.load(f)) + + data_file = data_file.assign(index=range(len(data_file))) + data_file["video"] = data_file["video_uid"].apply(lambda x: 
f"cg_videos_720p/{x}.mp4") + data_file["subtitle_path"] = data_file["video_uid"].apply( + lambda x: ( + f"cg_subtitles/{x}.srt" + if osp.exists(osp.join(dataset_path, f"cg_subtitles/{x}.srt")) + else "" + ) + ) + + data_file["clue_video_path"] = "" + + if task_mode in ["clue_acc"]: + data_file["clue_video_path"] = data_file["clue_video_path"] = data_file.apply( + lambda row: f"cg_clue_videos/{row['qid']}.mp4", axis=1 + ) + + data_file["task_mode"] = task_mode + + if task_mode in ["clue_acc", "long_acc"]: + data_file["answer"] = data_file["right_answer"] + + if task_mode == "miou": + data_file["answer"] = data_file["clue_intervals"] + + if task_mode in ["long_acc", "miou"]: + data_file["clue_intervals"] = "" + + data_file = data_file[ + [ + "index", + "video_uid", + "video", + "duration", + "domain", + "choices", + "sub_category", + "subtitle_path", + "question", + "answer", + "task_mode", + "clue_intervals", + "qid", + "clue_video_path", + ] + ] + + all_data.append(data_file) + + final_data = pd.concat(all_data, ignore_index=True) + final_data["index"] = range(len(final_data)) + final_data.to_csv(tsv_file, sep="\t", index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset") + + unzip_hf_zip(dataset_path) + generate_tsv(dataset_path) + + tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv") + + return dict(data_file=tsv_file, root=dataset_path) + + def build_prompt(self, line, video_llm): + + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + task_mode = line["task_mode"] + + message = [] + + origin_use_subtitle_time = self.use_subtitle_time + + try: + if task_mode in ["long_acc", "clue_acc"]: + system_prompt = self.SYS[task_mode] + elif task_mode == "miou": + if self.use_frame_time and not video_llm: + system_prompt = self.SYS[task_mode] + else: + system_prompt = self.SYS["miou_wo_frame_time"] + if self.use_subtitle_time is True: + self.use_subtitle_time = False + + user_prompt = "" + + if task_mode in ["long_acc", "miou"]: + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + elif task_mode == "clue_acc": + clue_video_path = line["clue_video_path"] + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, clue_video_path))) + 
print(message) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + if self.nframe > 32: + self.nframe = 32 + print("The maximum number of frames is 32 when evaluating clue-based mcq in CG-Bench !") + + clue_intervals = eval(line["clue_intervals"]) + + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["qid"], clue_intervals=clue_intervals, num_frames=self.nframe, fps=self.fps + ) + + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + question = line["question"] + user_prompt += f"Question: {question}\n\n" + + choices = eval(line["choices"]) + labels = [chr(ord("A") + i) for i in range(len(choices))] + user_prompt += "\n".join([f"{label}:{value}" for label, value in zip(labels, choices)]) + "\n\n" + + message.append(dict(type="text", value=system_prompt + user_prompt)) + + return message + + finally: + # Ensure that `use_subtitle_time` is always restored to its original value + self.use_subtitle_time = origin_use_subtitle_time + + def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is not str: + uid = str(uid) + + vid_path = osp.join(self.data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + n_frames = len(vid) + + if clue_intervals is not None: + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = self.clue_frame_paths(uid, len(indices)) + + elif fps > 0: + frame_indices = [] + for start, end in merged_intervals: + start_frame = int(start * vid_fps) + end_frame = int(end * vid_fps) + step = vid_fps / fps + interval_indices = [ + int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step)) + ] + frame_indices.extend(interval_indices) + + if len(frame_indices) < 32: + indices = sample_frames_clue_average(merged_intervals, 32, vid_fps) + else: + indices = frame_indices + frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps) + + else: + if num_frames > 0 and fps < 0: + step_size = len(vid) / (num_frames + 1) + indices = [int(i * step_size) for i in range(1, num_frames + 1)] + + frame_paths = self.frame_paths(uid) + elif fps > 0: + total_duration = n_frames / vid_fps + required_frames = int(total_duration * fps) + step_size = vid_fps / fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(uid, len(indices)) + + # Save and validate frames + valid_paths = [] + valid_indices = [] + + if not np.all([osp.exists(p) for p in frame_paths]): + images = [vid[i].asnumpy() for i in indices] + for i, (img_array, path) in enumerate(zip(images, frame_paths)): + if osp.exists(path): + try: + with Image.open(path) as img: 
+                            img.verify()
+                            valid_paths.append(path)
+                            valid_indices.append(indices[i])
+                    except Exception:
+                        continue
+                else:
+                    try:
+                        img = Image.fromarray(img_array)
+                        img.save(path)
+                        img.verify()
+                        valid_paths.append(path)
+                        valid_indices.append(indices[i])
+                    except Exception:
+                        continue
+        else:
+            for i, path in enumerate(frame_paths):
+                try:
+                    with Image.open(path) as img:
+                        img.verify()
+                        valid_paths.append(path)
+                        valid_indices.append(indices[i])
+                except Exception:
+                    continue
+
+        return valid_paths, valid_indices, vid_fps
+
+    def evaluate(self, eval_file, **judge_kwargs):
+
+        assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
+
+        tgt_file = eval_file.replace(".xlsx", "_rating.json")
+        score_file = eval_file.replace(".xlsx", "_score.xlsx")
+
+        data = load(eval_file)
+
+        data_un = data[~pd.isna(data["prediction"])]
+        data_pred_na = data[pd.isna(data["prediction"])]
+
+        data_pred_na["score"] = -1
+
+        data_un["score"] = data_un.apply(
+            lambda row: post_process(
+                response=row["prediction"],
+                right_answer=row["answer"],
+                task_mode=row["task_mode"],
+                duration=row["duration"],
+            ),
+            axis=1,
+        )
+
+        data = pd.concat([data_pred_na, data_un])
+
+        rejected_count = (data["score"] == -1).sum()
+
+        print(
+            f"Among {len(data)} questions, "
+            f"failed to obtain prediction for {len(data_pred_na)} questions, "
+            f"failed to obtain the score for {rejected_count - len(data_pred_na)} questions. "
+            f"Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating."
+        )
+
+        dump(data, score_file)
+
+        rating = get_dimention_rating_mcq_grouding(score_file)
+
+        dump(rating, tgt_file)
+
+        return rating
+
+
+# During evaluation, step 2 only needs [prompt] + image_paths.
+class CGBench_OpenEnded(VideoBaseDataset):
+
+    TYPE = "Video-OpenEnded"
+
+    dataset = "CG-Bench_OpenEnded"
+
+    MD5 = "796035eda0b1e916c517cdc1bc145cfc"
+
+    SYS = (
+        "You will be provided with sampled frames from a video, along with a "
+        "question.\n"
+        "Your task is to analyze the provided frames and infer the most plausible "
+        "answer based on the visual information.\n"
+        "If the visual information is ambiguous or insufficient, use the available "
+        "context to reason your answer.\n"
+        "Only output the answer in the following format:\n\n"
+        '```json\n{"result": "answer"}\n```\n\n'
+        'The "answer" can be a word, phrase, or sentence that directly responds to '
+        "the question.\n\n"
+    )
+
+    def __init__(
+        self,
+        dataset="CG-Bench_OpenEnded",
+        use_subtitle=False,
+        use_subtitle_time=False,
+        use_frame_time=False,
+        nframe=0,
+        fps=-1,
+    ):
+        super().__init__(dataset=dataset, nframe=nframe, fps=fps)
+        self.use_subtitle = use_subtitle
+        self.use_subtitle_time = use_subtitle_time
+        self.use_frame_time = use_frame_time
+        self.dataset_name = dataset
+        lmu_root = LMUDataRoot()
+        self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset)
+
+    @classmethod
+    def supported_datasets(cls):
+        return ["CG-Bench_OpenEnded"]
+
+    def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False):
+
+        subtitles = []
+
+        srt_path = osp.join(self.data_root, subtitle_path)
+        assert osp.exists(srt_path)
+        import pysubs2
+
+        subs = pysubs2.load(srt_path, encoding="utf-8")
+        if not frame_indices:
+            for sub in subs:
+                sub_text = sub.text.replace("\\N", " ")
+                if sub_time:
+                    start_time = milliseconds_to_seconds(sub.start)
+                    end_time = milliseconds_to_seconds(sub.end)
+                    sub_text = f"[{start_time}, {end_time}] {sub_text}"
+                if sub_text.strip() and sub_text not in subtitles:
+                    subtitles.append(sub_text)
else: + for selected_frame_id in frame_indices: + cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id) + for sub in subs: + if sub.start < cur_time and sub.end > cur_time: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + + if subtitles: + subtitles_str = '\n'.join(subtitles) + return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n" + else: + return "" + + def prepare_dataset(self, dataset_name="CG-Bench_OpenEnded", repo_id="CG-Bench/CG-Bench"): + + def check_integrity(pth): + data_file = osp.join(pth, f"{dataset_name}.tsv") + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + data = load(data_file) + for video_pth in data["video"]: + if not osp.exists(osp.join(pth, video_pth)): + return False + + return True + + cache_path = get_cache_path(repo_id) + + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + + def generate_tsv(pth): + + tsv_file = osp.join(pth, f"{dataset_name}.tsv") + + with open(osp.join(pth, "cgbench.json"), "r") as f: + data_file = pd.DataFrame(json.load(f)) + + data_file = data_file.assign(index=range(len(data_file))) + data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4") + data_file["subtitle_path"] = data_file["video_uid"].apply( + lambda x: f"cg_subtitles/{x}.srt" if osp.exists(osp.join(pth, f"cg_subtitles/{x}.srt")) else "" + ) + + data_file = data_file[ + [ + "index", + "video_uid", + "video", + "duration", + "domain", + "sub_category", + "subtitle_path", + "question", + "answer", + "clue_intervals", + "qid", + ] + ] + + data_file.to_csv(tsv_file, sep="\t", index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset") + + unzip_hf_zip(dataset_path) + generate_tsv(dataset_path) + + tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv") + + return dict(data_file=tsv_file, root=dataset_path) + + def build_prompt(self, line, video_llm): + + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + message = [] + + sys_prompt = self.SYS + + user_prompt = "" + + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += 
self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + question = line["question"] + user_prompt += f"Question: {question}\n\n" + + message.append(dict(type="text", value=sys_prompt + user_prompt)) + + return message + + def clue_frame_paths(self, qid, num_frames=8): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is not str: + uid = str(uid) + + vid_path = osp.join(self.data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + n_frames = len(vid) + + if clue_intervals is not None: + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = self.clue_frame_paths(uid, len(indices)) + + elif fps > 0: + frame_indices = [] + for start, end in merged_intervals: + start_frame = int(start * vid_fps) + end_frame = int(end * vid_fps) + step = vid_fps / fps + interval_indices = [ + int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step)) + ] + frame_indices.extend(interval_indices) + + if len(frame_indices) < 32: + indices = sample_frames_clue_average(merged_intervals, 32, vid_fps) + else: + indices = frame_indices + frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps) + + else: + if num_frames > 0 and fps < 0: + step_size = len(vid) / (num_frames + 1) + indices = [int(i * step_size) for i in range(1, num_frames + 1)] + frame_paths = self.frame_paths(uid) + elif fps > 0: + total_duration = n_frames / vid_fps + required_frames = int(total_duration * fps) + step_size = vid_fps / fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(uid, len(indices)) + + valid_paths = [] + valid_indices = [] + + if not np.all([osp.exists(p) for p in frame_paths]): + images = [vid[i].asnumpy() for i in indices] + for i, (img_array, path) in enumerate(zip(images, frame_paths)): + if osp.exists(path): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + try: + img = Image.fromarray(img_array) + img.save(path) + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + for i, path in enumerate(frame_paths): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + + return valid_paths, valid_indices, vid_fps + + def evaluate(self, eval_file, **judge_kwargs): + + from .utils.cgbench import get_dimention_rating_open_ended, post_process_open + + assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + + tgt_file = eval_file.replace(".xlsx", "_rating.json") + score_file = eval_file.replace(".xlsx", "_score.xlsx") + step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl") + step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl") + + data = load(eval_file) + + data_pred_no_na = data[~pd.isna(data["prediction"])] + data_pred_na = data[pd.isna(data["prediction"])] + + data_pred_na["model_result"] = -1 + data_pred_na["step_1_result"] = -1 + data_pred_na["step_2_result"] = -1 + data_pred_na["score"] = -1 
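+
+        # Two-step judging for open-ended answers: step 1 compares the
+        # prediction with the ground truth textually and returns 0 (wrong),
+        # 1 (right), or 2 (needs visual verification); only the "2" cases are
+        # re-judged in step 2 with frames sampled from the clue intervals.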
+
+        data_pred_no_na["model_result"] = data_pred_no_na.apply(
+            lambda row: post_process_open(
+                response=row["prediction"],
+            ),
+            axis=1,
+        )
+
+        if judge_kwargs.get("model", None) != "gpt-4o-0806":
+            judge_kwargs["model"] = "gpt-4o-0806"
+            print("The judge model in cg-bench is gpt-4o-0806!")
+
+        data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1]
+        data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]
+
+        model_step_1 = build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs)
+        nproc = judge_kwargs.pop("nproc", 32)
+
+        lines_step_1 = data_step_1.to_dict("records")
+        tups_step_1 = [(model_step_1, line) for line in lines_step_1]
+
+        keys_step_1 = [line["qid"] for line in lines_step_1]
+
+        ans = {}
+        if osp.exists(step_1_tmp_file):
+            ans = load(step_1_tmp_file)
+            tups_step_1 = [x for x, i in zip(tups_step_1, keys_step_1) if i not in ans]
+            keys_step_1 = [i for i in keys_step_1 if i not in ans]
+
+        _ = track_progress_rich(
+            eval_open_first,
+            tups_step_1,
+            nproc=nproc,
+            keys=keys_step_1,
+            save=step_1_tmp_file,
+        )
+
+        step_1_results = load(step_1_tmp_file)
+        data_step_1 = save_step_1_steps(data_step_1, step_1_results)  # -1, 0, 1, 2
+
+        data_no_step_1_results = data_step_1[data_step_1["step_1_result"] == -1]
+        data_step_1_over = data_step_1[data_step_1["step_1_result"].isin([0, 1])]
+        data_step_2 = data_step_1[data_step_1["step_1_result"] == 2]
+
+        model_step_2 = build_judge(system_prompt=sys_prompt_open_eval_step_2, **judge_kwargs)
+
+        lines_step_2 = data_step_2.to_dict("records")
+
+        tups_step_2 = []
+
+        for line in tqdm(lines_step_2):
+            clue_intervals = eval(line["clue_intervals"])
+            lmu_root = LMUDataRoot()
+            clue_frame_root = osp.join(lmu_root, "clue_images", self.dataset)
+            data_root = self.data_root
+            frame_paths, _, _ = save_clue_video_frames(
+                data_root,
+                clue_frame_root,
+                video=line["video"],
+                uid=line["qid"],
+                clue_intervals=clue_intervals,
+                num_frames=32,
+            )
+            tups_step_2.append((model_step_2, line, frame_paths))
+
+        keys_step_2 = [line["qid"] for line in lines_step_2]
+
+        ans = {}
+        if osp.exists(step_2_tmp_file):
+            ans = load(step_2_tmp_file)
+            tups_step_2 = [x for x, i in zip(tups_step_2, keys_step_2) if i not in ans]
+            keys_step_2 = [i for i in keys_step_2 if i not in ans]
+
+        _ = track_progress_rich(
+            eval_open_second,
+            tups_step_2,
+            nproc=nproc,
+            keys=keys_step_2,
+            save=step_2_tmp_file,
+        )
+
+        step_2_results = load(step_2_tmp_file)
+        data_step_2 = save_step_2_steps(data_step_2, step_2_results)
+
+        data_no_step_2_results = data_step_2[data_step_2["score"] == -1]
+        data_step_2_over = data_step_2[data_step_2["score"].isin([0, 1])]
+
+        data = pd.concat(
+            [
+                data_pred_na,
+                data_no_model_result,
+                data_no_step_1_results,
+                data_step_1_over,
+                data_no_step_2_results,
+                data_step_2_over,
+            ]
+        )
+
+        dump(data, score_file)
+
+        rating = get_dimention_rating_open_ended(score_file)
+
+        dump(rating, tgt_file)
+
+        return rating
diff --git a/vlmeval/dataset/utils/cgbench.py b/vlmeval/dataset/utils/cgbench.py
new file mode 100644
index 00000000..eaf643bc
--- /dev/null
+++ b/vlmeval/dataset/utils/cgbench.py
@@ -0,0 +1,682 @@
+from ...smp import *
+from .multiple_choice import extract_answer_from_item
+import pandas as pd
+import numpy as np
+import re
+
+FAIL_MSG = "Failed to obtain answer via API."
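+
+# The prompts used throughout this benchmark ask models to answer inside a
+# fenced ```json {"result": ...} ``` block. The module's own parsing is done
+# by post_process / post_process_open defined later in this file; the helper
+# below is only a minimal, hypothetical sketch of that extraction step and is
+# not referenced by the benchmark code.
+def _extract_result_sketch(response):
+    import json  # local import for the sketch; the smp star-import covers the rest
+
+    match = re.search(r"```json\s*(\{.*?\})\s*```", response, re.DOTALL)
+    if match is None:
+        return None
+    try:
+        return json.loads(match.group(1)).get("result")
+    except (json.JSONDecodeError, AttributeError):
+        return None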
+
+frame_tmpl = "frame-{}-of-{}.jpg"
+
+sys_prompt_open_eval_step_1 = (
+    "You will be provided with a question, a model's prediction, and the ground "
+    "truth answer for this question.\n"
+    "Your task is to judge whether the model's prediction is correct based on the "
+    "meaning of the two texts.\n"
+    "In most cases, this can be done by determining if the meaning of the model's "
+    "prediction is consistent with, or contains, the ground truth answer. However, "
+    "in some cases where the two texts differ, it may represent different "
+    "descriptions of the same visual scene, in which case visual information is "
+    "needed for further judgment.\n"
+    "Therefore, I hope you:\n"
+    "- Output 0, if the model's prediction and the ground truth answer are neither "
+    "consistent nor related by inclusion, with fundamentally different meanings.\n"
+    "- Output 1, if the meaning of the model's prediction and the ground truth "
+    "answer is consistent, or if the model's prediction meaningfully contains the "
+    "ground truth answer.\n"
+    "- Output 2, if the model's prediction and ground truth are not consistent or "
+    "inclusive, but may be different descriptions of the same visual scene, "
+    "requiring visual information for further judgment.\n"
+    "Only output the answer in the following format:\n\n"
+    '```json\n{"result": choice}\n```\n\n'
+    "The choice is either 0, 1, or 2 as specified above."
+)
+
+sys_prompt_open_eval_step_2 = (
+    "You will be provided with a question, a model's prediction, and the sampling "
+    "frames of the clue intervals related to this question.\n"
+    "Your task is to determine whether the model has answered the question "
+    "correctly based on the visual information provided.\n"
+    "Therefore, I hope you:\n"
+    "- Output 0, if the model's prediction does not correctly answer the question.\n"
+    "- Output 1, if the model's prediction correctly answers the question.\n"
+    "Only output the answer in the following format without extra "
+    "explanation:\n\n"
+    '```json\n{"result": choice}\n```\n\n'
+    "The choice is either 0 or 1 as specified above."
+)
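+
+# For the "miou" task, predicted [start, end] intervals (in seconds) are scored
+# against the annotated clue intervals by IoU; the acc@iou and rec@iou metrics
+# below then threshold that score. As a rough sketch only (assuming each
+# interval list is already merged into disjoint spans; the real scoring lives
+# in post_process, and nothing here calls this helper):
+def _interval_iou_sketch(pred_intervals, gt_intervals):
+    # Total length covered by a list of disjoint [start, end] intervals.
+    def _length(intervals):
+        return sum(end - start for start, end in intervals)
+
+    # Pairwise overlap is exact when the spans within each list are disjoint.
+    intersection = 0.0
+    for p_start, p_end in pred_intervals:
+        for g_start, g_end in gt_intervals:
+            intersection += max(0.0, min(p_end, g_end) - max(p_start, g_start))
+
+    union = _length(pred_intervals) + _length(gt_intervals) - intersection
+    return intersection / union if union > 0 else 0.0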
+
+DURATIONS = ["0 ~ 10", "10 ~ 20", "20 ~ 30", "30 ~ 40", "40 ~ 50", "50 ~ 60", "60+"]
+
+DOMAINS = [
+    "Life Record",
+    "Music & TV show",
+    "Instruction & Knowledge",
+    "Driving",
+    "Embodied Expert",
+    "Humor/funny",
+    "Electonic/Social Gaming",  # sic: must match the domain label in the annotations
+    "Security & Health",
+    "Sports & Exercise",
+    "Special Scenes",
+    "Art & Culture",
+    "GUI",
+    "News",
+    "Animal & Pet",
+]
+
+SUB_CATEGORIES = [
+    "Time Cognition",
+    "Hallucination",
+    "Entity Perception",
+    "2D Spatial Perception",
+    "Time Perception",
+    "Scene Perception",
+    "Text Perception",
+    "Event Cognition",
+    "Entity Cognition",
+    "Text Cognition",
+    "Event Perception",
+    "Scene Cognition",
+]
+
+
+def get_dimention_rating_open_ended(data_path):
+    # Load the per-question records
+    df = load(data_path)
+
+    # Drop records whose evaluation failed
+    df = df[df["score"] != -1]
+
+    # Convert seconds to minutes and bucket into duration ranges
+    df["duration_minutes"] = df["duration"] / 60
+    df["duration_range"] = pd.cut(
+        df["duration_minutes"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, np.inf], labels=DURATIONS
+    )
+
+    # Initialise the result dict
+    result = {
+        "overall": 0,
+        "duration": {k: 0 for k in DURATIONS},
+        "domain": {k: 0 for k in DOMAINS},
+        "sub_category": {k: 0 for k in SUB_CATEGORIES},
+    }
+
+    # Overall
+    result["overall"] = round(df["score"].mean(), 4)
+
+    # Duration
+    for dur in DURATIONS:
+        dur_scores = df[df["duration_range"] == dur]["score"]
+        result["duration"][dur] = round(dur_scores.mean(), 4) if not dur_scores.empty else 0
+
+    # Domain
+    for domain in DOMAINS:
+        domain_scores = df[df["domain"] == domain]["score"]
+        result["domain"][domain] = round(domain_scores.mean(), 4) if not domain_scores.empty else 0
+
+    # Sub-category
+    for sub_cat in SUB_CATEGORIES:
+        sub_cat_scores = df[df["sub_category"] == sub_cat]["score"]
+        result["sub_category"][sub_cat] = round(sub_cat_scores.mean(), 4) if not sub_cat_scores.empty else 0
+
+    return result
+
+
+def get_dimention_rating_mcq_grouding(data_path):
+
+    # Load the per-question records
+    df = load(data_path)
+
+    # df.loc[(df['task_mode'] == 'miou') & (df['score'] == -1), 'score'] = 0
+
+    # Drop records whose evaluation failed
+    df = df[df["score"] != -1]
+
+    # Convert seconds to minutes and bucket into duration ranges
+    df["duration_minutes"] = df["duration"] / 60
+    df["duration_range"] = pd.cut(
+        df["duration_minutes"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, np.inf], labels=DURATIONS
+    )
+
+    # Initialise the result dict, one block per reported metric
+    result = {
+        metric: {
+            "overall": 0,
+            "duration": {k: 0 for k in DURATIONS},
+            "domain": {k: 0 for k in DOMAINS},
+            "sub_category": {k: 0 for k in SUB_CATEGORIES},
+        }
+        for metric in ["long_acc", "clue_acc", "miou", "CRR", "acc@iou", "rec@iou"]
+    }
+
+    # Base metrics
+    for metric in ["long_acc", "clue_acc", "miou"]:
+        metric_df = df[df["task_mode"] == metric]
+
+        # Overall
+        result[metric]["overall"] = round(metric_df["score"].mean(), 4)
+
+        # Duration
+        for dur in DURATIONS:
+            dur_scores = metric_df[metric_df["duration_range"] == dur]["score"]
+            result[metric]["duration"][dur] = round(dur_scores.mean(), 4) if not dur_scores.empty else 0
+
+        # Domain
+        for domain in DOMAINS:
+            domain_scores = metric_df[metric_df["domain"] == domain]["score"]
+            result[metric]["domain"][domain] = round(domain_scores.mean(), 4) if not domain_scores.empty else 0
+
+        # Sub-category
+        for sub_cat in SUB_CATEGORIES:
+            sub_cat_scores = metric_df[metric_df["sub_category"] == sub_cat]["score"]
+            result[metric]["sub_category"][sub_cat] = round(sub_cat_scores.mean(), 4) if not sub_cat_scores.empty else 0
+
+    # Composite metric: CRR
+    def calculate_crr(scores):
+        long_acc = scores[scores["task_mode"] == "long_acc"]["score"].mean()
+        clue_acc = scores[scores["task_mode"] == "clue_acc"]["score"].mean()
+        return round(min(long_acc, clue_acc) / clue_acc, 4) if clue_acc != 0 else 0
+
+    # Overall CRR
+    result["CRR"]["overall"] = calculate_crr(df)
+
+    # Duration CRR
+    for dur in DURATIONS:
+        dur_df = df[df["duration_range"] == dur]
+        result["CRR"]["duration"][dur] = calculate_crr(dur_df)
+
+    # Domain CRR
+    for domain in DOMAINS:
+        domain_df = df[df["domain"] == domain]
+        result["CRR"]["domain"][domain] = calculate_crr(domain_df)
+
+    # Sub-category CRR
+    for sub_cat in SUB_CATEGORIES:
+        sub_cat_df = df[df["sub_category"] == sub_cat]
+        result["CRR"]["sub_category"][sub_cat] = calculate_crr(sub_cat_df)
+
+    # acc@iou: accuracy on questions whose grounding clears each IoU threshold
+    def calculate_acc_at_iou_threshold(scores, threshold):
+        miou_qids = set(scores[scores["task_mode"] == "miou"]["qid"])
+        long_acc_qids = set(scores[scores["task_mode"] == "long_acc"]["qid"])
+        valid_qids = miou_qids & long_acc_qids
+
+        miou_positive = set(scores[(scores["task_mode"] == "miou") & (scores["score"] > threshold)]["qid"])
+
+        long_acc_positive = scores[
+            (scores["task_mode"] == "long_acc") & (scores["qid"].isin(miou_positive)) & (scores["score"] == 1)
+        ]
+
+        acc_at_iou_threshold = len(long_acc_positive) / len(valid_qids) if len(valid_qids) > 0 else 0
+        return round(acc_at_iou_threshold, 4)
+
+    def calculate_acc_at_iou(scores):
+        thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
+        acc_at_iou_values = [calculate_acc_at_iou_threshold(scores, threshold) for threshold in thresholds]
+        return round(sum(acc_at_iou_values) / len(acc_at_iou_values), 4)
+
+    # Overall acc@iou
+    result["acc@iou"]["overall"] = calculate_acc_at_iou(df)
+
+    # Duration acc@iou
+    for dur in DURATIONS:
+        dur_df = df[df["duration_range"] == dur]
+        result["acc@iou"]["duration"][dur] = calculate_acc_at_iou(dur_df)
+
+    # Domain acc@iou
+    for domain in DOMAINS:
+        domain_df = df[df["domain"] == domain]
+        result["acc@iou"]["domain"][domain] = calculate_acc_at_iou(domain_df)
+
+    # Sub-category acc@iou
+    for sub_cat in SUB_CATEGORIES:
+        sub_cat_df = df[df["sub_category"] == sub_cat]
+        result["acc@iou"]["sub_category"][sub_cat] = calculate_acc_at_iou(sub_cat_df)
+
+    # rec@iou: fraction of miou records clearing each IoU threshold
+    def calculate_rec_at_iou_threshold(scores, threshold):
+        # All records of the miou task
+        miou_scores = scores[scores["task_mode"] == "miou"]
+
+        # Records whose miou score clears the threshold
+        miou_positive = miou_scores[miou_scores["score"] > threshold]
+
+        # Recall at this threshold
+        rec_at_iou = len(miou_positive) / len(miou_scores) if len(miou_scores) > 0 else 0
+
+        return round(rec_at_iou, 4)
+
+    def calculate_rec_at_iou(scores):
+        thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
+        rec_at_iou_values = [calculate_rec_at_iou_threshold(scores, threshold) for threshold in thresholds]
+        return round(sum(rec_at_iou_values) / len(rec_at_iou_values), 4)
+
+    # Overall rec@iou
+    result["rec@iou"]["overall"] = calculate_rec_at_iou(df)
+
+    # Duration rec@iou
+    for dur in DURATIONS:
+        dur_df = df[df["duration_range"] == dur]
+        result["rec@iou"]["duration"][dur] = calculate_rec_at_iou(dur_df)
+
+    # Domain rec@iou
+    for domain in DOMAINS:
+        domain_df = df[df["domain"] == domain]
+        result["rec@iou"]["domain"][domain] = calculate_rec_at_iou(domain_df)
+
+    # Sub-category rec@iou
+    for sub_cat in SUB_CATEGORIES:
+        sub_cat_df = df[df["sub_category"] == sub_cat]
+        result["rec@iou"]["sub_category"][sub_cat] = calculate_rec_at_iou(sub_cat_df)
+
+    return result
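+
+# Worked example for the composite metrics above (hypothetical numbers): with
+# long_acc = 0.42 and clue_acc = 0.60 on the same split,
+#   CRR = min(0.42, 0.60) / 0.60 = 0.70,
+# roughly the fraction of clue-conditioned accuracy that is retained when the
+# model must locate the clue in the full video. acc@iou and rec@iou average
+# their per-threshold values over the IoU thresholds 0.1 through 0.5.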
+
+
+def milliseconds_to_seconds(milliseconds):
+    return milliseconds / 1000
+
+
+def sample_frames_clue_average(clues_time_intervals, frame_num, fps):
+    # Convert each clue interval from seconds to frame indices
+    clues_frame_intervals = [(round(interval[0] * fps), round(interval[1] * fps))
+                             for interval in clues_time_intervals]
+    clue_durations = [interval[1] - interval[0] for interval in clues_frame_intervals]
+    total_duration = sum(clue_durations)
+    # If the frame budget covers every clue frame, return them all
+    if frame_num >= total_duration:
+        return [frame for interval in clues_frame_intervals for frame in range(interval[0], interval[1])]
+    # Otherwise split the budget across clues in proportion to their length
+    frames_per_clue = [int(frame_num * (duration / total_duration)) for duration in clue_durations]
+    frame_indices = []
+    for i, (interval, num_frames) in enumerate(zip(clues_frame_intervals, frames_per_clue)):
+        num_frames = max(1, num_frames)
+        seg_size = (interval[1] - interval[0]) / num_frames
+        clue_frame_indices = [int(interval[0] + seg_size / 2 + seg_size * idx) for idx in range(num_frames)]
+        frame_indices.extend(clue_frame_indices)
+    return frame_indices
+
+
+def merge_intervals(intervals):
+    """
+    Merge overlapping intervals in a list.
+    Assumes each interval is a list [start, end].
+    """
+    if not intervals:
+        return []
+
+    # Sort intervals by start time
+    intervals.sort(key=lambda x: x[0])
+
+    merged = [intervals[0]]
+
+    for current in intervals[1:]:
+        last_merged = merged[-1]
+
+        # Check if there is an overlap
+        if current[0] <= last_merged[1]:
+            # Merge the current interval with the last one
+            last_merged[1] = max(last_merged[1], current[1])
+        else:
+            # No overlap, add current interval
+            merged.append(current)
+
+    return merged
+
+
+def calculate_intervals_iou(intervals1, intervals2):
+    """
+    Calculate the IoU of two lists of intervals.
+    Each list contains intervals represented as [start, end].
+    """
+    # Merge overlapping intervals in both lists
+    merged1 = merge_intervals(intervals1)
+    merged2 = merge_intervals(intervals2)
+
+    # Calculate total length of intervals for both lists
+    def total_length(merged_intervals):
+        return sum(end - start for start, end in merged_intervals)
+
+    length1 = total_length(merged1)
+    length2 = total_length(merged2)
+
+    # Calculate intersection length
+    intersection_length = 0
+    for interval1 in merged1:
+        for interval2 in merged2:
+            intersection_start = max(interval1[0], interval2[0])
+            intersection_end = min(interval1[1], interval2[1])
+            intersection_length += max(0, intersection_end - intersection_start)
+    # Calculate union length
+    union_length = length1 + length2 - intersection_length
+    # IoU is intersection divided by union
+    iou = intersection_length / union_length if union_length > 0 else 0
+    return iou
+
+
+def post_process(response, right_answer, task_mode, duration):
+    result = -1
+
+    if response:
+        # Locate the ```json ... ``` fence in the response
+        json_start = response.find("```json")
+        json_end = response.find("```", json_start + len("```json"))
+
+        # Extract the JSON payload if the fence was found
+        if json_start != -1 and json_end != -1:
+            json_content = response[json_start + len("```json"):json_end].strip()
+        else:
+            json_content = ""
+
+        if json_content:
+            if task_mode in ["long_acc", "clue_acc"]:
+                # Quote bare option letters so the payload parses as JSON
+                json_content = re.sub(r"(?<=:\s)([A-Za-z_]\w*)", r'"\1"', json_content)
+
+            try:
+                model_result = json.loads(json_content)["result"]
+
+                if task_mode in ["long_acc", "clue_acc"]:
+                    result = 1 if right_answer == model_result else 0
+                elif task_mode == "miou":
+                    if not isinstance(model_result, list):
+                        return -1
+                    if not isinstance(model_result[0], list):
+                        model_result = [model_result]
+
+                    # Intervals given as fractions of the video are scaled to seconds
+                    need_duration = all(interval[0] <= 1 and interval[1] <= 1 for interval in model_result)
+
+                    if need_duration:
+                        model_result = [[interval[0] * duration, interval[1] * duration] for interval in model_result]
+
+                    right_answer = eval(right_answer)
+
+                    result = calculate_intervals_iou(right_answer, model_result)
+
+            except Exception as e:
+                print(f"Error in parsing JSON: {e}, {json_content}")
+
+    if result == -1:
+        if task_mode in ["long_acc", "clue_acc"]:
+            # Fall back to any standalone uppercase letter A-H as the answer
+            matches = re.findall(r"\b[A-H]\b", response)
+            if matches:
+                result = 1 if right_answer in matches else 0
+        elif task_mode == "miou":
+            # Fall back to pairing up all real numbers in the response
+            numbers = re.findall(r"-?\d+\.?\d*", response)
+            if len(numbers) < 2:
+                result = -1
+            else:
+                if len(numbers) % 2 != 0:
+                    numbers = numbers[:-1]
+                model_result = [[float(numbers[i]), float(numbers[i + 1])] for i in range(0, len(numbers), 2)]
+
+                if isinstance(right_answer, str):
+                    right_answer = eval(right_answer)
+
+                result = calculate_intervals_iou(right_answer, model_result)
+
+    return result
+
+
+def get_timestampes(frame_indices, fps):
+    seconds = list(map(lambda x: str(round(x / fps, 4)), frame_indices))
+    timestamps = ", ".join(seconds)
+    return "A total of {frame_num} frames are sampled. Their corresponding timestamps are:\n\n{timestamps}\n\n".format(
+        frame_num=len(frame_indices), timestamps=timestamps
+    )
+
+
+def post_process_open(response):
+    model_result = -1
+
+    if response and response != FAIL_MSG:
+        json_start = response.find("```json")
+        json_end = response.find("```", json_start + len("```json"))
+
+        # Extract the JSON payload if the fence was found
+        if json_start != -1 and json_end != -1:
+            json_content = response[json_start + len("```json"):json_end].strip()
+        else:
+            json_content = ""
+
+        if json_content:
+            try:
+                model_result = json.loads(json_content)["result"]
+            except Exception as e:
+                print(f"Error in parsing JSON: {e}, {json_content}")
+
+    # Without a parsable payload, fall back to the raw response
+    if model_result == -1:
+        model_result = response
+
+    return model_result
+
+
+def post_process_eval_open(response, step):
+
+    model_result = -1
+
+    if response and response != FAIL_MSG:
+
+        json_start = response.find("```json")
+        json_end = response.find("```", json_start + len("```json"))
+
+        if json_start != -1 and json_end != -1:
+            json_content = response[json_start + len("```json"):json_end].strip()
+        else:
+            json_content = ""
+
+        if json_content:
+            try:
+                model_result = json.loads(json_content)["result"]
+            except Exception as e:
+                print(f"Error in parsing JSON: {e}, {json_content}")
+                return -1
+
+    # Fall back to the first bare digit: 0/1/2 for step 1, 0/1 for step 2
+    if model_result == -1:
+        if step == 1:
+            match = re.search(r"[012]", response)
+            if match:
+                model_result = int(match.group())
+        else:
+            match = re.search(r"[01]", response)
+            if match:
+                model_result = int(match.group())
+
+    return model_result
+
+
+def eval_open_first(model, line):
+    # Step-1 judge: text-only comparison of prediction vs. ground truth
+    user_prompt = ""
+    user_prompt += f"Question: {line['question']}\n\n"
+    user_prompt += f"The ground truth answer is '{line['answer']}'\n\n"
+    user_prompt += f"The model's prediction is '{line['model_result']}'\n\n"
+
+    result = model.generate(user_prompt)
+
+    return result
+
+
+def save_step_1_steps(data, step_1_results):
+    # Parse every step-1 judge response into -1 / 0 / 1 / 2
+    data["step_1_result"] = data["qid"].map(lambda x: post_process_eval_open(step_1_results[x], 1))
+
+    # Verdicts of -1 / 0 / 1 are final; only 2 ("needs visual check") moves on
+    mask = data["step_1_result"].isin([-1, 0, 1])
+    data.loc[mask, "step_2_result"] = data.loc[mask, "step_1_result"]
+    data.loc[mask, "score"] = data.loc[mask, "step_1_result"]
+
+    return data
+
+
+def eval_open_second(model, line, frame_paths):
+    # Step-2 judge: the same comparison, now with clue-interval frames attached
+    user_prompt = ""
+    user_prompt += f"Question: {line['question']}\n\n"
+    user_prompt += f"The model's prediction is '{line['model_result']}'\n\n"
+
+    result = model.generate([user_prompt] + frame_paths)
+
+    return result
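+
+# Minimal usage sketch for the judge helpers above (illustrative only; the
+# judge object and the `line` record mirror what evaluate() passes in):
+#   judge = build_judge(system_prompt=sys_prompt_open_eval_step_1, model="gpt-4o-0806")
+#   raw = eval_open_first(judge, line)             # text-only comparison
+#   verdict = post_process_eval_open(raw, step=1)  # -> -1 / 0 / 1 / 2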
data["qid"].map(lambda x: post_process_eval_open(step_1_results[x], 2)) + + return data + + +def clue_frame_paths(clue_frame_root, qid, num_frames=8): + frame_root = osp.join(clue_frame_root, str(qid)) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + +def save_clue_video_frames(data_root, clue_frame_root, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is str: + uid = str(uid) + + vid_path = osp.join(data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + + if clue_intervals is not None: + # 1. 合并重叠区间 + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + # 2. 基于clue_intervals均匀抽帧 + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = clue_frame_paths(clue_frame_root, uid, len(indices)) + + # 保存帧 + flag = np.all([osp.exists(p) for p in frame_paths]) + if not flag: + images = [vid[i].asnumpy() for i in indices] + images = [Image.fromarray(arr) for arr in images] + for im, pth in zip(images, frame_paths): + if not osp.exists(pth): + im.save(pth) + + return frame_paths, indices, vid_fps + + +def get_chunk_number(filename): + try: + num = filename.split("chunk_")[1].split(".zip")[0] + return int(num) + except: + return float('inf') + + +def unzip_hf_zip(pth): + + import zipfile + + target_dir = pth + + if os.path.exists(f"{target_dir}/cg_videos_720p") and os.path.exists(f"{target_dir}/cg_subtitles")\ + and os.path.exists(f"{target_dir}/cg_clue_videos"): + print("all exists") + return + + video_zip_files = [ + os.path.join(target_dir, file) + for file in os.listdir(target_dir) + if file.endswith(".zip") and file.startswith("video") + ] + + video_zip_files = sorted(video_zip_files, key=lambda x: get_chunk_number(os.path.basename(x))) + + videos_temp_zip = os.path.join(target_dir, "videos_merged.zip") + + print("Merging video files ...") + + with open(videos_temp_zip, "wb") as outfile: + for video_zip_file in tqdm(video_zip_files, desc="Merging videos"): + with open(video_zip_file, "rb") as infile: + outfile.write(infile.read()) + + print("Extracting video files...") + + try: + with zipfile.ZipFile(videos_temp_zip, "r") as zip_ref: + + total_files = len(zip_ref.namelist()) + + for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files): + zip_ref.extract(file, target_dir) + + print(f"Successfully extracted to {target_dir}") + except Exception as e: + print(f"Error during extraction: {e}") + finally: + + if os.path.exists(videos_temp_zip): + os.remove(videos_temp_zip) + print("Cleaned up temporary video file") + + clue_video_zip_files = [ + os.path.join(target_dir, file) + for file in os.listdir(target_dir) + if file.endswith(".zip") and file.startswith("clue_video") + ] + + clue_video_zip_files = sorted(clue_video_zip_files, key=lambda x: get_chunk_number(os.path.basename(x))) + + clue_videos_temp_zip = os.path.join(target_dir, "clue_videos_merged.zip") + + print("Merging clue video files ...") + + with open(clue_videos_temp_zip, "wb") as outfile: + for clue_video_zip_file in tqdm(clue_video_zip_files, desc="Merging clue_videos"): + with open(clue_video_zip_file, "rb") as infile: + outfile.write(infile.read()) + + print("Extracting clue video files...") + + try: + with zipfile.ZipFile(clue_videos_temp_zip, "r") as zip_ref: + + total_files = len(zip_ref.namelist()) + + for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files): + 
zip_ref.extract(file, target_dir) + + print(f"Successfully extracted to {target_dir}") + except Exception as e: + print(f"Error during extraction: {e}") + finally: + + if os.path.exists(clue_videos_temp_zip): + os.remove(clue_videos_temp_zip) + print("Cleaned up temporary clue video file") + + print("Extracting subtitle files ...") + + subtitles_zip = os.path.join(target_dir, "subtitles.zip") + + try: + with zipfile.ZipFile(subtitles_zip, "r") as zip_ref: + + total_files = len(zip_ref.namelist()) + + for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files): + zip_ref.extract(file, target_dir) + + print(f"Successfully extracted to {target_dir}") + except Exception as e: + print(f"Error during extraction: {e}") diff --git a/vlmeval/dataset/video_dataset_config.py b/vlmeval/dataset/video_dataset_config.py index 0937e8fd..8e195445 100644 --- a/vlmeval/dataset/video_dataset_config.py +++ b/vlmeval/dataset/video_dataset_config.py @@ -43,11 +43,40 @@ 'TempCompass_0.5fps': partial(TempCompass, dataset='TempCompass', fps=0.5) } +cgbench_dataset = { + 'CGBench_MCQ_Grounding_Mini_8frame_subs_subt': partial( + CGBench_MCQ_Grounding_Mini, + dataset='CG-Bench_MCQ_Grounding_Mini', + nframe=8, + use_subtitle=True, + use_subtitle_time=True + ), + 'CGBench_OpenEnded_Mini_8frame_subs_subt_ft': partial( + CGBench_OpenEnded_Mini, + dataset='CG-Bench_OpenEnded_Mini', + nframe=8, + use_subtitle=True, + use_subtitle_time=True, + use_frame_time=True + ), + 'CGBench_MCQ_Grounding_32frame_subs': partial( + CGBench_MCQ_Grounding, + dataset='CG-Bench_MCQ_Grounding', + nframe=32, + use_subtitle=True + ), + 'CGBench_OpenEnded_8frame': partial( + CGBench_OpenEnded, + dataset='CG-Bench_OpenEnded', + nframe=8 + ), +} + supported_video_datasets = {} dataset_groups = [ mmbench_video_dataset, mvbench_dataset, videomme_dataset, longvideobench_dataset, - mlvu_dataset, tempcompass_dataset + mlvu_dataset, tempcompass_dataset, cgbench_dataset ] for grp in dataset_groups:
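
# Usage note (assumed standard VLMEvalKit invocation; the model name below is
# a placeholder): once registered above, each config key can be run directly,
#   python run.py --data CGBench_MCQ_Grounding_Mini_8frame_subs_subt --model GPT4o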