diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index a1bab65c..ce98d248 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -30,6 +30,7 @@ from .longvideobench import LongVideoBench from .video_concat_dataset import ConcatVideoDataset from .mmgenbench import MMGenBench +from .cgbench import CGBench_MCQ_Grounding_Mini, CGBench_OpenEnded_Mini, CGBench_MCQ_Grounding, CGBench_OpenEnded from .miabench import MIABench from .cmmmu import CMMMU @@ -139,7 +140,8 @@ def evaluate(self, eval_file, **judge_kwargs): VIDEO_DATASET = [ MMBenchVideo, VideoMME, MVBench, MVBench_MP4, LongVideoBench, MLVU, MLVU_MCQ, MLVU_OpenEnded, - TempCompass, TempCompass_MCQ, TempCompass_Captioning, TempCompass_YorN + TempCompass, TempCompass_MCQ, TempCompass_Captioning, TempCompass_YorN, + CGBench_MCQ_Grounding_Mini, CGBench_OpenEnded_Mini, CGBench_MCQ_Grounding, CGBench_OpenEnded ] TEXT_DATASET = [ diff --git a/vlmeval/dataset/cgbench.py b/vlmeval/dataset/cgbench.py new file mode 100644 index 00000000..172cdbb3 --- /dev/null +++ b/vlmeval/dataset/cgbench.py @@ -0,0 +1,1760 @@ +from huggingface_hub import snapshot_download +from ..smp import * +from .video_base import VideoBaseDataset +from .utils import build_judge, DEBUG_MESSAGE +from .utils.cgbench import * +from ..utils import track_progress_rich + + +class CGBench_MCQ_Grounding_Mini(VideoBaseDataset): + + dataset = "CG-Bench_MCQ_Grounding_Mini" + + TYPE = "Video-MCQ-Grounding" + + MD5 = "54ed3e90a51a6fb375c92b319a715f72" + + SYS = { + "long_acc": ( + "You will be provided with sampled frames from a video, along with a " + "multiple-choice question that includes a question and several answer options.\n" + "Your task is to analyze the provided frames, infer the most plausible " + "answer based on the visual information.\n" + "If the video does not provide enough information, infer the answer based " + "on the options available and still provide a result. " + "Therefore, In all cases, an answer must be given.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": "option"}\n```\n\n' + 'The "option" is the uppercase letter corresponding to your answer.\n\n' + ), + "clue_acc": ( + "You will be provided with sampled frames from a video, along with a " + "multiple-choice question that includes a question and several answer options.\n" + "Your task is to analyze the provided frames, infer the most plausible " + "answer based on the visual information.\n" + "If the video does not provide enough information, infer the answer based " + "on the options available and still provide a result. 
" + "Therefore, In all cases, an answer must be given.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": "option"}\n```\n\n' + "The 'option' is the uppercase letter corresponding to your answer.\n\n" + ), + "miou": ( + "You will be provided with uniformly sampled frames from a video and their " + "timestamps, along with a multiple-choice question that includes a question " + "and several answer options.\n" + "Your task is to determine in which intervals the 'clue intervals' exist " + "that contain visual information needed to answer the question.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n' + "In this output format, each 'start' and 'end' represents the beginning and " + "end of an interval in seconds where relevant clues can be found.\n" + "You must provide at least one interval and at most five intervals. " + "Intervals exceeding five will NOT be considered valid.\n" + ), + "miou_wo_frame_time": ( + "You will be provided with uniformly sampled frames from a video, along " + "with a multiple-choice question that includes a question and several " + "answer options.\n" + "Your task is to determine in which intervals the 'clue intervals' exist " + "that contain visual information needed to answer the question.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n' + 'In this output format, each "start" and "end" represents the start and ' + "end of the video where the relevant clue can be found in the form of a " + "floating point number between 0 and 1, where 0 represents the start time " + "of the video and 1 represents the end time of the video.\n" + "You must provide at least one interval and at most five intervals. 
" + "Intervals exceeding five will NOT be considered valid.\n" + ), + } + + def __init__( + self, + dataset="CG-Bench_MCQ_Grounding_Mini", + use_subtitle=False, + use_subtitle_time=False, + use_frame_time=False, + nframe=0, + fps=-1, + ): + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + self.use_subtitle = use_subtitle + self.use_subtitle_time = use_subtitle_time + self.use_frame_time = use_frame_time + self.dataset_name = dataset + lmu_root = LMUDataRoot() + self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset) + + @classmethod + def supported_datasets(cls): + return ["CG-Bench_MCQ_Grounding_Mini"] + + def clue_frame_paths(self, qid, num_frames=8): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + def clue_frame_paths_fps(self, qid, num_frames=8, fps=-1): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl_fps.format(i, num_frames, fps)) for i in range(1, num_frames + 1)] + + def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False): + + subtitles = [] + + srt_path = osp.join(self.data_root, subtitle_path) + assert osp.exists(srt_path) + import pysubs2 + + subs = pysubs2.load(srt_path, encoding="utf-8") + if not frame_indices: + for sub in subs: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + else: + for selected_frame_id in frame_indices: + cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id) + for sub in subs: + if sub.start < cur_time and sub.end > cur_time: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + + if subtitles: + subtitles_str = '\n'.join(subtitles) + return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n" + else: + return "" + + def prepare_dataset(self, dataset_name="CG-Bench_MCQ_Grounding_Mini", repo_id="CG-Bench/CG-Bench"): + + def check_integrity(pth): + data_file = osp.join(pth, f"{dataset_name}.tsv") + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + data = load(data_file) + for video_pth in data["video"]: + if not osp.exists(osp.join(pth, video_pth)): + return False + + return True + + cache_path = get_cache_path(repo_id) + + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + + def generate_tsv(pth): + + tsv_file = osp.join(pth, f"{dataset_name}.tsv") + + task_modes = ["long_acc", "clue_acc", "miou"] + all_data = [] + for task_mode in task_modes: + with open(osp.join(pth, "cgbench_mini.json"), "r") as f: + data_file = pd.DataFrame(json.load(f)) + + data_file = data_file.assign(index=range(len(data_file))) + data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4") + data_file["subtitle_path"] = data_file["video_uid"].apply( + lambda x: ( + f"cg_subtitles/{x}.srt" + if osp.exists(osp.join(dataset_path, f"cg_subtitles/{x}.srt")) + else "" + ) + ) + + 
data_file["clue_video_path"] = "" + + if task_mode in ["clue_acc"]: + data_file["clue_video_path"] = data_file["clue_video_path"] = data_file.apply( + lambda row: f"cg_clue_videos/{row['qid']}.mp4", axis=1 + ) + + data_file["task_mode"] = task_mode + + if task_mode in ["clue_acc", "long_acc"]: + data_file["answer"] = data_file["right_answer"] + + if task_mode == "miou": + data_file["answer"] = data_file["clue_intervals"] + + if task_mode in ["long_acc", "miou"]: + data_file["clue_intervals"] = "" + + data_file = data_file[ + [ + "index", + "video_uid", + "video", + "duration", + "domain", + "choices", + "sub_category", + "subtitle_path", + "question", + "answer", + "task_mode", + "clue_intervals", + "qid", + "clue_video_path", + ] + ] + + all_data.append(data_file) + + final_data = pd.concat(all_data, ignore_index=True) + final_data["index"] = range(len(final_data)) + final_data.to_csv(tsv_file, sep="\t", index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset") + + unzip_hf_zip(dataset_path) + generate_tsv(dataset_path) + + tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv") + + return dict(data_file=tsv_file, root=dataset_path) + + def build_prompt(self, line, video_llm): + + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + task_mode = line["task_mode"] + + message = [] + + origin_use_subtitle_time = self.use_subtitle_time + + try: + if task_mode in ["long_acc", "clue_acc"]: + system_prompt = self.SYS[task_mode] + elif task_mode == "miou": + if self.use_frame_time and not video_llm: + system_prompt = self.SYS[task_mode] + else: + system_prompt = self.SYS["miou_wo_frame_time"] + if self.use_subtitle_time is True: + self.use_subtitle_time = False + + user_prompt = "" + + if task_mode in ["long_acc", "miou"]: + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + elif task_mode == "clue_acc": + clue_video_path = line["clue_video_path"] + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, clue_video_path))) + print(message) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], 
num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + if self.nframe > 32: + self.nframe = 32 + print("The maximum number of frames is 32 when evaluating clue-based mcq in CG-Bench !") + + clue_intervals = eval(line["clue_intervals"]) + + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["qid"], clue_intervals=clue_intervals, num_frames=self.nframe, fps=self.fps + ) + + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + question = line["question"] + user_prompt += f"Question: {question}\n\n" + + choices = eval(line["choices"]) + labels = [chr(ord("A") + i) for i in range(len(choices))] + user_prompt += "\n".join([f"{label}:{value}" for label, value in zip(labels, choices)]) + "\n\n" + + message.append(dict(type="text", value=system_prompt + user_prompt)) + + return message + + finally: + # Ensure that `use_subtitle_time` is always restored to its original value + self.use_subtitle_time = origin_use_subtitle_time + + def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is not str: + uid = str(uid) + + vid_path = osp.join(self.data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + n_frames = len(vid) + + if clue_intervals is not None: + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = self.clue_frame_paths(uid, len(indices)) + + elif fps > 0: + frame_indices = [] + for start, end in merged_intervals: + start_frame = int(start * vid_fps) + end_frame = int(end * vid_fps) + step = vid_fps / fps + interval_indices = [ + int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step)) + ] + frame_indices.extend(interval_indices) + + if len(frame_indices) < 32: + indices = sample_frames_clue_average(merged_intervals, 32, vid_fps) + else: + indices = frame_indices + frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps) + + else: + if num_frames > 0 and fps < 0: + step_size = len(vid) / (num_frames + 1) + indices = [int(i * step_size) for i in range(1, num_frames + 1)] + + frame_paths = self.frame_paths(uid) + elif fps > 0: + total_duration = n_frames / vid_fps + required_frames = int(total_duration * fps) + step_size = vid_fps / fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(uid, len(indices)) + + # Save and validate frames + valid_paths = [] + valid_indices = [] + + if not np.all([osp.exists(p) for p in frame_paths]): + images = [vid[i].asnumpy() for i in indices] + for i, (img_array, path) in enumerate(zip(images, frame_paths)): + if osp.exists(path): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + try: + img = Image.fromarray(img_array) + img.save(path) + img.verify() + valid_paths.append(path) + 
valid_indices.append(indices[i])
+                    except Exception:
+                        continue
+        else:
+            for i, path in enumerate(frame_paths):
+                try:
+                    with Image.open(path) as img:
+                        img.verify()
+                        valid_paths.append(path)
+                        valid_indices.append(indices[i])
+                except Exception:
+                    continue
+
+        return valid_paths, valid_indices, vid_fps
+
+    def evaluate(self, eval_file, **judge_kwargs):
+
+        assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
+
+        tgt_file = eval_file.replace(".xlsx", "_rating.json")
+        score_file = eval_file.replace(".xlsx", "_score.xlsx")
+
+        data = load(eval_file)
+
+        data_un = data[~pd.isna(data["prediction"])]
+        data_pred_na = data[pd.isna(data["prediction"])]
+
+        data_pred_na["score"] = -1
+
+        data_un["score"] = data_un.apply(
+            lambda row: post_process(
+                response=row["prediction"],
+                right_answer=row["answer"],
+                task_mode=row["task_mode"],
+                duration=row["duration"],
+            ),
+            axis=1,
+        )
+
+        data = pd.concat([data_pred_na, data_un])
+
+        rejected_count = (data["score"] == -1).sum()
+
+        print(
+            f"Among {len(data)} questions, "
+            f"failed to obtain prediction for {len(data_pred_na)} questions, "
+            f"failed to obtain the score for {rejected_count - len(data_pred_na)} questions. "
+            f"Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating."
+        )
+
+        dump(data, score_file)
+
+        rating = get_dimention_rating_mcq_grouding(score_file)
+
+        dump(rating, tgt_file)
+
+        return rating
+
+
+# During evaluation, step 2 only needs [prompt] + image_paths.
+class CGBench_OpenEnded_Mini(VideoBaseDataset):
+
+    TYPE = "Video-OpenEnded"
+
+    dataset = "CG-Bench_OpenEnded_Mini"
+
+    MD5 = "9175791b11afdfa305fdb3e525b7a4ee"
+
+    SYS = (
+        "You will be provided with sampled frames from a video, along with a "
+        "question.\n"
+        "Your task is to analyze the provided frames and infer the most plausible "
+        "answer based on the visual information.\n"
+        "If the visual information is ambiguous or insufficient, use the available "
+        "context to reason your answer.\n"
+        "Only output the answer in the following format:\n\n"
+        '```json\n{"result": "answer"}\n```\n\n'
+        'The "answer" can be a word, phrase, or sentence that directly responds to '
+        "the question.\n\n"
+    )
+
+    def __init__(
+        self,
+        dataset="CG-Bench_OpenEnded_Mini",
+        use_subtitle=False,
+        use_subtitle_time=False,
+        use_frame_time=False,
+        nframe=0,
+        fps=-1,
+    ):
+        super().__init__(dataset=dataset, nframe=nframe, fps=fps)
+        self.use_subtitle = use_subtitle
+        self.use_subtitle_time = use_subtitle_time
+        self.use_frame_time = use_frame_time
+        self.dataset_name = dataset
+        lmu_root = LMUDataRoot()
+        self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset)
+
+    @classmethod
+    def supported_datasets(cls):
+        return ["CG-Bench_OpenEnded_Mini"]
+
+    def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False):
+
+        subtitles = []
+
+        srt_path = osp.join(self.data_root, subtitle_path)
+        assert osp.exists(srt_path)
+        import pysubs2
+
+        subs = pysubs2.load(srt_path, encoding="utf-8")
+        if not frame_indices:
+            for sub in subs:
+                sub_text = sub.text.replace("\\N", " ")
+                if sub_time:
+                    start_time = milliseconds_to_seconds(sub.start)
+                    end_time = milliseconds_to_seconds(sub.end)
+                    sub_text = f"[{start_time}, {end_time}] {sub_text}"
+                if sub_text.strip() and sub_text not in subtitles:
+                    subtitles.append(sub_text)
+        else:
+            for selected_frame_id in frame_indices:
+                cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id)
+                for sub in subs:
+                    if sub.start < cur_time and sub.end > cur_time:
+                        sub_text =
sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + + if subtitles: + subtitles_str = '\n'.join(subtitles) + return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n" + else: + return "" + + def prepare_dataset(self, dataset_name="CG-Bench_OpenEnded_Mini", repo_id="CG-Bench/CG-Bench"): + + def check_integrity(pth): + data_file = osp.join(pth, f"{dataset_name}.tsv") + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + data = load(data_file) + for video_pth in data["video"]: + if not osp.exists(osp.join(pth, video_pth)): + return False + + return True + + cache_path = get_cache_path(repo_id) + + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + + def generate_tsv(pth): + + tsv_file = osp.join(pth, f"{dataset_name}.tsv") + + with open(osp.join(pth, "cgbench_mini.json"), "r") as f: + data_file = pd.DataFrame(json.load(f)) + + data_file = data_file.assign(index=range(len(data_file))) + data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4") + data_file["subtitle_path"] = data_file["video_uid"].apply( + lambda x: f"cg_subtitles/{x}.srt" if osp.exists(osp.join(pth, f"cg_subtitles/{x}.srt")) else "" + ) + + data_file = data_file[ + [ + "index", + "video_uid", + "video", + "duration", + "domain", + "sub_category", + "subtitle_path", + "question", + "answer", + "clue_intervals", + "qid", + ] + ] + + data_file.to_csv(tsv_file, sep="\t", index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset") + + unzip_hf_zip(dataset_path) + generate_tsv(dataset_path) + + tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv") + + return dict(data_file=tsv_file, root=dataset_path) + + def build_prompt(self, line, video_llm): + + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + message = [] + + sys_prompt = self.SYS + + user_prompt = "" + + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + question = line["question"] + user_prompt += f"Question: 
{question}\n\n" + + message.append(dict(type="text", value=sys_prompt + user_prompt)) + + return message + + def clue_frame_paths(self, qid, num_frames=8): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is not str: + uid = str(uid) + + vid_path = osp.join(self.data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + n_frames = len(vid) + + if clue_intervals is not None: + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = self.clue_frame_paths(uid, len(indices)) + + elif fps > 0: + frame_indices = [] + for start, end in merged_intervals: + start_frame = int(start * vid_fps) + end_frame = int(end * vid_fps) + step = vid_fps / fps + interval_indices = [ + int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step)) + ] + frame_indices.extend(interval_indices) + + if len(frame_indices) < 32: + indices = sample_frames_clue_average(merged_intervals, 32, vid_fps) + else: + indices = frame_indices + frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps) + + else: + if num_frames > 0 and fps < 0: + step_size = len(vid) / (num_frames + 1) + indices = [int(i * step_size) for i in range(1, num_frames + 1)] + frame_paths = self.frame_paths(uid) + elif fps > 0: + total_duration = n_frames / vid_fps + required_frames = int(total_duration * fps) + step_size = vid_fps / fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(uid, len(indices)) + + valid_paths = [] + valid_indices = [] + + if not np.all([osp.exists(p) for p in frame_paths]): + images = [vid[i].asnumpy() for i in indices] + for i, (img_array, path) in enumerate(zip(images, frame_paths)): + if osp.exists(path): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + try: + img = Image.fromarray(img_array) + img.save(path) + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + for i, path in enumerate(frame_paths): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + + return valid_paths, valid_indices, vid_fps + + def evaluate(self, eval_file, **judge_kwargs): + + from .utils.cgbench import get_dimention_rating_open_ended, post_process_open + + assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + + tgt_file = eval_file.replace(".xlsx", "_rating.json") + score_file = eval_file.replace(".xlsx", "_score.xlsx") + step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl") + step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl") + + data = load(eval_file) + + data_pred_no_na = data[~pd.isna(data["prediction"])] + data_pred_na = data[pd.isna(data["prediction"])] + + data_pred_na["model_result"] = -1 + data_pred_na["step_1_result"] = -1 + data_pred_na["step_2_result"] = -1 + data_pred_na["score"] = -1 + + data_pred_no_na["model_result"] = data_pred_no_na.apply( + lambda row: post_process_open( + response=row["prediction"], + ), + axis=1, + ) + + data_no_model_result = 
data_pred_no_na[data_pred_no_na["model_result"] == -1]
+        data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]
+
+        if judge_kwargs.get("model", None) != "gpt-4o-0806":
+            judge_kwargs["model"] = "gpt-4o-0806"
+            print("The judge model in cg-bench is gpt-4o-0806!")
+
+        model_step_1 = build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs)
+        nproc = judge_kwargs.pop("nproc", 32)
+
+        lines_step_1 = data_step_1.to_dict("records")
+        tups_step_1 = [(model_step_1, line) for line in lines_step_1]
+
+        keys_step_1 = [line["qid"] for line in lines_step_1]
+
+        ans = {}
+        if osp.exists(step_1_tmp_file):
+            ans = load(step_1_tmp_file)
+            tups_step_1 = [x for x, i in zip(tups_step_1, keys_step_1) if i not in ans]
+            keys_step_1 = [i for i in keys_step_1 if i not in ans]
+
+        _ = track_progress_rich(
+            eval_open_first,
+            tups_step_1,
+            nproc=nproc,
+            keys=keys_step_1,
+            save=step_1_tmp_file,
+        )
+
+        step_1_results = load(step_1_tmp_file)
+        data_step_1 = save_step_1_steps(data_step_1, step_1_results)  # -1, 0, 1, 2
+
+        data_no_step_1_results = data_step_1[data_step_1["step_1_result"] == -1]
+        data_step_1_over = data_step_1[data_step_1["step_1_result"].isin([0, 1])]
+        data_step_2 = data_step_1[data_step_1["step_1_result"] == 2]
+
+        model_step_2 = build_judge(system_prompt=sys_prompt_open_eval_step_2, **judge_kwargs)
+
+        lines_step_2 = data_step_2.to_dict("records")
+
+        tups_step_2 = []
+
+        for line in tqdm(lines_step_2):
+            clue_intervals = eval(line["clue_intervals"])
+            lmu_root = LMUDataRoot()
+            clue_frame_root = osp.join(lmu_root, "clue_images", self.dataset)
+            data_root = self.data_root
+            frame_paths, _, _ = save_clue_video_frames(
+                data_root,
+                clue_frame_root,
+                video=line["video"],
+                uid=line["qid"],
+                clue_intervals=clue_intervals,
+                num_frames=32,
+            )
+            tups_step_2.append((model_step_2, line, frame_paths))
+
+        keys_step_2 = [line["qid"] for line in lines_step_2]
+
+        ans = {}
+        if osp.exists(step_2_tmp_file):
+            ans = load(step_2_tmp_file)
+            tups_step_2 = [x for x, i in zip(tups_step_2, keys_step_2) if i not in ans]
+            keys_step_2 = [i for i in keys_step_2 if i not in ans]
+
+        _ = track_progress_rich(
+            eval_open_second,
+            tups_step_2,
+            nproc=nproc,
+            keys=keys_step_2,
+            save=step_2_tmp_file,
+        )
+
+        step_2_results = load(step_2_tmp_file)
+        data_step_2 = save_step_2_steps(data_step_2, step_2_results)
+
+        data_no_step_2_results = data_step_2[data_step_2["score"] == -1]
+        data_step_2_over = data_step_2[data_step_2["score"].isin([0, 1])]
+
+        data = pd.concat(
+            [
+                data_pred_na,
+                data_no_model_result,
+                data_no_step_1_results,
+                data_step_1_over,
+                data_no_step_2_results,
+                data_step_2_over,
+            ]
+        )
+
+        dump(data, score_file)
+
+        rating = get_dimention_rating_open_ended(score_file)
+
+        dump(rating, tgt_file)
+
+        return rating
+
+
+class CGBench_MCQ_Grounding(VideoBaseDataset):
+
+    TYPE = "Video-MCQ-Grounding"
+
+    MD5 = "eaead3d978a689269fefce4ae29c86df"
+
+    SYS = {
+        "long_acc": (
+            "You will be provided with sampled frames from a video, along with a "
+            "multiple-choice question that includes a question and several answer options.\n"
+            "Your task is to analyze the provided frames, infer the most plausible "
+            "answer based on the visual information.\n"
+            "If the video does not provide enough information, infer the answer based "
+            "on the options available and still provide a result. 
" + "Therefore, In all cases, an answer must be given.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": "option"}\n```\n\n' + 'The "option" is the uppercase letter corresponding to your answer.\n\n' + ), + "clue_acc": ( + "You will be provided with sampled frames from a video, along with a " + "multiple-choice question that includes a question and several answer options.\n" + "Your task is to analyze the provided frames, infer the most plausible " + "answer based on the visual information.\n" + "If the video does not provide enough information, infer the answer based " + "on the options available and still provide a result. " + "Therefore, In all cases, an answer must be given.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": "option"}\n```\n\n' + "The 'option' is the uppercase letter corresponding to your answer.\n\n" + ), + "miou": ( + "You will be provided with uniformly sampled frames from a video and their " + "timestamps, along with a multiple-choice question that includes a question " + "and several answer options.\n" + "Your task is to determine in which intervals the 'clue intervals' exist " + "that contain visual information needed to answer the question.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n' + "In this output format, each 'start' and 'end' represents the beginning and " + "end of an interval in seconds where relevant clues can be found.\n" + "You must provide at least one interval and at most five intervals. " + "Intervals exceeding five will NOT be considered valid.\n" + ), + "miou_wo_frame_time": ( + "You will be provided with uniformly sampled frames from a video, along " + "with a multiple-choice question that includes a question and several " + "answer options.\n" + "Your task is to determine in which intervals the 'clue intervals' exist " + "that contain visual information needed to answer the question.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n' + 'In this output format, each "start" and "end" represents the start and ' + "end of the video where the relevant clue can be found in the form of a " + "floating point number between 0 and 1, where 0 represents the start time " + "of the video and 1 represents the end time of the video.\n" + "You must provide at least one interval and at most five intervals. 
" + "Intervals exceeding five will NOT be considered valid.\n" + ), + } + + def __init__( + self, + dataset="CG-Bench_MCQ_Grounding", + use_subtitle=False, + use_subtitle_time=False, + use_frame_time=False, + nframe=0, + fps=-1, + ): + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + self.use_subtitle = use_subtitle + self.use_subtitle_time = use_subtitle_time + self.use_frame_time = use_frame_time + self.dataset_name = dataset + lmu_root = LMUDataRoot() + self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset) + + @classmethod + def supported_datasets(cls): + return ["CG-Bench_MCQ_Grounding"] + + def clue_frame_paths(self, qid, num_frames=8): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + def clue_frame_paths_fps(self, qid, num_frames=8, fps=-1): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl_fps.format(i, num_frames, fps)) for i in range(1, num_frames + 1)] + + def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False): + + subtitles = [] + + srt_path = osp.join(self.data_root, subtitle_path) + assert osp.exists(srt_path) + import pysubs2 + + subs = pysubs2.load(srt_path, encoding="utf-8") + if not frame_indices: + for sub in subs: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + else: + for selected_frame_id in frame_indices: + cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id) + for sub in subs: + if sub.start < cur_time and sub.end > cur_time: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + + if subtitles: + subtitles_str = '\n'.join(subtitles) + return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n" + else: + return "" + + def prepare_dataset(self, dataset_name="CG-Bench_MCQ_Grounding", repo_id="CG-Bench/CG-Bench"): + + def check_integrity(pth): + data_file = osp.join(pth, f"{dataset_name}.tsv") + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + data = load(data_file) + for video_pth in data["video"]: + if not osp.exists(osp.join(pth, video_pth)): + return False + + for clue_video_pth in data["clue_video_path"]: + if clue_video_pth and not (isinstance(clue_video_pth, float) and np.isnan(clue_video_pth)): + if not osp.exists(osp.join(pth, clue_video_pth)): + return False + + return True + + cache_path = get_cache_path(repo_id) + + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + + def generate_tsv(pth): + + tsv_file = osp.join(pth, f"{dataset_name}.tsv") + + task_modes = ["long_acc", "clue_acc", "miou"] + all_data = [] + for task_mode in task_modes: + with open(osp.join(pth, "cgbench.json"), "r") as f: + data_file = pd.DataFrame(json.load(f)) + + data_file = data_file.assign(index=range(len(data_file))) + data_file["video"] = data_file["video_uid"].apply(lambda x: 
f"cg_videos_720p/{x}.mp4") + data_file["subtitle_path"] = data_file["video_uid"].apply( + lambda x: ( + f"cg_subtitles/{x}.srt" + if osp.exists(osp.join(dataset_path, f"cg_subtitles/{x}.srt")) + else "" + ) + ) + + data_file["clue_video_path"] = "" + + if task_mode in ["clue_acc"]: + data_file["clue_video_path"] = data_file["clue_video_path"] = data_file.apply( + lambda row: f"cg_clue_videos/{row['qid']}.mp4", axis=1 + ) + + data_file["task_mode"] = task_mode + + if task_mode in ["clue_acc", "long_acc"]: + data_file["answer"] = data_file["right_answer"] + + if task_mode == "miou": + data_file["answer"] = data_file["clue_intervals"] + + if task_mode in ["long_acc", "miou"]: + data_file["clue_intervals"] = "" + + data_file = data_file[ + [ + "index", + "video_uid", + "video", + "duration", + "domain", + "choices", + "sub_category", + "subtitle_path", + "question", + "answer", + "task_mode", + "clue_intervals", + "qid", + "clue_video_path", + ] + ] + + all_data.append(data_file) + + final_data = pd.concat(all_data, ignore_index=True) + final_data["index"] = range(len(final_data)) + final_data.to_csv(tsv_file, sep="\t", index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset") + + unzip_hf_zip(dataset_path) + generate_tsv(dataset_path) + + tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv") + + return dict(data_file=tsv_file, root=dataset_path) + + def build_prompt(self, line, video_llm): + + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + task_mode = line["task_mode"] + + message = [] + + origin_use_subtitle_time = self.use_subtitle_time + + try: + if task_mode in ["long_acc", "clue_acc"]: + system_prompt = self.SYS[task_mode] + elif task_mode == "miou": + if self.use_frame_time and not video_llm: + system_prompt = self.SYS[task_mode] + else: + system_prompt = self.SYS["miou_wo_frame_time"] + if self.use_subtitle_time is True: + self.use_subtitle_time = False + + user_prompt = "" + + if task_mode in ["long_acc", "miou"]: + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + elif task_mode == "clue_acc": + clue_video_path = line["clue_video_path"] + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, clue_video_path))) + 
print(message) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + if self.nframe > 32: + self.nframe = 32 + print("The maximum number of frames is 32 when evaluating clue-based mcq in CG-Bench !") + + clue_intervals = eval(line["clue_intervals"]) + + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["qid"], clue_intervals=clue_intervals, num_frames=self.nframe, fps=self.fps + ) + + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + question = line["question"] + user_prompt += f"Question: {question}\n\n" + + choices = eval(line["choices"]) + labels = [chr(ord("A") + i) for i in range(len(choices))] + user_prompt += "\n".join([f"{label}:{value}" for label, value in zip(labels, choices)]) + "\n\n" + + message.append(dict(type="text", value=system_prompt + user_prompt)) + + return message + + finally: + # Ensure that `use_subtitle_time` is always restored to its original value + self.use_subtitle_time = origin_use_subtitle_time + + def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is not str: + uid = str(uid) + + vid_path = osp.join(self.data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + n_frames = len(vid) + + if clue_intervals is not None: + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = self.clue_frame_paths(uid, len(indices)) + + elif fps > 0: + frame_indices = [] + for start, end in merged_intervals: + start_frame = int(start * vid_fps) + end_frame = int(end * vid_fps) + step = vid_fps / fps + interval_indices = [ + int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step)) + ] + frame_indices.extend(interval_indices) + + if len(frame_indices) < 32: + indices = sample_frames_clue_average(merged_intervals, 32, vid_fps) + else: + indices = frame_indices + frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps) + + else: + if num_frames > 0 and fps < 0: + step_size = len(vid) / (num_frames + 1) + indices = [int(i * step_size) for i in range(1, num_frames + 1)] + + frame_paths = self.frame_paths(uid) + elif fps > 0: + total_duration = n_frames / vid_fps + required_frames = int(total_duration * fps) + step_size = vid_fps / fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(uid, len(indices)) + + # Save and validate frames + valid_paths = [] + valid_indices = [] + + if not np.all([osp.exists(p) for p in frame_paths]): + images = [vid[i].asnumpy() for i in indices] + for i, (img_array, path) in enumerate(zip(images, frame_paths)): + if osp.exists(path): + try: + with Image.open(path) as img: 
+                            img.verify()
+                            valid_paths.append(path)
+                            valid_indices.append(indices[i])
+                    except Exception:
+                        continue
+                else:
+                    try:
+                        img = Image.fromarray(img_array)
+                        img.save(path)
+                        img.verify()
+                        valid_paths.append(path)
+                        valid_indices.append(indices[i])
+                    except Exception:
+                        continue
+        else:
+            for i, path in enumerate(frame_paths):
+                try:
+                    with Image.open(path) as img:
+                        img.verify()
+                        valid_paths.append(path)
+                        valid_indices.append(indices[i])
+                except Exception:
+                    continue
+
+        return valid_paths, valid_indices, vid_fps
+
+    def evaluate(self, eval_file, **judge_kwargs):
+
+        assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
+
+        tgt_file = eval_file.replace(".xlsx", "_rating.json")
+        score_file = eval_file.replace(".xlsx", "_score.xlsx")
+
+        data = load(eval_file)
+
+        data_un = data[~pd.isna(data["prediction"])]
+        data_pred_na = data[pd.isna(data["prediction"])]
+
+        data_pred_na["score"] = -1
+
+        data_un["score"] = data_un.apply(
+            lambda row: post_process(
+                response=row["prediction"],
+                right_answer=row["answer"],
+                task_mode=row["task_mode"],
+                duration=row["duration"],
+            ),
+            axis=1,
+        )
+
+        data = pd.concat([data_pred_na, data_un])
+
+        rejected_count = (data["score"] == -1).sum()
+
+        print(
+            f"Among {len(data)} questions, "
+            f"failed to obtain prediction for {len(data_pred_na)} questions, "
+            f"failed to obtain the score for {rejected_count - len(data_pred_na)} questions. "
+            f"Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating."
+        )
+
+        dump(data, score_file)
+
+        rating = get_dimention_rating_mcq_grouding(score_file)
+
+        dump(rating, tgt_file)
+
+        return rating
+
+
+# During evaluation, step 2 only needs [prompt] + image_paths.
+class CGBench_OpenEnded(VideoBaseDataset):
+
+    TYPE = "Video-OpenEnded"
+
+    dataset = "CG-Bench_OpenEnded"
+
+    MD5 = "796035eda0b1e916c517cdc1bc145cfc"
+
+    SYS = (
+        "You will be provided with sampled frames from a video, along with a "
+        "question.\n"
+        "Your task is to analyze the provided frames and infer the most plausible "
+        "answer based on the visual information.\n"
+        "If the visual information is ambiguous or insufficient, use the available "
+        "context to reason your answer.\n"
+        "Only output the answer in the following format:\n\n"
+        '```json\n{"result": "answer"}\n```\n\n'
+        'The "answer" can be a word, phrase, or sentence that directly responds to '
+        "the question.\n\n"
+    )
+
+    def __init__(
+        self,
+        dataset="CG-Bench_OpenEnded",
+        use_subtitle=False,
+        use_subtitle_time=False,
+        use_frame_time=False,
+        nframe=0,
+        fps=-1,
+    ):
+        super().__init__(dataset=dataset, nframe=nframe, fps=fps)
+        self.use_subtitle = use_subtitle
+        self.use_subtitle_time = use_subtitle_time
+        self.use_frame_time = use_frame_time
+        self.dataset_name = dataset
+        lmu_root = LMUDataRoot()
+        self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset)
+
+    @classmethod
+    def supported_datasets(cls):
+        return ["CG-Bench_OpenEnded"]
+
+    def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False):
+
+        subtitles = []
+
+        srt_path = osp.join(self.data_root, subtitle_path)
+        assert osp.exists(srt_path)
+        import pysubs2
+
+        subs = pysubs2.load(srt_path, encoding="utf-8")
+        if not frame_indices:
+            for sub in subs:
+                sub_text = sub.text.replace("\\N", " ")
+                if sub_time:
+                    start_time = milliseconds_to_seconds(sub.start)
+                    end_time = milliseconds_to_seconds(sub.end)
+                    sub_text = f"[{start_time}, {end_time}] {sub_text}"
+                if sub_text.strip() and sub_text not in subtitles:
+                    subtitles.append(sub_text)
else: + for selected_frame_id in frame_indices: + cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id) + for sub in subs: + if sub.start < cur_time and sub.end > cur_time: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + + if subtitles: + subtitles_str = '\n'.join(subtitles) + return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n" + else: + return "" + + def prepare_dataset(self, dataset_name="CG-Bench_OpenEnded", repo_id="CG-Bench/CG-Bench"): + + def check_integrity(pth): + data_file = osp.join(pth, f"{dataset_name}.tsv") + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + data = load(data_file) + for video_pth in data["video"]: + if not osp.exists(osp.join(pth, video_pth)): + return False + + return True + + cache_path = get_cache_path(repo_id) + + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + + def generate_tsv(pth): + + tsv_file = osp.join(pth, f"{dataset_name}.tsv") + + with open(osp.join(pth, "cgbench.json"), "r") as f: + data_file = pd.DataFrame(json.load(f)) + + data_file = data_file.assign(index=range(len(data_file))) + data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4") + data_file["subtitle_path"] = data_file["video_uid"].apply( + lambda x: f"cg_subtitles/{x}.srt" if osp.exists(osp.join(pth, f"cg_subtitles/{x}.srt")) else "" + ) + + data_file = data_file[ + [ + "index", + "video_uid", + "video", + "duration", + "domain", + "sub_category", + "subtitle_path", + "question", + "answer", + "clue_intervals", + "qid", + ] + ] + + data_file.to_csv(tsv_file, sep="\t", index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset") + + unzip_hf_zip(dataset_path) + generate_tsv(dataset_path) + + tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv") + + return dict(data_file=tsv_file, root=dataset_path) + + def build_prompt(self, line, video_llm): + + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + message = [] + + sys_prompt = self.SYS + + user_prompt = "" + + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += 
self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + question = line["question"] + user_prompt += f"Question: {question}\n\n" + + message.append(dict(type="text", value=sys_prompt + user_prompt)) + + return message + + def clue_frame_paths(self, qid, num_frames=8): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is not str: + uid = str(uid) + + vid_path = osp.join(self.data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + n_frames = len(vid) + + if clue_intervals is not None: + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = self.clue_frame_paths(uid, len(indices)) + + elif fps > 0: + frame_indices = [] + for start, end in merged_intervals: + start_frame = int(start * vid_fps) + end_frame = int(end * vid_fps) + step = vid_fps / fps + interval_indices = [ + int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step)) + ] + frame_indices.extend(interval_indices) + + if len(frame_indices) < 32: + indices = sample_frames_clue_average(merged_intervals, 32, vid_fps) + else: + indices = frame_indices + frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps) + + else: + if num_frames > 0 and fps < 0: + step_size = len(vid) / (num_frames + 1) + indices = [int(i * step_size) for i in range(1, num_frames + 1)] + frame_paths = self.frame_paths(uid) + elif fps > 0: + total_duration = n_frames / vid_fps + required_frames = int(total_duration * fps) + step_size = vid_fps / fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(uid, len(indices)) + + valid_paths = [] + valid_indices = [] + + if not np.all([osp.exists(p) for p in frame_paths]): + images = [vid[i].asnumpy() for i in indices] + for i, (img_array, path) in enumerate(zip(images, frame_paths)): + if osp.exists(path): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + try: + img = Image.fromarray(img_array) + img.save(path) + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + for i, path in enumerate(frame_paths): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + + return valid_paths, valid_indices, vid_fps + + def evaluate(self, eval_file, **judge_kwargs): + + from .utils.cgbench import get_dimention_rating_open_ended, post_process_open + + assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + + tgt_file = eval_file.replace(".xlsx", "_rating.json") + score_file = eval_file.replace(".xlsx", "_score.xlsx") + step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl") + step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl") + + data = load(eval_file) + + data_pred_no_na = data[~pd.isna(data["prediction"])] + data_pred_na = data[pd.isna(data["prediction"])] + + data_pred_na["model_result"] = -1 + data_pred_na["step_1_result"] = -1 + data_pred_na["step_2_result"] = -1 + data_pred_na["score"] = -1 
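+
+        # Two-step judging for open-ended answers: step 1 compares the
+        # prediction with the ground truth textually and returns 0 (wrong),
+        # 1 (right), or 2 (needs visual verification); only the "2" cases are
+        # re-judged in step 2 with frames sampled from the clue intervals.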
+
+        data_pred_no_na["model_result"] = data_pred_no_na.apply(
+            lambda row: post_process_open(
+                response=row["prediction"],
+            ),
+            axis=1,
+        )
+
+        if judge_kwargs.get("model", None) != "gpt-4o-0806":
+            judge_kwargs["model"] = "gpt-4o-0806"
+            print("The judge model in cg-bench is gpt-4o-0806!")
+
+        data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1]
+        data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]
+
+        model_step_1 = build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs)
+        nproc = judge_kwargs.pop("nproc", 32)
+
+        lines_step_1 = data_step_1.to_dict("records")
+        tups_step_1 = [(model_step_1, line) for line in lines_step_1]
+
+        keys_step_1 = [line["qid"] for line in lines_step_1]
+
+        ans = {}
+        if osp.exists(step_1_tmp_file):
+            ans = load(step_1_tmp_file)
+            tups_step_1 = [x for x, i in zip(tups_step_1, keys_step_1) if i not in ans]
+            keys_step_1 = [i for i in keys_step_1 if i not in ans]
+
+        _ = track_progress_rich(
+            eval_open_first,
+            tups_step_1,
+            nproc=nproc,
+            keys=keys_step_1,
+            save=step_1_tmp_file,
+        )
+
+        step_1_results = load(step_1_tmp_file)
+        data_step_1 = save_step_1_steps(data_step_1, step_1_results)  # -1, 0, 1, 2
+
+        data_no_step_1_results = data_step_1[data_step_1["step_1_result"] == -1]
+        data_step_1_over = data_step_1[data_step_1["step_1_result"].isin([0, 1])]
+        data_step_2 = data_step_1[data_step_1["step_1_result"] == 2]
+
+        model_step_2 = build_judge(system_prompt=sys_prompt_open_eval_step_2, **judge_kwargs)
+
+        lines_step_2 = data_step_2.to_dict("records")
+
+        tups_step_2 = []
+
+        for line in tqdm(lines_step_2):
+            clue_intervals = eval(line["clue_intervals"])
+            lmu_root = LMUDataRoot()
+            clue_frame_root = osp.join(lmu_root, "clue_images", self.dataset)
+            data_root = self.data_root
+            frame_paths, _, _ = save_clue_video_frames(
+                data_root,
+                clue_frame_root,
+                video=line["video"],
+                uid=line["qid"],
+                clue_intervals=clue_intervals,
+                num_frames=32,
+            )
+            tups_step_2.append((model_step_2, line, frame_paths))
+
+        keys_step_2 = [line["qid"] for line in lines_step_2]
+
+        ans = {}
+        if osp.exists(step_2_tmp_file):
+            ans = load(step_2_tmp_file)
+            tups_step_2 = [x for x, i in zip(tups_step_2, keys_step_2) if i not in ans]
+            keys_step_2 = [i for i in keys_step_2 if i not in ans]
+
+        _ = track_progress_rich(
+            eval_open_second,
+            tups_step_2,
+            nproc=nproc,
+            keys=keys_step_2,
+            save=step_2_tmp_file,
+        )
+
+        step_2_results = load(step_2_tmp_file)
+        data_step_2 = save_step_2_steps(data_step_2, step_2_results)
+
+        data_no_step_2_results = data_step_2[data_step_2["score"] == -1]
+        data_step_2_over = data_step_2[data_step_2["score"].isin([0, 1])]
+
+        data = pd.concat(
+            [
+                data_pred_na,
+                data_no_model_result,
+                data_no_step_1_results,
+                data_step_1_over,
+                data_no_step_2_results,
+                data_step_2_over,
+            ]
+        )
+
+        dump(data, score_file)
+
+        rating = get_dimention_rating_open_ended(score_file)
+
+        dump(rating, tgt_file)
+
+        return rating
diff --git a/vlmeval/dataset/utils/cgbench.py b/vlmeval/dataset/utils/cgbench.py
new file mode 100644
index 00000000..eaf643bc
--- /dev/null
+++ b/vlmeval/dataset/utils/cgbench.py
@@ -0,0 +1,682 @@
+from ...smp import *
+from .multiple_choice import extract_answer_from_item
+import pandas as pd
+import numpy as np
+import re
+
+FAIL_MSG = "Failed to obtain answer via API."
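+
+# The prompts used throughout this benchmark ask models to answer inside a
+# fenced ```json {"result": ...} ``` block. The module's own parsing is done
+# by post_process / post_process_open defined later in this file; the helper
+# below is only a minimal, hypothetical sketch of that extraction step and is
+# not referenced by the benchmark code.
+def _extract_result_sketch(response):
+    import json  # local import for the sketch; the smp star-import covers the rest
+
+    match = re.search(r"```json\s*(\{.*?\})\s*```", response, re.DOTALL)
+    if match is None:
+        return None
+    try:
+        return json.loads(match.group(1)).get("result")
+    except (json.JSONDecodeError, AttributeError):
+        return None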
+
+frame_tmpl = "frame-{}-of-{}.jpg"
+
+sys_prompt_open_eval_step_1 = (
+    "You will be provided with a question, a model's prediction, and the ground "
+    "truth answer for this question.\n"
+    "Your task is to judge whether the model's prediction is correct based on the "
+    "meaning of the two texts.\n"
+    "In most cases, this can be done by determining if the meaning of the model's "
+    "prediction is consistent with, or contains, the ground truth answer. However, "
+    "in some cases where the two texts differ, it may represent different "
+    "descriptions of the same visual scene, in which case visual information is "
+    "needed for further judgment.\n"
+    "Therefore, I hope you:\n"
+    "- Output 0, if the model's prediction and the ground truth answer are neither "
+    "consistent nor related by inclusion, with fundamentally different meanings.\n"
+    "- Output 1, if the meaning of the model's prediction and the ground truth "
+    "answer is consistent, or if the model's prediction meaningfully contains the "
+    "ground truth answer.\n"
+    "- Output 2, if the model's prediction and ground truth are not consistent or "
+    "inclusive, but may be different descriptions of the same visual scene, "
+    "requiring visual information for further judgment.\n"
+    "Only output the answer in the following format:\n\n"
+    '```json\n{"result": choice}\n```\n\n'
+    "The choice is either 0, 1, or 2 as specified above."
+)
+
+sys_prompt_open_eval_step_2 = (
+    "You will be provided with a question, a model's prediction, and the sampling "
+    "frames of the clue intervals related to this question.\n"
+    "Your task is to determine whether the model has answered the question "
+    "correctly based on the visual information provided.\n"
+    "Therefore, I hope you:\n"
+    "- Output 0, if the model's prediction does not correctly answer the question.\n"
+    "- Output 1, if the model's prediction correctly answers the question.\n"
+    "Only output the answer in the following format without extra "
+    "explanation:\n\n"
+    '```json\n{"result": choice}\n```\n\n'
+    "The choice is either 0 or 1 as specified above."
+)
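+
+# For the "miou" task, predicted [start, end] intervals (in seconds) are scored
+# against the annotated clue intervals by IoU; the acc@iou and rec@iou metrics
+# below then threshold that score. As a rough sketch only (assuming each
+# interval list is already merged into disjoint spans; the real scoring lives
+# in post_process, and nothing here calls this helper):
+def _interval_iou_sketch(pred_intervals, gt_intervals):
+    # Total length covered by a list of disjoint [start, end] intervals.
+    def _length(intervals):
+        return sum(end - start for start, end in intervals)
+
+    # Pairwise overlap is exact when the spans within each list are disjoint.
+    intersection = 0.0
+    for p_start, p_end in pred_intervals:
+        for g_start, g_end in gt_intervals:
+            intersection += max(0.0, min(p_end, g_end) - max(p_start, g_start))
+
+    union = _length(pred_intervals) + _length(gt_intervals) - intersection
+    return intersection / union if union > 0 else 0.0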
+
+DURATIONS = ["0 ~ 10", "10 ~ 20", "20 ~ 30", "30 ~ 40", "40 ~ 50", "50 ~ 60", "60+"]
+
+DOMAINS = [
+    "Life Record",
+    "Music & TV show",
+    "Instruction & Knowledge",
+    "Driving",
+    "Embodied Expert",
+    "Humor/funny",
+    "Electonic/Social Gaming",  # sic: must match the domain label in the annotations
+    "Security & Health",
+    "Sports & Exercise",
+    "Special Scenes",
+    "Art & Culture",
+    "GUI",
+    "News",
+    "Animal & Pet",
+]
+
+SUB_CATEGORIES = [
+    "Time Cognition",
+    "Hallucination",
+    "Entity Perception",
+    "2D Spatial Perception",
+    "Time Perception",
+    "Scene Perception",
+    "Text Perception",
+    "Event Cognition",
+    "Entity Cognition",
+    "Text Cognition",
+    "Event Perception",
+    "Scene Cognition",
+]
+
+
+def get_dimention_rating_open_ended(data_path):
+    # Load the per-question records
+    df = load(data_path)
+
+    # Drop records whose evaluation failed
+    df = df[df["score"] != -1]
+
+    # Convert seconds to minutes and bucket into duration ranges
+    df["duration_minutes"] = df["duration"] / 60
+    df["duration_range"] = pd.cut(
+        df["duration_minutes"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, np.inf], labels=DURATIONS
+    )
+
+    # Initialise the result dict
+    result = {
+        "overall": 0,
+        "duration": {k: 0 for k in DURATIONS},
+        "domain": {k: 0 for k in DOMAINS},
+        "sub_category": {k: 0 for k in SUB_CATEGORIES},
+    }
+
+    # Overall
+    result["overall"] = round(df["score"].mean(), 4)
+
+    # Duration
+    for dur in DURATIONS:
+        dur_scores = df[df["duration_range"] == dur]["score"]
+        result["duration"][dur] = round(dur_scores.mean(), 4) if not dur_scores.empty else 0
+
+    # Domain
+    for domain in DOMAINS:
+        domain_scores = df[df["domain"] == domain]["score"]
+        result["domain"][domain] = round(domain_scores.mean(), 4) if not domain_scores.empty else 0
+
+    # Sub-category
+    for sub_cat in SUB_CATEGORIES:
+        sub_cat_scores = df[df["sub_category"] == sub_cat]["score"]
+        result["sub_category"][sub_cat] = round(sub_cat_scores.mean(), 4) if not sub_cat_scores.empty else 0
+
+    return result
+
+
+def get_dimention_rating_mcq_grouding(data_path):
+
+    # Load the per-question records
+    df = load(data_path)
+
+    # df.loc[(df['task_mode'] == 'miou') & (df['score'] == -1), 'score'] = 0
+
+    # Drop records whose evaluation failed
+    df = df[df["score"] != -1]
+
+    # Convert seconds to minutes and bucket into duration ranges
+    df["duration_minutes"] = df["duration"] / 60
+    df["duration_range"] = pd.cut(
+        df["duration_minutes"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, np.inf], labels=DURATIONS
+    )
+
+    # Initialise the result dict, one block per reported metric
+    result = {
+        metric: {
+            "overall": 0,
+            "duration": {k: 0 for k in DURATIONS},
+            "domain": {k: 0 for k in DOMAINS},
+            "sub_category": {k: 0 for k in SUB_CATEGORIES},
+        }
+        for metric in ["long_acc", "clue_acc", "miou", "CRR", "acc@iou", "rec@iou"]
+    }
+
+    # Base metrics
+    for metric in ["long_acc", "clue_acc", "miou"]:
+        metric_df = df[df["task_mode"] == metric]
+
+        # Overall
+        result[metric]["overall"] = round(metric_df["score"].mean(), 4)
+
+        # Duration
+        for dur in DURATIONS:
+            dur_scores = metric_df[metric_df["duration_range"] == dur]["score"]
+            result[metric]["duration"][dur] = round(dur_scores.mean(), 4) if not dur_scores.empty else 0
+
+        # Domain
+        for domain in DOMAINS:
+            domain_scores = metric_df[metric_df["domain"] == domain]["score"]
+            result[metric]["domain"][domain] = round(domain_scores.mean(), 4) if not domain_scores.empty else 0
+
+        # Sub-category
+        for sub_cat in SUB_CATEGORIES:
+            sub_cat_scores = metric_df[metric_df["sub_category"] == sub_cat]["score"]
+            result[metric]["sub_category"][sub_cat] = round(sub_cat_scores.mean(), 4) if not sub_cat_scores.empty else 0
+
+    # Composite metric: CRR
+    def calculate_crr(scores):
+        long_acc = scores[scores["task_mode"] == "long_acc"]["score"].mean()
+        clue_acc = scores[scores["task_mode"] == "clue_acc"]["score"].mean()
+        return round(min(long_acc, clue_acc) / clue_acc, 4) if clue_acc != 0 else 0
+
+    # Overall CRR
+    result["CRR"]["overall"] = calculate_crr(df)
+
+    # Duration CRR
+    for dur in DURATIONS:
+        dur_df = df[df["duration_range"] == dur]
+        result["CRR"]["duration"][dur] = calculate_crr(dur_df)
+
+    # Domain CRR
+    for domain in DOMAINS:
+        domain_df = df[df["domain"] == domain]
+        result["CRR"]["domain"][domain] = calculate_crr(domain_df)
+
+    # Sub-category CRR
+    for sub_cat in SUB_CATEGORIES:
+        sub_cat_df = df[df["sub_category"] == sub_cat]
+        result["CRR"]["sub_category"][sub_cat] = calculate_crr(sub_cat_df)
+
+    # acc@iou: accuracy on questions whose grounding clears each IoU threshold
+    def calculate_acc_at_iou_threshold(scores, threshold):
+        miou_qids = set(scores[scores["task_mode"] == "miou"]["qid"])
+        long_acc_qids = set(scores[scores["task_mode"] == "long_acc"]["qid"])
+        valid_qids = miou_qids & long_acc_qids
+
+        miou_positive = set(scores[(scores["task_mode"] == "miou") & (scores["score"] > threshold)]["qid"])
+
+        long_acc_positive = scores[
+            (scores["task_mode"] == "long_acc") & (scores["qid"].isin(miou_positive)) & (scores["score"] == 1)
+        ]
+
+        acc_at_iou_threshold = len(long_acc_positive) / len(valid_qids) if len(valid_qids) > 0 else 0
+        return round(acc_at_iou_threshold, 4)
+
+    def calculate_acc_at_iou(scores):
+        thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
+        acc_at_iou_values = [calculate_acc_at_iou_threshold(scores, threshold) for threshold in thresholds]
+        return round(sum(acc_at_iou_values) / len(acc_at_iou_values), 4)
+
+    # Overall acc@iou
+    result["acc@iou"]["overall"] = calculate_acc_at_iou(df)
+
+    # Duration acc@iou
+    for dur in DURATIONS:
+        dur_df = df[df["duration_range"] == dur]
+        result["acc@iou"]["duration"][dur] = calculate_acc_at_iou(dur_df)
+
+    # Domain acc@iou
+    for domain in DOMAINS:
+        domain_df = df[df["domain"] == domain]
+        result["acc@iou"]["domain"][domain] = calculate_acc_at_iou(domain_df)
+
+    # Sub-category acc@iou
+    for sub_cat in SUB_CATEGORIES:
+        sub_cat_df = df[df["sub_category"] == sub_cat]
+        result["acc@iou"]["sub_category"][sub_cat] = calculate_acc_at_iou(sub_cat_df)
+
+    # rec@iou: fraction of miou records clearing each IoU threshold
+    def calculate_rec_at_iou_threshold(scores, threshold):
+        # All records of the miou task
+        miou_scores = scores[scores["task_mode"] == "miou"]
+
+        # Records whose miou score clears the threshold
+        miou_positive = miou_scores[miou_scores["score"] > threshold]
+
+        # Recall at this threshold
+        rec_at_iou = len(miou_positive) / len(miou_scores) if len(miou_scores) > 0 else 0
+
+        return round(rec_at_iou, 4)
+
+    def calculate_rec_at_iou(scores):
+        thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
+        rec_at_iou_values = [calculate_rec_at_iou_threshold(scores, threshold) for threshold in thresholds]
+        return round(sum(rec_at_iou_values) / len(rec_at_iou_values), 4)
+
+    # Overall rec@iou
+    result["rec@iou"]["overall"] = calculate_rec_at_iou(df)
+
+    # Duration rec@iou
+    for dur in DURATIONS:
+        dur_df = df[df["duration_range"] == dur]
+        result["rec@iou"]["duration"][dur] = calculate_rec_at_iou(dur_df)
+
+    # Domain rec@iou
+    for domain in DOMAINS:
+        domain_df = df[df["domain"] == domain]
+        result["rec@iou"]["domain"][domain] = calculate_rec_at_iou(domain_df)
+
+    # Sub-category rec@iou
+    for sub_cat in SUB_CATEGORIES:
+        sub_cat_df = df[df["sub_category"] == sub_cat]
+        result["rec@iou"]["sub_category"][sub_cat] = calculate_rec_at_iou(sub_cat_df)
+
+    return result
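+
+# Worked example for the composite metrics above (hypothetical numbers): with
+# long_acc = 0.42 and clue_acc = 0.60 on the same split,
+#   CRR = min(0.42, 0.60) / 0.60 = 0.70,
+# roughly the fraction of clue-conditioned accuracy that is retained when the
+# model must locate the clue in the full video. acc@iou and rec@iou average
+# their per-threshold values over the IoU thresholds 0.1 through 0.5.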
+
+
+def milliseconds_to_seconds(milliseconds):
+    return milliseconds / 1000
+
+
+def sample_frames_clue_average(clues_time_intervals, frame_num, fps):
+    # Convert each clue interval from seconds to frame indices
+    clues_frame_intervals = [(round(interval[0] * fps), round(interval[1] * fps))
+                             for interval in clues_time_intervals]
+    clue_durations = [interval[1] - interval[0] for interval in clues_frame_intervals]
+    total_duration = sum(clue_durations)
+    # If the frame budget covers every clue frame, return them all
+    if frame_num >= total_duration:
+        return [frame for interval in clues_frame_intervals for frame in range(interval[0], interval[1])]
+    # Otherwise split the budget across clues in proportion to their length
+    frames_per_clue = [int(frame_num * (duration / total_duration)) for duration in clue_durations]
+    frame_indices = []
+    for i, (interval, num_frames) in enumerate(zip(clues_frame_intervals, frames_per_clue)):
+        num_frames = max(1, num_frames)
+        seg_size = (interval[1] - interval[0]) / num_frames
+        clue_frame_indices = [int(interval[0] + seg_size / 2 + seg_size * idx) for idx in range(num_frames)]
+        frame_indices.extend(clue_frame_indices)
+    return frame_indices
+
+
+def merge_intervals(intervals):
+    """
+    Merge overlapping intervals in a list.
+    Assumes each interval is a list [start, end].
+    """
+    if not intervals:
+        return []
+
+    # Sort intervals by start time
+    intervals.sort(key=lambda x: x[0])
+
+    merged = [intervals[0]]
+
+    for current in intervals[1:]:
+        last_merged = merged[-1]
+
+        # Check if there is an overlap
+        if current[0] <= last_merged[1]:
+            # Merge the current interval with the last one
+            last_merged[1] = max(last_merged[1], current[1])
+        else:
+            # No overlap, add current interval
+            merged.append(current)
+
+    return merged
+
+
+def calculate_intervals_iou(intervals1, intervals2):
+    """
+    Calculate the IoU of two lists of intervals.
+    Each list contains intervals represented as [start, end].
+    """
+    # Merge overlapping intervals in both lists
+    merged1 = merge_intervals(intervals1)
+    merged2 = merge_intervals(intervals2)
+
+    # Calculate total length of intervals for both lists
+    def total_length(merged_intervals):
+        return sum(end - start for start, end in merged_intervals)
+
+    length1 = total_length(merged1)
+    length2 = total_length(merged2)
+
+    # Calculate intersection length
+    intersection_length = 0
+    for interval1 in merged1:
+        for interval2 in merged2:
+            intersection_start = max(interval1[0], interval2[0])
+            intersection_end = min(interval1[1], interval2[1])
+            intersection_length += max(0, intersection_end - intersection_start)
+    # Calculate union length
+    union_length = length1 + length2 - intersection_length
+    # IoU is intersection divided by union
+    iou = intersection_length / union_length if union_length > 0 else 0
+    return iou
+
+
+def post_process(response, right_answer, task_mode, duration):
+    result = -1
+
+    if response:
+        # Locate the ```json ... ``` fence in the response
+        json_start = response.find("```json")
+        json_end = response.find("```", json_start + len("```json"))
+
+        # Extract the JSON payload if the fence was found
+        if json_start != -1 and json_end != -1:
+            json_content = response[json_start + len("```json"):json_end].strip()
+        else:
+            json_content = ""
+
+        if json_content:
+            if task_mode in ["long_acc", "clue_acc"]:
+                # Quote bare option letters so the payload parses as JSON
+                json_content = re.sub(r"(?<=:\s)([A-Za-z_]\w*)", r'"\1"', json_content)
+
+            try:
+                model_result = json.loads(json_content)["result"]
+
+                if task_mode in ["long_acc", "clue_acc"]:
+                    result = 1 if right_answer == model_result else 0
+                elif task_mode == "miou":
+                    if not isinstance(model_result, list):
+                        return -1
+                    if not isinstance(model_result[0], list):
+                        model_result = [model_result]
+
+                    # Intervals given as fractions of the video are scaled to seconds
+                    need_duration = all(interval[0] <= 1 and interval[1] <= 1 for interval in model_result)
+
+                    if need_duration:
+                        model_result = [[interval[0] * duration, interval[1] * duration] for interval in model_result]
+
+                    right_answer = eval(right_answer)
+
+                    result = calculate_intervals_iou(right_answer, model_result)
+
+            except Exception as e:
+                print(f"Error in parsing JSON: {e}, {json_content}")
+
+    if result == -1:
+        if task_mode in ["long_acc", "clue_acc"]:
+            # Fall back to any standalone uppercase letter A-H as the answer
+            matches = re.findall(r"\b[A-H]\b", response)
+            if matches:
+                result = 1 if right_answer in matches else 0
+        elif task_mode == "miou":
+            # Fall back to pairing up all real numbers in the response
+            numbers = re.findall(r"-?\d+\.?\d*", response)
+            if len(numbers) < 2:
+                result = -1
+            else:
+                if len(numbers) % 2 != 0:
+                    numbers = numbers[:-1]
+                model_result = [[float(numbers[i]), float(numbers[i + 1])] for i in range(0, len(numbers), 2)]
+
+                if isinstance(right_answer, str):
+                    right_answer = eval(right_answer)
+
+                result = calculate_intervals_iou(right_answer, model_result)
+
+    return result
+
+
+def get_timestampes(frame_indices, fps):
+    seconds = list(map(lambda x: str(round(x / fps, 4)), frame_indices))
+    timestamps = ", ".join(seconds)
+    return "A total of {frame_num} frames are sampled. Their corresponding timestamps are:\n\n{timestamps}\n\n".format(
+        frame_num=len(frame_indices), timestamps=timestamps
+    )
+
+
+def post_process_open(response):
+    model_result = -1
+
+    if response and response != FAIL_MSG:
+        json_start = response.find("```json")
+        json_end = response.find("```", json_start + len("```json"))
+
+        # Extract the JSON payload if the fence was found
+        if json_start != -1 and json_end != -1:
+            json_content = response[json_start + len("```json"):json_end].strip()
+        else:
+            json_content = ""
+
+        if json_content:
+            try:
+                model_result = json.loads(json_content)["result"]
+            except Exception as e:
+                print(f"Error in parsing JSON: {e}, {json_content}")
+
+    # Without a parsable payload, fall back to the raw response
+    if model_result == -1:
+        model_result = response
+
+    return model_result
+
+
+def post_process_eval_open(response, step):
+
+    model_result = -1
+
+    if response and response != FAIL_MSG:
+
+        json_start = response.find("```json")
+        json_end = response.find("```", json_start + len("```json"))
+
+        if json_start != -1 and json_end != -1:
+            json_content = response[json_start + len("```json"):json_end].strip()
+        else:
+            json_content = ""
+
+        if json_content:
+            try:
+                model_result = json.loads(json_content)["result"]
+            except Exception as e:
+                print(f"Error in parsing JSON: {e}, {json_content}")
+                return -1
+
+    # Fall back to the first bare digit: 0/1/2 for step 1, 0/1 for step 2
+    if model_result == -1:
+        if step == 1:
+            match = re.search(r"[012]", response)
+            if match:
+                model_result = int(match.group())
+        else:
+            match = re.search(r"[01]", response)
+            if match:
+                model_result = int(match.group())
+
+    return model_result
+
+
+def eval_open_first(model, line):
+    # Step-1 judge: text-only comparison of prediction vs. ground truth
+    user_prompt = ""
+    user_prompt += f"Question: {line['question']}\n\n"
+    user_prompt += f"The ground truth answer is '{line['answer']}'\n\n"
+    user_prompt += f"The model's prediction is '{line['model_result']}'\n\n"
+
+    result = model.generate(user_prompt)
+
+    return result
+
+
+def save_step_1_steps(data, step_1_results):
+    # Parse every step-1 judge response into -1 / 0 / 1 / 2
+    data["step_1_result"] = data["qid"].map(lambda x: post_process_eval_open(step_1_results[x], 1))
+
+    # Verdicts of -1 / 0 / 1 are final; only 2 ("needs visual check") moves on
+    mask = data["step_1_result"].isin([-1, 0, 1])
+    data.loc[mask, "step_2_result"] = data.loc[mask, "step_1_result"]
+    data.loc[mask, "score"] = data.loc[mask, "step_1_result"]
+
+    return data
+
+
+def eval_open_second(model, line, frame_paths):
+    # Step-2 judge: the same comparison, now with clue-interval frames attached
+    user_prompt = ""
+    user_prompt += f"Question: {line['question']}\n\n"
+    user_prompt += f"The model's prediction is '{line['model_result']}'\n\n"
+
+    result = model.generate([user_prompt] + frame_paths)
+
+    return result
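+
+# Minimal usage sketch for the judge helpers above (illustrative only; the
+# judge object and the `line` record mirror what evaluate() passes in):
+#   judge = build_judge(system_prompt=sys_prompt_open_eval_step_1, model="gpt-4o-0806")
+#   raw = eval_open_first(judge, line)             # text-only comparison
+#   verdict = post_process_eval_open(raw, step=1)  # -> -1 / 0 / 1 / 2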
data["qid"].map(lambda x: post_process_eval_open(step_1_results[x], 2)) + + return data + + +def clue_frame_paths(clue_frame_root, qid, num_frames=8): + frame_root = osp.join(clue_frame_root, str(qid)) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + +def save_clue_video_frames(data_root, clue_frame_root, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is str: + uid = str(uid) + + vid_path = osp.join(data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + + if clue_intervals is not None: + # 1. 合并重叠区间 + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + # 2. 基于clue_intervals均匀抽帧 + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = clue_frame_paths(clue_frame_root, uid, len(indices)) + + # 保存帧 + flag = np.all([osp.exists(p) for p in frame_paths]) + if not flag: + images = [vid[i].asnumpy() for i in indices] + images = [Image.fromarray(arr) for arr in images] + for im, pth in zip(images, frame_paths): + if not osp.exists(pth): + im.save(pth) + + return frame_paths, indices, vid_fps + + +def get_chunk_number(filename): + try: + num = filename.split("chunk_")[1].split(".zip")[0] + return int(num) + except: + return float('inf') + + +def unzip_hf_zip(pth): + + import zipfile + + target_dir = pth + + if os.path.exists(f"{target_dir}/cg_videos_720p") and os.path.exists(f"{target_dir}/cg_subtitles")\ + and os.path.exists(f"{target_dir}/cg_clue_videos"): + print("all exists") + return + + video_zip_files = [ + os.path.join(target_dir, file) + for file in os.listdir(target_dir) + if file.endswith(".zip") and file.startswith("video") + ] + + video_zip_files = sorted(video_zip_files, key=lambda x: get_chunk_number(os.path.basename(x))) + + videos_temp_zip = os.path.join(target_dir, "videos_merged.zip") + + print("Merging video files ...") + + with open(videos_temp_zip, "wb") as outfile: + for video_zip_file in tqdm(video_zip_files, desc="Merging videos"): + with open(video_zip_file, "rb") as infile: + outfile.write(infile.read()) + + print("Extracting video files...") + + try: + with zipfile.ZipFile(videos_temp_zip, "r") as zip_ref: + + total_files = len(zip_ref.namelist()) + + for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files): + zip_ref.extract(file, target_dir) + + print(f"Successfully extracted to {target_dir}") + except Exception as e: + print(f"Error during extraction: {e}") + finally: + + if os.path.exists(videos_temp_zip): + os.remove(videos_temp_zip) + print("Cleaned up temporary video file") + + clue_video_zip_files = [ + os.path.join(target_dir, file) + for file in os.listdir(target_dir) + if file.endswith(".zip") and file.startswith("clue_video") + ] + + clue_video_zip_files = sorted(clue_video_zip_files, key=lambda x: get_chunk_number(os.path.basename(x))) + + clue_videos_temp_zip = os.path.join(target_dir, "clue_videos_merged.zip") + + print("Merging clue video files ...") + + with open(clue_videos_temp_zip, "wb") as outfile: + for clue_video_zip_file in tqdm(clue_video_zip_files, desc="Merging clue_videos"): + with open(clue_video_zip_file, "rb") as infile: + outfile.write(infile.read()) + + print("Extracting clue video files...") + + try: + with zipfile.ZipFile(clue_videos_temp_zip, "r") as zip_ref: + + total_files = len(zip_ref.namelist()) + + for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files): + 
zip_ref.extract(file, target_dir) + + print(f"Successfully extracted to {target_dir}") + except Exception as e: + print(f"Error during extraction: {e}") + finally: + + if os.path.exists(clue_videos_temp_zip): + os.remove(clue_videos_temp_zip) + print("Cleaned up temporary clue video file") + + print("Extracting subtitle files ...") + + subtitles_zip = os.path.join(target_dir, "subtitles.zip") + + try: + with zipfile.ZipFile(subtitles_zip, "r") as zip_ref: + + total_files = len(zip_ref.namelist()) + + for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files): + zip_ref.extract(file, target_dir) + + print(f"Successfully extracted to {target_dir}") + except Exception as e: + print(f"Error during extraction: {e}") diff --git a/vlmeval/dataset/video_dataset_config.py b/vlmeval/dataset/video_dataset_config.py index 0937e8fd..8e195445 100644 --- a/vlmeval/dataset/video_dataset_config.py +++ b/vlmeval/dataset/video_dataset_config.py @@ -43,11 +43,40 @@ 'TempCompass_0.5fps': partial(TempCompass, dataset='TempCompass', fps=0.5) } +cgbench_dataset = { + 'CGBench_MCQ_Grounding_Mini_8frame_subs_subt': partial( + CGBench_MCQ_Grounding_Mini, + dataset='CG-Bench_MCQ_Grounding_Mini', + nframe=8, + use_subtitle=True, + use_subtitle_time=True + ), + 'CGBench_OpenEnded_Mini_8frame_subs_subt_ft': partial( + CGBench_OpenEnded_Mini, + dataset='CG-Bench_OpenEnded_Mini', + nframe=8, + use_subtitle=True, + use_subtitle_time=True, + use_frame_time=True + ), + 'CGBench_MCQ_Grounding_32frame_subs': partial( + CGBench_MCQ_Grounding, + dataset='CG-Bench_MCQ_Grounding', + nframe=32, + use_subtitle=True + ), + 'CGBench_OpenEnded_8frame': partial( + CGBench_OpenEnded, + dataset='CG-Bench_OpenEnded', + nframe=8 + ), +} + supported_video_datasets = {} dataset_groups = [ mmbench_video_dataset, mvbench_dataset, videomme_dataset, longvideobench_dataset, - mlvu_dataset, tempcompass_dataset + mlvu_dataset, tempcompass_dataset, cgbench_dataset ] for grp in dataset_groups:
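
# Usage note (assumed standard VLMEvalKit invocation; the model name below is
# a placeholder): once registered above, each config key can be run directly,
#   python run.py --data CGBench_MCQ_Grounding_Mini_8frame_subs_subt --model GPT4o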