diff --git a/README.md b/README.md
index 8dd01b23a..b5efc575a 100644
--- a/README.md
+++ b/README.md
@@ -20,18 +20,14 @@
## Announcement
+- [2025-02] 🚀🚀 We have integrated [`vllm`](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/544) into our models, enabling accelerated evaluation for both multimodal and language models. Additionally, we have incorporated [`openai_compatible`](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/546) to support the evaluation of any API-based model that follows the OpenAI API format. See usage examples [here](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/miscs/model_dryruns).
+
- [2025-01] 🎓🎓 We have released our new benchmark: [Video-MMMU: Evaluating Knowledge Acquisition from Multi-Discipline Professional Videos](https://arxiv.org/abs/2501.13826). Please refer to the [project page](https://videommmu.github.io/) for more details.
- [2024-12] 🎉🎉 We have presented [MME-Survey: A Comprehensive Survey on Evaluation of Multimodal LLMs](https://arxiv.org/pdf/2411.15296), jointly with [MME Team](https://github.com/BradyFU/Video-MME) and [OpenCompass Team](https://github.com/open-compass).
- [2024-11] 🔈🔊 The `lmms-eval/v0.3.0` has been upgraded to support audio evaluations for audio models like Qwen2-Audio and Gemini-Audio across tasks such as AIR-Bench, Clotho-AQA, LibriSpeech, and more. Please refer to the [blog](https://github.com/EvolvingLMMs-Lab/lmms-eval/blob/main/docs/lmms-eval-0.3.md) for more details!
-- [2024-07] 🎉🎉 We have released the [technical report](https://arxiv.org/abs/2407.12772) and [LiveBench](https://huggingface.co/spaces/lmms-lab/LiveBench)!
-
-- [2024-06] 🎬🎬 The `lmms-eval/v0.2.0` has been upgraded to support video evaluations for video models like LLaVA-NeXT Video and Gemini 1.5 Pro across tasks such as EgoSchema, PerceptionTest, VideoMME, and more. Please refer to the [blog](https://lmms-lab.github.io/posts/lmms-eval-0.2/) for more details!
-
-- [2024-03] 📝📝 We have released the first version of `lmms-eval`, please refer to the [blog](https://lmms-lab.github.io/posts/lmms-eval-0.1/) for more details!
-
We warmly welcome contributions from the open-source community! Below is a chronological list of recent tasks, models, and features added by our amazing contributors.
@@ -42,6 +38,9 @@
- [2024-09] ⚙️️⚙️️️️ We upgrade `lmms-eval` to `0.2.3` with more tasks and features. We support a compact set of language tasks evaluations (code credit to [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)), and we remove the registration logic at start (for all models and tasks) to reduce the overhead. Now `lmms-eval` only launches necessary tasks/models. Please check the [release notes](https://github.com/EvolvingLMMs-Lab/lmms-eval/releases/tag/v0.2.3) for more details.
- [2024-08] 🎉🎉 We welcome the new model [LLaVA-OneVision](https://huggingface.co/papers/2408.03326), [Mantis](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/162), new tasks [MVBench](https://huggingface.co/datasets/OpenGVLab/MVBench), [LongVideoBench](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/117), [MMStar](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/158). We provide new feature of SGlang Runtime API for llava-onevision model, please refer the [doc](https://github.com/EvolvingLMMs-Lab/lmms-eval/blob/main/docs/commands.md) for inference acceleration
- [2024-07] 👨💻👨💻 The `lmms-eval/v0.2.1` has been upgraded to support more models, including [LongVA](https://github.com/EvolvingLMMs-Lab/LongVA), [InternVL-2](https://github.com/OpenGVLab/InternVL), [VILA](https://github.com/NVlabs/VILA), and many more evaluation tasks, e.g. [Details Captions](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/136), [MLVU](https://arxiv.org/abs/2406.04264), [WildVision-Bench](https://huggingface.co/datasets/WildVision/wildvision-arena-data), [VITATECS](https://github.com/lscpku/VITATECS) and [LLaVA-Interleave-Bench](https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/).
+- [2024-07] 🎉🎉 We have released the [technical report](https://arxiv.org/abs/2407.12772) and [LiveBench](https://huggingface.co/spaces/lmms-lab/LiveBench)!
+- [2024-06] 🎬🎬 The `lmms-eval/v0.2.0` has been upgraded to support video evaluations for video models like LLaVA-NeXT Video and Gemini 1.5 Pro across tasks such as EgoSchema, PerceptionTest, VideoMME, and more. Please refer to the [blog](https://lmms-lab.github.io/posts/lmms-eval-0.2/) for more details!
+- [2024-03] 📝📝 We have released the first version of `lmms-eval`. Please refer to the [blog](https://lmms-lab.github.io/posts/lmms-eval-0.1/) for more details!
diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py
index df4810974..c53856bf6 100644
--- a/lmms_eval/models/__init__.py
+++ b/lmms_eval/models/__init__.py
@@ -62,6 +62,7 @@
"vllm": "VLLM",
"xcomposer2_4KHD": "XComposer2_4KHD",
"xcomposer2d5": "XComposer2D5",
+ "egogpt": "EgoGPT",
}
diff --git a/lmms_eval/models/egogpt.py b/lmms_eval/models/egogpt.py
new file mode 100644
index 000000000..f75bd5df5
--- /dev/null
+++ b/lmms_eval/models/egogpt.py
@@ -0,0 +1,472 @@
+import copy
+import json
+import logging
+import math
+import re
+import warnings
+from datetime import timedelta
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+import transformers
+from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs
+from accelerate.state import AcceleratorState
+from decord import VideoReader, cpu
+from packaging import version
+from tqdm import tqdm
+from transformers import AutoConfig
+
+from lmms_eval import utils
+from lmms_eval.api.instance import Instance
+from lmms_eval.api.model import lmms
+from lmms_eval.api.registry import register_model
+
+# Suppress warnings
+warnings.filterwarnings("ignore")
+
+# Configure logging
+eval_logger = logging.getLogger("lmms-eval")
+
+# Enable TF32 for CUDA
+torch.backends.cuda.matmul.allow_tf32 = True
+
+# Import LLaVA modules
+try:
+ import copy
+ import os
+ import re
+ import sys
+ import warnings
+
+ import numpy as np
+ import requests
+ import soundfile as sf
+ import torch
+ import whisper
+ from decord import VideoReader, cpu
+ from egogpt.constants import (
+ DEFAULT_IMAGE_TOKEN,
+ DEFAULT_SPEECH_TOKEN,
+ IGNORE_INDEX,
+ IMAGE_TOKEN_INDEX,
+ SPEECH_TOKEN_INDEX,
+ )
+ from egogpt.conversation import SeparatorStyle, conv_templates
+ from egogpt.mm_utils import get_model_name_from_path, process_images
+ from egogpt.model.builder import load_pretrained_model
+ from PIL import Image
+ from scipy.signal import resample
+except ImportError as e:
+ eval_logger.debug(f"egogpt is not installed. Please install egogpt to use this model.\nError: {e}")
+
+
+# Determine best attention implementation
+if version.parse(torch.__version__) >= version.parse("2.1.2"):
+ best_fit_attn_implementation = "sdpa"
+else:
+ best_fit_attn_implementation = "eager"
+
+
+@register_model("egogpt")
+class EgoGPT(lmms):
+ """
+ EgoGPT model wrapper for lmms-eval, supporting egocentric video (with optional speech/audio) and image inputs.
+ """
+
+ def __init__(
+ self,
+ pretrained: str = "checkpoints/egogpt_IT_12k_1126_zero3",
+ truncation: Optional[bool] = True,
+ device: Optional[str] = "cuda:0",
+ batch_size: Optional[Union[int, str]] = 1,
+ model_name: Optional[str] = None,
+ attn_implementation: Optional[str] = best_fit_attn_implementation,
+ device_map: Optional[str] = "cuda:0",
+ conv_template: Optional[str] = "qwen_1_5",
+ use_cache: Optional[bool] = True,
+ truncate_context: Optional[bool] = False, # whether to truncate the context in generation, set it False for LLaVA-1.6
+ customized_config: Optional[str] = None, # ends in json
+ max_frames_num: Optional[int] = 32,
+ mm_spatial_pool_stride: Optional[int] = 2,
+ mm_spatial_pool_mode: Optional[str] = "bilinear",
+ token_strategy: Optional[str] = "single", # could be "single" or "multiple", "multiple" denotes adding multiple tokens for each frame
+ video_decode_backend: str = "decord",
+ **kwargs,
+ ) -> None:
+ super().__init__()
+ # Do not use kwargs for now
+ assert kwargs == {}, f"Unexpected kwargs: {kwargs}"
+
+ accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
+ accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
+ if accelerator.num_processes > 1:
+ self._device = torch.device(f"cuda:{accelerator.local_process_index}")
+ self.device_map = f"cuda:{accelerator.local_process_index}"
+ elif accelerator.num_processes == 1 and device_map == "auto":
+ self._device = torch.device(device)
+ self.device_map = device_map
+ else:
+ self._device = torch.device(f"cuda:{accelerator.local_process_index}")
+ self.device_map = f"cuda:{accelerator.local_process_index}"
+
+ egogpt_model_args = {}
+ if attn_implementation is not None:
+ egogpt_model_args["attn_implementation"] = attn_implementation
+
+ self.pretrained = pretrained
+ self.token_strategy = token_strategy
+ self.max_frames_num = max_frames_num
+ self.mm_spatial_pool_stride = mm_spatial_pool_stride
+ self.mm_spatial_pool_mode = mm_spatial_pool_mode
+ self.video_decode_backend = video_decode_backend
+ # Try to load the model with the multimodal argument
+ self._tokenizer, self._model, self._max_length = load_pretrained_model(pretrained, device_map=self.device_map, **egogpt_model_args)
+ self._image_processor = self._model.get_vision_tower().image_processor
+ self._config = self._model.config
+ self.model.eval()
+ self.truncation = truncation
+ self.batch_size_per_gpu = int(batch_size)
+ self.conv_template = conv_template
+ self.use_cache = use_cache
+ self.truncate_context = truncate_context
+ assert self.batch_size_per_gpu == 1
+
+ if accelerator.num_processes > 1:
+ assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP, FSDP, and DeepSpeed (ZeRO stage 0) are supported."
+ # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model
+ # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works
+ # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work.
+ if accelerator.distributed_type == DistributedType.DEEPSPEED:
+ kwargs = {
+ "train_micro_batch_size_per_gpu": self.batch_size_per_gpu,
+ "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes,
+ }
+ AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs)
+ eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0")
+
+ if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED:
+ self._model = accelerator.prepare(self.model)
+ else:
+ self._model = accelerator.prepare_model(self.model, evaluation_mode=True)
+ self.accelerator = accelerator
+ if self.accelerator.is_local_main_process:
+ eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism")
+ self._rank = self.accelerator.local_process_index
+ self._world_size = self.accelerator.num_processes
+
+ elif accelerator.num_processes == 1 and device_map == "auto":
+ eval_logger.info(f"Using {accelerator.num_processes} devices with tensor parallelism")
+ self._rank = 0
+ self._world_size = 1
+
+ else:
+ eval_logger.info(f"Using single device: {self._device}")
+ self.model.to(self._device)
+ self._rank = 0
+ self._world_size = 1
+
+ @property
+ def config(self):
+ # return the associated transformers.AutoConfig for the given pretrained model.
+ return self._config
+
+ @property
+ def tokenizer(self):
+ return self._tokenizer
+
+ @property
+ def model(self):
+ # returns the model, unwrapping it if using Accelerate
+ if hasattr(self, "accelerator"):
+ return self.accelerator.unwrap_model(self._model)
+ else:
+ return self._model
+
+ @property
+ def eot_token_id(self):
+ # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
+ return self.tokenizer.eos_token_id
+
+ @property
+ def max_length(self):
+ return self._max_length
+
+ def pad_sequence(self, input_ids, batch_first, padding_value):
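+ # Left padding: flip each sequence, right-pad the batch, then flip back so the pad tokens end up on the left.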
+ if self.tokenizer.padding_side == "left":
+ input_ids = [torch.flip(_input_ids, [0]) for _input_ids in input_ids]
+ input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=batch_first, padding_value=padding_value)
+ if self.tokenizer.padding_side == "left":
+ input_ids = torch.flip(input_ids, [1])
+ return input_ids
+
+ @property
+ def batch_size(self):
+ return self.batch_size_per_gpu
+
+ @property
+ def device(self):
+ return self._device
+
+ @property
+ def rank(self):
+ return self._rank
+
+ @property
+ def world_size(self):
+ return self._world_size
+
+ def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]:
+ """ """
+ add_special_tokens = False if add_special_tokens is None else add_special_tokens
+ encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens)
+ # left-truncate the encoded context to be at most `left_truncate_len` tokens long
+ if left_truncate_len:
+ encoding = encoding[-left_truncate_len:]
+ return encoding
+
+ def tok_decode(self, tokens):
+ try:
+ return self.tokenizer.decode(tokens)
+ except:
+ return self.tokenizer.decode([tokens])
+
+ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
+ raise NotImplementedError("Loglikelihood is not implemented for EgoGPT")
+
+ def flatten(self, input):
+ new_list = []
+ for i in input:
+ for j in i:
+ new_list.append(j)
+ return new_list
+
+ def split_text(self, text, keywords):
+ pattern = "(" + "|".join(map(re.escape, keywords)) + ")"
+ parts = re.split(pattern, text)
+ parts = [part for part in parts if part]
+ return parts
+
+ def load_video(self, video_path=None, audio_path=None, max_frames_num=16, fps=1, task_name=None):
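+ # If an audio track is given, resample it to 16 kHz mono and convert it to a 128-bin Whisper log-mel spectrogram; otherwise fall back to an all-zero "silent" spectrogram.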
+ if audio_path is not None:
+ speech, sample_rate = sf.read(audio_path)
+ if sample_rate != 16000:
+ target_length = int(len(speech) * 16000 / sample_rate)
+ speech = resample(speech, target_length)
+ if speech.ndim > 1:
+ speech = np.mean(speech, axis=1)
+ # max_length = 480000
+ speech = whisper.pad_or_trim(speech.astype(np.float32))
+ speech = whisper.log_mel_spectrogram(speech, n_mels=128).permute(1, 0)
+ speech_lengths = torch.LongTensor([speech.shape[0]])
+ else:
+ speech = torch.zeros(3000, 128)
+ speech_lengths = torch.LongTensor([3000])
+
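+ # Decode the video with decord: sample roughly fps frames per second, then uniformly subsample to at most max_frames_num frames for long clips.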
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+ total_frame_num = len(vr)
+ avg_fps = round(vr.get_avg_fps() / fps)
+ frame_idx = [i for i in range(0, total_frame_num, avg_fps)]
+ frame_time = [i / avg_fps for i in frame_idx]
+
+ if max_frames_num > 0:
+ if len(frame_idx) > max_frames_num:
+ uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
+ frame_idx = uniform_sampled_frames.tolist()
+ if task_name == "egoplan":
+ # add the current observation frame
+ frame_idx.append(total_frame_num - 1)
+ video = vr.get_batch(frame_idx).asnumpy()
+ return video, speech, speech_lengths
+
+ def generate_until(self, requests: List[Instance]) -> List[str]:
+ res = []
+
+ def _collate(x):
+ # the negative sign on len(toks) sorts descending - this has a few advantages:
+ # - time estimates will always be over not underestimates, which is more useful for planning
+ # - to know the size of a batch when going through the list, you know the first one is always the batch
+ # padded context length. this is useful to simplify the batching logic and more importantly to make
+ # automatic adaptive batches much much easier to implement
+ # - any OOMs will happen right away rather than near the end
+ toks = self.tok_encode(x[0])
+ return -len(toks), x[0]
+
+ # we group requests by their generation_kwargs,
+ # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
+ # in the same batch.
+ metadata = requests[0].metadata
+ re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True)
+ chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
+ num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1
+ pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding")
+
+ origin_image_aspect_ratio = getattr(self._config, "image_aspect_ratio", None)
+
+ for chunk in chunks:
+ batched_contexts, all_gen_kwargs, batched_doc_to_visual, batched_doc_id, batched_task, batched_split = zip(*chunk)
+ task = batched_task[0]
+ split = batched_split[0]
+ batched_visuals = [batched_doc_to_visual[0](self.task_dict[task][split][ids]) for ids in batched_doc_id] # [B, N]
+ assert len(batched_visuals) == 1
+
+ # we assume all gen kwargs in the batch are the same
+ # this is safe to assume because the `grouper` object ensures it.
+ gen_kwargs = all_gen_kwargs[0]
+ if "until" in gen_kwargs:
+ gen_kwargs.pop("until")
+
+ question_input = []
+ for visual, context in zip(batched_visuals, batched_contexts):
+ if origin_image_aspect_ratio is not None and self._config.image_aspect_ratio != origin_image_aspect_ratio:
+ self._config.image_aspect_ratio = origin_image_aspect_ratio
+ eval_logger.info(f"Resetting image aspect ratio to {origin_image_aspect_ratio}")
+
+ if visual is None or visual == []: # for text-only tasks.
+ visual = None
+ task_type = "text"
+ placeholder_count = 0
+ image_tensor = None
+ else:
+ if len(visual) > 1 or "image_aspect_ratio" not in self._config.__dict__: # for multi image case, we treat per image aspect ratio as "pad" by default.
+ self._config.image_aspect_ratio = gen_kwargs.get("image_aspect_ratio", "pad")
+ eval_logger.info(f"In Multi-Image setting, image aspect ratio: {self._config.image_aspect_ratio}")
+
+ if "task_type" in metadata and metadata["task_type"] == "video" and "sample_frames" in metadata: # overwrite logic for video task with multiple static image frames
+ assert type(visual) == list, "sample_frames must be specified for video task"
+ sample_indices = np.linspace(0, len(visual) - 1, metadata["sample_frames"], dtype=int)
+ visual = [visual[i] for i in sample_indices]
+ assert len(visual) == metadata["sample_frames"]
+
+ image_tensor = process_images(visual, self._image_processor, self._config)
+ if type(image_tensor) is list:
+ image_tensor = [_image.to(dtype=torch.float16, device=self.device) for _image in image_tensor]
+ else:
+ image_tensor = image_tensor.to(dtype=torch.float16, device=self.device)
+ image_tensor = [image_tensor]
+ task_type = "video"
+ placeholder_count = 1
+
+ elif type(visual[0]) == PIL.Image.Image: # For image, multi-image tasks
+ image_tensor = process_images(visual, self._image_processor, self._config)
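+ # Image tasks carry no audio, so feed a dummy silent spectrogram to the speech branch.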
+ speech = torch.zeros(3000, 128)
+ speech_lengths = torch.LongTensor([3000])
+ if type(image_tensor) is list:
+ image_tensor = [_image.to(dtype=torch.float16, device=self.device) for _image in image_tensor]
+ else:
+ image_tensor = image_tensor.to(dtype=torch.float16, device=self.device)
+
+ task_type = "image"
+ placeholder_count = len(visual) if isinstance(visual, list) else 1
+
+ elif type(visual[0]) == str: # For video task
+ image_tensor = []
+ try:
+ if self.video_decode_backend == "decord":
+ if "egoplan" in visual[0]:
+ task_name = "egoplan"
+ else:
+ task_name = None
+ frames, speech, speech_lengths = self.load_video(video_path=visual[0], max_frames_num=self.max_frames_num, task_name=task_name)
+ else:
+ raise NotImplementedError("Only decord backend is supported for video task")
+ processed_frames = self._image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].half().cuda()
+ processed_frames = processed_frames.half()
+ image_tensor.append(processed_frames)
+ image_sizes = [frames[0].size]
+ except Exception as e:
+ eval_logger.error(f"Error {e} in loading video")
+ image_tensor = None
+
+ task_type = "video"
+ placeholder_count = len(frames) if self.token_strategy == "multiple" else 1
+ if DEFAULT_IMAGE_TOKEN not in context:
+ question = DEFAULT_IMAGE_TOKEN + "\n" + context
+ else:
+ question = context
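+ # Batch the (single) speech spectrogram and move it to the model device in half precision.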
+ speech = torch.stack([speech]).to(self.device).half()
+ # This is much safer for llama3, as we now have some object type in it
+ if "llama_3" in self.conv_template:
+ conv = copy.deepcopy(conv_templates[self.conv_template])
+ else:
+ conv = conv_templates[self.conv_template].copy()
+
+ if utils.is_json(question): # conversational question input
+ question = json.loads(question)
+ for idx, item in enumerate(question):
+ role = conv.roles[idx % 2]
+ message = item["value"]
+ conv.append_message(role, message)
+
+ assert len(conv.messages) % 2 == 1
+ conv.append_message(conv.roles[1], None)
+ prompt_question = conv.get_prompt()
+ question_input.append(prompt_question)
+ else: # only simple string for question
+ conv.append_message(conv.roles[0], question)
+ conv.append_message(conv.roles[1], None)
+ prompt_question = conv.get_prompt()
+ question_input.append(prompt_question)
+
+ # preconfigure gen_kwargs with defaults
+ if "max_new_tokens" not in gen_kwargs:
+ gen_kwargs["max_new_tokens"] = 1024
+ if "temperature" not in gen_kwargs:
+ gen_kwargs["temperature"] = 0
+ if "do_sample" not in gen_kwargs:
+ gen_kwargs["do_sample"] = False
+ if "top_p" not in gen_kwargs:
+ gen_kwargs["top_p"] = None
+ if "num_beams" not in gen_kwargs:
+ gen_kwargs["num_beams"] = 1
+
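+ # Split the prompt on the special image/speech tokens and splice in their placeholder token ids so the multimodal positions are explicit.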
+ parts = self.split_text(prompt_question, [DEFAULT_IMAGE_TOKEN, DEFAULT_SPEECH_TOKEN])
+ input_ids = []
+ for part in parts:
+ if part == DEFAULT_IMAGE_TOKEN:
+ input_ids += [IMAGE_TOKEN_INDEX]
+ elif part == DEFAULT_SPEECH_TOKEN:
+ input_ids += [SPEECH_TOKEN_INDEX]
+ else:
+ input_ids += self.tokenizer(part).input_ids
+
+ input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(self.device)
+ input_ids_list = [input_ids]
+ pad_token_ids = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
+ input_ids = self.pad_sequence(input_ids_list, batch_first=True, padding_value=pad_token_ids).to(self.device)
+ attention_masks = input_ids.ne(pad_token_ids).to(self.device)
+ input_ids = input_ids.squeeze(0).to(dtype=torch.long, device=self.device)
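+ # Attach modality-specific generation kwargs: original image sizes for image tasks; the "video" modality flag and spatial-pooling settings for video tasks.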
+ if task_type == "image":
+ gen_kwargs["image_sizes"] = [batched_visuals[0][idx].size for idx in range(len(batched_visuals[0]))]
+ elif task_type == "video":
+ gen_kwargs["modalities"] = ["video"]
+ self._config.mm_spatial_pool_stride = self.mm_spatial_pool_stride
+ self._config.mm_spatial_pool_mode = self.mm_spatial_pool_mode
+ gen_kwargs["eos_token_id"] = self.tokenizer.eos_token_id
+
+ # These steps are not in LLaVA's original code, but are necessary for generation to work
+ # TODO: attention to this major generation step...
+ if "image_aspect_ratio" in gen_kwargs.keys():
+ gen_kwargs.pop("image_aspect_ratio")
+ try:
+ with torch.inference_mode():
+ cont = self.model.generate(input_ids, images=image_tensor, speech=speech, speech_lengths=speech_lengths, **gen_kwargs)
+
+ text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)
+ except Exception as e:
+ raise e
+
+ text_outputs = [response.strip() for response in text_outputs]
+ res.extend(text_outputs)
+ self.cache_hook.add_partial("generate_until", (context, gen_kwargs), text_outputs)
+ pbar.update(1)
+ # reorder this group of results back to original unsorted form
+ res = re_ords.get_original(res)
+
+ pbar.close()
+ return res
+
+ def generate_until_multi_round(self, requests: List[Instance]) -> List[str]:
+ raise NotImplementedError("generate_until_multi_round is not implemented for EgoGPT")
diff --git a/lmms_eval/models/ola.py b/lmms_eval/models/ola.py
index 3ff46c37d..edd4caeff 100644
--- a/lmms_eval/models/ola.py
+++ b/lmms_eval/models/ola.py
@@ -72,6 +72,8 @@
except:
eval_logger.debug("")
import moviepy.editor as mp
+from moviepy.video.io.VideoFileClip import VideoFileClip
+
if "USE_SPEECH" in os.environ:
USE_SPEECH = os.environ["USE_SPEECH"]
@@ -406,6 +408,7 @@ def process_audio(self, audio_array, sampling_rate):
CHUNK_LIM = 480000
import librosa
+
speech_wav = audio_array
if sampling_rate != target_sr:
speech_wav = librosa.resample(audio_array, orig_sr=sampling_rate, target_sr=target_sr).astype(np.float32)
@@ -510,7 +513,6 @@ def _collate(x):
video_processed = torch.cat(video_processed, dim=0).bfloat16().to(self.device)
video_processed = (video_processed, video_processed)
-
video_data = (video_processed, (384, 384), "video")
elif isinstance(visual, PIL.Image.Image): # For Image
@@ -541,10 +543,12 @@ def _collate(x):
speech_wavs.append(torch.zeros([1, 480000]).to(self.device))
speech_chunks.append(torch.LongTensor([1]).to(self.device))
+
elif isinstance(visual, dict) and "array" in visual: # For Audio
if MODALITY is None:
MODALITY = "AUDIO"
mels, speech_length, speech_chunk, speech_wav = self.process_audio(visual["array"], visual["sampling_rate"])
+
speechs.append(mels.bfloat16().to(self.device))
speech_lengths.append(speech_length.to(self.device))
speech_chunks.append(speech_chunk.to(self.device))
@@ -606,6 +610,7 @@ def _collate(x):
input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device)
elif MODALITY == "VIDEO":
input_ids = tokenizer_speech_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device)
+
pad_token_ids = 151643
attention_masks = input_ids.ne(pad_token_ids).long().to(self.device)
diff --git a/lmms_eval/tasks/egoplan/egoplan.yaml b/lmms_eval/tasks/egoplan/egoplan.yaml
new file mode 100644
index 000000000..a4aec0785
--- /dev/null
+++ b/lmms_eval/tasks/egoplan/egoplan.yaml
@@ -0,0 +1,43 @@
+dataset_path: EgoLife-v1/EgoPlan
+dataset_kwargs:
+ token: True
+ cache_dir: egoplan
+ video: True
+ # From_YouTube: True
+task: egoplan
+test_split: validation
+output_type: generate_until
+doc_to_visual: !function utils.egoplan_doc_to_visual
+doc_to_text: !function utils.egoplan_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+ max_new_tokens: 4096
+ temperature: 0
+ top_p: 1.0
+ num_beams: 1
+ do_sample: false
+# The return value of process_results will be used by metrics
+process_results: !function utils.egoplan_process_results
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+ - metric: egoplan_mcq_accuracy
+ aggregation: !function utils.egoplan_aggregate_results
+ higher_is_better: true
+lmms_eval_specific_kwargs:
+ default:
+ pre_prompt: ""
+ post_prompt: "\nAnswer with the option's letter from the given choices directly."
+ gpt4v:
+ pre_prompt: ""
+ post_prompt: "\nAnswer the question with A, B, C, or D."
+ # qwen_vl:
+ # pre_prompt: ""
+ # post_prompt: " Answer:"
+ # otterhd:
+ # pre_prompt: ""
+ # post_prompt: " Answer:"
+ xcomposer2_4khd:
+ pre_prompt: "[UNUSED_TOKEN_146]user\n"
+ post_prompt: " Answer this question with A, B, C, or D.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"
+metadata:
+ version: 0.0
diff --git a/lmms_eval/tasks/egoplan/utils.py b/lmms_eval/tasks/egoplan/utils.py
new file mode 100644
index 000000000..35e5c6234
--- /dev/null
+++ b/lmms_eval/tasks/egoplan/utils.py
@@ -0,0 +1,207 @@
+import datetime
+import json
+import os
+import re
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+import cv2
+import numpy as np
+import yaml
+from loguru import logger as eval_logger
+
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
+
+# with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
+# raw_data = f.readlines()
+# safe_data = []
+# for i, line in enumerate(raw_data):
+# # remove function definition since yaml load cannot handle it
+# if "!function" not in line:
+# safe_data.append(line)
+
+# config = yaml.safe_load("".join(safe_data))
+
+hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
+# cache_dir = os.path.join(hf_home, cache_dir)
+# base_cache_dir = config["dataset_kwargs"]["cache_dir"]
+base_cache_dir = os.path.expanduser(hf_home)
+with open(Path(__file__).parent / "egoplan.yaml", "r") as f:
+ raw_data = f.readlines()
+ safe_data = []
+ for i, line in enumerate(raw_data):
+ # remove function definition since yaml load cannot handle it
+ if "!function" not in line:
+ safe_data.append(line)
+cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
+
+
+def parse_subtitle_time(time_str):
+ h, m, s_ms = time_str.split(":")
+ s, ms = s_ms.split(",")
+ return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000
+
+
+def load_subtitles(subtitle_path):
+ subtitles = {}
+ with open(subtitle_path, "r", encoding="utf-8") as file:
+ content = file.read().split("\n\n")
+ for section in content:
+ if section.strip():
+ lines = section.split("\n")
+ if len(lines) >= 3:
+ time_range = lines[1].split(" --> ")
+ start_time = parse_subtitle_time(time_range[0])
+ end_time = parse_subtitle_time(time_range[1])
+ text = " ".join(line for line in lines[2:])
+ subtitles[(start_time, end_time)] = text
+ return subtitles
+
+
+def convert_time_to_frame(time_in_seconds, fps):
+ return int(time_in_seconds * fps)
+
+
+def extract_subtitles(video_path, subtitle_path):
+ video = cv2.VideoCapture(video_path)
+ fps = video.get(cv2.CAP_PROP_FPS)
+ total_frame = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+ subtitles = load_subtitles(subtitle_path)
+
+ subtitle_frames = []
+ for (start_time, end_time), text in subtitles.items():
+ start_frame = convert_time_to_frame(start_time, fps)
+ end_frame = convert_time_to_frame(end_time, fps)
+ subtitle_frames.append((start_frame, end_frame, text))
+
+ return subtitle_frames, total_frame
+
+
+def egoplan_doc_to_visual(doc):
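+ # Resolve the cached video file, trying the .MP4 and .mkv extensions as fallbacks.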
+ cache_dir = os.path.join(base_cache_dir, cache_name)
+ video_path = str(doc["sample_id"]) + ".mp4"
+ video_path = os.path.join(cache_dir, video_path)
+ if os.path.exists(video_path):
+ video_path = video_path
+ elif os.path.exists(video_path.replace("mp4", "MP4")):
+ video_path = video_path.replace("mp4", "MP4")
+ elif os.path.exists(video_path.replace("mp4", "mkv")):
+ video_path = video_path.replace("mp4", "mkv")
+ else:
+ sys.exit(f"video path:{video_path} does not exist, please check")
+ return [video_path]
+
+
+def egoplan_doc_to_text(doc, lmms_eval_specific_kwargs=None):
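+ # Phrase the question differently depending on whether the task goal is stated as a gerund ("...ing") or as an infinitive ("to ...").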
+ task_goal = doc["task_goal"]
+ if "goal" in task_goal:
+ task_goal = task_goal.split("to", 1)[1].strip()
+ words = task_goal.split()
+ if words[0].endswith("ing"):
+ question_pattern = (
+ "I am tasked with {}. "
+ "The task's progress is demonstrated in the provided video. "
+ "My current field of view is shown in the provided image. "
+ "What should be my next action? "
+ "Please output the most reasonable action you think, expressed in a short phrase."
+ )
+ else:
+ question_pattern = (
+ "My current task is to {}. "
+ "The task's progress is demonstrated in the provided video. "
+ "My current field of view is shown in the provided image. "
+ "What should be my next action? "
+ "Please output the most reasonable action you think, expressed in a short phrase."
+ )
+ question = question_pattern.format(task_goal)
+
+ for choice_idx in ["A", "B", "C", "D"]:
+ question += "\n" + f"{choice_idx}. " + doc[f"choice_{choice_idx.lower()}"]
+ if lmms_eval_specific_kwargs and lmms_eval_specific_kwargs.get("post_prompt"):
+ post_prompt = lmms_eval_specific_kwargs["post_prompt"]
+ else:
+ post_prompt = "\nAnswer with the option's letter from the given choices"
+
+ return f"{question}{post_prompt}"
+
+
+def extract_characters_regex(s):
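+ # Strip common answer prefixes, then return the first A-D letter found; long free-form answers without any option letter count as unanswered.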
+ s = s.strip()
+ answer_prefixes = [
+ "The best answer is",
+ "The correct answer is",
+ "The answer is",
+ "The answer",
+ "The best option is" "The correct option is",
+ "Best answer:" "Best option:",
+ ]
+ for answer_prefix in answer_prefixes:
+ s = s.replace(answer_prefix, "")
+
+ if len(s.split()) > 10 and not re.search("[ABCD]", s):
+ return ""
+
+ matches = re.search(r"[ABCD]", s)
+ if matches is None:
+ return ""
+ return matches[0]
+
+
+def egoplan_process_results(doc, results):
+ pred = results[0]
+ pred_ans = extract_characters_regex(pred)
+ # gt_ans = doc["answer"].lower().strip().replace(".", "")
+ doc["pred_answer"] = pred_ans
+ data_dict = doc.copy()
+ return {f"egoplan_mcq_accuracy": data_dict}
+
+
+def egoplan_aggregate_results(results):
+ correct_num = 0
+ for result in results:
+ if result["pred_answer"] == result["golden_choice_idx"]:
+ correct_num += 1
+ question_num = len(results)
+ accuracy = correct_num / question_num
+ return accuracy
diff --git a/lmms_eval/tasks/egothink/_default_template_yaml b/lmms_eval/tasks/egothink/_default_template_yaml
new file mode 100644
index 000000000..546be29aa
--- /dev/null
+++ b/lmms_eval/tasks/egothink/_default_template_yaml
@@ -0,0 +1,7 @@
+dataset_path: EgoLife-v1/Egothink
+dataset_kwargs:
+ token: True
+test_split: test
+metadata:
+ version: 0.0
+ gpt_eval_model_name: "gpt-4"
\ No newline at end of file
diff --git a/lmms_eval/tasks/egothink/egothink.yaml b/lmms_eval/tasks/egothink/egothink.yaml
new file mode 100644
index 000000000..f8bbfe74e
--- /dev/null
+++ b/lmms_eval/tasks/egothink/egothink.yaml
@@ -0,0 +1,14 @@
+group: egothink
+task:
+ - egothink_activity
+ - egothink_affordance
+ - egothink_assistance
+ - egothink_navigation
+ - egothink_attribute
+ - egothink_comparing
+ - egothink_counting
+ - egothink_existence
+ - egothink_forecasting
+ - egothink_location
+ - egothink_situated
+ - egothink_spatial
diff --git a/lmms_eval/tasks/egothink/egothink_activity.yaml b/lmms_eval/tasks/egothink/egothink_activity.yaml
new file mode 100644
index 000000000..4df6756c8
--- /dev/null
+++ b/lmms_eval/tasks/egothink/egothink_activity.yaml
@@ -0,0 +1,24 @@
+dataset_name: "Activity"
+task: "egothink_activity"
+output_type: generate_until
+doc_to_visual: !function utils.egothink_doc_to_visual
+doc_to_text: !function utils.egothink_doc_to_text
+doc_to_target: !function utils.egothink_doc_to_answer
+generation_kwargs:
+ max_new_tokens: 30
+ temperature: 0.2
+ num_beams: 1
+ do_sample: True
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+ - metric: gpt_eval_score
+ aggregation: !function utils.egothink_aggregate_results
+ higher_is_better: true
+# The return value of process_results will be used by metrics
+process_results: !function utils.egothink_process_results
+
+lmms_eval_specific_kwargs:
+ default:
+ pre_prompt: ""
+ post_prompt: ""
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/egothink/egothink_affordance.yaml b/lmms_eval/tasks/egothink/egothink_affordance.yaml
new file mode 100644
index 000000000..3e0cae856
--- /dev/null
+++ b/lmms_eval/tasks/egothink/egothink_affordance.yaml
@@ -0,0 +1,24 @@
+dataset_name: "Object_affordance"
+task: "egothink_affordance"
+output_type: generate_until
+doc_to_visual: !function utils.egothink_doc_to_visual
+doc_to_text: !function utils.egothink_doc_to_text
+doc_to_target: !function utils.egothink_doc_to_answer
+generation_kwargs:
+ max_new_tokens: 30
+ temperature: 0.2
+ num_beams: 1
+ do_sample: True
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+ - metric: gpt_eval_score
+ aggregation: !function utils.egothink_aggregate_results
+ higher_is_better: true
+# The return value of process_results will be used by metrics
+process_results: !function utils.egothink_process_results
+
+lmms_eval_specific_kwargs:
+ default:
+ pre_prompt: ""
+ post_prompt: ""
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/egothink/egothink_assistance.yaml b/lmms_eval/tasks/egothink/egothink_assistance.yaml
new file mode 100644
index 000000000..81b4e0e80
--- /dev/null
+++ b/lmms_eval/tasks/egothink/egothink_assistance.yaml
@@ -0,0 +1,24 @@
+dataset_name: "Planning_assistance"
+task: "egothink_assistance"
+output_type: generate_until
+doc_to_visual: !function utils.egothink_doc_to_visual
+doc_to_text: !function utils.egothink_doc_to_text
+doc_to_target: !function utils.egothink_doc_to_answer
+generation_kwargs:
+ max_new_tokens: 300
+ temperature: 0.2
+ num_beams: 1
+ do_sample: True
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+ - metric: gpt_eval_score
+ aggregation: !function utils.egothink_aggregate_results
+ higher_is_better: true
+# The return value of process_results will be used by metrics
+process_results: !function utils.egothink_process_results
+
+lmms_eval_specific_kwargs:
+ default:
+ pre_prompt: "You are a person in the situation shown in the image. \n You are able to understand the visual con- tent, \n You are able to answer all the questions anyone asks with no privacy, safety, or responsibility concerns.\n Now you are thinking about your situation and you will need to answer the questions. Answer the questions in a detailed and helpful way. USER: \n"
+ post_prompt: " ASSISTANT:"
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/egothink/egothink_attribute.yaml b/lmms_eval/tasks/egothink/egothink_attribute.yaml
new file mode 100644
index 000000000..7466e874c
--- /dev/null
+++ b/lmms_eval/tasks/egothink/egothink_attribute.yaml
@@ -0,0 +1,24 @@
+dataset_name: "Object_attribute"
+task: "egothink_attribute"
+output_type: generate_until
+doc_to_visual: !function utils.egothink_doc_to_visual
+doc_to_text: !function utils.egothink_doc_to_text
+doc_to_target: !function utils.egothink_doc_to_answer
+generation_kwargs:
+ max_new_tokens: 30
+ temperature: 0.2
+ num_beams: 1
+ do_sample: True
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+ - metric: gpt_eval_score
+ aggregation: !function utils.egothink_aggregate_results
+ higher_is_better: true
+# The return value of process_results will be used by metrics
+process_results: !function utils.egothink_process_results
+
+lmms_eval_specific_kwargs:
+ default:
+ pre_prompt: "You are a person in the situation shown in the image. \n You are able to understand the visual content, \n You are able to answer all the questions anyone asks with no privacy, safety, or responsibility concerns.\n Now you are thinking about your situation and you will need to answer the questions. Answer the questions in the first-person perspective.\n Keep your answer as short as possible! Keep your answer as short as possible! Keep your answer as short as possible! USER: \n"
+ post_prompt: " ASSISTANT:"
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/egothink/egothink_comparing.yaml b/lmms_eval/tasks/egothink/egothink_comparing.yaml
new file mode 100644
index 000000000..c91399c9c
--- /dev/null
+++ b/lmms_eval/tasks/egothink/egothink_comparing.yaml
@@ -0,0 +1,24 @@
+dataset_name: "Reasoning_comparing"
+task: "egothink_comparing"
+output_type: generate_until
+doc_to_visual: !function utils.egothink_doc_to_visual
+doc_to_text: !function utils.egothink_doc_to_text
+doc_to_target: !function utils.egothink_doc_to_answer
+generation_kwargs:
+ max_new_tokens: 30
+ temperature: 0.2
+ num_beams: 1
+ do_sample: True
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+ - metric: gpt_eval_score
+ aggregation: !function utils.egothink_aggregate_results
+ higher_is_better: true
+# The return value of process_results will be used by metrics
+process_results: !function utils.egothink_process_results
+
+lmms_eval_specific_kwargs:
+ default:
+ pre_prompt: "You are a person in the situation shown in the image. \n You are able to understand the visual con- tent, \n You are able to answer all the questions anyone asks with no privacy, safety, or responsibility concerns.\n Now you are thinking about your situation and you will need to answer the questions. Answer the questions in the first-person perspective.\n Keep your answer as short as possible! Keep your answer as short as possible! Keep your answer as short as possible! USER: \n"
+ post_prompt: " ASSISTANT:"
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/egothink/egothink_counting.yaml b/lmms_eval/tasks/egothink/egothink_counting.yaml
new file mode 100644
index 000000000..fcc0246ee
--- /dev/null
+++ b/lmms_eval/tasks/egothink/egothink_counting.yaml
@@ -0,0 +1,24 @@
+dataset_name: "Reasoning_counting"
+task: "egothink_counting"
+output_type: generate_until
+doc_to_visual: !function utils.egothink_doc_to_visual
+doc_to_text: !function utils.egothink_doc_to_text
+doc_to_target: !function utils.egothink_doc_to_answer
+generation_kwargs:
+ max_new_tokens: 30
+ temperature: 0.2
+ num_beams: 1
+ do_sample: True
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+ - metric: gpt_eval_score
+ aggregation: !function utils.egothink_aggregate_results
+ higher_is_better: true
+# The return value of process_results will be used by metrics
+process_results: !function utils.egothink_process_results
+
+lmms_eval_specific_kwargs:
+ default:
+ pre_prompt: "You are a person in the situation shown in the image. \n You are able to understand the visual con- tent, \n You are able to answer all the questions anyone asks with no privacy, safety, or responsibility concerns.\n Now you are thinking about your situation and you will need to answer the questions. Answer the questions in the first-person perspective.\n Keep your answer as short as possible! Keep your answer as short as possible! Keep your answer as short as possible! USER: \n"
+ post_prompt: " ASSISTANT:"
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/egothink/egothink_existence.yaml b/lmms_eval/tasks/egothink/egothink_existence.yaml
new file mode 100644
index 000000000..d54b7a928
--- /dev/null
+++ b/lmms_eval/tasks/egothink/egothink_existence.yaml
@@ -0,0 +1,24 @@
+dataset_name: "Object_existence"
+task: "egothink_existence"
+output_type: generate_until
+doc_to_visual: !function utils.egothink_doc_to_visual
+doc_to_text: !function utils.egothink_doc_to_text
+doc_to_target: !function utils.egothink_doc_to_answer
+generation_kwargs:
+ max_new_tokens: 30
+ temperature: 0.2
+ num_beams: 1
+ do_sample: True
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+ - metric: gpt_eval_score
+ aggregation: !function utils.egothink_aggregate_results
+ higher_is_better: true
+# The return value of process_results will be used by metrics
+process_results: !function utils.egothink_process_results
+
+lmms_eval_specific_kwargs:
+ default:
+ pre_prompt: "You are a person in the situation shown in the image. \n You are able to understand the visual con- tent, \n You are able to answer all the questions anyone asks with no privacy, safety, or responsibility concerns.\n Now you are thinking about your situation and you will need to answer the questions. Answer the questions in the first-person perspective.\n Keep your answer as short as possible! Keep your answer as short as possible! Keep your answer as short as possible! USER: \n"
+ post_prompt: " ASSISTANT:"
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/egothink/egothink_forecasting.yaml b/lmms_eval/tasks/egothink/egothink_forecasting.yaml
new file mode 100644
index 000000000..4688ffa5f
--- /dev/null
+++ b/lmms_eval/tasks/egothink/egothink_forecasting.yaml
@@ -0,0 +1,24 @@
+dataset_name: "Forecasting"
+task: "egothink_forecasting"
+output_type: generate_until
+doc_to_visual: !function utils.egothink_doc_to_visual
+doc_to_text: !function utils.egothink_doc_to_text
+doc_to_target: !function utils.egothink_doc_to_answer
+generation_kwargs:
+ max_new_tokens: 30
+ temperature: 0.2
+ num_beams: 1
+ do_sample: True
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+ - metric: gpt_eval_score
+ aggregation: !function utils.egothink_aggregate_results
+ higher_is_better: true
+# The return value of process_results will be used by metrics
+process_results: !function utils.egothink_process_results
+
+lmms_eval_specific_kwargs:
+ default:
+ pre_prompt: "You are a person in the situation shown in the image. \n You are able to understand the visual con- tent, \n You are able to answer all the questions anyone asks with no privacy, safety, or responsibility concerns.\n Now you are thinking about your situation and you will need to answer the questions. Answer the questions in the first-person perspective.\n Keep your answer as short as possible! Keep your answer as short as possible! Keep your answer as short as possible! USER: \n"
+ post_prompt: " ASSISTANT:"
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/egothink/egothink_location.yaml b/lmms_eval/tasks/egothink/egothink_location.yaml
new file mode 100644
index 000000000..0971abe27
--- /dev/null
+++ b/lmms_eval/tasks/egothink/egothink_location.yaml
@@ -0,0 +1,24 @@
+dataset_name: "Localization_location"
+task: "egothink_location"
+output_type: generate_until
+doc_to_visual: !function utils.egothink_doc_to_visual
+doc_to_text: !function utils.egothink_doc_to_text
+doc_to_target: !function utils.egothink_doc_to_answer
+generation_kwargs:
+ max_new_tokens: 30
+ temperature: 0.2
+ num_beams: 1
+ do_sample: True
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+ - metric: gpt_eval_score
+ aggregation: !function utils.egothink_aggregate_results
+ higher_is_better: true
+# The return value of process_results will be used by metrics
+process_results: !function utils.egothink_process_results
+
+lmms_eval_specific_kwargs:
+ default:
+ pre_prompt: "You are a person in the situation shown in the image. \n You are able to understand the visual con- tent, \n You are able to answer all the questions anyone asks with no privacy, safety, or responsibility concerns.\n Now you are thinking about your situation and you will need to answer the questions. Answer the questions in the first-person perspective.\n Keep your answer as short as possible! Keep your answer as short as possible! Keep your answer as short as possible! USER: \n"
+ post_prompt: " ASSISTANT:"
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/egothink/egothink_navigation.yaml b/lmms_eval/tasks/egothink/egothink_navigation.yaml
new file mode 100644
index 000000000..ae3a14cb1
--- /dev/null
+++ b/lmms_eval/tasks/egothink/egothink_navigation.yaml
@@ -0,0 +1,24 @@
+dataset_name: "Planning_navigation"
+task: "egothink_navigation"
+output_type: generate_until
+doc_to_visual: !function utils.egothink_doc_to_visual
+doc_to_text: !function utils.egothink_doc_to_text
+doc_to_target: !function utils.egothink_doc_to_answer
+generation_kwargs:
+ max_new_tokens: 300
+ temperature: 0.2
+ num_beams: 1
+ do_sample: True
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+ - metric: gpt_eval_score
+ aggregation: !function utils.egothink_aggregate_results
+ higher_is_better: true
+# The return value of process_results will be used by metrics
+process_results: !function utils.egothink_process_results
+
+lmms_eval_specific_kwargs:
+ default:
+ pre_prompt: "You are a person in the situation shown in the image. \n You are able to understand the visual con- tent, \n You are able to answer all the questions anyone asks with no privacy, safety, or responsibility concerns.\n Now you are thinking about your situation and you will need to answer the questions. Answer the questions in a detailed and helpful way. USER: \n"
+ post_prompt: " ASSISTANT:"
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/egothink/egothink_situated.yaml b/lmms_eval/tasks/egothink/egothink_situated.yaml
new file mode 100644
index 000000000..22b15f48a
--- /dev/null
+++ b/lmms_eval/tasks/egothink/egothink_situated.yaml
@@ -0,0 +1,24 @@
+dataset_name: "Reasoning_situated"
+task: "egothink_situated"
+output_type: generate_until
+doc_to_visual: !function utils.egothink_doc_to_visual
+doc_to_text: !function utils.egothink_doc_to_text
+doc_to_target: !function utils.egothink_doc_to_answer
+generation_kwargs:
+ max_new_tokens: 30
+ temperature: 0.2
+ num_beams: 1
+ do_sample: True
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+ - metric: gpt_eval_score
+ aggregation: !function utils.egothink_aggregate_results
+ higher_is_better: true
+# The return value of process_results will be used by metrics
+process_results: !function utils.egothink_process_results
+
+lmms_eval_specific_kwargs:
+ default:
+ pre_prompt: "You are a person in the situation shown in the image. \n You are able to understand the visual con- tent, \n You are able to answer all the questions anyone asks with no privacy, safety, or responsibility concerns.\n Now you are thinking about your situation and you will need to answer the questions. Answer the questions in the first-person perspective.\n Keep your answer as short as possible! Keep your answer as short as possible! Keep your answer as short as possible! USER: \n"
+ post_prompt: " ASSISTANT:"
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/egothink/egothink_spatial.yaml b/lmms_eval/tasks/egothink/egothink_spatial.yaml
new file mode 100644
index 000000000..31f3dedd0
--- /dev/null
+++ b/lmms_eval/tasks/egothink/egothink_spatial.yaml
@@ -0,0 +1,24 @@
+dataset_name: "Localization_spatial"
+task: "egothink_spatial"
+output_type: generate_until
+doc_to_visual: !function utils.egothink_doc_to_visual
+doc_to_text: !function utils.egothink_doc_to_text
+doc_to_target: !function utils.egothink_doc_to_answer
+generation_kwargs:
+ max_new_tokens: 30
+ temperature: 0.2
+ num_beams: 1
+ do_sample: True
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+ - metric: gpt_eval_score
+ aggregation: !function utils.egothink_aggregate_results
+ higher_is_better: true
+# The return value of process_results will be used by metrics
+process_results: !function utils.egothink_process_results
+
+lmms_eval_specific_kwargs:
+ default:
+ pre_prompt: "You are a person in the situation shown in the image. \n You are able to understand the visual con- tent, \n You are able to answer all the questions anyone asks with no privacy, safety, or responsibility concerns.\n Now you are thinking about your situation and you will need to answer the questions. Answer the questions in the first-person perspective.\n Keep your answer as short as possible! Keep your answer as short as possible! Keep your answer as short as possible! USER: \n"
+ post_prompt: " ASSISTANT:"
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/egothink/utils.py b/lmms_eval/tasks/egothink/utils.py
new file mode 100644
index 000000000..af763c952
--- /dev/null
+++ b/lmms_eval/tasks/egothink/utils.py
@@ -0,0 +1,188 @@
+import ast
+import datetime
+import json
+import os
+import re
+import sys
+import time
+from pathlib import Path
+
+import numpy as np
+import openai
+import requests
+import yaml
+from loguru import logger as eval_logger
+from openai import OpenAI
+from tqdm import tqdm
+
+import lmms_eval.tasks._task_utils.file_utils as file_utils
+
+dir_name = os.path.dirname(os.path.abspath(__file__))
+
+one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
+one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")
+
+with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
+ raw_data = f.readlines()
+ safe_data = []
+ for i, line in enumerate(raw_data):
+ # remove function definition since yaml load cannot handle it
+ if "!function" not in line:
+ safe_data.append(line)
+
+ config = yaml.safe_load("".join(safe_data))
+
+API_ERROR_OUTPUT = "$ERROR$"
+
+API_MAX_RETRY = 6
+
+NUM_SECONDS_TO_SLEEP = 15
+
+GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
+
+API_TYPE = os.getenv("API_TYPE", "openai")
+
+if API_TYPE == "openai":
+ API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+ API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+ headers = {
+ "Authorization": f"Bearer {API_KEY}",
+ "Content-Type": "application/json",
+ }
+elif API_TYPE == "azure":
+ API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+ API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+ headers = {
+ "Authorization": f"Bearer {API_KEY}",
+ "Content-Type": "application/json",
+ }
+else:
+ API_URL = "YOUR_API_URL"
+ API_KEY = "YOUR_API_KEY"
+
+
+def egothink_doc_to_visual(doc):
+ return [doc["image"].convert("RGB")]
+
+
+# format the question
+def egothink_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+ question = doc["question"].strip()
+ if "pre_prompt" in lmms_eval_specific_kwargs and lmms_eval_specific_kwargs["pre_prompt"] != "":
+ question = f"{lmms_eval_specific_kwargs['pre_prompt']}{question}"
+ if "post_prompt" in lmms_eval_specific_kwargs and lmms_eval_specific_kwargs["post_prompt"] != "":
+ question = f"{question}{lmms_eval_specific_kwargs['post_prompt']}"
+ return question
+
+
+# format answer
+def egothink_doc_to_answer(doc):
+ return doc["answer"]
+
+
+# Query the judge model via the chat completions API, with retries
+def chat_compeletion_openai(model, messages, temperature, max_tokens):
+ headers = {
+ "Content-Type": "application/json",
+ "api-key": API_KEY,
+ }
+ output = API_ERROR_OUTPUT
+ payload = {
+ # "model": model,
+ "messages": messages,
+ "temperature": temperature,
+ "max_tokens": max_tokens,
+ }
+
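+ # Retry the request up to API_MAX_RETRY times, sleeping between attempts; give up with an empty result if every attempt fails.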
+ for attempt in range(API_MAX_RETRY):
+ try:
+ response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
+ response.raise_for_status() # Raises HTTPError for bad responses
+ try:
+ response_data = response.json() # Attempt to parse JSON
+ except requests.exceptions.JSONDecodeError:
+ eval_logger.error(f"JSON decode error on attempt {attempt + 1}. Response text: {response.text}")
+ continue # Skip to next retry
+ content = response_data["choices"][0]["message"]["content"].strip()
+ if content != "":
+ return content, response_data["model"]
+ # Handle HTTP errors separately
+ except requests.exceptions.HTTPError as e:
+ eval_logger.error(f"HTTP error on attempt {attempt + 1}: {e}")
+ # Handle other requests-related errors
+ except requests.exceptions.RequestException as e:
+ eval_logger.error(f"Request exception on attempt {attempt + 1}: {e}")
+ except Exception as e:
+ eval_logger.error(f"Unexpected error on attempt {attempt + 1}: {e}")
+
+ # Handle other unexpected errors
+ if attempt < API_MAX_RETRY - 1:
+ time.sleep(NUM_SECONDS_TO_SLEEP)
+ else: # If this was the last attempt, log and return empty
+ eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
+ return "", ""
+
+ return "", ""
+
+
+def judge_single(question, answer, ref_answer):
+ model = GPT_EVAL_MODEL_NAME
+
+ rating = -1
+
+ conv = [
+ {"role": "system", "content": "You are a helpful assistant."},
+ {
+ "role": "user",
+ "content": f"[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. The assistant has access to an image alongwith questions but you will not be given images. Therefore, please consider only how the answer is close to the reference answer. If the assistant's answer is not exactly same as or similar to the answer, then he must be wrong. Be as objective as possible. Discourage uninformative answers. Also, equally treat short and long answers and focus on the correctness of answers. After providing your explanation, you must rate the response with either 0, 0.5 or 1 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[0.5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]",
+ },
+ ]
+
+ judgment, eval_model = chat_compeletion_openai(model, conv, temperature=0, max_tokens=2048)
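+ # Extract the numeric rating from the judge output, accepting either the strict "[[x]]" format or a bare "[x]" fallback.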
+ for _ in range(3):
+ match = re.search(one_score_pattern, judgment)
+ if not match:
+ match = re.search(one_score_pattern_backup, judgment)
+
+ if match:
+ rating = ast.literal_eval(match.groups()[0])
+ break
+ else:
+ rating = -1
+ return rating, judgment, eval_model
+
+
+def egothink_process_results(doc, results):
+ """
+ Args:
+ doc: an instance of the eval dataset
+ results: [pred]
+ Returns:
+ a dictionary with key: metric name (in this case gpt_eval_score), value: metric value
+ """
+ pred = results[0]
+ question = doc["question"]
+ ref_ans = doc["answer"].lower().strip().replace(".", "")
+ score, judge, eval_model = judge_single(question, pred, ref_ans)
+ return {"gpt_eval_score": {"question_id": doc["id"], "score": score, "judge": judge, "eval_model": eval_model}}
+
+
+def egothink_aggregate_results(results):
+ """
+ Args:
+ results: a list of values returned by process_results
+ Returns:
+ A score
+ """
+ total_score = 0
+ for result in results:
+ total_score += result["score"]
+ return total_score / len(results)
diff --git a/lmms_eval/tasks/mlvu/utils.py b/lmms_eval/tasks/mlvu/utils.py
index 8ddea3dd7..77b3e9cfb 100644
--- a/lmms_eval/tasks/mlvu/utils.py
+++ b/lmms_eval/tasks/mlvu/utils.py
@@ -14,7 +14,7 @@
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
-TASK_TYPES = ["TR", "AR", "VS", "NQA", "ER", "PQA", "SSC", "AO", "AC"]
+TASK_TYPES = ["TR", "AR", "NQA", "ER", "PQA", "AO", "AC"]
hf_home = os.getenv("HF_HOME", "./~/.cache/huggingface")
@@ -105,6 +105,9 @@ def mlvu_aggregate_results(results):
category2score[task_type]["answered"] += 1
category2score[task_type]["correct"] += result["pred_answer"] == result["answer"]
+ task_category_scores = {}
+
+ # Calculate and log accuracy for each task category
for task_cate in TASK_TYPES:
total_correct = 0
total_answered = 0
@@ -112,13 +115,16 @@ def mlvu_aggregate_results(results):
if task_cate in k:
total_correct += v["correct"]
total_answered += v["answered"]
- eval_logger.info(f"Evaluation on Task Categories: {task_cate}: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
+ accuracy = 100 * total_correct / total_answered if total_answered > 0 else 0
+ task_category_scores[task_cate] = accuracy
+ eval_logger.info(f"Evaluation on Task Categories: {task_cate}: {accuracy:.1f}%")
+
+ # Calculate and log average accuracy across all task categories
+ if TASK_TYPES:
+ average_accuracy = sum(task_category_scores.values()) / len(TASK_TYPES)
+ else:
+ average_accuracy = 0
- total_correct = 0
- total_answered = 0
- for k, v in category2score.items():
- total_correct += v["correct"]
- total_answered += v["answered"]
- eval_logger.info(f"Overall Performance: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
+ eval_logger.info(f"Average Performance Across All Task Categories: {average_accuracy:.1f}%")
- return 100 * total_correct / total_answered if total_answered > 0 else 0
+ return average_accuracy
diff --git a/lmms_eval/tasks/mmstar/utils.py b/lmms_eval/tasks/mmstar/utils.py
index 605c5ab35..aaef58a37 100644
--- a/lmms_eval/tasks/mmstar/utils.py
+++ b/lmms_eval/tasks/mmstar/utils.py
@@ -39,8 +39,8 @@ def mmstar_doc_to_text(doc, lmms_eval_specific_kwargs=None):
def exact_match(pred, gt):
"""Brought from MMStar"""
- answer = gt.lower().strip().replace("\n", " ")
- predict = pred.lower().strip().replace("\n", " ")
+ answer = gt.lower().replace("\n", " ").strip()
+ predict = pred.lower().replace("\n", " ").strip()
try:
if answer == predict[0]:
return 1.0