diff --git a/lmms_eval/tasks/vsibench/utils.py b/lmms_eval/tasks/vsibench/utils.py
new file mode 100644
index 000000000..e2e4db6ac
--- /dev/null
+++ b/lmms_eval/tasks/vsibench/utils.py
@@ -0,0 +1,153 @@
+import os
+from functools import partial
+from pathlib import Path
+
+import datasets
+import numpy as np
+import pandas as pd
+import yaml
+from loguru import logger as eval_logger
+
+MCA_QUESTION_TYPES = [
+    "object_rel_direction_easy",
+    "object_rel_direction_medium",
+    "object_rel_direction_hard",
+    "object_rel_distance",
+    "route_planning",
+    "obj_appearance_order",
+]
+NA_QUESTION_TYPES = [
+    "object_abs_distance",
+    "object_counting",
+    "object_size_estimation",
+    "room_size_estimation",
+]
+
+# Metric expressions are stored as strings and evaluated by name in
+# vsibench_process_results.
+METRICS_FOR_MCA = {
+    "accuracy": "exact_match",
+}
+
+METRICS_FOR_NA = {
+    "MRA:.5:.95:.05": "partial(mean_relative_accuracy, start=.5, end=.95, interval=.05)",
+}
+
+hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
+base_cache_dir = os.path.expanduser(hf_home)
+with open(Path(__file__).parent / "vsibench.yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for line in raw_data:
+        # Skip lines with `!function`, which yaml.safe_load cannot parse.
+        if "!function" not in line:
+            safe_data.append(line)
+cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
+
+
+def vsibench_doc_to_visual(doc):
+    cache_dir = os.path.join(base_cache_dir, cache_name)
+    video_path = doc["dataset"] + "/" + doc["scene_name"] + ".mp4"
+    video_path = os.path.join(cache_dir, video_path)
+    if not os.path.exists(video_path):
+        raise FileNotFoundError(f"video path:{video_path} does not exist.")
+    return [video_path]
+
+
+def vsibench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    lmms_eval_specific_kwargs = lmms_eval_specific_kwargs or {}
+    question = doc["question"]
+
+    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") or "These are frames of a video."
+
+    if doc["question_type"] in NA_QUESTION_TYPES:
+        post_prompt = lmms_eval_specific_kwargs.get("na_post_prompt", "") or "Please answer the question using a single word or phrase."
+        return pre_prompt + "\n" + question + "\n" + post_prompt
+    elif doc["question_type"] in MCA_QUESTION_TYPES:
+        options = "Options:\n" + "\n".join(doc["options"])
+        post_prompt = lmms_eval_specific_kwargs.get("mca_post_prompt", "") or "Answer with the option's letter from the given choices directly."
+        return "\n".join([pre_prompt, question, options, post_prompt])
+    else:
+        raise ValueError(f"Unknown question type: {doc['question_type']}")
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    if os.getenv("LMMS_EVAL_SHUFFLE_DOCS", None):
+        eval_logger.info("Environment variable LMMS_EVAL_SHUFFLE_DOCS detected, dataset will be shuffled.")
+        return dataset.shuffle(seed=42)
+    return dataset
+
+
+def fuzzy_matching(pred):
+    # Keep only the first whitespace-separated token, e.g. "A. chair" -> "A".
+    return pred.split(" ")[0].rstrip(".").strip()
+
+
+def exact_match(pred, target):
+    return 1.0 if pred.lower() == target.lower() else 0.0
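+
+
+# Numerical-answer (NA) scoring uses Mean Relative Accuracy (MRA): the
+# relative error is checked against each confidence threshold theta in
+# {0.50, 0.55, ..., 0.95} (pass when abs_dist_norm(pred, target) <= 1 - theta)
+# and the pass rate is averaged. E.g. with target=100, pred=104 (error .04)
+# passes all 10 thresholds (MRA = 1.0) while pred=128 (error .28) passes only
+# the .50-.70 thresholds (MRA = 0.5).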
+def abs_dist_norm(pred, target):
+    return abs(pred - target) / target
+
+
+def mean_relative_accuracy(pred, target, start, end, interval):
+    # The `+ 2` offsets int() truncating the inexact float ratio below,
+    # yielding the 10 thresholds {.50, .55, ..., .95} for (.5, .95, .05).
+    num_pts = (end - start) / interval + 2
+    conf_intervs = np.linspace(start, end, int(num_pts))
+    accuracy = abs_dist_norm(pred, target) <= 1 - conf_intervs
+    return accuracy.mean()
+
+
+WORST_CASE_FOR_METRICS = {
+    "accuracy": 0.0,
+    "MRA:.5:.95:.05": 0.0,
+}
+
+
+def to_float(pred):
+    try:
+        pred = float(pred)
+    except (TypeError, ValueError):
+        pred = None
+    return pred
+
+
+def vsibench_process_results(doc, results):
+    doc["prediction"] = results[0]
+    if doc["question_type"] in MCA_QUESTION_TYPES:
+        for key, value in METRICS_FOR_MCA.items():
+            doc[key] = eval(value)(fuzzy_matching(doc["prediction"]), doc["ground_truth"])
+    elif doc["question_type"] in NA_QUESTION_TYPES:
+        for key, value in METRICS_FOR_NA.items():
+            try:
+                doc[key] = eval(value)(to_float(fuzzy_matching(doc["prediction"])), to_float(doc["ground_truth"]))
+            except TypeError:
+                # Non-numeric prediction or target: score the worst case.
+                doc[key] = WORST_CASE_FOR_METRICS[key]
+    else:
+        raise ValueError(f"Unknown question type: {doc['question_type']}")
+
+    return {"vsibench_score": doc}
+
+
+def vsibench_aggregate_results(results):
+    results = pd.DataFrame(results)
+
+    output = {}
+
+    for question_type, question_type_indexes in results.groupby("question_type").groups.items():
+        per_question_type = results.iloc[question_type_indexes]
+
+        if question_type in MCA_QUESTION_TYPES:
+            for metric in METRICS_FOR_MCA.keys():
+                output[f"{question_type}_{metric}"] = per_question_type[metric].mean()
+        elif question_type in NA_QUESTION_TYPES:
+            for metric in METRICS_FOR_NA.keys():
+                output[f"{question_type}_{metric}"] = per_question_type[metric].mean()
+        else:
+            raise ValueError(f"Unknown question type: {question_type}")
+
+    # The three relative-direction splits are reported as one averaged score.
+    output["object_rel_direction_accuracy"] = sum([
+        output.pop("object_rel_direction_easy_accuracy"),
+        output.pop("object_rel_direction_medium_accuracy"),
+        output.pop("object_rel_direction_hard_accuracy"),
+    ]) / 3.0
+
+    output["overall"] = sum(output.values()) / len(output)
+    eval_logger.info(f"Evaluation results: {output}")
+    return output
diff --git a/lmms_eval/tasks/vsibench/vsibench.yaml b/lmms_eval/tasks/vsibench/vsibench.yaml
new file mode 100644
index 000000000..a1743cb65
--- /dev/null
+++ b/lmms_eval/tasks/vsibench/vsibench.yaml
@@ -0,0 +1,40 @@
+dataset_path: nyu-visionx/VSI-Bench
+dataset_kwargs:
+  token: True
+  cache_dir: vsibench
+  video: True
+task: vsibench
+test_split: test
+output_type: generate_until
+process_docs: !function utils.process_docs
+doc_to_visual: !function utils.vsibench_doc_to_visual
+doc_to_text: !function utils.vsibench_doc_to_text
+doc_to_target: "ground_truth"
+generation_kwargs:
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+# The return value of process_results will be used by metrics
+process_results: !function utils.vsibench_process_results
+# Note that the metric name can be either a registered metric function (as is the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: vsibench_score
+    aggregation: !function utils.vsibench_aggregate_results
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    mca_post_prompt: "Answer with the option's letter from the given choices directly."
+    na_post_prompt: "Please answer the question using a single word or phrase."
+  gemini_api:
+    pre_prompt: ""
+    mca_post_prompt: "Answer with the option's letter from the given choices directly."
+    na_post_prompt: "Do not respond with anything other than a single number!"
+  gpt4v:
+    pre_prompt: ""
+    mca_post_prompt: "Answer with the option's letter from the given choices directly."
+    na_post_prompt: "Do not respond with anything other than a single number!"
+metadata:
+  - version: 0.0
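As a quick sanity check of the MRA metric above (a minimal standalone sketch:
the two helpers mirror the ones defined in utils.py, and the example values
are illustrative, not taken from the benchmark):

    import numpy as np

    def abs_dist_norm(pred, target):
        # Relative error of a numerical prediction.
        return abs(pred - target) / target

    def mean_relative_accuracy(pred, target, start, end, interval):
        # Average pass/fail over thresholds {start, start+interval, ..., end}.
        num_pts = (end - start) / interval + 2
        conf_intervs = np.linspace(start, end, int(num_pts))
        return (abs_dist_norm(pred, target) <= 1 - conf_intervs).mean()

    # Error .04 passes all 10 thresholds; error .28 passes only .50-.70.
    assert mean_relative_accuracy(104.0, 100.0, start=0.5, end=0.95, interval=0.05) == 1.0
    assert mean_relative_accuracy(128.0, 100.0, start=0.5, end=0.95, interval=0.05) == 0.5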