Skip to content

Commit

Permalink
[Model] Support QVQ
Browse files — browse the repository at this point in the history
  • Loading branch information
kennymckormick committed Dec 30, 2024
1 parent 9bc5008 commit cb80ee0
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 2 deletions.
1 change: 1 addition & 0 deletions vlmeval/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@
qwen2vl_series = {
'Qwen-VL-Max-0809': partial(Qwen2VLAPI, model='qwen-vl-max-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen-VL-Plus-0809': partial(Qwen2VLAPI, model='qwen-vl-plus-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
'QVQ-72B-Preview': partial(Qwen2VLChat, model_path='Qwen/QVQ-72B-Preview', min_pixels=1280*28*28, max_pixels=16384*28*28, system_prompt='You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.', max_new_tokens=8192, post_process=False),
'Qwen2-VL-72B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-72B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-AWQ', min_pixels=1280*28*28, max_pixels=16384*28*28),
Expand Down
20 changes: 20 additions & 0 deletions vlmeval/vlm/qwen2_vl/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def __init__(
repetition_penalty=1.0,
use_custom_prompt: bool = True,
system_prompt: str | None = None,
post_process: bool = False, # if True, will try to only extract stuff in the last \boxed{}.
verbose: bool = False,
):
super().__init__(use_custom_prompt=use_custom_prompt)
Expand All @@ -90,6 +91,7 @@ def __init__(
)
self.system_prompt = system_prompt
self.verbose = verbose
self.post_process = post_process
self.fps = 2.0
self.nframe = 64
self.FRAME_FACTOR = 2
Expand Down Expand Up @@ -195,6 +197,24 @@ def generate_inner(self, message, dataset=None):
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
response = out[0]
if self.post_process:
resp = response.split('\\boxed{')[-1]
lt = len(resp)
counter, end = 1, None
for i in range(lt):
if resp[i] == '{':
counter += 1
elif resp[i] == '}':
counter -= 1
if counter == 0:
end = i
break
elif i == lt - 1:
end = lt
break
if end is not None:
response = resp[:end]

if self.verbose:
print(f'\033[32m{response}\033[0m')
return response
4 changes: 2 additions & 2 deletions vlmeval/vlm/vita.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ def __init__(self, model_path='VITA/vita', root=None, **kwargs):

model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, _ = load_pretrained_model(
model_path, None, model_name, model_type='qwen2p5_instruct', device_map='auto'
model_path, None, model_name, model_type='qwen2p5_instruct', device_map='cuda'
)
# model.cuda().eval()
model = model.eval()
# model.tie_weights()

audio_encoder = model.get_audio_encoder()
Expand Down

0 comments on commit cb80ee0

Please sign in to comment.