Commit message:
* add deepseek-vl2
* Fix Lint
* Make distributed timeout configurable

Co-authored-by: Xingchao Liu <[email protected]>
Co-authored-by: kennymckormick <[email protected]>
Commit cf4d61b (parent f055c2e): 4 changed files with 174 additions and 2 deletions.
@@ -0,0 +1,163 @@
import sys
import torch
from transformers import AutoModelForCausalLM
import warnings
from .base import BaseModel
from ..smp import *
from PIL import Image


class DeepSeekVL2(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = True

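    # Fail fast if the deepseek_vl2 package is missing; it has to be installed
    # from source, so point the user at the repository.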
    def check_install(self):
        try:
            import deepseek_vl2
        except Exception as e:
            logging.critical(
                'Please first install deepseek_vl2 from source: https://github.com/deepseek-ai/DeepSeek-VL2')
            raise e

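    # Load the processor and tokenizer, load the weights in bfloat16 onto the GPU,
    # and merge any caller-supplied kwargs into the default generation config.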
    def __init__(self, model_path='deepseek-ai/deepseek-vl2-tiny', **kwargs):
        self.check_install()
        assert model_path is not None
        self.model_path = model_path
        from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM

        self.vl_chat_processor = DeepseekVLV2Processor.from_pretrained(model_path)
        self.tokenizer = self.vl_chat_processor.tokenizer

        model: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
            model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
        self.model = model.cuda().eval()

        torch.cuda.empty_cache()
        default_kwargs = dict(max_new_tokens=512, do_sample=False, use_cache=True)
        default_kwargs.update(kwargs)
        self.kwargs = default_kwargs
        warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')

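    # Convert a VLMEvalKit message (a list of {'type': 'image' | 'text', 'value': ...}
    # segments, optionally wrapped in role turns) into the conversation format that
    # DeepseekVLV2Processor expects. MMMU_DEV_VAL gets dataset-specific prompt rewriting.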
    def prepare_inputs(self, message, dataset=None):

        if dataset == 'MMMU_DEV_VAL':

            def prepare_itlist(msgs):
                # Number the inline placeholders (<image 1>, <image 2>, ...) so the
                # text can reference them, then prepend one <image> token per image.
                content, images = '', []
                image_idx = 1
                for s in msgs:
                    if s['type'] == 'image':
                        images.append(s['value'])
                        content += f'<image {image_idx}>'
                        image_idx += 1
                    elif s['type'] == 'text':
                        content += s['value']
                content = '<image>' * (image_idx - 1) + '\n' + content
                return content, images

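            # Rewrite the standard MMMU instructions so the model answers with a bare
            # option letter or a short phrase, which is what the evaluator expects.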
            conversation = []
            if 'role' not in message[0]:
                content, images = prepare_itlist(message)
                content = content.replace(
                    'Please select the correct answer from the options above.',
                    "Answer with the option's letter from the given choices directly. Answer the question using a single word or phrase.\n"  # noqa
                )
                content = content.replace('Question:', "")
                content = content.replace('Options:\n', "")
                conversation.append(dict(role='<|User|>', content=content, images=images))
            else:
                role_map = {'user': '<|User|>', 'assistant': '<|Assistant|>'}
                for msgs in message:
                    role = role_map[msgs['role']]
                    content, images = prepare_itlist(msgs['content'])
                    content = content.replace(
                        'Please select the correct answer from the options above.',
                        "Answer with the option's letter from the given choices directly. Answer the question using a single word or phrase.\n"  # noqa
                    )
                    content = content.replace('Question:', "")
                    content = content.replace('Options:\n', "")
                    conversation.append(dict(role=role, content=content, images=images))
            conversation.append(dict(role='<|Assistant|>', content=''))

        else:

            def prepare_itlist(msgs):
                content, images = '', []
                for s in msgs:
                    if s['type'] == 'image':
                        images.append(s['value'])
                        content += '<image>\n'
                    elif s['type'] == 'text':
                        content += s['value']
                return content, images

            conversation = []
            if 'role' not in message[0]:
                content, images = prepare_itlist(message)
                conversation.append(dict(role='<|User|>', content=content, images=images))
            else:
                role_map = {'user': '<|User|>', 'assistant': '<|Assistant|>'}
                for msgs in message:
                    role = role_map[msgs['role']]
                    content, images = prepare_itlist(msgs['content'])
                    conversation.append(dict(role=role, content=content, images=images))
            conversation.append(dict(role='<|Assistant|>', content=''))

        return conversation

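    # Build the processor inputs, prefill the KV cache in chunks, generate, and
    # decode only the newly produced tokens.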
    def generate_inner(self, message, dataset=None):
        conversation = self.prepare_inputs(message, dataset)
        from deepseek_vl2.utils.io import load_pil_images
        pil_images = load_pil_images(conversation)

        if dataset == 'MMMU_DEV_VAL':
            if len(pil_images):
                # PIL's Image.size is (width, height); upsample the first image 2x.
                w, h = pil_images[0].size
                pil_images[0] = pil_images[0].resize((2 * w, 2 * h), Image.BILINEAR)

        prepare_inputs = self.vl_chat_processor(
            conversations=conversation,
            images=pil_images,
            force_batchify=True,
            system_prompt=""
        )
        prepare_inputs = prepare_inputs.to(self.model.device)
        inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)

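        # incremental_prefilling runs the prompt through the model in 512-token
        # chunks to bound peak GPU memory, returning a primed KV cache.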
        inputs_embeds, past_key_values = self.model.incremental_prefilling(
            input_ids=prepare_inputs.input_ids,
            images=prepare_inputs.images,
            images_seq_mask=prepare_inputs.images_seq_mask,
            images_spatial_crop=prepare_inputs.images_spatial_crop,
            attention_mask=prepare_inputs.attention_mask,
            chunk_size=512
        )

        # run the model to get the response
        outputs = self.model.generate(
            inputs_embeds=inputs_embeds,
            input_ids=prepare_inputs.input_ids,
            images=prepare_inputs.images,
            images_seq_mask=prepare_inputs.images_seq_mask,
            images_spatial_crop=prepare_inputs.images_spatial_crop,
            attention_mask=prepare_inputs.attention_mask,
            past_key_values=past_key_values,
            pad_token_id=self.tokenizer.eos_token_id,
            bos_token_id=self.tokenizer.bos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            **self.kwargs
        )

        # Decode only the tokens generated after the prompt; drop any trailing period.
        answer = self.tokenizer.decode(
            outputs[0][len(prepare_inputs.input_ids[0]):].cpu().tolist(),
            skip_special_tokens=True
        )
        answer = answer.rstrip('.')

        return answer

    def chat_inner(self, message, dataset=None):
        return self.generate_inner(message, dataset=dataset)
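
A minimal usage sketch, for illustration only and not part of this commit. It assumes a CUDA-capable GPU with deepseek_vl2 installed from source; the import path and the image filename below are assumptions, not taken from the diff:

# Hypothetical driver script ('vlmeval.vlm' and 'demo.jpg' are placeholders).
from vlmeval.vlm import DeepSeekVL2

model = DeepSeekVL2(model_path='deepseek-ai/deepseek-vl2-tiny', max_new_tokens=128)

# VLMEvalKit-style interleaved message: a list of typed segments.
message = [
    dict(type='image', value='demo.jpg'),
    dict(type='text', value='Describe this image in one sentence.'),
]
print(model.generate_inner(message))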