[Model] Add DeepSeek-VL2 (#697)
* add deepseek-vl2

* Fix Lint

* Make distributed timeout configurable

---------

Co-authored-by: Xingchao Liu <[email protected]>
Co-authored-by: kennymckormick <[email protected]>
3 people authored Dec 27, 2024
1 parent f055c2e commit cf4d61b
Showing 4 changed files with 174 additions and 2 deletions.
5 changes: 4 additions & 1 deletion run.py
@@ -181,7 +181,10 @@ def main():
     if world_size > 1:
         local_rank = os.environ.get('LOCAL_RANK', 0)
         torch.cuda.set_device(int(local_rank))
-        dist.init_process_group(backend='nccl', timeout=datetime.timedelta(seconds=3600))
+        dist.init_process_group(
+            backend='nccl',
+            timeout=datetime.timedelta(seconds=int(os.environ.get('DIST_TIMEOUT', 3600)))
+        )
 
     for _, model_name in enumerate(args.model):
         model = None
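With this change, slow checkpoint loads or long evaluations no longer have to finish inside a hard-coded one-hour NCCL window: the timeout is read from the DIST_TIMEOUT environment variable, in seconds. A minimal sketch of how a launch might look; the torchrun invocation and flag values below are illustrative, not part of this commit:

# Hypothetical launch giving collectives up to 2 hours:
#   DIST_TIMEOUT=7200 torchrun --nproc-per-node=2 run.py --model deepseek_vl2 --data MMMU_DEV_VAL
import datetime
import os

# Mirrors the parsing in run.py above: env var (seconds) -> timedelta.
timeout = datetime.timedelta(seconds=int(os.environ.get('DIST_TIMEOUT', 3600)))
print(timeout)  # 2:00:00 when DIST_TIMEOUT=7200, else 1:00:00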
7 changes: 6 additions & 1 deletion vlmeval/config.py
@@ -259,6 +259,11 @@
     'deepseek_vl_1.3b': partial(DeepSeekVL, model_path='deepseek-ai/deepseek-vl-1.3b-chat'),
 }
 
+deepseekvl2_series = {
+    'deepseek_vl2_tiny': partial(DeepSeekVL2, model_path='deepseek-ai/deepseek-vl2-tiny'),
+    'deepseek_vl2_small': partial(DeepSeekVL2, model_path='deepseek-ai/deepseek-vl2-small'),
+    'deepseek_vl2': partial(DeepSeekVL2, model_path='deepseek-ai/deepseek-vl2'),
+}
 
 janus_series = {
     'Janus-1.3B': partial(Janus, model_path='deepseek-ai/Janus-1.3B')
@@ -406,7 +411,7 @@
     ungrouped, api_models,
     xtuner_series, qwen_series, llava_series, internvl_series, yivl_series,
     xcomposer_series, minigpt4_series, idefics_series, instructblip_series,
-    deepseekvl_series, janus_series, minicpm_series, cogvlm_series, wemm_series,
+    deepseekvl_series, deepseekvl2_series, janus_series, minicpm_series, cogvlm_series, wemm_series,
     cambrian_series, chameleon_series, video_models, ovis_series, vila_series,
     mantis_series, mmalaya_series, phi3_series, xgen_mm_series, qwen2vl_series,
     slime_series, eagle_series, moondream_series, llama_series, molmo_series,
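Registering the three checkpoints in a *_series dict is all the config change needed for them to be resolvable by name. A minimal sketch of looking one up, assuming VLMEvalKit's usual pattern of aggregating every *_series dict into the supported_VLM registry; the name comes from the diff above:

# Sketch: resolve a registered model name to a ready instance.
from vlmeval.config import supported_VLM

model = supported_VLM['deepseek_vl2_tiny']()  # invokes the stored partial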
1 change: 1 addition & 0 deletions vlmeval/vlm/__init__.py
@@ -29,6 +29,7 @@
 from .yi_vl import Yi_VL
 from .internvl import InternVLChat
 from .deepseek_vl import DeepSeekVL
+from .deepseek_vl2 import DeepSeekVL2
 from .janus import Janus
 from .mgm import Mini_Gemini
 from .bunnyllama3 import BunnyLLama3
163 changes: 163 additions & 0 deletions vlmeval/vlm/deepseek_vl2.py
@@ -0,0 +1,163 @@
import sys
import torch
from transformers import AutoModelForCausalLM
import warnings
from .base import BaseModel
from ..smp import *
from PIL import Image


class DeepSeekVL2(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = True

    def check_install(self):
        try:
            import deepseek_vl2
        except Exception as e:
            logging.critical(
                'Please install deepseek_vl2 from source first: https://github.com/deepseek-ai/DeepSeek-VL2')
            raise e

    def __init__(self, model_path='deepseek-ai/deepseek-vl2-tiny', **kwargs):
        self.check_install()
        assert model_path is not None
        self.model_path = model_path
        from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM

        self.vl_chat_processor = DeepseekVLV2Processor.from_pretrained(model_path)
        self.tokenizer = self.vl_chat_processor.tokenizer

        model: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16
        )
        self.model = model.cuda().eval()

        torch.cuda.empty_cache()
        default_kwargs = dict(max_new_tokens=512, do_sample=False, use_cache=True)
        default_kwargs.update(kwargs)
        self.kwargs = default_kwargs
        warnings.warn(f'Following kwargs received: {self.kwargs}, will be used as the generation config.')

    def prepare_inputs(self, message, dataset=None):

        if dataset == 'MMMU_DEV_VAL':

            def prepare_itlist(msgs):
                # Tag each image with a numbered placeholder in the prompt text,
                # then prepend one '<image>' token per image for the processor.
                content, images = '', []
                image_idx = 1
                for s in msgs:
                    if s['type'] == 'image':
                        images.append(s['value'])
                        content += f'<image {image_idx}>'
                        image_idx += 1
                    elif s['type'] == 'text':
                        content += s['value']
                content = '<image>' * (image_idx - 1) + '\n' + content
                return content, images

            conversation = []
            if 'role' not in message[0]:
                content, images = prepare_itlist(message)
                content = content.replace(
                    'Please select the correct answer from the options above.',
                    "Answer with the option's letter from the given choices directly. Answer the question using a single word or phrase.\n"  # noqa
                )
                content = content.replace('Question:', '')
                content = content.replace('Options:\n', '')
                conversation.append(dict(role='<|User|>', content=content, images=images))
            else:
                role_map = {'user': '<|User|>', 'assistant': '<|Assistant|>'}
                for msgs in message:
                    role = role_map[msgs['role']]
                    content, images = prepare_itlist(msgs['content'])
                    content = content.replace(
                        'Please select the correct answer from the options above.',
                        "Answer with the option's letter from the given choices directly. Answer the question using a single word or phrase.\n"  # noqa
                    )
                    content = content.replace('Question:', '')
                    content = content.replace('Options:\n', '')
                    conversation.append(dict(role=role, content=content, images=images))
            conversation.append(dict(role='<|Assistant|>', content=''))

        else:

            def prepare_itlist(msgs):
                content, images = '', []
                for s in msgs:
                    if s['type'] == 'image':
                        images.append(s['value'])
                        content += '<image>\n'
                    elif s['type'] == 'text':
                        content += s['value']
                return content, images

            conversation = []
            if 'role' not in message[0]:
                content, images = prepare_itlist(message)
                conversation.append(dict(role='<|User|>', content=content, images=images))
            else:
                role_map = {'user': '<|User|>', 'assistant': '<|Assistant|>'}
                for msgs in message:
                    role = role_map[msgs['role']]
                    content, images = prepare_itlist(msgs['content'])
                    conversation.append(dict(role=role, content=content, images=images))
            conversation.append(dict(role='<|Assistant|>', content=''))

        return conversation

    def generate_inner(self, message, dataset=None):
        conversation = self.prepare_inputs(message, dataset)
        from deepseek_vl2.utils.io import load_pil_images
        pil_images = load_pil_images(conversation)

        if dataset == 'MMMU_DEV_VAL':
            if len(pil_images):
                # PIL's Image.size is (width, height); upsample the first image 2x.
                w, h = pil_images[0].size
                pil_images[0] = pil_images[0].resize((2 * w, 2 * h), Image.BILINEAR)

        prepare_inputs = self.vl_chat_processor(
            conversations=conversation,
            images=pil_images,
            force_batchify=True,
            system_prompt=''
        )
        prepare_inputs = prepare_inputs.to(self.model.device)
        inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)

        # Prefill the prompt in fixed-size chunks to bound peak memory on long inputs.
        inputs_embeds, past_key_values = self.model.incremental_prefilling(
            input_ids=prepare_inputs.input_ids,
            images=prepare_inputs.images,
            images_seq_mask=prepare_inputs.images_seq_mask,
            images_spatial_crop=prepare_inputs.images_spatial_crop,
            attention_mask=prepare_inputs.attention_mask,
            chunk_size=512
        )

        # Run the model to get the response.
        outputs = self.model.generate(
            inputs_embeds=inputs_embeds,
            input_ids=prepare_inputs.input_ids,
            images=prepare_inputs.images,
            images_seq_mask=prepare_inputs.images_seq_mask,
            images_spatial_crop=prepare_inputs.images_spatial_crop,
            attention_mask=prepare_inputs.attention_mask,
            past_key_values=past_key_values,
            pad_token_id=self.tokenizer.eos_token_id,
            bos_token_id=self.tokenizer.bos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            **self.kwargs
        )

        # Decode only the newly generated tokens, skipping the echoed prompt.
        answer = self.tokenizer.decode(
            outputs[0][len(prepare_inputs.input_ids[0]):].cpu().tolist(),
            skip_special_tokens=True
        )
        answer = answer.rstrip('.')

        return answer

    def chat_inner(self, message, dataset=None):
        return self.generate_inner(message, dataset=dataset)
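For reference, a minimal sketch of exercising the new wrapper directly. The interleaved message schema (dicts with 'type' and 'value' keys) follows the BaseModel convention used across vlmeval; the image path and prompt below are illustrative:

from vlmeval.vlm import DeepSeekVL2

# 'demo.jpg' is a placeholder; any local image path works.
model = DeepSeekVL2(model_path='deepseek-ai/deepseek-vl2-tiny', max_new_tokens=128)
message = [
    dict(type='image', value='demo.jpg'),
    dict(type='text', value='Describe this image in one sentence.'),
]
print(model.generate(message))  # BaseModel.generate dispatches to generate_inner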
