Skip to content

Commit

Permalink
support deepseekvl
Browse files Browse the repository at this point in the history
  • Loading branch information
kennymckormick committed Mar 20, 2024
1 parent f38fd35 commit 2b573fb
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 31 deletions.
89 changes: 58 additions & 31 deletions vlmeval/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,46 +9,20 @@
OmniLMM_ROOT = None
LLAVA_V1_7B_MODEL_PTH = 'Please set your local path to LLaVA-7B-v1.1 here, the model weight is obtained by merging LLaVA delta weight based on vicuna-7b-v1.1 in https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md with vicuna-7b-v1.1. '

models = {
'qwen_base': partial(QwenVL, model_path='Qwen/Qwen-VL'),
ungrouped = {
'TransCore_M': partial(TransCoreM, root=TransCore_ROOT),
'qwen_chat': partial(QwenVLChat, model_path='Qwen/Qwen-VL-Chat'),
'PandaGPT_13B': partial(PandaGPT, name='PandaGPT_13B', root=PandaGPT_ROOT),
'flamingov2': partial(OpenFlamingo, name='v2', mpt_pth='anas-awadalla/mpt-7b', ckpt_pth='openflamingo/OpenFlamingo-9B-vitl-mpt7b'),
'flamingov2_fs': partial(OpenFlamingo, name='v2', with_context=True, mpt_pth='anas-awadalla/mpt-7b', ckpt_pth='openflamingo/OpenFlamingo-9B-vitl-mpt7b'),
'idefics_9b_instruct': partial(IDEFICS, model_pth='HuggingFaceM4/idefics-9b-instruct'),
'idefics_80b_instruct': partial(IDEFICS, model_pth='HuggingFaceM4/idefics-80b-instruct'),
'idefics_9b_instruct_fs': partial(IDEFICS, model_pth='HuggingFaceM4/idefics-9b-instruct', with_context=True),
'idefics_80b_instruct_fs': partial(IDEFICS, model_pth='HuggingFaceM4/idefics-80b-instruct', with_context=True),
'llava_v1.5_7b': partial(LLaVA, model_pth='liuhaotian/llava-v1.5-7b'),
'llava_v1.5_13b': partial(LLaVA, model_pth='liuhaotian/llava-v1.5-13b'),
'llava_v1_7b': partial(LLaVA, model_pth=LLAVA_V1_7B_MODEL_PTH),
'sharegpt4v_7b': partial(LLaVA, model_pth='Lin-Chen/ShareGPT4V-7B'),
'sharegpt4v_13b': partial(LLaVA, model_pth='Lin-Chen/ShareGPT4V-13B'),
'instructblip_7b': partial(InstructBLIP, name='instructblip_7b'),
'instructblip_13b': partial(InstructBLIP, name='instructblip_13b'),
'VisualGLM_6b': partial(VisualGLM, model_path='THUDM/visualglm-6b'),
'MiniGPT-4-v2': partial(MiniGPT4, mode='v2', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-7B': partial(MiniGPT4, mode='v1_7b', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-13B': partial(MiniGPT4, mode='v1_13b', root=MiniGPT4_ROOT),
'XComposer': partial(XComposer, model_path='internlm/internlm-xcomposer-vl-7b'),
'XComposer2': partial(XComposer2, model_path='internlm/internlm-xcomposer2-vl-7b'),
'mPLUG-Owl2': partial(mPLUG_Owl2, model_path='MAGAer13/mplug-owl2-llama2-7b'),
'cogvlm-grounding-generalist':partial(CogVlm, name='cogvlm-grounding-generalist',tokenizer_name ='lmsys/vicuna-7b-v1.5'),
'cogvlm-chat':partial(CogVlm, name='cogvlm-chat',tokenizer_name ='lmsys/vicuna-7b-v1.5'),
'sharedcaptioner':partial(SharedCaptioner, model_path='Lin-Chen/ShareCaptioner'),
'emu2':partial(Emu, name='emu2'),
'emu2_chat':partial(Emu, name='emu2_chat'),
'monkey':partial(Monkey, model_path='echo840/Monkey'),
'monkey-chat':partial(MonkeyChat, model_path='echo840/Monkey-Chat'),
'Yi_VL_6B':partial(Yi_VL, model_path='01-ai/Yi-VL-6B', root=Yi_ROOT),
'Yi_VL_34B':partial(Yi_VL, model_path='01-ai/Yi-VL-34B', root=Yi_ROOT),
'MMAlaya':partial(MMAlaya, model_path='DataCanvas/MMAlaya'),
'MiniCPM-V':partial(MiniCPM_V, model_path='openbmb/MiniCPM-V'),
'OmniLMM_12B':partial(OmniLMM12B, model_path='openbmb/OmniLMM-12B', root=OmniLMM_ROOT),
'InternVL-Chat-V1-1':partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-1'),
'InternVL-Chat-V1-2': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-2'),
'InternVL-Chat-V1-2-Plus': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus'),
}

api_models = {
Expand All @@ -65,22 +39,75 @@
'GeminiProVision': partial(GeminiProVision, temperature=0, retry=10),
'QwenVLPlus': partial(QwenVLAPI, model='qwen-vl-plus', temperature=0, retry=10),
'QwenVLMax': partial(QwenVLAPI, model='qwen-vl-max', temperature=0, retry=10),
# Internal Only
'Step1V': partial(Step1V, temperature=0, retry=10),
# Internal Only
'Claude3V_Opus': partial(Claude3V, model='claude-3-opus-20240229', temperature=0, retry=10),
'Claude3V_Sonnet': partial(Claude3V, model='claude-3-sonnet-20240229', temperature=0, retry=10),
'Claude3V_Haiku': partial(Claude3V, model='claude-3-haiku-20240307', temperature=0, retry=10),
}

xtuner_models = {
xtuner_series = {
'llava-internlm2-7b': partial(LLaVA_XTuner, llm_path='internlm/internlm2-chat-7b', llava_path='xtuner/llava-internlm2-7b', visual_select_layer=-2, prompt_template='internlm2_chat'),
'llava-internlm2-20b': partial(LLaVA_XTuner, llm_path='internlm/internlm2-chat-20b', llava_path='xtuner/llava-internlm2-20b', visual_select_layer=-2, prompt_template='internlm2_chat'),
'llava-internlm-7b': partial(LLaVA_XTuner, llm_path='internlm/internlm-chat-7b', llava_path='xtuner/llava-internlm-7b', visual_select_layer=-2, prompt_template='internlm_chat'),
'llava-v1.5-7b-xtuner': partial(LLaVA_XTuner, llm_path='lmsys/vicuna-7b-v1.5', llava_path='xtuner/llava-v1.5-7b-xtuner', visual_select_layer=-2, prompt_template='vicuna'),
'llava-v1.5-13b-xtuner': partial(LLaVA_XTuner, llm_path='lmsys/vicuna-13b-v1.5', llava_path='xtuner/llava-v1.5-13b-xtuner', visual_select_layer=-2, prompt_template='vicuna'),
}

qwen_series = {
'qwen_base': partial(QwenVL, model_path='Qwen/Qwen-VL'),
'qwen_chat': partial(QwenVLChat, model_path='Qwen/Qwen-VL-Chat'),
'monkey':partial(Monkey, model_path='echo840/Monkey'),
'monkey-chat':partial(MonkeyChat, model_path='echo840/Monkey-Chat')
}

llava_series = {
'llava_v1.5_7b': partial(LLaVA, model_pth='liuhaotian/llava-v1.5-7b'),
'llava_v1.5_13b': partial(LLaVA, model_pth='liuhaotian/llava-v1.5-13b'),
'llava_v1_7b': partial(LLaVA, model_pth=LLAVA_V1_7B_MODEL_PTH),
'sharegpt4v_7b': partial(LLaVA, model_pth='Lin-Chen/ShareGPT4V-7B'),
'sharegpt4v_13b': partial(LLaVA, model_pth='Lin-Chen/ShareGPT4V-13B'),
}

internvl_series = {
'InternVL-Chat-V1-1':partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-1'),
'InternVL-Chat-V1-2': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-2'),
'InternVL-Chat-V1-2-Plus': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus'),
}

yivl_series = {
'Yi_VL_6B':partial(Yi_VL, model_path='01-ai/Yi-VL-6B', root=Yi_ROOT),
'Yi_VL_34B':partial(Yi_VL, model_path='01-ai/Yi-VL-34B', root=Yi_ROOT),
}

xcomposer_series = {
'XComposer': partial(XComposer, model_path='internlm/internlm-xcomposer-vl-7b'),
'XComposer2': partial(XComposer2, model_path='internlm/internlm-xcomposer2-vl-7b'),
}

minigpt4_series = {
'MiniGPT-4-v2': partial(MiniGPT4, mode='v2', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-7B': partial(MiniGPT4, mode='v1_7b', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-13B': partial(MiniGPT4, mode='v1_13b', root=MiniGPT4_ROOT),
}

idefics_series = {
'idefics_9b_instruct': partial(IDEFICS, model_pth='HuggingFaceM4/idefics-9b-instruct'),
'idefics_80b_instruct': partial(IDEFICS, model_pth='HuggingFaceM4/idefics-80b-instruct'),
}

instructblip_series = {
'instructblip_7b': partial(InstructBLIP, name='instructblip_7b'),
'instructblip_13b': partial(InstructBLIP, name='instructblip_13b'),
}

supported_VLM = {}
for model_set in [models, api_models, xtuner_models]:
supported_VLM.update(model_set)

model_groups = [
ungrouped, api_models,
xtuner_series, qwen_series, llava_series, internvl_series, yivl_series,
xcomposer_series, minigpt4_series, idefics_series, instructblip_series,
]

for grp in model_groups:
supported_VLM.update(grp)
13 changes: 13 additions & 0 deletions vlmeval/smp/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,16 @@ def load_env():
os.environ[k] = v
print(f'API Keys successfully loaded from {pth}')
return

def pip_install_robust(package):
import sys
retry = 3
while retry > 0:
try:
package_base = package.split('=')[0]
module = __import__(package)
return True
except ImportError:
subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
retry -= 1
return False
1 change: 1 addition & 0 deletions vlmeval/vlm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@
from .xcomposer2 import XComposer2
from .yi_vl import Yi_VL
from .internvl_chat import InternVLChat
from .deepseek_vl import DeepSeekVL
70 changes: 70 additions & 0 deletions vlmeval/vlm/deepseek_vl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import sys
import torch
from transformers import AutoModelForCausalLM
import warnings
from vlmeval.smp import isimg, pip_install


class DeepSeekVL:

INSTALL_REQ = True

def check_install(self):
installed = pip_install('deepseek_vl')
if not installed:
warnings.warn(
'Please first install deepseek_vl from source codes in: https://github.com/deepseek-ai/DeepSeek-VL')
sys.exit(-1)

def __init__(self, model_path='deepseek-ai/deepseek-vl-7b-chat', **kwargs):
self.check_install()
assert model_path is not None
self.model_path = model_path
from deepseek_vl.models import VLChatProcessor

self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
self.tokenizer = self.vl_chat_processor.tokenizer

model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device='cpu')
self.model = model.to(torch.bfloat16).cuda().eval()

torch.cuda.empty_cache()
default_kwargs = dict(max_new_tokens=512, do_sample=False, use_cache=True)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

def prepare_inputs(self, msgs):
content, images = '', []
for s in msgs:
if isimg(s):
images.append(s)
content += '<image_placeholder>'
else:
content += s
conversation = [
dict(role='User', content=content, images=images),
dict(role='Assistant', content='')
]
return conversation

def interleave_generate(self, ti_list, dataset=None):
conversation = self.prepare_inputs(ti_list)
from deepseek_vl.utils.io import load_pil_images
pil_images = load_pil_images(conversation)
prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True)
prepare_inputs = prepare_inputs.to(self.model.device)
input_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)

outputs = self.model.language_model.generate(
input_embeds=input_embeds,
attention_mask=prepare_inputs.attention_mask,
pad_token_id=self.tokenizer.eos_token_id,
bos_token_id=self.tokenizer.bos_token_id,
eos_token_id=self.tokenizer.eos_token_id,
**self.kwargs)
answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
return answer

def generate(self, image_path, prompt, dataset=None):
return self.interleave_generate([image_path, prompt], dataset=dataset)

0 comments on commit 2b573fb

Please sign in to comment.