From 4bea02bef3003eb2ca4ae43fb7f49aad54c6c5ec Mon Sep 17 00:00:00 2001 From: Xiangyu Zhao <98592339+PhoenixZ810@users.noreply.github.com> Date: Sun, 29 Dec 2024 23:59:51 +0800 Subject: [PATCH] modify valley import (#700) --- vlmeval/vlm/valley/valley_eagle_chat.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/vlmeval/vlm/valley/valley_eagle_chat.py b/vlmeval/vlm/valley/valley_eagle_chat.py index d556e745..4c82ccc9 100644 --- a/vlmeval/vlm/valley/valley_eagle_chat.py +++ b/vlmeval/vlm/valley/valley_eagle_chat.py @@ -8,10 +8,6 @@ from transformers import AutoTokenizer, AutoProcessor import re -from .valley_eagle.model.language_model.valley_qwen2 import ValleyQwen2ForCausalLM -from .valley_eagle.util.mm_utils import process_anyres_image -from .valley_eagle import conversation as conversation_lib -from .valley_eagle.util.data_util import dynamic_preprocess, preprocess IGNORE_INDEX = -100 IMAGE_TOKEN_INDEX = -200 @@ -124,6 +120,11 @@ class ValleyEagleChat(BaseModel): def __init__(self, model_path='liuhaotian/llava_v1.5_7b', **kwargs): + from .valley_eagle.model.language_model.valley_qwen2 import ValleyQwen2ForCausalLM + from .valley_eagle.util.mm_utils import process_anyres_image + from .valley_eagle import conversation as conversation_lib + from .valley_eagle.util.data_util import dynamic_preprocess, preprocess + torch_dtype = torch.float16 padding_side = 'left' use_fast = True @@ -144,6 +145,8 @@ def __init__(self, self.model_path = model_path self.model = ValleyQwen2ForCausalLM.from_pretrained(model_path, torch_dtype=torch_dtype) self.model = self.model.to(self.device).half() + self.process_anyres_image = process_anyres_image + self.preprocess = preprocess # should check this code self.model.config.min_tile_num = 1 @@ -192,7 +195,7 @@ def preprocess_images( video_pad = [] for img in images: if self.model.config.anyres: - image = process_anyres_image(img, self.image_processor, self.model.config.grid_pinpoints) + image = self.process_anyres_image(img, self.image_processor, self.model.config.grid_pinpoints) else: image = self.image_processor(img, return_tensors="pt")["pixel_values"][0] @@ -269,7 +272,7 @@ def generate_inner(self, message, dataset=None): img_length = len(video_images_tensor) source = preprocess_multimodal(messages, img_length, self.model.config) - data_dict = preprocess( + data_dict = self.preprocess( source, self.tokenizer, has_image=True,