diff --git a/vlmeval/vlm/valley/valley_eagle/model/multimodal_encoder/builder.py b/vlmeval/vlm/valley/valley_eagle/model/multimodal_encoder/builder.py
index 99cbc7e9..5f699af6 100644
--- a/vlmeval/vlm/valley/valley_eagle/model/multimodal_encoder/builder.py
+++ b/vlmeval/vlm/valley/valley_eagle/model/multimodal_encoder/builder.py
@@ -1,9 +1,9 @@
 import torch
-from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VisionTransformerPretrainedModel
 from ...util.vision_encoder_config import qwen2vl_vit_config
 
 
 def build_vision_tower(vision_tower_cfg, **kwargs):
+    from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VisionTransformerPretrainedModel
     vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None))
     if getattr(vision_tower_cfg, "language", None) is None:
         vision_tower_cfg.language = "chinese" if "chinese" in vision_tower else "english"
diff --git a/vlmeval/vlm/valley/valley_eagle/model/token_compressor/evo.py b/vlmeval/vlm/valley/valley_eagle/model/token_compressor/evo.py
index 6d5b6134..39b00577 100644
--- a/vlmeval/vlm/valley/valley_eagle/model/token_compressor/evo.py
+++ b/vlmeval/vlm/valley/valley_eagle/model/token_compressor/evo.py
@@ -1,6 +1,5 @@
 import torch
 from torch import nn
-from timm.models.layers import trunc_normal_
 import torch.nn.functional as F
 
 
@@ -43,6 +42,7 @@ def __init__(self, embed_dim=2048, inner_dim=64, prune_ratio=0.25, **kwargs):
         self.apply(self._init_weights)
 
     def _init_weights(self, m):
+        from timm.models.layers import trunc_normal_
         if isinstance(m, nn.Linear):
             trunc_normal_(m.weight, std=.02)
         if isinstance(m, nn.Linear) and m.bias is not None:
diff --git a/vlmeval/vlm/valley/valley_eagle/model/token_compressor/lavit.py b/vlmeval/vlm/valley/valley_eagle/model/token_compressor/lavit.py
index a968466b..6c5b5b96 100644
--- a/vlmeval/vlm/valley/valley_eagle/model/token_compressor/lavit.py
+++ b/vlmeval/vlm/valley/valley_eagle/model/token_compressor/lavit.py
@@ -1,6 +1,5 @@
 import torch
 from torch import nn
-from timm.models.layers import trunc_normal_
 import torch.nn.functional as F
 
 
@@ -40,6 +39,7 @@ def __init__(self, embed_dim=2048, inner_dim=64, **kwargs):
         self.apply(self._init_weights)
 
     def _init_weights(self, m):
+        from timm.models.layers import trunc_normal_
         if isinstance(m, nn.Linear):
             trunc_normal_(m.weight, std=.02)
         if isinstance(m, nn.Linear) and m.bias is not None:
diff --git a/vlmeval/vlm/valley/valley_eagle_chat.py b/vlmeval/vlm/valley/valley_eagle_chat.py
index 31a52705..d556e745 100644
--- a/vlmeval/vlm/valley/valley_eagle_chat.py
+++ b/vlmeval/vlm/valley/valley_eagle_chat.py
@@ -6,7 +6,6 @@
 import logging
 from transformers import set_seed
 from transformers import AutoTokenizer, AutoProcessor
-from qwen_vl_utils import fetch_image, fetch_video
 import re
 
 from .valley_eagle.model.language_model.valley_qwen2 import ValleyQwen2ForCausalLM
@@ -240,6 +239,7 @@ def generate_inner(self, message, dataset=None):
         messages_qwen = []
         image_list = []
         for image_file in images:
+            from qwen_vl_utils import fetch_image
             image = fetch_image({"image": image_file})
             image_list.append(image)
             messages_qwen.append({'role': 'user', "content": [{"type": "text", "text": text}]})
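
All four hunks apply the same deferred-import pattern: optional heavy dependencies (the Qwen2-VL vision tower from transformers, timm, qwen_vl_utils) move from module top level into the functions that use them, so importing the valley package no longer fails when those extras are not installed; an ImportError surfaces only if the code path that needs them actually runs. A minimal sketch of the pattern, with an illustrative wrapper function and try/except that are not part of the PR:

def needs_optional_dep():
    # Deferred import: qwen_vl_utils is resolved at call time, not when
    # this module is imported, mirroring generate_inner() in the diff above.
    from qwen_vl_utils import fetch_image
    return fetch_image

try:
    fetch = needs_optional_dep()
except ImportError:
    fetch = None  # degrade gracefully when qwen_vl_utils is absent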