@@ -5403,7 +5403,18 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
54035403
54045404 if lora_names := hparams .get ("lora_adaptations" ):
54055405 self ._lora_names = lora_names
5406- self .model_arch = gguf .MODEL_ARCH .JINA_BERT_V3
5406+
5407+ try :
5408+ text_cfg = hparams .get ("text_config" , {}) if isinstance (hparams .get ("text_config" , {}), dict ) else {}
5409+ pe_type = (text_cfg .get ("position_embedding_type" ) or hparams .get ("position_embedding_type" ) or "" ).lower ()
5410+ rope_base = text_cfg .get ("rotary_emb_base" , hparams .get ("rotary_emb_base" ))
5411+ name_path = (hparams .get ("_name_or_path" ) or "" ).lower ()
5412+ is_vx = ("jina" in name_path and ("v2" in name_path or "v3" in name_path ))
5413+ is_v3 = (pe_type == "rotary" or rope_base is not None ) and is_vx
5414+ if (is_v3 ) or self ._lora_names :
5415+ self .model_arch = gguf .MODEL_ARCH .JINA_BERT_V3
5416+ except Exception :
5417+ pass
54075418
54085419 super ().__init__ (dir_model , ftype , fname_out , hparams = hparams , ** kwargs )
54095420 self ._xlmroberta_tokenizer_init ()
@@ -6625,6 +6636,254 @@ def set_vocab(self):
66256636 raise NotImplementedError (f'Tokenizer { tokenizer_class } is not supported for JinaBertModel' )
66266637
66276638
@ModelBase.register("JinaCLIPVisionModel", "JinaCLIPModel")
class JinaCLIPVisionModel(MmprojModel):
    """JinaCLIP v2 Vision Encoder Model - handles vision component only.

    Hyperparameters are read from ``config.json`` in the model directory and
    written through ``self.gguf_writer``; tensors are renamed to the ``v.*``
    GGUF vision names. Tensors that cannot be mapped are skipped.
    """
    model_arch = gguf.MODEL_ARCH.MMPROJ

    # (source sub-module prefix inside a transformer block, GGUF tensor stem).
    # The prefixes are mutually exclusive, so lookup order does not matter.
    _BLOCK_PREFIX_TO_STEM = (
        # layer norms
        ('norm1.', 'ln1'),
        ('norm2.', 'ln2'),
        ('attn.inner_attn_ln.', 'attn_ln'),
        # separate projections
        ('attn.q_proj.', 'attn_q'),
        ('attn.k_proj.', 'attn_k'),
        ('attn.v_proj.', 'attn_v'),
        ('attn.proj.', 'attn_out'),
        # MLP
        ('mlp.w1.', 'ffn_gate'),
        ('mlp.w2.', 'ffn_up'),
        ('mlp.w3.', 'ffn_down'),
        ('mlp.ffn_ln.', 'ffn_norm'),
        ('mlp.fc1.', 'ffn_up'),
        ('mlp.fc2.', 'ffn_down'),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the base converter, then load ``config.json``.

        Raises:
            FileNotFoundError: if ``config.json`` is missing from ``self.dir_model``.
        """
        super().__init__(*args, **kwargs)

        # Load config for vision encoder
        config_path = self.dir_model / "config.json"
        if not config_path.exists():
            raise FileNotFoundError(
                f"JinaCLIPVisionModel: missing config.json in {self.dir_model}. "
                "Please ensure the original model config is present; default hyperparameter fallbacks are not used."
            )
        with open(config_path, encoding="utf-8") as f:
            self.vision_config = json.load(f)

    def set_vocab(self):
        """No-op: the vision encoder doesn't need a vocabulary."""
        pass

    def set_gguf_parameters(self):
        """Write the vision hyperparameters and preprocessing stats to the GGUF writer.

        Raises:
            KeyError: if a required key is missing from ``config.json`` or the
                preprocessor config has no image mean/std.
            ValueError: if ``width`` is not a multiple of ``head_width`` or the
                FFN size cannot be inferred.
        """
        cfg = self.vision_config

        try:
            width = int(cfg["width"])            # channel dim
            head_width = int(cfg["head_width"])  # per-head dim
            layers = int(cfg["layers"])          # block count
            image_size = int(cfg["image_size"])  # input image size
            patch_size = int(cfg["patch_size"])  # patch size
        except KeyError as e:
            # Chain the original lookup failure so the traceback shows where it came from.
            raise KeyError(f"JinaCLIPVisionModel: missing key in config.json: {e}") from e

        if width % head_width != 0:
            raise ValueError(
                f"JinaCLIPVisionModel: width ({width}) not divisible by head_width ({head_width})"
            )
        n_head = width // head_width

        # An explicit 'mlp_ratio' wins; otherwise 'naive_swiglu' selects the 8/3 rule.
        if "mlp_ratio" in cfg:
            n_ff = int(width * float(cfg["mlp_ratio"]))
        elif bool(cfg.get("naive_swiglu", False)):
            n_ff = int((width * 8) // 3)
        else:
            raise ValueError("JinaCLIPVisionModel: unable to infer FFN size; please provide 'mlp_ratio' or set 'naive_swiglu' in config.json")

        self.gguf_writer.add_clip_has_vision_encoder(True)
        # Projection dim falls back to the encoder width when not configured.
        proj_dim = int(cfg.get("projection_dim", width))
        self.gguf_writer.add_vision_projection_dim(proj_dim)

        self.gguf_writer.add_vision_image_size(image_size)
        self.gguf_writer.add_vision_patch_size(patch_size)
        self.gguf_writer.add_vision_embedding_length(width)
        self.gguf_writer.add_vision_block_count(layers)
        self.gguf_writer.add_vision_head_count(n_head)
        self.gguf_writer.add_vision_feed_forward_length(n_ff)

        self.gguf_writer.add_vision_attention_layernorm_eps(float(cfg.get("layer_norm_eps", 1e-5)))

        # Accept both the 'image_mean'/'image_std' and the 'mean'/'std' key spellings.
        mean = self.preprocessor_config.get("image_mean", self.preprocessor_config.get("mean"))
        std = self.preprocessor_config.get("image_std", self.preprocessor_config.get("std"))
        if mean is None or std is None:
            raise KeyError(
                "JinaCLIPVisionModel: preprocessor_config missing image mean/std (expected keys: 'image_mean'/'image_std' or 'mean'/'std')"
            )
        self.gguf_writer.add_vision_image_mean(mean)
        self.gguf_writer.add_vision_image_std(std)

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JINACLIP2)
        self.gguf_writer.add_vision_use_silu(True)

    def _strip_vm_prefix(self, name: str) -> str:
        """Return ``name`` without a leading ``'vision_model.'``, if it has one."""
        return name[len('vision_model.'):] if name.startswith('vision_model.') else name

    def _split_fused_qkv(self, fused: Tensor, name: str, kind: str) -> tuple[Tensor, Tensor, Tensor]:
        """Split a fused QKV tensor into equal Q, K, V thirds along dim 0.

        Args:
            fused: the fused tensor (weight or bias).
            name: original checkpoint tensor name, used only in the error message.
            kind: ``'weight'`` or ``'bias'``, used only in the error message.

        Raises:
            ValueError: if the leading dimension is not divisible by 3; slicing
                anyway would emit mis-sized Q/K/V tensors.
        """
        rows = fused.shape[0]
        if rows % 3 != 0:
            raise ValueError(
                f'mmproj(jinaclip): unexpected qkv {kind} shape {tuple(fused.shape)} for {name}'
            )
        d = rows // 3
        return fused[0:d], fused[d:2 * d], fused[2 * d:]

    def _map_block_tensor(self, layer: int, rest: str, data_torch: Tensor, name: str) -> list[tuple[str, Tensor]] | None:
        """Map one per-block tensor to its GGUF name(s).

        Args:
            layer: block index.
            rest: tensor path inside the block (everything after ``blocks.<n>.``).
            data_torch: the tensor data.
            name: full original tensor name, used for diagnostics.

        Returns:
            A list of ``(gguf_name, tensor)`` pairs (three entries for a fused
            QKV tensor), or ``None`` when ``rest`` is not recognised.

        Raises:
            ValueError: if a fused QKV tensor cannot be split into three equal parts.
        """
        # fused qkv
        if rest in ('attn.qkv.weight', 'attn.qkv.bias'):
            kind = rest.rsplit('.', 1)[-1]
            q, k, v = self._split_fused_qkv(data_torch, name, kind)
            return [
                (f'v.blk.{layer}.attn_q.{kind}', q),
                (f'v.blk.{layer}.attn_k.{kind}', k),
                (f'v.blk.{layer}.attn_v.{kind}', v),
            ]

        # separate q/v bias (some checkpoints)
        if rest == 'attn.q_bias':
            return [(f'v.blk.{layer}.attn_q.bias', data_torch)]
        if rest == 'attn.v_bias':
            return [(f'v.blk.{layer}.attn_v.bias', data_torch)]

        # Everything else is a plain rename keyed on the sub-module prefix;
        # the suffix is the last dotted component of the path.
        suffix = rest.rsplit('.', 1)[-1]
        for prefix, stem in self._BLOCK_PREFIX_TO_STEM:
            if rest.startswith(prefix):
                return [(f'v.blk.{layer}.{stem}.{suffix}', data_torch)]
        return None

    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
        """Prefer base table-driven mapping; keep Jina-specific targets if already mapped; fallback to legacy mapper.

        Raises:
            Exception: whatever the base mapper raised, when the name is not
                already a GGUF target and no mapper can resolve it. Callers
                (see ``modify_tensors``) rely on this to skip unmapped tensors.
        """
        # Already a GGUF target name (e.g., "v.*" or "mm.*"): return as-is
        if name.startswith(('v.', 'mm.')):
            return name
        # Try the base mapping first
        try:
            return super().map_tensor_name(name, try_suffixes=try_suffixes)
        except Exception:
            # Fallback to legacy Jina-specific mapper for any remaining edge keys
            legacy_mapper = getattr(self, "_map_jinaclip_tensor_name", None)
            if legacy_mapper is not None:
                mapped = legacy_mapper(name)
                if mapped:
                    return mapped
            # Do not pass the source name through: that would write unmapped
            # tensors into the output under their checkpoint names.
            raise

    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
        """Yield ``(name, tensor)`` pairs, falling back to a raw ``.bin`` state dict.

        The base implementation is tried first. Only if it fails before
        yielding anything, or yields nothing at all, is ``pytorch_model.bin`` /
        ``vision_model_weights.bin`` loaded instead.

        Raises:
            FileNotFoundError: if the fallback is needed and no weights file exists.
            Exception: re-raised from the base iterator if it fails after it has
                already yielded tensors.
        """
        yielded_any = False
        try:
            for name, tensor in super().get_tensors():
                yielded_any = True
                yield name, tensor
        except Exception as e:
            if yielded_any:
                # A failure mid-stream would otherwise truncate the tensor list silently.
                raise
            logger.warning("mmproj(jinaclip): base get_tensors failed, falling back: %s", e)
        if yielded_any:
            return

        candidates = [
            self.dir_model / "pytorch_model.bin",
            self.dir_model / "vision_model_weights.bin",
        ]
        model_path = next((p for p in candidates if p.exists()), None)
        if model_path is None:
            raise FileNotFoundError(f"mmproj(jinaclip): no model weights found in {self.dir_model}")
        try:
            state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
        except TypeError:
            # SECURITY: torch versions without 'weights_only' fall back to a full
            # pickle load, which can execute arbitrary code from the checkpoint.
            # Only convert checkpoints from trusted sources.
            logger.warning("mmproj(jinaclip): torch.load does not support weights_only; loading %s with full unpickling", model_path)
            state_dict = torch.load(model_path, map_location="cpu")

        for name, tensor in state_dict.items():
            yield name, tensor

    def _should_be_f32(self, gguf_name: str) -> bool:
        """Return True if ``gguf_name`` contains one of the norm or patch-embed bias patterns.

        Not called anywhere in the code shown in this class.
        """
        patterns = (
            ".ln1.weight", ".ln1.bias",
            ".ln2.weight", ".ln2.bias",
            ".attn_ln.weight", ".attn_ln.bias",
            ".ffn_norm.weight", ".ffn_norm.bias",
            "v.patch_embd.proj.bias",
        )
        return any(p in gguf_name for p in patterns)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Translate one checkpoint tensor into zero or more GGUF tensors.

        Args:
            data_torch: the tensor data.
            name: tensor name as found in the checkpoint.
            bid: unused.

        Returns:
            ``(gguf_name, tensor)`` pairs; an empty list when the tensor has no
            known mapping and is skipped.
        """
        del bid  # unused

        src = name
        if src.startswith(('v.', 'mm.')):
            return [(src, data_torch)]

        # Drop 'vision_model.' prefix if present
        src_no_vm = self._strip_vm_prefix(src)

        # Top-level direct mappings — use gguf constants directly for canonical names
        if src_no_vm == 'cls_token':
            base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_CLS]
            return [(base, data_torch)]
        if src_no_vm.startswith('patch_embed.proj.'):
            suffix = src_no_vm.split('.')[-1]
            base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH]
            return [(f'{base}.{suffix}', data_torch)]
        if src_no_vm == 'pos_embed':
            pos_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_POS] + '.weight'
            return [(pos_name, data_torch)]
        if src_no_vm.startswith('norm.'):
            suffix = src_no_vm.split('.')[-1]
            base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_POST_NORM]
            return [(f'{base}.{suffix}', data_torch)]

        if src_no_vm.startswith('blocks.'):
            parts = src_no_vm.split('.')
            if len(parts) >= 3 and parts[1].isdigit():
                layer = int(parts[1])
                rest = '.'.join(parts[2:])
                mapped = self._map_block_tensor(layer, rest, data_torch, name)
                if mapped is not None:
                    return mapped

        # Anything left goes through the name mapper; names it cannot resolve
        # are dropped rather than written under their source names.
        try:
            return [(self.map_tensor_name(name), data_torch)]
        except Exception:
            logger.debug("mmproj(jinaclip): skip unmapped tensor %s", name)
            return []
6885+
6886+
66286887@ModelBase .register ("OpenELMForCausalLM" )
66296888class OpenELMModel (TextModel ):
66306889 model_arch = gguf .MODEL_ARCH .OPENELM
0 commit comments