|  | 
| 39 | 39 | #define KEY_FEATURE_LAYER       "clip.vision.feature_layer" | 
| 40 | 40 | #define KEY_PROJ_SCALE_FACTOR   "clip.vision.projector.scale_factor" | 
| 41 | 41 | #define KEY_SPATIAL_MERGE_SIZE  "clip.vision.spatial_merge_size" | 
|  | 42 | +// RoPE base (theta) for vision encoder | 
|  | 43 | +#define KEY_VISION_ROPE_THETA   "clip.vision.rope_theta" | 
| 42 | 44 | 
 | 
| 43 | 45 | #define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type" | 
| 44 | 46 | #define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints" | 
|  | 
| 67 | 69 | #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s" | 
| 68 | 70 | #define TN_ATTN_V          "%s.blk.%d.attn_v.%s" | 
| 69 | 71 | #define TN_ATTN_OUTPUT     "%s.blk.%d.attn_out.%s" | 
|  | 72 | +#define TN_ATTN_LN         "%s.blk.%d.attn_ln.%s"  // inner attention LayerNorm | 
| 70 | 73 | #define TN_ATTN_K_NORM     "%s.blk.%d.attn_k_norm.%s" | 
| 71 | 74 | #define TN_ATTN_Q_NORM     "%s.blk.%d.attn_q_norm.%s" | 
| 72 | 75 | #define TN_FFN_DOWN        "%s.blk.%d.ffn_down.%s" | 
| 73 | 76 | #define TN_FFN_GATE        "%s.blk.%d.ffn_gate.%s" | 
| 74 | 77 | #define TN_FFN_UP          "%s.blk.%d.ffn_up.%s" | 
| 75 |  | -#define TN_FFN_GATE        "%s.blk.%d.ffn_gate.%s" | 
| 76 |  | -#define TN_LN_1            "%s.blk.%d.ln1.%s" // layer norm | 
| 77 |  | -#define TN_LN_2            "%s.blk.%d.ln2.%s" // layer norm | 
|  | 78 | +#define TN_FFN_NORM        "%s.blk.%d.ffn_norm.%s" | 
|  | 79 | +#define TN_LN_1            "%s.blk.%d.ln_1.%s"  // layer norm | 
|  | 80 | +#define TN_LN_2            "%s.blk.%d.ln_2.%s"  // layer norm | 
| 78 | 81 | #define TN_LS_1            "%s.blk.%d.ls1.%s" // layer scale | 
| 79 | 82 | #define TN_LS_2            "%s.blk.%d.ls2.%s" // layer scale | 
| 80 | 83 | #define TN_LN_PRE          "%s.pre_ln.%s" | 
| @@ -137,30 +140,32 @@ enum projector_type { | 
| 137 | 140 |     PROJECTOR_TYPE_QWEN2A, | 
| 138 | 141 |     PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx | 
| 139 | 142 |     PROJECTOR_TYPE_VOXTRAL, | 
|  | 143 | +    PROJECTOR_TYPE_JINACLIP2, // JinaCLIP v2 | 
| 140 | 144 |     PROJECTOR_TYPE_LFM2, | 
| 141 | 145 |     PROJECTOR_TYPE_KIMIVL, | 
| 142 | 146 |     PROJECTOR_TYPE_UNKNOWN, | 
| 143 | 147 | }; | 
| 144 | 148 | 
 | 
| 145 | 149 | static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = { | 
| 146 |  | -    { PROJECTOR_TYPE_MLP,       "mlp" }, | 
| 147 |  | -    { PROJECTOR_TYPE_LDP,       "ldp" }, | 
| 148 |  | -    { PROJECTOR_TYPE_LDPV2,     "ldpv2"}, | 
| 149 |  | -    { PROJECTOR_TYPE_MINICPMV,  "resampler"}, | 
| 150 |  | -    { PROJECTOR_TYPE_GLM_EDGE,  "adapter"}, | 
| 151 |  | -    { PROJECTOR_TYPE_QWEN2VL,   "qwen2vl_merger"}, | 
| 152 |  | -    { PROJECTOR_TYPE_QWEN25VL,  "qwen2.5vl_merger"}, | 
| 153 |  | -    { PROJECTOR_TYPE_GEMMA3,    "gemma3"}, | 
| 154 |  | -    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"}, | 
| 155 |  | -    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"}, | 
| 156 |  | -    { PROJECTOR_TYPE_ULTRAVOX,  "ultravox"}, | 
| 157 |  | -    { PROJECTOR_TYPE_INTERNVL,  "internvl"}, | 
| 158 |  | -    { PROJECTOR_TYPE_LLAMA4,    "llama4"}, | 
| 159 |  | -    { PROJECTOR_TYPE_QWEN2A,    "qwen2a"}, | 
| 160 |  | -    { PROJECTOR_TYPE_QWEN25O,   "qwen2.5o"}, | 
| 161 |  | -    { PROJECTOR_TYPE_VOXTRAL,   "voxtral"}, | 
| 162 |  | -    { PROJECTOR_TYPE_LFM2,      "lfm2"}, | 
| 163 |  | -    { PROJECTOR_TYPE_KIMIVL,    "kimivl"}, | 
|  | 150 | +    { PROJECTOR_TYPE_MLP,      "mlp"              }, | 
|  | 151 | +    { PROJECTOR_TYPE_LDP,      "ldp"              }, | 
|  | 152 | +    { PROJECTOR_TYPE_LDPV2,    "ldpv2"            }, | 
|  | 153 | +    { PROJECTOR_TYPE_MINICPMV, "resampler"        }, | 
|  | 154 | +    { PROJECTOR_TYPE_GLM_EDGE, "adapter"          }, | 
|  | 155 | +    { PROJECTOR_TYPE_QWEN2VL,  "qwen2vl_merger"   }, | 
|  | 156 | +    { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger" }, | 
|  | 157 | +    { PROJECTOR_TYPE_GEMMA3,   "gemma3"           }, | 
|  | 158 | +    { PROJECTOR_TYPE_IDEFICS3, "idefics3"         }, | 
|  | 159 | +    { PROJECTOR_TYPE_PIXTRAL,  "pixtral"          }, | 
|  | 160 | +    { PROJECTOR_TYPE_ULTRAVOX, "ultravox"         }, | 
|  | 161 | +    { PROJECTOR_TYPE_INTERNVL, "internvl"         }, | 
|  | 162 | +    { PROJECTOR_TYPE_LLAMA4,   "llama4"           }, | 
|  | 163 | +    { PROJECTOR_TYPE_QWEN2A,   "qwen2a"           }, | 
|  | 164 | +    { PROJECTOR_TYPE_QWEN25O,  "qwen2.5o"         }, | 
|  | 165 | +    { PROJECTOR_TYPE_VOXTRAL,  "voxtral"          }, | 
|  | 166 | +    { PROJECTOR_TYPE_JINACLIP2, "jinaclip2"        }, | 
|  | 167 | +    { PROJECTOR_TYPE_LFM2,     "lfm2"             }, | 
|  | 168 | +    { PROJECTOR_TYPE_KIMIVL,   "kimivl"           }, | 
| 164 | 169 | }; | 
| 165 | 170 | 
 | 
| 166 | 171 | static projector_type clip_projector_type_from_string(const std::string & str) { | 
|  | 
0 commit comments