|
39 | 39 | #define KEY_FEATURE_LAYER "clip.vision.feature_layer" |
40 | 40 | #define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" |
41 | 41 | #define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" |
| 42 | +// RoPE base (theta) for vision encoder |
| 43 | +#define KEY_VISION_ROPE_THETA "clip.vision.rope_theta" |
42 | 44 |
|
43 | 45 | #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" |
44 | 46 | #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" |
|
67 | 69 | #define TN_ATTN_Q "%s.blk.%d.attn_q.%s" |
68 | 70 | #define TN_ATTN_V "%s.blk.%d.attn_v.%s" |
69 | 71 | #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" |
| 72 | +#define TN_ATTN_LN "%s.blk.%d.attn_ln.%s" // inner attention LayerNorm |
70 | 73 | #define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s" |
71 | 74 | #define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s" |
72 | 75 | #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" |
73 | 76 | #define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" |
74 | 77 | #define TN_FFN_UP "%s.blk.%d.ffn_up.%s" |
75 | | -#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" |
76 | | -#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm |
77 | | -#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm |
| 78 | +#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s" |
| 79 | +#define TN_LN_1 "%s.blk.%d.ln_1.%s" // layer norm |
| 80 | +#define TN_LN_2 "%s.blk.%d.ln_2.%s" // layer norm |
78 | 81 | #define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale |
79 | 82 | #define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale |
80 | 83 | #define TN_LN_PRE "%s.pre_ln.%s" |
@@ -137,30 +140,32 @@ enum projector_type { |
137 | 140 | PROJECTOR_TYPE_QWEN2A, |
138 | 141 | PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx |
139 | 142 | PROJECTOR_TYPE_VOXTRAL, |
| 143 | + PROJECTOR_TYPE_JINACLIP2, // JinaCLIP v2 |
140 | 144 | PROJECTOR_TYPE_LFM2, |
141 | 145 | PROJECTOR_TYPE_KIMIVL, |
142 | 146 | PROJECTOR_TYPE_UNKNOWN, |
143 | 147 | }; |
144 | 148 |
|
145 | 149 | static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = { |
146 | | - { PROJECTOR_TYPE_MLP, "mlp" }, |
147 | | - { PROJECTOR_TYPE_LDP, "ldp" }, |
148 | | - { PROJECTOR_TYPE_LDPV2, "ldpv2"}, |
149 | | - { PROJECTOR_TYPE_MINICPMV, "resampler"}, |
150 | | - { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, |
151 | | - { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, |
152 | | - { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, |
153 | | - { PROJECTOR_TYPE_GEMMA3, "gemma3"}, |
154 | | - { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, |
155 | | - { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, |
156 | | - { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, |
157 | | - { PROJECTOR_TYPE_INTERNVL, "internvl"}, |
158 | | - { PROJECTOR_TYPE_LLAMA4, "llama4"}, |
159 | | - { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, |
160 | | - { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, |
161 | | - { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, |
162 | | - { PROJECTOR_TYPE_LFM2, "lfm2"}, |
163 | | - { PROJECTOR_TYPE_KIMIVL, "kimivl"}, |
| 150 | + { PROJECTOR_TYPE_MLP, "mlp" }, |
| 151 | + { PROJECTOR_TYPE_LDP, "ldp" }, |
| 152 | + { PROJECTOR_TYPE_LDPV2, "ldpv2" }, |
| 153 | + { PROJECTOR_TYPE_MINICPMV, "resampler" }, |
| 154 | + { PROJECTOR_TYPE_GLM_EDGE, "adapter" }, |
| 155 | + { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger" }, |
| 156 | + { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger" }, |
| 157 | + { PROJECTOR_TYPE_GEMMA3, "gemma3" }, |
| 158 | + { PROJECTOR_TYPE_IDEFICS3, "idefics3" }, |
| 159 | + { PROJECTOR_TYPE_PIXTRAL, "pixtral" }, |
| 160 | + { PROJECTOR_TYPE_ULTRAVOX, "ultravox" }, |
| 161 | + { PROJECTOR_TYPE_INTERNVL, "internvl" }, |
| 162 | + { PROJECTOR_TYPE_LLAMA4, "llama4" }, |
| 163 | + { PROJECTOR_TYPE_QWEN2A, "qwen2a" }, |
| 164 | + { PROJECTOR_TYPE_QWEN25O, "qwen2.5o" }, |
| 165 | + { PROJECTOR_TYPE_VOXTRAL, "voxtral" }, |
| 166 | + { PROJECTOR_TYPE_JINACLIP2, "jinaclip2" }, |
| 167 | + { PROJECTOR_TYPE_LFM2, "lfm2" }, |
| 168 | + { PROJECTOR_TYPE_KIMIVL, "kimivl" }, |
164 | 169 | }; |
165 | 170 |
|
166 | 171 | static projector_type clip_projector_type_from_string(const std::string & str) { |
|
0 commit comments