Skip to content

Commit e19eb27

Browse files
author
liyang
committed
mtmd/clip+converter: address #16574; fold CLI into mtmd-cli, fix converter keys, rope_ext + bicubic
mtmd-cli: move the standalone Jina CLI into mtmd-cli (projector-only path); drop the extra binary.
1 parent 6ea37f5 commit e19eb27

File tree

10 files changed

+1098
-109
lines changed

10 files changed

+1098
-109
lines changed

convert_hf_to_gguf.py

Lines changed: 315 additions & 11 deletions
Large diffs are not rendered by default.

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3057,6 +3057,7 @@ class VisionProjectorType:
30573057
QWEN25VL = "qwen2.5vl_merger"
30583058
ULTRAVOX = "ultravox"
30593059
INTERNVL = "internvl"
3060+
JINACLIP2 = "jinaclip2"
30603061
QWEN2A = "qwen2a" # audio
30613062
QWEN25O = "qwen2.5o" # omni
30623063
VOXTRAL = "voxtral"

tools/mtmd/clip-impl.h

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
4040
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
4141
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
42+
// RoPE base (theta) for vision encoder
43+
#define KEY_VISION_ROPE_THETA "clip.vision.rope_theta"
4244

4345
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
4446
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -67,14 +69,15 @@
6769
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
6870
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
6971
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
72+
#define TN_ATTN_LN "%s.blk.%d.attn_ln.%s" // inner attention LayerNorm
7073
#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s"
7174
#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s"
7275
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
7376
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
7477
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
75-
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
76-
#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm
77-
#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
78+
#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
79+
#define TN_LN_1 "%s.blk.%d.ln_1.%s" // layer norm; NOTE(review): pattern was "ln1" before this change - confirm the converter and existing GGUF files use "ln_1"
80+
#define TN_LN_2 "%s.blk.%d.ln_2.%s" // layer norm; NOTE(review): pattern was "ln2" before this change - confirm the converter and existing GGUF files use "ln_2"
7881
#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
7982
#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
8083
#define TN_LN_PRE "%s.pre_ln.%s"
@@ -137,30 +140,32 @@ enum projector_type {
137140
PROJECTOR_TYPE_QWEN2A,
138141
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
139142
PROJECTOR_TYPE_VOXTRAL,
143+
PROJECTOR_TYPE_JINACLIP2, // JinaCLIP v2
140144
PROJECTOR_TYPE_LFM2,
141145
PROJECTOR_TYPE_KIMIVL,
142146
PROJECTOR_TYPE_UNKNOWN,
143147
};
144148

145149
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
146-
{ PROJECTOR_TYPE_MLP, "mlp" },
147-
{ PROJECTOR_TYPE_LDP, "ldp" },
148-
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
149-
{ PROJECTOR_TYPE_MINICPMV, "resampler"},
150-
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
151-
{ PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"},
152-
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
153-
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
154-
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
155-
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
156-
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
157-
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
158-
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
159-
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"},
160-
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
161-
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
162-
{ PROJECTOR_TYPE_LFM2, "lfm2"},
163-
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
150+
{ PROJECTOR_TYPE_MLP, "mlp" },
151+
{ PROJECTOR_TYPE_LDP, "ldp" },
152+
{ PROJECTOR_TYPE_LDPV2, "ldpv2" },
153+
{ PROJECTOR_TYPE_MINICPMV, "resampler" },
154+
{ PROJECTOR_TYPE_GLM_EDGE, "adapter" },
155+
{ PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger" },
156+
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger" },
157+
{ PROJECTOR_TYPE_GEMMA3, "gemma3" },
158+
{ PROJECTOR_TYPE_IDEFICS3, "idefics3" },
159+
{ PROJECTOR_TYPE_PIXTRAL, "pixtral" },
160+
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox" },
161+
{ PROJECTOR_TYPE_INTERNVL, "internvl" },
162+
{ PROJECTOR_TYPE_LLAMA4, "llama4" },
163+
{ PROJECTOR_TYPE_QWEN2A, "qwen2a" },
164+
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o" },
165+
{ PROJECTOR_TYPE_VOXTRAL, "voxtral" },
166+
{ PROJECTOR_TYPE_JINACLIP2, "jinaclip2" },
167+
{ PROJECTOR_TYPE_LFM2, "lfm2" },
168+
{ PROJECTOR_TYPE_KIMIVL, "kimivl" },
164169
};
165170

166171
static projector_type clip_projector_type_from_string(const std::string & str) {

0 commit comments

Comments
 (0)