From 5999fdc8d17ceae53196a64ec2868be6958b1e35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?DAN=E2=84=A2?=
Date: Thu, 19 Dec 2024 19:21:51 -0500
Subject: [PATCH] Apply RoPE for SWA layers only.

---
 src/llama.cpp | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 21c0e8f48142a..b1d8901a8bcb8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -14857,12 +14857,13 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
 
         // sliding window switch pattern
-        const int32_t n_layer_switch = 4;
+        const int32_t sliding_window_pattern = 4;
 
         for (int il = 0; il < n_layer; ++il) {
             // three layers sliding window attention (window size 4096) and ROPE
             // fourth layer uses global attention without positional embeddings
-            struct ggml_tensor * KQ_mask_l = (il % n_layer_switch < (n_layer_switch - 1)) ? KQ_mask_swa : KQ_mask;
+            const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
+            struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
 
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il);
@@ -14871,6 +14872,9 @@ struct llm_build_context {
 
             // self-attention
             {
+                // rope freq factors for 128k context
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
@@ -14893,15 +14897,24 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
-                Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                if (is_sliding) {
+                    Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
-                cb(Qcur, "Qcur", il);
+                    cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
+                    Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                        rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
                     attn_factor, beta_fast, beta_slow);
-                cb(Kcur, "Kcur", il);
+                    cb(Kcur, "Kcur", il);
+                } else {
+                    // For non-sliding layers, just reshape without applying RoPE
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                    cb(Qcur, "Qcur", il);
+
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                    cb(Kcur, "Kcur", il);
+                }
 
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
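
Note (not part of the patch): a minimal standalone sketch of the layer-interleaving rule the change relies on. With sliding_window_pattern = 4, the first three layers of every group of four use sliding-window attention and receive RoPE, while every fourth layer uses global attention and gets no positional embedding. The layer count of 8 below is a hypothetical value chosen only for illustration.

    #include <cstdio>

    int main() {
        const int n_layer = 8;                   // hypothetical layer count, for illustration only
        const int sliding_window_pattern = 4;    // same constant as in the patch

        for (int il = 0; il < n_layer; ++il) {
            // same predicate as the patch: three sliding-window layers, then one global layer
            const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
            std::printf("layer %d: %s\n", il,
                        is_sliding ? "SWA + RoPE" : "global attention, no RoPE");
        }
        return 0;
    }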