Skip to content

Commit 5e3101d

Browse files
committed
WIP: support minicpm-sala
Signed-off-by: Ceng23333 <441651826@qq.com>
1 parent 5fc85c8 commit 5e3101d

File tree

23 files changed

+929
-3
lines changed

23 files changed

+929
-3
lines changed

include/infinicore/ops.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,17 @@
1212
#include "ops/cross_entropy.hpp"
1313
#include "ops/embedding.hpp"
1414
#include "ops/flash_attention.hpp"
15+
#include "ops/gla_attention.hpp"
16+
#include "ops/infllmv2_attention.hpp"
1517
#include "ops/hardswish.hpp"
1618
#include "ops/hardtanh.hpp"
1719
#include "ops/kv_caching.hpp"
1820
#include "ops/matmul.hpp"
21+
#include "ops/mha_kvcache.hpp"
22+
#include "ops/mha_varlen.hpp"
23+
#include "ops/mul.hpp"
1924
#include "ops/ones.hpp"
25+
#include "ops/zeros.hpp"
2026
#include "ops/paged_attention.hpp"
2127
#include "ops/paged_attention_prefill.hpp"
2228
#include "ops/paged_caching.hpp"
@@ -25,6 +31,7 @@
2531
#include "ops/reciprocal.hpp"
2632
#include "ops/rms_norm.hpp"
2733
#include "ops/rope.hpp"
34+
#include "ops/sigmoid.hpp"
2835
#include "ops/silu.hpp"
2936
#include "ops/silu_and_mul.hpp"
3037
#include "ops/swiglu.hpp"
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Lightweight GLA-style attention built from existing primitives.
// Shapes:
//   q       : [B, n_q, S_q, D]
//   k_total : [B, n_kv, S_kv, D]
//   v_total : [B, n_kv, S_kv, D]
// Returns:
//   [B, n_q, S_q, D]
//
// scale  : softmax scale applied to Q @ K^T before normalization.
// causal : request causal masking. NOTE(review): confirm the definition
//          actually implements the non-causal (causal == false) path.
Tensor gla_attention(const Tensor &q,
                     const Tensor &k_total,
                     const Tensor &v_total,
                     float scale,
                     bool causal);

} // namespace infinicore::op
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
/**
 * C++ API declarations for InfLLM-V2 attention kernels.
 * When ENABLE_INFLLMV2 is defined, link against the InfLLM-V2 library
 * (e.g. from infllmv2_cuda_impl) that provides these symbols.
 * Requires ENABLE_ATEN for at::Tensor.
 * Symbols are in global namespace to match entry.cu.
 */
#pragma once

#if defined(ENABLE_INFLLMV2) && defined(ENABLE_ATEN)

#include <ATen/ATen.h>
#include <c10/util/Optional.h>
#include <vector>

/**
 * Varlen forward: unpadded Q/K/V with cu_seqlens. Returns {out, softmax_lse, ...}.
 * Layout presumably follows the FlashAttention varlen convention
 * (q: [total_q, nheads, head_dim]; k/v: [total_k, nheads_k, head_dim]) —
 * confirm against the linked InfLLM-V2 implementation.
 */
std::vector<at::Tensor> mha_varlen_fwd(
    at::Tensor &q,
    const at::Tensor &k,
    const at::Tensor &v,
    c10::optional<at::Tensor> &out_,              // optional preallocated output
    const at::Tensor &cu_seqlens_q,               // [batch + 1] cumulative query lengths
    const at::Tensor &cu_seqlens_k,               // [batch + 1] cumulative key lengths
    c10::optional<at::Tensor> &seqused_k,
    c10::optional<const at::Tensor> &leftpad_k_,
    c10::optional<at::Tensor> &block_table_,      // paged-KV block table, if paged layout used
    c10::optional<at::Tensor> &alibi_slopes_,
    int max_seqlen_q,
    int max_seqlen_k,
    float p_dropout,
    float softmax_scale,
    bool zero_tensors,
    bool is_causal,
    int window_size_left,                         // sliding-window bounds; -1 presumably = unbounded — confirm
    int window_size_right,
    float softcap,
    bool return_softmax,
    c10::optional<at::Generator> gen_,
    c10::optional<at::Tensor> &blockmask_);       // InfLLM-V2-specific block sparsity mask

/** KV-cache forward (decode). Returns {out, softmax_lse}. */
std::vector<at::Tensor> mha_fwd_kvcache(
    at::Tensor &q,
    const at::Tensor &kcache,
    const at::Tensor &vcache,
    c10::optional<const at::Tensor> &k_,          // optional new K appended into the cache
    c10::optional<const at::Tensor> &v_,          // optional new V appended into the cache
    c10::optional<const at::Tensor> &seqlens_k_,  // per-sequence KV lengths
    c10::optional<const at::Tensor> &rotary_cos_, // optional RoPE cos table
    c10::optional<const at::Tensor> &rotary_sin_, // optional RoPE sin table
    c10::optional<const at::Tensor> &cache_batch_idx_,
    c10::optional<const at::Tensor> &leftpad_k_,
    c10::optional<at::Tensor> &block_table_,
    c10::optional<at::Tensor> &alibi_slopes_,
    c10::optional<at::Tensor> &out_,
    float softmax_scale,
    bool is_causal,
    int window_size_left,
    int window_size_right,
    float softcap,
    bool is_rotary_interleaved,
    int num_splits,                               // 0 presumably lets the kernel pick — confirm
    c10::optional<at::Tensor> &blockmask_);

#endif // ENABLE_INFLLMV2 && ENABLE_ATEN
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"
#include <optional> // NOTE(review): std::optional does not appear in this header — confirm before removing.

namespace infinicore::op {

// Varlen InfLLM-V2 attention over unpadded Q/K/V.
//
// Shapes follow the FlashAttn-style varlen convention:
//   q           : [total_q, nheads, head_dim]
//   k, v        : [total_k, nheads_k, head_dim]
//   cu_seqlens_q: [batch_size + 1] (int32)
//   cu_seqlens_k: [batch_size + 1] (int32)
//
// Returns:
//   [total_q, nheads, head_dim]
Tensor infllmv2_varlen(const Tensor &q,
                       const Tensor &k,
                       const Tensor &v,
                       const Tensor &cu_seqlens_q,
                       const Tensor &cu_seqlens_k,
                       int max_seqlen_q,  // max per-sequence query length
                       int max_seqlen_k,  // max per-sequence key length
                       float scale,       // softmax scale applied to Q @ K^T
                       bool causal);      // apply causal masking

// Decode-time InfLLM-V2 attention with KV cache.
//
// Shapes:
//   q          : [batch, seqlen_q, nheads, head_dim]
//   k_cache    : [num_blocks, block_size, nheads_k, head_dim] or [batch, seqlen_cache, nheads_k, head_dim]
//   v_cache    : same as k_cache
//   cache_lens : [batch] (int32) total KV length per sequence
//
// Returns:
//   [batch, seqlen_q, nheads, head_dim]
Tensor infllmv2_kvcache(const Tensor &q,
                        const Tensor &k_cache,
                        const Tensor &v_cache,
                        const Tensor &cache_lens,
                        float scale,   // softmax scale applied to Q @ K^T
                        bool causal);  // apply causal masking

} // namespace infinicore::op

include/infinicore/ops/sigmoid.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {
// Elementwise sigmoid activation, following the file's dispatcher pattern.
class Sigmoid {
public:
    // Backend kernel signature: (output, input).
    using schema = void (*)(Tensor, Tensor);
    // Dispatches the kernel registered for the current device.
    static void execute(Tensor output, Tensor input);
    // Per-device registry of kernel implementations.
    static common::OpDispatcher<schema> &dispatcher();
};

// Functional form: returns sigmoid(input) as a new Tensor
// (presumably freshly allocated — implementation not visible here).
Tensor sigmoid(Tensor input);
// Out-parameter form: writes sigmoid(input) into the caller-provided `output`.
void sigmoid_(Tensor output, Tensor input);
} // namespace infinicore::op

include/infinicore/ops/zeros.hpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#pragma once
2+
3+
#include "common/op.hpp"
4+
5+
namespace infinicore::op {
6+
class Zeros {
7+
8+
public:
9+
using schema = void (*)(Tensor);
10+
static void execute(Tensor output);
11+
static common::OpDispatcher<schema> &dispatcher();
12+
};
13+
14+
void zeros_(Tensor output);
15+
} // namespace infinicore::op

python/infinicore/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
binary_cross_entropy_with_logits,
5858
)
5959
from infinicore.ops.cdist import cdist
60+
from infinicore.ops.gla_attention import gla_attention
6061
from infinicore.ops.cross_entropy import cross_entropy
6162
from infinicore.ops.equal import equal
6263
from infinicore.ops.kv_caching import kv_caching
@@ -141,6 +142,7 @@
141142
"attention",
142143
"binary_cross_entropy_with_logits",
143144
"cdist",
145+
"gla_attention",
144146
"kv_caching",
145147
"matmul",
146148
"equal",
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor

# Resolved with getattr so this module still imports when the extension
# was built without gla_attention support; the error surfaces on call.
_native_gla_attention = getattr(_infinicore, "gla_attention", None)

# Defined unconditionally so the module-level name always exists, instead
# of only when the native op is missing (the original defined it inside
# `if _native_gla_attention is None:`, making the binding conditional).
_MISSING_MSG = (
    "gla_attention not found in _infinicore. Rebuild InfiniCore extension: "
    "cd InfiniCore && xmake build _infinicore"
)


def gla_attention(q, k_total, v_total, scale, *, causal=True):
    """GLA-style attention. q, k_total, v_total are [B, n_q/n_kv, S, D]. Returns [B, n_q, S_q, D].

    Args:
        q: Tensor of shape [B, n_q, S_q, D].
        k_total: Tensor of shape [B, n_kv, S_kv, D].
        v_total: Tensor of shape [B, n_kv, S_kv, D].
        scale: Softmax scale applied to Q @ K^T (coerced to float).
        causal: Keyword-only; request causal masking (default True).

    Returns:
        Tensor of shape [B, n_q, S_q, D] wrapping the native result.

    Raises:
        NotImplementedError: if the native extension lacks gla_attention.
    """
    if _native_gla_attention is None:
        raise NotImplementedError(_MISSING_MSG)
    return Tensor(
        _native_gla_attention(
            q._underlying,
            k_total._underlying,
            v_total._underlying,
            float(scale),
            causal,
        )
    )

src/infinicore/context/allocators/pinnable_block_allocator.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <algorithm>
88
#include <infinirt.h>
99
#include <stdexcept>
10+
#include <cstdlib>
1011

1112
namespace infinicore {
1213

@@ -72,6 +73,13 @@ std::byte *PinnableBlockAllocator::allocate(size_t size) {
7273
block->frozen = pinned_mode_;
7374
block->in_use = true;
7475

76+
if (std::getenv("INFINICORE_DEBUG_ALLOC") != nullptr) {
77+
infiniDevice_t dev;
78+
int dev_id;
79+
infinirtGetDevice(&dev, &dev_id);
80+
spdlog::warn("PinnableBlockAllocator cudaMalloc request: requested={} aligned={} class={} device={} id={}",
81+
size, size, cls.block_size, static_cast<int>(dev), dev_id);
82+
}
7583
INFINICORE_CHECK_ERROR(infinirtMalloc(&block->ptr, block->size));
7684

7785
all_blocks_[block->ptr] = block;
@@ -97,6 +105,13 @@ std::byte *PinnableBlockAllocator::allocate(size_t size) {
97105
block->frozen = pinned_mode_;
98106
block->in_use = true;
99107

108+
if (std::getenv("INFINICORE_DEBUG_ALLOC") != nullptr) {
109+
infiniDevice_t dev;
110+
int dev_id;
111+
infinirtGetDevice(&dev, &dev_id);
112+
spdlog::warn("PinnableBlockAllocator cudaMalloc request (large): requested={} aligned={} device={} id={}",
113+
size, size, static_cast<int>(dev), dev_id);
114+
}
100115
INFINICORE_CHECK_ERROR(infinirtMalloc(&block->ptr, block->size));
101116

102117
large_blocks_.push_back(block);
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
#include "infinicore/ops/gla_attention.hpp"

#include "infinicore/ops/causal_softmax.hpp"
#include "infinicore/ops/matmul.hpp"

#include "../../utils.hpp"

namespace infinicore::op {

// GLA-style grouped-query attention composed from matmul + causal softmax.
//
// q       : [B, n_q, S_q, D]
// k_total : [B, n_kv, S_kv, D]
// v_total : [B, n_kv, S_kv, D]
// Returns : [B, n_q, S_q, D]
Tensor gla_attention(const Tensor &q,
                     const Tensor &k_total,
                     const Tensor &v_total,
                     float scale,
                     bool causal) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(q, k_total, v_total);

    const auto &q_shape = q->shape();       // [B, n_q, S_q, D]
    const auto &k_shape = k_total->shape(); // [B, n_kv, S_kv, D]
    const auto &v_shape = v_total->shape(); // [B, n_kv, S_kv, D]

    INFINICORE_ASSERT(q_shape.size() == 4);
    INFINICORE_ASSERT(k_shape.size() == 4);
    INFINICORE_ASSERT(v_shape.size() == 4);
    INFINICORE_ASSERT(q_shape[0] == k_shape[0] && k_shape[0] == v_shape[0]); // B
    INFINICORE_ASSERT(q_shape[3] == k_shape[3] && k_shape[3] == v_shape[3]); // D
    INFINICORE_ASSERT(k_shape[1] == v_shape[1] && k_shape[2] == v_shape[2]); // n_kv, S_kv

    const size_t B = q_shape[0];
    const size_t n_q = q_shape[1];
    const size_t S_q = q_shape[2];
    const size_t D = q_shape[3];
    const size_t n_kv = k_shape[1];
    const size_t S_kv = k_shape[2];

    // Each KV head serves a contiguous group of `ngroup` query heads (GQA).
    INFINICORE_ASSERT(n_q % n_kv == 0);
    const size_t ngroup = n_q / n_kv;

    // BUG FIX: the original applied no softmax at all when causal == false,
    // silently returning raw (scaled) scores @ V. No plain (non-causal)
    // softmax primitive is available in this translation unit, so fail
    // loudly until one is wired up rather than produce wrong results.
    INFINICORE_ASSERT(causal);

    // Reshape to grouped GQA layout (single contiguous view; the original's
    // intermediate {B*n_kv, ngroup, S_q, D} view was redundant):
    //   Q: [B * n_kv, ngroup * S_q, D]
    //   K: [B * n_kv, S_kv, D]
    //   V: [B * n_kv, S_kv, D]
    auto Q = q->view({B * n_kv, ngroup * S_q, D});
    auto K = k_total->view({B * n_kv, S_kv, D});
    auto V = v_total->view({B * n_kv, S_kv, D});

    auto Kt = K->permute({0, 2, 1});                         // [B * n_kv, D, S_kv]
    auto attn_weight = infinicore::op::matmul(Q, Kt, scale); // [B * n_kv, ngroup * S_q, S_kv]

    // Per-query-head view [B * n_q, S_q, S_kv]: head order (kv-major, then
    // group) matches the split of n_q above, so the in-place causal softmax
    // normalizes each query head's rows.
    auto attn_scores = attn_weight->view({B * n_q, S_q, S_kv});
    infinicore::op::causal_softmax_(attn_scores, attn_scores);

    auto out = infinicore::op::matmul(attn_weight, V); // [B * n_kv, ngroup * S_q, D]
    // Merge (kv, group) back into n_q.
    return out->view({B, n_q, S_q, D});
}

} // namespace infinicore::op

0 commit comments

Comments
 (0)