Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ members = [
"candle-rotary",
"candle-flash-attn-v1",
"candle-cublaslt",
"candle-moe",
]
resolver = "2"

Expand Down
17 changes: 17 additions & 0 deletions candle-moe/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Generated by Cargo
# will have compiled files and executables
debug/
target/

# These are backup files generated by rustfmt
**/*.rs.bk

# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb

# RustRover
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
24 changes: 24 additions & 0 deletions candle-moe/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
[package]
name = "candle-moe"
description = "fused MoE layer for the candle ML framework."
# NOTE(review): GitHub serves repo subdirectories under /tree/<branch>/..., so this
# URL likely should be https://github.com/huggingface/candle-extensions/tree/main/candle-moe — verify.
homepage = "https://github.com/huggingface/candle-extensions/candle-moe/"
# Remaining package metadata is inherited from the workspace root.
version.workspace = true
edition.workspace = true
keywords.workspace = true
categories.workspace = true
license.workspace = true
repository.workspace = true

[dependencies]
candle = { version = "0.8", package = "candle-core", features = ["cuda"] }
# cuda-12080 pins the cudarc bindings to CUDA 12.8.
cudarc = { version = "0.13.3", features = ["cuda-12080"], default-features = false }
half = { workspace = true }

# Build-time deps: bindgen_cuda drives nvcc from build.rs.
[build-dependencies]
anyhow = { workspace = true }
bindgen_cuda = { workspace = true }

[dev-dependencies]
anyhow = { workspace = true }
candle-nn = { version = "0.8", features = ["cuda"] }
candle-transformers = { version = "0.8" }
5 changes: 5 additions & 0 deletions candle-moe/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# candle-moe

A fused Mixture-of-Experts (MoE) kernel for the Candle CUDA backend.

This layer is adapted from https://huggingface.co/kernels-community/moe.
65 changes: 65 additions & 0 deletions candle-moe/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Build script that runs nvcc (via bindgen_cuda) to compile the fused MoE CUDA kernels
// into a static library. CUDA build times are long, so the CANDLE_MOE_BUILD_DIR environment
// variable can be set to cache the compiled artifacts and avoid recompiling too often.
use anyhow::{Context, Result};
use std::path::PathBuf;

/// CUDA kernel sources compiled into the static `moe` library.
const KERNEL_FILES: [&str; 2] = [
    "kernels/topk_softmax_kernels.cu",
    "kernels/moe_align_sum_kernels.cu",
];

/// Compiles the MoE CUDA kernels into `libmoe.a` and emits the Cargo
/// directives needed to link it (plus the CUDA runtime) into the crate.
///
/// Honors `CANDLE_MOE_BUILD_DIR` as a cache directory for compiled artifacts;
/// otherwise builds into Cargo's `OUT_DIR`.
fn main() -> Result<()> {
    // Re-run when the build script itself, any kernel source, or a shared
    // header changes.
    println!("cargo:rerun-if-changed=build.rs");
    for kernel_file in KERNEL_FILES.iter() {
        println!("cargo:rerun-if-changed={kernel_file}");
    }
    println!("cargo:rerun-if-changed=kernels/moe_wna16_utils.h");
    // The kernels ship with this compatibility header as well; without this
    // line, editing it would not trigger a rebuild.
    println!("cargo:rerun-if-changed=kernels/cuda_compat.h");

    let out_dir = PathBuf::from(std::env::var("OUT_DIR").context("OUT_DIR not set")?);
    let build_dir = match std::env::var("CANDLE_MOE_BUILD_DIR") {
        // No cache dir requested: build straight into OUT_DIR.
        Err(_) => out_dir,
        Ok(build_dir) => {
            let path = PathBuf::from(build_dir);
            let current_dir = std::env::current_dir()?;
            // Canonicalize so the rustc-link-search path emitted below is
            // absolute and stable; fail loudly if the directory is missing.
            path.canonicalize().unwrap_or_else(|_| {
                panic!(
                    "Directory doesn't exist: {} (the current directory is {})",
                    &path.display(),
                    current_dir.display()
                )
            })
        }
    };

    let kernels: Vec<_> = KERNEL_FILES.iter().collect();
    let builder = bindgen_cuda::Builder::default()
        .kernel_paths(kernels)
        .out_dir(build_dir.clone())
        .arg("-std=c++17")
        .arg("-O3")
        .arg("--compiler-options")
        .arg("-fPIC")
        // Re-enable the half/bfloat16 operators that the CUDA headers
        // suppress by default.
        .arg("-U__CUDA_NO_HALF_OPERATORS__")
        .arg("-U__CUDA_NO_HALF_CONVERSIONS__")
        .arg("-U__CUDA_NO_HALF2_OPERATORS__")
        .arg("-U__CUDA_NO_BFLOAT16_CONVERSIONS__")
        .arg("--expt-relaxed-constexpr")
        .arg("--expt-extended-lambda")
        .arg("--use_fast_math")
        .arg("--ptxas-options=-v")
        .arg("--verbose");

    // Compile every kernel into a single static archive and link against it.
    let out_file = build_dir.join("libmoe.a");
    builder.build_lib(out_file);

    println!("cargo:rustc-link-search={}", build_dir.display());
    println!("cargo:rustc-link-lib=moe");
    println!("cargo:rustc-link-lib=dylib=cudart");

    Ok(())
}
49 changes: 49 additions & 0 deletions candle-moe/kernels/cuda_compat.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Portability shim mapping warp-level and device-attribute primitives to
// either native CUDA intrinsics or their HIP/ROCm equivalents, selected by
// the USE_ROCM define, so the MoE kernels compile unchanged on both stacks.
// The VLLM_ prefix suggests these were adapted from vLLM — TODO confirm.
#pragma once

#ifdef USE_ROCM
#include <hip/hip_runtime.h>
#endif

// Warp width: fixed at 32 lanes on CUDA; on ROCm defer to the device-provided
// warpSize (AMD wavefronts may be 64 lanes wide).
#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif

// Read-only cached load: __ldg on CUDA, plain dereference on ROCm.
#ifndef USE_ROCM
#define VLLM_LDG(arg) __ldg(arg)
#else
#define VLLM_LDG(arg) *(arg)
#endif

// Warp shuffle-XOR (butterfly exchange). The CUDA *_sync variants take a full
// lane mask (uint32_t(-1) = all lanes); the ROCm __shfl_xor forms take none.
#ifndef USE_ROCM
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
__shfl_xor_sync(uint32_t(-1), var, lane_mask)
#define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
__shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
#else
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
#define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
__shfl_xor(var, lane_mask, width)
#endif

// Warp broadcast: read `var` from a specific source lane.
#ifndef USE_ROCM
#define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane)
#else
#define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane)
#endif

// Warp shuffle-down: read `var` from the lane `lane_delta` positions higher.
#ifndef USE_ROCM
#define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \
__shfl_down_sync(uint32_t(-1), var, lane_delta)
#else
#define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta)
#endif

// Raise a kernel's dynamic shared-memory limit above the default cap.
#ifndef USE_ROCM
#define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL)
#else
#define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
#endif
Loading