Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ members = [
"candle-rotary",
"candle-flash-attn-v1",
"candle-cublaslt",
"candle-moe",
]
resolver = "2"

Expand Down
17 changes: 17 additions & 0 deletions candle-moe/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Generated by Cargo
# will have compiled files and executables
debug/
target/

# These are backup files generated by rustfmt
**/*.rs.bk

# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb

# RustRover
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
24 changes: 24 additions & 0 deletions candle-moe/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
[package]
name = "candle-moe"
description = "fused MoE layer for the candle ML framework."
# NOTE(review): GitHub serves repo subdirectories under /tree/<branch>/..., so this
# URL likely should be https://github.com/huggingface/candle-extensions/tree/main/candle-moe — verify.
homepage = "https://github.com/huggingface/candle-extensions/candle-moe/"
# Remaining package metadata is inherited from the workspace root.
version.workspace = true
edition.workspace = true
keywords.workspace = true
categories.workspace = true
license.workspace = true
repository.workspace = true

[dependencies]
candle = { version = "0.8", package = "candle-core", features = ["cuda"] }
# cuda-12080 pins the cudarc bindings to CUDA 12.8.
cudarc = { version = "0.13.3", features = ["cuda-12080"], default-features = false }
half = { workspace = true }

# Build-time deps: bindgen_cuda drives nvcc from build.rs.
[build-dependencies]
anyhow = { workspace = true }
bindgen_cuda = { workspace = true }

[dev-dependencies]
anyhow = { workspace = true }
candle-nn = { version = "0.8", features = ["cuda"] }
candle-transformers = { version = "0.8" }
5 changes: 5 additions & 0 deletions candle-moe/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# candle-moe

A fused Mixture-of-Experts (MoE) kernel for the Candle CUDA backend.

This layer is adapted from https://huggingface.co/kernels-community/moe.
65 changes: 65 additions & 0 deletions candle-moe/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Build script that runs nvcc (via bindgen_cuda) to compile the fused MoE CUDA kernels
// into a static library. CUDA build times are long, so the CANDLE_MOE_BUILD_DIR environment
// variable can be set to cache the compiled artifacts and avoid recompiling too often.
use anyhow::{Context, Result};
use std::path::PathBuf;

/// CUDA kernel sources compiled into the static `moe` library.
const KERNEL_FILES: [&str; 2] = [
    "kernels/topk_softmax_kernels.cu",
    "kernels/moe_align_sum_kernels.cu",
];

/// Compiles the MoE CUDA kernels into `libmoe.a` and emits the Cargo
/// directives needed to link it (plus the CUDA runtime) into the crate.
///
/// Honors `CANDLE_MOE_BUILD_DIR` as a cache directory for compiled artifacts;
/// otherwise builds into Cargo's `OUT_DIR`.
fn main() -> Result<()> {
    // Re-run when the build script itself, any kernel source, or a shared
    // header changes.
    println!("cargo:rerun-if-changed=build.rs");
    for kernel_file in KERNEL_FILES.iter() {
        println!("cargo:rerun-if-changed={kernel_file}");
    }
    println!("cargo:rerun-if-changed=kernels/moe_wna16_utils.h");
    // The kernels ship with this compatibility header as well; without this
    // line, editing it would not trigger a rebuild.
    println!("cargo:rerun-if-changed=kernels/cuda_compat.h");

    let out_dir = PathBuf::from(std::env::var("OUT_DIR").context("OUT_DIR not set")?);
    let build_dir = match std::env::var("CANDLE_MOE_BUILD_DIR") {
        // No cache dir requested: build straight into OUT_DIR.
        Err(_) => out_dir,
        Ok(build_dir) => {
            let path = PathBuf::from(build_dir);
            let current_dir = std::env::current_dir()?;
            // Canonicalize so the rustc-link-search path emitted below is
            // absolute and stable; fail loudly if the directory is missing.
            path.canonicalize().unwrap_or_else(|_| {
                panic!(
                    "Directory doesn't exist: {} (the current directory is {})",
                    &path.display(),
                    current_dir.display()
                )
            })
        }
    };

    let kernels: Vec<_> = KERNEL_FILES.iter().collect();
    let builder = bindgen_cuda::Builder::default()
        .kernel_paths(kernels)
        .out_dir(build_dir.clone())
        .arg("-std=c++17")
        .arg("-O3")
        .arg("--compiler-options")
        .arg("-fPIC")
        // Re-enable the half/bfloat16 operators that the CUDA headers
        // suppress by default.
        .arg("-U__CUDA_NO_HALF_OPERATORS__")
        .arg("-U__CUDA_NO_HALF_CONVERSIONS__")
        .arg("-U__CUDA_NO_HALF2_OPERATORS__")
        .arg("-U__CUDA_NO_BFLOAT16_CONVERSIONS__")
        .arg("--expt-relaxed-constexpr")
        .arg("--expt-extended-lambda")
        .arg("--use_fast_math")
        .arg("--ptxas-options=-v")
        .arg("--verbose");

    // Compile every kernel into a single static archive and link against it.
    let out_file = build_dir.join("libmoe.a");
    builder.build_lib(out_file);

    println!("cargo:rustc-link-search={}", build_dir.display());
    println!("cargo:rustc-link-lib=moe");
    println!("cargo:rustc-link-lib=dylib=cudart");

    Ok(())
}
49 changes: 49 additions & 0 deletions candle-moe/kernels/cuda_compat.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Portability shim mapping warp-level and device-attribute primitives to
// either native CUDA intrinsics or their HIP/ROCm equivalents, selected by
// the USE_ROCM define, so the MoE kernels compile unchanged on both stacks.
// The VLLM_ prefix suggests these were adapted from vLLM — TODO confirm.
#pragma once

#ifdef USE_ROCM
#include <hip/hip_runtime.h>
#endif

// Warp width: fixed at 32 lanes on CUDA; on ROCm defer to the device-provided
// warpSize (AMD wavefronts may be 64 lanes wide).
#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif

// Read-only cached load: __ldg on CUDA, plain dereference on ROCm.
#ifndef USE_ROCM
#define VLLM_LDG(arg) __ldg(arg)
#else
#define VLLM_LDG(arg) *(arg)
#endif

// Warp shuffle-XOR (butterfly exchange). The CUDA *_sync variants take a full
// lane mask (uint32_t(-1) = all lanes); the ROCm __shfl_xor forms take none.
#ifndef USE_ROCM
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
__shfl_xor_sync(uint32_t(-1), var, lane_mask)
#define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
__shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
#else
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
#define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
__shfl_xor(var, lane_mask, width)
#endif

// Warp broadcast: read `var` from a specific source lane.
#ifndef USE_ROCM
#define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane)
#else
#define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane)
#endif

// Warp shuffle-down: read `var` from the lane `lane_delta` positions higher.
#ifndef USE_ROCM
#define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \
__shfl_down_sync(uint32_t(-1), var, lane_delta)
#else
#define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta)
#endif

// Raise a kernel's dynamic shared-memory limit above the default cap.
#ifndef USE_ROCM
#define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL)
#else
#define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
#endif
Loading