6 changes: 6 additions & 0 deletions .gitmodules
@@ -28,3 +28,9 @@
[submodule "third_party/Mooncake"]
path = third_party/Mooncake
url = https://gitcode.com/xLLM-AI/Mooncake.git
[submodule "third_party/tvm-ffi"]
path = third_party/tvm-ffi
url = https://gitcode.com/xLLM-AI/tvm-ffi.git
[submodule "third_party/dlpack"]
path = third_party/dlpack
url = https://gitcode.com/xLLM-AI/dlpack.git
74 changes: 72 additions & 2 deletions CMakeLists.txt
@@ -1,8 +1,10 @@
cmake_minimum_required(VERSION 3.26)
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")

option(USE_NPU "Enable NPU support" OFF)
option(USE_MLU "Enable MLU support" OFF)
option(USE_CUDA "Enable CUDA support" OFF)

if(DEVICE_ARCH STREQUAL "ARM")
set(CMAKE_SYSTEM_PROCESSOR aarch64)
@@ -101,7 +103,7 @@ set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)

if(USE_NPU)
if(USE_NPU OR USE_CUDA)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
elseif(USE_MLU)
@@ -178,6 +180,32 @@ if (DEFINED ENV{DEPENDENCES_ROOT})
message(STATUS "Using DEPENDENCES_ROOT: $ENV{DEPENDENCES_ROOT}")
endif()

# set architecture for CUDA
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND USE_CUDA)
  set(CMAKE_CUDA_ARCHITECTURES 80)
endif()

# Build TORCH_CUDA_ARCH_LIST
if(USE_CUDA)
  # Build TORCH_CUDA_ARCH_LIST
  set(TORCH_CUDA_ARCH_LIST "")
  foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
    if(CUDA_ARCH MATCHES "^([0-9])([0-9])a$")
      set(TORCH_ARCH "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}a")
    elseif(CUDA_ARCH MATCHES "^([0-9])([0-9])*$")
      set(TORCH_ARCH "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}")
    elseif(CUDA_ARCH STREQUAL "native")
      set(TORCH_ARCH "Auto")
    else()
      message(FATAL_ERROR "${CUDA_ARCH} is not supported")
    endif()
    list(APPEND TORCH_CUDA_ARCH_LIST ${TORCH_ARCH})
  endforeach()

  message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}")
  message(STATUS "TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST}")
endif()

# configure vcpkg
# have to set CMAKE_TOOLCHAIN_FILE before first project call.
# if (DEFINED ENV{VCPKG_ROOT} AND NOT DEFINED CMAKE_TOOLCHAIN_FILE)
@@ -217,7 +245,12 @@ endif()
set(CPPREST_EXCLUDE_WEBSOCKETS ON CACHE BOOL "Exclude websockets functionality." FORCE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format-truncation")

project("xllm" LANGUAGES C CXX)
if(USE_CUDA)
  project("xllm" LANGUAGES C CXX CUDA)
  find_package(CUDAToolkit REQUIRED)
else()
  project("xllm" LANGUAGES C CXX)
endif()

# find_package(CUDAToolkit REQUIRED)

@@ -352,6 +385,43 @@ if(USE_MLU)
)
endif()

if(USE_CUDA)
  add_definitions(-DUSE_CUDA)
  add_compile_definitions(TORCH_CUDA=1)
  set(CMAKE_VERBOSE_MAKEFILE ON)
  include_directories(
    $ENV{PYTHON_INCLUDE_PATH}
    $ENV{PYTORCH_INSTALL_PATH}/include
    $ENV{PYTORCH_INSTALL_PATH}/include/torch/csrc/api/include
  )

  link_directories(
    $ENV{PYTHON_LIB_PATH}
    $ENV{PYTORCH_INSTALL_PATH}/lib
    $ENV{CUDA_TOOLKIT_ROOT_DIR}/lib64
  )

  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -O3)
  # The following definitions must be undefined since half-precision operations are required.
  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}
      -U__CUDA_NO_HALF_OPERATORS__
      -U__CUDA_NO_HALF_CONVERSIONS__
      -U__CUDA_NO_HALF2_OPERATORS__
      -U__CUDA_NO_BFLOAT16_CONVERSIONS__)
  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math -Xfatbin -compress-all)
  message(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")

  # find_package(NCCL REQUIRED)

  # find cudnn
  execute_process(COMMAND python -c "import nvidia.cudnn; print(nvidia.cudnn.__file__)" OUTPUT_VARIABLE CUDNN_PYTHON_PATH)
  get_filename_component(CUDNN_ROOT_DIR "${CUDNN_PYTHON_PATH}" DIRECTORY)
  link_directories(
    ${CUDNN_ROOT_DIR}/lib64
    ${CUDNN_ROOT_DIR}/lib
  )
endif()

# check if USE_CXX11_ABI is set correctly
# if (DEFINED USE_CXX11_ABI)
# parse_make_options(${TORCH_CXX_FLAGS} "TORCH_CXX_FLAGS")
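For reference, the `foreach` loop added above maps CMake-style `CMAKE_CUDA_ARCHITECTURES` entries to the Torch-style values collected in `TORCH_CUDA_ARCH_LIST`: `80` becomes `8.0`, `90a` becomes `9.0a`, and `native` becomes `Auto`. Below is a minimal standalone C++ sketch of the same mapping, purely illustrative and not part of this PR; the helper name and the strict two-digit regex are assumptions.

```cpp
// Illustrative only: mirrors the CMake arch translation above.
#include <iostream>
#include <regex>
#include <stdexcept>
#include <string>

std::string torch_arch_from_cuda_arch(const std::string& cuda_arch) {
  static const std::regex arch_suffixed{R"(^([0-9])([0-9])a$)"};  // e.g. "90a"
  static const std::regex arch_plain{R"(^([0-9])([0-9])$)"};      // e.g. "80"
  std::smatch m;
  if (std::regex_match(cuda_arch, m, arch_suffixed)) {
    return m[1].str() + "." + m[2].str() + "a";  // "90a" -> "9.0a"
  }
  if (std::regex_match(cuda_arch, m, arch_plain)) {
    return m[1].str() + "." + m[2].str();        // "80" -> "8.0"
  }
  if (cuda_arch == "native") {
    return "Auto";  // let Torch detect the local GPU
  }
  throw std::invalid_argument(cuda_arch + " is not supported");
}

int main() {
  for (const std::string arch : {"80", "89", "90a", "native"}) {
    std::cout << arch << " -> " << torch_arch_from_cuda_arch(arch) << "\n";
  }
  return 0;
}
```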
32 changes: 26 additions & 6 deletions setup.py
@@ -98,7 +98,6 @@ def get_python_include_path():
return None


# PYTORCH_INSTALL_PATH and LIBTORCH_ROOT
def get_torch_root_path():
try:
import torch
@@ -115,6 +114,12 @@ def get_torch_mlu_root_path():
except ImportError:
return None

def get_nccl_root_path():
    try:
        from nvidia import nccl
        return str(Path(nccl.__file__).parent)
    except ImportError:
        return None

def set_npu_envs():
PYTORCH_NPU_INSTALL_PATH = os.getenv("PYTORCH_NPU_INSTALL_PATH")
@@ -212,7 +217,16 @@ def set_mlu_envs():
os.environ["LIBTORCH_ROOT"] = get_torch_root_path()
os.environ["PYTORCH_INSTALL_PATH"] = get_torch_root_path()
os.environ["PYTORCH_MLU_INSTALL_PATH"] = get_torch_mlu_root_path()


def set_cuda_envs():
os.environ["PYTHON_INCLUDE_PATH"] = get_python_include_path()
os.environ["PYTHON_LIB_PATH"] = get_torch_root_path()
os.environ["LIBTORCH_ROOT"] = get_torch_root_path()
os.environ["PYTORCH_INSTALL_PATH"] = get_torch_root_path()
os.environ["CUDA_TOOLKIT_ROOT_DIR"] = "/usr/local/cuda"
os.environ["NCCL_ROOT"] = get_nccl_root_path()
os.environ["NCCL_VERSION"] = "2"

class CMakeExtension(Extension):
def __init__(self, name: str, path: str, sourcedir: str = "") -> None:
super().__init__(name, sources=[])
@@ -223,7 +237,7 @@ def __init__(self, name: str, path: str, sourcedir: str = "") -> None:
class ExtBuild(build_ext):
user_options = build_ext.user_options + [
("base-dir=", None, "base directory of xLLM project"),
("device=", None, "target device type (a3 or a2 or mlu)"),
("device=", None, "target device type (a3 or a2 or mlu or cuda)"),
("arch=", None, "target arch type (x86 or arm)"),
("install-xllm-kernels=", None, "install xllm_kernels RPM package (true/false)"),
]
@@ -302,8 +316,14 @@ def build_extension(self, ext: CMakeExtension):
cmake_args += ["-DUSE_MLU=ON"]
# set mlu environment variables
set_mlu_envs()
elif self.device == "cuda":
cuda_architectures = "80;89;90"
cmake_args += ["-DUSE_CUDA=ON",
f"-DCMAKE_CUDA_ARCHITECTURES={cuda_architectures}"]
# set cuda environment variables
set_cuda_envs()
else:
raise ValueError("Please set --device to a2 or a3 or mlu.")
raise ValueError("Please set --device to a2 or a3 or mlu or cuda.")


# Adding CMake arguments set as environment variable
@@ -353,7 +373,7 @@ def build_extension(self, ext: CMakeExtension):

class BuildDistWheel(bdist_wheel):
user_options = bdist_wheel.user_options + [
("device=", None, "target device type (a3 or a2 or mlu)"),
("device=", None, "target device type (a3 or a2 or mlu or cuda)"),
("arch=", None, "target arch type (x86 or arm)"),
]

@@ -530,7 +550,7 @@ def apply_patch():
idx = sys.argv.index('--device')
if idx + 1 < len(sys.argv):
device = sys.argv[idx+1].lower()
if device not in ('a2', 'a3', 'mlu'):
if device not in ('a2', 'a3', 'mlu', 'cuda'):
print("Error: --device must be a2 or a3 or mlu (case-insensitive)")
sys.exit(1)
# Remove the arguments so setup() doesn't see them
1 change: 1 addition & 0 deletions third_party/dlpack
Submodule dlpack added at 93c8f2
1 change: 1 addition & 0 deletions third_party/tvm-ffi
Submodule tvm-ffi added at af898a
2 changes: 2 additions & 0 deletions xllm/core/common/CMakeLists.txt
@@ -15,6 +15,7 @@ cc_library(
rate_limiter.h
types.h
device_monitor.h
flashinfer_workspace.h
SRCS
etcd_client.cpp
global_flags.cpp
@@ -23,6 +24,7 @@ cc_library(
options.cpp
rate_limiter.cpp
device_monitor.cpp
flashinfer_workspace.cpp
DEPS
util
absl::random_random
49 changes: 49 additions & 0 deletions xllm/core/common/flashinfer_workspace.cpp
@@ -0,0 +1,49 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "flashinfer_workspace.h"

#include <glog/logging.h>

#include "global_flags.h"

namespace xllm {

void FlashinferWorkspace::initialize(const torch::Device& device) {
  LOG(INFO) << "FlashinferWorkspace initialize on device: " << device;
  float_workspace_buffer_ =
      torch::empty({FLAGS_workspace_buffer_size},
                   torch::dtype(torch::kUInt8).device(device));
  int_workspace_buffer_ = torch::empty(
      {128 * 1024 * 1024}, torch::dtype(torch::kUInt8).device(device));
  page_locked_int_workspace_buffer_ = torch::empty(
      {int_workspace_buffer_.size(0)},
      torch::dtype(torch::kUInt8).device(torch::kCPU).pinned_memory(true));
  LOG(INFO) << "FlashinferWorkspace initialize end";
}

torch::Tensor FlashinferWorkspace::get_float_workspace_buffer() {
  return float_workspace_buffer_;
}

torch::Tensor FlashinferWorkspace::get_int_workspace_buffer() {
  return int_workspace_buffer_;
}

torch::Tensor FlashinferWorkspace::get_page_locked_int_workspace_buffer() {
  return page_locked_int_workspace_buffer_;
}

} // namespace xllm
49 changes: 49 additions & 0 deletions xllm/core/common/flashinfer_workspace.h
@@ -0,0 +1,49 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once

#include <torch/torch.h>

#include <cstdint>

#include "macros.h"

namespace xllm {

class FlashinferWorkspace {
 public:
  static FlashinferWorkspace& get_instance() {
    static FlashinferWorkspace instance;
    return instance;
  }

  void initialize(const torch::Device& device);

  torch::Tensor get_float_workspace_buffer();
  torch::Tensor get_int_workspace_buffer();
  torch::Tensor get_page_locked_int_workspace_buffer();

 private:
  FlashinferWorkspace() = default;
  ~FlashinferWorkspace() = default;
  DISALLOW_COPY_AND_ASSIGN(FlashinferWorkspace);

  torch::Tensor float_workspace_buffer_;
  torch::Tensor int_workspace_buffer_;
  torch::Tensor page_locked_int_workspace_buffer_;
};

} // namespace xllm
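A minimal usage sketch of the new singleton, assuming it is initialized once per device during worker setup before any FlashInfer plan/run calls; the function name, include path, and logging below are illustrative, not part of this PR.

```cpp
#include <glog/logging.h>
#include <torch/torch.h>

#include "flashinfer_workspace.h"  // illustrative include path

// Hypothetical worker setup: initialize the shared workspace once, then hand
// the buffers to FlashInfer's plan/run wrappers (not shown here).
void init_attention_workspace(int device_index) {
  torch::Device device(torch::kCUDA, device_index);
  auto& ws = xllm::FlashinferWorkspace::get_instance();
  ws.initialize(device);

  // float workspace: FLAGS_workspace_buffer_size bytes on the GPU;
  // int workspace: 128 MiB on the GPU, mirrored by a page-locked CPU copy.
  torch::Tensor float_ws = ws.get_float_workspace_buffer();
  torch::Tensor int_ws = ws.get_int_workspace_buffer();
  torch::Tensor pinned_int_ws = ws.get_page_locked_int_workspace_buffer();

  LOG(INFO) << "float workspace bytes: " << float_ws.numel()
            << ", int workspace bytes: " << int_ws.numel()
            << ", pinned mirror: " << pinned_int_ws.is_pinned();
}
```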
11 changes: 11 additions & 0 deletions xllm/core/common/global_flags.cpp
@@ -353,6 +353,7 @@ DEFINE_int32(micro_batch_num,
"Default use two micro batches for multi-stream parallel.");

// --- dit config ---

DEFINE_int32(max_requests_per_batch, 1, "Max number of request per batch.");

// --- continuous kv cache config ---
@@ -377,15 +378,25 @@ DEFINE_int64(buffer_size_per_seq,
"Buffer size per sequence in bytes, default 0.");

// --- beam search config ---

DEFINE_bool(enable_beam_search_kernel,
false,
"Whether to enable beam search kernel.");

// --- reasoning parser config ---

DEFINE_string(reasoning_parser,
"",
"Specify the reasoning parser for handling reasoning "
"interactions(e.g. glm45, qwen3, deepseek-r1).");

// --- qwen3 reranker config ---

DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker.");

// --- flashinfer config ---

DEFINE_int32(workspace_buffer_size,
             128 * 1024 * 1024,
             "Size in bytes of the user-reserved workspace buffer used to "
             "store intermediate attention results in FlashInfer's split-k "
             "algorithm.");
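The new flag sizes the float workspace allocated by FlashinferWorkspace (128 MiB by default). A minimal sketch, assuming the standard gflags startup path, of reading and overriding it at launch; the binary name and the 256 MiB value are only examples, and in xllm the declaration comes from global_flags.h.

```cpp
#include <gflags/gflags.h>
#include <glog/logging.h>

// Declared here for illustration; in xllm this comes from global_flags.h,
// with the definition in global_flags.cpp.
DECLARE_int32(workspace_buffer_size);

int main(int argc, char* argv[]) {
  // e.g. ./xllm --workspace_buffer_size=268435456   (256 MiB)
  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
  LOG(INFO) << "FlashInfer float workspace bytes: "
            << FLAGS_workspace_buffer_size;
  return 0;
}
```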
2 changes: 2 additions & 0 deletions xllm/core/common/global_flags.h
@@ -202,3 +202,5 @@ DECLARE_bool(enable_qwen3_reranker);
DECLARE_string(reasoning_parser);

DECLARE_bool(enable_shm);

DECLARE_int32(workspace_buffer_size);
1 change: 1 addition & 0 deletions xllm/core/distributed_runtime/worker_server.cpp
@@ -98,6 +98,7 @@ void WorkerServer::create_server(

CollectiveCommunicator comm(worker_global_rank, world_size, dp_size, ep_size);
const ParallelArgs* parallel_args = comm.parallel_args();
// TODO: fix bug when creating cuda process group
#if defined(USE_MLU) || defined(USE_CUDA)
comm.create_process_groups(master_node_addr, device);
#endif