Skip to content
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
b182ccd
[cmake] Use object library to compile cuda files to cubin/fatbin
oraluben Dec 2, 2025
c997be8
check for cudart version
oraluben Dec 3, 2025
0aa4b0e
tmp
oraluben Dec 15, 2025
cbacfcb
[1/n] Use unify API
oraluben Dec 15, 2025
6a21adb
[2/n] Remove all rt api
oraluben Dec 16, 2025
b553c7e
[3/3] Unify rt and driver api
oraluben Dec 16, 2025
6bffd5e
upd namespace
oraluben Dec 16, 2025
848b8e2
fix version check
oraluben Dec 16, 2025
8bea855
adapt dynamic example
oraluben Dec 16, 2025
2125287
update
oraluben Dec 16, 2025
b8b246c
embed example
oraluben Dec 16, 2025
2427cc4
update cmake doc
oraluben Dec 16, 2025
e9546c1
update cmake doc
oraluben Dec 16, 2025
fb01e61
upd
oraluben Dec 17, 2025
871f686
rename macro
oraluben Dec 17, 2025
556629b
Add an example with cpp's resource inclusion
oraluben Dec 17, 2025
d6bb52a
add example for bin2c
oraluben Dec 17, 2025
463c3bf
cleanup
oraluben Dec 17, 2025
617a686
update doc
oraluben Dec 17, 2025
1ca425c
lint
oraluben Dec 17, 2025
30c11c6
Refactor CUBIN embedding macros to use TVM_FFI_LOAD_LIBRARY_FROM_BYTE…
oraluben Dec 17, 2025
5378347
doc
oraluben Dec 17, 2025
5b13cbf
vibe documenting
oraluben Dec 17, 2025
03819ef
lint
oraluben Dec 17, 2025
a28bf1b
add doc for TVM_FFI_LOAD_LIBRARY_FROM_BYTES
oraluben Dec 17, 2025
3a3c3ad
ci
oraluben Dec 17, 2025
acdc148
upd
oraluben Dec 17, 2025
71a0a85
use proper name for cuda result type
oraluben Dec 18, 2025
4134d13
Use a better signature for `tvm_ffi_embed_bin_into`
oraluben Dec 18, 2025
8cda1f2
Rename `TVM_FFI_LOAD_LIBRARY_FROM_BYTES` to `TVM_FFI_EMBED_CUBIN_FROM…
oraluben Dec 19, 2025
6dbe317
Remove `INTERMEDIATE_FILE` arg
oraluben Dec 19, 2025
8db2de4
Move copy util to a separate file
oraluben Dec 19, 2025
48c3c0c
Merge branch 'main' into embed-cubin-v2
oraluben Dec 23, 2025
81ae232
Set `CMAKE_CUDA_RUNTIME_LIBRARY` to Shared when no default value and …
oraluben Dec 23, 2025
6a98732
move unified api to new location and namespace
oraluben Dec 23, 2025
11253ec
Fix issues found by gemini
oraluben Dec 23, 2025
2d177b6
doc update from gemini
oraluben Dec 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -295,3 +295,6 @@ _docs/
.gdb_history

build/

*.cubin
*.fatbin
288 changes: 123 additions & 165 deletions cmake/Utils/EmbedCubin.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -15,205 +15,163 @@
# specific language governing permissions and limitations
# under the License.

# Do not let CMake link cudart. This variable initializes the CUDA runtime library choice of
# targets created after this point.
set(CMAKE_CUDA_RUNTIME_LIBRARY None)

# Helper script used by `add_tvm_ffi_{cubin,fatbin}` to simulate `CUDA_{CUBIN,FATBIN}_COMPILATION`:
# it copies each object file (e.g. `a.cu.o`) to `a.cubin`/`a.fatbin`.
#
# The script is written at configure time and is meant to be run with `cmake -P`, with these
# variables defined on the command line:
#   OBJECTS: semicolon-separated list of object files to copy
#   OUT_DIR: destination directory; if empty, each copy is placed next to its object file
#   EXT:     extension of the copied file, without the leading dot (e.g. cubin)
# The values may arrive wrapped in literal double quotes, so the script strips them first.
set(COPY_SCRIPT "${CMAKE_BINARY_DIR}/cuda_copy_utils.cmake")
file(
  WRITE "${COPY_SCRIPT}"
  "
# Arguments: OBJECTS (semicolon-separated list), OUT_DIR, EXT
# Strip the literal double quotes that may surround the -D values.
string(REPLACE \"\\\"\" \"\" ext_strip \"\${EXT}\")
string(REPLACE \"\\\"\" \"\" out_dir_strip \"\${OUT_DIR}\")
foreach(obj_raw \${OBJECTS})
  string(REPLACE \"\\\"\" \"\" obj \"\${obj_raw}\")
  # Extract the base name: /path/to/kernel.cu.o -> kernel
  # NAME_WE removes the longest extension, so a single call strips both .cu and .o.
  get_filename_component(fname \"\${obj}\" NAME_WE)
  # If OUT_DIR is provided, use it. Otherwise, use the object's directory.
  if(NOT out_dir_strip STREQUAL \"\")
    set(final_dir \"\${out_dir_strip}\")
  else()
    get_filename_component(final_dir \"\${obj}\" DIRECTORY)
  endif()
  set(dest \"\${final_dir}/\${fname}.\${ext_strip}\")
  message(\"Copying \${obj} -> \${dest}\")
  execute_process(
    COMMAND \${CMAKE_COMMAND} -E copy_if_different \"\${obj}\" \"\${dest}\"
    RESULT_VARIABLE copy_result
  )
  # Fail the build step instead of silently leaving a missing or stale output file.
  if(NOT copy_result EQUAL 0)
    message(FATAL_ERROR \"Failed to copy \${obj} -> \${dest} (result: \${copy_result})\")
  endif()
endforeach()
"
)

# ~~~
# tvm_ffi_generate_cubin(
# OUTPUT <output_cubin_file>
# SOURCE <cuda_source_file>
# [ARCH <architecture>]
# [OPTIONS <extra_nvcc_options>...]
# [DEPENDS <additional_dependencies>...]
# )
# add_tvm_ffi_cubin(<target_name> CUDA <source_file>)
#
# Compiles a CUDA source file to CUBIN format using nvcc.
# Creates an object library that compiles CUDA source to CUBIN format.
# This function uses CMake's native CUDA support and respects CMAKE_CUDA_ARCHITECTURES.
# This is a compatibility utility for CMake < 3.27; with CMake >= 3.27, users can instead
# create a target with `CUDA_CUBIN_COMPILATION` enabled.
#
# Parameters:
# OUTPUT: Path to the output CUBIN file (e.g., kernel.cubin)
# SOURCE: Path to the CUDA source file (e.g., kernel.cu)
# ARCH: Target GPU architecture (default: native for auto-detection)
# Examples: sm_75, sm_80, sm_86, compute_80, native
# OPTIONS: Additional nvcc compiler options (e.g., -O3, --use_fast_math)
# DEPENDS: Optional additional dependencies
#
# The function will:
# 1. Find the CUDA compiler (nvcc)
# 2. Compile the SOURCE to CUBIN with specified architecture and options
# 3. Create the output CUBIN file
# target_name: Name of the object library target
# CUDA: One CUDA source file
#
# Example:
# tvm_ffi_generate_cubin(
# OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/kernel.cubin
# SOURCE src/kernel.cu
# ARCH native
# OPTIONS -O3 --use_fast_math
# )
# add_tvm_ffi_cubin(my_kernel_cubin CUDA kernel.cu)
# ~~~

# cmake-lint: disable=C0111,C0103
function (tvm_ffi_generate_cubin)
# Parse arguments
set(options "")
set(oneValueArgs OUTPUT SOURCE ARCH)
set(multiValueArgs OPTIONS DEPENDS)
cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

# Validate required arguments
if (NOT ARG_OUTPUT)
message(FATAL_ERROR "tvm_ffi_generate_cubin: OUTPUT is required")
endif ()
if (NOT ARG_SOURCE)
message(FATAL_ERROR "tvm_ffi_generate_cubin: SOURCE is required")
function (add_tvm_ffi_cubin target_name)
cmake_parse_arguments(ARG "" "CUDA" "" ${ARGN})
if (NOT ARG_CUDA)
message(FATAL_ERROR "add_tvm_ffi_cubin: CUDA source is required")
endif ()

# Default architecture to native if not specified
if (NOT ARG_ARCH)
set(ARG_ARCH "native")
endif ()
add_library(${target_name} OBJECT ${ARG_CUDA})
target_compile_options(${target_name} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--cubin>)

# Ensure CUDA compiler is available
if (NOT CMAKE_CUDA_COMPILER)
message(
FATAL_ERROR
"tvm_ffi_generate_cubin: CMAKE_CUDA_COMPILER not found. Enable CUDA language in project()."
)
add_custom_target(
${target_name}_bin ALL
COMMAND ${CMAKE_COMMAND} -DOBJECTS="$<TARGET_OBJECTS:${target_name}>" -DOUT_DIR="" -DEXT="cubin"
-P "${COPY_SCRIPT}"
DEPENDS ${target_name}
COMMENT "Generating .cubin files for ${target_name}"
VERBATIM
)
endfunction ()

# ~~~
# add_tvm_ffi_fatbin(<target_name> CUDA <source_file>)
#
# Creates an object library that compiles CUDA source to FATBIN format.
# This function uses CMake's native CUDA support and respects CMAKE_CUDA_ARCHITECTURES.
# This is a compatibility utility for CMake < 3.27; with CMake >= 3.27, users can instead
# create a target with `CUDA_FATBIN_COMPILATION` enabled.
#
# Parameters:
# target_name: Name of the object library target
# CUDA: One CUDA source file
#
# Example:
# add_tvm_ffi_fatbin(my_kernel_fatbin CUDA kernel.cu)
# ~~~
function (add_tvm_ffi_fatbin target_name)
cmake_parse_arguments(ARG "" "CUDA" "" ${ARGN})
if (NOT ARG_CUDA)
message(FATAL_ERROR "add_tvm_ffi_fatbin: CUDA source is required")
endif ()

# Get absolute paths
get_filename_component(ARG_SOURCE_ABS "${ARG_SOURCE}" ABSOLUTE)
get_filename_component(ARG_OUTPUT_ABS "${ARG_OUTPUT}" ABSOLUTE)
add_library(${target_name} OBJECT ${ARG_CUDA})
target_compile_options(${target_name} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--fatbin>)

# Build nvcc command
add_custom_command(
OUTPUT "${ARG_OUTPUT_ABS}"
COMMAND ${CMAKE_CUDA_COMPILER} --cubin -arch=${ARG_ARCH} ${ARG_OPTIONS} "${ARG_SOURCE_ABS}" -o
"${ARG_OUTPUT_ABS}"
DEPENDS "${ARG_SOURCE_ABS}" ${ARG_DEPENDS}
COMMENT "Compiling ${ARG_SOURCE} to CUBIN (arch: ${ARG_ARCH})"
add_custom_target(
${target_name}_bin ALL
COMMAND ${CMAKE_COMMAND} -DOBJECTS="$<TARGET_OBJECTS:${target_name}>" -DOUT_DIR=""
-DEXT="fatbin" -P "${COPY_SCRIPT}"
DEPENDS ${target_name}
COMMENT "Generating .fatbin files for ${target_name}"
VERBATIM
)
endfunction ()

# ~~~
# tvm_ffi_embed_cubin(
# OUTPUT <output_object_file>
# SOURCE <source_file>
# CUBIN <cubin_file>
# NAME <symbol_name>
# [DEPENDS <additional_dependencies>...]
# )
# tvm_ffi_embed_bin_into(<target_name> <library_name>
# BIN <cubin_or_fatbin>
# INTERMEDIATE_FILE <intermediate_path>)
#
# Compiles a C++ source file and embeds a CUBIN file into it, creating a
# combined object file that can be linked into a shared library or executable.
# Embeds one cubin/fatbin into the given target under the specified library name;
# it can then be loaded with `TVM_FFI_EMBED_CUBIN(library_name)`.
# The target can only have one object file, and only one cubin/fatbin can be embedded.
#
# Parameters:
# OUTPUT: Path to the output object file (e.g., lib_embedded_with_cubin.o)
# SOURCE: Path to the C++ source file that uses TVM_FFI_EMBED_CUBIN macro
# CUBIN: Path to the CUBIN file to embed (can be a file path or a custom target output)
# NAME: Name used in the TVM_FFI_EMBED_CUBIN macro (e.g., "env" for TVM_FFI_EMBED_CUBIN(env))
# DEPENDS: Optional additional dependencies (e.g., custom targets)
# This design was chosen to integrate with CMake's workflow.
#
# The function will:
# 1. Compile the SOURCE file to an intermediate object file
# 2. Use the tvm_ffi.utils.embed_cubin Python utility to merge the object file
# with the CUBIN data
# 3. Create symbols: __tvm_ffi__cubin_<NAME> and __tvm_ffi__cubin_<NAME>_end
# Parameters:
# target_name: Name of the object library target
# library_name: Name of the kernel library
# BIN: CUBIN or FATBIN file
# INTERMEDIATE_FILE: Optional, location to copy original object file to.
#
# Example:
# tvm_ffi_embed_cubin(
# OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/lib_embedded_with_cubin.o
# SOURCE src/lib_embedded.cc
# CUBIN ${CMAKE_CURRENT_BINARY_DIR}/kernel.cubin
# NAME env
# )
#
# add_library(lib_embedded SHARED ${CMAKE_CURRENT_BINARY_DIR}/lib_embedded_with_cubin.o)
# target_link_libraries(lib_embedded PRIVATE tvm_ffi_header CUDA::cudart)
#
# Note: The .note.GNU-stack section is automatically added to mark the stack as
# non-executable, so you don't need to add linker options manually
# tvm_ffi_embed_bin_into(lib_embedded env BIN "$<TARGET_OBJECTS:kernel_fatbin>")
# ~~~
function (tvm_ffi_embed_bin_into target_name kernel_name)
cmake_parse_arguments(ARG "" "BIN;INTERMEDIATE_FILE" "" ${ARGN})

# cmake-lint: disable=C0111,C0103
function (tvm_ffi_embed_cubin)
# Parse arguments
set(options "")
set(oneValueArgs OUTPUT SOURCE CUBIN NAME)
set(multiValueArgs DEPENDS)
cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

# Validate required arguments
if (NOT ARG_OUTPUT)
message(FATAL_ERROR "tvm_ffi_embed_cubin: OUTPUT is required")
endif ()
if (NOT ARG_SOURCE)
message(FATAL_ERROR "tvm_ffi_embed_cubin: SOURCE is required")
endif ()
if (NOT ARG_CUBIN)
message(FATAL_ERROR "tvm_ffi_embed_cubin: CUBIN is required")
endif ()
if (NOT ARG_NAME)
message(FATAL_ERROR "tvm_ffi_embed_cubin: NAME is required")
# BIN is mandatory; report the error under this function's actual name so the failing call is
# easy to locate.
if (NOT ARG_BIN)
  message(FATAL_ERROR "tvm_ffi_embed_bin_into: BIN is required")
endif ()

# Ensure Python is found (prefer virtualenv)
if (NOT Python_EXECUTABLE)
set(Python_FIND_VIRTUALENV FIRST)
find_package(
Python
COMPONENTS Interpreter
REQUIRED
)
endif ()
get_filename_component(LIB_ABS "$<TARGET_OBJECTS:${target_name}>" ABSOLUTE)
if (NOT ARG_INTERMEDIATE_FILE)
get_filename_component(OUTPUT_DIR_ABS "${LIB_ABS}" DIRECTORY)

# Get absolute paths
get_filename_component(ARG_SOURCE_ABS "${ARG_SOURCE}" ABSOLUTE)
get_filename_component(ARG_OUTPUT_ABS "${ARG_OUTPUT}" ABSOLUTE)

# Generate intermediate object file path
get_filename_component(OUTPUT_DIR "${ARG_OUTPUT_ABS}" DIRECTORY)
get_filename_component(OUTPUT_NAME "${ARG_OUTPUT_ABS}" NAME_WE)
set(INTERMEDIATE_OBJ "${OUTPUT_DIR}/${OUTPUT_NAME}_intermediate.o")

# Get include directories from tvm_ffi_header
get_target_property(TVM_FFI_INCLUDES tvm_ffi_header INTERFACE_INCLUDE_DIRECTORIES)

# Convert list to -I flags
set(INCLUDE_FLAGS "")
foreach (inc_dir ${TVM_FFI_INCLUDES})
list(APPEND INCLUDE_FLAGS "-I${inc_dir}")
endforeach ()

# Add CUDA include directories if CUDAToolkit is found
if (TARGET CUDA::cudart)
get_target_property(CUDA_INCLUDES CUDA::cudart INTERFACE_INCLUDE_DIRECTORIES)
foreach (inc_dir ${CUDA_INCLUDES})
list(APPEND INCLUDE_FLAGS "-I${inc_dir}")
endforeach ()
set(final_output "${OUTPUT_DIR_ABS}/${kernel_name}_intermediate.o")
else ()
get_filename_component(final_output "${ARG_INTERMEDIATE_FILE}" ABSOLUTE)
endif ()

# Step 1: Compile source file to intermediate object file
add_custom_command(
OUTPUT "${INTERMEDIATE_OBJ}"
COMMAND ${CMAKE_CXX_COMPILER} -c -fPIC -std=c++17 ${INCLUDE_FLAGS} "${ARG_SOURCE_ABS}" -o
"${INTERMEDIATE_OBJ}"
DEPENDS "${ARG_SOURCE_ABS}"
COMMENT "Compiling ${ARG_SOURCE} to intermediate object file"
VERBATIM
TARGET ${target_name}
PRE_LINK
COMMAND ${CMAKE_COMMAND} -E copy_if_different "$<TARGET_OBJECTS:${target_name}>"
"${final_output}"
COMMENT "Moving $<TARGET_OBJECTS:${target_name}> -> ${final_output}"
)

# Step 2: Embed CUBIN into the object file using Python utility Note: The Python utility
# automatically adds .note.GNU-stack section
add_custom_command(
OUTPUT "${ARG_OUTPUT_ABS}"
COMMAND ${Python_EXECUTABLE} -m tvm_ffi.utils.embed_cubin --output-obj "${ARG_OUTPUT_ABS}"
--input-obj "${INTERMEDIATE_OBJ}" --cubin "${ARG_CUBIN}" --name "${ARG_NAME}"
DEPENDS "${INTERMEDIATE_OBJ}" "${ARG_CUBIN}" ${ARG_DEPENDS}
COMMENT "Embedding CUBIN into object file (name: ${ARG_NAME})"
TARGET ${target_name}
PRE_LINK
COMMAND
${Python_EXECUTABLE} -m tvm_ffi.utils.embed_cubin --output-obj
"$<TARGET_OBJECTS:${target_name}>" --name "${kernel_name}" --input-obj "${final_output}"
--cubin "${ARG_BIN}" DEPENDS
COMMENT "Embedding CUBIN into object file (name: ${kernel_name})"
VERBATIM
)

# Set a variable in parent scope so users can add dependencies
set(${ARG_NAME}_EMBEDDED_OBJ
"${ARG_OUTPUT_ABS}"
PARENT_SCOPE
)
endfunction ()
Loading