Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
b182ccd
[cmake] Use object library to compile cuda files to cubin/fatbin
oraluben Dec 2, 2025
c997be8
check for cudart version
oraluben Dec 3, 2025
0aa4b0e
tmp
oraluben Dec 15, 2025
cbacfcb
[1/n] Use unify API
oraluben Dec 15, 2025
6a21adb
[2/n] Remove all rt api
oraluben Dec 16, 2025
b553c7e
[3/3] Unify rt and driver api
oraluben Dec 16, 2025
6bffd5e
upd namespace
oraluben Dec 16, 2025
848b8e2
fix version check
oraluben Dec 16, 2025
8bea855
adapt dynamic example
oraluben Dec 16, 2025
2125287
update
oraluben Dec 16, 2025
b8b246c
embed example
oraluben Dec 16, 2025
2427cc4
update cmake doc
oraluben Dec 16, 2025
e9546c1
update cmake doc
oraluben Dec 16, 2025
fb01e61
upd
oraluben Dec 17, 2025
871f686
rename macro
oraluben Dec 17, 2025
556629b
Add an example with cpp's resource inclusion
oraluben Dec 17, 2025
d6bb52a
add example for bin2c
oraluben Dec 17, 2025
463c3bf
cleanup
oraluben Dec 17, 2025
617a686
update doc
oraluben Dec 17, 2025
1ca425c
lint
oraluben Dec 17, 2025
30c11c6
Refactor CUBIN embedding macros to use TVM_FFI_LOAD_LIBRARY_FROM_BYTE…
oraluben Dec 17, 2025
5378347
doc
oraluben Dec 17, 2025
5b13cbf
vibe documenting
oraluben Dec 17, 2025
03819ef
lint
oraluben Dec 17, 2025
a28bf1b
add doc for TVM_FFI_LOAD_LIBRARY_FROM_BYTES
oraluben Dec 17, 2025
3a3c3ad
ci
oraluben Dec 17, 2025
acdc148
upd
oraluben Dec 17, 2025
71a0a85
use proper name for cuda result type
oraluben Dec 18, 2025
4134d13
Use a better signature for `tvm_ffi_embed_bin_into`
oraluben Dec 18, 2025
8cda1f2
Rename `TVM_FFI_LOAD_LIBRARY_FROM_BYTES` to `TVM_FFI_EMBED_CUBIN_FROM…
oraluben Dec 19, 2025
6dbe317
Remove `INTERMEDIATE_FILE` arg
oraluben Dec 19, 2025
8db2de4
Move copy util to a separate file
oraluben Dec 19, 2025
48c3c0c
Merge branch 'main' into embed-cubin-v2
oraluben Dec 23, 2025
81ae232
Set `CMAKE_CUDA_RUNTIME_LIBRARY` to Shared when no default value and …
oraluben Dec 23, 2025
6a98732
move unified api to new location and namespace
oraluben Dec 23, 2025
11253ec
Fix issues found by gemini
oraluben Dec 23, 2025
2d177b6
doc update from gemini
oraluben Dec 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -295,3 +295,6 @@ _docs/
.gdb_history

build/

*.cubin
*.fatbin
269 changes: 93 additions & 176 deletions cmake/Utils/EmbedCubin.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -15,215 +15,132 @@
# specific language governing permissions and limitations
# under the License.

# If CMAKE_CUDA_RUNTIME_LIBRARY is not set, we default it to Shared. This prevents static linking of
# cudart which requires exact driver version match.
# The default is applied only when the variable is undefined, so any value the user has already
# chosen (including None) is left untouched.
if (NOT DEFINED CMAKE_CUDA_RUNTIME_LIBRARY)
set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
message(STATUS "CMAKE_CUDA_RUNTIME_LIBRARY not set, defaulting to Shared. "
"If you want to use driver API only, set CMAKE_CUDA_RUNTIME_LIBRARY to None."
)
endif ()

# Path to the ObjectCopyUtil.cmake helper script that lives next to this file. The `<target>_bin`
# custom targets defined in this file run it in script mode (`cmake -P`) to copy compiled objects
# to files with a `.cubin`/`.fatbin` extension.
set(OBJECT_COPY_UTIL "${CMAKE_CURRENT_LIST_DIR}/ObjectCopyUtil.cmake")

# ~~~
# tvm_ffi_generate_cubin(
# OUTPUT <output_cubin_file>
# SOURCE <cuda_source_file>
# [ARCH <architecture>]
# [OPTIONS <extra_nvcc_options>...]
# [DEPENDS <additional_dependencies>...]
# )
# add_tvm_ffi_cubin(<target_name> CUDA <source_file>)
#
# Compiles a CUDA source file to CUBIN format using nvcc.
# Creates an object library that compiles CUDA source to CUBIN format.
# This function uses CMake's native CUDA support and respects CMAKE_CUDA_ARCHITECTURES.
# This is a compatibility utility for CMake < 3.27; with CMake >= 3.27, users can instead
# create a target with the `CUDA_CUBIN_COMPILATION` property enabled.
#
# Parameters:
# OUTPUT: Path to the output CUBIN file (e.g., kernel.cubin)
# SOURCE: Path to the CUDA source file (e.g., kernel.cu)
# ARCH: Target GPU architecture (default: native for auto-detection)
# Examples: sm_75, sm_80, sm_86, compute_80, native
# OPTIONS: Additional nvcc compiler options (e.g., -O3, --use_fast_math)
# DEPENDS: Optional additional dependencies
#
# The function will:
# 1. Find the CUDA compiler (nvcc)
# 2. Compile the SOURCE to CUBIN with specified architecture and options
# 3. Create the output CUBIN file
# target_name: Name of the object library target
# CUDA: One CUDA source file
#
# Example:
# tvm_ffi_generate_cubin(
# OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/kernel.cubin
# SOURCE src/kernel.cu
# ARCH native
# OPTIONS -O3 --use_fast_math
# )
# add_tvm_ffi_cubin(my_kernel_cubin CUDA kernel.cu)
# ~~~

# cmake-lint: disable=C0111,C0103
function (tvm_ffi_generate_cubin)
# Parse arguments
set(options "")
set(oneValueArgs OUTPUT SOURCE ARCH)
set(multiValueArgs OPTIONS DEPENDS)
cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

# Validate required arguments
if (NOT ARG_OUTPUT)
message(FATAL_ERROR "tvm_ffi_generate_cubin: OUTPUT is required")
endif ()
if (NOT ARG_SOURCE)
message(FATAL_ERROR "tvm_ffi_generate_cubin: SOURCE is required")
function (add_tvm_ffi_cubin target_name)
cmake_parse_arguments(ARG "" "CUDA" "" ${ARGN})
if (NOT ARG_CUDA)
message(FATAL_ERROR "add_tvm_ffi_cubin: CUDA source is required")
endif ()

# Default architecture to native if not specified
if (NOT ARG_ARCH)
set(ARG_ARCH "native")
endif ()

# Ensure CUDA compiler is available
if (NOT CMAKE_CUDA_COMPILER)
message(
FATAL_ERROR
"tvm_ffi_generate_cubin: CMAKE_CUDA_COMPILER not found. Enable CUDA language in project()."
)
endif ()
add_library(${target_name} OBJECT ${ARG_CUDA})
target_compile_options(${target_name} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--cubin>)

# Get absolute paths
get_filename_component(ARG_SOURCE_ABS "${ARG_SOURCE}" ABSOLUTE)
get_filename_component(ARG_OUTPUT_ABS "${ARG_OUTPUT}" ABSOLUTE)

# Build nvcc command
add_custom_command(
OUTPUT "${ARG_OUTPUT_ABS}"
COMMAND ${CMAKE_CUDA_COMPILER} --cubin -arch=${ARG_ARCH} ${ARG_OPTIONS} "${ARG_SOURCE_ABS}" -o
"${ARG_OUTPUT_ABS}"
DEPENDS "${ARG_SOURCE_ABS}" ${ARG_DEPENDS}
COMMENT "Compiling ${ARG_SOURCE} to CUBIN (arch: ${ARG_ARCH})"
add_custom_target(
${target_name}_bin ALL
COMMAND ${CMAKE_COMMAND} -DOBJECTS="$<TARGET_OBJECTS:${target_name}>" -DOUT_DIR="" -DEXT="cubin"
-P "${OBJECT_COPY_UTIL}"
DEPENDS ${target_name}
COMMENT "Generating .cubin files for ${target_name}"
VERBATIM
)
endfunction ()

# ~~~
# tvm_ffi_embed_cubin(
# OUTPUT <output_object_file>
# SOURCE <source_file>
# CUBIN <cubin_file>
# NAME <symbol_name>
# [DEPENDS <additional_dependencies>...]
# )
# add_tvm_ffi_fatbin(<target_name> CUDA <source_file>)
#
# Compiles a C++ source file and embeds a CUBIN file into it, creating a
# combined object file that can be linked into a shared library or executable.
# Creates an object library that compiles CUDA source to FATBIN format.
# This function uses CMake's native CUDA support and respects CMAKE_CUDA_ARCHITECTURES.
# This is a compatibility utility for CMake < 3.27; with CMake >= 3.27, users can instead
# create a target with the `CUDA_FATBIN_COMPILATION` property enabled.
#
# Parameters:
# OUTPUT: Path to the output object file (e.g., lib_embedded_with_cubin.o)
# SOURCE: Path to the C++ source file that uses TVM_FFI_EMBED_CUBIN macro
# CUBIN: Path to the CUBIN file to embed (can be a file path or a custom target output)
# NAME: Name used in the TVM_FFI_EMBED_CUBIN macro (e.g., "env" for TVM_FFI_EMBED_CUBIN(env))
# DEPENDS: Optional additional dependencies (e.g., custom targets)
#
# The function will:
# 1. Compile the SOURCE file to an intermediate object file
# 2. Use the tvm_ffi.utils.embed_cubin Python utility to merge the object file
# with the CUBIN data
# 3. Create symbols: __tvm_ffi__cubin_<NAME> and __tvm_ffi__cubin_<NAME>_end
# target_name: Name of the object library target
# CUDA: One CUDA source file
#
# Example:
# tvm_ffi_embed_cubin(
# OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/lib_embedded_with_cubin.o
# SOURCE src/lib_embedded.cc
# CUBIN ${CMAKE_CURRENT_BINARY_DIR}/kernel.cubin
# NAME env
# )
#
# add_library(lib_embedded SHARED ${CMAKE_CURRENT_BINARY_DIR}/lib_embedded_with_cubin.o)
# target_link_libraries(lib_embedded PRIVATE tvm_ffi_header CUDA::cudart)
#
# Note: The .note.GNU-stack section is automatically added to mark the stack as
# non-executable, so you don't need to add linker options manually
# add_tvm_ffi_fatbin(my_kernel_fatbin CUDA kernel.cu)
# ~~~

# cmake-lint: disable=C0111,C0103
function (tvm_ffi_embed_cubin)
# Parse arguments
set(options "")
set(oneValueArgs OUTPUT SOURCE CUBIN NAME)
set(multiValueArgs DEPENDS)
cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

# Validate required arguments
if (NOT ARG_OUTPUT)
message(FATAL_ERROR "tvm_ffi_embed_cubin: OUTPUT is required")
endif ()
if (NOT ARG_SOURCE)
message(FATAL_ERROR "tvm_ffi_embed_cubin: SOURCE is required")
endif ()
if (NOT ARG_CUBIN)
message(FATAL_ERROR "tvm_ffi_embed_cubin: CUBIN is required")
endif ()
if (NOT ARG_NAME)
message(FATAL_ERROR "tvm_ffi_embed_cubin: NAME is required")
function (add_tvm_ffi_fatbin target_name)
cmake_parse_arguments(ARG "" "CUDA" "" ${ARGN})
if (NOT ARG_CUDA)
message(FATAL_ERROR "add_tvm_ffi_fatbin: CUDA source is required")
endif ()

# Ensure Python is found (prefer virtualenv)
if (NOT Python_EXECUTABLE)
set(Python_FIND_VIRTUALENV FIRST)
find_package(
Python
COMPONENTS Interpreter
REQUIRED
)
endif ()
add_library(${target_name} OBJECT ${ARG_CUDA})
target_compile_options(${target_name} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--fatbin>)

# Get absolute paths
get_filename_component(ARG_SOURCE_ABS "${ARG_SOURCE}" ABSOLUTE)
get_filename_component(ARG_OUTPUT_ABS "${ARG_OUTPUT}" ABSOLUTE)
add_custom_target(
${target_name}_bin ALL
COMMAND ${CMAKE_COMMAND} -DOBJECTS="$<TARGET_OBJECTS:${target_name}>" -DOUT_DIR=""
-DEXT="fatbin" -P "${OBJECT_COPY_UTIL}"
DEPENDS ${target_name}
COMMENT "Generating .fatbin files for ${target_name}"
VERBATIM
)
endfunction ()

# Generate intermediate object file path
get_filename_component(OUTPUT_DIR "${ARG_OUTPUT_ABS}" DIRECTORY)
get_filename_component(OUTPUT_NAME "${ARG_OUTPUT_ABS}" NAME_WE)
set(INTERMEDIATE_OBJ "${OUTPUT_DIR}/${OUTPUT_NAME}_intermediate.o")
# ~~~
# tvm_ffi_embed_bin_into(<target_name>
# SYMBOL <symbol_name>
# BIN <cubin_or_fatbin>)
#
# Embeds one cubin/fatbin into the given target under the specified symbol name, so that
# it can be loaded with `TVM_FFI_EMBED_CUBIN(symbol_name)`.
# The target must contain exactly one object file, and exactly one cubin/fatbin can be embedded.
#
# This design was chosen so that embedding integrates with CMake's normal target workflow.
#
# Parameters:
# target_name: Name of the object library target
# symbol_name: Name of the symbol in TVM_FFI_EMBED_CUBIN macro.
# BIN: CUBIN or FATBIN file
#
# Example:
# tvm_ffi_embed_bin_into(lib_embedded SYMBOL env BIN "$<TARGET_OBJECTS:kernel_fatbin>")
# ~~~
function (tvm_ffi_embed_bin_into target_name)
cmake_parse_arguments(ARG "" "SYMBOL;BIN" "" ${ARGN})

# Get include directories from tvm_ffi header target
if (TARGET tvm_ffi::header)
set(TVM_FFI_HEADER_TARGET tvm_ffi::header)
elseif (TARGET tvm_ffi_header)
set(TVM_FFI_HEADER_TARGET tvm_ffi_header)
else ()
message(
FATAL_ERROR
"tvm_ffi_embed_cubin: required target 'tvm_ffi::header' or 'tvm_ffi_header' does not exist."
)
if (NOT ARG_BIN)
message(FATAL_ERROR "tvm_ffi_embed_bin_into: BIN is required")
endif ()
get_target_property(TVM_FFI_INCLUDES ${TVM_FFI_HEADER_TARGET} INTERFACE_INCLUDE_DIRECTORIES)

# Convert list to -I flags
set(INCLUDE_FLAGS "")
foreach (inc_dir ${TVM_FFI_INCLUDES})
list(APPEND INCLUDE_FLAGS "-I${inc_dir}")
endforeach ()

# Add CUDA include directories if CUDAToolkit is found
if (TARGET CUDA::cudart)
get_target_property(CUDA_INCLUDES CUDA::cudart INTERFACE_INCLUDE_DIRECTORIES)
foreach (inc_dir ${CUDA_INCLUDES})
list(APPEND INCLUDE_FLAGS "-I${inc_dir}")
endforeach ()
if (NOT ARG_SYMBOL)
message(FATAL_ERROR "tvm_ffi_embed_bin_into: SYMBOL is required")
endif ()

# Step 1: Compile source file to intermediate object file
set(intermediate_path "${CMAKE_CURRENT_BINARY_DIR}/${ARG_SYMBOL}_intermediate.o")

add_custom_command(
OUTPUT "${INTERMEDIATE_OBJ}"
COMMAND ${CMAKE_CXX_COMPILER} -c -fPIC -std=c++17 ${INCLUDE_FLAGS} "${ARG_SOURCE_ABS}" -o
"${INTERMEDIATE_OBJ}"
DEPENDS "${ARG_SOURCE_ABS}"
COMMENT "Compiling ${ARG_SOURCE} to intermediate object file"
VERBATIM
TARGET ${target_name}
PRE_LINK
COMMAND ${CMAKE_COMMAND} -E copy_if_different "$<TARGET_OBJECTS:${target_name}>"
"${intermediate_path}"
COMMENT "Moving $<TARGET_OBJECTS:${target_name}> -> ${intermediate_path}"
)

# Step 2: Embed CUBIN into the object file using Python utility Note: The Python utility
# automatically adds .note.GNU-stack section
add_custom_command(
OUTPUT "${ARG_OUTPUT_ABS}"
COMMAND ${Python_EXECUTABLE} -m tvm_ffi.utils.embed_cubin --output-obj "${ARG_OUTPUT_ABS}"
--input-obj "${INTERMEDIATE_OBJ}" --cubin "${ARG_CUBIN}" --name "${ARG_NAME}"
DEPENDS "${INTERMEDIATE_OBJ}" "${ARG_CUBIN}" ${ARG_DEPENDS}
COMMENT "Embedding CUBIN into object file (name: ${ARG_NAME})"
TARGET ${target_name}
PRE_LINK
COMMAND
${Python_EXECUTABLE} -m tvm_ffi.utils.embed_cubin --output-obj
"$<TARGET_OBJECTS:${target_name}>" --name "${ARG_SYMBOL}" --input-obj "${intermediate_path}"
--cubin "${ARG_BIN}"
COMMENT "Embedding CUBIN into object file (name: ${ARG_SYMBOL})"
VERBATIM
)

# Set a variable in parent scope so users can add dependencies
set(${ARG_NAME}_EMBEDDED_OBJ
"${ARG_OUTPUT_ABS}"
PARENT_SCOPE
)
endfunction ()
48 changes: 48 additions & 0 deletions cmake/Utils/ObjectCopyUtil.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# We need this to simulate `CUDA_{CUBIN,FATBIN}_COMPILATION` in `add_tvm_ffi_{cubin,fatbin}`, to
# copy `a.cu.o` to `a.cubin`/`a.fatbin`.

# Usage: cmake -DOBJECTS=<input_object_file1>;...;<input_object_fileN> -DOUT_DIR=<output_directory>
# -DEXT=<extension> -P <this_script>

# Parameters: OBJECTS: semicolon-separated list of input object files; OUT_DIR: output directory
# (empty to use the same directory as each object file); EXT: extension to rename to

# The -D values may arrive wrapped in literal double quotes (the callers pass e.g. -DEXT="cubin"),
# so strip any quote characters before using them.
string(REPLACE "\"" "" ext_strip "${EXT}")
string(REPLACE "\"" "" out_dir_strip "${OUT_DIR}")

foreach (obj_raw ${OBJECTS})
  # When the whole list was wrapped in quotes, the first/last element still carries a stray quote.
  string(REPLACE "\"" "" obj "${obj_raw}")

  # Derive the output base name: /path/to/kernel.cu.o -> kernel. NAME_WE removes the longest
  # extension (everything from the first dot on), so one call strips both ".cu" and ".o". Note
  # that a source named "my.kernel.cu" therefore yields "my".
  get_filename_component(fname "${obj}" NAME_WE)

  # If OUT_DIR is provided, use it. Otherwise, use the object's directory.
  if (NOT out_dir_strip STREQUAL "")
    set(FINAL_DIR "${out_dir_strip}")
  else ()
    get_filename_component(FINAL_DIR "${obj}" DIRECTORY)
  endif ()

  set(dest_file "${FINAL_DIR}/${fname}.${ext_strip}")

  message("Copying ${obj} -> ${dest_file}")
  execute_process(
    COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${obj}" "${dest_file}"
    RESULT_VARIABLE copy_result
  )

  # Fail the build step instead of silently continuing without the expected output file.
  # A non-numeric result (process could not be started) also lands here.
  if (NOT copy_result EQUAL 0)
    message(FATAL_ERROR "Failed to copy ${obj} -> ${dest_file} (result: ${copy_result})")
  endif ()
endforeach ()
Loading