Skip to content
Merged
81 changes: 46 additions & 35 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/scheduler/resize.cpp
${NVFUSER_SRCS_DIR}/scheduler/runtime_info.cpp
${NVFUSER_SRCS_DIR}/scheduler/scheduler_types.cpp
${NVFUSER_SRCS_DIR}/scheduler/tools/cub_utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/tools/domain_map.cpp
${NVFUSER_SRCS_DIR}/scheduler/tools/inlining.cpp
${NVFUSER_SRCS_DIR}/scheduler/tools/loop_domain_scheduler.cpp
Expand Down Expand Up @@ -417,34 +418,38 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/validator_utils.cpp
)

# Add LLVM JIT related dependencies
set(LLVM_MINIMUM_VERSION "18.1")
find_package(LLVM REQUIRED CONFIG)
if(${LLVM_VERSION} VERSION_LESS ${LLVM_MINIMUM_VERSION})
message(FATAL_ERROR "LLVM ${LLVM_VERSION} does not meet the minimum version required: ${LLVM_MINIMUM_VERSION}")
endif()
llvm_map_components_to_libnames(LLVM_LIBS
support
core
orcjit
executionengine
irreader
nativecodegen
Target
Analysis
JITLink
Demangle
)
cmake_dependent_option(NVFUSER_HOST_IR_JIT "Build nvFuser with LLVM" ON "USE_HOST_IR_JIT" OFF)


message(STATUS "Setting NVFUSER_HOST_IR_JIT=${NVFUSER_HOST_IR_JIT}")

if(NVFUSER_HOST_IR_JIT)
add_compile_definitions(NVFUSER_HOST_IR_JIT)
# Add LLVM JIT related dependencies
find_package(LLVM 18.1 REQUIRED CONFIG)
llvm_map_components_to_libnames(LLVM_LIBS
support
core
orcjit
executionengine
irreader
nativecodegen
Target
Analysis
JITLink
Demangle
)

add_library(LLVM_JIT INTERFACE)
target_include_directories(LLVM_JIT SYSTEM INTERFACE ${LLVM_INCLUDE_DIRS})
target_compile_definitions(LLVM_JIT INTERFACE ${LLVM_DEFINITIONS})
target_link_libraries(LLVM_JIT INTERFACE ${LLVM_LIBS})
add_library(LLVM_JIT INTERFACE)
target_include_directories(LLVM_JIT INTERFACE ${LLVM_INCLUDE_DIRS})
target_compile_definitions(LLVM_JIT INTERFACE ${LLVM_DEFINITIONS})
target_link_libraries(LLVM_JIT INTERFACE ${LLVM_LIBS})

# Add LLVM JIT related sources
list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/host_ir/jit.cpp
)
# Add LLVM JIT related sources
list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/host_ir/jit.cpp
)
endif()

# We don't link CUPTI for MSVC
if(NOT MSVC)
Expand Down Expand Up @@ -540,7 +545,9 @@ if (BUILD_CUTLASS AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
target_compile_definitions(codegen_internal PRIVATE "-DNVFUSER_CUTLASS_KERNEL_ENABLED")
endif()

target_link_libraries(codegen_internal PUBLIC LLVM_JIT)
if(NVFUSER_HOST_IR_JIT)
target_link_libraries(codegen_internal PUBLIC LLVM_JIT)
endif()

add_library(nvfuser_codegen SHARED $<TARGET_OBJECTS:codegen_internal>)

Expand Down Expand Up @@ -575,7 +582,8 @@ target_include_directories(nvfuser_codegen SYSTEM PUBLIC
)
target_link_libraries(nvfuser_codegen
PUBLIC ${TORCH_LIBRARIES}
PRIVATE dynamic_type flatbuffers ${CUDA_NVRTC_LIB} CUDA::cupti dl LLVM_JIT
PRIVATE dynamic_type flatbuffers ${CUDA_NVRTC_LIB} CUDA::cupti dl
$<$<BOOL:${NVFUSER_HOST_IR_JIT}>:LLVM_JIT>
)
set_target_properties(nvfuser_codegen PROPERTIES
C_STANDARD ${NVFUSER_C_STANDARD}
Expand Down Expand Up @@ -1232,13 +1240,15 @@ if(BUILD_TEST)
add_test(test_host_ir "${HOSTIR_TEST_SRCS}" "")
list(APPEND TEST_BINARIES test_host_ir)

set(LLVM_COMPILE_TEST_SRCS)
list(APPEND LLVM_COMPILE_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_host_ir_jit.cpp
)
add_test(test_host_ir_jit "${LLVM_COMPILE_TEST_SRCS}" "")
target_link_libraries(test_host_ir_jit PUBLIC LLVM_JIT)
list(APPEND TEST_BINARIES test_host_ir_jit)
if(NVFUSER_HOST_IR_JIT)
set(LLVM_COMPILE_TEST_SRCS)
list(APPEND LLVM_COMPILE_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_host_ir_jit.cpp
)
add_test(test_host_ir_jit "${LLVM_COMPILE_TEST_SRCS}" "")
target_link_libraries(test_host_ir_jit PUBLIC LLVM_JIT)
list(APPEND TEST_BINARIES test_host_ir_jit)
endif()


# We don't link CUPTI for MSVC
Expand Down Expand Up @@ -1477,6 +1487,7 @@ message(STATUS " UCC_FOUND: ${UCC_FOUND}")
message(STATUS " NVFUSER_STANDALONE_BUILD_WITH_UCC : ${NVFUSER_STANDALONE_BUILD_WITH_UCC}")
message(STATUS " NVFUSER_BUILD_WITH_ASAN : ${NVFUSER_BUILD_WITH_ASAN}")
message(STATUS " NVFUSER_DISTRIBUTED : ${NVFUSER_DISTRIBUTED}")
message(STATUS " NVFUSER_HOST_IR_JIT : ${NVFUSER_HOST_IR_JIT}")
message(STATUS " NVFUSER_CPP_STANDARD : ${NVFUSER_CPP_STANDARD}")
message(STATUS " NVMMH_INCLUDE_DIR : ${NVMMH_INCLUDE_DIR}")

Expand Down
4 changes: 2 additions & 2 deletions csrc/host_ir/jit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -492,8 +492,8 @@ void inferTensorShapesAndStrides(
// Check if sizes and strides are the same size as logical domain
const auto logical_ndims =
std::ranges::distance(logical_domain | TensorDomain::kNoReductions);
NVF_ERROR_EQ(std::ssize(sizes), logical_ndims);
NVF_ERROR_EQ(std::ssize(strides), logical_ndims);
NVF_ERROR_EQ(sizes.size(), logical_ndims);
NVF_ERROR_EQ(strides.size(), logical_ndims);
}

void unpackInputs(
Expand Down
1 change: 0 additions & 1 deletion csrc/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,6 @@ const std::unordered_map<std::string, EnableOption>& getEnableOptions() {
{"warn_register_spill", EnableOption::WarnRegisterSpill},
{"ws_normalization", EnableOption::WarpSpecializedNormalization},
{"host_ir_lowering", EnableOption::HostIrLowering},
{"host_ir_jit", EnableOption::HostIrJit},
{"insert_resharding_after", EnableOption::InsertReshardingAfter},
{"fast_math", EnableOption::FastMath},
};
Expand Down
1 change: 0 additions & 1 deletion csrc/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ enum class EnableOption {
WarnRegisterSpill, //! Enable warnings of register spill
WarpSpecializedNormalization, //! Enable warp specialized persistent kernel
HostIrLowering, //! Enable FusionKernelRuntime lowering to host IR
HostIrJit, //! Enable Host IR JIT compilation with LLVM
InsertReshardingAfter, //! Insert resharding set after the expression
FastMath, //! Enable fast math optimizations (--use_fast_math)
EndOfOption //! Placeholder for counting the number of elements
Expand Down
33 changes: 18 additions & 15 deletions csrc/runtime/fusion_kernel_runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,11 @@ void FusionKernelRuntime::evictCache(size_t input_id) {

bool FusionKernelRuntime::isCompiled() const {
if (isOptionEnabled(EnableOption::HostIrLowering)) {
return hij_ != nullptr || hie_ != nullptr;
#ifdef NVFUSER_HOST_IR_JIT
return hij_ != nullptr;
#else
return hie_ != nullptr;
#endif
} else {
std::lock_guard<std::mutex> guard(mutex_);
return std::all_of(
Expand Down Expand Up @@ -295,14 +299,13 @@ KernelArgumentHolder FusionKernelRuntime::runWithInputs(
<< std::endl;
}

KernelArgumentHolder outputs;
if (hij_ != nullptr) {
outputs = hij_->runWithInputs(args);
} else if (hie_ != nullptr) {
outputs = hie_->runWithInputs(args);
} else {
NVF_THROW("Neither Host IR JIT or Host IR Evaluator are initialized.");
}
#ifdef NVFUSER_HOST_IR_JIT
auto outputs =
hij_->runWithInputs(args); // TODO: change NVFUSER_HOST_IR_JIT flag to
// enableOption in the future.
#else
auto outputs = hie_->runWithInputs(args);
#endif

if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
debug() << "============= FINISHED RUNNING HOSTIR EVALUATOR ============"
Expand Down Expand Up @@ -469,12 +472,12 @@ void FusionKernelRuntime::compileFusionParallel(KernelArgumentHolder args) {
}
std::unique_ptr<hir::HostIrContainer> hic = lowerSegmentedFusionToHostIr(
*segmented_fusion_, launch_params_per_segment, executors_);
if (isOptionEnabled(EnableOption::HostIrJit)) {
hij_ = std::make_unique<HostIrJit>(std::move(hic));
} else {
hie_ = std::make_unique<hir::HostIrEvaluator>(
std::move(hic), &Communicator::getInstance());
}
#ifdef NVFUSER_HOST_IR_JIT
hij_ = std::make_unique<HostIrJit>(std::move(hic));
#else
hie_ = std::make_unique<hir::HostIrEvaluator>(
std::move(hic), &Communicator::getInstance());
#endif
}

if (isProfilerEnabled()) {
Expand Down
21 changes: 12 additions & 9 deletions csrc/runtime/fusion_kernel_runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@

#include <fusion_segmenter.h>
#include <host_ir/evaluator.h>
#ifdef NVFUSER_HOST_IR_JIT
#include <host_ir/jit.h>
#endif
#include <polymorphic_value.h>
#include <runtime/executor.h>
#include <runtime/executor_kernel_arg.h>
Expand Down Expand Up @@ -141,13 +143,11 @@ class FusionKernelRuntime {

//! Get the Host IR Container
const hir::HostIrContainer& getHostIrContainer() const {
if (isOptionEnabled(EnableOption::HostIrJit)) {
NVF_ERROR(hij_ != nullptr, "Host IR JIT is not initialized");
return hij_->container();
} else {
NVF_ERROR(hie_ != nullptr, "Host IR Evaluator is not initialized");
return hie_->container();
}
#ifdef NVFUSER_HOST_IR_JIT
return hij_->container();
#else
return hie_->container();
#endif
}

private:
Expand Down Expand Up @@ -189,10 +189,13 @@ class FusionKernelRuntime {
//! Executors holding compiled kernels
std::vector<std::unique_ptr<ExecutorAbstract>> executors_;

//! Host IR JIT (used when EnableOption::HostIrJit is set)
#ifdef NVFUSER_HOST_IR_JIT
//! Host IR JIT
std::unique_ptr<HostIrJit> hij_;
//! Host IR Evaluator (used when EnableOption::HostIrJit is not set)
#else
//! Host IR Evaluator
std::unique_ptr<hir::HostIrEvaluator> hie_;
#endif

// A metadata copy of initial arguments used to construct this
// FusionKernelRuntime. Used during deserialization to schedule the fusion
Expand Down
Loading