Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 35 additions & 45 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -418,38 +418,34 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/validator_utils.cpp
)

cmake_dependent_option(NVFUSER_HOST_IR_JIT "Build nvFuser with LLVM" ON "USE_HOST_IR_JIT" OFF)


message(STATUS "Setting NVFUSER_HOST_IR_JIT=${NVFUSER_HOST_IR_JIT}")

if(NVFUSER_HOST_IR_JIT)
add_compile_definitions(NVFUSER_HOST_IR_JIT)
# Add LLVM JIT related dependencies
find_package(LLVM 18.1 REQUIRED CONFIG)
llvm_map_components_to_libnames(LLVM_LIBS
support
core
orcjit
executionengine
irreader
nativecodegen
Target
Analysis
JITLink
Demangle
)
# Add LLVM JIT related dependencies
set(LLVM_MINIMUM_VERSION "18.1")
find_package(LLVM REQUIRED CONFIG)
if(${LLVM_VERSION} VERSION_LESS ${LLVM_MINIMUM_VERSION})
message(FATAL_ERROR "LLVM ${LLVM_VERSION} does not meet the minimum version required: ${LLVM_MINIMUM_VERSION}")
endif()
llvm_map_components_to_libnames(LLVM_LIBS
support
core
orcjit
executionengine
irreader
nativecodegen
Target
Analysis
JITLink
Demangle
)

add_library(LLVM_JIT INTERFACE)
target_include_directories(LLVM_JIT INTERFACE ${LLVM_INCLUDE_DIRS})
target_compile_definitions(LLVM_JIT INTERFACE ${LLVM_DEFINITIONS})
target_link_libraries(LLVM_JIT INTERFACE ${LLVM_LIBS})
add_library(LLVM_JIT INTERFACE)
target_include_directories(LLVM_JIT SYSTEM INTERFACE ${LLVM_INCLUDE_DIRS})
target_compile_definitions(LLVM_JIT INTERFACE ${LLVM_DEFINITIONS})
target_link_libraries(LLVM_JIT INTERFACE ${LLVM_LIBS})

# Add LLVM JIT related sources
list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/host_ir/jit.cpp
)
endif()
# Add LLVM JIT related sources
list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/host_ir/jit.cpp
)

# We don't link CUPTI for MSVC
if(NOT MSVC)
Expand Down Expand Up @@ -545,9 +541,7 @@ if (BUILD_CUTLASS AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
target_compile_definitions(codegen_internal PRIVATE "-DNVFUSER_CUTLASS_KERNEL_ENABLED")
endif()

if(NVFUSER_HOST_IR_JIT)
target_link_libraries(codegen_internal PUBLIC LLVM_JIT)
endif()
target_link_libraries(codegen_internal PUBLIC LLVM_JIT)

add_library(nvfuser_codegen SHARED $<TARGET_OBJECTS:codegen_internal>)

Expand Down Expand Up @@ -582,8 +576,7 @@ target_include_directories(nvfuser_codegen SYSTEM PUBLIC
)
target_link_libraries(nvfuser_codegen
PUBLIC ${TORCH_LIBRARIES}
PRIVATE dynamic_type flatbuffers ${CUDA_NVRTC_LIB} CUDA::cupti dl
$<$<BOOL:${NVFUSER_HOST_IR_JIT}>:LLVM_JIT>
PRIVATE dynamic_type flatbuffers ${CUDA_NVRTC_LIB} CUDA::cupti dl LLVM_JIT
)
set_target_properties(nvfuser_codegen PROPERTIES
C_STANDARD ${NVFUSER_C_STANDARD}
Expand Down Expand Up @@ -1240,15 +1233,13 @@ if(BUILD_TEST)
add_test(test_host_ir "${HOSTIR_TEST_SRCS}" "")
list(APPEND TEST_BINARIES test_host_ir)

if(NVFUSER_HOST_IR_JIT)
set(LLVM_COMPILE_TEST_SRCS)
list(APPEND LLVM_COMPILE_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_host_ir_jit.cpp
)
add_test(test_host_ir_jit "${LLVM_COMPILE_TEST_SRCS}" "")
target_link_libraries(test_host_ir_jit PUBLIC LLVM_JIT)
list(APPEND TEST_BINARIES test_host_ir_jit)
endif()
set(LLVM_COMPILE_TEST_SRCS)
list(APPEND LLVM_COMPILE_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_host_ir_jit.cpp
)
add_test(test_host_ir_jit "${LLVM_COMPILE_TEST_SRCS}" "")
target_link_libraries(test_host_ir_jit PUBLIC LLVM_JIT)
list(APPEND TEST_BINARIES test_host_ir_jit)


# We don't link CUPTI for MSVC
Expand Down Expand Up @@ -1493,7 +1484,6 @@ endif()
message(STATUS " NVFUSER_STANDALONE_BUILD_WITH_UCC : ${NVFUSER_STANDALONE_BUILD_WITH_UCC}")
message(STATUS " NVFUSER_BUILD_WITH_ASAN : ${NVFUSER_BUILD_WITH_ASAN}")
message(STATUS " NVFUSER_DISTRIBUTED : ${NVFUSER_DISTRIBUTED}")
message(STATUS " NVFUSER_HOST_IR_JIT : ${NVFUSER_HOST_IR_JIT}")
message(STATUS " NVFUSER_CPP_STANDARD : ${NVFUSER_CPP_STANDARD}")
message(STATUS " NVMMH_INCLUDE_DIR : ${NVMMH_INCLUDE_DIR}")
message(STATUS "******** End of Nvfuser configuration summary ********")
4 changes: 2 additions & 2 deletions csrc/host_ir/jit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -492,8 +492,8 @@ void inferTensorShapesAndStrides(
// Check if sizes and strides are the same size as logical domain
const auto logical_ndims =
std::ranges::distance(logical_domain | TensorDomain::kNoReductions);
NVF_ERROR_EQ(sizes.size(), logical_ndims);
NVF_ERROR_EQ(strides.size(), logical_ndims);
NVF_ERROR_EQ(std::ssize(sizes), logical_ndims);
NVF_ERROR_EQ(std::ssize(strides), logical_ndims);
}

void unpackInputs(
Expand Down
1 change: 1 addition & 0 deletions csrc/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ const std::unordered_map<std::string, EnableOption>& getEnableOptions() {
{"warn_register_spill", EnableOption::WarnRegisterSpill},
{"ws_normalization", EnableOption::WarpSpecializedNormalization},
{"host_ir_lowering", EnableOption::HostIrLowering},
{"host_ir_jit", EnableOption::HostIrJit},
{"insert_resharding_after", EnableOption::InsertReshardingAfter},
{"fast_math", EnableOption::FastMath},
{"p2p_protocol", EnableOption::P2pProtocol},
Expand Down
1 change: 1 addition & 0 deletions csrc/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ enum class EnableOption {
WarnRegisterSpill, //! Enable warnings of register spill
WarpSpecializedNormalization, //! Enable warp specialized persistent kernel
HostIrLowering, //! Enable FusionKernelRuntime lowering to host IR
HostIrJit, //! Enable Host IR JIT compilation with LLVM
InsertReshardingAfter, //! Insert resharding set after the expression
FastMath, //! Enable fast math optimizations (--use_fast_math)
P2pProtocol, //! Prescribe P2P protocol: put|get
Expand Down
33 changes: 15 additions & 18 deletions csrc/runtime/fusion_kernel_runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,7 @@ void FusionKernelRuntime::evictCache(size_t input_id) {

bool FusionKernelRuntime::isCompiled() const {
if (isOptionEnabled(EnableOption::HostIrLowering)) {
#ifdef NVFUSER_HOST_IR_JIT
return hij_ != nullptr;
#else
return hie_ != nullptr;
#endif
return hij_ != nullptr || hie_ != nullptr;
} else {
std::lock_guard<std::mutex> guard(mutex_);
return std::all_of(
Expand Down Expand Up @@ -299,13 +295,14 @@ KernelArgumentHolder FusionKernelRuntime::runWithInputs(
<< std::endl;
}

#ifdef NVFUSER_HOST_IR_JIT
auto outputs =
hij_->runWithInputs(args); // TODO: change NVFUSER_HOST_IR_JIT flag to
// enableOption in the future.
#else
auto outputs = hie_->runWithInputs(args);
#endif
KernelArgumentHolder outputs;
if (hij_ != nullptr) {
outputs = hij_->runWithInputs(args);
} else if (hie_ != nullptr) {
outputs = hie_->runWithInputs(args);
} else {
NVF_THROW("Neither Host IR JIT or Host IR Evaluator are initialized.");
}

if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
debug() << "============= FINISHED RUNNING HOSTIR EVALUATOR ============"
Expand Down Expand Up @@ -472,12 +469,12 @@ void FusionKernelRuntime::compileFusionParallel(KernelArgumentHolder args) {
}
std::unique_ptr<hir::HostIrContainer> hic = lowerSegmentedFusionToHostIr(
*segmented_fusion_, launch_params_per_segment, executors_);
#ifdef NVFUSER_HOST_IR_JIT
hij_ = std::make_unique<HostIrJit>(std::move(hic));
#else
hie_ = std::make_unique<hir::HostIrEvaluator>(
std::move(hic), &Communicator::getInstance());
#endif
if (isOptionEnabled(EnableOption::HostIrJit)) {
hij_ = std::make_unique<HostIrJit>(std::move(hic));
} else {
hie_ = std::make_unique<hir::HostIrEvaluator>(
std::move(hic), &Communicator::getInstance());
}
}

if (isProfilerEnabled()) {
Expand Down
21 changes: 9 additions & 12 deletions csrc/runtime/fusion_kernel_runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,7 @@

#include <fusion_segmenter.h>
#include <host_ir/evaluator.h>
#ifdef NVFUSER_HOST_IR_JIT
#include <host_ir/jit.h>
#endif
#include <polymorphic_value.h>
#include <runtime/executor.h>
#include <runtime/executor_kernel_arg.h>
Expand Down Expand Up @@ -143,11 +141,13 @@ class FusionKernelRuntime {

//! Get the Host IR Container
const hir::HostIrContainer& getHostIrContainer() const {
#ifdef NVFUSER_HOST_IR_JIT
return hij_->container();
#else
return hie_->container();
#endif
if (isOptionEnabled(EnableOption::HostIrJit)) {
NVF_ERROR(hij_ != nullptr, "Host IR JIT is not initialized");
return hij_->container();
} else {
NVF_ERROR(hie_ != nullptr, "Host IR Evaluator is not initialized");
return hie_->container();
}
}

private:
Expand Down Expand Up @@ -189,13 +189,10 @@ class FusionKernelRuntime {
//! Executors holding compiled kernels
std::vector<std::unique_ptr<ExecutorAbstract>> executors_;

#ifdef NVFUSER_HOST_IR_JIT
//! Host IR JIT
//! Host IR JIT (used when EnableOption::HostIrJit is set)
std::unique_ptr<HostIrJit> hij_;
#else
//! Host IR Evaluator
//! Host IR Evaluator (used when EnableOption::HostIrJit is not set)
std::unique_ptr<hir::HostIrEvaluator> hie_;
#endif

// A metadata copy of initial arguments used to contruct this
// FusionKernelRuntime. Used during deserialization to schedule the fusion
Expand Down
25 changes: 11 additions & 14 deletions doc/dev/host_ir_jit.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,21 +98,18 @@ KernelArgumentHolder HostIrJitImpl::runWithInputs(const KernelArgumentHolder& ar
```
*Detailed Implementation:* https://github.com/NVIDIA/Fuser/blob/3ac1a4697b6b5c31e4dbb9763b3b6db2f0e0164b/csrc/host_ir/jit.cpp#L1399-L1453

## Configuration and Build Options
Building nvFuser project with `NVFUSER_BUILD_HOST_IR_JIT=1` will enables Host IR JIT as default runtime in Host IR execution path.
Otherwise the default runtime is Host IR Evaluator. In the future, when llvm is fully supported in all build machines, we are able
to get rid of this opt-in flag and rather use `enableOption` to control backend switching after build is done.

Sample build
```python
NVFUSER_BUILD_HOST_IR_JIT=1 pip install --no-build-isolation -e python -v
```
or
```python
NVFUSER_BUILD_HOST_IR_JIT=1 _bn
```
## Configuration and Runtime Options

### Build Requirements
**LLVM 18.1+ is required** to build nvFuser. You can switch between Host IR JIT and Host IR Evaluator at runtime.

### Runtime Configuration
You can enable Host IR JIT via runtime option `EnableOption::HostIrJit` or environment `NVFUSER_ENABLE="host_ir_jit"`.

When `host_ir_jit` is enabled, the runtime uses LLVM ORC JIT for low-latency host execution. When disabled, it falls back to the Host IR Evaluator.

## Future Integration plan
We plan to turn on host IR JIT by default after its function and performance are on par.
We plan to turn on host IR JIT by default after its functionality and performance are on par.
Known missing features and bugs are:

**Ops need to be supported:**
Expand Down
11 changes: 0 additions & 11 deletions python/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ class BuildConfig:
build_with_asan: bool = False
build_without_distributed: bool = False
explicit_error_check: bool = False
build_with_host_ir_jit: bool = False
overwrite_version: bool = False
version_tag: str = None
build_type: str = "Release"
Expand Down Expand Up @@ -98,12 +97,6 @@ def parse_args():
action="store_true",
help="Build nvfuser with UCC support",
)
parser.add_argument(
"--build-with-host-ir-jit",
dest="build_with_host_ir_jit",
action="store_true",
help="Build nvfuser with Host IR JIT support",
)
parser.add_argument(
"--explicit-error-check",
dest="explicit_error_check",
Expand Down Expand Up @@ -206,7 +199,6 @@ def create_build_config():
no_benchmark=args.no_benchmark,
no_ninja=args.no_ninja,
build_with_ucc=args.build_with_ucc,
build_with_host_ir_jit=args.build_with_host_ir_jit,
build_with_asan=args.build_with_asan,
build_without_distributed=args.build_without_distributed,
explicit_error_check=args.explicit_error_check,
Expand Down Expand Up @@ -252,8 +244,6 @@ def override_build_config_from_env(config):
config.no_ninja = get_env_flag_bool("NVFUSER_BUILD_NO_NINJA")
if "NVFUSER_BUILD_WITH_UCC" in os.environ:
config.build_with_ucc = get_env_flag_bool("NVFUSER_BUILD_WITH_UCC")
if "NVFUSER_BUILD_HOST_IR_JIT" in os.environ:
config.build_with_host_ir_jit = get_env_flag_bool("NVFUSER_BUILD_HOST_IR_JIT")
if "NVFUSER_BUILD_WITH_ASAN" in os.environ:
config.build_with_asan = get_env_flag_bool("NVFUSER_BUILD_WITH_ASAN")
if "NVFUSER_BUILD_WITHOUT_DISTRIBUTED" in os.environ:
Expand Down Expand Up @@ -483,7 +473,6 @@ def on_or_off(flag: bool) -> str:
f"-DPython_EXECUTABLE={sys.executable}",
f"-DBUILD_NVFUSER_BENCHMARK={on_or_off(not config.no_benchmark)}",
f"-DNVFUSER_DISTRIBUTED={on_or_off(not config.build_without_distributed)}",
f"-DUSE_HOST_IR_JIT={on_or_off(config.build_with_host_ir_jit)}",
f"-DCUTLASS_MAX_JOBS={config.cutlass_max_jobs}",
"-B",
cmake_build_dir,
Expand Down
1 change: 1 addition & 0 deletions tests/cpp/test_host_ir_integration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class HostIrIntegrationTest : public NVFuserTest {
protected:
HostIrIntegrationTest() {
EnableOptionsGuard::getCurOptions().set(EnableOption::HostIrLowering);
EnableOptionsGuard::getCurOptions().set(EnableOption::HostIrJit);
}
};

Expand Down
13 changes: 9 additions & 4 deletions tests/cpp/test_host_ir_jit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@ namespace nvfuser {

namespace hir {

using HostIrJitTest = NVFuserTest;
class HostIrJitTest : public NVFuserTest {
protected:
HostIrJitTest() {
EnableOptionsGuard::getCurOptions().set(EnableOption::HostIrJit);
}
};
// Enable at runtime with NVFUSER_ENABLE="host_ir_jit" (the old
// --build-with-host-ir-jit build flag has been removed).
TEST_F(HostIrJitTest, Set) {
auto hic = std::make_unique<HostIrContainer>();
Expand Down Expand Up @@ -333,7 +338,7 @@ TEST_F(HostIrJitTest, Matmul) {

HostIrJit jit(std::move(hic));

auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat);
auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(at::kFloat);
at::Tensor t0 = at::randn({H, M, K}, options);
at::Tensor t1 = at::randn({H, K, N}, options);
at::Tensor t2 = at::randn({H, M, N}, options);
Expand Down Expand Up @@ -377,7 +382,7 @@ TEST_F(HostIrJitTest, MatmulOut) {

HostIrJit jit(std::move(hic));

auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat);
auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(at::kFloat);
at::Tensor t0 = at::randn({H, M, K}, options);
at::Tensor t1 = at::randn({H, K, N}, options);
std::unordered_map<Val*, PolymorphicValue> concrete_input_buffers = {
Expand Down Expand Up @@ -428,7 +433,7 @@ TEST_F(HostIrJitTest, Linear) {

HostIrJit jit(std::move(hic));

auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat);
auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(at::kFloat);
auto in_at = at::randint(5, {B, M, K}, options);
auto weight_at = at::randint(5, {N, K}, options);
auto bias_at = at::randint(5, {N}, options);
Expand Down
Loading