diff --git a/CMakeLists.txt b/CMakeLists.txt index 8dbecbe4381..8c807c93e85 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -418,38 +418,34 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/validator_utils.cpp ) -cmake_dependent_option(NVFUSER_HOST_IR_JIT "Build nvFuser with LLVM" ON "USE_HOST_IR_JIT" OFF) - - -message(STATUS "Setting NVFUSER_HOST_IR_JIT=${NVFUSER_HOST_IR_JIT}") - -if(NVFUSER_HOST_IR_JIT) - add_compile_definitions(NVFUSER_HOST_IR_JIT) - # Add LLVM JIT related dependencies - find_package(LLVM 18.1 REQUIRED CONFIG) - llvm_map_components_to_libnames(LLVM_LIBS - support - core - orcjit - executionengine - irreader - nativecodegen - Target - Analysis - JITLink - Demangle - ) +# Add LLVM JIT related dependencies +set(LLVM_MINIMUM_VERSION "18.1") +find_package(LLVM REQUIRED CONFIG) +if(${LLVM_VERSION} VERSION_LESS ${LLVM_MINIMUM_VERSION}) + message(FATAL_ERROR "LLVM ${LLVM_VERSION} does not meet the minimum version required: ${LLVM_MINIMUM_VERSION}") +endif() +llvm_map_components_to_libnames(LLVM_LIBS + support + core + orcjit + executionengine + irreader + nativecodegen + Target + Analysis + JITLink + Demangle +) - add_library(LLVM_JIT INTERFACE) - target_include_directories(LLVM_JIT INTERFACE ${LLVM_INCLUDE_DIRS}) - target_compile_definitions(LLVM_JIT INTERFACE ${LLVM_DEFINITIONS}) - target_link_libraries(LLVM_JIT INTERFACE ${LLVM_LIBS}) +add_library(LLVM_JIT INTERFACE) +target_include_directories(LLVM_JIT SYSTEM INTERFACE ${LLVM_INCLUDE_DIRS}) +target_compile_definitions(LLVM_JIT INTERFACE ${LLVM_DEFINITIONS}) +target_link_libraries(LLVM_JIT INTERFACE ${LLVM_LIBS}) - # Add LLVM JIT related sources - list(APPEND NVFUSER_SRCS - ${NVFUSER_SRCS_DIR}/host_ir/jit.cpp - ) -endif() +# Add LLVM JIT related sources +list(APPEND NVFUSER_SRCS + ${NVFUSER_SRCS_DIR}/host_ir/jit.cpp +) # We don't link CUPTI for MSVC if(NOT MSVC) @@ -545,9 +541,7 @@ if (BUILD_CUTLASS AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) 
target_compile_definitions(codegen_internal PRIVATE "-DNVFUSER_CUTLASS_KERNEL_ENABLED") endif() -if(NVFUSER_HOST_IR_JIT) - target_link_libraries(codegen_internal PUBLIC LLVM_JIT) -endif() +target_link_libraries(codegen_internal PUBLIC LLVM_JIT) add_library(nvfuser_codegen SHARED $) @@ -582,8 +576,7 @@ target_include_directories(nvfuser_codegen SYSTEM PUBLIC ) target_link_libraries(nvfuser_codegen PUBLIC ${TORCH_LIBRARIES} - PRIVATE dynamic_type flatbuffers ${CUDA_NVRTC_LIB} CUDA::cupti dl - $<$:LLVM_JIT> + PRIVATE dynamic_type flatbuffers ${CUDA_NVRTC_LIB} CUDA::cupti dl LLVM_JIT ) set_target_properties(nvfuser_codegen PROPERTIES C_STANDARD ${NVFUSER_C_STANDARD} @@ -1240,15 +1233,13 @@ if(BUILD_TEST) add_test(test_host_ir "${HOSTIR_TEST_SRCS}" "") list(APPEND TEST_BINARIES test_host_ir) - if(NVFUSER_HOST_IR_JIT) - set(LLVM_COMPILE_TEST_SRCS) - list(APPEND LLVM_COMPILE_TEST_SRCS - ${NVFUSER_ROOT}/tests/cpp/test_host_ir_jit.cpp - ) - add_test(test_host_ir_jit "${LLVM_COMPILE_TEST_SRCS}" "") - target_link_libraries(test_host_ir_jit PUBLIC LLVM_JIT) - list(APPEND TEST_BINARIES test_host_ir_jit) - endif() + set(LLVM_COMPILE_TEST_SRCS) + list(APPEND LLVM_COMPILE_TEST_SRCS + ${NVFUSER_ROOT}/tests/cpp/test_host_ir_jit.cpp + ) + add_test(test_host_ir_jit "${LLVM_COMPILE_TEST_SRCS}" "") + target_link_libraries(test_host_ir_jit PUBLIC LLVM_JIT) + list(APPEND TEST_BINARIES test_host_ir_jit) # We don't link CUPTI for MSVC @@ -1493,7 +1484,6 @@ endif() message(STATUS " NVFUSER_STANDALONE_BUILD_WITH_UCC : ${NVFUSER_STANDALONE_BUILD_WITH_UCC}") message(STATUS " NVFUSER_BUILD_WITH_ASAN : ${NVFUSER_BUILD_WITH_ASAN}") message(STATUS " NVFUSER_DISTRIBUTED : ${NVFUSER_DISTRIBUTED}") -message(STATUS " NVFUSER_HOST_IR_JIT : ${NVFUSER_HOST_IR_JIT}") message(STATUS " NVFUSER_CPP_STANDARD : ${NVFUSER_CPP_STANDARD}") message(STATUS " NVMMH_INCLUDE_DIR : ${NVMMH_INCLUDE_DIR}") message(STATUS "******** End of Nvfuser configuration summary ********") diff --git a/csrc/host_ir/jit.cpp 
b/csrc/host_ir/jit.cpp index a4188c93587..8dec5c1afe3 100644 --- a/csrc/host_ir/jit.cpp +++ b/csrc/host_ir/jit.cpp @@ -492,8 +492,8 @@ void inferTensorShapesAndStrides( // Check if sizes and strides are the same size as logical domain const auto logical_ndims = std::ranges::distance(logical_domain | TensorDomain::kNoReductions); - NVF_ERROR_EQ(sizes.size(), logical_ndims); - NVF_ERROR_EQ(strides.size(), logical_ndims); + NVF_ERROR_EQ(std::ssize(sizes), logical_ndims); + NVF_ERROR_EQ(std::ssize(strides), logical_ndims); } void unpackInputs( diff --git a/csrc/options.cpp b/csrc/options.cpp index 65085e6d384..6e424d5e316 100644 --- a/csrc/options.cpp +++ b/csrc/options.cpp @@ -177,6 +177,7 @@ const std::unordered_map& getEnableOptions() { {"warn_register_spill", EnableOption::WarnRegisterSpill}, {"ws_normalization", EnableOption::WarpSpecializedNormalization}, {"host_ir_lowering", EnableOption::HostIrLowering}, + {"host_ir_jit", EnableOption::HostIrJit}, {"insert_resharding_after", EnableOption::InsertReshardingAfter}, {"fast_math", EnableOption::FastMath}, {"p2p_protocol", EnableOption::P2pProtocol}, diff --git a/csrc/options.h b/csrc/options.h index 335943b5441..4ba316242ad 100644 --- a/csrc/options.h +++ b/csrc/options.h @@ -121,6 +121,7 @@ enum class EnableOption { WarnRegisterSpill, //! Enable warnings of register spill WarpSpecializedNormalization, //! Enable warp specialized persistent kernel HostIrLowering, //! Enable FusionKernelRuntime lowering to host IR + HostIrJit, //! Enable Host IR JIT compilation with LLVM InsertReshardingAfter, //! Insert resharding set after the expression FastMath, //! Enable fast math optimizations (--use_fast_math) P2pProtocol, //! 
Prescribe P2P protocol: put|get diff --git a/csrc/runtime/fusion_kernel_runtime.cpp b/csrc/runtime/fusion_kernel_runtime.cpp index 4041c96a2b4..962b540d675 100644 --- a/csrc/runtime/fusion_kernel_runtime.cpp +++ b/csrc/runtime/fusion_kernel_runtime.cpp @@ -157,11 +157,7 @@ void FusionKernelRuntime::evictCache(size_t input_id) { bool FusionKernelRuntime::isCompiled() const { if (isOptionEnabled(EnableOption::HostIrLowering)) { -#ifdef NVFUSER_HOST_IR_JIT - return hij_ != nullptr; -#else - return hie_ != nullptr; -#endif + return hij_ != nullptr || hie_ != nullptr; } else { std::lock_guard guard(mutex_); return std::all_of( @@ -299,13 +295,14 @@ KernelArgumentHolder FusionKernelRuntime::runWithInputs( << std::endl; } -#ifdef NVFUSER_HOST_IR_JIT - auto outputs = - hij_->runWithInputs(args); // TODO: change NVFUSER_HOST_IR_JIT flag to - // enableOption in the future. -#else - auto outputs = hie_->runWithInputs(args); -#endif + KernelArgumentHolder outputs; + if (hij_ != nullptr) { + outputs = hij_->runWithInputs(args); + } else if (hie_ != nullptr) { + outputs = hie_->runWithInputs(args); + } else { + NVF_THROW("Neither Host IR JIT or Host IR Evaluator are initialized."); + } if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { debug() << "============= FINISHED RUNNING HOSTIR EVALUATOR ============" @@ -472,12 +469,12 @@ void FusionKernelRuntime::compileFusionParallel(KernelArgumentHolder args) { } std::unique_ptr hic = lowerSegmentedFusionToHostIr( *segmented_fusion_, launch_params_per_segment, executors_); -#ifdef NVFUSER_HOST_IR_JIT - hij_ = std::make_unique(std::move(hic)); -#else - hie_ = std::make_unique( - std::move(hic), &Communicator::getInstance()); -#endif + if (isOptionEnabled(EnableOption::HostIrJit)) { + hij_ = std::make_unique(std::move(hic)); + } else { + hie_ = std::make_unique( + std::move(hic), &Communicator::getInstance()); + } } if (isProfilerEnabled()) { diff --git a/csrc/runtime/fusion_kernel_runtime.h 
b/csrc/runtime/fusion_kernel_runtime.h index 6a16ee27a7f..31965df07c2 100644 --- a/csrc/runtime/fusion_kernel_runtime.h +++ b/csrc/runtime/fusion_kernel_runtime.h @@ -11,9 +11,7 @@ #include #include -#ifdef NVFUSER_HOST_IR_JIT #include -#endif #include #include #include @@ -143,11 +141,13 @@ class FusionKernelRuntime { //! Get the Host IR Container const hir::HostIrContainer& getHostIrContainer() const { -#ifdef NVFUSER_HOST_IR_JIT - return hij_->container(); -#else - return hie_->container(); -#endif + if (isOptionEnabled(EnableOption::HostIrJit)) { + NVF_ERROR(hij_ != nullptr, "Host IR JIT is not initialized"); + return hij_->container(); + } else { + NVF_ERROR(hie_ != nullptr, "Host IR Evaluator is not initialized"); + return hie_->container(); + } } private: @@ -189,13 +189,10 @@ class FusionKernelRuntime { //! Executors holding compiled kernels std::vector> executors_; -#ifdef NVFUSER_HOST_IR_JIT - //! Host IR JIT + //! Host IR JIT (used when EnableOption::HostIrJit is set) std::unique_ptr hij_; -#else - //! Host IR Evaluator + //! Host IR Evaluator (used when EnableOption::HostIrJit is not set) std::unique_ptr hie_; -#endif // A metadata copy of initial arguments used to contruct this // FusionKernelRuntime. Used during deserialization to schedule the fusion diff --git a/doc/dev/host_ir_jit.md b/doc/dev/host_ir_jit.md index 833536342cd..4a10292f333 100644 --- a/doc/dev/host_ir_jit.md +++ b/doc/dev/host_ir_jit.md @@ -98,21 +98,18 @@ KernelArgumentHolder HostIrJitImpl::runWithInputs(const KernelArgumentHolder& ar ``` *Detailed Implementation:* https://github.com/NVIDIA/Fuser/blob/3ac1a4697b6b5c31e4dbb9763b3b6db2f0e0164b/csrc/host_ir/jit.cpp#L1399-L1453 -## Configuration and Build Options -Building nvFuser project with `NVFUSER_BUILD_HOST_IR_JIT=1` will enables Host IR JIT as default runtime in Host IR execution path. -Otherwise the default runtime is Host IR Evaluator. 
In the future, when llvm is fully supported in all build machines, we are able -to get rid of this opt-in flag and rather use `enableOption` to control backend switching after build is done. - -Sample build -```python -NVFUSER_BUILD_HOST_IR_JIT=1 pip install --no-build-isolation -e python -v -``` -or -```python -NVFUSER_BUILD_HOST_IR_JIT=1 _bn -``` +## Configuration and Runtime Options + +### Build Requirements +**LLVM 18.1+ is required** to build nvFuser. You can switch between Host IR JIT and Host IR Evaluator at runtime. + +### Runtime Configuration +You can enable Host IR JIT via the runtime option `EnableOption::HostIrJit` or the environment variable `NVFUSER_ENABLE="host_ir_jit"`. + +When `host_ir_jit` is enabled, the runtime uses LLVM ORC JIT for low-latency host execution. When disabled, it falls back to the Host IR Evaluator. + ## Future Integration plan -We plan to turn on host IR JIT by default after its function and performance are on par. +We plan to turn on host IR JIT by default after its functionality and performance are on par. 
Known missing supports and bugs are: **Ops need to be supported:** diff --git a/python/utils.py b/python/utils.py index 2654e5f4e82..32c02fb491e 100644 --- a/python/utils.py +++ b/python/utils.py @@ -25,7 +25,6 @@ class BuildConfig: build_with_asan: bool = False build_without_distributed: bool = False explicit_error_check: bool = False - build_with_host_ir_jit: bool = False overwrite_version: bool = False version_tag: str = None build_type: str = "Release" @@ -98,12 +97,6 @@ def parse_args(): action="store_true", help="Build nvfuser with UCC support", ) - parser.add_argument( - "--build-with-host-ir-jit", - dest="build_with_host_ir_jit", - action="store_true", - help="Build nvfuser with Host IR JIT support", - ) parser.add_argument( "--explicit-error-check", dest="explicit_error_check", @@ -206,7 +199,6 @@ def create_build_config(): no_benchmark=args.no_benchmark, no_ninja=args.no_ninja, build_with_ucc=args.build_with_ucc, - build_with_host_ir_jit=args.build_with_host_ir_jit, build_with_asan=args.build_with_asan, build_without_distributed=args.build_without_distributed, explicit_error_check=args.explicit_error_check, @@ -252,8 +244,6 @@ def override_build_config_from_env(config): config.no_ninja = get_env_flag_bool("NVFUSER_BUILD_NO_NINJA") if "NVFUSER_BUILD_WITH_UCC" in os.environ: config.build_with_ucc = get_env_flag_bool("NVFUSER_BUILD_WITH_UCC") - if "NVFUSER_BUILD_HOST_IR_JIT" in os.environ: - config.build_with_host_ir_jit = get_env_flag_bool("NVFUSER_BUILD_HOST_IR_JIT") if "NVFUSER_BUILD_WITH_ASAN" in os.environ: config.build_with_asan = get_env_flag_bool("NVFUSER_BUILD_WITH_ASAN") if "NVFUSER_BUILD_WITHOUT_DISTRIBUTED" in os.environ: @@ -483,7 +473,6 @@ def on_or_off(flag: bool) -> str: f"-DPython_EXECUTABLE={sys.executable}", f"-DBUILD_NVFUSER_BENCHMARK={on_or_off(not config.no_benchmark)}", f"-DNVFUSER_DISTRIBUTED={on_or_off(not config.build_without_distributed)}", - f"-DUSE_HOST_IR_JIT={on_or_off(config.build_with_host_ir_jit)}", 
f"-DCUTLASS_MAX_JOBS={config.cutlass_max_jobs}", "-B", cmake_build_dir, diff --git a/tests/cpp/test_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp index b578cd17d8a..01614755adb 100644 --- a/tests/cpp/test_host_ir_integration.cpp +++ b/tests/cpp/test_host_ir_integration.cpp @@ -29,6 +29,7 @@ class HostIrIntegrationTest : public NVFuserTest { protected: HostIrIntegrationTest() { EnableOptionsGuard::getCurOptions().set(EnableOption::HostIrLowering); + EnableOptionsGuard::getCurOptions().set(EnableOption::HostIrJit); } }; diff --git a/tests/cpp/test_host_ir_jit.cpp b/tests/cpp/test_host_ir_jit.cpp index a3b5f13d619..9414e07a051 100644 --- a/tests/cpp/test_host_ir_jit.cpp +++ b/tests/cpp/test_host_ir_jit.cpp @@ -20,7 +20,12 @@ namespace nvfuser { namespace hir { -using HostIrJitTest = NVFuserTest; +class HostIrJitTest : public NVFuserTest { + protected: + HostIrJitTest() { + EnableOptionsGuard::getCurOptions().set(EnableOption::HostIrJit); + } +}; // Build with: python setup.py install --build-with-host-ir-jit TEST_F(HostIrJitTest, Set) { auto hic = std::make_unique(); @@ -333,7 +338,7 @@ TEST_F(HostIrJitTest, Matmul) { HostIrJit jit(std::move(hic)); - auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat); + auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(at::kFloat); at::Tensor t0 = at::randn({H, M, K}, options); at::Tensor t1 = at::randn({H, K, N}, options); at::Tensor t2 = at::randn({H, M, N}, options); @@ -377,7 +382,7 @@ TEST_F(HostIrJitTest, MatmulOut) { HostIrJit jit(std::move(hic)); - auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat); + auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(at::kFloat); at::Tensor t0 = at::randn({H, M, K}, options); at::Tensor t1 = at::randn({H, K, N}, options); std::unordered_map concrete_input_buffers = { @@ -428,7 +433,7 @@ TEST_F(HostIrJitTest, Linear) { HostIrJit jit(std::move(hic)); - auto options = 
at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat); + auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(at::kFloat); auto in_at = at::randint(5, {B, M, K}, options); auto weight_at = at::randint(5, {N, K}, options); auto bias_at = at::randint(5, {N}, options);