Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 35 additions & 45 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -418,38 +418,34 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/validator_utils.cpp
)

cmake_dependent_option(NVFUSER_HOST_IR_JIT "Build nvFuser with LLVM" ON "USE_HOST_IR_JIT" OFF)


message(STATUS "Setting NVFUSER_HOST_IR_JIT=${NVFUSER_HOST_IR_JIT}")

if(NVFUSER_HOST_IR_JIT)
add_compile_definitions(NVFUSER_HOST_IR_JIT)
# Add LLVM JIT related dependencies
find_package(LLVM 18.1 REQUIRED CONFIG)
llvm_map_components_to_libnames(LLVM_LIBS
support
core
orcjit
executionengine
irreader
nativecodegen
Target
Analysis
JITLink
Demangle
)
# Add LLVM JIT related dependencies
set(LLVM_MINIMUM_VERSION "18.1")
find_package(LLVM REQUIRED CONFIG)
if(${LLVM_VERSION} VERSION_LESS ${LLVM_MINIMUM_VERSION})
message(FATAL_ERROR "LLVM ${LLVM_VERSION} does not meet the minimum version required: ${LLVM_MINIMUM_VERSION}")
endif()
llvm_map_components_to_libnames(LLVM_LIBS
support
core
orcjit
executionengine
irreader
nativecodegen
Target
Analysis
JITLink
Demangle
)

add_library(LLVM_JIT INTERFACE)
target_include_directories(LLVM_JIT INTERFACE ${LLVM_INCLUDE_DIRS})
target_compile_definitions(LLVM_JIT INTERFACE ${LLVM_DEFINITIONS})
target_link_libraries(LLVM_JIT INTERFACE ${LLVM_LIBS})
add_library(LLVM_JIT INTERFACE)
target_include_directories(LLVM_JIT SYSTEM INTERFACE ${LLVM_INCLUDE_DIRS})
target_compile_definitions(LLVM_JIT INTERFACE ${LLVM_DEFINITIONS})
target_link_libraries(LLVM_JIT INTERFACE ${LLVM_LIBS})

# Add LLVM JIT related sources
list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/host_ir/jit.cpp
)
endif()
# Add LLVM JIT related sources
list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/host_ir/jit.cpp
)

# We don't link CUPTI for MSVC
if(NOT MSVC)
Expand Down Expand Up @@ -545,9 +541,7 @@ if (BUILD_CUTLASS AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
target_compile_definitions(codegen_internal PRIVATE "-DNVFUSER_CUTLASS_KERNEL_ENABLED")
endif()

if(NVFUSER_HOST_IR_JIT)
target_link_libraries(codegen_internal PUBLIC LLVM_JIT)
endif()
target_link_libraries(codegen_internal PUBLIC LLVM_JIT)

add_library(nvfuser_codegen SHARED $<TARGET_OBJECTS:codegen_internal>)

Expand Down Expand Up @@ -582,8 +576,7 @@ target_include_directories(nvfuser_codegen SYSTEM PUBLIC
)
target_link_libraries(nvfuser_codegen
PUBLIC ${TORCH_LIBRARIES}
PRIVATE dynamic_type flatbuffers ${CUDA_NVRTC_LIB} CUDA::cupti dl
$<$<BOOL:${NVFUSER_HOST_IR_JIT}>:LLVM_JIT>
PRIVATE dynamic_type flatbuffers ${CUDA_NVRTC_LIB} CUDA::cupti dl LLVM_JIT
)
set_target_properties(nvfuser_codegen PROPERTIES
C_STANDARD ${NVFUSER_C_STANDARD}
Expand Down Expand Up @@ -1240,15 +1233,13 @@ if(BUILD_TEST)
add_test(test_host_ir "${HOSTIR_TEST_SRCS}" "")
list(APPEND TEST_BINARIES test_host_ir)

if(NVFUSER_HOST_IR_JIT)
set(LLVM_COMPILE_TEST_SRCS)
list(APPEND LLVM_COMPILE_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_host_ir_jit.cpp
)
add_test(test_host_ir_jit "${LLVM_COMPILE_TEST_SRCS}" "")
target_link_libraries(test_host_ir_jit PUBLIC LLVM_JIT)
list(APPEND TEST_BINARIES test_host_ir_jit)
endif()
set(LLVM_COMPILE_TEST_SRCS)
list(APPEND LLVM_COMPILE_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_host_ir_jit.cpp
)
add_test(test_host_ir_jit "${LLVM_COMPILE_TEST_SRCS}" "")
target_link_libraries(test_host_ir_jit PUBLIC LLVM_JIT)
list(APPEND TEST_BINARIES test_host_ir_jit)


# We don't link CUPTI for MSVC
Expand Down Expand Up @@ -1493,7 +1484,6 @@ endif()
message(STATUS " NVFUSER_STANDALONE_BUILD_WITH_UCC : ${NVFUSER_STANDALONE_BUILD_WITH_UCC}")
message(STATUS " NVFUSER_BUILD_WITH_ASAN : ${NVFUSER_BUILD_WITH_ASAN}")
message(STATUS " NVFUSER_DISTRIBUTED : ${NVFUSER_DISTRIBUTED}")
message(STATUS " NVFUSER_HOST_IR_JIT : ${NVFUSER_HOST_IR_JIT}")
message(STATUS " NVFUSER_CPP_STANDARD : ${NVFUSER_CPP_STANDARD}")
message(STATUS " NVMMH_INCLUDE_DIR : ${NVMMH_INCLUDE_DIR}")
message(STATUS "******** End of Nvfuser configuration summary ********")
4 changes: 2 additions & 2 deletions csrc/host_ir/jit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -492,8 +492,8 @@ void inferTensorShapesAndStrides(
// Check if sizes and strides are the same size as logical domain
const auto logical_ndims =
std::ranges::distance(logical_domain | TensorDomain::kNoReductions);
NVF_ERROR_EQ(sizes.size(), logical_ndims);
NVF_ERROR_EQ(strides.size(), logical_ndims);
NVF_ERROR_EQ(std::ssize(sizes), logical_ndims);
NVF_ERROR_EQ(std::ssize(strides), logical_ndims);
}

void unpackInputs(
Expand Down
1 change: 1 addition & 0 deletions csrc/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ const std::unordered_map<std::string, EnableOption>& getEnableOptions() {
{"warn_register_spill", EnableOption::WarnRegisterSpill},
{"ws_normalization", EnableOption::WarpSpecializedNormalization},
{"host_ir_lowering", EnableOption::HostIrLowering},
{"host_ir_jit", EnableOption::HostIrJit},
{"insert_resharding_after", EnableOption::InsertReshardingAfter},
{"fast_math", EnableOption::FastMath},
{"p2p_protocol", EnableOption::P2pProtocol},
Expand Down
1 change: 1 addition & 0 deletions csrc/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ enum class EnableOption {
WarnRegisterSpill, //! Enable warnings of register spill
WarpSpecializedNormalization, //! Enable warp specialized persistent kernel
HostIrLowering, //! Enable FusionKernelRuntime lowering to host IR
HostIrJit, //! Enable Host IR JIT compilation with LLVM
InsertReshardingAfter, //! Insert resharding set after the expression
FastMath, //! Enable fast math optimizations (--use_fast_math)
P2pProtocol, //! Prescribe P2P protocol: put|get
Expand Down
33 changes: 15 additions & 18 deletions csrc/runtime/fusion_kernel_runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,7 @@ void FusionKernelRuntime::evictCache(size_t input_id) {

bool FusionKernelRuntime::isCompiled() const {
if (isOptionEnabled(EnableOption::HostIrLowering)) {
#ifdef NVFUSER_HOST_IR_JIT
return hij_ != nullptr;
#else
return hie_ != nullptr;
#endif
return hij_ != nullptr || hie_ != nullptr;
} else {
std::lock_guard<std::mutex> guard(mutex_);
return std::all_of(
Expand Down Expand Up @@ -299,13 +295,14 @@ KernelArgumentHolder FusionKernelRuntime::runWithInputs(
<< std::endl;
}

#ifdef NVFUSER_HOST_IR_JIT
auto outputs =
hij_->runWithInputs(args); // TODO: change NVFUSER_HOST_IR_JIT flag to
// enableOption in the future.
#else
auto outputs = hie_->runWithInputs(args);
#endif
KernelArgumentHolder outputs;
if (hij_ != nullptr) {
outputs = hij_->runWithInputs(args);
} else if (hie_ != nullptr) {
outputs = hie_->runWithInputs(args);
} else {
NVF_THROW("Neither Host IR JIT or Host IR Evaluator are initialized.");
}

if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
debug() << "============= FINISHED RUNNING HOSTIR EVALUATOR ============"
Expand Down Expand Up @@ -472,12 +469,12 @@ void FusionKernelRuntime::compileFusionParallel(KernelArgumentHolder args) {
}
std::unique_ptr<hir::HostIrContainer> hic = lowerSegmentedFusionToHostIr(
*segmented_fusion_, launch_params_per_segment, executors_);
#ifdef NVFUSER_HOST_IR_JIT
hij_ = std::make_unique<HostIrJit>(std::move(hic));
#else
hie_ = std::make_unique<hir::HostIrEvaluator>(
std::move(hic), &Communicator::getInstance());
#endif
if (isOptionEnabled(EnableOption::HostIrJit)) {
hij_ = std::make_unique<HostIrJit>(std::move(hic));
} else {
hie_ = std::make_unique<hir::HostIrEvaluator>(
std::move(hic), &Communicator::getInstance());
}
}

if (isProfilerEnabled()) {
Expand Down
21 changes: 9 additions & 12 deletions csrc/runtime/fusion_kernel_runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,7 @@

#include <fusion_segmenter.h>
#include <host_ir/evaluator.h>
#ifdef NVFUSER_HOST_IR_JIT
#include <host_ir/jit.h>
#endif
#include <polymorphic_value.h>
#include <runtime/executor.h>
#include <runtime/executor_kernel_arg.h>
Expand Down Expand Up @@ -143,11 +141,13 @@ class FusionKernelRuntime {

//! Get the Host IR Container
const hir::HostIrContainer& getHostIrContainer() const {
#ifdef NVFUSER_HOST_IR_JIT
return hij_->container();
#else
return hie_->container();
#endif
if (isOptionEnabled(EnableOption::HostIrJit)) {
NVF_ERROR(hij_ != nullptr, "Host IR JIT is not initialized");
return hij_->container();
} else {
NVF_ERROR(hie_ != nullptr, "Host IR Evaluator is not initialized");
return hie_->container();
}
}

private:
Expand Down Expand Up @@ -189,13 +189,10 @@ class FusionKernelRuntime {
//! Executors holding compiled kernels
std::vector<std::unique_ptr<ExecutorAbstract>> executors_;

#ifdef NVFUSER_HOST_IR_JIT
//! Host IR JIT
//! Host IR JIT (used when EnableOption::HostIrJit is set)
std::unique_ptr<HostIrJit> hij_;
#else
//! Host IR Evaluator
//! Host IR Evaluator (used when EnableOption::HostIrJit is not set)
std::unique_ptr<hir::HostIrEvaluator> hie_;
#endif

// A metadata copy of initial arguments used to contruct this
// FusionKernelRuntime. Used during deserialization to schedule the fusion
Expand Down
25 changes: 11 additions & 14 deletions doc/dev/host_ir_jit.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,21 +98,18 @@ KernelArgumentHolder HostIrJitImpl::runWithInputs(const KernelArgumentHolder& ar
```
*Detailed Implementation:* https://github.com/NVIDIA/Fuser/blob/3ac1a4697b6b5c31e4dbb9763b3b6db2f0e0164b/csrc/host_ir/jit.cpp#L1399-L1453

## Configuration and Build Options
Building nvFuser project with `NVFUSER_BUILD_HOST_IR_JIT=1` will enables Host IR JIT as default runtime in Host IR execution path.
Otherwise the default runtime is Host IR Evaluator. In the future, when llvm is fully supported in all build machines, we are able
to get rid of this opt-in flag and rather use `enableOption` to control backend switching after build is done.

Sample build
```python
NVFUSER_BUILD_HOST_IR_JIT=1 pip install --no-build-isolation -e python -v
```
or
```python
NVFUSER_BUILD_HOST_IR_JIT=1 _bn
```
## Configuration and Runtime Options

### Build Requirements
**LLVM 18.1+ is required** to build nvFuser. You can switch between Host IR JIT and Host IR Evaluator at runtime.

### Runtime Configuration
You can enable Host IR JIT via runtime option `EnableOption::HostIrJit` or environment `NVFUSER_ENABLE="host_ir_jit"`.

When `host_ir_jit` is enabled, the runtime uses LLVM ORC JIT for low-latency host execution. When disabled, it falls back to the Host IR Evaluator.

## Future Integration plan
We plan to turn on host IR JIT by default after its function and performance are on par.
We plan to turn on host IR JIT by default after its functionality and performance are on par.
Known missing features and bugs are:

**Ops need to be supported:**
Expand Down
11 changes: 0 additions & 11 deletions python/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ class BuildConfig:
build_with_asan: bool = False
build_without_distributed: bool = False
explicit_error_check: bool = False
build_with_host_ir_jit: bool = False
overwrite_version: bool = False
version_tag: str = None
build_type: str = "Release"
Expand Down Expand Up @@ -98,12 +97,6 @@ def parse_args():
action="store_true",
help="Build nvfuser with UCC support",
)
parser.add_argument(
"--build-with-host-ir-jit",
dest="build_with_host_ir_jit",
action="store_true",
help="Build nvfuser with Host IR JIT support",
)
parser.add_argument(
"--explicit-error-check",
dest="explicit_error_check",
Expand Down Expand Up @@ -206,7 +199,6 @@ def create_build_config():
no_benchmark=args.no_benchmark,
no_ninja=args.no_ninja,
build_with_ucc=args.build_with_ucc,
build_with_host_ir_jit=args.build_with_host_ir_jit,
build_with_asan=args.build_with_asan,
build_without_distributed=args.build_without_distributed,
explicit_error_check=args.explicit_error_check,
Expand Down Expand Up @@ -252,8 +244,6 @@ def override_build_config_from_env(config):
config.no_ninja = get_env_flag_bool("NVFUSER_BUILD_NO_NINJA")
if "NVFUSER_BUILD_WITH_UCC" in os.environ:
config.build_with_ucc = get_env_flag_bool("NVFUSER_BUILD_WITH_UCC")
if "NVFUSER_BUILD_HOST_IR_JIT" in os.environ:
config.build_with_host_ir_jit = get_env_flag_bool("NVFUSER_BUILD_HOST_IR_JIT")
if "NVFUSER_BUILD_WITH_ASAN" in os.environ:
config.build_with_asan = get_env_flag_bool("NVFUSER_BUILD_WITH_ASAN")
if "NVFUSER_BUILD_WITHOUT_DISTRIBUTED" in os.environ:
Expand Down Expand Up @@ -483,7 +473,6 @@ def on_or_off(flag: bool) -> str:
f"-DPython_EXECUTABLE={sys.executable}",
f"-DBUILD_NVFUSER_BENCHMARK={on_or_off(not config.no_benchmark)}",
f"-DNVFUSER_DISTRIBUTED={on_or_off(not config.build_without_distributed)}",
f"-DUSE_HOST_IR_JIT={on_or_off(config.build_with_host_ir_jit)}",
f"-DCUTLASS_MAX_JOBS={config.cutlass_max_jobs}",
"-B",
cmake_build_dir,
Expand Down
1 change: 1 addition & 0 deletions tests/cpp/test_host_ir_integration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class HostIrIntegrationTest : public NVFuserTest {
protected:
HostIrIntegrationTest() {
EnableOptionsGuard::getCurOptions().set(EnableOption::HostIrLowering);
EnableOptionsGuard::getCurOptions().set(EnableOption::HostIrJit);
}
};

Expand Down
13 changes: 9 additions & 4 deletions tests/cpp/test_host_ir_jit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@ namespace nvfuser {

namespace hir {

using HostIrJitTest = NVFuserTest;
class HostIrJitTest : public NVFuserTest {
protected:
HostIrJitTest() {
EnableOptionsGuard::getCurOptions().set(EnableOption::HostIrJit);
}
};
// Enable at runtime with NVFUSER_ENABLE="host_ir_jit" (the old
// --build-with-host-ir-jit build flag has been removed).
TEST_F(HostIrJitTest, Set) {
auto hic = std::make_unique<HostIrContainer>();
Expand Down Expand Up @@ -333,7 +338,7 @@ TEST_F(HostIrJitTest, Matmul) {

HostIrJit jit(std::move(hic));

auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat);
auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(at::kFloat);
at::Tensor t0 = at::randn({H, M, K}, options);
at::Tensor t1 = at::randn({H, K, N}, options);
at::Tensor t2 = at::randn({H, M, N}, options);
Expand Down Expand Up @@ -377,7 +382,7 @@ TEST_F(HostIrJitTest, MatmulOut) {

HostIrJit jit(std::move(hic));

auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat);
auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(at::kFloat);
at::Tensor t0 = at::randn({H, M, K}, options);
at::Tensor t1 = at::randn({H, K, N}, options);
std::unordered_map<Val*, PolymorphicValue> concrete_input_buffers = {
Expand Down Expand Up @@ -428,7 +433,7 @@ TEST_F(HostIrJitTest, Linear) {

HostIrJit jit(std::move(hic));

auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat);
auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(at::kFloat);
auto in_at = at::randint(5, {B, M, K}, options);
auto weight_at = at::randint(5, {N, K}, options);
auto bias_at = at::randint(5, {N}, options);
Expand Down
Loading