Skip to content

Commit 1ab9dc1

Browse files
wujingyue and mdavis36 authored
Reapply #5344 (#5425)
It was accidentally reverted by #5328. Co-authored-by: Michael Davis <[email protected]>
1 parent 933c641 commit 1ab9dc1

File tree

10 files changed

+84
-106
lines changed

10 files changed

+84
-106
lines changed

CMakeLists.txt

Lines changed: 35 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -418,38 +418,34 @@ list(APPEND NVFUSER_SRCS
418418
${NVFUSER_SRCS_DIR}/validator_utils.cpp
419419
)
420420

421-
cmake_dependent_option(NVFUSER_HOST_IR_JIT "Build nvFuser with LLVM" ON "USE_HOST_IR_JIT" OFF)
422-
423-
424-
message(STATUS "Setting NVFUSER_HOST_IR_JIT=${NVFUSER_HOST_IR_JIT}")
425-
426-
if(NVFUSER_HOST_IR_JIT)
427-
add_compile_definitions(NVFUSER_HOST_IR_JIT)
428-
# Add LLVM JIT related dependencies
429-
find_package(LLVM 18.1 REQUIRED CONFIG)
430-
llvm_map_components_to_libnames(LLVM_LIBS
431-
support
432-
core
433-
orcjit
434-
executionengine
435-
irreader
436-
nativecodegen
437-
Target
438-
Analysis
439-
JITLink
440-
Demangle
441-
)
421+
# Add LLVM JIT related dependencies
422+
set(LLVM_MINIMUM_VERSION "18.1")
423+
find_package(LLVM REQUIRED CONFIG)
424+
if(${LLVM_VERSION} VERSION_LESS ${LLVM_MINIMUM_VERSION})
425+
message(FATAL_ERROR "LLVM ${LLVM_VERSION} does not meet the minimum version required: ${LLVM_MINIMUM_VERSION}")
426+
endif()
427+
llvm_map_components_to_libnames(LLVM_LIBS
428+
support
429+
core
430+
orcjit
431+
executionengine
432+
irreader
433+
nativecodegen
434+
Target
435+
Analysis
436+
JITLink
437+
Demangle
438+
)
442439

443-
add_library(LLVM_JIT INTERFACE)
444-
target_include_directories(LLVM_JIT INTERFACE ${LLVM_INCLUDE_DIRS})
445-
target_compile_definitions(LLVM_JIT INTERFACE ${LLVM_DEFINITIONS})
446-
target_link_libraries(LLVM_JIT INTERFACE ${LLVM_LIBS})
440+
add_library(LLVM_JIT INTERFACE)
441+
target_include_directories(LLVM_JIT SYSTEM INTERFACE ${LLVM_INCLUDE_DIRS})
442+
target_compile_definitions(LLVM_JIT INTERFACE ${LLVM_DEFINITIONS})
443+
target_link_libraries(LLVM_JIT INTERFACE ${LLVM_LIBS})
447444

448-
# Add LLVM JIT related sources
449-
list(APPEND NVFUSER_SRCS
450-
${NVFUSER_SRCS_DIR}/host_ir/jit.cpp
451-
)
452-
endif()
445+
# Add LLVM JIT related sources
446+
list(APPEND NVFUSER_SRCS
447+
${NVFUSER_SRCS_DIR}/host_ir/jit.cpp
448+
)
453449

454450
# We don't link CUPTI for MSVC
455451
if(NOT MSVC)
@@ -545,9 +541,7 @@ if (BUILD_CUTLASS AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
545541
target_compile_definitions(codegen_internal PRIVATE "-DNVFUSER_CUTLASS_KERNEL_ENABLED")
546542
endif()
547543

548-
if(NVFUSER_HOST_IR_JIT)
549-
target_link_libraries(codegen_internal PUBLIC LLVM_JIT)
550-
endif()
544+
target_link_libraries(codegen_internal PUBLIC LLVM_JIT)
551545

552546
add_library(nvfuser_codegen SHARED $<TARGET_OBJECTS:codegen_internal>)
553547

@@ -582,8 +576,7 @@ target_include_directories(nvfuser_codegen SYSTEM PUBLIC
582576
)
583577
target_link_libraries(nvfuser_codegen
584578
PUBLIC ${TORCH_LIBRARIES}
585-
PRIVATE dynamic_type flatbuffers ${CUDA_NVRTC_LIB} CUDA::cupti dl
586-
$<$<BOOL:${NVFUSER_HOST_IR_JIT}>:LLVM_JIT>
579+
PRIVATE dynamic_type flatbuffers ${CUDA_NVRTC_LIB} CUDA::cupti dl LLVM_JIT
587580
)
588581
set_target_properties(nvfuser_codegen PROPERTIES
589582
C_STANDARD ${NVFUSER_C_STANDARD}
@@ -1240,15 +1233,13 @@ if(BUILD_TEST)
12401233
add_test(test_host_ir "${HOSTIR_TEST_SRCS}" "")
12411234
list(APPEND TEST_BINARIES test_host_ir)
12421235

1243-
if(NVFUSER_HOST_IR_JIT)
1244-
set(LLVM_COMPILE_TEST_SRCS)
1245-
list(APPEND LLVM_COMPILE_TEST_SRCS
1246-
${NVFUSER_ROOT}/tests/cpp/test_host_ir_jit.cpp
1247-
)
1248-
add_test(test_host_ir_jit "${LLVM_COMPILE_TEST_SRCS}" "")
1249-
target_link_libraries(test_host_ir_jit PUBLIC LLVM_JIT)
1250-
list(APPEND TEST_BINARIES test_host_ir_jit)
1251-
endif()
1236+
set(LLVM_COMPILE_TEST_SRCS)
1237+
list(APPEND LLVM_COMPILE_TEST_SRCS
1238+
${NVFUSER_ROOT}/tests/cpp/test_host_ir_jit.cpp
1239+
)
1240+
add_test(test_host_ir_jit "${LLVM_COMPILE_TEST_SRCS}" "")
1241+
target_link_libraries(test_host_ir_jit PUBLIC LLVM_JIT)
1242+
list(APPEND TEST_BINARIES test_host_ir_jit)
12521243

12531244

12541245
# We don't link CUPTI for MSVC
@@ -1493,7 +1484,6 @@ endif()
14931484
message(STATUS " NVFUSER_STANDALONE_BUILD_WITH_UCC : ${NVFUSER_STANDALONE_BUILD_WITH_UCC}")
14941485
message(STATUS " NVFUSER_BUILD_WITH_ASAN : ${NVFUSER_BUILD_WITH_ASAN}")
14951486
message(STATUS " NVFUSER_DISTRIBUTED : ${NVFUSER_DISTRIBUTED}")
1496-
message(STATUS " NVFUSER_HOST_IR_JIT : ${NVFUSER_HOST_IR_JIT}")
14971487
message(STATUS " NVFUSER_CPP_STANDARD : ${NVFUSER_CPP_STANDARD}")
14981488
message(STATUS " NVMMH_INCLUDE_DIR : ${NVMMH_INCLUDE_DIR}")
14991489
message(STATUS "******** End of Nvfuser configuration summary ********")

csrc/host_ir/jit.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -492,8 +492,8 @@ void inferTensorShapesAndStrides(
492492
// Check if sizes and strides are the same size as logical domain
493493
const auto logical_ndims =
494494
std::ranges::distance(logical_domain | TensorDomain::kNoReductions);
495-
NVF_ERROR_EQ(sizes.size(), logical_ndims);
496-
NVF_ERROR_EQ(strides.size(), logical_ndims);
495+
NVF_ERROR_EQ(std::ssize(sizes), logical_ndims);
496+
NVF_ERROR_EQ(std::ssize(strides), logical_ndims);
497497
}
498498

499499
void unpackInputs(

csrc/options.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ const std::unordered_map<std::string, EnableOption>& getEnableOptions() {
177177
{"warn_register_spill", EnableOption::WarnRegisterSpill},
178178
{"ws_normalization", EnableOption::WarpSpecializedNormalization},
179179
{"host_ir_lowering", EnableOption::HostIrLowering},
180+
{"host_ir_jit", EnableOption::HostIrJit},
180181
{"insert_resharding_after", EnableOption::InsertReshardingAfter},
181182
{"fast_math", EnableOption::FastMath},
182183
{"p2p_protocol", EnableOption::P2pProtocol},

csrc/options.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ enum class EnableOption {
121121
WarnRegisterSpill, //! Enable warnings of register spill
122122
WarpSpecializedNormalization, //! Enable warp specialized persistent kernel
123123
HostIrLowering, //! Enable FusionKernelRuntime lowering to host IR
124+
HostIrJit, //! Enable Host IR JIT compilation with LLVM
124125
InsertReshardingAfter, //! Insert resharding set after the expression
125126
FastMath, //! Enable fast math optimizations (--use_fast_math)
126127
P2pProtocol, //! Prescribe P2P protocol: put|get

csrc/runtime/fusion_kernel_runtime.cpp

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -157,11 +157,7 @@ void FusionKernelRuntime::evictCache(size_t input_id) {
157157

158158
bool FusionKernelRuntime::isCompiled() const {
159159
if (isOptionEnabled(EnableOption::HostIrLowering)) {
160-
#ifdef NVFUSER_HOST_IR_JIT
161-
return hij_ != nullptr;
162-
#else
163-
return hie_ != nullptr;
164-
#endif
160+
return hij_ != nullptr || hie_ != nullptr;
165161
} else {
166162
std::lock_guard<std::mutex> guard(mutex_);
167163
return std::all_of(
@@ -299,13 +295,14 @@ KernelArgumentHolder FusionKernelRuntime::runWithInputs(
299295
<< std::endl;
300296
}
301297

302-
#ifdef NVFUSER_HOST_IR_JIT
303-
auto outputs =
304-
hij_->runWithInputs(args); // TODO: change NVFUSER_HOST_IR_JIT flag to
305-
// enableOption in the future.
306-
#else
307-
auto outputs = hie_->runWithInputs(args);
308-
#endif
298+
KernelArgumentHolder outputs;
299+
if (hij_ != nullptr) {
300+
outputs = hij_->runWithInputs(args);
301+
} else if (hie_ != nullptr) {
302+
outputs = hie_->runWithInputs(args);
303+
} else {
304+
NVF_THROW("Neither Host IR JIT or Host IR Evaluator are initialized.");
305+
}
309306

310307
if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
311308
debug() << "============= FINISHED RUNNING HOSTIR EVALUATOR ============"
@@ -472,12 +469,12 @@ void FusionKernelRuntime::compileFusionParallel(KernelArgumentHolder args) {
472469
}
473470
std::unique_ptr<hir::HostIrContainer> hic = lowerSegmentedFusionToHostIr(
474471
*segmented_fusion_, launch_params_per_segment, executors_);
475-
#ifdef NVFUSER_HOST_IR_JIT
476-
hij_ = std::make_unique<HostIrJit>(std::move(hic));
477-
#else
478-
hie_ = std::make_unique<hir::HostIrEvaluator>(
479-
std::move(hic), &Communicator::getInstance());
480-
#endif
472+
if (isOptionEnabled(EnableOption::HostIrJit)) {
473+
hij_ = std::make_unique<HostIrJit>(std::move(hic));
474+
} else {
475+
hie_ = std::make_unique<hir::HostIrEvaluator>(
476+
std::move(hic), &Communicator::getInstance());
477+
}
481478
}
482479

483480
if (isProfilerEnabled()) {

csrc/runtime/fusion_kernel_runtime.h

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,7 @@
1111

1212
#include <fusion_segmenter.h>
1313
#include <host_ir/evaluator.h>
14-
#ifdef NVFUSER_HOST_IR_JIT
1514
#include <host_ir/jit.h>
16-
#endif
1715
#include <polymorphic_value.h>
1816
#include <runtime/executor.h>
1917
#include <runtime/executor_kernel_arg.h>
@@ -143,11 +141,13 @@ class FusionKernelRuntime {
143141

144142
//! Get the Host IR Container
145143
const hir::HostIrContainer& getHostIrContainer() const {
146-
#ifdef NVFUSER_HOST_IR_JIT
147-
return hij_->container();
148-
#else
149-
return hie_->container();
150-
#endif
144+
if (isOptionEnabled(EnableOption::HostIrJit)) {
145+
NVF_ERROR(hij_ != nullptr, "Host IR JIT is not initialized");
146+
return hij_->container();
147+
} else {
148+
NVF_ERROR(hie_ != nullptr, "Host IR Evaluator is not initialized");
149+
return hie_->container();
150+
}
151151
}
152152

153153
private:
@@ -189,13 +189,10 @@ class FusionKernelRuntime {
189189
//! Executors holding compiled kernels
190190
std::vector<std::unique_ptr<ExecutorAbstract>> executors_;
191191

192-
#ifdef NVFUSER_HOST_IR_JIT
193-
//! Host IR JIT
192+
//! Host IR JIT (used when EnableOption::HostIrJit is set)
194193
std::unique_ptr<HostIrJit> hij_;
195-
#else
196-
//! Host IR Evaluator
194+
//! Host IR Evaluator (used when EnableOption::HostIrJit is not set)
197195
std::unique_ptr<hir::HostIrEvaluator> hie_;
198-
#endif
199196

200197
// A metadata copy of initial arguments used to contruct this
201198
// FusionKernelRuntime. Used during deserialization to schedule the fusion

doc/dev/host_ir_jit.md

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -98,21 +98,18 @@ KernelArgumentHolder HostIrJitImpl::runWithInputs(const KernelArgumentHolder& ar
9898
```
9999
*Detailed Implementation:* https://github.com/NVIDIA/Fuser/blob/3ac1a4697b6b5c31e4dbb9763b3b6db2f0e0164b/csrc/host_ir/jit.cpp#L1399-L1453
100100
101-
## Configuration and Build Options
102-
Building nvFuser project with `NVFUSER_BUILD_HOST_IR_JIT=1` will enables Host IR JIT as default runtime in Host IR execution path.
103-
Otherwise the default runtime is Host IR Evaluator. In the future, when llvm is fully supported in all build machines, we are able
104-
to get rid of this opt-in flag and rather use `enableOption` to control backend switching after build is done.
105-
106-
Sample build
107-
```python
108-
NVFUSER_BUILD_HOST_IR_JIT=1 pip install --no-build-isolation -e python -v
109-
```
110-
or
111-
```python
112-
NVFUSER_BUILD_HOST_IR_JIT=1 _bn
113-
```
101+
## Configuration and Runtime Options
102+
103+
### Build Requirements
104+
**LLVM 18.1+ is required** to build nvFuser. You can switch between Host IR JIT and Host IR Evaluator at runtime.
105+
106+
### Runtime Configuration
107+
You can enable Host IR JIT via the runtime option `EnableOption::HostIrJit` or by setting the environment variable `NVFUSER_ENABLE="host_ir_jit"`.
108+
109+
When `host_ir_jit` is enabled, the runtime uses LLVM ORC JIT for low-latency host execution. When disabled, it falls back to the Host IR Evaluator.
110+
114111
## Future Integration plan
115-
We plan to turn on host IR JIT by default after its function and performance are on par.
112+
We plan to turn on host IR JIT by default after its functionality and performance are on par.
116113
Known missing features and bugs are:
117114
118115
**Ops need to be supported:**

python/utils.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ class BuildConfig:
2525
build_with_asan: bool = False
2626
build_without_distributed: bool = False
2727
explicit_error_check: bool = False
28-
build_with_host_ir_jit: bool = False
2928
overwrite_version: bool = False
3029
version_tag: str = None
3130
build_type: str = "Release"
@@ -98,12 +97,6 @@ def parse_args():
9897
action="store_true",
9998
help="Build nvfuser with UCC support",
10099
)
101-
parser.add_argument(
102-
"--build-with-host-ir-jit",
103-
dest="build_with_host_ir_jit",
104-
action="store_true",
105-
help="Build nvfuser with Host IR JIT support",
106-
)
107100
parser.add_argument(
108101
"--explicit-error-check",
109102
dest="explicit_error_check",
@@ -206,7 +199,6 @@ def create_build_config():
206199
no_benchmark=args.no_benchmark,
207200
no_ninja=args.no_ninja,
208201
build_with_ucc=args.build_with_ucc,
209-
build_with_host_ir_jit=args.build_with_host_ir_jit,
210202
build_with_asan=args.build_with_asan,
211203
build_without_distributed=args.build_without_distributed,
212204
explicit_error_check=args.explicit_error_check,
@@ -252,8 +244,6 @@ def override_build_config_from_env(config):
252244
config.no_ninja = get_env_flag_bool("NVFUSER_BUILD_NO_NINJA")
253245
if "NVFUSER_BUILD_WITH_UCC" in os.environ:
254246
config.build_with_ucc = get_env_flag_bool("NVFUSER_BUILD_WITH_UCC")
255-
if "NVFUSER_BUILD_HOST_IR_JIT" in os.environ:
256-
config.build_with_host_ir_jit = get_env_flag_bool("NVFUSER_BUILD_HOST_IR_JIT")
257247
if "NVFUSER_BUILD_WITH_ASAN" in os.environ:
258248
config.build_with_asan = get_env_flag_bool("NVFUSER_BUILD_WITH_ASAN")
259249
if "NVFUSER_BUILD_WITHOUT_DISTRIBUTED" in os.environ:
@@ -483,7 +473,6 @@ def on_or_off(flag: bool) -> str:
483473
f"-DPython_EXECUTABLE={sys.executable}",
484474
f"-DBUILD_NVFUSER_BENCHMARK={on_or_off(not config.no_benchmark)}",
485475
f"-DNVFUSER_DISTRIBUTED={on_or_off(not config.build_without_distributed)}",
486-
f"-DUSE_HOST_IR_JIT={on_or_off(config.build_with_host_ir_jit)}",
487476
f"-DCUTLASS_MAX_JOBS={config.cutlass_max_jobs}",
488477
"-B",
489478
cmake_build_dir,

tests/cpp/test_host_ir_integration.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class HostIrIntegrationTest : public NVFuserTest {
2929
protected:
3030
HostIrIntegrationTest() {
3131
EnableOptionsGuard::getCurOptions().set(EnableOption::HostIrLowering);
32+
EnableOptionsGuard::getCurOptions().set(EnableOption::HostIrJit);
3233
}
3334
};
3435

tests/cpp/test_host_ir_jit.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,12 @@ namespace nvfuser {
2020

2121
namespace hir {
2222

23-
using HostIrJitTest = NVFuserTest;
23+
class HostIrJitTest : public NVFuserTest {
24+
protected:
25+
HostIrJitTest() {
26+
EnableOptionsGuard::getCurOptions().set(EnableOption::HostIrJit);
27+
}
28+
};
2429
// Host IR JIT is always built; it is selected at runtime via EnableOption::HostIrJit (NVFUSER_ENABLE="host_ir_jit").
2530
TEST_F(HostIrJitTest, Set) {
2631
auto hic = std::make_unique<HostIrContainer>();
@@ -333,7 +338,7 @@ TEST_F(HostIrJitTest, Matmul) {
333338

334339
HostIrJit jit(std::move(hic));
335340

336-
auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat);
341+
auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(at::kFloat);
337342
at::Tensor t0 = at::randn({H, M, K}, options);
338343
at::Tensor t1 = at::randn({H, K, N}, options);
339344
at::Tensor t2 = at::randn({H, M, N}, options);
@@ -377,7 +382,7 @@ TEST_F(HostIrJitTest, MatmulOut) {
377382

378383
HostIrJit jit(std::move(hic));
379384

380-
auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat);
385+
auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(at::kFloat);
381386
at::Tensor t0 = at::randn({H, M, K}, options);
382387
at::Tensor t1 = at::randn({H, K, N}, options);
383388
std::unordered_map<Val*, PolymorphicValue> concrete_input_buffers = {
@@ -428,7 +433,7 @@ TEST_F(HostIrJitTest, Linear) {
428433

429434
HostIrJit jit(std::move(hic));
430435

431-
auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat);
436+
auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(at::kFloat);
432437
auto in_at = at::randint(5, {B, M, K}, options);
433438
auto weight_at = at::randint(5, {N, K}, options);
434439
auto bias_at = at::randint(5, {N}, options);

0 commit comments

Comments
 (0)