Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
339731c
M8 Task 1a: Move PolymorphicValue alias to detail namespace
csarofeen Dec 26, 2025
8bbb51f
Revert "M8 Task 1a: Move PolymorphicValue alias to detail namespace"
csarofeen Dec 30, 2025
b2999ee
M8 Task 2a: Create DynamicType split header structure (decl.h, impl.h…
csarofeen Dec 30, 2025
a33f855
M8 Task 3: Add extern template for PolymorphicValue DynamicType
csarofeen Dec 30, 2025
2a056e1
M8 Task 4: Move operator<< to impl.h
csarofeen Dec 30, 2025
7bcace5
M8 Task 5: Move unary operators (+, -, ~, !) to impl.h
csarofeen Dec 30, 2025
7666a7f
M8 Task 6: Move operator* (dereference) to impl.h
csarofeen Dec 30, 2025
95d755f
M8 Task 7: Move prefix ++/-- operators to impl.h
csarofeen Dec 30, 2025
3d45bdb
M8 Task 8: Move postfix ++/-- operators to impl.h
csarofeen Dec 30, 2025
2e4be09
M8 Task 9: Move compound assignment operators to impl.h
csarofeen Dec 30, 2025
6e5f789
M8 Task 10a: Move operator+ to impl.h (binary op pattern validation)
csarofeen Dec 30, 2025
10e1dff
M8 Task 10b: Move all binary operators to impl.h
csarofeen Dec 30, 2025
473ae6b
M8 Task 12: Convert 22 binary operators to friend function pattern fo…
csarofeen Dec 30, 2025
fecdd77
M8 Task 12: Convert remaining operators (unary, ++/--, compound assig…
csarofeen Dec 30, 2025
36a8879
Refactor DynamicType operators to non-template friends with recursion…
csarofeen Dec 31, 2025
4366828
Move getDataType and castToDtype from type.h to type.cpp
csarofeen Dec 31, 2025
1c4484a
Enable narrow PCH for polymorphic_value.h
csarofeen Dec 31, 2025
d9ab351
Extend PCH to test targets
csarofeen Dec 31, 2025
3211649
Expand PCH to include top nvFuser headers
csarofeen Jan 1, 2026
89ae03c
Implement index-based switch dispatch for operator== in DynamicType
csarofeen Jan 1, 2026
d2fdb38
Convert comparison operators to switch-based dispatch to eliminate Fo…
csarofeen Jan 2, 2026
4e50e4a
Extend switch-based dispatch to all binary operators (arithmetic, bit…
csarofeen Jan 2, 2026
518198f
Fix symbol visibility for DynamicType by compiling polymorphic_value.…
csarofeen Jan 2, 2026
ea40a92
Replace dispatch() execution ForAllTypes with switch dispatch
csarofeen Jan 9, 2026
40b0d2b
Require explicit return type for dispatch<ReturnT>(), remove dispatch…
csarofeen Jan 10, 2026
353402e
Replace std::apply with fast_apply to skip noexcept machinery. Wall c…
csarofeen Jan 10, 2026
6362a1e
Replace any_check() with fold + requires pattern for 96% DynamicType …
csarofeen Jan 10, 2026
a16dfcd
Replace belongs_to/has_cross_type_equality with fold expressions, rem…
csarofeen Jan 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,42 @@ endif()

target_link_libraries(codegen_internal PUBLIC LLVM_JIT)

# Precompiled Headers for Top nvFuser Headers
# Post-M8, template instantiation is reduced by 81%, making header parsing
# a significant fraction of build cost. This PCH targets the top 10 heaviest
# nvFuser-controllable headers by exclusive parse time (from M9 Task 4 analysis).
# Enabled by default for Release builds (provides ~50% build time improvement).
#
# Compute the default once so the option text exists in exactly one place.
# NOTE(review): CMAKE_BUILD_TYPE is empty under multi-config generators
# (VS, Xcode, Ninja Multi-Config), so the option silently defaults to OFF
# there; users on those generators must enable it explicitly.
if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
  set(nvfuser_pch_default ON)
else()
  set(nvfuser_pch_default OFF)
endif()
option(NVFUSER_USE_POLYMORPHIC_PCH
       "Use PCH for top nvFuser headers to reduce parse time"
       ${nvfuser_pch_default})

if(NVFUSER_USE_POLYMORPHIC_PCH)
  message(STATUS "Enabling PCH for top 10 nvFuser headers")
  # Top 10 nvFuser headers by exclusive parse time (M9 Task 4 analysis).
  # Keep this list in sync with the shared test-target PCH list defined in
  # add_test_without_main().
  target_precompile_headers(codegen_internal PRIVATE
    "${NVFUSER_SRCS_DIR}/polymorphic_value.h" # 1675s (27.9m)
    "${NVFUSER_ROOT}/lib/dynamic_type/src/dynamic_type/type_traits.h" # 473.6s (7.9m)
    "${NVFUSER_SRCS_DIR}/ir/base_nodes.h" # 284.5s (4.7m)
    "${NVFUSER_SRCS_DIR}/scheduler/tools/abstract_tensor.h" # 162.1s (2.7m)
    "${NVFUSER_SRCS_DIR}/type.h" # 81.6s (1.4m)
    "${NVFUSER_SRCS_DIR}/ir/container.h" # 51.6s (0.9m)
    "${NVFUSER_SRCS_DIR}/serde/fusion_cache_generated.h" # 44.1s (0.7m)
    "${NVFUSER_SRCS_DIR}/iter_visitor.h" # 38.2s (0.6m)
    "${NVFUSER_SRCS_DIR}/ir/internal_nodes.h" # 33.3s (0.6m)
    "${NVFUSER_SRCS_DIR}/ir/interface_nodes.h" # 29.6s (0.5m)
  )
  # polymorphic_value.cpp must NOT consume the PCH: it holds the explicit
  # DynamicType instantiation and is compiled with -fvisibility=default so
  # those symbols are exported, whereas the PCH caches the type with hidden
  # visibility.
  # NOTE(review): -fvisibility=default is GCC/Clang-only, and COMPILE_OPTIONS
  # here *replaces* (not appends to) any per-source options previously set on
  # this file — confirm no other set_source_files_properties touches it.
  set_source_files_properties(
    "${NVFUSER_SRCS_DIR}/polymorphic_value.cpp"
    PROPERTIES
    SKIP_PRECOMPILE_HEADERS ON
    COMPILE_OPTIONS "-fvisibility=default"
  )
endif()

add_library(nvfuser_codegen SHARED $<TARGET_OBJECTS:codegen_internal>)

if (BUILD_CUTLASS AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
Expand Down Expand Up @@ -1109,6 +1145,35 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK)
add_executable(${TEST_NAME} ${TEST_SRC})
set_property(TARGET ${TEST_NAME} PROPERTY CXX_STANDARD ${NVFUSER_CPP_STANDARD})
target_compile_definitions(${TEST_NAME} PRIVATE USE_GTEST)

# Shared PCH for test executables.
# The ten headers below mirror the codegen_internal PCH set. The first test
# target to reach this point becomes the PCH "owner" (recorded in a global
# property) and compiles the headers once; every later test target attaches
# to that PCH via REUSE_FROM instead of recompiling it.
# The codegen_internal PCH cannot be reused here because the test targets
# are built without -fPIC, and PCH reuse requires identical compile flags.
if(NVFUSER_USE_POLYMORPHIC_PCH)
  get_property(nvfuser_test_pch_owner GLOBAL PROPERTY NVFUSER_TEST_PCH_TARGET)
  if(nvfuser_test_pch_owner)
    # An owner already exists: share its precompiled headers.
    target_precompile_headers(${TEST_NAME} REUSE_FROM ${nvfuser_test_pch_owner})
  else()
    # No owner yet: this target builds the PCH for all test executables.
    message(STATUS "Creating shared test PCH on target: ${TEST_NAME}")
    target_precompile_headers(${TEST_NAME} PRIVATE
      "${NVFUSER_SRCS_DIR}/polymorphic_value.h"
      "${NVFUSER_ROOT}/lib/dynamic_type/src/dynamic_type/type_traits.h"
      "${NVFUSER_SRCS_DIR}/ir/base_nodes.h"
      "${NVFUSER_SRCS_DIR}/scheduler/tools/abstract_tensor.h"
      "${NVFUSER_SRCS_DIR}/type.h"
      "${NVFUSER_SRCS_DIR}/ir/container.h"
      "${NVFUSER_SRCS_DIR}/serde/fusion_cache_generated.h"
      "${NVFUSER_SRCS_DIR}/iter_visitor.h"
      "${NVFUSER_SRCS_DIR}/ir/internal_nodes.h"
      "${NVFUSER_SRCS_DIR}/ir/interface_nodes.h"
    )
    set_property(GLOBAL PROPERTY NVFUSER_TEST_PCH_TARGET ${TEST_NAME})
  endif()
endif()

target_include_directories(${TEST_NAME} PRIVATE "${NVFUSER_ROOT}")
target_include_directories(${TEST_NAME} SYSTEM PRIVATE
${NVFUSER_ROOT}/third_party/googletest/googletest/include
Expand Down
12 changes: 6 additions & 6 deletions csrc/device_lower/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1375,7 +1375,7 @@ std::string print(const std::monostate&) {
}

std::string print(const Projection& proj) {
return Projection::dispatch(
return Projection::dispatch<std::string>(
[&](const auto& proj) { return print(proj); }, proj);
}

Expand All @@ -1400,7 +1400,7 @@ bool related(const std::monostate&, const ValGroup& to) {
}

bool related(const Projection& proj, const ValGroup& to) {
return Projection::dispatch(
return Projection::dispatch<bool>(
[&](const auto& proj) { return related(proj, to); }, proj);
}

Expand Down Expand Up @@ -1430,7 +1430,7 @@ Val* extent(const std::monostate&) {
}

Val* extent(const Projection& proj) {
return Projection::dispatch(
return Projection::dispatch<Val*>(
[&](const auto& proj) { return extent(proj); }, proj);
}

Expand Down Expand Up @@ -1696,7 +1696,7 @@ Projection propagate(
const ValGraph& id_graph,
const ExprGroup& eg,
Direction direction) {
return Projection::dispatch(
return Projection::dispatch<Projection>(
[&](const auto& proj) {
return propagate(proj, id_graph, eg, direction);
},
Expand Down Expand Up @@ -1757,7 +1757,7 @@ Val* proveLinearAndGetStrideAfterPropagation(
Val* proveLinearAndGetStrideAfterPropagation(
const Projection& proj,
const ValGroups& domain) {
return Projection::dispatch(
return Projection::dispatch<Val*>(
[&](const auto& proj) {
return proveLinearAndGetStrideAfterPropagation(proj, domain);
},
Expand Down Expand Up @@ -2039,7 +2039,7 @@ Projection simplify(Projection projection) {
auto simplified = projection;
do {
projection = simplified;
simplified = Projection::dispatch(
simplified = Projection::dispatch<Projection>(
[&](const auto& projection) { return simplify(projection); },
projection);
} while (simplified.type() != projection.type() || simplified != projection);
Expand Down
8 changes: 4 additions & 4 deletions csrc/multidevice/symmetric_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,11 @@ class SymmetricTensor {
size_t aligned_size_;
bool are_remote_tensors_setup_ = false;
bool is_multicast_setup_ = false;
CUmemGenericAllocationHandle mcast_handle_{};
CUdevice cu_dev_{};
[[maybe_unused]] CUmemGenericAllocationHandle mcast_handle_{};
[[maybe_unused]] CUdevice cu_dev_{};
void* mc_ptr_{nullptr};
int exporter_rank_{-1};
int peer_fd_{-1};
[[maybe_unused]] int exporter_rank_{-1};
[[maybe_unused]] int peer_fd_{-1};
bool is_contiguous_view_setup_ = false;
at::Tensor contiguous_view_;
};
Expand Down
15 changes: 15 additions & 0 deletions csrc/polymorphic_value.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,18 @@ c10::IValue toIValue(const PolymorphicValue& x) {
} // namespace PolymorphicValue_functions

} // namespace nvfuser

// Explicit instantiation of DynamicType for PolymorphicValue.
// This is the single point where the template is fully instantiated; all
// other translation units see only the matching `extern template`
// declaration in polymorphic_value.h, so they skip implicit instantiation.
// The template-argument list here must stay byte-for-byte identical to that
// declaration, or the two will name different specializations.
// Note: This file is compiled with -fvisibility=default (set in CMakeLists.txt)
// to ensure all DynamicType symbols are exported from the shared library.
template struct dynamic_type::DynamicType<
    dynamic_type::Containers<std::vector>,
    nvfuser::StructHandle,
    nvfuser::Pointer,
    nvfuser::Opaque,
    at::Tensor,
    std::complex<double>,
    double,
    int64_t,
    bool>;
13 changes: 13 additions & 0 deletions csrc/polymorphic_value.h
Original file line number Diff line number Diff line change
Expand Up @@ -544,4 +544,17 @@ c10::IValue toIValue(const PolymorphicValue& x);

} // namespace nvfuser

// Prevent implicit instantiation in other TUs - use explicit instantiation from
// polymorphic_value.cpp. Every TU that includes this header will reference the
// single instantiation emitted there instead of re-instantiating the template,
// which is what reduces compile time post-M8.
// The template-argument list must stay byte-for-byte identical to the
// `template struct` definition in polymorphic_value.cpp.
extern template struct dynamic_type::DynamicType<
    dynamic_type::Containers<std::vector>,
    nvfuser::StructHandle,
    nvfuser::Pointer,
    nvfuser::Opaque,
    at::Tensor,
    std::complex<double>,
    double,
    int64_t,
    bool>;

#include <struct.inl>
4 changes: 2 additions & 2 deletions csrc/scheduler/matmul_ampere-.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ AbstractTensor swizzleSharedMemory(TensorView* shared_mem_tv) {

// Extract the constant sizes of the swizzled tile
const int64_t tile_size_x =
swizzle_domain[-2]->extent()->evaluate().as<int64_t>();
swizzle_domain[-2].as<IterDomain*>()->extent()->evaluate().as<int64_t>();
const int64_t tile_size_y =
swizzle_domain[-1]->extent()->evaluate().as<int64_t>();
swizzle_domain[-1].as<IterDomain*>()->extent()->evaluate().as<int64_t>();

// Only tested for (1) ldmatrix access with sizeof(T) == 16bit (i.e.
// half/bfloat16) and (2) epilogue general access with sizeof(T) == 32bit
Expand Down
8 changes: 4 additions & 4 deletions csrc/scheduler/mma_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1127,7 +1127,7 @@ AbstractTensor MmaSwizzler::scheduleMmaOutputAllocation(AbstractTensor t) {

// Assume last 2 dims, for example [M64, N24] or [M64, N24, R]
NVF_ERROR(t.size() >= 2);
bool has_reduction = t[-1]->isReduction();
bool has_reduction = t[-1].as<IterDomain*>()->isReduction();

int64_t m_pos = has_reduction ? -3 : -2;
int64_t n_pos = has_reduction ? -2 : -1;
Expand Down Expand Up @@ -2473,9 +2473,9 @@ std::pair<int64_t, int64_t> analyzeSwizzleSharedMemory(

// Extract the constant sizes of the swizzled tile
const int64_t tile_size_x =
swizzle_domain[-2]->extent()->evaluate().as<int64_t>();
swizzle_domain[-2].as<IterDomain*>()->extent()->evaluate().as<int64_t>();
const int64_t tile_size_y =
swizzle_domain[-1]->extent()->evaluate().as<int64_t>();
swizzle_domain[-1].as<IterDomain*>()->extent()->evaluate().as<int64_t>();

// Only tested for (1) ldmatrix access with sizeof(T) == 16bit (i.e.
// half/bfloat16) and (2) epilogue general access with sizeof(T) == 32bit
Expand Down Expand Up @@ -2717,7 +2717,7 @@ MmaInputSmemSwizzle tmaSwizzleSharedMemory(TensorView* shared_mem_tv) {
AbstractTensor swizzle_domain(shared_mem_tv->getLoopDomain());
// Extract the constant sizes of the swizzled tile
const int64_t inner_dim_size =
swizzle_domain[-1]->extent()->evaluate().as<int64_t>();
swizzle_domain[-1].as<IterDomain*>()->extent()->evaluate().as<int64_t>();

auto dtype = shared_mem_tv->getDataType().value();
const int64_t B128_elements = 128 / dataTypeSizeByte(dtype);
Expand Down
Loading