Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
339731c
M8 Task 1a: Move PolymorphicValue alias to detail namespace
csarofeen Dec 26, 2025
8bbb51f
Revert "M8 Task 1a: Move PolymorphicValue alias to detail namespace"
csarofeen Dec 30, 2025
b2999ee
M8 Task 2a: Create DynamicType split header structure (decl.h, impl.h…
csarofeen Dec 30, 2025
a33f855
M8 Task 3: Add extern template for PolymorphicValue DynamicType
csarofeen Dec 30, 2025
2a056e1
M8 Task 4: Move operator<< to impl.h
csarofeen Dec 30, 2025
7bcace5
M8 Task 5: Move unary operators (+, -, ~, !) to impl.h
csarofeen Dec 30, 2025
7666a7f
M8 Task 6: Move operator* (dereference) to impl.h
csarofeen Dec 30, 2025
95d755f
M8 Task 7: Move prefix ++/-- operators to impl.h
csarofeen Dec 30, 2025
3d45bdb
M8 Task 8: Move postfix ++/-- operators to impl.h
csarofeen Dec 30, 2025
2e4be09
M8 Task 9: Move compound assignment operators to impl.h
csarofeen Dec 30, 2025
6e5f789
M8 Task 10a: Move operator+ to impl.h (binary op pattern validation)
csarofeen Dec 30, 2025
10e1dff
M8 Task 10b: Move all binary operators to impl.h
csarofeen Dec 30, 2025
473ae6b
M8 Task 12: Convert 22 binary operators to friend function pattern fo…
csarofeen Dec 30, 2025
fecdd77
M8 Task 12: Convert remaining operators (unary, ++/--, compound assig…
csarofeen Dec 30, 2025
36a8879
Refactor DynamicType operators to non-template friends with recursion…
csarofeen Dec 31, 2025
4366828
Move getDataType and castToDtype from type.h to type.cpp
csarofeen Dec 31, 2025
1c4484a
Enable narrow PCH for polymorphic_value.h
csarofeen Dec 31, 2025
d9ab351
Extend PCH to test targets
csarofeen Dec 31, 2025
3211649
Expand PCH to include top nvFuser headers
csarofeen Jan 1, 2026
89ae03c
Implement index-based switch dispatch for operator== in DynamicType
csarofeen Jan 1, 2026
d2fdb38
Convert comparison operators to switch-based dispatch to eliminate Fo…
csarofeen Jan 2, 2026
4e50e4a
Extend switch-based dispatch to all binary operators (arithmetic, bit…
csarofeen Jan 2, 2026
518198f
Fix symbol visibility for DynamicType by compiling polymorphic_value.…
csarofeen Jan 2, 2026
ea40a92
Replace dispatch() execution ForAllTypes with switch dispatch
csarofeen Jan 9, 2026
40b0d2b
Require explicit return type for dispatch<ReturnT>(), remove dispatch…
csarofeen Jan 10, 2026
353402e
Replace std::apply with fast_apply to skip noexcept machinery. Wall c…
csarofeen Jan 10, 2026
6362a1e
Replace any_check() with fold + requires pattern for 96% DynamicType …
csarofeen Jan 10, 2026
a16dfcd
Replace belongs_to/has_cross_type_equality with fold expressions, rem…
csarofeen Jan 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,42 @@ endif()

target_link_libraries(codegen_internal PUBLIC LLVM_JIT)

# Precompiled Headers for Top nvFuser Headers
# Post-M8, template instantiation is reduced by 81%, making header parsing
# a significant fraction of build cost. This PCH targets the top 10 heaviest
# nvFuser-controllable headers by exclusive parse time (from M9 Task 4 analysis).
# Enabled by default for Release builds (provides ~50% build time improvement).
#
# Compute the default once so the option text exists in exactly one place.
# NOTE(review): CMAKE_BUILD_TYPE is empty under multi-config generators
# (VS, Xcode, Ninja Multi-Config), so the option silently defaults to OFF
# there; users on those generators must enable it explicitly.
if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
  set(nvfuser_pch_default ON)
else()
  set(nvfuser_pch_default OFF)
endif()
option(NVFUSER_USE_POLYMORPHIC_PCH
       "Use PCH for top nvFuser headers to reduce parse time"
       ${nvfuser_pch_default})

if(NVFUSER_USE_POLYMORPHIC_PCH)
  message(STATUS "Enabling PCH for top 10 nvFuser headers")
  # Top 10 nvFuser headers by exclusive parse time (M9 Task 4 analysis).
  # Keep this list in sync with the shared test-target PCH list defined in
  # add_test_without_main().
  target_precompile_headers(codegen_internal PRIVATE
    "${NVFUSER_SRCS_DIR}/polymorphic_value.h" # 1675s (27.9m)
    "${NVFUSER_ROOT}/lib/dynamic_type/src/dynamic_type/type_traits.h" # 473.6s (7.9m)
    "${NVFUSER_SRCS_DIR}/ir/base_nodes.h" # 284.5s (4.7m)
    "${NVFUSER_SRCS_DIR}/scheduler/tools/abstract_tensor.h" # 162.1s (2.7m)
    "${NVFUSER_SRCS_DIR}/type.h" # 81.6s (1.4m)
    "${NVFUSER_SRCS_DIR}/ir/container.h" # 51.6s (0.9m)
    "${NVFUSER_SRCS_DIR}/serde/fusion_cache_generated.h" # 44.1s (0.7m)
    "${NVFUSER_SRCS_DIR}/iter_visitor.h" # 38.2s (0.6m)
    "${NVFUSER_SRCS_DIR}/ir/internal_nodes.h" # 33.3s (0.6m)
    "${NVFUSER_SRCS_DIR}/ir/interface_nodes.h" # 29.6s (0.5m)
  )
  # polymorphic_value.cpp must NOT consume the PCH: it holds the explicit
  # DynamicType instantiation and is compiled with -fvisibility=default so
  # those symbols are exported, whereas the PCH caches the type with hidden
  # visibility.
  # NOTE(review): -fvisibility=default is GCC/Clang-only, and COMPILE_OPTIONS
  # here *replaces* (not appends to) any per-source options previously set on
  # this file — confirm no other set_source_files_properties touches it.
  set_source_files_properties(
    "${NVFUSER_SRCS_DIR}/polymorphic_value.cpp"
    PROPERTIES
    SKIP_PRECOMPILE_HEADERS ON
    COMPILE_OPTIONS "-fvisibility=default"
  )
endif()

add_library(nvfuser_codegen SHARED $<TARGET_OBJECTS:codegen_internal>)

if (BUILD_CUTLASS AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
Expand Down Expand Up @@ -1109,6 +1145,35 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK)
add_executable(${TEST_NAME} ${TEST_SRC})
set_property(TARGET ${TEST_NAME} PROPERTY CXX_STANDARD ${NVFUSER_CPP_STANDARD})
target_compile_definitions(${TEST_NAME} PRIVATE USE_GTEST)

# Shared PCH for test executables.
# The ten headers below mirror the codegen_internal PCH set. The first test
# target to reach this point becomes the PCH "owner" (recorded in a global
# property) and compiles the headers once; every later test target attaches
# to that PCH via REUSE_FROM instead of recompiling it.
# The codegen_internal PCH cannot be reused here because the test targets
# are built without -fPIC, and PCH reuse requires identical compile flags.
if(NVFUSER_USE_POLYMORPHIC_PCH)
  get_property(nvfuser_test_pch_owner GLOBAL PROPERTY NVFUSER_TEST_PCH_TARGET)
  if(nvfuser_test_pch_owner)
    # An owner already exists: share its precompiled headers.
    target_precompile_headers(${TEST_NAME} REUSE_FROM ${nvfuser_test_pch_owner})
  else()
    # No owner yet: this target builds the PCH for all test executables.
    message(STATUS "Creating shared test PCH on target: ${TEST_NAME}")
    target_precompile_headers(${TEST_NAME} PRIVATE
      "${NVFUSER_SRCS_DIR}/polymorphic_value.h"
      "${NVFUSER_ROOT}/lib/dynamic_type/src/dynamic_type/type_traits.h"
      "${NVFUSER_SRCS_DIR}/ir/base_nodes.h"
      "${NVFUSER_SRCS_DIR}/scheduler/tools/abstract_tensor.h"
      "${NVFUSER_SRCS_DIR}/type.h"
      "${NVFUSER_SRCS_DIR}/ir/container.h"
      "${NVFUSER_SRCS_DIR}/serde/fusion_cache_generated.h"
      "${NVFUSER_SRCS_DIR}/iter_visitor.h"
      "${NVFUSER_SRCS_DIR}/ir/internal_nodes.h"
      "${NVFUSER_SRCS_DIR}/ir/interface_nodes.h"
    )
    set_property(GLOBAL PROPERTY NVFUSER_TEST_PCH_TARGET ${TEST_NAME})
  endif()
endif()

target_include_directories(${TEST_NAME} PRIVATE "${NVFUSER_ROOT}")
target_include_directories(${TEST_NAME} SYSTEM PRIVATE
${NVFUSER_ROOT}/third_party/googletest/googletest/include
Expand Down
12 changes: 6 additions & 6 deletions csrc/device_lower/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1375,7 +1375,7 @@ std::string print(const std::monostate&) {
}

std::string print(const Projection& proj) {
return Projection::dispatch(
return Projection::dispatch<std::string>(
[&](const auto& proj) { return print(proj); }, proj);
}

Expand All @@ -1400,7 +1400,7 @@ bool related(const std::monostate&, const ValGroup& to) {
}

bool related(const Projection& proj, const ValGroup& to) {
return Projection::dispatch(
return Projection::dispatch<bool>(
[&](const auto& proj) { return related(proj, to); }, proj);
}

Expand Down Expand Up @@ -1430,7 +1430,7 @@ Val* extent(const std::monostate&) {
}

Val* extent(const Projection& proj) {
return Projection::dispatch(
return Projection::dispatch<Val*>(
[&](const auto& proj) { return extent(proj); }, proj);
}

Expand Down Expand Up @@ -1696,7 +1696,7 @@ Projection propagate(
const ValGraph& id_graph,
const ExprGroup& eg,
Direction direction) {
return Projection::dispatch(
return Projection::dispatch<Projection>(
[&](const auto& proj) {
return propagate(proj, id_graph, eg, direction);
},
Expand Down Expand Up @@ -1757,7 +1757,7 @@ Val* proveLinearAndGetStrideAfterPropagation(
Val* proveLinearAndGetStrideAfterPropagation(
const Projection& proj,
const ValGroups& domain) {
return Projection::dispatch(
return Projection::dispatch<Val*>(
[&](const auto& proj) {
return proveLinearAndGetStrideAfterPropagation(proj, domain);
},
Expand Down Expand Up @@ -2039,7 +2039,7 @@ Projection simplify(Projection projection) {
auto simplified = projection;
do {
projection = simplified;
simplified = Projection::dispatch(
simplified = Projection::dispatch<Projection>(
[&](const auto& projection) { return simplify(projection); },
projection);
} while (simplified.type() != projection.type() || simplified != projection);
Expand Down
8 changes: 4 additions & 4 deletions csrc/multidevice/symmetric_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,11 @@ class SymmetricTensor {
size_t aligned_size_;
bool are_remote_tensors_setup_ = false;
bool is_multicast_setup_ = false;
CUmemGenericAllocationHandle mcast_handle_{};
CUdevice cu_dev_{};
[[maybe_unused]] CUmemGenericAllocationHandle mcast_handle_{};
[[maybe_unused]] CUdevice cu_dev_{};
void* mc_ptr_{nullptr};
int exporter_rank_{-1};
int peer_fd_{-1};
[[maybe_unused]] int exporter_rank_{-1};
[[maybe_unused]] int peer_fd_{-1};
bool is_contiguous_view_setup_ = false;
at::Tensor contiguous_view_;
};
Expand Down
15 changes: 15 additions & 0 deletions csrc/polymorphic_value.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,18 @@ c10::IValue toIValue(const PolymorphicValue& x) {
} // namespace PolymorphicValue_functions

} // namespace nvfuser

// Explicit instantiation of DynamicType for PolymorphicValue.
// This is the single point where the template is fully instantiated; all
// other translation units see only the matching `extern template`
// declaration in polymorphic_value.h, so they skip implicit instantiation.
// The template-argument list here must stay byte-for-byte identical to that
// declaration, or the two will name different specializations.
// Note: This file is compiled with -fvisibility=default (set in CMakeLists.txt)
// to ensure all DynamicType symbols are exported from the shared library.
template struct dynamic_type::DynamicType<
    dynamic_type::Containers<std::vector>,
    nvfuser::StructHandle,
    nvfuser::Pointer,
    nvfuser::Opaque,
    at::Tensor,
    std::complex<double>,
    double,
    int64_t,
    bool>;
13 changes: 13 additions & 0 deletions csrc/polymorphic_value.h
Original file line number Diff line number Diff line change
Expand Up @@ -544,4 +544,17 @@ c10::IValue toIValue(const PolymorphicValue& x);

} // namespace nvfuser

// Prevent implicit instantiation in other TUs - use explicit instantiation from
// polymorphic_value.cpp. Every TU that includes this header will reference the
// single instantiation emitted there instead of re-instantiating the template,
// which is what reduces compile time post-M8.
// The template-argument list must stay byte-for-byte identical to the
// `template struct` definition in polymorphic_value.cpp.
extern template struct dynamic_type::DynamicType<
    dynamic_type::Containers<std::vector>,
    nvfuser::StructHandle,
    nvfuser::Pointer,
    nvfuser::Opaque,
    at::Tensor,
    std::complex<double>,
    double,
    int64_t,
    bool>;

#include <struct.inl>
4 changes: 2 additions & 2 deletions csrc/scheduler/matmul_ampere-.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ AbstractTensor swizzleSharedMemory(TensorView* shared_mem_tv) {

// Extract the constant sizes of the swizzled tile
const int64_t tile_size_x =
swizzle_domain[-2]->extent()->evaluate().as<int64_t>();
swizzle_domain[-2].as<IterDomain*>()->extent()->evaluate().as<int64_t>();
const int64_t tile_size_y =
swizzle_domain[-1]->extent()->evaluate().as<int64_t>();
swizzle_domain[-1].as<IterDomain*>()->extent()->evaluate().as<int64_t>();

// Only tested for (1) ldmatrix access with sizeof(T) == 16bit (i.e.
// half/bfloat16) and (2) epilogue general access with sizeof(T) == 32bit
Expand Down
8 changes: 4 additions & 4 deletions csrc/scheduler/mma_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1127,7 +1127,7 @@ AbstractTensor MmaSwizzler::scheduleMmaOutputAllocation(AbstractTensor t) {

// Assume last 2 dims, for example [M64, N24] or [M64, N24, R]
NVF_ERROR(t.size() >= 2);
bool has_reduction = t[-1]->isReduction();
bool has_reduction = t[-1].as<IterDomain*>()->isReduction();

int64_t m_pos = has_reduction ? -3 : -2;
int64_t n_pos = has_reduction ? -2 : -1;
Expand Down Expand Up @@ -2473,9 +2473,9 @@ std::pair<int64_t, int64_t> analyzeSwizzleSharedMemory(

// Extract the constant sizes of the swizzled tile
const int64_t tile_size_x =
swizzle_domain[-2]->extent()->evaluate().as<int64_t>();
swizzle_domain[-2].as<IterDomain*>()->extent()->evaluate().as<int64_t>();
const int64_t tile_size_y =
swizzle_domain[-1]->extent()->evaluate().as<int64_t>();
swizzle_domain[-1].as<IterDomain*>()->extent()->evaluate().as<int64_t>();

// Only tested for (1) ldmatrix access with sizeof(T) == 16bit (i.e.
// half/bfloat16) and (2) epilogue general access with sizeof(T) == 32bit
Expand Down Expand Up @@ -2717,7 +2717,7 @@ MmaInputSmemSwizzle tmaSwizzleSharedMemory(TensorView* shared_mem_tv) {
AbstractTensor swizzle_domain(shared_mem_tv->getLoopDomain());
// Extract the constant sizes of the swizzled tile
const int64_t inner_dim_size =
swizzle_domain[-1]->extent()->evaluate().as<int64_t>();
swizzle_domain[-1].as<IterDomain*>()->extent()->evaluate().as<int64_t>();

auto dtype = shared_mem_tv->getDataType().value();
const int64_t B128_elements = 128 / dataTypeSizeByte(dtype);
Expand Down
Loading