From 3b68cf38daf11f1b561a3e92ef252040bdac3bef Mon Sep 17 00:00:00 2001
From: Liqiang Lu
Date: Tue, 6 Jan 2026 11:30:29 -0800
Subject: [PATCH 1/2] pass lparams to lowering pass to support dynamic shapes
 in warp specialized persistent kernel

---
 .../device_lower/analysis/fused_reduction.cpp | 23 ++++-
 csrc/device_lower/lower2device.cpp            |  8 +-
 csrc/device_lower/lower2device.h              |  8 +-
 csrc/parallel_dimension_map.cpp               | 13 ++-
 csrc/runtime/compiled_kernel.cpp              |  6 +-
 csrc/runtime/compiled_kernel.h                |  2 +
 csrc/runtime/executor.cpp                     |  2 +
 .../normalization_inner_outer_tma_ws.cpp      | 16 ++--
 csrc/scheduler/reduction_utils.cpp            | 12 ++-
 .../test_combined_inner_outer_reduction.cpp   |  7 +-
 tests/cpp/test_persistent_buffer.cpp          | 91 ++++++++++++++++++-
 11 files changed, 159 insertions(+), 29 deletions(-)

diff --git a/csrc/device_lower/analysis/fused_reduction.cpp b/csrc/device_lower/analysis/fused_reduction.cpp
index 5155d5681b4..b187cc888e7 100644
--- a/csrc/device_lower/analysis/fused_reduction.cpp
+++ b/csrc/device_lower/analysis/fused_reduction.cpp
@@ -87,7 +87,11 @@ class FusionInspector : private IterVisitor {
   // TIDx and its size is a multiple of warp size (32).
   auto is_static_warp_reduction = [](TensorView* out,
                                      bool has_warp_specialization) {
-    if (!has_warp_specialization) {
+    // Check if bdimx is statically known in launch params
+    bool has_static_bdimx = GpuLower::hasCurrent() &&
+        GpuLower::current()->launchParams().hasDim(ParallelType::TIDx);
+
+    if (!has_warp_specialization && !has_static_bdimx) {
       return false;
     }
 
@@ -97,10 +101,19 @@ class FusionInspector : private IterVisitor {
     for (auto ld : out->getLoopDomain()) {
       if (ld->isReduction()) {
         reduction_count++;
-        if (ld->getParallelType() == ParallelType::TIDx &&
-            ld->extent()->isConst() &&
-            ld->extent()->value().as<int64_t>() % kThreadsPerWarp == 0) {
-          has_valid_tidx_reduction = true;
+        if (ld->getParallelType() == ParallelType::TIDx) {
+          // Get extent either from launch params or from the const extent
+          std::optional<int64_t> extent;
+          if (has_static_bdimx) {
+            extent = GpuLower::current()->launchParams().getDim(
+                ParallelType::TIDx);
+          } else if (ld->extent()->isConst()) {
+            extent = ld->extent()->value().as<int64_t>();
+          }
+
+          if (extent.has_value() && extent.value() % kThreadsPerWarp == 0) {
+            has_valid_tidx_reduction = true;
+          }
         }
       }
     }
diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp
index fe244d2d7da..1f1fe75830f 100644
--- a/csrc/device_lower/lower2device.cpp
+++ b/csrc/device_lower/lower2device.cpp
@@ -205,7 +205,10 @@ void dumpExprsIfEnabled(
   }
 }
 
-GpuLower::GpuLower(Fusion* fusion, const CompileParams& cparams)
+GpuLower::GpuLower(
+    Fusion* fusion,
+    const CompileParams& cparams,
+    const LaunchParams& lparams)
     : passes_(
           // Passes will be executed in the order they are added here
           // Each pass is a pair of (name, function), where the name will be
@@ -234,7 +237,8 @@ GpuLower::GpuLower(Fusion* fusion, const CompileParams& cparams)
           {"KIRCleaner", KIRCleaner::cleanUp},
           {"instrumentKernel", instrumentKernel},
           {"lowerToInlinePtx", lowerToInlinePtx}}),
-      cparams_(cparams) {
+      cparams_(cparams),
+      lparams_(lparams) {
   if (isDebugDumpEnabled(DebugDumpOption::FusionIrMath)) {
     fusion->printMath();
  }
diff --git a/csrc/device_lower/lower2device.h b/csrc/device_lower/lower2device.h
index 8f60f69894d..b88fd0cdc2c 100644
--- a/csrc/device_lower/lower2device.h
+++ b/csrc/device_lower/lower2device.h
@@ -64,7 +64,8 @@ class GpuLower : public NonCopyable {
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
   NVF_API explicit GpuLower(
       Fusion* fusion,
-      const CompileParams& cparams = CompileParams());
+      const CompileParams& cparams = CompileParams(),
+      const LaunchParams& lparams = LaunchParams());
 
   NVF_API kir::Kernel* kernel() const;
 
@@ -83,6 +84,10 @@ class GpuLower : public NonCopyable {
     return cparams_.index_type.value();
   }
 
+  const LaunchParams& launchParams() const {
+    return lparams_;
+  }
+
   const auto& minDeviceVersion() const {
     return min_device_version_;
   }
@@ -391,6 +396,7 @@ class GpuLower : public NonCopyable {
   kir::KernelPerformanceProfile profile_;
   std::unordered_set<Split*> divisible_splits_;
   CompileParams cparams_;
+  LaunchParams lparams_;
   std::unique_ptr<TensorIndexer> tensor_indexer_;
   std::unordered_map<TensorView*, const TMAInfo> consumer_to_tma_info_;
   std::pair<int64_t, int64_t> dec_inc_register_usage = {-1, -1};
diff --git a/csrc/parallel_dimension_map.cpp b/csrc/parallel_dimension_map.cpp
index 3ee6ca64b39..7e607da3300 100644
--- a/csrc/parallel_dimension_map.cpp
+++ b/csrc/parallel_dimension_map.cpp
@@ -159,10 +159,15 @@ int64_t ParallelDimensionMap::getThreadCountInDim(ParallelType pt) {
   if (dim_map_.at(pt)->isConstScalar()) {
     return dim_map_.at(pt)->value().as<int64_t>();
   }
-  // Return -1 for dynamic dimensions, this disables register sharing on
-  // dynamic dimensions since we can't guarantee the number of threads is
-  // divisible by 128. We may allow this in the future and delegate this
-  // check to a point where the launch parameters are known.
+  // If the dimension is dynamic but launch parameters are available,
+  // use the actual launch parameter value.
+  if (GpuLower::hasCurrent() &&
+      GpuLower::current()->launchParams().hasDim(pt)) {
+    return GpuLower::current()->launchParams().getDim(pt);
+  }
+  // Return -1 for dynamic dimensions when launch parameters are not known;
+  // this disables register sharing on dynamic dimensions since we can't
+  // guarantee the number of threads is divisible by 128.
   return -1;
 }
 
diff --git a/csrc/runtime/compiled_kernel.cpp b/csrc/runtime/compiled_kernel.cpp
index bc9eefebcc1..f740fcb36b9 100644
--- a/csrc/runtime/compiled_kernel.cpp
+++ b/csrc/runtime/compiled_kernel.cpp
@@ -1225,6 +1225,7 @@ void queryTargetGPUVersion(
 NVF_API CompiledKernel::CompiledKernel(
     Fusion* fusion,
     CompileParams compile_params,
+    const LaunchParams& launch_params,
     c10::Device device,
     SchedulerType scheduler_type,
     int64_t fusion_id,
@@ -1241,7 +1242,8 @@ NVF_API CompiledKernel::CompiledKernel(
           runtime_id,
           group_id),
       compile_params_(compile_params),
-      lowered_(std::make_unique<GpuLower>(fusion, compile_params)) {
+      lowered_(
+          std::make_unique<GpuLower>(fusion, compile_params, launch_params)) {
   FUSER_PERF_SCOPE("CompiledKernel::CompiledKernel");
 
   // TODO: No hooks can be sent because this is in the constructor
@@ -1270,6 +1272,7 @@ NVF_API CompiledKernel::CompiledKernel(
 NVF_API CompiledKernel::CompiledKernel(
     Fusion* fusion,
     CompileParams compile_params,
+    const LaunchParams& launch_params,
     c10::Device device,
     SchedulerType scheduler_type,
     int64_t fusion_id,
@@ -1279,6 +1282,7 @@ NVF_API CompiledKernel::CompiledKernel(
     : CompiledKernel(
           fusion,
           compile_params,
+          launch_params,
          device,
          scheduler_type,
          fusion_id,
diff --git a/csrc/runtime/compiled_kernel.h b/csrc/runtime/compiled_kernel.h
index 43ac97e1ef8..d2ffcfb3597 100644
--- a/csrc/runtime/compiled_kernel.h
+++ b/csrc/runtime/compiled_kernel.h
@@ -186,6 +186,7 @@ class CompiledKernel : public CompiledKernelBase {
   NVF_API CompiledKernel(
       Fusion* fusion,
       CompileParams compile_params,
+      const LaunchParams& launch_params,
       c10::Device device,
       SchedulerType scheduler_type,
       int64_t fusion_id,
@@ -199,6 +200,7 @@ class CompiledKernel : public CompiledKernelBase {
   NVF_API CompiledKernel(
       Fusion* fusion,
       CompileParams compile_params,
+      const LaunchParams& launch_params,
       c10::Device device,
       SchedulerType scheduler_type = SchedulerType::None,
       int64_t fusion_id = 0,
diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp
index baa478f095f..e730c54a13a 100644
--- a/csrc/runtime/executor.cpp
+++ b/csrc/runtime/executor.cpp
@@ -242,6 +242,7 @@ void KernelExecutor::compile(
   compiled_kernel_ = std::make_unique<CompiledKernel>(
       fusion,
       compile_params,
+      launch_constraints,
       device,
       scheduler_type,
       fusion_id_,
@@ -1608,6 +1609,7 @@ void KernelExecutor::deserialize(
   compiled_kernel_ = std::make_unique<CompiledKernel>(
       _fusion,
       compile_params,
+      LaunchParams(),
       device,
       scheduler_type,
       fusion_id,
diff --git a/csrc/scheduler/normalization_inner_outer_tma_ws.cpp b/csrc/scheduler/normalization_inner_outer_tma_ws.cpp
index 7062f4ef881..6b71e484a65 100644
--- a/csrc/scheduler/normalization_inner_outer_tma_ws.cpp
+++ b/csrc/scheduler/normalization_inner_outer_tma_ws.cpp
@@ -462,12 +462,16 @@ void scheduleOuterReduction(
     outer_reduction_tv->axis(axisID--)->parallelize(ParallelType::Vectorize);
   }
 
-  if (rparams->lparams.bdimx() > 1) {
-    int64_t compute_bdimx = reduction_scheduler_utils::getComputeBdimx(
-        rparams->circular_buffer_options, rparams->lparams.bdimx());
-    outer_reduction_tv->split(axisID, compute_bdimx);
-    outer_reduction_tv->axis(axisID--)->parallelize(ParallelType::TIDx);
-  }
+  // if (rparams->lparams.bdimx() > 1) {
+  //   int64_t compute_bdimx = reduction_scheduler_utils::getComputeBdimx(
+  //       rparams->circular_buffer_options, rparams->lparams.bdimx());
+  //   outer_reduction_tv->split(axisID, compute_bdimx);
+  //   outer_reduction_tv->axis(axisID--)->parallelize(ParallelType::TIDx);
+  // }
+
+  outer_reduction_tv->split(
+      axisID, rparams->batches_per_block_inner_reduction, false);
+  outer_reduction_tv->axis(axisID--)->parallelize(ParallelType::TIDx);
 
   if (rparams->combined_split_grid_inner_dim) {
     outer_reduction_tv->split(
diff --git a/csrc/scheduler/reduction_utils.cpp b/csrc/scheduler/reduction_utils.cpp
index 47f66550f6f..00f7d9d0ab3 100644
--- a/csrc/scheduler/reduction_utils.cpp
+++ b/csrc/scheduler/reduction_utils.cpp
@@ -129,9 +129,15 @@ TensorView* scheduleReductionTV(
       // Reduction: [Persistent, TIDx, Vect]
       vectorize(inner_reduce_axis, rparams->unroll_factor_inner_reduction);
 
-      // static bdimx is required for TMA warp specialization
-      int64_t compute_bdimx = getComputeBdimx(option, rparams->lparams.bdimx());
-      inner_parallel_static(inner_reduce_axis, ParallelType::TIDx, compute_bdimx);
+      reduction_tv->split(
+          inner_reduce_axis, rparams->batches_per_block_inner_reduction, false);
+      reduction_tv->axis(inner_reduce_axis + 1)->parallelize(ParallelType::TIDx);
+      reduction_tv->axis(inner_reduce_axis + 1)->padToMultipleOfWarp();
+
+      // // static bdimx is required for TMA warp specialization
+      // int64_t compute_bdimx = getComputeBdimx(option,
+      // rparams->lparams.bdimx()); inner_parallel_static(inner_reduce_axis,
+      // ParallelType::TIDx, compute_bdimx);
 
       // Iteration: [I/Unroll/BIDy, BIDy, Unroll]
       if (rparams->unroll_factor_iter_dom > 1) {
diff --git a/tests/cpp/test_combined_inner_outer_reduction.cpp b/tests/cpp/test_combined_inner_outer_reduction.cpp
index 31849df0b8a..a24475e2c0f 100644
--- a/tests/cpp/test_combined_inner_outer_reduction.cpp
+++ b/tests/cpp/test_combined_inner_outer_reduction.cpp
@@ -1087,8 +1087,11 @@ TEST_P(TmaWarpSpecializedTest, SimpleFusion) {
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
-  auto tv0 = makeContigConcreteTensor({dim0, dim1}, dtype);
-  auto tv1 = makeContigConcreteTensor({dim0, dim1}, dtype);
+  // For case contig_1_dtype_float_batch_2048_hidden_8192,
+  // the performance is 59.7% SOL using concrete inputs;
+  // for symbolic inputs, the performance is 59.1% SOL.
+  auto tv0 = makeContigTensor(2, dtype);
+  auto tv1 = makeContigTensor(2, dtype);
   fusion->addInput(tv0);
   fusion->addInput(tv1);
   tv0 = maybeCastOp(DataType::Float, tv0);
diff --git a/tests/cpp/test_persistent_buffer.cpp b/tests/cpp/test_persistent_buffer.cpp
index 14e7565c076..994a729777c 100644
--- a/tests/cpp/test_persistent_buffer.cpp
+++ b/tests/cpp/test_persistent_buffer.cpp
@@ -2196,8 +2196,8 @@ TEST_P(TmaPersistentTestP, TmaInnerPersistentRmsNorm) {
   const float kEps = 1e-6;
   Val* eps_ptr = IrBuilder::create<Val>(kEps);
 
-  auto tv0 = makeContigConcreteTensor({x, y}, dtype);
-  auto tv1 = makeContigConcreteTensor({y}, dtype);
+  auto tv0 = makeContigTensor(2, dtype);
+  auto tv1 = makeContigTensor(1, dtype);
   fusion.addInput(tv0);
   fusion.addInput(tv1);
   tv0 = maybeCastOp(DataType::Float, tv0);
@@ -2271,7 +2271,7 @@ TEST_P(TmaPersistentTestP, TmaInnerPersistentSoftmax) {
   auto& fusion = *fusion_ptr;
   FusionGuard fg(fusion_ptr.get());
 
-  auto tv0 = makeContigTensor(2, dtype);
+  auto tv0 = makeContigConcreteTensor({x, y}, dtype);
   fusion.addInput(tv0);
   tv0 = maybeCastOp(DataType::Float, tv0);
   auto res = softmax(tv0, 1);
@@ -2297,8 +2297,8 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Combine(
         testing::Values(DataType::BFloat16),
         testing::Values(
-            deviceSMCount() / 2,
-            1024), // batch size, less or larger than sm count
+            deviceSMCount() / 2, // small batch, can't do grid stride loop
+            2048), // batch size, less or larger than sm count
         testing::ValuesIn(Pow2Vals1to1Million)), // hidden size
     [](const testing::TestParamInfo<std::tuple<DataType, int64_t, int64_t>>&
           info) {
      auto dtype = std::get<0>(info.param);
@@ -2308,4 +2308,85 @@ INSTANTIATE_TEST_SUITE_P(
       os << dtype << "_" << x << "_" << y;
       return os.str();
     });
+
+// Test that kernels with different launch parameters are not incorrectly
+// reused. This ensures that LaunchParams is properly included in the cache
+// key.
+TEST_F(TmaPersistentTestF, KernelReuse) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  auto& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  // Create an RMS norm fusion that will use the inner persistent scheduler
+  const float kEps = 1e-6;
+  Val* eps_ptr = IrBuilder::create<Val>(kEps);
+
+  auto tv0 = makeContigTensor(2, DataType::BFloat16);
+  auto tv1 = makeContigTensor(1, DataType::BFloat16);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  tv0 = maybeCastOp(DataType::Float, tv0);
+  tv1 = maybeCastOp(DataType::Float, tv1);
+  auto rms_norm_results = rms_norm(tv0, 1, tv1, eps_ptr);
+  auto output = maybeCastOp(DataType::BFloat16, rms_norm_results.output);
+  fusion.addOutput(output);
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+
+  auto options = at::TensorOptions().dtype(at::kBFloat16).device(at::kCUDA, 0);
+
+  // Helper to get the number of compiled kernel runtimes
+  auto numRuntimes = [&executor_cache]() -> size_t {
+    // getKernelRuntimes() maps a (device, concretization) pair to a vector
+    // of runtimes
+    const auto& runtime_map = executor_cache.getKernelRuntimes();
+    if (runtime_map.empty()) {
+      return 0;
+    }
+    return runtime_map
+        .begin() // There should be only one device/concretization pair
+        ->second.size();
+  };
+
+  // First run with specific dimensions that will produce launch config A
+  auto input1 = at::randn({2048, 4096}, options);
+  auto weight1 = at::randn({4096}, options);
+
+  auto output1 = executor_cache.runFusionWithInputs({input1, weight1});
+  testValidate(
+      executor_cache.fusion(), output1, {input1, weight1}, __LINE__, __FILE__);
+
+  EXPECT_EQ(numRuntimes(), 1) << "First run should compile one kernel";
+
+  FusionKernelRuntime* first_runtime =
+      executor_cache.getMostRecentKernelRuntime();
+
+  // Second run with a different outer dimension - should reuse the kernel
+  auto input2 = at::randn({2048 + 8, 4096}, options);
+  auto weight2 = at::randn({4096}, options);
+
+  auto output2 = executor_cache.runFusionWithInputs({input2, weight2});
+  testValidate(
+      executor_cache.fusion(), output2, {input2, weight2}, __LINE__, __FILE__);
+
+  EXPECT_EQ(numRuntimes(), 1)
+      << "A different outer dimension should reuse the existing kernel";
+
+  FusionKernelRuntime* second_runtime =
+      executor_cache.getMostRecentKernelRuntime();
+  EXPECT_EQ(first_runtime, second_runtime)
+      << "Should reuse the same runtime when only the outer dimension changes";
+
+  // Third run with a slightly different inner dimension - may compile a new
+  // kernel
+  auto input3 = at::randn({2048 + 8, 4096 - 8}, options);
+  auto weight3 = at::randn({4096 - 8}, options);
+
+  auto output3 = executor_cache.runFusionWithInputs({input3, weight3});
+  testValidate(
+      executor_cache.fusion(), output3, {input3, weight3}, __LINE__, __FILE__);
+
+  // If launch params are properly included in the cache key, this should
+  // compile a new kernel
+  EXPECT_GE(numRuntimes(), 1)
+      << "Different dimensions may create new kernel if launch params differ";
+}
+
 } // namespace nvfuser

From 7397013eed1f452ccd7b7f9ba925304113a48006 Mon Sep 17 00:00:00 2001
From: Liqiang Lu
Date: Thu, 8 Jan 2026 09:44:55 -0800
Subject: [PATCH 2/2] fix

---
 .../normalization_inner_outer_tma_ws.cpp | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/csrc/scheduler/normalization_inner_outer_tma_ws.cpp b/csrc/scheduler/normalization_inner_outer_tma_ws.cpp
index 6b71e484a65..7062f4ef881 100644
--- a/csrc/scheduler/normalization_inner_outer_tma_ws.cpp
+++ b/csrc/scheduler/normalization_inner_outer_tma_ws.cpp
@@ -462,16 +462,12 @@ void scheduleOuterReduction(
     outer_reduction_tv->axis(axisID--)->parallelize(ParallelType::Vectorize);
   }
 
-  // if (rparams->lparams.bdimx() > 1) {
-  //   int64_t compute_bdimx = reduction_scheduler_utils::getComputeBdimx(
-  //       rparams->circular_buffer_options, rparams->lparams.bdimx());
-  //   outer_reduction_tv->split(axisID, compute_bdimx);
-  //   outer_reduction_tv->axis(axisID--)->parallelize(ParallelType::TIDx);
-  // }
-
-  outer_reduction_tv->split(
-      axisID, rparams->batches_per_block_inner_reduction, false);
-  outer_reduction_tv->axis(axisID--)->parallelize(ParallelType::TIDx);
+  if (rparams->lparams.bdimx() > 1) {
+    int64_t compute_bdimx = reduction_scheduler_utils::getComputeBdimx(
+        rparams->circular_buffer_options, rparams->lparams.bdimx());
+    outer_reduction_tv->split(axisID, compute_bdimx);
+    outer_reduction_tv->axis(axisID--)->parallelize(ParallelType::TIDx);
+  }
 
   if (rparams->combined_split_grid_inner_dim) {
     outer_reduction_tv->split(
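
The mechanism PATCH 1/2 relies on can be sketched in isolation. The following is a minimal illustrative sketch, not nvFuser code: LaunchParamsSketch, staticBdimx, and isStaticWarpReduction are hypothetical reductions of nvFuser's LaunchParams and the is_static_warp_reduction logic in fused_reduction.cpp. It shows the fallback order the patch introduces: prefer the launch-parameter value for TIDx when the executor has bound one, otherwise fall back to a constant IR extent, and only then give up.

// Illustrative sketch only; LaunchParamsSketch and staticBdimx are
// simplified stand-ins, not nvFuser's real types.
#include <cstdint>
#include <optional>

constexpr int64_t kThreadsPerWarp = 32;

// Stand-in for LaunchParams: bdimx == -1 means TIDx is not bound.
struct LaunchParamsSketch {
  int64_t bdimx = -1;
  bool hasTIDx() const {
    return bdimx != -1;
  }
};

// Prefer the compile-time-bound launch parameter; otherwise fall back to a
// constant extent from the IR, which may be absent for dynamic shapes.
std::optional<int64_t> staticBdimx(
    const LaunchParamsSketch& lparams,
    std::optional<int64_t> const_extent) {
  if (lparams.hasTIDx()) {
    return lparams.bdimx;
  }
  return const_extent;
}

// Mirrors the decision in fused_reduction.cpp: a TIDx reduction is a valid
// static warp reduction only if the resolved size is a multiple of 32.
bool isStaticWarpReduction(
    const LaunchParamsSketch& lparams,
    std::optional<int64_t> const_extent) {
  auto bdimx = staticBdimx(lparams, const_extent);
  return bdimx.has_value() && *bdimx % kThreadsPerWarp == 0;
}

With a dynamic input shape the IR extent is symbolic, so const_extent is empty and the old check always failed; once the executor forwards its launch constraints into lowering, bdimx is still known at compile time and the warp reduction remains provable.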