Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
fe90fb5
PR0: Relax assert on non-device split on allocation domain
jjsjann123 Sep 18, 2025
a0df5e9
relaxing the check
jjsjann123 Sep 18, 2025
5097533
Adding test validating vectorization
jjsjann123 Sep 18, 2025
d4b7c8b
renaming
jjsjann123 Sep 19, 2025
4b07e79
clangformat
jjsjann123 Sep 19, 2025
051fc9e
I think it's working now!
jjsjann123 Sep 19, 2025
bf85c0b
clangformat
jjsjann123 Sep 19, 2025
6ff1050
quick patch
jjsjann123 Sep 22, 2025
f2f43be
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Sep 22, 2025
a32f54b
fix clearing allocation domain on cache for cacheBefore
jjsjann123 Sep 22, 2025
cf6e609
revert changes
jjsjann123 Sep 23, 2025
b303923
updating tests
jjsjann123 Sep 23, 2025
17dbf23
i was dumb as always
jjsjann123 Sep 23, 2025
c2a3aeb
why is it so hard for me
jjsjann123 Sep 23, 2025
1a156be
Apply suggestions from code review
jjsjann123 Sep 23, 2025
2081d0c
clangformat
jjsjann123 Sep 23, 2025
6f674aa
Merge branch 'main' into jj/allocation_PR_0
jjsjann123 Sep 23, 2025
4f8ecfc
Merge remote-tracking branch 'origin/main' into jj/allocation_PR_0
jjsjann123 Sep 26, 2025
ee37038
reverting selfReplay & cacheBefore changes per reviewer's comments
jjsjann123 Sep 26, 2025
f87e99d
wip
jjsjann123 Sep 26, 2025
c5155ff
wip
jjsjann123 Sep 26, 2025
ded16ec
wip
jjsjann123 Sep 26, 2025
f02440c
wip
jjsjann123 Sep 26, 2025
aa084bc
errr zip
jjsjann123 Sep 26, 2025
f82ad1f
wip
jjsjann123 Sep 26, 2025
d9a33d8
err, forgot to push something last night
jjsjann123 Sep 26, 2025
6dda5e2
typo
jjsjann123 Sep 26, 2025
173a7e9
skipping checks
jjsjann123 Sep 26, 2025
98654a0
wip
jjsjann123 Sep 26, 2025
a870f9d
relaxing checks in tests
jjsjann123 Sep 26, 2025
bca1734
wip
jjsjann123 Sep 26, 2025
d91ac03
clean up IDs for cacheBefore
jjsjann123 Sep 26, 2025
eff3069
clear up definition of output TV for cacheBefore
jjsjann123 Sep 26, 2025
2105e1e
fixing one alias test!
jjsjann123 Sep 27, 2025
bdaaccb
wip
jjsjann123 Sep 27, 2025
fdf9dba
fixing definition
jjsjann123 Sep 27, 2025
65022bd
wip
jjsjann123 Sep 27, 2025
5431648
not set allocation domain when original output doesn't have it
jjsjann123 Sep 27, 2025
75b06b5
update output itertype
jjsjann123 Sep 27, 2025
c2bf4cf
wip
jjsjann123 Sep 27, 2025
c5d66b6
wip
jjsjann123 Sep 27, 2025
b1836f5
wip
jjsjann123 Sep 27, 2025
7ee9317
fixing contiguity in fullselfreplay
jjsjann123 Sep 27, 2025
f7bbab2
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Sep 27, 2025
afccea0
fixing transpose tests
jjsjann123 Sep 27, 2025
482afc8
set parallelization type after fullselfreplay
jjsjann123 Sep 27, 2025
255055d
fix mark alias
jjsjann123 Oct 1, 2025
599d809
fixing alias analysis
jjsjann123 Oct 1, 2025
eadf148
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Oct 24, 2025
fe9c1f6
quick patch on nvfuser::schedule_matmul::Common::cacheBefore
jjsjann123 Oct 24, 2025
f927809
quick patch on nvfuser::schedule_matmul::Common::updateIdModel
jjsjann123 Oct 25, 2025
493434a
agent you can do better!
jjsjann123 Oct 25, 2025
92fb6f9
err
jjsjann123 Oct 25, 2025
642d9a8
Merge branch 'main' into jj/allocation_PR_0
jjsjann123 Oct 28, 2025
10eb4e0
Merge branch 'main' into jj/allocation_PR_0
jjsjann123 Oct 30, 2025
6ff66f2
try self replay so allocation domain is preserved for multi device
jjsjann123 Oct 30, 2025
2950054
err revert something that's not working
jjsjann123 Oct 30, 2025
a30432c
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Nov 12, 2025
b1e7352
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Nov 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions csrc/alias_analysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ bool okToRelayout(
const TensorView* tv,
const Layout& new_layout,
const EmptyAllocationAs empty_allocation_as) {
// we can merge this with the one below
// when using logical domain as the allocation domain, we can basically ignore the layout when it's not used by codegen
if (empty_allocation_as == EmptyAllocationAs::kLogical && !ir_utils::canUsePresetAllocationDomain(tv, false)) {
return true;
}

if (empty_allocation_as == EmptyAllocationAs::kUndetermined &&
!tv->hasAllocation()) {
return true;
Expand Down
3 changes: 3 additions & 0 deletions csrc/device_lower/validation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -963,6 +963,9 @@ class VectorizeValidator : public OptInDispatch {
TensorView* tv,
std::string name,
int64_t vector_word_size_bit) {
if (!ir_utils::canUsePresetAllocationDomain(tv)) {
return;
}
// aten_element_size_bit is the minimum unit (one element) of tv's
// corresponding at::Tensor. It may or may not be the same as
// dataTypeSizeBit(tv->dtype()), because we support non-ATen data types as
Expand Down
5 changes: 5 additions & 0 deletions csrc/ir/internal_base_nodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,11 @@ class NVF_API IterDomain : public Val {
return getIterType() == IterType::Iteration;
}

//! Overwrites the rfactor-product flag of this IterDomain and returns
//! `this` so calls can be chained. With no argument the flag is cleared,
//! which is the common case when an ID is reused in a domain where it no
//! longer acts as an rfactor product (e.g. during cacheBefore).
IterDomain* resetRFactorProduct(bool is_rfactor_domain = false) {
  is_rfactor_domain_ = is_rfactor_domain;
  return this;
}

bool isRFactorProduct() const {
return is_rfactor_domain_;
}
Expand Down
52 changes: 52 additions & 0 deletions csrc/ir/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1747,4 +1747,56 @@ bool isParallelizedBy(const std::vector<IterDomain*>& ids, ParallelType pt) {
ids, [&](IterDomain* id) { return id->getParallelType() == pt; });
}

//! Returns true when the allocation domain explicitly set on `tv` should be
//! honored by downstream code (lowering/validation/alias analysis).
//!
//! \param tv the tensor whose preset allocation domain is being queried
//! \param ignore_empty_alloc when true, a tensor without an explicitly set
//!        allocation domain immediately returns false
bool canUsePresetAllocationDomain(
    const TensorView* tv,
    bool ignore_empty_alloc) {
  // Without an explicitly set allocation domain there is nothing to honor.
  if (ignore_empty_alloc && !tv->hasAllocation()) {
    return false;
  }
  // Honor the allocation domain if the tensor is global or Hopper MMA's
  // output. Note: fusion inputs have no definition, so guard before
  // dereferencing definition().
  if (tv->getMemoryType() == MemoryType::Global ||
      (tv->definition() != nullptr && tv->definition()->isA<MmaOp>() &&
       isHopper(tv->definition()->as<MmaOp>()->macro()))) {
    return true;
  }
  // If it's a shared memory tensor, the set domain is likely
  // valid if Swizzle or Bulk is used. Also, if the allocation
  // domain is just a permutation of the loop domain, use the
  // set allocation domain. This seems to happen only with
  // AllocationDomainTest.TransposedIntermediate.
  if (tv->getMemoryType() == MemoryType::Shared) {
    if (std::ranges::any_of(
            tv->getAllocationDomain(),
            [](IterDomain* allocation_domain) {
              // definition() may be null here; dynamic_cast on nullptr is
              // safe and yields nullptr.
              return dynamic_cast<Swizzle*>(
                         allocation_domain->definition()) != nullptr ||
                  allocation_domain->getParallelType() == ParallelType::Bulk;
            }) ||
        std::is_permutation(
            tv->getLoopDomain().begin(),
            tv->getLoopDomain().end(),
            tv->getAllocationDomain().begin(),
            tv->getAllocationDomain().end())) {
      return true;
    }

    // Honor the set allocation domain if the tensor is used by a
    // TMA store or MmaOp
    if (std::ranges::any_of(tv->uses(), [](Expr* expr) {
          return ir_utils::isCpAsyncBulkStore(expr) || expr->isA<MmaOp>();
        })) {
      return true;
    }

    // If a shared memory output produced by scatter has an
    // allocation domain explicitly set, it's likely to be the
    // valid allocation domain.
    if (auto def = tv->definition();
        def != nullptr && def->isA<ScatterOp>()) {
      return true;
    }
  }
  return false;
}

} // namespace nvfuser::ir_utils
2 changes: 2 additions & 0 deletions csrc/ir/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -861,4 +861,6 @@ std::vector<IterDomain*> propagateScatterAllocationDomain(

bool isParallelizedBy(const std::vector<IterDomain*>& ids, ParallelType pt);

bool canUsePresetAllocationDomain(const TensorView* tv, bool ignore_empty_alloc=true);

} // namespace nvfuser::ir_utils
23 changes: 17 additions & 6 deletions csrc/scheduler/matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,21 @@ void Common::updateIdModel() {
// IdModel
std::unordered_map<ValGroup, MatmulDimRole> new_id_roles;
for (auto& [k, v] : id_roles_) {
const ValGroup& new_group = new_graph.toGroup(k->front());
new_id_roles.emplace(new_group, v);
// We need to traverse the ValGroup to find the remaining IDs that survive in the new id_model. This is because cacheBefore may have eliminated the reduction ID.

// e.g.
// output [m4, n5, rk6] = mma(A [m0, k1], B [n2, k3])
// would become
// cache [m6, n7, rk8] = mma(A [m0, k1], B [n2, k3])
// output [m4, n5 ] = set(cache [m6, n7, rk8])
//
// So the old role rK6 wouldn't be mapped in new_graph.
auto old_vg = std::ranges::find_if(k->vector(), [&new_graph](Val* vg){return new_graph.hasGroup(vg);});
NVF_ERROR(
old_vg != k->vector().end(),
"Old ValGroup not found in new ValGraph"
);
new_id_roles.emplace(new_graph.toGroup(*old_vg), v);
}
id_roles_ = new_id_roles;
}
Expand Down Expand Up @@ -290,10 +303,8 @@ TensorView* Common::cacheBefore(TensorView* orig, LoadStoreOpType op_type) {
const std::vector<IterDomain*> cache_logical = c->getLogicalDomain();
NVF_ERROR(orig_logical.size() == cache_logical.size());
for (size_t i : arange(orig_logical.size())) {
// The domain of orig gets transferred to c and a new domain is applied to
// orig
ValGroup vg = graph_->toGroup(cache_logical[i]);
graph_->initializeVal(orig_logical[i], vg);
ValGroup vg = graph_->toGroup(orig_logical[i]);
graph_->initializeVal(cache_logical[i], vg);
}

return c;
Expand Down
108 changes: 99 additions & 9 deletions csrc/tensor_view.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1128,18 +1128,105 @@ TensorView* TensorView::cacheBefore(LoadStoreOpType op_type) {
"before computeAt.");
}

// Create Producer Domain
// This domain will be the consumer which needs a new domain, so replace the
// producers domain with this domain.

auto* producer = IrBuilder::createInContainer<TensorView>(
container(),
IrBuilder::createInContainer<TensorDomain>(container(), domain()),
getDataType().value());
// TODO: 1. test reshape; 2. test reduction
// We want the producer domain to preserve `root` & `logical`
// meanwhile, we want consumer Tensor to preserve `logical` & `allocation` (while erasing all reductions).

TensorView* producer;

if (definition()->isA<ScatterOp>()) {
// TODO: is there any way to replay a scatter op?!
// scatter output's loop is not connected to its root.
NVF_ERROR(!domain()->hasRoot(), "scatter output's root is not replayed in cacheBefore");

std::vector<IterDomain*> logical;
std::vector<IterDomain*> loop;
std::unordered_map<IterDomain*, IterDomain*> map_cloned_ids;

std::ranges::transform(domain()->logical(), std::back_inserter(logical), [&](IterDomain* id) {
IterDomain* cloned_id = IrBuilder::createInContainer<IterDomain>(container(), id);
map_cloned_ids[id] = cloned_id;
return cloned_id;
});
std::ranges::transform(domain()->loop(), std::back_inserter(loop), [&](IterDomain* id) {
if (auto it = map_cloned_ids.find(id); it != map_cloned_ids.end()) {
// reuse cloned_ids
return it->second;
}
return IrBuilder::createInContainer<IterDomain>(container(), id);
});
producer = IrBuilder::createInContainer<TensorView>(
container(),
IrBuilder::createInContainer<TensorDomain>(container(), logical, loop, TensorDomain::getContiguityFilledWith(logical, true), /*skip_loop_validation=*/true),
getDataType().value());
// TODO: we are not replaying the loop domain from consumer to producer, is that the right thing to do?!
} else {
// Create Producer Domain
// We only need root for full self replay.
std::vector<IterDomain*> root;
std::ranges::transform(domain()->hasRoot()?domain()->root():domain()->logical(), std::back_inserter(root), [&](IterDomain* id) {
return IrBuilder::createInContainer<IterDomain>(container(), id);
});

producer = IrBuilder::createInContainer<TensorView>(
container(),
IrBuilder::createInContainer<TensorDomain>(container(), root, root, root, TensorDomain::getContiguityFilledWith(root, true)),
getDataType().value());
// replay from `root`->`loop` on producer
producer->setDomain(TransformReplay::fullSelfReplay(producer->domain(), domain()));
}

// clean up consumer domain to wipe out root and all reduction IDs
std::vector<IterDomain*> logical_dom;
std::vector<IterDomain*> alloc_dom;
std::vector<IterDomain*> loop_dom;
std::vector<std::optional<bool>> contiguity;

// NOTE: I need to clear definition otherwise BestEffortReplay will not work with dangling sources
// create an issue for this, use the example from ./bin/test_nvfuser --gtest_filter="PointwiseTest.VectorizeWithBroadcastAndReshape1"
// copy non-reduction IDs onto logical and loop
std::ranges::copy_if(
domain()->logical() | std::views::transform([](IterDomain* id) { id->setDefinition(nullptr); return id->resetRFactorProduct(); }),
std::back_inserter(logical_dom),
[](IterDomain* id) {return !id->isReduction();});
if (definition()->isA<ScatterOp>()) {
// NOTE: this doesn't feel right. we would still want to replay the loop domain
// we are basically dropping transformations on loop domain for scatter op during cacheBefore
loop_dom = logical_dom;
} else {
std::ranges::copy_if(
domain()->loop() | std::views::transform([](IterDomain* id) { return id->resetRFactorProduct(); }),
std::back_inserter(loop_dom),
[](IterDomain* id) {return !id->isReduction();});
}
for (auto&& [id, c] : zip(domain()->hasAllocation() ? domain()->allocation() : domain()->logical(), domain()->contiguity())) {
if (id->isReduction()) {
continue;
}
id->resetRFactorProduct();
if (domain()->hasAllocation()) {
alloc_dom.push_back(id);
}
contiguity.push_back(c);
}
// TODO: We also need to clear all rfactor across IDs between logical->loop and logical->allocation.

// Set domain of consumer
TensorView* consumer = this;

consumer->setDomain(IrBuilder::createInContainer<TensorDomain>(
container(),
std::vector<IterDomain*>{},
logical_dom,
alloc_dom,
loop_dom,
contiguity));

// TODO: figure out scatter special handling.
// if (!producer->definition()->isA<ScatterOp>()) {
// } else {
// }

/* FIXME
std::vector<IterDomain*> new_logical_domain;
new_logical_domain.reserve(getLogicalDomain().size());
for (IterDomain* dom : getLogicalDomain() | TensorDomain::kNoReductions) {
Expand All @@ -1152,6 +1239,7 @@ TensorView* TensorView::cacheBefore(LoadStoreOpType op_type) {
container(),
new_logical_domain,
TensorDomain::getContiguityFilledWith(new_logical_domain, true)));
*/

// Insert producer - Cache_Before (CB) - before this TV.
// Before: Prev TV -> [Definition Op] -> This TV
Expand All @@ -1170,6 +1258,7 @@ TensorView* TensorView::cacheBefore(LoadStoreOpType op_type) {
// definition_ is no longer valid
// setDefinition(nullptr);

/* FIXME
// We do not want to reproduce the loop domain if it's for
// scatter. Recall that the loop domain of the scatter op is derived
// from the logical domain of the scatter index tensor. Here, the
Expand All @@ -1186,6 +1275,7 @@ TensorView* TensorView::cacheBefore(LoadStoreOpType op_type) {
producer, consumer->getLogicalDomain()),
true);
}
*/

if (consumer->hasDeviceMesh()) {
producer->setDeviceMesh(consumer->getDeviceMesh());
Expand Down
14 changes: 11 additions & 3 deletions csrc/transform_replay.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ class ReplaySelf : public ReplayTransformations {
s->outer()->getIterType(),
s->inner()->getIterType());

// Parallelize type could include device from split.
ido->parallelize(s->outer()->getParallelType());
idi->parallelize(s->inner()->getParallelType());
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@wujingyue tagging you to try this guy out.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks -- this will definitely help my #5229!

At this moment, I can't really take this two-line change because at head cacheBefore still uses TransformReplay::replayCasP not fullSelfReplay. However, not pressure! I'll come back to #5229 after you make more progress on this.


// Remove mapped id from loop IDs
loop_ids_.erase(mapped);

Expand Down Expand Up @@ -106,7 +110,7 @@ class ReplaySelf : public ReplayTransformations {
id_inner_mapped,
" however one or both are not loop nodes.");

IterDomain* merged_id = IterDomain::merge(id_outer_mapped, id_inner_mapped);
IterDomain* merged_id = IterDomain::merge(id_outer_mapped, id_inner_mapped, m->out()->isRFactorProduct());

// Remove inputs from the loop IDs
loop_ids_.erase(id_outer_mapped);
Expand Down Expand Up @@ -147,11 +151,15 @@ class ReplaySelf : public ReplayTransformations {
// output domain also an rfactor
const auto resize_out_logical = resize->out()->isRFactorProduct();

// Mark output IterType
const auto resize_out_iter_type = resize->out()->getIterType();

auto replayed_out = IterDomain::resize(
mapped,
resize->leftExpand(),
resize->rightExpand(),
resize_out_logical);
resize_out_logical,
resize_out_iter_type);

loop_ids_.erase(mapped);

Expand Down Expand Up @@ -234,7 +242,7 @@ TensorDomain* TransformReplay::fullSelfReplay(
new_self_root->root(),
new_logical_domain,
new_domain,
self->contiguity());
TensorDomain::getContiguityFilledWith(new_logical_domain, true));
}
}

Expand Down
2 changes: 0 additions & 2 deletions tests/cpp/test_allocation_domain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,6 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) {

ASSERT_EQ(tv0->getAllocationDomain(), tv0_nhwc);
ASSERT_EQ(tv1->getAllocationDomain(), expected_new_allocation_domain);
ASSERT_EQ(tv2->getAllocationDomain(), tv1_nhwc);

for (auto tv : {tv1, tv2}) {
// [N, C, H, W]
Expand Down Expand Up @@ -708,7 +707,6 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) {

ASSERT_EQ(tv0->getAllocationDomain(), tv0_2d);
ASSERT_EQ(tv1->getAllocationDomain(), expected_new_allocation_domain);
ASSERT_EQ(tv2->getAllocationDomain(), tv1_2d);

for (auto tv : {tv1, tv2}) {
tv->split(0, 128);
Expand Down
Loading
Loading