Draft

Commits (57 total; diff shown covers 17 commits)
fe90fb5
PR0: Relax assert on non-device split on allocation domain
jjsjann123 Sep 18, 2025
a0df5e9
relaxing the check
jjsjann123 Sep 18, 2025
5097533
Adding test validating vectorization
jjsjann123 Sep 18, 2025
d4b7c8b
renaming
jjsjann123 Sep 19, 2025
4b07e79
clangformat
jjsjann123 Sep 19, 2025
051fc9e
I think it's working now!
jjsjann123 Sep 19, 2025
bf85c0b
clangformat
jjsjann123 Sep 19, 2025
6ff1050
quick patch
jjsjann123 Sep 22, 2025
f2f43be
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Sep 22, 2025
a32f54b
fix clearing allocation domain on cache for cacheBefore
jjsjann123 Sep 22, 2025
cf6e609
revert changes
jjsjann123 Sep 23, 2025
b303923
updating tests
jjsjann123 Sep 23, 2025
17dbf23
i was dumb as always
jjsjann123 Sep 23, 2025
c2a3aeb
why is it so hard for me
jjsjann123 Sep 23, 2025
1a156be
Apply suggestions from code review
jjsjann123 Sep 23, 2025
2081d0c
clangformat
jjsjann123 Sep 23, 2025
6f674aa
Merge branch 'main' into jj/allocation_PR_0
jjsjann123 Sep 23, 2025
4f8ecfc
Merge remote-tracking branch 'origin/main' into jj/allocation_PR_0
jjsjann123 Sep 26, 2025
ee37038
reverting selfReplay & cacheBefore changes per reviewer's comments
jjsjann123 Sep 26, 2025
f87e99d
wip
jjsjann123 Sep 26, 2025
c5155ff
wip
jjsjann123 Sep 26, 2025
ded16ec
wip
jjsjann123 Sep 26, 2025
f02440c
wip
jjsjann123 Sep 26, 2025
aa084bc
errr zip
jjsjann123 Sep 26, 2025
f82ad1f
wip
jjsjann123 Sep 26, 2025
d9a33d8
err, forgot to push something last night
jjsjann123 Sep 26, 2025
6dda5e2
typo
jjsjann123 Sep 26, 2025
173a7e9
skipping checks
jjsjann123 Sep 26, 2025
98654a0
wip
jjsjann123 Sep 26, 2025
a870f9d
relaxing checks in tests
jjsjann123 Sep 26, 2025
bca1734
wip
jjsjann123 Sep 26, 2025
d91ac03
clean up IDs for cacheBefore
jjsjann123 Sep 26, 2025
eff3069
clear up definition of output TV for cacheBefore
jjsjann123 Sep 26, 2025
2105e1e
fixing one alias test!
jjsjann123 Sep 27, 2025
bdaaccb
wip
jjsjann123 Sep 27, 2025
fdf9dba
fixing definition
jjsjann123 Sep 27, 2025
65022bd
wip
jjsjann123 Sep 27, 2025
5431648
not set allocation domain when original output doesn't have it
jjsjann123 Sep 27, 2025
75b06b5
update output itertype
jjsjann123 Sep 27, 2025
c2bf4cf
wip
jjsjann123 Sep 27, 2025
c5d66b6
wip
jjsjann123 Sep 27, 2025
b1836f5
wip
jjsjann123 Sep 27, 2025
7ee9317
fixing contiguity in fullselfreplay
jjsjann123 Sep 27, 2025
f7bbab2
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Sep 27, 2025
afccea0
fixing transpose tests
jjsjann123 Sep 27, 2025
482afc8
set parallelization type after fullseflreplay
jjsjann123 Sep 27, 2025
255055d
fix mark alias
jjsjann123 Oct 1, 2025
599d809
fixing alias analysis
jjsjann123 Oct 1, 2025
eadf148
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Oct 24, 2025
fe9c1f6
quick patch on nvfuser::schedule_matmul::Common::cacheBefore
jjsjann123 Oct 24, 2025
f927809
quick patch on nvfuser::schedule_matmul::Common::updateIdModel
jjsjann123 Oct 25, 2025
493434a
agent you can do better!
jjsjann123 Oct 25, 2025
92fb6f9
err
jjsjann123 Oct 25, 2025
642d9a8
Merge branch 'main' into jj/allocation_PR_0
jjsjann123 Oct 28, 2025
10eb4e0
Merge branch 'main' into jj/allocation_PR_0
jjsjann123 Oct 30, 2025
6ff66f2
try self replay so allocation domain is preserved for multi device
jjsjann123 Oct 30, 2025
2950054
err revert something that's not working
jjsjann123 Oct 30, 2025
31 changes: 14 additions & 17 deletions csrc/multidevice/utils.cpp
@@ -776,21 +776,16 @@ std::unordered_set<TensorView*> getTvsWithDifferentSharding(
return ret;
}

void validateDeviceSplit(Expr* expr) {
NVF_ERROR(expr != nullptr, "Expected a valid expression.");
auto* split = dynamic_cast<Split*>(expr);
NVF_ERROR(
split != nullptr,
"Only split expressions are supported for producing device ids: ",
expr->toString());
NVF_ERROR(
split->outer()->isDeviceDim(),
"Expected the outer dimension to be a device dimension: ",
expr->toString());
NVF_ERROR(
!split->innerSplit(),
"Inner split by device dimension is not supported: ",
expr->toString());
bool isValidDeviceSplit(Expr* expr) {
if (expr == nullptr || !expr->isA<Split>()) {
return false;
}
auto* split = expr->as<Split>();
// as<Split> cannot return null after the isA<Split> check above.
return split->outer()->isDeviceDim() && !split->innerSplit();
}

IterDomain* projectShardedAllocationToLogical(
@@ -806,7 +801,8 @@ IterDomain* projectShardedAllocationToLogical(

IterDomain* logical_id = allocation_id;
for (Expr* expr : exprs | std::views::reverse) {
validateDeviceSplit(expr);
NVF_ERROR(
isValidDeviceSplit(expr), "invalid device split: ", expr->toString());
logical_id = expr->as<Split>()->in();
}
return logical_id;
@@ -825,7 +821,8 @@ IterDomain* projectLogicalToShardedAllocation(
tv->getMaybeAllocationDomain().end()});
IterDomain* allocation_id = logical_id;
for (auto expr : exprs) {
validateDeviceSplit(expr);
NVF_ERROR(
isValidDeviceSplit(expr), "invalid device split: ", expr->toString());
allocation_id = expr->as<Split>()->inner();
}
return allocation_id;
2 changes: 1 addition & 1 deletion csrc/multidevice/utils.h
@@ -169,7 +169,7 @@ std::vector<int64_t> unshardedSizes(

// Returns true if the expression is a valid DID split: an outer split with
// the device dim as the outer output.
void validateDeviceSplit(Expr* expr);
bool isValidDeviceSplit(Expr* expr);

// Find the producing logical id of the given allocation id traversing
// through device splits. For unsharded allocation_id, logical_id is the same as
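As context for the new predicate, a minimal sketch of the pattern it accepts; the helper (makeContigTensor) and the split call follow nvfuser test conventions elsewhere in this PR, and `num_devices` is a hypothetical mesh size, not something defined here:

// A valid DID split: an outer split whose outer output is bound to a
// device parallel type.
TensorView* tv = makeContigTensor(2);              // logical: [i0, i1]
// `num_devices` is a hypothetical mesh size.
tv->split(0, num_devices, /*inner_split=*/false);  // [d, i0/d, i1]
tv->axis(0)->parallelize(ParallelType::DIDx);
// The Split producing axis(0)/axis(1) now satisfies both checks:
//   split->outer()->isDeviceDim() == true
//   split->innerSplit()           == false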
11 changes: 10 additions & 1 deletion csrc/scheduler/vectorize_helper.cpp
@@ -807,13 +807,22 @@ Val* ContiguousInnerDimensionsMapper::getContigMergeOfInnerSize(
{alloc_iid});
IterDomain* logical_id = alloc_iid;
Val* num_devices = of_tv->container()->oneVal();
bool only_valid_device_split = true;
for (Expr* expr : exprs | std::views::reverse) {
validateDeviceSplit(expr);
if (!isValidDeviceSplit(expr)) {
Collaborator Author (jjsjann123): @protonu You might need this relaxed (things coming from vectorize_helper.cpp and multidevice/...). I'll start a PR on the side for this.

Collaborator: Thanks - for now, I modified it here: https://github.com/NVIDIA/Fuser/pull/5322/files

only_valid_device_split = false;
break;
}
auto* split = expr->as<Split>();
logical_id = split->in();
num_devices = SimplifyingIrBuilder::mulExpr(num_devices, split->factor());
}

// A non-device split could lead to padding, which prevents vectorization.
Collaborator: Can we have a non-device split if not for padding? Should we throw an error here so we do not get random transforms?

Collaborator Author (jjsjann123): I might have missed your suggestion. If I throw on seeing a split here, we wouldn't be able to support padding via transformations on allocation domains.

Collaborator: Sorry about the confusion. To clarify, I was wondering if we can assert that a non-device split is in fact padding, and not a random transform. For example, if it is a divisible split, we can include it in vectorization, correct?

Collaborator Author (jjsjann123): Got'ya. I think a split on the allocation domain is meant for padding; vectorization should be handled in the loop domain. A more complicated case is if we use split and permute on the allocation domain to represent blocking, i.e. in some sense it is indeed used to facilitate vectorization/TMA. But conceptually I think they are still different.
Going back to divisible vs. non-divisible splits: we would need to specialize during concretization if we are to distinguish them (assuming dynamic shape). I tend to think that's more of an optimization. Without that assert, we are leaving margin on the max vectorization factor, which I think isn't too big a deal. 🤞
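To make the padding point concrete, a back-of-the-envelope sketch (plain C++, illustrative only) of why a non-divisible allocation-domain split implies padding:

// Splitting extent K by factor f allocates ceilDiv(K, f) * f elements.
int64_t K = 9;                    // logical extent, as in the test below
int64_t f = 16;                   // split factor
int64_t outer = (K + f - 1) / f;  // ceilDiv(9, 16) == 1
int64_t allocated = outer * f;    // 16 elements backing 9 valid ones
// The 7-element tail is padding; a vectorized access sized off the logical
// extent could straddle valid and padded elements, so the mapper stops
// growing the vectorization dimension at such a split.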

if (!only_valid_device_split) {
break;
}

// Mapping order isn't correct, cannot expand vectorization dimension.
if (projected_dims[--projected_dims_i] != logical_id) {
break;
17 changes: 13 additions & 4 deletions csrc/tensor_view.cpp
@@ -1163,10 +1163,19 @@ TensorView* TensorView::cacheBefore(LoadStoreOpType op_type) {
// consumer tensor needs to copy the whole producer tensor, so the
// loop domain must be based on the logical domain.
if (!producer->definition()->isA<ScatterOp>()) {
auto replayed_consumer_pair = TransformReplay::replayCasP(
consumer, producer, -1, TransformReplayOptions().replayAllocation());

consumer->setDomain(replayed_consumer_pair.first);
// NOTE: Refactored away from TransformReplay::replayCasP, which doesn't
// replay transformations between the logical and allocation domains; its
// map only works when the transformations are also on the path from
// logical to loop. Switched to selfReplay, which targets replay of the
// loop and allocation domains.
// NOTE: producer and consumer are linked by a LoadStoreOp; otherwise we
// cannot use selfReplay on general pari of producer-consumer TVs.
Collaborator: Suggested change:
-  // cannot use selfReplay on general pari of producer-consumer TVs.
+  // cannot use selfReplay on a general pair of producer-consumer TVs.

TransformReplay::selfReplay(
producer->domain(), consumer->domain(), /*ignore_reductions=*/true);
// TODO: remove allocation domain from cached TV
// technically we shouldn't let the output allocation domain dictate the layout
Collaborator (@naoyam, Sep 23, 2025): I don't think that's what's happening here. The transformations of producer are replayed onto consumer, so the producer allocation domain, i.e., the layout of the cache tensor, dictates the layout of the fusion output, which seems wrong.
I think the simplest way, at least conceptually, is to never touch the allocation domain of the consumer throughout cacheBefore, since it doesn't make sense to modify the memory layout when a cache is created.
I think we just need to remove the reduction allocation domains.

Collaborator Author (jjsjann123):
> The transformations of producer are replayed onto consumer, so the producer allocation domain, i.e., the layout of the cache tensor, dictates the layout of the fusion output, which seems wrong.

The layout of the producer tensor is inherited from the original output (Fuser/csrc/tensor_view.cpp, lines 1122 to 1125 in ccbc581):

    auto* producer = IrBuilder::createInContainer<TensorView>(
        container(),
        IrBuilder::createInContainer<TensorDomain>(container(), domain()),
        getDataType().value());

If we don't replay it, we are mutating the allocation domain of an output TV, and that doesn't seem right.

Collaborator: I'm not sure what you mean by that. My proposal is that when doing setDomain with the output tensor, we should not modify its allocation domain, except for reduction IDs since they are stripped if any.
It seems that what's actually implemented here is to create an invalid allocation domain and try to fix it up later, and I'm just saying why not avoid breaking it from the beginning.

Collaborator Author (jjsjann123): Recording an offline conversation with @naoyam: I'll refactor this one more time and follow up with the failing cases.
Wondering if you can mark "request changes" on this PR to avoid an accidental merge.

Collaborator Author (jjsjann123): Converted this to draft to avoid accidental merge.

// of the cache. But existing schedulers expect this behavior, with the allocation
Collaborator: Which scheduler is impacted if the allocation domain of an intermediate TensorView is not propagated?

Collaborator: CC: @wujingyue, as he ran into a different issue with cacheBefore: #5090 (comment)

Collaborator: The comment in the PR makes sense to me. The allocation of the cache (thus a MemoryType::Local TV) doesn't change external behavior and can be safely removed when schedulers start. See my definition of consistency (point 3) here: #5090 (comment)

Collaborator Author (jjsjann123): I'm hitting these errors. Out of the existing schedulers, only the pointwise scheduler has somewhat good allocation domain support, so it's not surprising that we only have existing tests on that.
Note: the AllocationDomainTest cases are failing on explicit assertions; the DistributedTransformerTest cases are failing validation, but I'm not sure exactly what's causing the failure yet.

00:04:55 [  FAILED  ] PointwiseTest.Issue1567VectorizeAllocationDomain
00:06:09 [  FAILED  ] PointwiseTest.VectorizeAllocationDomain
00:04:55 [  FAILED  ] PointwiseTest.VectorizePadLoweringPermuted

00:05:04 [  FAILED  ] NVFuserTest.IndexSelectVectorizationIndexTensor
00:05:04 [  FAILED  ] NVFuserTest.IndexSelectVectorization3DCase1
00:06:09 [  FAILED  ] NVFuserTest.IndexSelectVectorizationIndexTensorNoBroadcast
00:04:55 [  FAILED  ] NVFuserTest.IndexSelectVectorization3DCase0

00:05:04 [  FAILED  ] AllocationDomainTest.NHWC4d_To_NHWC4d_cacheBefore
00:05:04 [  FAILED  ] AllocationDomainTest.VectorizationIssue902
00:06:09 [  FAILED  ] AllocationDomainTest.NHWC2d_To_NHWC2d_cacheBefore

00:04:55 [  FAILED  ] AliasTest.AliasOutputBeforeNonAliasOutput

00:11:28 [  FAILED  ] DistributedTransformerTest.MLP_Backward/__half, where GetParam() = __half
00:11:28 [  FAILED  ] DistributedTransformerTest.MHA_Backward/__bfloat, where GetParam() = __bfloat
00:11:28 [  FAILED  ] DistributedTransformerTest.Backward/__bfloat, where GetParam() = __bfloat
00:11:21 [  FAILED  ] DistributedTransformerTest.MLP_Backward/__bfloat, where GetParam() = __bfloat

Collaborator: FWIW, you could debug the validation error using optimization fuel. For example, you could put a static counter before a transform:

    static int64_t count = 0;
    if (count < threshold) {
      // perform a certain transform, e.g., set allocation domain
      count++;
    }

then bisect `threshold`.

// domain preserved on the cache.
} else if (producer->hasAllocation()) {
consumer->setAllocationDomain(
ir_utils::propagateScatterAllocationDomain(
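For readers following the thread, a hedged sketch of the flow under discussion, reusing the helpers from tests/cpp/test_layout_op.cpp (the fusion is illustrative, not taken from this PR):

// An output with a non-trivial allocation domain, then cacheBefore().
Fusion fusion;
FusionGuard fg(&fusion);
TensorView* in = makeSymbolicTensor(2);
fusion.addInput(in);
TensorView* out = set(in);
fusion.addOutput(out);
out->setAllocationDomain({out->axis(1), out->axis(0)}, true);  // transposed
// cacheBefore() inserts a Local producer that writes `out`. With the change
// above, TransformReplay::selfReplay replays the original output's loop and
// allocation transforms (now carried by the producer) onto the new output,
// instead of going through replayCasP.
TensorView* cache = out->cacheBefore();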
26 changes: 18 additions & 8 deletions csrc/transform_replay.cpp
@@ -321,6 +321,9 @@ void TransformReplay::selfReplay(
// Replay allocation.
if (self->hasAllocation()) {
const std::vector<IterDomain*>& self_allocation = self->allocation();
// Replay on the allocation domain, for cases where the transformation is
// not on the path to the loop domain.
ReplaySelf allocation_dom_replay(self_allocation, axis_map);
const std::vector<std::optional<bool>>& self_contiguity =
self->contiguity();
NVF_ERROR_EQ(self_allocation.size(), self_contiguity.size());
@@ -347,18 +350,25 @@
if (ignore_reductions && alloc_id->isReduction()) {
continue;
}
auto it = replay.getReplay().find(alloc_id);
NVF_ERROR(
it != replay.getReplay().end(),
"failed to replay IterDomain: ",
alloc_id);
IterDomain* id = nullptr;
// NOTE: try the loop-domain replay first, to avoid unnecessarily
// duplicated transformations.
for (const auto& re :
{replay.getReplay(), allocation_dom_replay.getReplay()}) {
auto it = re.find(alloc_id);
if (it != re.end()) {
id = it->second;
break;
}
}
NVF_ERROR(id, "failed to replay IterDomain: ", alloc_id);
NVF_ERROR_EQ(
it->second->isBroadcast(),
id->isBroadcast(),
!contiguity.has_value(),
"Contiguity should be nullopt iff broadcast.");
new_contiguity.push_back(contiguity);
it->second->parallelize(alloc_id->getParallelType());
new_alloc_domain.push_back(it->second);
id->parallelize(alloc_id->getParallelType());
new_alloc_domain.push_back(id);
}

new_self->setAllocationDomain(new_alloc_domain, new_contiguity);
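A hedged example of the situation the extra allocation-domain replay handles: a transform recorded only on the logical-to-allocation path (this mirrors the LogicalAndAllocationSizes test below):

// The split below exists only between the logical and allocation domains;
// the loop domain is untouched, so a replay map built from the loop domain
// alone would not contain io/ii, and the allocation_dom_replay fallback
// above supplies them.
TensorView* tv = makeSymbolicTensor(2);
auto&& [io, ii] = IterDomain::split(
    tv->axis(1), IrBuilder::create<Val>(16L, DataType::Index), true);
tv->setAllocationDomain({tv->axis(0), io, ii}, {false, true, true});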
73 changes: 73 additions & 0 deletions tests/cpp/test_layout_op.cpp
@@ -70,6 +70,79 @@ class LayoutOpTest : public NVFuserTest {
}
};

TEST_F(LayoutOpTest, LogicalAndAllocationSizes) {
Collaborator: What is being tested here?

Collaborator Author (jjsjann123): Without the relaxation in the vectorization analysis, this test would trigger an assert. So the test just verifies that we now allow a split on the allocation domain. In the follow-up PR, we added more validation to this test to check that the produced tensor matches the logical sizes.

auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

auto inp = makeSymbolicTensor(2);
fusion.addInput(inp);
auto out = set(inp);
fusion.addOutput(out);
// padding output to multiple of 16 on allocation domain
auto&& [io, ii] = IterDomain::split(
out->axis(1), IrBuilder::create<Val>(16L, DataType::Index), true);
Collaborator Author (jjsjann123): Tagging @naoyam: changed the test to only apply the split on logical -> allocation.

// NOTE: this doesn't feel right; we have to mark contiguity on axis(0) as
// `false` to avoid accidental index collapsing. This should be figured out
// by indexing from the ceilDiv.
out->setAllocationDomain({out->axis(0), io, ii}, {false, true, true});
Collaborator: Am I understanding this issue correctly?
a) The tensor actually is contiguous with respect to this allocation domain, which has size M, ceilDiv(K, 16), 16.
b) The tensor winds up not being contiguous with respect to its logical domain, which is of size M, K, because the non-divisible split adds some padding to K.
c) By "index collapsing" you mean it does contiguous indexing so that stride is not part of the index? Is that wrong? It seems like indexing as a contiguous allocation is what we want here.

My question is what specifically goes wrong when allocation is set to contiguous?

Collaborator Author (jjsjann123): Yes, you are absolutely correct about a) and b).
Index collapsing is wrong here, because we are mapping from logical to allocation, which does not access contiguous memory (because of the non-divisible split).
This is the before and after of the indexing.

With the false contiguity flag:

root@812ada01cb39:/opt/pytorch/nvfuser# NVFUSER_DUMP=cuda_kernel ./bin/test_layout_op --gtest_filter="*LogicalAndAllocationSizes"
Running main() from /opt/pytorch/nvfuser/third_party/googletest/googletest/src/gtest_main.cc
Note: Google Test filter = *LogicalAndAllocationSizes
[==========] Running 1 test from 1 test suite.
[----------] Global test environment set-up.
[----------] 1 test from LayoutOpTest
[ RUN      ] LayoutOpTest.LogicalAndAllocationSizes

======= Codegen output for kernel: nvfuser_pointwise_f0_c1_r0_g0 =======

// Codegen generated code
__global__ void nvfuser_pointwise_f0_c1_r0_g0(Tensor<float, 2, 2> T0, Tensor<float, 2, 3> T1) {
  nvfuser_index_t i0;
  i0 = ((nvfuser_index_t)threadIdx.x) + (128LL * ((nvfuser_index_t)blockIdx.x));
  nvfuser_index_t i1;
  i1 = i0 % T0.logical_size[1LL];
  nvfuser_index_t i2;
  i2 = i0 / T0.logical_size[1LL];
  if ((i0 < (T0.logical_size[0LL] * T0.logical_size[1LL]))) {
    Array<float, 1LL, 1> T2;
    T2[0LL] = 0LL;
    T2[0LL]
       = T0[((T0.alloc_stride[0LL] * i2) + (T0.alloc_stride[1LL] * i1))];
    Array<float, 1LL, 1> T3;
    T3[0LL]
       = T2[0LL];
    T1[(i1 + (T1.alloc_stride[0LL] * i2))]
       = T3[0LL];
  }
}

======================================

[       OK ] LayoutOpTest.LogicalAndAllocationSizes (966 ms)
[----------] 1 test from LayoutOpTest (966 ms total)

With the true contiguity flag:

root@558d9dfeefb8:/opt/pytorch/nvfuser# NVFUSER_DUMP=cuda_kernel ./bin/test_layout_op --gtest_filter="*LogicalAndAllocationSizes"
Running main() from /opt/pytorch/nvfuser/third_party/googletest/googletest/src/gtest_main.cc
Note: Google Test filter = *LogicalAndAllocationSizes
[==========] Running 1 test from 1 test suite.
[----------] Global test environment set-up.
[----------] 1 test from LayoutOpTest
[ RUN      ] LayoutOpTest.LogicalAndAllocationSizes

======= Codegen output for kernel: nvfuser_pointwise_f0_c1_r0_g0 =======

// Codegen generated code
__global__ void nvfuser_pointwise_f0_c1_r0_g0(Tensor<float, 2, 2> T0, Tensor<float, 2, 3> T1) {
  nvfuser_index_t i0;
  i0 = ((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x));
  if ((i0 < (T0.logical_size[0LL] * T0.logical_size[1LL]))) {
    Array<float, 1, 1> T2;
    T2[0] = 0;
    T2[0]
       = T0[((T0.alloc_stride[0LL] * (i0 / T0.logical_size[1LL])) + (T0.alloc_stride[1LL] * (i0 % T0.logical_size[1LL])))];
    Array<float, 1, 1> T3;
    T3[0]
       = T2[0];
    T1[i0]
       = T3[0];
  }
}

======================================

/opt/pytorch/nvfuser/tests/cpp/test_layout_op.cpp:128: Failure
Value of: t0.equal(cg_outputs[0].as<at::Tensor>().slice(1, 0, k))
  Actual: false
Expected: true


// Two issues with the split-and-merge approach:
// 1. It causes predication to expand into the padded region.
// 2. Indexing with the allocation contiguity set to `true` is wrong.
// out->split(1, 16); // padding output to multiple of 16
// out->setAllocationDomain(out->getLoopDomain(), true);
// out->merge(1); // restore loop domain

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
int m = 512;
int k = 9; // note: padded column size would be 16
auto t0 = at::randn({m, k}, options);

FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto cg_outputs = executor_cache.runFusionWithInputs({t0});
// padding on the inner dimension is represented as stride on the outer
// dimension
EXPECT_EQ(
cg_outputs[0].as<at::Tensor>().strides(), std::vector<int64_t>({16, 1}));
// We need to slice because output buffer shape is not right
EXPECT_TRUE(t0.equal(cg_outputs[0].as<at::Tensor>().slice(1, 0, k)));
// TODO: enable this when output buffer shape is fixed.
// output should remain the correct logical size
// EXPECT_EQ(
// cg_outputs[0].as<at::Tensor>().sizes(), std::vector<int64_t>({512,
// 9}));
}
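The arithmetic behind the expected strides, spelled out (plain C++, values taken from the test above):

// k = 9 rounded up by the outer split of 16 gives an inner allocation
// extent of 16, so consecutive rows of the (512, 9) logical tensor start
// 16 elements apart.
int64_t k = 9, factor = 16;
int64_t padded = ((k + factor - 1) / factor) * factor;  // == 16
// logical sizes {512, 9}; allocation-backed strides {padded, 1} == {16, 1}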

TEST_F(LayoutOpTest, AllocationDomainSplitVectorizationFactor) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

auto inp = makeSymbolicTensor(3);
fusion.addInput(inp);
auto out = set(inp);
fusion.addOutput(out);
// split would prevent vectorization
auto&& [io, ii] = IterDomain::split(
out->axis(1), IrBuilder::create<Val>(16L, DataType::Index), true);
out->setAllocationDomain(
{out->axis(0), io, ii, out->axis(2)}, {false, true, true, true});

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
// because of the split on the middle dimension, we only have the fastest
// dimension participating in vectorization.
auto t0 = at::randn({512, 128, 2}, options);

// NOTE force pointwise scheduler here just for testing purpose
auto cg_results =
scheduleAndRun(fusion_ptr.get(), SchedulerType::PointWise, {t0});
auto pparams = cg_results.heuristic_params->as<PointwiseParams>();
EXPECT_EQ(pparams->vectorization_factor, 2);

testValidate(fusion_ptr.get(), cg_results.outputs, {t0}, __LINE__, __FILE__);
}
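Why a factor of 2 is expected, spelled out (values from the test above; the byte math is an illustration, not code from this PR):

// The allocation domain is {i0, io, ii, i2}; the contiguous-inner-dims
// mapping stops at the split producing {io, ii}, so only the innermost i2
// can participate in vectorization.
int64_t innermost_extent = 2;  // extent of out->axis(2) for the {512, 128, 2} input
int64_t vec_bytes = innermost_extent * sizeof(float);  // 8-byte accesses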

TEST_F(LayoutOpTest, CppApi) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();