Draft
Commits (57)
fe90fb5  PR0: Relax assert on non-device split on allocation domain (jjsjann123, Sep 18, 2025)
a0df5e9  relaxing the check (jjsjann123, Sep 18, 2025)
5097533  Adding test validating vectorization (jjsjann123, Sep 18, 2025)
d4b7c8b  renaming (jjsjann123, Sep 19, 2025)
4b07e79  clangformat (jjsjann123, Sep 19, 2025)
051fc9e  I think it's working now! (jjsjann123, Sep 19, 2025)
bf85c0b  clangformat (jjsjann123, Sep 19, 2025)
6ff1050  quick patch (jjsjann123, Sep 22, 2025)
f2f43be  Merge remote-tracking branch 'origin/main' into HEAD (jjsjann123, Sep 22, 2025)
a32f54b  fix clearing allocation domain on cache for cacheBefore (jjsjann123, Sep 22, 2025)
cf6e609  revert changes (jjsjann123, Sep 23, 2025)
b303923  updating tests (jjsjann123, Sep 23, 2025)
17dbf23  i was dumb as always (jjsjann123, Sep 23, 2025)
c2a3aeb  why is it so hard for me (jjsjann123, Sep 23, 2025)
1a156be  Apply suggestions from code review (jjsjann123, Sep 23, 2025)
2081d0c  clangformat (jjsjann123, Sep 23, 2025)
6f674aa  Merge branch 'main' into jj/allocation_PR_0 (jjsjann123, Sep 23, 2025)
4f8ecfc  Merge remote-tracking branch 'origin/main' into jj/allocation_PR_0 (jjsjann123, Sep 26, 2025)
ee37038  reverting selfReplay & cacheBefore changes per reviewer's comments (jjsjann123, Sep 26, 2025)
f87e99d  wip (jjsjann123, Sep 26, 2025)
c5155ff  wip (jjsjann123, Sep 26, 2025)
ded16ec  wip (jjsjann123, Sep 26, 2025)
f02440c  wip (jjsjann123, Sep 26, 2025)
aa084bc  errr zip (jjsjann123, Sep 26, 2025)
f82ad1f  wip (jjsjann123, Sep 26, 2025)
d9a33d8  err, forgot to push something last night (jjsjann123, Sep 26, 2025)
6dda5e2  typo (jjsjann123, Sep 26, 2025)
173a7e9  skipping checks (jjsjann123, Sep 26, 2025)
98654a0  wip (jjsjann123, Sep 26, 2025)
a870f9d  relaxing checks in tests (jjsjann123, Sep 26, 2025)
bca1734  wip (jjsjann123, Sep 26, 2025)
d91ac03  clean up IDs for cacheBefore (jjsjann123, Sep 26, 2025)
eff3069  clear up definition of output TV for cacheBefore (jjsjann123, Sep 26, 2025)
2105e1e  fixing one alias test! (jjsjann123, Sep 27, 2025)
bdaaccb  wip (jjsjann123, Sep 27, 2025)
fdf9dba  fixing definition (jjsjann123, Sep 27, 2025)
65022bd  wip (jjsjann123, Sep 27, 2025)
5431648  not set allocation domain when original output doesn't have it (jjsjann123, Sep 27, 2025)
75b06b5  update output itertype (jjsjann123, Sep 27, 2025)
c2bf4cf  wip (jjsjann123, Sep 27, 2025)
c5d66b6  wip (jjsjann123, Sep 27, 2025)
b1836f5  wip (jjsjann123, Sep 27, 2025)
7ee9317  fixing contiguity in fullselfreplay (jjsjann123, Sep 27, 2025)
f7bbab2  Merge remote-tracking branch 'origin/main' into HEAD (jjsjann123, Sep 27, 2025)
afccea0  fixing transpose tests (jjsjann123, Sep 27, 2025)
482afc8  set parallelization type after fullseflreplay (jjsjann123, Sep 27, 2025)
255055d  fix mark alias (jjsjann123, Oct 1, 2025)
599d809  fixing alias analysis (jjsjann123, Oct 1, 2025)
eadf148  Merge remote-tracking branch 'origin/main' into HEAD (jjsjann123, Oct 24, 2025)
fe9c1f6  quick patch on nvfuser::schedule_matmul::Common::cacheBefore (jjsjann123, Oct 24, 2025)
f927809  quick patch on nvfuser::schedule_matmul::Common::updateIdModel (jjsjann123, Oct 25, 2025)
493434a  agent you can do better! (jjsjann123, Oct 25, 2025)
92fb6f9  err (jjsjann123, Oct 25, 2025)
642d9a8  Merge branch 'main' into jj/allocation_PR_0 (jjsjann123, Oct 28, 2025)
10eb4e0  Merge branch 'main' into jj/allocation_PR_0 (jjsjann123, Oct 30, 2025)
6ff66f2  try self replay so allocation domain is preserved for multi device (jjsjann123, Oct 30, 2025)
2950054  err revert something that's not working (jjsjann123, Oct 30, 2025)
31 changes: 14 additions & 17 deletions csrc/multidevice/utils.cpp
@@ -776,21 +776,16 @@ std::unordered_set<TensorView*> getTvsWithDifferentSharding(
return ret;
}

void validateDeviceSplit(Expr* expr) {
NVF_ERROR(expr != nullptr, "Expected a valid expression.");
auto* split = dynamic_cast<Split*>(expr);
NVF_ERROR(
split != nullptr,
"Only split expressions are supported for producing device ids: ",
expr->toString());
NVF_ERROR(
split->outer()->isDeviceDim(),
"Expected the outer dimension to be a device dimension: ",
expr->toString());
NVF_ERROR(
!split->innerSplit(),
"Inner split by device dimension is not supported: ",
expr->toString());
bool isValidDeviceSplit(Expr* expr) {
if (expr == nullptr || !expr->isA<Split>()) {
return false;
}
auto* split = expr->as<Split>();
if (split == nullptr || !split->outer()->isDeviceDim() ||
split->innerSplit()) {
return false;
}
return true;
}

IterDomain* projectShardedAllocationToLogical(
@@ -806,7 +801,8 @@ IterDomain* projectShardedAllocationToLogical(

IterDomain* logical_id = allocation_id;
for (Expr* expr : exprs | std::views::reverse) {
validateDeviceSplit(expr);
NVF_ERROR(
isValidDeviceSplit(expr), "invalid device split: ", expr->toString());
logical_id = expr->as<Split>()->in();
}
return logical_id;
@@ -825,7 +821,8 @@ IterDomain* projectLogicalToShardedAllocation(
tv->getMaybeAllocationDomain().end()});
IterDomain* allocation_id = logical_id;
for (auto expr : exprs) {
validateDeviceSplit(expr);
NVF_ERROR(
isValidDeviceSplit(expr), "invalid device split: ", expr->toString());
allocation_id = expr->as<Split>()->inner();
}
return allocation_id;
2 changes: 1 addition & 1 deletion csrc/multidevice/utils.h
@@ -169,7 +169,7 @@ std::vector<int64_t> unshardedSizes(

// Validate the expression is a valid DID split: expr is an outer split with
// device dim as the outer dimension.
void validateDeviceSplit(Expr* expr);
bool isValidDeviceSplit(Expr* expr);

// Find the producing logical id of the given allocation id traversing
// through device splits. For unsharded allocation_id, logical_id is the same as
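For reference, a valid DID split per this predicate is an outer split whose outer output is a device-parallel IterDomain. Below is a minimal sketch of how such a split is typically set up, assuming the usual nvFuser scheduling APIs inside a fusion definition; the tensor, mesh size, and names are illustrative, not code from this PR.

// Sketch only: construct an allocation-domain DID split that
// isValidDeviceSplit should accept. `num_devices` is an assumed mesh size.
int64_t num_devices = 4;
TensorView* tv = makeSymbolicTensor(2); // logical: [M, K]
auto&& [did, m_local] = IterDomain::split(
    tv->axis(0),
    IrBuilder::create<Val>(num_devices, DataType::Index),
    /*inner_split=*/false); // outer split: outer extent == num_devices
did->parallelize(ParallelType::DIDx); // outer output becomes the device dim
tv->setAllocationDomain({did, m_local, tv->axis(1)}, true);
// isValidDeviceSplit(did->definition()) should now return true; an inner
// split, or a split whose outer output is not device-parallel, returns false.

The padding split used later in tests/cpp/test_layout_op.cpp is the opposite case: an inner split with no device parallelization, which the relaxed checks now tolerate instead of asserting.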
11 changes: 10 additions & 1 deletion csrc/scheduler/vectorize_helper.cpp
@@ -807,13 +807,22 @@ Val* ContiguousInnerDimensionsMapper::getContigMergeOfInnerSize(
{alloc_iid});
IterDomain* logical_id = alloc_iid;
Val* num_devices = of_tv->container()->oneVal();
bool only_valid_device_split = true;
for (Expr* expr : exprs | std::views::reverse) {
validateDeviceSplit(expr);
if (!isValidDeviceSplit(expr)) {
Collaborator Author: @protonu You might need this relaxed. (things coming from vectorize_helper.cpp and multidevice/...
I'll start a PR on the side for this.
Collaborator: Thanks - For now, I modified it here: https://github.com/NVIDIA/Fuser/pull/5322/files
only_valid_device_split = false;
break;
}
auto* split = expr->as<Split>();
logical_id = split->in();
num_devices = SimplifyingIrBuilder::mulExpr(num_devices, split->factor());
}

// Non device split could lead to padding, which prevents vectorization
Collaborator: Can we have a non-device split if not for padding? Should we throw an error here so we do not have random transforms?
Collaborator Author: I might have missed your suggestion. If I throw on seeing a split here, we wouldn't be able to support padding via transformations on allocation domains.
Collaborator: Sorry about the confusion. To clarify, I was wondering if we can assert that a non-device split is in fact padding, and not a random transform. For example, if it is a divisible split, we can include it in vectorization, correct?
Collaborator Author: Got'ya. I think a split on the allocation domain is meant for padding; vectorization should be handled in the loop domain. A more complicated case is using split and permute on the allocation domain to represent blocking, i.e. in some sense it is indeed used to facilitate vectorization/TMA, but conceptually I think they are still different. Going back to divisible vs. non-divisible splits: we would need to specialize during concretization if we wanted to distinguish them (assuming dynamic shape), and I tend to think that's more of an optimization. Without that assert, we are leaving some margin on the max vectorization factor, which I think isn't too big a deal. 🤞

if (!only_valid_device_split) {
break;
}

// Mapping order isn't correct, cannot expand vectorization dimension.
if (projected_dims[--projected_dims_i] != logical_id) {
break;
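To make the new control flow concrete, here is a tiny standalone model of the allocation-to-logical walk above, in plain C++ rather than nvFuser code; the factors are illustrative.

#include <cstdint>
#include <iostream>
#include <vector>

// Each step on the allocation->logical path either is a valid device split
// (its factor folds into num_devices) or is something else, e.g. a padding
// split, which now stops the walk instead of asserting.
struct Step {
  bool is_device_split;
  int64_t factor;
};

int64_t accumulatedDeviceFactor(const std::vector<Step>& path) {
  int64_t num_devices = 1;
  for (const Step& step : path) {
    if (!step.is_device_split) {
      break; // mirrors `only_valid_device_split = false; break;`
    }
    num_devices *= step.factor;
  }
  return num_devices;
}

int main() {
  std::cout << accumulatedDeviceFactor({{true, 8}}) << "\n";   // sharded case: 8
  std::cout << accumulatedDeviceFactor({{false, 16}}) << "\n"; // padded case: 1
  return 0;
}

In the padded case the vectorization analysis simply stops growing the contiguous inner size at that ID, which is the conservative behavior discussed in the thread above.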
12 changes: 8 additions & 4 deletions csrc/tensor_view.cpp
@@ -1157,10 +1157,14 @@ TensorView* TensorView::cacheBefore(LoadStoreOpType op_type) {
// consumer tensor needs to copy the whole producer tensor, so the
// loop domain must be based on the logical domain.
if (!producer->definition()->isA<ScatterOp>()) {
auto replayed_consumer_pair = TransformReplay::replayCasP(
consumer, producer, -1, TransformReplayOptions().replayAllocation());

consumer->setDomain(replayed_consumer_pair.first);
// NOTE(jiej): Refactored away from TransformReplay::replayCasP, which doesn't
// replay transformations between logical and allocation; that map only works
// when the transformations are also on the path from logical to loop. I
// could not work out what that replay code was doing, so I switched to
// selfReplay, which targets replay of loop and allocation.
TransformReplay::selfReplay(producer->domain(), consumer->domain(), true);
// remove allocation domain from cached TV
producer->setAllocationDomain(producer->getMaybeAllocationDomain(), true);
} else if (producer->hasAllocation()) {
consumer->setAllocationDomain(
ir_utils::propagateScatterAllocationDomain(
3 changes: 2 additions & 1 deletion csrc/transform_replay.cpp
@@ -288,10 +288,10 @@ void TransformReplay::selfReplay(
// We use `self_loop` as the target domain because loop post-dominates
// allocation.
const std::vector<IterDomain*>& self_loop = self->loop();
ReplaySelf replay(self_loop, axis_map);

// Replay loop.
if (self_loop != self->logical()) {
ReplaySelf replay(self_loop, axis_map);
Collaborator: Just FYI: #4585 reversed this. I expect some tests to break.
Collaborator Author: Thanks a ton. Let me sweep through failing tests and see if there's anything easy to patch. 🧑‍💼

std::vector<IterDomain*> new_loop;
if (ignore_reductions) {
for (auto* id : new_self->logical()) {
@@ -321,6 +321,7 @@ void TransformReplay::selfReplay(
// Replay allocation.
if (self->hasAllocation()) {
const std::vector<IterDomain*>& self_allocation = self->allocation();
ReplaySelf replay(self_allocation, axis_map);
const std::vector<std::optional<bool>>& self_contiguity =
self->contiguity();
NVF_ERROR_EQ(self_allocation.size(), self_contiguity.size());
73 changes: 73 additions & 0 deletions tests/cpp/test_layout_op.cpp
@@ -70,6 +70,79 @@ class LayoutOpTest : public NVFuserTest {
}
};

TEST_F(LayoutOpTest, LogicalAndAllocationSizes) {
Collaborator: What is being tested here?
Collaborator Author: Without the relaxation in the vectorization analysis, this test would trigger an assert. So the test just verifies that we now allow an allocation-domain split. In the follow-up PR, we added more validation to this test to check that the produced tensor matches the logical sizes.

auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

auto inp = makeSymbolicTensor(2);
fusion.addInput(inp);
auto out = set(inp);
fusion.addOutput(out);
// padding output to multiple of 16 on allocation domain
auto&& [io, ii] = IterDomain::split(
out->axis(1), IrBuilder::create<Val>(16L, DataType::Index), true);
Collaborator Author: Tagging @naoyam: changed the test to only apply the split on logical -> allocation.

// NOTE: this doesn't feel right; we have to mark contiguity on axis(0) as
// `false` to avoid accidental indexing collapsing. This should be figured out
// by indexing from the ceilDiv.
out->setAllocationDomain({out->axis(0), io, ii}, {false, true, true});
Collaborator: Am I understanding this issue correctly?
a) The tensor actually is contiguous with respect to this allocation domain, which has size M, ceilDiv(K, 16), 16.
b) The tensor winds up not being contiguous with respect to its logical domain, which is of size M, K, because the non-divisible split adds some padding to K.
c) By "indexing collapsing" you mean it does contiguous indexing so that stride is not part of the index? Is that wrong? It seems like indexing as a contiguous allocation is what we want here.
My question is: what specifically goes wrong when the allocation is set to contiguous?
Collaborator Author: Yes, you are absolutely correct about a) and b).
Indexing collapsing is wrong here because we are mapping from logical to allocation, which does not access contiguous memory (because of the non-divisible split).
This is the before and after of the indexing.

with false contiguity flag

root@812ada01cb39:/opt/pytorch/nvfuser# NVFUSER_DUMP=cuda_kernel ./bin/test_layout_op --gtest_filter="*LogicalAndAllocationSizes"
Running main() from /opt/pytorch/nvfuser/third_party/googletest/googletest/src/gtest_main.cc
Note: Google Test filter = *LogicalAndAllocationSizes
[==========] Running 1 test from 1 test suite.
[----------] Global test environment set-up.
[----------] 1 test from LayoutOpTest
[ RUN      ] LayoutOpTest.LogicalAndAllocationSizes

======= Codegen output for kernel: nvfuser_pointwise_f0_c1_r0_g0 =======

// Codegen generated code
__global__ void nvfuser_pointwise_f0_c1_r0_g0(Tensor<float, 2, 2> T0, Tensor<float, 2, 3> T1) {
  nvfuser_index_t i0;
  i0 = ((nvfuser_index_t)threadIdx.x) + (128LL * ((nvfuser_index_t)blockIdx.x));
  nvfuser_index_t i1;
  i1 = i0 % T0.logical_size[1LL];
  nvfuser_index_t i2;
  i2 = i0 / T0.logical_size[1LL];
  if ((i0 < (T0.logical_size[0LL] * T0.logical_size[1LL]))) {
    Array<float, 1LL, 1> T2;
    T2[0LL] = 0LL;
    T2[0LL]
       = T0[((T0.alloc_stride[0LL] * i2) + (T0.alloc_stride[1LL] * i1))];
    Array<float, 1LL, 1> T3;
    T3[0LL]
       = T2[0LL];
    T1[(i1 + (T1.alloc_stride[0LL] * i2))]
       = T3[0LL];
  }
}

======================================

[       OK ] LayoutOpTest.LogicalAndAllocationSizes (966 ms)
[----------] 1 test from LayoutOpTest (966 ms total)

with true contiguity flag

root@558d9dfeefb8:/opt/pytorch/nvfuser# NVFUSER_DUMP=cuda_kernel ./bin/test_layout_op --gtest_filter="*LogicalAndAllocationSizes"
Running main() from /opt/pytorch/nvfuser/third_party/googletest/googletest/src/gtest_main.cc
Note: Google Test filter = *LogicalAndAllocationSizes
[==========] Running 1 test from 1 test suite.
[----------] Global test environment set-up.
[----------] 1 test from LayoutOpTest
[ RUN      ] LayoutOpTest.LogicalAndAllocationSizes

======= Codegen output for kernel: nvfuser_pointwise_f0_c1_r0_g0 =======

// Codegen generated code
__global__ void nvfuser_pointwise_f0_c1_r0_g0(Tensor<float, 2, 2> T0, Tensor<float, 2, 3> T1) {
  nvfuser_index_t i0;
  i0 = ((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x));
  if ((i0 < (T0.logical_size[0LL] * T0.logical_size[1LL]))) {
    Array<float, 1, 1> T2;
    T2[0] = 0;
    T2[0]
       = T0[((T0.alloc_stride[0LL] * (i0 / T0.logical_size[1LL])) + (T0.alloc_stride[1LL] * (i0 % T0.logical_size[1LL])))];
    Array<float, 1, 1> T3;
    T3[0]
       = T2[0];
    T1[i0]
       = T3[0];
  }
}

======================================

/opt/pytorch/nvfuser/tests/cpp/test_layout_op.cpp:128: Failure
Value of: t0.equal(cg_outputs[0].as<at::Tensor>().slice(1, 0, k))
  Actual: false
Expected: true


// Two issues with the split-and-merge approach:
// 1. This causes predication to expand to the padded region.
// 2. Indexing with allocation domain set as `true` is wrong.
// out->split(1, 16); // padding output to multiple of 16
// out->setAllocationDomain(out->getLoopDomain(), true);
// out->merge(1); // restore loop domain

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
int m = 512;
int k = 9; // note: padded column size would be 16
auto t0 = at::randn({m, k}, options);

FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto cg_outputs = executor_cache.runFusionWithInputs({t0});
// padding on the inner dimension is represented as stride on the outer
// dimension
EXPECT_EQ(
cg_outputs[0].as<at::Tensor>().strides(), std::vector<int64_t>({16, 1}));
// We need to slice because output buffer shape is not right
EXPECT_TRUE(t0.equal(cg_outputs[0].as<at::Tensor>().slice(1, 0, k)));
// TODO: enable this when output buffer shape is fixed.
// output should remain the correct logical size
// EXPECT_EQ(
// cg_outputs[0].as<at::Tensor>().sizes(), std::vector<int64_t>({512,
// 9}));
}
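The stride expectation in this test follows from simple arithmetic on the padded allocation; here is a quick sketch of the numbers, assuming the shapes used in the test above (this is not part of the test file).

#include <cstdint>
#include <iostream>

int main() {
  int64_t m = 512, k = 9, factor = 16;
  // Allocation domain is [m, ceilDiv(k, 16), 16] with the inner two IDs contiguous.
  int64_t padded_k = ((k + factor - 1) / factor) * factor; // 1 * 16 = 16
  // Viewed through the logical domain [m, k], rows are strided by the padded
  // extent while columns stay unit stride, hence the expected strides {16, 1}.
  std::cout << "strides: {" << padded_k << ", 1} for logical sizes {" << m
            << ", " << k << "}\n";
  return 0;
}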

TEST_F(LayoutOpTest, AllocationDomainSplitVectorizationFactor) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

auto inp = makeSymbolicTensor(3);
fusion.addInput(inp);
auto out = set(inp);
fusion.addOutput(out);
// split would prevent vectorization
out->split(1, 16);
out->setAllocationDomain(out->getLoopDomain(), true);
// restore loop domain
out->merge(1);
Collaborator Author: note: revert this!


auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
// because of the split on the middle dimension, we only have the fastest
// dimension participating in vectorization.
auto t0 = at::randn({512, 128, 2}, options);

// NOTE force pointwise scheduler here just for testing purpose
auto cg_results =
scheduleAndRun(fusion_ptr.get(), SchedulerType::PointWise, {t0});
auto pparams = cg_results.heuristic_params->as<PointwiseParams>();
EXPECT_EQ(pparams->vectorization_factor, 2);

testValidate(fusion_ptr.get(), cg_results.outputs, {t0}, __LINE__, __FILE__);
}
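The expected factor of 2 can be read directly off the shapes. A short sketch of the arithmetic follows; the 16-byte-per-access cap is an assumption about the default pointwise heuristics, not something stated in this diff.

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  int64_t sizes[3] = {512, 128, 2};
  int64_t bytes_per_elem = 4;          // float
  int64_t cap = 16 / bytes_per_elem;   // assumed 16-byte vectorized access -> 4 elems
  // The allocation-domain split sits on dimension 1, so the contiguity walk
  // stops there and only the innermost extent can participate.
  int64_t contiguous_inner = sizes[2]; // 2
  std::cout << std::min(cap, contiguous_inner) << "\n"; // 2, matching the test
  return 0;
}

Without the allocation-domain split, the contiguous inner extent would be 128 * 2 = 256, so the factor would presumably be capped at 4 instead.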

TEST_F(LayoutOpTest, CppApi) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();