Draft

59 commits
fe90fb5
PR0: Relax assert on non-device split on allocation domain
jjsjann123 Sep 18, 2025
a0df5e9
relaxing the check
jjsjann123 Sep 18, 2025
5097533
Adding test validating vectorization
jjsjann123 Sep 18, 2025
d4b7c8b
renaming
jjsjann123 Sep 19, 2025
4b07e79
clangformat
jjsjann123 Sep 19, 2025
051fc9e
I think it's working now!
jjsjann123 Sep 19, 2025
bf85c0b
clangformat
jjsjann123 Sep 19, 2025
6ff1050
quick patch
jjsjann123 Sep 22, 2025
f2f43be
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Sep 22, 2025
a32f54b
fix clearing allocation domain on cache for cacheBefore
jjsjann123 Sep 22, 2025
cf6e609
revert changes
jjsjann123 Sep 23, 2025
b303923
updating tests
jjsjann123 Sep 23, 2025
17dbf23
i was dumb as always
jjsjann123 Sep 23, 2025
c2a3aeb
why is it so hard for me
jjsjann123 Sep 23, 2025
1a156be
Apply suggestions from code review
jjsjann123 Sep 23, 2025
2081d0c
clangformat
jjsjann123 Sep 23, 2025
6f674aa
Merge branch 'main' into jj/allocation_PR_0
jjsjann123 Sep 23, 2025
4f8ecfc
Merge remote-tracking branch 'origin/main' into jj/allocation_PR_0
jjsjann123 Sep 26, 2025
ee37038
reverting selfReplay & cacheBefore changes per reviewer's comments
jjsjann123 Sep 26, 2025
f87e99d
wip
jjsjann123 Sep 26, 2025
c5155ff
wip
jjsjann123 Sep 26, 2025
ded16ec
wip
jjsjann123 Sep 26, 2025
f02440c
wip
jjsjann123 Sep 26, 2025
aa084bc
errr zip
jjsjann123 Sep 26, 2025
f82ad1f
wip
jjsjann123 Sep 26, 2025
d9a33d8
err, forgot to push something last night
jjsjann123 Sep 26, 2025
6dda5e2
typo
jjsjann123 Sep 26, 2025
173a7e9
skipping checks
jjsjann123 Sep 26, 2025
98654a0
wip
jjsjann123 Sep 26, 2025
a870f9d
relaxing checks in tests
jjsjann123 Sep 26, 2025
bca1734
wip
jjsjann123 Sep 26, 2025
d91ac03
clean up IDs for cacheBefore
jjsjann123 Sep 26, 2025
eff3069
clear up definition of output TV for cacheBefore
jjsjann123 Sep 26, 2025
2105e1e
fixing one alias test!
jjsjann123 Sep 27, 2025
bdaaccb
wip
jjsjann123 Sep 27, 2025
fdf9dba
fixing definition
jjsjann123 Sep 27, 2025
65022bd
wip
jjsjann123 Sep 27, 2025
5431648
not set allocation domain when original output doesn't have it
jjsjann123 Sep 27, 2025
75b06b5
update output itertype
jjsjann123 Sep 27, 2025
c2bf4cf
wip
jjsjann123 Sep 27, 2025
c5d66b6
wip
jjsjann123 Sep 27, 2025
b1836f5
wip
jjsjann123 Sep 27, 2025
7ee9317
fixing contiguity in fullselfreplay
jjsjann123 Sep 27, 2025
f7bbab2
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Sep 27, 2025
afccea0
fixing transpose tests
jjsjann123 Sep 27, 2025
482afc8
set parallelization type after fullseflreplay
jjsjann123 Sep 27, 2025
255055d
fix mark alias
jjsjann123 Oct 1, 2025
599d809
fixing alias analysis
jjsjann123 Oct 1, 2025
eadf148
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Oct 24, 2025
fe9c1f6
quick patch on nvfuser::schedule_matmul::Common::cacheBefore
jjsjann123 Oct 24, 2025
f927809
quick patch on nvfuser::schedule_matmul::Common::updateIdModel
jjsjann123 Oct 25, 2025
493434a
agent you can do better!
jjsjann123 Oct 25, 2025
92fb6f9
err
jjsjann123 Oct 25, 2025
642d9a8
Merge branch 'main' into jj/allocation_PR_0
jjsjann123 Oct 28, 2025
10eb4e0
Merge branch 'main' into jj/allocation_PR_0
jjsjann123 Oct 30, 2025
6ff66f2
try self replay so allocation domain is preserved for multi device
jjsjann123 Oct 30, 2025
2950054
err revert something that's not working
jjsjann123 Oct 30, 2025
a30432c
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Nov 12, 2025
b1e7352
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Nov 14, 2025
35 changes: 18 additions & 17 deletions csrc/multidevice/utils.cpp
@@ -776,21 +776,16 @@ std::unordered_set<TensorView*> getTvsWithDifferentSharding(
return ret;
}

void validateDeviceSplit(Expr* expr) {
NVF_ERROR(expr != nullptr, "Expected a valid expression.");
auto* split = dynamic_cast<Split*>(expr);
NVF_ERROR(
split != nullptr,
"Only split expressions are supported for producing device ids: ",
expr->toString());
NVF_ERROR(
split->outer()->isDeviceDim(),
"Expected the outer dimension to be a device dimension: ",
expr->toString());
NVF_ERROR(
!split->innerSplit(),
"Inner split by device dimension is not supported: ",
expr->toString());
bool isValidateDeviceSplit(Expr* expr) {
if (expr == nullptr || !expr->isA<Split>()) {
return false;
}
auto* split = expr->as<Split>();
if (split == nullptr || !split->outer()->isDeviceDim() ||
split->innerSplit()) {
return false;
}
return true;
}

IterDomain* projectShardedAllocationToLogical(
@@ -806,7 +801,10 @@ IterDomain* projectShardedAllocationToLogical(

IterDomain* logical_id = allocation_id;
for (Expr* expr : exprs | std::views::reverse) {
validateDeviceSplit(expr);
NVF_ERROR(
isValidateDeviceSplit(expr),
"invalid device split: ",
expr->toString());
logical_id = expr->as<Split>()->in();
}
return logical_id;
@@ -825,7 +823,10 @@ IterDomain* projectLogicalToShardedAllocation(
tv->getMaybeAllocationDomain().end()});
IterDomain* allocation_id = logical_id;
for (auto expr : exprs) {
validateDeviceSplit(expr);
NVF_ERROR(
isValidateDeviceSplit(expr),
"invalid device split: ",
expr->toString());
allocation_id = expr->as<Split>()->inner();
}
return allocation_id;
2 changes: 1 addition & 1 deletion csrc/multidevice/utils.h
@@ -169,7 +169,7 @@ std::vector<int64_t> unshardedSizes(

// Validate the expression is a valid DID split: expr is an outer split with
// device dim as the outer dimension.
void validateDeviceSplit(Expr* expr);
bool isValidateDeviceSplit(Expr* expr);

// Find the producing logical id of the given allocation id traversing
// through device splits. For unsharded allocation_id, logical_id is the same as
11 changes: 10 additions & 1 deletion csrc/scheduler/vectorize_helper.cpp
@@ -807,13 +807,22 @@ Val* ContiguousInnerDimensionsMapper::getContigMergeOfInnerSize(
{alloc_iid});
IterDomain* logical_id = alloc_iid;
Val* num_devices = of_tv->container()->oneVal();
bool only_valid_device_split = true;
for (Expr* expr : exprs | std::views::reverse) {
validateDeviceSplit(expr);
if (!isValidateDeviceSplit(expr)) {
only_valid_device_split = false;
break;
}
auto* split = expr->as<Split>();
logical_id = split->in();
num_devices = SimplifyingIrBuilder::mulExpr(num_devices, split->factor());
}

// A non-device split could lead to padding, which prevents vectorization
Collaborator:
Can we have a non-device split if not for padding? Should we throw an error here so we do not end up with random transforms?

Collaborator Author:
I might have missed your suggestion.

If I throw on seeing a split here, then we wouldn't be able to support padding via transformations on the allocation domain.

Collaborator:
Sorry about the confusion. To clarify, I was wondering if we can assert that a non-device split is in fact padding, and not a random transform. For example, if it is a divisible split, we can include it in vectorization, correct?

Collaborator Author:
Gotcha.

I think a split on the allocation domain is meant for padding;
vectorization should be handled in the loop domain.

A more complicated case is if we use split and permute on the allocation domain to represent blocking, i.e. in some sense it is indeed used to facilitate vectorization/TMA. But conceptually I think they are still different.

Going back to the topic of divisible vs. non-divisible splits: we would need to specialize during concretization if we are to distinguish them (assuming dynamic shape). I tend to think of that as more of an optimization.
Without that assert, we are leaving margin in the max vectorization factor, which I think isn't too big a deal. 🤞

if (!only_valid_device_split) {
break;
}

// Mapping order isn't correct, cannot expand vectorization dimension.
if (projected_dims[--projected_dims_i] != logical_id) {
break;
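A minimal standalone model of the relaxed allocation-to-logical walk sketched in the diff above (plain C++, not nvfuser code; `SplitInfo` and `foldDeviceSplits` are illustrative names over a simplified split representation): device splits fold into the device count, and the first non-device split, e.g. a padding split on the allocation domain, simply ends the walk instead of asserting, which only leaves some margin on the maximum vectorization factor.

```cpp
// Simplified standalone model of the relaxed allocation->logical walk.
// Not nvfuser code: SplitInfo and foldDeviceSplits are illustrative only.
#include <cassert>
#include <cstdint>
#include <vector>

struct SplitInfo {
  bool is_device_split;  // outer split whose outer output is a device dimension
  int64_t factor;
};

// Folds device-split factors into a device count; stops at the first
// non-device split (e.g. a padding split) instead of erroring out.
int64_t foldDeviceSplits(const std::vector<SplitInfo>& allocation_to_logical) {
  int64_t num_devices = 1;
  for (const SplitInfo& split : allocation_to_logical) {
    if (!split.is_device_split) {
      break;  // padding/blocking split: just stop, leaving vectorization margin
    }
    num_devices *= split.factor;
  }
  return num_devices;
}

int main() {
  // A DID-sharded id: a single outer device split by 2 -> device count 2.
  assert(foldDeviceSplits({{true, 2}}) == 2);
  // A padded id: split(1, 16) on the allocation domain is not a device split,
  // so the walk stops right away instead of triggering an assert.
  assert(foldDeviceSplits({{false, 16}}) == 1);
  return 0;
}
```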
64 changes: 64 additions & 0 deletions tests/cpp/test_layout_op.cpp
@@ -70,6 +70,70 @@
}
};

TEST_F(LayoutOpTest, LogicalAndAllocationSizes) {
Collaborator:
What is being tested here?

Collaborator Author:
Without the relaxation in the vectorization analysis, this test would trigger an assert.

So the test just verifies that we now allow a split on the allocation domain.
In the follow-up PR, we add more validation to this test to check that the produced tensor matches the logical sizes.

auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

auto inp = makeSymbolicTensor(2);
fusion.addInput(inp);
auto out = set(inp);
fusion.addOutput(out);
// padding output to multiple of 16
out->split(1, 16);
out->setAllocationDomain(out->getLoopDomain(), true);
// restore loop domain
out->merge(1);
Collaborator:
This doesn't restore. Is this necessary?

Collaborator Author:
Touché. It un-splits the loop domain so that it has the same size as the logical domain.
You are right that the extent is no longer the same, so it's not a restoration.

Schedulers expect an un-scheduled fusion. Without this merge, I'm hitting this assert:

NVF_ERROR(broadcast_bit_multiples.size() == ref_loop.size());

Collaborator:
Hmm, not sure that's a good enough WAR, though this is just a test.

I thought the schedulers could work with some scheduled loop domains (for DID parallelization), no?

Collaborator Author:
// We always cacheBefore output at the beginning of the scheduling. And after
// cacheBefore, the reference tensor will have all reduction IDs removed.
ref_loop = TensorDomain::noDevices(TensorDomain::noReductions(ref_loop));

DID-related IDs are just ignored by the scheduler, so that handling is too specific to multi-device.

I'm not a fan of this either. Let me see if I can skip messing with the loop domain and apply the transformation on the allocation domain directly.

Collaborator:
I suppose you can just modify the allocation domain with AbstractTensor. I remember there are some tests.

Collaborator Author:
I can also directly use IterDomain::split for that.

Anyway, it looks like if the transformation is not on the logical-to-loop path, our replay won't pick it up. It feels similar to the allocation domain replay that rfactor was missing. FYI @Priya2698

#0  nvfuser::nvfCheckFail (func=0xaaaaac218080 "validateDomainEquivalence",
    file=0xaaaaac216938 "/opt/pytorch/nvfuser/csrc/ir/utils.cpp", line=1162,
    msg=" INTERNAL ASSERT FAILED at /opt/pytorch/nvfuser/csrc/ir/utils.cpp:1162, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. \nExpected !compare_result.dom0_has_u"...) at /opt/pytorch/nvfuser/csrc/exceptions.cpp:267
#1  0x0000aaaaab1bbe68 in nvfuser::nvfErrorFail (func=0xaaaaac218080 "validateDomainEquivalence",
    file=0xaaaaac216938 "/opt/pytorch/nvfuser/csrc/ir/utils.cpp", line=1162,
    condMsg=0xaaaaac217fd8 " INTERNAL ASSERT FAILED at /opt/pytorch/nvfuser/csrc/ir/utils.cpp:1162, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. ",
    userMsg="Expected !compare_result.dom0_has_unreachable_ids . dom0 has unreachable IDs. dom0: iS10{i0}, iS11{i2}. dom1: iS10{i0}") at /opt/pytorch/nvfuser/csrc/exceptions.cpp:277
#2  0x0000aaaaab60a3e8 in nvfuser::ir_utils::validateDomainEquivalence (
    dom0=std::vector of length 2, capacity 2 = {...}, dom1=std::vector of length 1, capacity 3 = {...},
    additional_ids=std::vector of length 0, capacity 0) at /opt/pytorch/nvfuser/csrc/ir/utils.cpp:1162
#3  0x0000aaaaab4aac30 in nvfuser::TensorDomain::setAllocationDomain (this=0xaaaab20918b0,
    new_allocation_domain=std::vector of length 1, capacity 3 = {...},
    new_contiguity=std::vector of length 1, capacity 3 = {...})
    at /opt/pytorch/nvfuser/csrc/ir/nodes.cpp:4055
#4  0x0000aaaaabc7b368 in nvfuser::TransformReplay::replayCasP (consumer=0xaaaab2088c00,
    producer=0xaaaab2091200, producer_pos=2, logical_map=..., opt=...)
    at /opt/pytorch/nvfuser/csrc/transform_replay.cpp:917
#5  0x0000aaaaabc7b7fc in nvfuser::TransformReplay::replayCasP (consumer=0xaaaab2088c00,
    producer=0xaaaab2091200, compute_at_axis=-1, opt=...)
    at /opt/pytorch/nvfuser/csrc/transform_replay.cpp:945
#6  0x0000aaaaabc44ccc in nvfuser::TensorView::cacheBefore (this=0xaaaab2088c00,
    op_type=nvfuser::LoadStoreOpType::Set) at /opt/pytorch/nvfuser/csrc/tensor_view.cpp:1160
#7  0x0000aaaaabbdb250 in nvfuser::scheduler_utils::cacheAndForkOutputs (fusion=0xaaaab2084910,
    unroll=true) at /opt/pytorch/nvfuser/csrc/scheduler/utils.cpp:1357
#8  0x0000aaaaabb067dc in nvfuser::schedulePointwise (fusion=0xaaaab2084910, pparams=0xaaaab207f880)
    at /opt/pytorch/nvfuser/csrc/scheduler/pointwise.cpp:822
#9  0x0000aaaaabb0898c in nvfuser::PointWiseScheduler::schedule (this=0xaaaab2083460,
    fusion=0xaaaab2084910, params=0xaaaab207f880)
    at /opt/pytorch/nvfuser/csrc/scheduler/pointwise.cpp:1304

Collaborator:
So, what did you decide to do? Nothing seems to have changed?

> I can also directly use IterDomain::split for that.

Of course, but you'd need to maintain the proper ordering of the ID vector yourself.

Collaborator:
> I can also directly use IterDomain::split for that.
>
> Anyway, it looks like if the transformation is not on the logical-to-loop path, our replay won't pick it up. It feels similar to the allocation domain replay that rfactor was missing. FYI @Priya2698

Yes, rfactor replay for allocation will also complain similarly if the allocation transforms are disjoint from root-to-loop.
replayPasC also uses the loop domain as the target, so if you intend to use IterDomain::split, we will have to update that, among other things.

Collaborator Author:
Yep, switched to selfReplay instead of replayCasP for TensorView::cacheBefore.


auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
int m = 512;
int k = 9; // note: padded column size would be 16
auto t0 = at::randn({m, k}, options);

FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto cg_outputs = executor_cache.runFusionWithInputs({t0});
// padding on the inner dimension is represented as stride on the outer
// dimension
EXPECT_EQ(
cg_outputs[0].as<at::Tensor>().strides(), std::vector<int64_t>({16, 1}));
// We need to slice because output buffer shape is not right
EXPECT_TRUE(t0.equal(cg_outputs[0].as<at::Tensor>().slice(1, 0, k)));
// TODO: enable this when output buffer shape is fixed.
// output should remain the correct logical size
// EXPECT_EQ(
// cg_outputs[0].as<at::Tensor>().sizes(), std::vector<int64_t>({512,
// 9}));
}
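A quick standalone check of the stride expectation above (plain C++, not nvfuser code; `ceilDiv` here is a local helper, not the library's): with k = 9 padded by the `split(1, 16)` to an allocation row of 16 elements, element (i, j) of the output sits at linear offset i * 16 + j, which is exactly strides {16, 1} over the [512, 9] logical view.

```cpp
// Standalone arithmetic check (not nvfuser code) for the {16, 1} stride
// expectation in LogicalAndAllocationSizes: the inner extent of 9 is padded
// to ceilDiv(9, 16) * 16 = 16 elements per row.
#include <cassert>
#include <cstdint>

int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}

int main() {
  const int64_t k = 9;
  const int64_t factor = 16;
  const int64_t padded_row = ceilDiv(k, factor) * factor;  // 16
  auto offset = [&](int64_t i, int64_t j) { return i * padded_row + j; };
  assert(offset(1, 0) - offset(0, 0) == 16);  // outer stride
  assert(offset(0, 1) - offset(0, 0) == 1);   // inner stride
  assert(padded_row - k == 7);                // 7 padding elements per row
  return 0;
}
```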

TEST_F(LayoutOpTest, AllocationDomainSplitVectorizationFactor) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

auto inp = makeSymbolicTensor(3);
fusion.addInput(inp);
auto out = set(inp);
fusion.addOutput(out);
// split would prevent vectorization
out->split(1, 16);
out->setAllocationDomain(out->getLoopDomain(), true);
// restore loop domain
out->merge(1);
Collaborator Author:
note: revert this!


auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
// because of the split on the middle dimension, we only have the fastest
// dimension participating in vectorization.
auto t0 = at::randn({512, 128, 2}, options);

// NOTE force pointwise scheduler here just for testing purpose
auto cg_results =
scheduleAndRun(fusion_ptr.get(), SchedulerType::PointWise, {t0});
auto pparams = cg_results.heuristic_params->as<PointwiseParams>();
EXPECT_EQ(pparams->vectorization_factor, 2);

testValidate(fusion_ptr.get(), cg_results.outputs, {t0}, __LINE__, __FILE__);
}
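For intuition on why the expected factor above is 2, a rough standalone model (plain C++, not nvfuser code; `Dim` and `contiguousInnerElems` are illustrative only): the contiguous-inner-size expansion walks dimensions from the innermost outward and stops at the middle dimension because its allocation-domain split is not a device split, so only the innermost extent of 2 floats participates in vectorization.

```cpp
// Rough standalone model (not nvfuser code) of the contiguous-inner-size
// expansion for AllocationDomainSplitVectorizationFactor.
#include <cassert>
#include <cstdint>
#include <vector>

struct Dim {
  int64_t extent;
  bool stops_expansion;  // e.g. a non-device allocation split on this dim
};

// Multiplies extents from the innermost dimension outward, stopping at the
// first dimension whose allocation transform blocks further expansion.
int64_t contiguousInnerElems(const std::vector<Dim>& dims_outer_to_inner) {
  int64_t elems = 1;
  for (auto it = dims_outer_to_inner.rbegin(); it != dims_outer_to_inner.rend();
       ++it) {
    if (it->stops_expansion) {
      break;
    }
    elems *= it->extent;
  }
  return elems;
}

int main() {
  // {512, 128, 2} with the middle dimension split by 16 on the allocation domain.
  std::vector<Dim> dims = {{512, false}, {128, true}, {2, false}};
  assert(contiguousInnerElems(dims) == 2);  // matches vectorization_factor == 2
  return 0;
}
```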

TEST_F(LayoutOpTest, CppApi) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();