Allow ScatterOp with multiple dimensions as long as extents are the same (#5175)

naoyam · web-flow · commit db9721d5d859 · 2025-09-18T15:39:57.000-07:00
Adds an `exact_sizes` attribute to `ScatterOp`, analogous to the exact size attribute of `GatherOp`.
diff --git a/csrc/device_lower/pass/index.cpp b/csrc/device_lower/pass/index.cpp
@@ -372,6 +372,7 @@ void IndexLowering::handle(const ScatterOp* sop) {
       sop->dim(),
       lowered_index,
       lowered_src,
+      sop->exactSizes(),
       sop->accumulate() ? std::optional(sop->accumulateOp()) : std::nullopt));
   GpuLower::current()->propagateExprInfo(sop, back());
 }
diff --git a/csrc/device_lower/validation.cpp b/csrc/device_lower/validation.cpp
@@ -1339,12 +1339,10 @@ void validateScatter(Fusion* fusion) {
     auto in_tv = sop->in()->as<TensorView>();
     auto out_tv = sop->out()->as<TensorView>();
 
-    // TensorIndexer currently only supports scatter with 1D tensors
-    // due to the non-exactness of non-indexed IDs.
-    NVF_ERROR_EQ(
-        out_tv->getLogicalDomain().size(),
-        1,
-        "Scatter with multi-dimensional tensors is not yet supported: ",
+    // TensorIndexer currently only supports exact scatter ops
+    NVF_ERROR(
+        sop->exactSizes(),
+        "Non-exact scatter is not yet supported: ",
         sop->toString());
 
     // Scatter is implemented as an in-place op. To lower it safely, it
diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h
@@ -252,13 +252,17 @@ class GatherOp : public Expr {
 class ScatterOp : public Expr {
  public:
   using Expr::Expr;
+
+  // exact_sizes: true when non-scatter axes of all inputs are
+  // guaranteed to have the same extents
   ScatterOp(
       IrBuilderPasskey,
       Val* out,
       Val* self,
       int64_t dim,
       Val* index,
       Val* src,
+      bool exact_sizes,
       std::optional<BinaryOpType> accumulate_op = std::nullopt);
 
   NVFUSER_DECLARE_CLONE_AND_CREATE
@@ -295,13 +299,17 @@ class ScatterOp : public Expr {
 
   IterDomain* getIndexedID() const;
 
-  bool accumulate() const {
+  bool exactSizes() const {
     return attribute<bool>(1);
   }
 
+  bool accumulate() const {
+    return attribute<bool>(2);
+  }
+
   BinaryOpType accumulateOp() const {
     NVF_ERROR(accumulate());
-    return attribute<BinaryOpType>(2);
+    return attribute<BinaryOpType>(3);
   }
 };
 
diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp
@@ -292,13 +292,15 @@ ScatterOp::ScatterOp(
     int64_t dim,
     Val* index,
     Val* src,
+    bool exact_sizes,
     std::optional<BinaryOpType> accumulate_op)
     : Expr(passkey) {
   addInput(self);
   addInput(index);
   addInput(src);
   addOutput(out);
   addDataAttribute(dim);
+  addDataAttribute(exact_sizes);
   // is this accumulate?
   addDataAttribute(accumulate_op.has_value());
   if (accumulate_op.has_value()) {
diff --git a/csrc/logical_domain_map.cpp b/csrc/logical_domain_map.cpp
@@ -141,6 +141,17 @@ std::pair<std::unordered_set<IterDomain*>, bool> getNonMappingDomainInfo(
       // we are not mapping anything, `has_consumer_id` doesn't matter.
       has_consumer_id = false;
     }
+  } else if (auto sop = dynamic_cast<ScatterOp*>(consumer_tv->definition())) {
+    if (producer_tv != sop->in()) {
+      auto producer_logical =
+          TensorDomain::noReductions(producer_tv->getLogicalDomain());
+      for (const auto& [i, p_id] : enumerate(producer_logical)) {
+        if ((int64_t)i == sop->dim() || !sop->exactSizes()) {
+          non_mapping_ids.insert(p_id);
+        }
+      }
+      has_consumer_id = true;
+    }
   }
 
   return std::make_pair(non_mapping_ids, has_consumer_id);
@@ -153,15 +164,6 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseLogicalDomainMap::map(
     const TensorDomain* consumer,
     const std::unordered_set<IterDomain*>& dims_to_map,
     bool producer_to_consumer) const {
-  // In the case of scatter, nothing is guaranteed to map except for
-  // the self producer. Note that in PyTorch even non-indexed
-  // dimensions of index and src tensors are not guaranteed to have
-  // the same extent as the self/out tensors.
-  if (auto sop = dynamic_cast<ScatterOp*>(consumer_tv_->definition());
-      sop != nullptr && producer_tv_ != sop->in()) {
-    return {};
-  }
-
   std::vector<bool> broadcast_flags;
   if (auto* bop = dynamic_cast<BroadcastOp*>(consumer_tv_->definition())) {
     broadcast_flags = bop->getBroadcastDimFlags();
diff --git a/csrc/ops/indexing.cpp b/csrc/ops/indexing.cpp
@@ -6,6 +6,7 @@
  */
 // clang-format on
 
+#include <expr_simplifier.h>
 #include <ir/all_nodes.h>
 #include <ir/builder.h>
 #include <ir/iostream.h>
@@ -182,6 +183,22 @@ TensorView* scatter(
       "dimensions in scatter like ops.");
   dim = wrapDim(dim, (int64_t)self_dom.size());
 
+  bool is_exact = true;
+  for (const auto i : arange(std::ssize(self_dom))) {
+    if (i == dim) {
+      continue;
+    }
+    Val* self_id_size = self_dom.at(i)->getMaybeExpandedExtent();
+    Val* idx_id_size = idx_dom.at(i)->getMaybeExpandedExtent();
+    auto same_size =
+        simplifyExpr(SimplifyingIrBuilder::eqExpr(self_id_size, idx_id_size));
+    if (same_size->isTrue()) {
+      continue;
+    }
+    is_exact = false;
+    break;
+  }
+
   // The shape of output tensor is same as self tensor.
   std::vector<IterDomain*> out_logical;
   for (const auto i : arange(self_dom.size())) {
@@ -195,13 +212,16 @@ TensorView* scatter(
   }
 
   // Create the loop domain based on the logical domain of the index
-  // tensor.
+  // tensor. For non-scatter axes, reuse the logical IDs if exact.
   std::vector<IterDomain*> out_loop;
   out_loop.reserve(idx_dom.size());
-  std::ranges::transform(
-      idx_dom, std::back_inserter(out_loop), [](IterDomain* id) {
-        return IterDomainBuilder(id).build();
-      });
+  for (const auto& [i, idx_id] : enumerate(idx_dom)) {
+    if ((int64_t)i == dim || !is_exact) {
+      out_loop.push_back(IterDomainBuilder(idx_id).build());
+    } else {
+      out_loop.push_back(out_logical.at(i));
+    }
+  }
 
   // Create the output tensor. The validation of the loop domain needs
   // to be skipped as it is not guaranteed to be equivalent to the
@@ -226,7 +246,7 @@ TensorView* scatter(
   }
 
   IrBuilder::create<ScatterOp>(
-      out_tensor, self, dim, index, src, accumulate_op);
+      out_tensor, self, dim, index, src, is_exact, accumulate_op);
 
   return out_tensor->as<TensorView>();
 }
diff --git a/csrc/scheduler/greedy.cpp b/csrc/scheduler/greedy.cpp
@@ -239,10 +239,9 @@ class CompileTimeChecker : private IterVisitor {
     auto inp = scatter->in()->as<TensorView>();
     auto out = scatter->out()->as<TensorView>();
 
-    if (out->getLogicalDomain().size() != 1) {
+    if (!scatter->exactSizes()) {
       can_schedule_ = false;
-      setRejectReason(
-          "Scatter with multi-dimensional tensors is not yet supported");
+      setRejectReason("Non-exact scatter is not yet supported");
       return;
     }
 
diff --git a/tests/cpp/test_scatter.cpp b/tests/cpp/test_scatter.cpp
@@ -168,7 +168,69 @@ TEST_F(ScatterTest, GridCounting) {
   testValidate(&fusion, outputs, {t0}, __LINE__, __FILE__);
 }
 
-TEST_F(ScatterTest, BlockCountingWithShmem2D) {
+TEST_P(ScatterTest, BlockCountingWithShmem2DExact) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  const std::vector<int64_t> self_shape{4, 100};
+  const std::vector<int64_t> index_shape{4, 10};
+
+  auto tv0 = makeContigConcreteTensor(index_shape, DataType::Int);
+  fusion.addInput(tv0);
+
+  auto tv1 = set(tv0);
+  auto tv2 = zeros(
+      {IrBuilder::create<Val>(self_shape[0]),
+       IrBuilder::create<Val>(self_shape[1])},
+      DataType::Int);
+  auto tv3 = ones(
+      {IrBuilder::create<Val>(index_shape[0]),
+       IrBuilder::create<Val>(index_shape[1])},
+      DataType::Int);
+  auto tv4 = scatter(tv2, 1, tv1, tv3);
+  auto tv5 = set(tv4);
+  fusion.addOutput(tv5);
+
+  auto options = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
+  auto t0 = at::randperm(self_shape[1], options)
+                .slice(0, 0, index_shape[1])
+                .repeat({index_shape[0], 1});
+
+  if (manual_scheduling) {
+    for (auto tv : fusion.allTvs()) {
+      tv->axis(0)->parallelize(ParallelType::BIDx);
+      tv->axis(1)->parallelize(ParallelType::TIDx);
+    }
+
+    // Scatter input must use the same memory as the output
+    tv2->setMemoryType(MemoryType::Shared);
+    tv2->setAllocationDomain(tv2->getLogicalDomain(), true);
+    tv4->setMemoryType(MemoryType::Shared);
+    tv4->setAllocationDomain(tv4->getLogicalDomain(), true);
+
+    KernelExecutor ke;
+    ke.compile(&fusion, {t0});
+    auto outputs = ke.run({t0});
+
+    testValidate(&fusion, outputs, {t0}, __LINE__, __FILE__);
+  } else {
+    FusionExecutorCache executor_cache(std::move(fusion_ptr));
+    auto outputs = executor_cache.runFusionWithInputs({t0});
+    testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__);
+    FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+    // All ops should be taken care of by the greedy scheduler, but
+    // there's an additional segment due to a segmenter_set. Not sure
+    // why it gets inserted.
+    EXPECT_THAT(
+        runtime->fusionSegments()->groups(),
+        testing::UnorderedElementsAre(
+            HeuristicIs(SchedulerType::ExprEval),
+            HeuristicIs(SchedulerType::Greedy)));
+  }
+}
+
+TEST_F(ScatterTest, BlockCountingWithShmem2DNonExact) {
   // Scatter allows the non-indexed domains of the index tensor to
   // have smaller extents, which causes an indexing error as there's
   // no traversal path. It is not currently supported.
@@ -209,7 +271,9 @@ TEST_F(ScatterTest, BlockCountingWithShmem2D) {
   tv4->setAllocationDomain(tv4->getLogicalDomain(), true);
 
   auto options = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
-  auto t0 = at::randperm(self_shape[1], options).slice(0, 0, index_shape[1]);
+  auto t0 = at::randperm(self_shape[1], options)
+                .slice(0, 0, index_shape[1])
+                .repeat({index_shape[0], 1});
 
   KernelExecutor ke;
   ke.compile(&fusion, {t0});

Original file line number	Diff line number	Diff line change
`@@ -372,6 +372,7 @@ void IndexLowering::handle(const ScatterOp* sop) {`
`372`	`372`	`sop->dim(),`
`373`	`373`	`lowered_index,`
`374`	`374`	`lowered_src,`
	`375`	`+ sop->exactSizes(),`
`375`	`376`	`sop->accumulate() ? std::optional(sop->accumulateOp()) : std::nullopt));`
`376`	`377`	`GpuLower::current()->propagateExprInfo(sop, back());`
`377`	`378`	`}`