Commit 5d8efce

Fix decomposeLinearWithBias to shard all created tensorviews (#5563)
Some of the created tensorviews were not sharded consistently, which led to more communication than needed.
1 parent: fbf3dfd
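To make the fix concrete, here is a rough plain-PyTorch sketch (not nvFuser code; the device count and shapes are made up) of what decomposeLinearWithBias produces for a row-parallel linear: a partial matmul per device, a reduction across devices, and a bias add on the reduced result. Every tensorview created for the bias-add path has to carry the same sharding as the reduced output; otherwise the add runs on unsharded data and extra communication is scheduled.

import torch

d, b, s, e = 4, 3, 5, 7  # devices, batch, sequence, output features (made up)
inp = torch.randint(-2, 3, (b, s, d * e)).float()  # sharded on the last dim
weight = torch.randint(-2, 3, (e, d * e)).float()  # sharded on the last dim
bias = torch.randint(-2, 3, (e,)).float()  # replicated

# Each "device" contracts only its own slice of the inner dimension.
partials = [
    inp[..., r * e : (r + 1) * e] @ weight[:, r * e : (r + 1) * e].t()
    for r in range(d)
]

# `without_bias`: the cross-device reduction (a reduce-scatter in the real run).
without_bias = torch.stack(partials).sum(0)

# `broadcasted_bias` + add + cast: these intermediates must stay sharded like
# `without_bias`, or the bias add forces the partial results to be gathered.
out = without_bias + bias

torch.testing.assert_close(out, torch.nn.functional.linear(inp, weight, bias))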

4 files changed, +52 -26 lines changed

csrc/multidevice/propagation.cpp

Lines changed: 6 additions & 2 deletions

@@ -82,9 +82,13 @@ std::unordered_map<IterDomain*, IterDomain*> getRef2TargetMap(
     const TensorView* target,
     PropagateDirection direction) {
   if (direction == PropagateDirection::kForward) {
-    return PairwiseLogicalDomainMap(ref, target).mapProducerToConsumer();
+    return PairwiseLogicalDomainMap(ref, target)
+        .mapBroadcast(false)
+        .mapProducerToConsumer();
   }
-  return PairwiseLogicalDomainMap(target, ref).mapConsumerToProducer();
+  return PairwiseLogicalDomainMap(target, ref)
+      .mapBroadcast(false)
+      .mapConsumerToProducer();
 }

 // Propagates the given device/stream ids from ref to target.

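As a minimal sketch of what mapBroadcast(false) changes during sharding propagation, in plain Python rather than nvFuser IR (the Axis/Tensor classes and axis names below are invented for the example): axes that are broadcast on either side are left unmapped, so a size-1 broadcast dimension never inherits the reference tensor's device split.

from dataclasses import dataclass


@dataclass
class Axis:
    name: str
    size: int
    is_broadcast: bool = False
    parallel: str | None = None  # e.g. "mesh_x"


@dataclass
class Tensor:
    axes: list[Axis]


def propagate_parallel(ref: Tensor, target: Tensor) -> None:
    # Pair axes by name; a broadcast axis on either side stays unmapped,
    # mirroring PairwiseLogicalDomainMap(...).mapBroadcast(false).
    target_by_name = {a.name: a for a in target.axes}
    for ref_axis in ref.axes:
        target_axis = target_by_name.get(ref_axis.name)
        if target_axis is None or ref_axis.is_broadcast or target_axis.is_broadcast:
            continue
        target_axis.parallel = ref_axis.parallel


# The reduce-scattered output is sharded on its sequence axis; the broadcast
# bias has that axis only as a size-1 broadcast, so it must not inherit the split.
out = Tensor([Axis("s", 5, parallel="mesh_x"), Axis("e", 7)])
broadcasted_bias = Tensor([Axis("s", 1, is_broadcast=True), Axis("e", 7)])
propagate_parallel(ref=out, target=broadcasted_bias)
assert [a.parallel for a in broadcasted_bias.axes] == [None, None]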
csrc/preseg_passes/decompose_reshardings.cpp

Lines changed: 22 additions & 2 deletions

@@ -314,7 +314,6 @@ void decomposeRowParallelLinearWithBias(Fusion* fusion) {
     }

     auto* without_bias = linear(linear_op->inA(), linear_op->inB());
-    TransformReplay::selfReplay(out->domain(), without_bias->domain());

     TensorView* broadcasted_bias = [&]() {
       const int64_t rank_after_broadcast = std::ssize(
@@ -330,8 +329,29 @@ void decomposeRowParallelLinearWithBias(Fusion* fusion) {

     TensorView* new_out =
         maybeCastOp(out->dtype(), add(without_bias, broadcasted_bias));
-    TransformReplay::selfReplay(out->domain(), new_out->domain());
+
     ir_utils::replaceValInAllExprInputsAndFusionOutputs(out, new_out);
+
+    // Shard without_bias to match new_out so that reduction ID is properly
+    // sharded.
+    TransformReplay::selfReplay(out->domain(), without_bias->domain());
+    TransformReplay::selfReplay(out->domain(), new_out->domain());
+    // Backpropagate shardings to consistently shard all intermediate
+    // expressions. Forward propagating may miss sharding tensorviews
+    // on the path between `bias` and `new_out`.
+    for (Expr* expr : StmtSort::getExprsBetween(
+                          {without_bias, broadcasted_bias}, {new_out}) |
+             std::views::reverse) {
+      for (auto* output : ir_utils::filterByType<TensorView>(expr->outputs())) {
+        for (auto* input : ir_utils::filterByType<TensorView>(expr->inputs())) {
+          shardLoopLike(
+              /*ref=*/output,
+              /*target=*/input,
+              deviceAndStreamParallelTypes(),
+              PropagateDirection::kBackward);
+        }
+      }
+    }
   }
 }

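The new loop visits the expressions between without_bias/broadcasted_bias and new_out in reverse topological order and shards each expression's inputs like its output; forward propagation alone can leave the add and cast intermediates unsharded. A toy version of that back-propagation in plain Python (the expression list and sharding strings are invented; this is not the nvFuser API):

# Expressions in topological order: (op, inputs, outputs).
exprs = [
    ("add", ["without_bias", "broadcasted_bias"], ["add_out"]),
    ("cast", ["add_out"], ["new_out"]),
]

# Only the final output carries the desired sharding to begin with.
sharding = {"new_out": "mesh_x on the scattered axis"}

# Walk the expressions in reverse, as std::views::reverse does above, so every
# input inherits the sharding of the output that consumes it.
for _op, inputs, outputs in reversed(exprs):
    for out in outputs:
        if out not in sharding:
            continue
        for inp in inputs:
            sharding.setdefault(inp, sharding[out])

assert all(
    sharding[tv] == "mesh_x on the scattered axis"
    for tv in ["without_bias", "broadcasted_bias", "add_out", "new_out"]
)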
csrc/runtime/communication_executor.cpp

Lines changed: 1 addition & 1 deletion

@@ -85,7 +85,7 @@ KernelArgumentHolder CommunicationExecutor::run(
         group_id_);
     SegmentProfiler& sprof = FusionProfiler::segment(group_id_);
     sprof.inputBytesAccessed(computeBytes(args));
-    sprof.scheduler(toString(SchedulerType::ExprEval));
+    sprof.scheduler(toString(SchedulerType::Communication));
     sprof.startKernel();
   }
   NVF_ERROR(host_ir_container_, "Need to compile before you can run.");

tests/python/multidevice/test_matmul.py

Lines changed: 23 additions & 21 deletions

@@ -6,7 +6,7 @@
 import torch

 import nvfuser_direct as nvfuser
-from nvfuser_direct import DataType, FusionDefinition
+from nvfuser_direct import DataType, FusionDefinition, PythonProfiler


 # Avoid doing this when possible. This test started to exist before nvFuser
@@ -197,50 +197,52 @@ def _multidevice_schedule(fd: FusionDefinition):
 def test_linear_reduce_scatter(multidevice_direct_test):
     d = multidevice_direct_test.size
     mesh = nvfuser.multidevice.DeviceMesh(torch.arange(d))
-    e = 768
+    b, s, e = 3, 5, 7

     def _definition(fd: FusionDefinition):
-        inp = fd.define_tensor([-1, -1, d * e])
-        weight = fd.define_tensor([e, d * e])
-        out = fd.ops.linear(inp, weight, None)
+        inp = fd.define_tensor([-1, d * s, d * e], dtype=DataType.BFloat16)
+        weight = fd.define_tensor([-1, d * e], dtype=DataType.BFloat16)
+        bias = fd.define_tensor([e], dtype=DataType.BFloat16)
+        out = fd.ops.linear(inp, weight, bias)
         fd.add_output(out)

     def _multidevice_schedule(fd: FusionDefinition):
-        inp, weight = fd.fusion.inputs()
+        inp, weight, bias = fd.fusion.inputs()
         (out,) = fd.fusion.outputs()
-        for t in [inp, weight, out]:
-            t.set_device_mesh(mesh)
-            t.outer_split(-1, d)
-            t.axis(-2).parallelize(nvfuser.ParallelType.mesh_x)
+        bias.set_device_mesh(mesh)
+        for tv in [inp, weight, out]:
+            tv.set_device_mesh(mesh)
+            tv.split(-1, d, inner_split=False)
+            tv.axis(-2).parallelize(nvfuser.ParallelType.mesh_x)

         # Scatter
         out.outer_split(1, d)
         out.axis(1).parallelize(nvfuser.ParallelType.mesh_x)

     torch.cuda.set_device(multidevice_direct_test.local_rank)

-    # set b=1 as a temporary fix for the test to pass.
-    # TODO: set b>1 once reduce scatter is fixed.
-    b, s = 2, 1024
-    unsharded_inp = torch.randn(b, s, d * e)
-    unsharded_weight = torch.randn(e, d * e)
-
+    unsharded_inp = torch.randint(-2, 3, (b, d * s, d * e)).to(torch.bfloat16)
+    unsharded_weight = torch.randint(-2, 3, (e, d * e)).to(torch.bfloat16)
+    bias = torch.randint(-2, 3, (e,)).to(torch.bfloat16)
     inp = multidevice_direct_test.shard_tensor(unsharded_inp, -1, mesh)
     weight = multidevice_direct_test.shard_tensor(unsharded_weight, -1, mesh)

     with FusionDefinition() as fd:
         _definition(fd)
         _multidevice_schedule(fd)

-    (out,) = fd.execute([inp, weight])
+    with PythonProfiler() as prof:
+        (out,) = fd.execute([inp, weight, bias.cuda()])

-    unsharded_out = torch.nn.functional.linear(unsharded_inp, unsharded_weight, None)
-    # rtol is the same as the default for fp32. atol is slightly increased.
+    # Only one reduce scatter kernel should be scheduled.
+    assert len(
+        [kp for kp in prof.profile.kernel_profiles if kp.scheduler == "communication"]
+    ) == (1 if d > 1 else 0)
+
+    unsharded_out = torch.nn.functional.linear(unsharded_inp, unsharded_weight, bias)
     torch.testing.assert_close(
         out,
         multidevice_direct_test.shard_tensor(unsharded_out, 1, mesh),
-        rtol=1.3e-6,
-        atol=1e-3,
     )