apache
diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h
Lines changed: 7 additions & 0 deletions
diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
Lines changed: 27 additions & 0 deletions
diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc
Lines changed: 9 additions & 0 deletions
diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h
Lines changed: 2 additions & 0 deletions
diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h
Lines changed: 8 additions & 0 deletions
@@ -608,6 +608,13 @@ class ScheduleNode : public runtime::Object {
    * \param block The block to be inlined to its producer
    */
   virtual void ReverseComputeInline(const BlockRV& block) = 0;
+  /*!
+   * \brief Fuse an epilogue block (e.g., output = reduction_result + bias) into a reduction block
+   * \param reduction_block The reduction block (e.g., matmul) that the epilogue is fused into
+   * \param epilogue_block The epilogue block to be fused (e.g., bias add)
+   */
+  virtual void FuseReductionEpilogue(const BlockRV& reduction_block,
+                                     const BlockRV& epilogue_block) = 0;
   /******** Schedule: Reduction ********/
   /*!
    * \brief Decompose a reduction block into two separate blocks.
 
@@ -2345,6 +2345,33 @@ def after_inline(a: T.handle, c: T.handle) -> None:
         # pylint: disable-next=no-member
         _ffi_api.ScheduleReverseComputeInline(self, block)  # type: ignore
 
+    @type_checked
+    def fuse_reduction_epilogue(
+        self,
+        reduction_block: Union[BlockRV, str],
+        epilogue_block: Union[BlockRV, str],
+    ) -> None:
+        """Fuse an epilogue block into a reduction block.
+
+        It requires:
+        1) The reduction block is a complete reduction block
+        2) The epilogue block only reads from the reduction block's output
+        3) The epilogue performs a simple addition: output = reduction_result + bias
+
+        Parameters
+        ----------
+        reduction_block : Union[BlockRV, str]
+            The reduction block (e.g., matmul)
+        epilogue_block : Union[BlockRV, str]
+            The epilogue block to be fused (e.g., bias add)
+        """
+        reduction_block = self._normalize_block_arg(reduction_block)
+        epilogue_block = self._normalize_block_arg(epilogue_block)
+        # pylint: disable-next=no-member
+        _ffi_api.ScheduleFuseReductionEpilogue(
+            self, reduction_block, epilogue_block
+        )  # type: ignore
+
     ########## Schedule: Reduction ##########
 
     @type_checked
 
@@ -832,6 +832,15 @@ void ConcreteScheduleNode::ReverseComputeInline(const BlockRV& block_rv) {
   this->state_->DebugVerify();
 }
 
+void ConcreteScheduleNode::FuseReductionEpilogue(const BlockRV& reduction_block_rv,
+                                                 const BlockRV& epilogue_block_rv) {
+  TVM_TIR_SCHEDULE_BEGIN();  // paired with TVM_TIR_SCHEDULE_END below
+  tir::FuseReductionEpilogue(state_, this->GetSRef(reduction_block_rv),  // BlockRV -> sref
+                             this->GetSRef(epilogue_block_rv));
+  TVM_TIR_SCHEDULE_END("fuse-reduction-epilogue", this->error_render_level_);
+  this->state_->DebugVerify();  // NOTE(review): presumably validates the state -- confirm
+}
+
 /******** Schedule: Block Annotation ********/
 
 void ConcreteScheduleNode::StorageAlign(const BlockRV& block_rv, int buffer_index, int axis,
 
@@ -147,6 +147,8 @@ class ConcreteScheduleNode : public ScheduleNode {
                         int index = -1) override;
   void ComputeInline(const BlockRV& block) override;
   void ReverseComputeInline(const BlockRV& block) override;
+  void FuseReductionEpilogue(const BlockRV& reduction_block,  // wraps tir::FuseReductionEpilogue
+                             const BlockRV& epilogue_block) override;
   /******** Schedule: Reduction ********/
   BlockRV RFactor(const LoopRV& loop_rv, int factor_axis) override;
   BlockRV DecomposeReduction(const BlockRV& block_rv, const LoopRV& loop_rv) override;
 
@@ -509,6 +509,14 @@ TVM_DLL void ComputeInline(ScheduleState self, const StmtSRef& block_sref);
  * \param block_sref The sref to the block to be inlined to its producer
  */
 TVM_DLL void ReverseComputeInline(ScheduleState self, const StmtSRef& block_sref);
+/*!
+ * \brief Fuse an epilogue block (e.g., a bias add) into a reduction block (e.g., a matmul)
+ * \param self The state of the schedule
+ * \param reduction_block_sref The sref to the reduction block that the epilogue is fused into
+ * \param epilogue_block_sref The sref to the epilogue block to be fused
+ */
+TVM_DLL void FuseReductionEpilogue(ScheduleState self, const StmtSRef& reduction_block_sref,
+                                   const StmtSRef& epilogue_block_sref);
 /******** Schedule: Reduction ********/
 /*!
  * \brief Decompose a reduction block into two separate blocks.