coredac · HobbitQia · Mar 6, 2026 · Jan 2, 2026 · Jan 3, 2026 · Jan 6, 2026
diff --git a/include/NeuraDialect/Architecture/Architecture.h b/include/NeuraDialect/Architecture/Architecture.h
@@ -560,6 +560,9 @@ class Architecture {
 
 // Function for getting the architecture object.
 const Architecture &getArchitecture();
+
+// Function for getting the latency specification file path.
+const std::string &getLatencySpecFile();
 } // namespace neura
 } // namespace mlir
 

diff --git a/include/NeuraDialect/Mapping/MappingState.h b/include/NeuraDialect/Mapping/MappingState.h
@@ -10,6 +10,13 @@
 namespace mlir {
 namespace neura {
 
+// Occupy status for multi-cycle pipeline support.
+// These states define how a tile/FU is occupied at a given time step.
+#define SINGLE_OCCUPY     0 // A single-cycle op is in the FU (exclusive)
+#define START_PIPE_OCCUPY 1 // A multi-cycle op starts in the FU
+#define END_PIPE_OCCUPY   2 // A multi-cycle op ends in the FU
+#define IN_PIPE_OCCUPY    3 // A multi-cycle op is occupying the FU (pipelined)
+
 // Represents a spatial-temporal location: (resource, time_step)
 struct MappingLoc {
   BasicResource *resource;
@@ -54,9 +61,20 @@ namespace neura {
 class MappingState {
 public:
   MappingState(const Architecture &arch, int II, bool is_spatial_only);
-  // Binds a (tile/link, time_step) location to an operation.
+  // Binds a (tile/link, time_step) location to an operation with default
+  // SINGLE_OCCUPY status.
   bool bindOp(const MappingLoc &loc, Operation *op);
 
+  // Binds a (tile/link, time_step) location to an operation with specified
+  // occupy status for multi-cycle pipeline support.
+  bool bindOp(const MappingLoc &loc, Operation *op, int occupy_status);
+
+  // Binds multiple locations for a multi-cycle operation.
+  // This sets START_PIPE_OCCUPY at start_time, IN_PIPE_OCCUPY for intermediate
+  // times, and END_PIPE_OCCUPY at end_time-1 (end_time = start_time + latency).
+  bool bindMultiCycleOp(BasicResource *resource, int start_time, int latency,
+                        Operation *op);
+
   // Unbinds an operation from its (tile/link, time_step) location,
   // which is useful for backtracking.
   void unbindOp(Operation *op);
@@ -67,6 +85,19 @@ class MappingState {
   // it will check (tile 2, step 1), (tile 2, step 5), (tile 2, step 9), etc.
   bool isAvailableAcrossTime(const MappingLoc &loc) const;
 
+  // Checks if a location is available for a specific occupy status.
+  // This implements the pipeline-aware availability checking:
+  // - SINGLE_OCCUPY: only available if location is completely free
+  // - START_PIPE_OCCUPY: available if free or IN_PIPE_OCCUPY or END_PIPE_OCCUPY
+  // - END_PIPE_OCCUPY: available if free or IN_PIPE_OCCUPY or START_PIPE_OCCUPY
+  // - IN_PIPE_OCCUPY: always available (can pipeline with any status)
+  bool isAvailableForOccupyStatus(const MappingLoc &loc,
+                                  int new_occupy_status) const;
+
+  // Gets the occupy status at a specific location across time domain.
+  // Returns -1 if the location is not occupied.
+  int getOccupyStatusAcrossTime(const MappingLoc &loc) const;
+
   // Checks if a hardware resource is available across a time range.
   // This function leverages the isAvailableAcrossTime function in each
   // time step.
@@ -111,7 +142,8 @@ class MappingState {
   void dumpOpToLocs(llvm::raw_ostream &os = llvm::errs()) const;
 
   // Getters for state information.
-  const std::set<MappingLoc> &getOccupiedLocs() const {
+  const std::map<MappingLoc, std::vector<std::pair<int, Operation *>>> &
+  getOccupiedLocs() const {
     return this->occupied_locs;
   }
   const std::map<MappingLoc, Operation *> &getLocToOp() const {
@@ -122,7 +154,9 @@ class MappingState {
   }
 
   // Setters for state information.
-  void setOccupiedLocs(const std::set<MappingLoc> &locs) {
+  void setOccupiedLocs(
+      const std::map<MappingLoc, std::vector<std::pair<int, Operation *>>>
+          &locs) {
     this->occupied_locs = locs;
   }
   void setLocToOp(const std::map<MappingLoc, Operation *> &loc_to_op) {
@@ -139,7 +173,9 @@ class MappingState {
   bool is_spatial_only;
   static constexpr int kMaxSteps = 10;
 
-  std::set<MappingLoc> occupied_locs;
+  // Maps location to a list of (occupy_status, operation) pairs.
+  // Multiple ops can occupy the same location with compatible pipeline states.
+  std::map<MappingLoc, std::vector<std::pair<int, Operation *>>> occupied_locs;
   std::map<MappingLoc, Operation *> loc_to_op;
   std::map<Operation *, std::vector<MappingLoc>> op_to_locs;
 };
@@ -160,7 +196,7 @@ class MappingStateSnapshot {
   }
 
 private:
-  std::set<MappingLoc> occupied_locs;
+  std::map<MappingLoc, std::vector<std::pair<int, Operation *>>> occupied_locs;
   std::map<MappingLoc, Operation *> loc_to_op;
   std::map<Operation *, std::vector<MappingLoc>> op_to_locs;
 };

diff --git a/include/NeuraDialect/Mapping/mapping_util.h b/include/NeuraDialect/Mapping/mapping_util.h
@@ -116,5 +116,12 @@ bool canReachLocInTime(const std::vector<Operation *> &producers,
 Register *getAvailableRegister(const MappingState &mapping_state, Tile *tile,
                                int start_time, int exclusive_end_time);
 
+// Gets the execution latency of an operation from its "latency" attribute.
+// Returns 1 (single-cycle) if the attribute is not present.
+int getOpLatency(Operation *op);
+
+// Checks if an operation is a multi-cycle operation (latency > 1).
+bool isMultiCycleOp(Operation *op);
+
 } // namespace neura
 } // namespace mlir
diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h
@@ -51,6 +51,7 @@ std::unique_ptr<mlir::Pass> createInitPatternPass();
 
 // Hardware optimization passes
 std::unique_ptr<mlir::Pass> createHardwareMergePass();
+std::unique_ptr<mlir::Pass> createInitExecLatencyPass();
 
 #define GEN_PASS_REGISTRATION
 #include "NeuraDialect/NeuraPasses.h.inc"

diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td
@@ -220,4 +220,12 @@ def HardwareMerge : Pass<"hardware-merge", "ModuleOp"> {
   }];
   let constructor = "neura::createHardwareMergePass()";
 }
+
+def InitExecLatency : Pass<"init-exec-latency", "ModuleOp"> {
+  let summary = "Initialize execution latency information";
+  let description = [{
+    This pass initializes execution latency information.
+  }];
+  let constructor = "neura::createInitExecLatencyPass()";
+}
 #endif // NEURA_PASSES_TD
diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
@@ -22,6 +22,7 @@ void registerTosaToAffineConversionPassPipeline();
 std::unique_ptr<mlir::Pass> createConstructHyperblockFromTaskPass();
 std::unique_ptr<mlir::Pass> createClassifyCountersPass();
 std::unique_ptr<mlir::Pass> createMapTaskOnCgraPass();
+std::unique_ptr<mlir::Pass> createFuseTaskPass();
 
 //=========================================================//
 // Optimization Passes

diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td
@@ -73,6 +73,27 @@ def MapTaskOnCgra : Pass<"map-task-on-cgra", "func::FuncOp"> {
   let constructor = "taskflow::createMapTaskOnCgraPass()";
 }
 
+def FuseTask : Pass<"fuse-task", "func::FuncOp"> {
+  let summary = "Fuses Taskflow tasks using producer-consumer and sibling strategies";
+  let description = [{
+    Fuses taskflow.task operations using producer-consumer and sibling
+    fusion strategies. Uses Neura-level MII metrics for profitability analysis.
+
+    Producer-Consumer Fusion: Fuses a producer task into its consumer when
+    the producer's memory output feeds directly into the consumer.
+
+    Sibling Fusion: Fuses tasks that share inputs without data dependency.
+  }];
+  let constructor = "taskflow::createFuseTaskPass()";
+  let dependentDialects = [
+    "mlir::LLVM::LLVMDialect",
+    "mlir::func::FuncDialect",
+    "mlir::arith::ArithDialect",
+    "mlir::memref::MemRefDialect",
+    "mlir::neura::NeuraDialect",
+    "mlir::taskflow::TaskflowDialect"];
+}
+
 def MemoryAccessStreamingFusion : Pass<"memory-access-streaming-fusion", "func::FuncOp"> {
   let summary = "Fuses tasks connected by memory dependencies for streaming execution";
   let description = [{

diff --git a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
@@ -67,10 +67,9 @@ struct HyperblockToKernelPattern
     }
 
     // Asserts that each task contains only one hyperblock.
+    // (No longer enforced: fused tasks may contain multiple hyperblocks.)
     int hyperblock_count = 0;
     task_op.walk([&](TaskflowHyperblockOp op) { hyperblock_count++; });
-    assert(hyperblock_count == 1 &&
-           "Each taskflow.task should contain only one hyperblock");
 
     Block &hb_block = hyperblock_op.getBody().front();
     Block &task_block = task_op.getBody().front();