Skip to content
Closed
4 changes: 4 additions & 0 deletions include/NeuraDialect/Architecture/ArchitectureSpec.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ struct LinkOverride {
// This is set by the command line tool when a YAML file is provided.
std::string getArchitectureSpecFile();

// Function for getting the latency specification file path.
// This is set by the command line tool when a YAML file is provided.
std::string getLatencySpecFile();

// Function for getting tile defaults configuration.
TileDefaults getTileDefaults();

Expand Down
46 changes: 41 additions & 5 deletions include/NeuraDialect/Mapping/MappingState.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@
namespace mlir {
namespace neura {

// Occupy status for multi-cycle pipeline support.
// These states define how a tile/FU is occupied at a given time step.
// A plain (unscoped) enum is used instead of #define macros so the constants
// are grouped under one type and scoped to this namespace, while the
// enumerators still convert implicitly to `int` for existing callers that
// store the status as `int occupy_status`.
enum OccupyStatus {
  SINGLE_OCCUPY = 0,     // A single-cycle op is in the FU (exclusive).
  START_PIPE_OCCUPY = 1, // A multi-cycle op starts in the FU.
  END_PIPE_OCCUPY = 2,   // A multi-cycle op ends in the FU.
  IN_PIPE_OCCUPY = 3     // A multi-cycle op is occupying the FU (pipelined);
                         // per review discussion, intended for inclusive
                         // execution (tile ports left free) — implementation
                         // still pending, TODO confirm final semantics.
};
Comment on lines +16 to +18
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aren't the 3 *_PIPE_OCCUPY overlapping with each other?

Copy link
Copy Markdown
Collaborator Author

@HobbitQia HobbitQia Jan 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, 3 means the multi-cycle op will not occupy the input and output ports of the tile, so we can map other operations onto this tile — this is the inclusive execution we proposed earlier in our DATE paper.

However, I have not finished implementing and testing inclusive execution yet; for now I have just copied some content from CGRA-Mapper and will tune it in the future.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So IN_PIPE_OCCUPY does not include start and end, right?


// Represents a spatial-temporal location: (resource, time_step)
struct MappingLoc {
BasicResource *resource;
Expand Down Expand Up @@ -54,9 +61,20 @@ namespace neura {
class MappingState {
public:
MappingState(const Architecture &arch, int II, bool is_spatial_only);
// Binds a (tile/link, time_step) location to an operation.
// Binds a (tile/link, time_step) location to an operation with default
// SINGLE_OCCUPY status.
bool bindOp(const MappingLoc &loc, Operation *op);

// Binds a (tile/link, time_step) location to an operation with specified
// occupy status for multi-cycle pipeline support.
bool bindOp(const MappingLoc &loc, Operation *op, int occupy_status);

// Binds multiple locations for a multi-cycle operation.
// This sets START_PIPE_OCCUPY at start_time, IN_PIPE_OCCUPY for the
// intermediate steps, and END_PIPE_OCCUPY at start_time + latency - 1.
bool bindMultiCycleOp(BasicResource *resource, int start_time, int latency,
Operation *op);

// Unbinds an operation from its (tile/link, time_step) location,
// which is useful for backtracking.
void unbindOp(Operation *op);
Expand All @@ -67,6 +85,19 @@ class MappingState {
// it will check (tile 2, step 1), (tile 2, step 5), (tile 2, step 9), etc.
bool isAvailableAcrossTime(const MappingLoc &loc) const;

// Checks if a location is available for a specific occupy status.
// This implements the pipeline-aware availability checking:
// - SINGLE_OCCUPY: only available if location is completely free
// - START_PIPE_OCCUPY: available if free or IN_PIPE_OCCUPY or END_PIPE_OCCUPY
// - END_PIPE_OCCUPY: available if free or IN_PIPE_OCCUPY or START_PIPE_OCCUPY
// - IN_PIPE_OCCUPY: always available (can pipeline with any status)
bool isAvailableForOccupyStatus(const MappingLoc &loc,
int new_occupy_status) const;

// Gets the occupy status at a specific location across time domain.
// Returns -1 if the location is not occupied.
int getOccupyStatusAcrossTime(const MappingLoc &loc) const;

// Checks if a hardware resource is available across a time range.
// This function leverages the isAvailableAcrossTime function in each
// time step.
Expand Down Expand Up @@ -111,7 +142,8 @@ class MappingState {
void dumpOpToLocs(llvm::raw_ostream &os = llvm::errs()) const;

// Getters for state information.
const std::set<MappingLoc> &getOccupiedLocs() const {
const std::map<MappingLoc, std::vector<std::pair<int, Operation *>>> &
getOccupiedLocs() const {
return this->occupied_locs;
}
const std::map<MappingLoc, Operation *> &getLocToOp() const {
Expand All @@ -122,7 +154,9 @@ class MappingState {
}

// Setters for state information.
void setOccupiedLocs(const std::set<MappingLoc> &locs) {
void setOccupiedLocs(
const std::map<MappingLoc, std::vector<std::pair<int, Operation *>>>
&locs) {
this->occupied_locs = locs;
}
void setLocToOp(const std::map<MappingLoc, Operation *> &loc_to_op) {
Expand All @@ -139,7 +173,9 @@ class MappingState {
bool is_spatial_only;
static constexpr int kMaxSteps = 10;

std::set<MappingLoc> occupied_locs;
// Maps location to a list of (occupy_status, operation) pairs.
// Multiple ops can occupy the same location with compatible pipeline states.
std::map<MappingLoc, std::vector<std::pair<int, Operation *>>> occupied_locs;
std::map<MappingLoc, Operation *> loc_to_op;
std::map<Operation *, std::vector<MappingLoc>> op_to_locs;
};
Expand All @@ -160,7 +196,7 @@ class MappingStateSnapshot {
}

private:
std::set<MappingLoc> occupied_locs;
std::map<MappingLoc, std::vector<std::pair<int, Operation *>>> occupied_locs;
std::map<MappingLoc, Operation *> loc_to_op;
std::map<Operation *, std::vector<MappingLoc>> op_to_locs;
};
Expand Down
7 changes: 7 additions & 0 deletions include/NeuraDialect/Mapping/mapping_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,5 +116,12 @@ bool canReachLocInTime(const std::vector<Operation *> &producers,
Register *getAvailableRegister(const MappingState &mapping_state, Tile *tile,
int start_time, int exclusive_end_time);

// Gets the execution latency of an operation from its "latency" attribute.
// Returns 1 (single-cycle) if the attribute is not present.
int getOpLatency(Operation *op);

// Checks if an operation is a multi-cycle operation (latency > 1).
bool isMultiCycleOp(Operation *op);

} // namespace neura
} // namespace mlir
2 changes: 2 additions & 0 deletions include/NeuraDialect/NeuraPasses.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ std::unique_ptr<mlir::Pass> createWrapLoopInKernelPass();
// Hardware specific optimization passes
std::unique_ptr<mlir::Pass> createFuseLoopControlPass();
std::unique_ptr<mlir::Pass> createFusePatternPass();
std::unique_ptr<mlir::Pass> createFuseKernelPass();

// Hardware agnostic optimization passes
std::unique_ptr<mlir::Pass> createFoldConstantPass();
Expand All @@ -49,6 +50,7 @@ std::unique_ptr<mlir::Pass> createInitPatternPass();

// Hardware optimization passes
std::unique_ptr<mlir::Pass> createHardwareMergePass();
std::unique_ptr<mlir::Pass> createInitExecLatencyPass();

#define GEN_PASS_REGISTRATION
#include "NeuraDialect/NeuraPasses.h.inc"
Expand Down
23 changes: 23 additions & 0 deletions include/NeuraDialect/NeuraPasses.td
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,21 @@ def FusePattern : Pass<"fuse-pattern", "ModuleOp"> {
let constructor = "neura::createFusePatternPass()";
}

// TableGen definition for the --fuse-kernel pass. The summary/description
// strings below are surfaced verbatim in the generated pass docs and
// --help output, so they are kept unchanged here.
def FuseKernel : Pass<"fuse-kernel", "ModuleOp"> {
let summary = "Fuses kernel operations in the Neura dialect";
let description = [{
This pass fuses neura.kernel operations using producer-consumer and sibling
fusion strategies, inspired by MLIR's linalg and affine loop fusion.

Producer-Consumer Fusion: Fuses a producer kernel into its consumer when
the producer's output is only used by the consumer.

Sibling Fusion: Fuses kernels that share the same input operands and have
no data dependencies between them.
}];
let constructor = "neura::createFuseKernelPass()";
}

def InsertDataMov : Pass<"insert-data-mov", "ModuleOp"> {
let summary = "Inserts data move operations in the Neura dialect";
let description =
Expand Down Expand Up @@ -194,4 +209,12 @@ def HardwareMerge : Pass<"hardware-merge", "ModuleOp"> {
}];
let constructor = "neura::createHardwareMergePass()";
}

// NOTE(review): the description below is thin — presumably this pass attaches
// per-op "latency" attributes that getOpLatency()/isMultiCycleOp() in
// mapping_util later read, likely driven by the latency spec file from
// getLatencySpecFile(). TODO: confirm and expand the description string.
def InitExecLatency : Pass<"init-exec-latency", "ModuleOp"> {
let summary = "Initialize execution latency information";
let description = [{
This pass initializes execution latency information.
}];
let constructor = "neura::createInitExecLatencyPass()";
}
#endif // NEURA_PASSES_TD
Loading