diff --git a/include/NeuraDialect/Architecture/Architecture.h b/include/NeuraDialect/Architecture/Architecture.h index 4f8e5cc2..80b4d1b5 100644 --- a/include/NeuraDialect/Architecture/Architecture.h +++ b/include/NeuraDialect/Architecture/Architecture.h @@ -560,6 +560,9 @@ class Architecture { // Function for getting the architecture object. const Architecture &getArchitecture(); + +// Function for getting the latency specification file path. +const std::string &getLatencySpecFile(); } // namespace neura } // namespace mlir diff --git a/include/NeuraDialect/Mapping/MappingState.h b/include/NeuraDialect/Mapping/MappingState.h index a43cea04..99dc737f 100644 --- a/include/NeuraDialect/Mapping/MappingState.h +++ b/include/NeuraDialect/Mapping/MappingState.h @@ -10,6 +10,13 @@ namespace mlir { namespace neura { +// Occupy status for multi-cycle pipeline support. +// These states define how a tile/FU is occupied at a given time step. +#define SINGLE_OCCUPY 0 // A single-cycle op is in the FU (exclusive) +#define START_PIPE_OCCUPY 1 // A multi-cycle op starts in the FU +#define END_PIPE_OCCUPY 2 // A multi-cycle op ends in the FU +#define IN_PIPE_OCCUPY 3 // A multi-cycle op is occupying the FU (pipelined) + // Represents a spatial-temporal location: (resource, time_step) struct MappingLoc { BasicResource *resource; @@ -54,9 +61,20 @@ namespace neura { class MappingState { public: MappingState(const Architecture &arch, int II, bool is_spatial_only); - // Binds a (tile/link, time_step) location to an operation. + // Binds a (tile/link, time_step) location to an operation with default + // SINGLE_OCCUPY status. bool bindOp(const MappingLoc &loc, Operation *op); + // Binds a (tile/link, time_step) location to an operation with specified + // occupy status for multi-cycle pipeline support. + bool bindOp(const MappingLoc &loc, Operation *op, int occupy_status); + + // Binds multiple locations for a multi-cycle operation. 
+ // This sets START_PIPE_OCCUPY at start_time, IN_PIPE_OCCUPY for intermediate + // times, and END_PIPE_OCCUPY at end_time-1. + bool bindMultiCycleOp(BasicResource *resource, int start_time, int latency, + Operation *op); + // Unbinds an operation from its (tile/link, time_step) location, // which is useful for backtracking. void unbindOp(Operation *op); @@ -67,6 +85,19 @@ class MappingState { // it will check (tile 2, step 1), (tile 2, step 5), (tile 2, step 9), etc. bool isAvailableAcrossTime(const MappingLoc &loc) const; + // Checks if a location is available for a specific occupy status. + // This implements the pipeline-aware availability checking: + // - SINGLE_OCCUPY: only available if location is completely free + // - START_PIPE_OCCUPY: available if free or IN_PIPE_OCCUPY or END_PIPE_OCCUPY + // - END_PIPE_OCCUPY: available if free or IN_PIPE_OCCUPY or START_PIPE_OCCUPY + // - IN_PIPE_OCCUPY: always available (can pipeline with any status) + bool isAvailableForOccupyStatus(const MappingLoc &loc, + int new_occupy_status) const; + + // Gets the occupy status at a specific location across time domain. + // Returns -1 if the location is not occupied. + int getOccupyStatusAcrossTime(const MappingLoc &loc) const; + // Checks if a hardware resource is available across a time range. // This function leverages the isAvailableAcrossTime function in each // time step. @@ -111,7 +142,8 @@ class MappingState { void dumpOpToLocs(llvm::raw_ostream &os = llvm::errs()) const; // Getters for state information. - const std::set &getOccupiedLocs() const { + const std::map>> & + getOccupiedLocs() const { return this->occupied_locs; } const std::map &getLocToOp() const { @@ -122,7 +154,9 @@ class MappingState { } // Setters for state information. 
- void setOccupiedLocs(const std::set &locs) { + void setOccupiedLocs( + const std::map>> + &locs) { this->occupied_locs = locs; } void setLocToOp(const std::map &loc_to_op) { @@ -139,7 +173,9 @@ class MappingState { bool is_spatial_only; static constexpr int kMaxSteps = 10; - std::set occupied_locs; + // Maps location to a list of (occupy_status, operation) pairs. + // Multiple ops can occupy the same location with compatible pipeline states. + std::map>> occupied_locs; std::map loc_to_op; std::map> op_to_locs; }; @@ -160,7 +196,7 @@ class MappingStateSnapshot { } private: - std::set occupied_locs; + std::map>> occupied_locs; std::map loc_to_op; std::map> op_to_locs; }; diff --git a/include/NeuraDialect/Mapping/mapping_util.h b/include/NeuraDialect/Mapping/mapping_util.h index ee6fcefc..fd2f2329 100644 --- a/include/NeuraDialect/Mapping/mapping_util.h +++ b/include/NeuraDialect/Mapping/mapping_util.h @@ -116,5 +116,12 @@ bool canReachLocInTime(const std::vector &producers, Register *getAvailableRegister(const MappingState &mapping_state, Tile *tile, int start_time, int exclusive_end_time); +// Gets the execution latency of an operation from its "latency" attribute. +// Returns 1 (single-cycle) if the attribute is not present. +int getOpLatency(Operation *op); + +// Checks if an operation is a multi-cycle operation (latency > 1). 
+bool isMultiCycleOp(Operation *op); + } // namespace neura } // namespace mlir diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index 56a9e785..c3a03f35 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -51,6 +51,7 @@ std::unique_ptr createInitPatternPass(); // Hardware optimization passes std::unique_ptr createHardwareMergePass(); +std::unique_ptr createInitExecLatencyPass(); #define GEN_PASS_REGISTRATION #include "NeuraDialect/NeuraPasses.h.inc" diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td index f7fc06a3..cb0e8646 100644 --- a/include/NeuraDialect/NeuraPasses.td +++ b/include/NeuraDialect/NeuraPasses.td @@ -220,4 +220,12 @@ def HardwareMerge : Pass<"hardware-merge", "ModuleOp"> { }]; let constructor = "neura::createHardwareMergePass()"; } + +def InitExecLatency : Pass<"init-exec-latency", "ModuleOp"> { + let summary = "Initialize execution latency information"; + let description = [{ + This pass initializes execution latency information. 
+ }]; + let constructor = "neura::createInitExecLatencyPass()"; +} #endif // NEURA_PASSES_TD \ No newline at end of file diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index a23c5b02..92393d7c 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -22,6 +22,7 @@ void registerTosaToAffineConversionPassPipeline(); std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createClassifyCountersPass(); std::unique_ptr createMapTaskOnCgraPass(); +std::unique_ptr createFuseTaskPass(); //=========================================================// // Optimization Passes diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 8d765498..960f44c6 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -73,6 +73,27 @@ def MapTaskOnCgra : Pass<"map-task-on-cgra", "func::FuncOp"> { let constructor = "taskflow::createMapTaskOnCgraPass()"; } +def FuseTask : Pass<"fuse-task", "func::FuncOp"> { + let summary = "Fuses Taskflow tasks using producer-consumer and sibling strategies"; + let description = [{ + Fuses taskflow.task operations using producer-consumer and sibling + fusion strategies. Uses Neura-level MII metrics for profitability analysis. + + Producer-Consumer Fusion: Fuses a producer task into its consumer when + the producer's memory output feeds directly into the consumer. + + Sibling Fusion: Fuses tasks that share inputs without data dependency. 
+ }]; + let constructor = "taskflow::createFuseTaskPass()"; + let dependentDialects = [ + "mlir::LLVM::LLVMDialect", + "mlir::func::FuncDialect", + "mlir::arith::ArithDialect", + "mlir::memref::MemRefDialect", + "mlir::neura::NeuraDialect", + "mlir::taskflow::TaskflowDialect"]; +} + def MemoryAccessStreamingFusion : Pass<"memory-access-streaming-fusion", "func::FuncOp"> { let summary = "Fuses tasks connected by memory dependencies for streaming execution"; let description = [{ diff --git a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp index 80fe8e0c..36c83f22 100644 --- a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp +++ b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp @@ -67,10 +67,9 @@ struct HyperblockToKernelPattern } // Asserts that each task contains only one hyperblock. + // (Fused tasks may contain multiple hyperblocks, which is valid.) int hyperblock_count = 0; task_op.walk([&](TaskflowHyperblockOp op) { hyperblock_count++; }); - assert(hyperblock_count == 1 && - "Each taskflow.task should contain only one hyperblock"); Block &hb_block = hyperblock_op.getBody().front(); Block &task_block = task_op.getBody().front(); diff --git a/lib/NeuraDialect/Mapping/MappingState.cpp b/lib/NeuraDialect/Mapping/MappingState.cpp index 110d1976..d537eeea 100644 --- a/lib/NeuraDialect/Mapping/MappingState.cpp +++ b/lib/NeuraDialect/Mapping/MappingState.cpp @@ -3,6 +3,7 @@ #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "llvm/Support/raw_ostream.h" +#include using namespace mlir; using namespace mlir::neura; @@ -30,14 +31,62 @@ MappingState::MappingState(const Architecture &arch, int II, : II(II), is_spatial_only(is_spatial_only) {} bool MappingState::bindOp(const MappingLoc &loc, Operation *op) { + // Default to SINGLE_OCCUPY for backward compatibility + return bindOp(loc, op, SINGLE_OCCUPY); +} + +bool MappingState::bindOp(const MappingLoc &loc, Operation 
*op, + int occupy_status) { + // Check if the location is available for the specified occupy status + if (!isAvailableForOccupyStatus(loc, occupy_status)) { + return false; + } + loc_to_op[loc] = op; - occupied_locs.insert(loc); + occupied_locs[loc].push_back({occupy_status, op}); auto it = op_to_locs.find(op); assert(it == op_to_locs.end() && "Operation already has reserved locations"); op_to_locs[op].push_back(loc); return true; } +bool MappingState::bindMultiCycleOp(BasicResource *resource, int start_time, + int latency, Operation *op) { + // First check if all locations are available + for (int t = start_time; t < start_time + latency; ++t) { + MappingLoc check_loc = {resource, t}; + int status; + if (t == start_time) { + status = START_PIPE_OCCUPY; + } else if (t == start_time + latency - 1) { + status = END_PIPE_OCCUPY; + } else { + status = IN_PIPE_OCCUPY; + } + if (!isAvailableForOccupyStatus(check_loc, status)) { + return false; + } + } + + // Now bind all locations + for (int t = start_time; t < start_time + latency; ++t) { + MappingLoc loc = {resource, t}; + int status; + if (t == start_time) { + status = START_PIPE_OCCUPY; + } else if (t == start_time + latency - 1) { + status = END_PIPE_OCCUPY; + } else { + status = IN_PIPE_OCCUPY; + } + + loc_to_op[loc] = op; + occupied_locs[loc].push_back({status, op}); + op_to_locs[op].push_back(loc); + } + return true; +} + void MappingState::unbindOp(Operation *op) { auto it = op_to_locs.find(op); if (it == op_to_locs.end()) { @@ -46,7 +95,21 @@ void MappingState::unbindOp(Operation *op) { for (const MappingLoc &loc : it->second) { loc_to_op.erase(loc); - occupied_locs.erase(loc); + // Remove entries for this op from occupied_locs + auto occ_it = occupied_locs.find(loc); + if (occ_it != occupied_locs.end()) { + auto &entries = occ_it->second; + entries.erase( + std::remove_if(entries.begin(), entries.end(), + [op](const std::pair &entry) { + return entry.second == op; + }), + entries.end()); + // Remove the 
location entirely if no more entries + if (entries.empty()) { + occupied_locs.erase(occ_it); + } + } } op_to_locs.erase(it); @@ -57,21 +120,128 @@ bool MappingState::isAvailableAcrossTime(const MappingLoc &loc) const { if (this->is_spatial_only) { for (int t = 0; t < II * kMaxSteps; ++t) { MappingLoc check_loc = {loc.resource, t}; - if (occupied_locs.find(check_loc) != occupied_locs.end()) { - return false; + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end()) { + // Check if all existing occupy statuses allow new single-cycle op + for (const auto &entry : it->second) { + if (entry.first != IN_PIPE_OCCUPY) { + return false; + } + } } } return true; } else { - // Checks the availability across time domain. for (int t = loc.time_step % II; t < II * kMaxSteps; t += II) { MappingLoc check_loc = {loc.resource, t}; - if (occupied_locs.find(check_loc) != occupied_locs.end()) { + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end()) { + // Check if all existing occupy statuses allow new single-cycle op + for (const auto &entry : it->second) { + if (entry.first != IN_PIPE_OCCUPY) { + return false; + } + } + } + } + return true; + } +} + +bool MappingState::isAvailableForOccupyStatus(const MappingLoc &loc, + int new_occupy_status) const { + // Helper lambda to check a single location against all existing entries + auto checkSingleLoc = [this, new_occupy_status](const MappingLoc &check_loc) -> bool { + auto it = occupied_locs.find(check_loc); + if (it == occupied_locs.end() || it->second.empty()) { + // Location is free, always available + return true; + } + + // Check against all existing entries at this location + for (const auto &entry : it->second) { + int existing_status = entry.first; + + // Implement the pipeline-aware availability rules: + // - SINGLE_OCCUPY (0): exclusive, no other op can share + // - START_PIPE_OCCUPY (1): cannot coexist with SINGLE or another START + // - END_PIPE_OCCUPY (2): cannot coexist with SINGLE or 
another END + // - IN_PIPE_OCCUPY (3): can coexist with any status except SINGLE + + if (existing_status == SINGLE_OCCUPY) { + // SINGLE_OCCUPY blocks everything + return false; + } + + if (new_occupy_status == SINGLE_OCCUPY) { + // SINGLE_OCCUPY cannot be placed if anything is there return false; } + + if (new_occupy_status == START_PIPE_OCCUPY) { + // START cannot coexist with another START + if (existing_status == START_PIPE_OCCUPY) { + return false; + } + } + + if (new_occupy_status == END_PIPE_OCCUPY) { + // END cannot coexist with another END + if (existing_status == END_PIPE_OCCUPY) { + return false; + } + } + + // IN_PIPE_OCCUPY can coexist with START, END, or other IN_PIPE } return true; + }; + + // For spatial mapping, check all time steps + if (this->is_spatial_only) { + for (int t = 0; t < II * kMaxSteps; ++t) { + MappingLoc check_loc = {loc.resource, t}; + if (!checkSingleLoc(check_loc)) { + return false; + } + } + return true; + } else { + // Check across time domain (modulo II) + for (int t = loc.time_step % II; t < II * kMaxSteps; t += II) { + MappingLoc check_loc = {loc.resource, t}; + if (!checkSingleLoc(check_loc)) { + return false; + } + } + return true; + } +} + +int MappingState::getOccupyStatusAcrossTime(const MappingLoc &loc) const { + // For spatial mapping, check all time steps + if (this->is_spatial_only) { + for (int t = 0; t < II * kMaxSteps; ++t) { + MappingLoc check_loc = {loc.resource, t}; + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end() && !it->second.empty()) { + // Return the first status found (most restrictive) + return it->second[0].first; + } + } + return -1; + } else { + // Check across time domain (modulo II) + for (int t = loc.time_step % II; t < II * kMaxSteps; t += II) { + MappingLoc check_loc = {loc.resource, t}; + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end() && !it->second.empty()) { + // Return the first status found (most restrictive) + return it->second[0].first; 
+ } + } + return -1; } } @@ -202,12 +372,9 @@ void MappingState::reserveRoute(Operation *op, ArrayRef path) { op_to_locs[op] = std::vector(path.begin(), path.end()); for (const MappingLoc &loc : path) { - assert(occupied_locs.find(loc) == occupied_locs.end() && - "Mapping location already occupied"); loc_to_op[loc] = op; - assert(occupied_locs.find(loc) == occupied_locs.end() && - "Mapping location already occupied in occupied_locs"); - occupied_locs.insert(loc); + // Use SINGLE_OCCUPY for route reservations (links/registers) + occupied_locs[loc].push_back({SINGLE_OCCUPY, op}); } } @@ -221,7 +388,21 @@ void MappingState::releaseRoute(Operation *op) { for (const MappingLoc &loc : route) { loc_to_op.erase(loc); - occupied_locs.erase(loc); + // Remove entries for this op from occupied_locs + auto occ_it = occupied_locs.find(loc); + if (occ_it != occupied_locs.end()) { + auto &entries = occ_it->second; + entries.erase( + std::remove_if(entries.begin(), entries.end(), + [op](const std::pair &entry) { + return entry.second == op; + }), + entries.end()); + // Remove the location entirely if no more entries + if (entries.empty()) { + occupied_locs.erase(occ_it); + } + } } op_to_locs.erase(it); diff --git a/lib/NeuraDialect/Mapping/mapping_util.cpp b/lib/NeuraDialect/Mapping/mapping_util.cpp index 814c59a3..c007a9c3 100644 --- a/lib/NeuraDialect/Mapping/mapping_util.cpp +++ b/lib/NeuraDialect/Mapping/mapping_util.cpp @@ -4,6 +4,7 @@ #include "NeuraDialect/Mapping/mapping_util.h" #include "NeuraDialect/NeuraOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Operation.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -1115,13 +1116,37 @@ llvm::SmallVector mlir::neura::getCtrlMovUsers(Operation *op) { bool mlir::neura::placeAndRoute(Operation *op, const MappingLoc &target_loc, MappingState &mapping_state) { - if (mapping_state.bindOp(target_loc, op)) { + // Get the latency of the operation to determine 
if it's multi-cycle + int latency = getOpLatency(op); + bool is_multi_cycle = latency > 1; + + bool bind_success = false; + if (is_multi_cycle) { + // For multi-cycle ops, bind across multiple time steps with pipeline status + bind_success = mapping_state.bindMultiCycleOp( + target_loc.resource, target_loc.time_step, latency, op); + if (bind_success) { + llvm::errs() << "[DEBUG] Bound multi-cycle op (latency=" << latency + << ") " << *op << " onto loc: " + << target_loc.resource->getType() << "#" + << target_loc.resource->getId() + << " @t=" << target_loc.time_step << " to t=" + << (target_loc.time_step + latency - 1) << "\n"; + } + } else { + // For single-cycle ops, use default SINGLE_OCCUPY binding + bind_success = mapping_state.bindOp(target_loc, op); + if (bind_success) { + llvm::errs() << "[DEBUG] Schedule op " << *op + << " onto loc: " << target_loc.resource->getType() << "#" + << target_loc.resource->getId() + << " @t=" << target_loc.time_step << "\n"; + } + } + + if (bind_success) { std::vector routed_operands; std::vector routed_ctrl_movs; - llvm::errs() << "[DEBUG] Schedule op " << *op - << " onto loc: " << target_loc.resource->getType() << "#" - << target_loc.resource->getId() - << " @t=" << target_loc.time_step << "\n"; // Tries to route the data move operations. 
for (Value operand : op->getOperands()) { llvm::errs() << "Processing operand: " << operand << "\n"; @@ -1222,4 +1247,17 @@ bool mlir::neura::placeAndRoute(Operation *op, const MappingLoc &target_loc, return true; } return false; +} + +int mlir::neura::getOpLatency(Operation *op) { + // Try to get the latency attribute from the operation + if (auto latency_attr = op->getAttrOfType("latency")) { + return latency_attr.getInt(); + } + // Default to single-cycle if no latency attribute is present + return 1; +} + +bool mlir::neura::isMultiCycleOp(Operation *op) { + return getOpLatency(op) > 1; } \ No newline at end of file diff --git a/lib/NeuraDialect/Transforms/CMakeLists.txt b/lib/NeuraDialect/Transforms/CMakeLists.txt index 010fc3c7..ce23714a 100644 --- a/lib/NeuraDialect/Transforms/CMakeLists.txt +++ b/lib/NeuraDialect/Transforms/CMakeLists.txt @@ -18,6 +18,7 @@ add_mlir_library( TransformToSteerControlPass.cpp RemovePredicatedTypePass.cpp HardwareMergePass.cpp + InitExecLatencyPass.cpp GraphMining/HardwareTemplate.cpp DEPENDS diff --git a/lib/NeuraDialect/Transforms/GenerateCodePass.cpp b/lib/NeuraDialect/Transforms/GenerateCodePass.cpp index 3c2a7699..69a3d50c 100644 --- a/lib/NeuraDialect/Transforms/GenerateCodePass.cpp +++ b/lib/NeuraDialect/Transforms/GenerateCodePass.cpp @@ -85,6 +85,7 @@ static bool isCtrlMov(Operation *op) { return dyn_cast(op) != nullptr static bool isPhiStart(Operation *op) { return dyn_cast(op) != nullptr; } static bool isReserve(Operation *op) { return dyn_cast(op) != nullptr; } static bool isConstant(Operation *op) { return dyn_cast(op) != nullptr; } +static bool isFusedOp(Operation *op) { return dyn_cast(op) != nullptr; } // ---- Constant for phi_start operation ----. static constexpr unsigned kReserveOpIndex = 1; @@ -477,24 +478,29 @@ struct GenerateCodePass SmallVector &ctrl_movs, DenseMap &reserve_to_phi_map) { function.walk([&](Operation *op) { - // placement for every op (even for mov/reserve). 
+ // Skips operations inside fused_op regions. + if (op->getParentOp() && isFusedOp(op->getParentOp())) { + return; + } + + // Records placement for every op (even for mov/reserve). operation_placements[op] = getTileLocation(op); - // build reserve -> phi mapping. + // Builds reserve -> phi mapping for loop-carried dependencies. if (isPhiStart(op)) { if (Value reserve = getReserveOperand(op)) { reserve_to_phi_map[reserve] = op; } } - // collect forwarders. + // Collects forwarders for later expansion. if (isDataMov(op)) { data_movs.push_back(op); return; } if (isCtrlMov(op)) { ctrl_movs.push_back(op); return; } - // skip Reserve from materialization. + // Skips Reserve from materialization. if (isReserve(op)) return; - // materialize all other ops placed on tiles (compute/phi/const/etc.). + // Materializes all other ops placed on tiles (compute/phi/const/fused_op/etc.). TileLocation placement = operation_placements[op]; if (!placement.has_tile) return; @@ -824,6 +830,31 @@ struct GenerateCodePass const DenseMap &reserve2phi) { if (!validateForwarderShape(forwarder)) return; + // Checks if this data_mov/ctrl_mov has mapping_locs assigned by MapToAcceleratorPass. + auto mapping_locs = getMappingLocations(forwarder); + if (!mapping_locs || mapping_locs.empty()) { + // Skips this mov operation - it will be handled by its consumer or does not need routing. + // This is expected for data_mov that only feeds into ctrl_mov. + if constexpr (!IsCtrl) { + // For data_mov without mapping, verifies if it is only used by ctrl_mov. + bool only_ctrl_mov_users = true; + for (OpOperand &use : forwarder->getResult(0).getUses()) { + if (!isa(use.getOwner())) { + only_ctrl_mov_users = false; + break; + } + } + if (only_ctrl_mov_users) { + // This is expected - ctrl_mov handles this data transfer implicitly. + return; + } else { + // This data_mov has non-ctrl_mov users but no mapping - this is an error. 
+ forwarder->emitWarning("data_mov without mapping_locs has non-ctrl_mov users"); + } + } + return; + } + MovBasics basics = buildMovBasics(forwarder, topo); emitMovRoutingInstructions(forwarder, basics, topo); @@ -1022,6 +1053,10 @@ struct GenerateCodePass if (operation == func.getOperation()) return; // Skips function itself. if (isReserve(operation)) return; // Skips reserve nodes entirely (bypass later). if (isa(operation)) return; // Skips yield nodes entirely (bypass later). + // Skips operations inside fused_op regions - they are handled by hardware + if (operation->getParentOp() && isFusedOp(operation->getParentOp())) { + return; + } int dfg_id = getDfgId(operation); if (dfg_id < 0) { diff --git a/lib/NeuraDialect/Transforms/InitExecLatencyPass.cpp b/lib/NeuraDialect/Transforms/InitExecLatencyPass.cpp new file mode 100644 index 00000000..0a2e8ef2 --- /dev/null +++ b/lib/NeuraDialect/Transforms/InitExecLatencyPass.cpp @@ -0,0 +1,185 @@ +//===- InitExecLatencyPass.cpp - Initialize Execution Latency --------------===// +// +// This pass initializes execution latency information. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "NeuraDialect/Architecture/ArchitectureSpec.h" +#include "NeuraDialect/Architecture/Architecture.h" +#include "NeuraDialect/NeuraDialect.h" +#include "NeuraDialect/NeuraOps.h" +#include "NeuraDialect/NeuraPasses.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/YAMLParser.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace mlir; + +#define GEN_PASS_DEF_INITEXECLATENCY +#include "NeuraDialect/NeuraPasses.h.inc" + +namespace { + +// Helper function to parse YAML scalar to integer +static bool parseYamlScalarInt(const llvm::yaml::Node *node, int &result) { + auto *scalar = llvm::dyn_cast_or_null(node); + if (!scalar) + return false; + llvm::SmallString<64> value_string; + llvm::StringRef value_ref = scalar->getValue(value_string); + long long temp_value = 0; + if (value_ref.getAsInteger(10, temp_value)) + return false; + result = static_cast(temp_value); + return true; +} + +// Helper function to parse YAML scalar to string +static bool parseYamlScalarString(const llvm::yaml::Node *node, + std::string &result) { + auto *scalar = llvm::dyn_cast_or_null(node); + if (!scalar) + return false; + llvm::SmallString<64> value_string; + llvm::StringRef value_ref = scalar->getValue(value_string); + result = value_ref.str(); + return true; +} + +// Parse latency YAML file: expects a mapping of operation names to latency values +static bool parseLatencyYaml(const std::string &file_path, + std::map &latency_map) { + llvm::ErrorOr> buffer_or_err = + llvm::MemoryBuffer::getFile(file_path); + if (!buffer_or_err) { + llvm::errs() << "[InitExecLatencyPass] Failed to open latency specification file: " + << file_path << "\n"; + return false; + } + + llvm::SourceMgr sm; + sm.AddNewSourceBuffer(std::move(*buffer_or_err), 
llvm::SMLoc()); + llvm::yaml::Stream yaml_stream( + sm.getMemoryBuffer(sm.getMainFileID())->getBuffer(), sm); + + llvm::yaml::Document &yaml_doc = *yaml_stream.begin(); + if (yaml_stream.failed()) { + llvm::errs() << "[InitExecLatencyPass] YAML parse error in: " << file_path << "\n"; + return false; + } + + auto *root = yaml_doc.getRoot(); + if (!root) { + llvm::errs() << "[InitExecLatencyPass] Empty YAML document\n"; + return false; + } + + auto *root_map = llvm::dyn_cast(root); + if (!root_map) { + llvm::errs() << "[InitExecLatencyPass] YAML root is not a mapping\n"; + return false; + } + + for (auto &key_value_pair : *root_map) { + auto *key_node = + llvm::dyn_cast_or_null(key_value_pair.getKey()); + if (!key_node) + continue; + + std::string op_name; + if (!parseYamlScalarString(key_node, op_name)) + continue; + + int latency_value = 0; + if (!parseYamlScalarInt(key_value_pair.getValue(), latency_value)) + continue; + + latency_map[op_name] = latency_value; + } + + return true; +} + +void SetLatency(Operation *op, std::map &latency_map) { + // Get operation name and look up latency + std::string op_name = op->getName().getStringRef().str(); + if (op_name.compare("neura.fused_op") == 0) { + op_name = op->getAttrOfType("pattern_name").getValue().str(); + } + op_name = op_name.substr(op_name.find_last_of(".") + 1); // remove neura. 
prefix if exists + auto it = latency_map.find(op_name); + if (it != latency_map.end()) { + op->setAttr("latency", + IntegerAttr::get(IntegerType::get(op->getContext(), 32), it->second)); + } + else { + op->setAttr("latency", + IntegerAttr::get(IntegerType::get(op->getContext(), 32), 1)); + } +} + +struct InitExecLatencyPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(InitExecLatencyPass) + + InitExecLatencyPass() = default; + InitExecLatencyPass(const InitExecLatencyPass &pass) + : PassWrapper>(pass) {} + + StringRef getArgument() const override { return "init-exec-latency"; } + StringRef getDescription() const override { + return "Initialize execution latency information."; + } + + void runOnOperation() override { + + ModuleOp module_op = getOperation(); + llvm::errs() << "[InitExecLatencyPass] Running init-exec-latency pass\n"; + // Get latency spec file from global function (set by command line) + std::string latency_file = mlir::neura::getLatencySpecFile(); + if (latency_file.empty()) { + latency_file = "latency_map.yaml"; // default file name + } + + llvm::errs() << "[InitExecLatencyPass] Latency file: " << latency_file << "\n"; + // Builds a map of operation name to latency + std::map latency_map; + if (!parseLatencyYaml(latency_file, latency_map)) { + llvm::errs() << "[InitExecLatencyPass] Failed to parse latency specification file: " << latency_file << "\n"; + return; + } + + // Apply latency values to operations + module_op.walk([&](Operation *op) { + if (!op->getRegions().empty()) { + for (Region ®ion : op->getRegions()) { + region.walk([&](Operation *inner_op) { + // Skip operations inside fused_op regions + if (inner_op->getParentOp() && isa(inner_op->getParentOp())) { + return; + } + + if (inner_op->getName().getStringRef().str() == "neura.data_mov" || inner_op->getName().getStringRef().str() == "neura.reserve") { + return; + } + + SetLatency(inner_op, latency_map); + }); + } + } + }); + } +}; + +} // namespace + +namespace 
mlir::neura { +std::unique_ptr createInitExecLatencyPass() { + return std::make_unique(); +} +} // namespace mlir::neura diff --git a/lib/TaskflowDialect/CMakeLists.txt b/lib/TaskflowDialect/CMakeLists.txt index d8e5d7ff..49d60c57 100644 --- a/lib/TaskflowDialect/CMakeLists.txt +++ b/lib/TaskflowDialect/CMakeLists.txt @@ -13,4 +13,5 @@ add_mlir_dialect_library(MLIRTaskflow MLIRInferTypeOpInterface ) -add_subdirectory(Transforms) \ No newline at end of file +add_subdirectory(Transforms) +add_subdirectory(Transforms/Optimizations) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index 5dcb6736..60078298 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp ClassifyCountersPass.cpp MapTaskOnCgraPass.cpp + FuseTaskPass.cpp DEPENDS MLIRTaskflowTransformsIncGen @@ -14,8 +15,9 @@ add_mlir_library(MLIRTaskflowTransforms MLIRSupport MLIRTransforms MLIRTaskflow + MLIRNeura + MLIRNeuraTransforms + MLIRConversion ${dialect_libs} LLVMSupport -) - -add_subdirectory(Optimizations) \ No newline at end of file +) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/FuseTaskPass.cpp b/lib/TaskflowDialect/Transforms/FuseTaskPass.cpp new file mode 100644 index 00000000..d55a7540 --- /dev/null +++ b/lib/TaskflowDialect/Transforms/FuseTaskPass.cpp @@ -0,0 +1,1101 @@ +//===- FuseTaskPass.cpp - Task fusion for Taskflow dialect ----------------===// +// +// Fuses taskflow.task ops by merging counter chains and hyperblock bodies. +// Supports producer-consumer fusion (with intermediate store/load elimination) +// and sibling fusion (with shared input deduplication). 
+// +//===----------------------------------------------------------------------===// + +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowPasses.h" + +#include "NeuraDialect/NeuraOps.h" +#include "NeuraDialect/NeuraPasses.h" +#include "NeuraDialect/Architecture/Architecture.h" +#include "NeuraDialect/Mapping/mapping_util.h" +#include "Conversion/ConversionPasses.h" + +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Conversion/Passes.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Parser/Parser.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { + +//===----------------------------------------------------------------------===// +// Operand helpers +//===----------------------------------------------------------------------===// + +// Collects unique values from two ranges into a deduped vector with index map. 
+static void collectUnique(ValueRange a, ValueRange b, + SmallVectorImpl &result, + llvm::SmallDenseMap &idx_map) { + for (Value v : a) { idx_map[v] = result.size(); result.push_back(v); } + for (Value v : b) + if (!idx_map.count(v)) { idx_map[v] = result.size(); result.push_back(v); } +} + +//===----------------------------------------------------------------------===// +// Counter chain & hyperblock utilities +//===----------------------------------------------------------------------===// + +// Extracts all TaskflowCounterOps from a task body in definition order. +static SmallVector +extractCounterChain(TaskflowTaskOp task) { + SmallVector chain; + for (Operation &op : task.getBody().front()) + if (auto c = dyn_cast(&op)) + chain.push_back(c); + return chain; +} + +// Returns true if two counter chains have identical bounds and steps. +static bool counterChainsMatch(SmallVectorImpl &a, + SmallVectorImpl &b) { + if (a.size() != b.size()) return false; + for (unsigned i = 0; i < a.size(); ++i) { + auto get_int = [](Operation *op, StringRef name) -> int64_t { + return op->getAttrOfType(name).getInt(); + }; + if (get_int(a[i], "lower_bound") != get_int(b[i], "lower_bound") || + get_int(a[i], "upper_bound") != get_int(b[i], "upper_bound") || + get_int(a[i], "step") != get_int(b[i], "step")) + return false; + } + return true; +} + +// Finds the single TaskflowHyperblockOp in a task body. Returns nullptr +// if none or multiple exist. +static TaskflowHyperblockOp findHyperblock(TaskflowTaskOp task) { + TaskflowHyperblockOp result = nullptr; + for (Operation &op : task.getBody().front()) { + if (auto hb = dyn_cast(&op)) { + if (result) return nullptr; // multiple hyperblocks + result = hb; + } + } + return result; +} + +// Finds the single scf::ForOp in a hyperblock body. Returns nullptr +// if none or multiple exist. 
+static scf::ForOp findInnerForOp(TaskflowHyperblockOp hb) { + scf::ForOp result = nullptr; + for (auto &op : hb.getBody().front()) { + if (auto f = dyn_cast(&op)) { + if (result) return nullptr; + result = f; + } + } + return result; +} + +// Returns true if two scf::ForOp loops have identical constant bounds +// and step. +static bool scfForBoundsMatch(scf::ForOp a, scf::ForOp b) { + auto get_const = [](Value v) -> std::optional { + if (auto cst = v.getDefiningOp()) + return cst.value(); + return std::nullopt; + }; + auto al = get_const(a.getLowerBound()), bl = get_const(b.getLowerBound()); + auto au = get_const(a.getUpperBound()), bu = get_const(b.getUpperBound()); + auto as = get_const(a.getStep()), bs = get_const(b.getStep()); + return al && bl && au && bu && as && bs && + *al == *bl && *au == *bu && *as == *bs; +} + +//===----------------------------------------------------------------------===// +// Legality checks +//===----------------------------------------------------------------------===// + +// Returns true if task1 dominates task2 in the same block. +static bool canFuseTasks(TaskflowTaskOp t1, TaskflowTaskOp t2) { + return t1 && t2 && t1 != t2 && + t1->getBlock() == t2->getBlock() && + t1->isBeforeInBlock(t2); +} + +// Returns true if any write output of the producer feeds the consumer. +static bool hasProducerConsumerRelation(TaskflowTaskOp producer, + TaskflowTaskOp consumer) { + for (Value r : producer.getWriteOutputs()) { + for (Value in : consumer.getReadMemrefs()) if (r == in) return true; + for (Value in : consumer.getWriteMemrefs()) if (r == in) return true; + } + for (Value r : producer.getValueOutputs()) + for (Value in : consumer.getValueInputs()) if (r == in) return true; + return false; +} + +// Returns true if tasks share at least one input with no data dependency. 
+static bool areSiblingTasks(TaskflowTaskOp t1, TaskflowTaskOp t2) { + llvm::SmallPtrSet t1_mem; + for (Value v : t1.getReadMemrefs()) t1_mem.insert(v); + for (Value v : t1.getWriteMemrefs()) t1_mem.insert(v); + llvm::SmallPtrSet t1_val(t1.getValueInputs().begin(), + t1.getValueInputs().end()); + bool share = false; + for (Value v : t2.getReadMemrefs()) if (t1_mem.count(v)) { share = true; break; } + if (!share) + for (Value v : t2.getWriteMemrefs()) if (t1_mem.count(v)) { share = true; break; } + if (!share) + for (Value v : t2.getValueInputs()) if (t1_val.count(v)) { share = true; break; } + return share && !hasProducerConsumerRelation(t1, t2) && + !hasProducerConsumerRelation(t2, t1); +} + +// Returns true if any op between producer and consumer uses producer's results. +static bool hasInterveningUses(TaskflowTaskOp producer, + TaskflowTaskOp consumer) { + llvm::SmallPtrSet results; + for (Value v : producer.getWriteOutputs()) results.insert(v); + for (Value v : producer.getValueOutputs()) results.insert(v); + bool in_range = false; + for (Operation &op : *producer->getBlock()) { + if (&op == producer.getOperation()) { in_range = true; continue; } + if (&op == consumer.getOperation()) break; + if (in_range) + for (Value v : op.getOperands()) + if (results.count(v)) return true; + } + return false; +} + +// Returns true if all outputs of the task have at most one use. +static bool hasOnlySingleUseOutputs(TaskflowTaskOp task) { + for (Value r : task.getWriteOutputs()) + if (!r.hasOneUse() && !r.use_empty()) return false; + for (Value r : task.getValueOutputs()) + if (!r.hasOneUse() && !r.use_empty()) return false; + return true; +} + +// Clones non-structural ops from a task body (skips counters, hyperblocks, +// and yields). 
+static void cloneTaskBodyMiscOps(Block &body, OpBuilder &builder, + IRMapping &mapping) { + for (Operation &op : body) { + if (isa(&op)) + continue; + builder.clone(op, mapping); + } +} + +// Returns true if two tasks have compatible loop structures for merging. +// Handles both counter-chain and scf.for-inside-hyperblock cases. +static bool haveMatchingLoopStructure(TaskflowTaskOp t1, TaskflowTaskOp t2) { + auto hb1 = findHyperblock(t1), hb2 = findHyperblock(t2); + if (!hb1 || !hb2) return false; + if (hb1.getIterArgs().size() > 0 || hb2.getIterArgs().size() > 0) + return false; + + auto c1 = extractCounterChain(t1), c2 = extractCounterChain(t2); + if (!c1.empty() && !c2.empty()) + return counterChainsMatch(c1, c2); + + // No counter chains: compares scf.for loops in hyperblock bodies. + if (c1.empty() && c2.empty()) { + auto f1 = findInnerForOp(hb1), f2 = findInnerForOp(hb2); + if (f1 && f2) return scfForBoundsMatch(f1, f2); + // Both have no loops: trivially compatible. + if (!f1 && !f2) return true; + } + return false; +} + +//===----------------------------------------------------------------------===// +// Producer-consumer fusion +//===----------------------------------------------------------------------===// + +// Fuses a producer into its consumer by merging their counter chains and +// hyperblock bodies into a single task with direct SSA value forwarding +// for the intermediate memref. 
+static TaskflowTaskOp +fuseProducerConsumerTasks(TaskflowTaskOp producer, TaskflowTaskOp consumer, + Value intermediate_memref, OpBuilder &builder) { + Location loc = consumer.getLoc(); + auto prod_read = producer.getReadMemrefs(); + auto prod_write = producer.getWriteMemrefs(); + auto prod_val = producer.getValueInputs(); + auto cons_read = consumer.getReadMemrefs(); + auto cons_write = consumer.getWriteMemrefs(); + auto cons_val = consumer.getValueInputs(); + + // Collects fused inputs, keeping intermediate in write_memrefs for + // block arg availability but excluding it from consumer's read side. + SmallVector fused_read, fused_write, fused_val; + for (Value v : prod_read) fused_read.push_back(v); + for (Value v : cons_read) + if (v != intermediate_memref) fused_read.push_back(v); + for (Value v : prod_write) fused_write.push_back(v); + for (Value v : cons_write) + if (v != intermediate_memref) fused_write.push_back(v); + for (Value v : prod_val) fused_val.push_back(v); + for (Value v : cons_val) fused_val.push_back(v); + + // Output types come from the consumer only. 
+ SmallVector write_out_types(consumer.getWriteOutputs().getTypes()); + SmallVector val_out_types(consumer.getValueOutputs().getTypes()); + + SmallVector orig_reads, orig_writes; + for (Value v : producer.getOriginalReadMemrefs()) orig_reads.push_back(v); + for (Value v : consumer.getOriginalReadMemrefs()) orig_reads.push_back(v); + for (Value v : producer.getOriginalWriteMemrefs()) orig_writes.push_back(v); + for (Value v : consumer.getOriginalWriteMemrefs()) orig_writes.push_back(v); + + auto fused = builder.create( + loc, write_out_types, val_out_types, + fused_read, fused_write, fused_val, + builder.getStringAttr("fused_pc"), + orig_reads, orig_writes); + + Block *body = new Block(); + fused.getBody().push_back(body); + for (Value v : fused_read) body->addArgument(v.getType(), loc); + for (Value v : fused_write) body->addArgument(v.getType(), loc); + for (Value v : fused_val) body->addArgument(v.getType(), loc); + + unsigned fused_read_n = fused_read.size(); + unsigned fused_write_n = fused_write.size(); + + // --- Maps producer block args to fused block args --- + IRMapping mapping; + Block &prod_body = producer.getBody().front(); + for (unsigned i = 0; i < prod_read.size(); ++i) + mapping.map(prod_body.getArgument(i), body->getArgument(i)); + for (unsigned i = 0; i < prod_write.size(); ++i) + mapping.map(prod_body.getArgument(prod_read.size() + i), + body->getArgument(fused_read_n + i)); + for (unsigned i = 0; i < prod_val.size(); ++i) + mapping.map(prod_body.getArgument(prod_read.size() + prod_write.size() + i), + body->getArgument(fused_read_n + fused_write_n + i)); + + // Identifies the producer's task-body block arg for the intermediate write + // memref. The intermediate_memref is the producer's write OUTPUT result; + // finds which write output index it corresponds to, then gets the matching + // write memref block arg. 
+ BlockArgument prod_intermediate_arg = nullptr; + for (unsigned i = 0; i < producer.getWriteOutputs().size(); ++i) + if (producer.getWriteOutputs()[i] == intermediate_memref) + prod_intermediate_arg = prod_body.getArgument(prod_read.size() + i); + + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(body); + + // --- Maps consumer block args to fused block args --- + Block &cons_body = consumer.getBody().front(); + unsigned cons_read_fused = prod_read.size(); + for (unsigned i = 0; i < cons_read.size(); ++i) { + if (cons_read[i] == intermediate_memref) { + if (prod_intermediate_arg) + mapping.map(cons_body.getArgument(i), + mapping.lookupOrDefault(prod_intermediate_arg)); + } else { + mapping.map(cons_body.getArgument(i), + body->getArgument(cons_read_fused++)); + } + } + unsigned cons_write_fused = fused_read_n + prod_write.size(); + for (unsigned i = 0; i < cons_write.size(); ++i) { + if (cons_write[i] == intermediate_memref) { + if (prod_intermediate_arg) + mapping.map(cons_body.getArgument(cons_read.size() + i), + mapping.lookupOrDefault(prod_intermediate_arg)); + } else { + mapping.map(cons_body.getArgument(cons_read.size() + i), + body->getArgument(cons_write_fused++)); + } + } + unsigned cons_val_fused = fused_read_n + fused_write_n + prod_val.size(); + for (unsigned i = 0; i < cons_val.size(); ++i) + mapping.map(cons_body.getArgument(cons_read.size() + cons_write.size() + i), + body->getArgument(cons_val_fused + i)); + + // Clones counter chain from producer. + auto prod_counters = extractCounterChain(producer); + for (auto ctr : prod_counters) builder.clone(*ctr, mapping); + + // Clones non-counter, non-hyperblock, non-yield ops from both task bodies + // (e.g. arith.constant for loop bounds). + cloneTaskBodyMiscOps(prod_body, builder, mapping); + cloneTaskBodyMiscOps(cons_body, builder, mapping); + + // Locates both hyperblocks. 
+ auto prod_hb = findHyperblock(producer); + auto cons_hb = findHyperblock(consumer); + Block &prod_hb_body = prod_hb.getBody().front(); + Block &cons_hb_body = cons_hb.getBody().front(); + + // Creates fused hyperblock. + SmallVector triggers; + for (Value v : prod_hb.getIndices()) + triggers.push_back(mapping.lookupOrDefault(v)); + auto fused_hb = builder.create( + loc, TypeRange{}, triggers, ValueRange{}); + Block *hb_body = &fused_hb.getBody().emplaceBlock(); + for (auto arg : prod_hb_body.getArguments()) + hb_body->addArgument(arg.getType(), loc); + + // Maps hyperblock block args. + for (unsigned i = 0; i < prod_hb_body.getNumArguments(); ++i) + mapping.map(prod_hb_body.getArgument(i), hb_body->getArgument(i)); + for (unsigned i = 0; i < cons_hb_body.getNumArguments(); ++i) + mapping.map(cons_hb_body.getArgument(i), hb_body->getArgument(i)); + + // Identifies the consumer's intermediate read block arg for load skipping. + BlockArgument cons_intermediate_arg = nullptr; + for (unsigned i = 0; i < cons_read.size(); ++i) + if (cons_read[i] == intermediate_memref) + cons_intermediate_arg = cons_body.getArgument(i); + + { + OpBuilder::InsertionGuard hb_guard(builder); + builder.setInsertionPointToStart(hb_body); + + auto prod_for = findInnerForOp(prod_hb); + auto cons_for = findInnerForOp(cons_hb); + + if (prod_for && cons_for) { + // --- SCF loop fusion: merges for-body ops into a single scf.for --- + + // Clones non-for, non-yield ops from both hyperblock bodies. + for (Operation &op : prod_hb_body) { + if (isa(&op)) continue; + builder.clone(op, mapping); + } + for (Operation &op : cons_hb_body) { + if (isa(&op)) continue; + builder.clone(op, mapping); + } + + // Creates merged scf.for with producer's bounds. 
+ Value lb = mapping.lookupOrDefault(prod_for.getLowerBound()); + Value ub = mapping.lookupOrDefault(prod_for.getUpperBound()); + Value step = mapping.lookupOrDefault(prod_for.getStep()); + auto merged_for = builder.create(loc, lb, ub, step); + mapping.map(prod_for.getInductionVar(), merged_for.getInductionVar()); + mapping.map(cons_for.getInductionVar(), merged_for.getInductionVar()); + + OpBuilder::InsertionGuard for_guard(builder); + builder.setInsertionPoint(merged_for.getBody()->getTerminator()); + + // Clones producer for-body; skips store to intermediate, records + // the forwarded value. + Value forwarded = nullptr; + for (Operation &op : *prod_for.getBody()) { + if (isa(&op)) continue; + if (auto store = dyn_cast(&op)) { + if (prod_intermediate_arg && + store.getMemRef() == prod_intermediate_arg) { + forwarded = mapping.lookupOrDefault(store.getValueToStore()); + continue; + } + } + builder.clone(op, mapping); + } + + // Clones consumer for-body; replaces loads from intermediate + // with the forwarded value. 
+ for (Operation &op : *cons_for.getBody()) { + if (isa(&op)) continue; + if (auto load = dyn_cast(&op)) { + if (cons_intermediate_arg && forwarded && + load.getMemRef() == cons_intermediate_arg) { + mapping.map(load.getResult(), forwarded); + continue; + } + } + builder.clone(op, mapping); + } + + } else { + // --- Counter-chain path: clones hyperblock bodies sequentially --- + Value forwarded = nullptr; + for (Operation &op : prod_hb_body) { + if (isa(&op)) continue; + if (auto store = dyn_cast(&op)) { + if (prod_intermediate_arg && + store.getMemRef() == prod_intermediate_arg) { + forwarded = mapping.lookupOrDefault(store.getValueToStore()); + continue; + } + } + builder.clone(op, mapping); + } + for (Operation &op : cons_hb_body) { + if (isa(&op)) continue; + if (auto load = dyn_cast(&op)) { + if (cons_intermediate_arg && forwarded && + load.getMemRef() == cons_intermediate_arg) { + mapping.map(load.getResult(), forwarded); + continue; + } + } + builder.clone(op, mapping); + } + } + + builder.create(loc); + } + + // Creates the task yield from consumer's yield operands. + auto cons_yield = cast(consumer.getBody().front().getTerminator()); + SmallVector yield_mem, yield_val; + for (Value v : cons_yield.getMemoryResults()) + yield_mem.push_back(mapping.lookupOrDefault(v)); + for (Value v : cons_yield.getValueResults()) + yield_val.push_back(mapping.lookupOrDefault(v)); + builder.create(loc, yield_mem, yield_val); + return fused; +} + +//===----------------------------------------------------------------------===// +// Sibling fusion +//===----------------------------------------------------------------------===// + +// Fuses two sibling tasks by merging their counter chains and hyperblock +// bodies into a single task with deduplicated inputs. +static TaskflowTaskOp fuseSiblingTasks(TaskflowTaskOp t1, TaskflowTaskOp t2, + OpBuilder &builder) { + Location loc = t1.getLoc(); + + // Deduplicates inputs across both tasks. 
+ SmallVector fused_read, fused_write, fused_val; + llvm::SmallDenseMap read_idx, write_idx, val_idx; + collectUnique(t1.getReadMemrefs(), t2.getReadMemrefs(), fused_read, read_idx); + collectUnique(t1.getWriteMemrefs(), t2.getWriteMemrefs(), fused_write, write_idx); + collectUnique(t1.getValueInputs(), t2.getValueInputs(), fused_val, val_idx); + + // Combined output types. + SmallVector write_out_types, val_out_types; + write_out_types.append(t1.getWriteOutputs().getTypes().begin(), + t1.getWriteOutputs().getTypes().end()); + write_out_types.append(t2.getWriteOutputs().getTypes().begin(), + t2.getWriteOutputs().getTypes().end()); + val_out_types.append(t1.getValueOutputs().getTypes().begin(), + t1.getValueOutputs().getTypes().end()); + val_out_types.append(t2.getValueOutputs().getTypes().begin(), + t2.getValueOutputs().getTypes().end()); + + SmallVector orig_reads, orig_writes; + for (Value v : t1.getOriginalReadMemrefs()) orig_reads.push_back(v); + for (Value v : t2.getOriginalReadMemrefs()) orig_reads.push_back(v); + for (Value v : t1.getOriginalWriteMemrefs()) orig_writes.push_back(v); + for (Value v : t2.getOriginalWriteMemrefs()) orig_writes.push_back(v); + + auto fused = builder.create( + loc, write_out_types, val_out_types, + fused_read, fused_write, fused_val, + builder.getStringAttr("fused_sibling"), + orig_reads, orig_writes); + + Block *body = new Block(); + fused.getBody().push_back(body); + for (Value v : fused_read) body->addArgument(v.getType(), loc); + for (Value v : fused_write) body->addArgument(v.getType(), loc); + for (Value v : fused_val) body->addArgument(v.getType(), loc); + + unsigned rn = fused_read.size(), wn = fused_write.size(); + + // Lambda to map a task's block args to fused block args via index maps. 
+ auto mapBlockArgs = [&](TaskflowTaskOp task, IRMapping &m) { + Block &tb = task.getBody().front(); + auto r = task.getReadMemrefs(); + auto w = task.getWriteMemrefs(); + auto v = task.getValueInputs(); + for (unsigned i = 0; i < r.size(); ++i) + m.map(tb.getArgument(i), body->getArgument(read_idx[r[i]])); + for (unsigned i = 0; i < w.size(); ++i) + m.map(tb.getArgument(r.size() + i), + body->getArgument(rn + write_idx[w[i]])); + for (unsigned i = 0; i < v.size(); ++i) + m.map(tb.getArgument(r.size() + w.size() + i), + body->getArgument(rn + wn + val_idx[v[i]])); + }; + + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(body); + + // Clones counter chain from t1. + IRMapping mapping; + mapBlockArgs(t1, mapping); + auto counters = extractCounterChain(t1); + for (auto ctr : counters) builder.clone(*ctr, mapping); + + // Clones non-counter, non-hyperblock, non-yield ops from both task bodies. + Block &t1_body = t1.getBody().front(); + Block &t2_body = t2.getBody().front(); + IRMapping mapping2; + mapBlockArgs(t2, mapping2); + cloneTaskBodyMiscOps(t1_body, builder, mapping); + cloneTaskBodyMiscOps(t2_body, builder, mapping2); + + // Locates both hyperblocks. + auto hb1 = findHyperblock(t1), hb2 = findHyperblock(t2); + Block &hb1_body = hb1.getBody().front(); + Block &hb2_body = hb2.getBody().front(); + + // Creates fused hyperblock. + SmallVector triggers; + for (Value v : hb1.getIndices()) + triggers.push_back(mapping.lookupOrDefault(v)); + auto fused_hb = builder.create( + loc, TypeRange{}, triggers, ValueRange{}); + Block *hb_body = &fused_hb.getBody().emplaceBlock(); + for (auto arg : hb1_body.getArguments()) + hb_body->addArgument(arg.getType(), loc); + + // Maps hyperblock block args. 
+ for (unsigned i = 0; i < hb1_body.getNumArguments(); ++i) + mapping.map(hb1_body.getArgument(i), hb_body->getArgument(i)); + for (unsigned i = 0; i < hb2_body.getNumArguments(); ++i) + mapping2.map(hb2_body.getArgument(i), hb_body->getArgument(i)); + + { + OpBuilder::InsertionGuard hb_guard(builder); + builder.setInsertionPointToStart(hb_body); + + auto for1 = findInnerForOp(hb1), for2 = findInnerForOp(hb2); + + if (for1 && for2) { + // --- SCF loop fusion: merges for-body ops into a single scf.for --- + + // Clones non-for, non-yield ops from both hyperblock bodies. + for (Operation &op : hb1_body) { + if (isa(&op)) continue; + builder.clone(op, mapping); + } + for (Operation &op : hb2_body) { + if (isa(&op)) continue; + builder.clone(op, mapping2); + } + + // Creates merged scf.for with t1's bounds. + Value lb = mapping.lookupOrDefault(for1.getLowerBound()); + Value ub = mapping.lookupOrDefault(for1.getUpperBound()); + Value step = mapping.lookupOrDefault(for1.getStep()); + auto merged_for = builder.create(loc, lb, ub, step); + mapping.map(for1.getInductionVar(), merged_for.getInductionVar()); + mapping2.map(for2.getInductionVar(), merged_for.getInductionVar()); + + OpBuilder::InsertionGuard for_guard(builder); + builder.setInsertionPoint(merged_for.getBody()->getTerminator()); + + // Clones t1 for-body. + for (Operation &op : *for1.getBody()) { + if (isa(&op)) continue; + builder.clone(op, mapping); + } + // Clones t2 for-body. + for (Operation &op : *for2.getBody()) { + if (isa(&op)) continue; + builder.clone(op, mapping2); + } + + } else { + // --- Counter-chain path: clones hyperblock bodies sequentially --- + for (Operation &op : hb1_body) { + if (isa(&op)) continue; + builder.clone(op, mapping); + } + for (Operation &op : hb2_body) { + if (isa(&op)) continue; + builder.clone(op, mapping2); + } + } + + builder.create(loc); + } + + // Creates combined task yield. 
+ auto t1_yield = cast(t1.getBody().front().getTerminator()); + auto t2_yield = cast(t2.getBody().front().getTerminator()); + SmallVector all_mem, all_val; + for (Value v : t1_yield.getMemoryResults()) + all_mem.push_back(mapping.lookupOrDefault(v)); + for (Value v : t1_yield.getValueResults()) + all_val.push_back(mapping.lookupOrDefault(v)); + for (Value v : t2_yield.getMemoryResults()) + all_mem.push_back(mapping2.lookupOrDefault(v)); + for (Value v : t2_yield.getValueResults()) + all_val.push_back(mapping2.lookupOrDefault(v)); + builder.create(loc, all_mem, all_val); + return fused; +} + +//===----------------------------------------------------------------------===// +// Profitability analysis +//===----------------------------------------------------------------------===// + +// Represents metrics for evaluating fusion profitability. +struct FusionMetrics { + int rec_mii = 1; + int res_mii = 1; + int max_fanout = 0; + int num_ops = 0; +}; + +// Calculates the maximum fanout across all ops in a region. +static int calculateMaxFanoutInRegion(Region ®ion) { + int max_fanout = 0; + region.walk([&](Operation *op) { + for (Value result : op->getResults()) { + int fanout = std::distance(result.use_begin(), result.use_end()); + max_fanout = std::max(max_fanout, fanout); + } + }); + return max_fanout; +} + +// Runs the taskflow-to-neura pipeline on a cloned module and computes +// MII metrics on the resulting "test_fused_kernel" function. +static FusionMetrics computeRealMetrics( + ModuleOp test_module, const neura::Architecture &architecture) { + FusionMetrics metrics; + auto cloned = test_module.clone(); + + // Phase 1: converts taskflow ops to neura kernels. 
+ { + PassManager pm(cloned.getContext()); + pm.addPass(taskflow::createClassifyCountersPass()); + pm.addPass(createConvertTaskflowToNeuraPass()); + pm.enableVerifier(false); + if (failed(pm.run(cloned))) { + metrics.rec_mii = 100; + metrics.res_mii = 100; + cloned.erase(); + return metrics; + } + } + + // Converts func.return to neura.return for accelerator-marked functions + // so that the subsequent neura passes can process them correctly. + cloned.walk([&](func::ReturnOp ret) { + auto func_op = ret->getParentOfType(); + if (!func_op || !func_op->getAttrOfType("accelerator")) { + return; + } + OpBuilder builder(ret); + auto neura_ret = builder.create(ret.getLoc(), ret.getOperands()); + if (ret.getNumOperands() > 0) { + neura_ret->setAttr("return_type", builder.getStringAttr("value")); + } else { + neura_ret->setAttr("return_type", builder.getStringAttr("void")); + } + ret.erase(); + }); + + // Phase 2: runs the neura compilation pipeline. + { + PassManager pm(cloned.getContext()); + pm.addPass(neura::createAssignAcceleratorPass()); + pm.addPass(createLowerArithToNeuraPass()); + pm.addPass(createLowerMemRefToNeuraPass()); + pm.addPass(neura::createCanonicalizeReturnPass()); + pm.addPass(neura::createCanonicalizeCastPass()); + pm.addPass(neura::createPromoteInputArgToConstPass()); + pm.addPass(neura::createCanonicalizeLiveInPass()); + pm.addPass(neura::createLeveragePredicatedValuePass()); + pm.addPass(neura::createTransformCtrlToDataFlowPass()); + pm.enableVerifier(false); + if (failed(pm.run(cloned))) { + metrics.rec_mii = 100; + metrics.res_mii = 100; + cloned.erase(); + return metrics; + } + } + + cloned.walk([&](func::FuncOp func_op) { + if (func_op.getName() != "test_fused_kernel") { + return; + } + metrics.res_mii = neura::calculateResMii(func_op.getBody(), architecture); + auto cycles = neura::collectRecurrenceCycles(func_op.getBody()); + metrics.rec_mii = 1; + for (const auto &cycle : cycles) { + metrics.rec_mii = std::max(metrics.rec_mii, cycle.length); 
+ } + int num_ops = 0; + func_op.walk([&](Operation *op) { + if (!isa(op) && !op->hasTrait()) { + ++num_ops; + } + }); + metrics.num_ops = num_ops; + if (!func_op.getBody().empty()) { + metrics.max_fanout = calculateMaxFanoutInRegion(func_op.getBody()); + } + }); + + cloned.erase(); + return metrics; +} + +// Creates a test module containing a single task wrapped in a function. +static ModuleOp createTestModuleForTask(TaskflowTaskOp task) { + MLIRContext *ctx = task->getContext(); + OpBuilder builder(ctx); + Location loc = builder.getUnknownLoc(); + auto module = ModuleOp::create(loc); + builder.setInsertionPointToStart(module.getBody()); + + // Collects unique operand values from the task. + llvm::SetVector unique_operands; + for (Value v : task->getOperands()) { + unique_operands.insert(v); + } + + SmallVector input_types; + for (Value v : unique_operands) { + input_types.push_back(v.getType()); + } + SmallVector output_types; + for (Value v : task->getResults()) { + output_types.push_back(v.getType()); + } + if (input_types.empty()) { + input_types.push_back(builder.getI64Type()); + } + if (output_types.empty()) { + output_types.push_back(builder.getI64Type()); + } + + auto func_type = builder.getFunctionType(input_types, output_types); + auto func_op = builder.create(loc, "test_fused_kernel", func_type); + func_op->setAttr("accelerator", builder.getStringAttr("neura")); + + Block *entry = func_op.addEntryBlock(); + builder.setInsertionPointToStart(entry); + + IRMapping mapping; + for (auto [i, v] : llvm::enumerate(unique_operands)) { + mapping.map(v, entry->getArgument(i)); + } + + auto *cloned = builder.clone(*task.getOperation(), mapping); + auto cloned_task = cast(cloned); + + SmallVector ret; + for (Value v : cloned_task->getResults()) { + ret.push_back(v); + } + if (ret.empty()) { + ret.push_back(entry->getArgument(0)); + } + builder.create(loc, ret); + return module; +} + +// Computes metrics for a single task by running the neura pipeline. 
+static FusionMetrics computeSingleTaskMetrics( + TaskflowTaskOp task, const neura::Architecture &architecture) { + auto module = createTestModuleForTask(task); + FusionMetrics metrics = computeRealMetrics(module, architecture); + module.erase(); + return metrics; +} + +// Creates a test module with two tasks fused together. +static ModuleOp createFusedTestModule( + TaskflowTaskOp task1, TaskflowTaskOp task2, + bool is_producer_consumer, Value intermediate_memref) { + MLIRContext *ctx = task1->getContext(); + OpBuilder builder(ctx); + Location loc = builder.getUnknownLoc(); + auto module = ModuleOp::create(loc); + builder.setInsertionPointToStart(module.getBody()); + + // Collects unique external values (excludes task1's results used by task2). + llvm::SetVector unique_vals; + for (Value v : task1->getOperands()) { + unique_vals.insert(v); + } + for (Value v : task2->getOperands()) { + unique_vals.insert(v); + } + for (Value v : task1->getResults()) { + unique_vals.remove(v); + } + + SmallVector input_types; + for (Value v : unique_vals) { + input_types.push_back(v.getType()); + } + if (input_types.empty()) { + input_types.push_back(builder.getI64Type()); + } + + // Placeholder output type; the fused task's results drive the actual return. + SmallVector output_types; + output_types.push_back(builder.getI64Type()); + + auto func_type = builder.getFunctionType(input_types, output_types); + auto func_op = builder.create(loc, "test_fused_kernel", func_type); + func_op->setAttr("accelerator", builder.getStringAttr("neura")); + + Block *entry = func_op.addEntryBlock(); + builder.setInsertionPointToStart(entry); + + IRMapping mapping; + for (auto [i, v] : llvm::enumerate(unique_vals)) { + mapping.map(v, entry->getArgument(i)); + } + + // Clones task1. + auto *ct1_op = builder.clone(*task1.getOperation(), mapping); + auto ct1 = cast(ct1_op); + + // Maps task1's results so task2's operands resolve correctly. 
+ for (auto [orig, cloned] : llvm::zip(task1->getResults(), ct1->getResults())) { + mapping.map(orig, cloned); + } + + // Clones task2. + auto *ct2_op = builder.clone(*task2.getOperation(), mapping); + auto ct2 = cast(ct2_op); + + // Resolves the cloned intermediate memref. + Value cloned_intermediate; + if (is_producer_consumer && intermediate_memref) { + cloned_intermediate = mapping.lookupOrDefault(intermediate_memref); + } + + // Fuses the cloned tasks. + TaskflowTaskOp fused; + if (is_producer_consumer) { + fused = fuseProducerConsumerTasks(ct1, ct2, cloned_intermediate, builder); + } else { + fused = fuseSiblingTasks(ct1, ct2, builder); + } + + // Fixes the function return type and creates a return op. + SmallVector ret; + for (Value v : fused->getResults()) { + ret.push_back(v); + } + if (ret.empty()) { + ret.push_back(entry->getArgument(0)); + } + + // Removes placeholder return type and rebuilds with actual types. + SmallVector real_output_types; + for (Value v : ret) { + real_output_types.push_back(v.getType()); + } + func_op.setFunctionType(builder.getFunctionType(input_types, real_output_types)); + builder.create(loc, ret); + + // Erases un-fused clones (consumer first to drop uses of producer results). + ct2->erase(); + ct1->erase(); + return module; +} + +// Computes metrics for the fused version of two tasks. +static FusionMetrics computeFusedTaskMetrics( + TaskflowTaskOp task1, TaskflowTaskOp task2, + bool is_producer_consumer, Value intermediate_memref, + const neura::Architecture &architecture) { + auto module = createFusedTestModule(task1, task2, is_producer_consumer, intermediate_memref); + FusionMetrics metrics = computeRealMetrics(module, architecture); + module.erase(); + return metrics; +} + +// Estimates the effective MII considering utilization and fanout penalties. 
+static int estimateMII(const FusionMetrics &metrics, int total_tiles) {
+  // Weights for the utilization and fanout penalty terms.
+  const float alpha = 0.5f;
+  const float beta = 0.5f;
+  int mii = std::max(metrics.rec_mii, metrics.res_mii);
+  // NOTE(review): <float>/<int> template args reconstructed (angle brackets
+  // were stripped in transit); the float cast is required so the op/tile
+  // ratio is not truncated by integer division.
+  float utilization_factor = 1.0f + alpha * (metrics.num_ops / static_cast<float>(total_tiles));
+  // Fanout above 4 is penalized linearly; lower fanout adds no penalty.
+  float fanout_factor = 1.0f + beta * std::max(metrics.max_fanout - 4, 0);
+  return static_cast<int>(std::ceil(utilization_factor * fanout_factor * mii));
+}
+
+// Checks if fusion is profitable by comparing estimated MII of the fused
+// task against the individual tasks. For producer-consumer fusion, the
+// unfused MII is the sum (sequential execution); for sibling fusion, it
+// is the max (independent execution).
+static bool isFusionProfitable(TaskflowTaskOp task1, TaskflowTaskOp task2,
+                               bool is_producer_consumer,
+                               Value intermediate = nullptr) {
+  // NOTE(review): a 1x1 architecture is used for the profitability model --
+  // confirm this default is intended rather than the real target size.
+  neura::Architecture architecture(1, 1);
+  int total_tiles = architecture.getNumTiles();
+
+  FusionMetrics m1 = computeSingleTaskMetrics(task1, architecture);
+  FusionMetrics m2 = computeSingleTaskMetrics(task2, architecture);
+  FusionMetrics fused = computeFusedTaskMetrics(task1, task2, is_producer_consumer, intermediate, architecture);
+
+  int mii_1 = estimateMII(m1, total_tiles);
+  int mii_2 = estimateMII(m2, total_tiles);
+  int mii_fused = estimateMII(fused, total_tiles);
+
+  // Uses raw MII (max of recurrence and resource MII) for the core
+  // comparison, since the estimateMII utilization penalty is additive
+  // and already reflected in res_mii.
+  int raw_1 = std::max(m1.rec_mii, m1.res_mii);
+  int raw_2 = std::max(m2.rec_mii, m2.res_mii);
+  int raw_fused = std::max(fused.rec_mii, fused.res_mii);
+  int unfused_mii = is_producer_consumer ?
+      (raw_1 + raw_2) : std::max(raw_1, raw_2);
+  bool profitable = raw_fused <= unfused_mii;
+
+  llvm::errs() << "[fuse-task] Profitability:"
+               << " m1(rec=" << m1.rec_mii << " res=" << m1.res_mii << " ops=" << m1.num_ops << " fan=" << m1.max_fanout << " mii=" << mii_1 << ")"
+               << " m2(rec=" << m2.rec_mii << " res=" << m2.res_mii << " ops=" << m2.num_ops << " fan=" << m2.max_fanout << " mii=" << mii_2 << ")"
+               << " fused(rec=" << fused.rec_mii << " res=" << fused.res_mii << " ops=" << fused.num_ops << " fan=" << fused.max_fanout << " mii=" << mii_fused << ")"
+               << " -> " << (profitable ? "PROFITABLE" : "REJECTED") << "\n";
+  return profitable;
+}
+
+//===----------------------------------------------------------------------===//
+// Rewrite patterns
+//===----------------------------------------------------------------------===//
+
+// Fuses a producer task into its consumer when the producer's output feeds
+// directly into the consumer and the loop structures match.
+struct ProducerConsumerTaskFusion
+    : public OpRewritePattern<TaskflowTaskOp> {
+  using OpRewritePattern<TaskflowTaskOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TaskflowTaskOp consumer,
+                                PatternRewriter &rewriter) const override {
+    TaskflowTaskOp producer = nullptr;
+    Value intermediate;
+
+    // Searches consumer's read and write memrefs for a fusible producer.
+    auto tryFindProducer = [&](ValueRange inputs) -> bool {
+      for (Value in : inputs) {
+        // NOTE(review): <TaskflowTaskOp> reconstructed -- `def` is assigned to
+        // `producer` (a TaskflowTaskOp) below, so the typed overload is needed.
+        auto def = in.getDefiningOp<TaskflowTaskOp>();
+        if (!canFuseTasks(def, consumer) ||
+            !hasOnlySingleUseOutputs(def) ||
+            hasInterveningUses(def, consumer) ||
+            !haveMatchingLoopStructure(def, consumer) ||
+            !isFusionProfitable(def, consumer, true, in))
+          continue;
+        producer = def;
+        intermediate = in;
+        return true;
+      }
+      return false;
+    };
+    if (!tryFindProducer(consumer.getReadMemrefs()) &&
+        !tryFindProducer(consumer.getWriteMemrefs()))
+      return failure();
+
+    auto fused = fuseProducerConsumerTasks(producer, consumer,
+                                           intermediate, rewriter);
+    // Redirects all consumer outputs to the fused task before erasing.
+    for (auto [o, n] : llvm::zip(consumer.getWriteOutputs(),
+                                 fused.getWriteOutputs()))
+      o.replaceAllUsesWith(n);
+    for (auto [o, n] : llvm::zip(consumer.getValueOutputs(),
+                                 fused.getValueOutputs()))
+      o.replaceAllUsesWith(n);
+    // Erases consumer first so its uses of producer results disappear.
+    rewriter.eraseOp(consumer);
+    rewriter.eraseOp(producer);
+    return success();
+  }
+};
+
+// Fuses sibling tasks that share inputs and have matching loop structures.
+struct SiblingTaskFusion : public OpRewritePattern<TaskflowTaskOp> {
+  // NOTE(review): <TaskflowTaskOp> template arguments reconstructed (angle
+  // brackets were stripped in transit); confirm against the original patch.
+  using OpRewritePattern<TaskflowTaskOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TaskflowTaskOp t1,
+                                PatternRewriter &rewriter) const override {
+    // Scans forward for the first fusible sibling task.
+    TaskflowTaskOp t2 = nullptr;
+    for (Operation *op = t1->getNextNode(); op; op = op->getNextNode()) {
+      auto next = dyn_cast<TaskflowTaskOp>(op);
+      if (!next) continue;
+      if (areSiblingTasks(t1, next) && canFuseTasks(t1, next) &&
+          haveMatchingLoopStructure(t1, next) &&
+          isFusionProfitable(t1, next, false)) {
+        t2 = next;
+        break;
+      }
+    }
+    if (!t2) return failure();
+
+    auto fused = fuseSiblingTasks(t1, t2, rewriter);
+    // Fused outputs are laid out as [t1 outputs..., t2 outputs...]; t2's
+    // outputs start at t1's output counts.
+    unsigned t1_wo = t1.getWriteOutputs().size();
+    unsigned t1_vo = t1.getValueOutputs().size();
+    for (unsigned i = 0; i < t1_wo; ++i)
+      t1.getWriteOutputs()[i].replaceAllUsesWith(fused.getWriteOutputs()[i]);
+    for (unsigned i = 0; i < t1_vo; ++i)
+      t1.getValueOutputs()[i].replaceAllUsesWith(fused.getValueOutputs()[i]);
+    for (unsigned i = 0; i < t2.getWriteOutputs().size(); ++i)
+      t2.getWriteOutputs()[i].replaceAllUsesWith(
+          fused.getWriteOutputs()[t1_wo + i]);
+    for (unsigned i = 0; i < t2.getValueOutputs().size(); ++i)
+      t2.getValueOutputs()[i].replaceAllUsesWith(
+          fused.getValueOutputs()[t1_vo + i]);
+    rewriter.eraseOp(t1);
+    rewriter.eraseOp(t2);
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Pass definition
+//===----------------------------------------------------------------------===//
+
+// Applies producer-consumer and sibling task fusion using MII-based
+// profitability analysis and loop body merging.
+struct FuseTaskPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(FuseTaskPass) + + StringRef getArgument() const override { return "fuse-task"; } + StringRef getDescription() const override { + return "Fuses taskflow.task ops via counter chain and hyperblock merging."; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext(), /*benefit=*/10); + patterns.add(&getContext(), /*benefit=*/5); + if (failed(applyPatternsGreedily(getOperation(), + FrozenRewritePatternSet(std::move(patterns))))) + signalPassFailure(); + } +}; + +} // namespace + +namespace mlir::taskflow { +std::unique_ptr createFuseTaskPass() { + return std::make_unique(); +} +} // namespace mlir::taskflow diff --git a/test/neura/kernel_fusion/kernel.cpp b/test/neura/kernel_fusion/kernel.cpp new file mode 100644 index 00000000..7b1cb617 --- /dev/null +++ b/test/neura/kernel_fusion/kernel.cpp @@ -0,0 +1,152 @@ +// Test cases for FuseTaskPass +// +// Build workflow using Polygeist: +// 1. cgeist kernel.cpp -S -O2 -> SCF loops +// 2. polygeist-opt --raise-scf-to-affine -> Affine loops +// 3. mlir-neura-opt --affine-loop-tree-serialization \ +// --convert-affine-to-taskflow \ +// --construct-hyperblock-from-task -> taskflow.task ops +// 4. 
mlir-neura-opt --fuse-task -> Fused tasks

#define N 64

// Global buffers used by main(); the test functions themselves take the
// arrays as parameters so Polygeist emits memref arguments.
float A[N], B[N], C[N], D[N], E[N], F[N], G[N], H[N], X[N], Y[N];

// Producer-Consumer Fusion: kernel0 -> kernel1
// kernel0: C[i] = A[i] + B[i]
// kernel1: D[i] = C[i] * 2.0
void test_producer_consumer_fusion(float A[], float B[], float C[], float D[]) {
  for (int i = 0; i < N; i++) {
    C[i] = A[i] + B[i];
  }

  for (int i = 0; i < N; i++) {
    D[i] = C[i] * 2.0f;
  }
}

// Multiple Consumers: kernel0 -> kernel1, kernel0 -> kernel2
// kernel0: C[i] = A[i] + B[i]
// kernel1: D[i] = C[i] * 2.0
// kernel2: E[i] = C[i] + 1.0
// NOTE(review): this function is not invoked from main() below -- confirm
// whether the call was intentionally omitted.
void test_multiple_consumers(float A[], float B[], float C[], float D[], float E[]) {
  for (int i = 0; i < N; i++) {
    C[i] = A[i] + B[i];
  }

  for (int i = 0; i < N; i++) {
    D[i] = C[i] * 2.0f;
  }

  for (int i = 0; i < N; i++) {
    E[i] = C[i] + 1.0f;
  }
}

// Sibling Fusion: kernel0 || kernel1 (share input A)
// kernel0: E[i] = A[i] * 3.0
// kernel1: F[i] = A[i] + 1.0
void test_sibling_fusion(float A[], float E[], float F[]) {
  for (int i = 0; i < N; i++) {
    E[i] = A[i] * 3.0f;
  }

  for (int i = 0; i < N; i++) {
    F[i] = A[i] + 1.0f;
  }
}

// No Shared Input: kernel0, kernel1 (no fusion - different inputs)
// kernel0: G[i] = X[i] * 2.0
// kernel1: H[i] = Y[i] + 3.0
void test_no_shared_input(float X[], float Y[], float G[], float H[]) {
  for (int i = 0; i < N; i++) {
    G[i] = X[i] * 2.0f;
  }

  for (int i = 0; i < N; i++) {
    H[i] = Y[i] + 3.0f;
  }
}

// Chain Fusion: kernel0 -> kernel1 -> kernel2
// kernel0: C[i] = A[i] + B[i]
// kernel1: D[i] = C[i] * 2.0
// kernel2: E[i] = D[i] + 1.0
void test_chain_fusion(float A[], float B[], float C[], float D[], float E[]) {
  for (int i = 0; i < N; i++) {
    C[i] = A[i] + B[i];
  }

  for (int i = 0; i < N; i++) {
    D[i] = C[i] * 2.0f;
  }

  for (int i = 0; i < N; i++) {
    E[i] = D[i] + 1.0f;
  }
}

// Complex Sibling: (kernel0 || kernel1 || kernel2), kernel3
// kernel0: C[i] = A[i] * 2.0
// kernel1: D[i] = A[i] + 1.0  -- siblings (share A)
// kernel2: E[i] = A[i] - 1.0
// kernel3: F[i] = B[i] * 3.0 (independent)
void test_complex_sibling(float A[], float B[], float C[], float D[], float E[], float F[]) {
  for (int i = 0; i < N; i++) {
    C[i] = A[i] * 2.0f;
  }

  for (int i = 0; i < N; i++) {
    D[i] = A[i] + 1.0f;
  }

  for (int i = 0; i < N; i++) {
    E[i] = A[i] - 1.0f;
  }

  for (int i = 0; i < N; i++) {
    F[i] = B[i] * 3.0f;
  }
}

// Mixed Patterns: (kernel0 -> kernel3) || (kernel1 || kernel2)
// kernel0: C[i] = A[i] + B[i] ─┐
// kernel1: D[i] = A[i] * 2.0  ├─ siblings (share A)
// kernel2: E[i] = A[i] + 3.0 ─┘
// kernel3: F[i] = C[i] * 2.0 (consumer of kernel0)
void test_mixed_patterns(float A[], float B[], float C[], float D[], float E[], float F[]) {
  for (int i = 0; i < N; i++) {
    C[i] = A[i] + B[i];
  }

  for (int i = 0; i < N; i++) {
    D[i] = A[i] * 2.0f;
  }

  for (int i = 0; i < N; i++) {
    E[i] = A[i] + 3.0f;
  }

  for (int i = 0; i < N; i++) {
    F[i] = C[i] * 2.0f;
  }
}

// Driver: initializes the global arrays and runs each fusion scenario.
// NOTE(review): test_multiple_consumers is defined above but never called
// here -- confirm whether that is intentional.
int main() {
  for (int i = 0; i < N; i++) {
    A[i] = (float)i;
    B[i] = (float)(i * 2);
    X[i] = (float)(i + 1);
    Y[i] = (float)(i - 1);
  }

  test_producer_consumer_fusion(A, B, C, D);
  test_sibling_fusion(A, E, F);
  test_no_shared_input(X, Y, G, H);
  test_chain_fusion(A, B, C, D, E);
  test_complex_sibling(A, B, C, D, E, F);
  test_mixed_patterns(A, B, C, D, E, F);

  return 0;
}
diff --git a/test/neura/kernel_fusion/test.mlir b/test/neura/kernel_fusion/test.mlir
new file mode 100644
index 00000000..ddb76a01
--- /dev/null
+++ b/test/neura/kernel_fusion/test.mlir
@@ -0,0 +1,233 @@
+// RUN: mlir-neura-opt %s \
+// RUN:   --affine-loop-tree-serialization \
+// RUN:   --convert-affine-to-taskflow \
+// RUN:   --construct-hyperblock-from-task \
+// RUN:   --fuse-task \
+// RUN:   2>&1 | FileCheck %s

module {

// =============================================================================
// TEST 1: Producer-Consumer Fusion
// Loops share
intermediate memref C: loop1 writes C, loop2 reads C.
// Expected: Fused into one task; intermediate store/load eliminated.
// =============================================================================

// CHECK-LABEL: func.func @test_producer_consumer
// CHECK: taskflow.task @fused_pc
// CHECK: arith.addf
// CHECK-NEXT: arith.mulf
// CHECK-NEXT: memref.store
// CHECK-NOT: taskflow.task
// CHECK: return

func.func @test_producer_consumer(%A: memref<64xf32>, %B: memref<64xf32>,
                                  %C: memref<64xf32>, %D: memref<64xf32>) {
  %cst = arith.constant 2.000000e+00 : f32
  affine.for %i = 0 to 64 {
    %v = affine.load %A[%i] : memref<64xf32>
    %w = affine.load %B[%i] : memref<64xf32>
    %s = arith.addf %v, %w : f32
    affine.store %s, %C[%i] : memref<64xf32>
  }
  affine.for %i = 0 to 64 {
    %v = affine.load %C[%i] : memref<64xf32>
    %r = arith.mulf %v, %cst : f32
    affine.store %r, %D[%i] : memref<64xf32>
  }
  return
}

// =============================================================================
// TEST 2: Sibling Fusion
// Both loops read from A without data dependency.
// Expected: Fused into one task with deduplicated read of A.
// =============================================================================

// CHECK-LABEL: func.func @test_sibling
// CHECK: taskflow.task @fused_sibling
// CHECK: arith.mulf
// CHECK: memref.store
// CHECK: arith.addf
// CHECK: memref.store
// CHECK-NOT: taskflow.task
// CHECK: return

func.func @test_sibling(%A: memref<64xf32>, %B: memref<64xf32>,
                        %C: memref<64xf32>) {
  %cst3 = arith.constant 3.000000e+00 : f32
  %cst1 = arith.constant 1.000000e+00 : f32
  affine.for %i = 0 to 64 {
    %v = affine.load %A[%i] : memref<64xf32>
    %r = arith.mulf %v, %cst3 : f32
    affine.store %r, %B[%i] : memref<64xf32>
  }
  affine.for %i = 0 to 64 {
    %v = affine.load %A[%i] : memref<64xf32>
    %r = arith.addf %v, %cst1 : f32
    affine.store %r, %C[%i] : memref<64xf32>
  }
  return
}

// =============================================================================
// TEST 3: No Shared Input (No Fusion)
// Loops read from different arrays (A and B) with no dependency.
// Expected: Two separate tasks remain.
// =============================================================================

// CHECK-LABEL: func.func @test_no_shared_input
// CHECK: taskflow.task @Task_0
// CHECK: taskflow.task @Task_1
// CHECK: return

func.func @test_no_shared_input(%A: memref<64xf32>, %B: memref<64xf32>,
                                %C: memref<64xf32>, %D: memref<64xf32>) {
  %cst2 = arith.constant 2.000000e+00 : f32
  %cst3 = arith.constant 3.000000e+00 : f32
  affine.for %i = 0 to 64 {
    %v = affine.load %A[%i] : memref<64xf32>
    %r = arith.mulf %v, %cst2 : f32
    affine.store %r, %C[%i] : memref<64xf32>
  }
  affine.for %i = 0 to 64 {
    %v = affine.load %B[%i] : memref<64xf32>
    %r = arith.addf %v, %cst3 : f32
    affine.store %r, %D[%i] : memref<64xf32>
  }
  return
}

// =============================================================================
// TEST 4: Chain Fusion (A -> B -> C)
// Three loops chained via intermediates C and D.
// Expected: All fused into one task; all intermediate store/load eliminated.
// (Relies on the greedy driver applying producer-consumer fusion repeatedly.)
// =============================================================================

// CHECK-LABEL: func.func @test_chain
// CHECK: taskflow.task @fused_pc
// CHECK: arith.addf
// CHECK-NEXT: arith.mulf
// CHECK-NEXT: arith.addf
// CHECK-NEXT: memref.store
// CHECK-NOT: taskflow.task
// CHECK: return

func.func @test_chain(%A: memref<64xf32>, %B: memref<64xf32>,
                      %C: memref<64xf32>, %D: memref<64xf32>,
                      %E: memref<64xf32>) {
  %cst2 = arith.constant 2.000000e+00 : f32
  %cst1 = arith.constant 1.000000e+00 : f32
  affine.for %i = 0 to 64 {
    %v = affine.load %A[%i] : memref<64xf32>
    %w = affine.load %B[%i] : memref<64xf32>
    %s = arith.addf %v, %w : f32
    affine.store %s, %C[%i] : memref<64xf32>
  }
  affine.for %i = 0 to 64 {
    %v = affine.load %C[%i] : memref<64xf32>
    %r = arith.mulf %v, %cst2 : f32
    affine.store %r, %D[%i] : memref<64xf32>
  }
  affine.for %i = 0 to 64 {
    %v = affine.load %D[%i] : memref<64xf32>
    %r = arith.addf %v, %cst1 : f32
    affine.store %r, %E[%i] : memref<64xf32>
  }
  return
}

// =============================================================================
// TEST 5: Sibling Fusion Rejected - Compute Pressure
// Both loops read from shared array A and each performs a long multiply-add
// chain. The fused kernel's operation count is large enough to push res_mii
// above the individual res_mii values, making fusion unprofitable.
// Expected: Two separate tasks remain.
// =============================================================================

// CHECK-LABEL: func.func @test_sibling_rejected_compute
// CHECK: taskflow.task @Task_0
// CHECK: taskflow.task @Task_1
// CHECK-NOT: fused_sibling
// CHECK: return

func.func @test_sibling_rejected_compute(%A: memref<64xf32>,
                                         %B: memref<64xf32>,
                                         %C: memref<64xf32>) {
  %c1 = arith.constant 1.5e+00 : f32
  %c2 = arith.constant 2.5e+00 : f32
  affine.for %i = 0 to 64 {
    %v = affine.load %A[%i] : memref<64xf32>
    %r1 = arith.mulf %v, %c1 : f32
    %r2 = arith.mulf %r1, %c2 : f32
    %r3 = arith.mulf %r2, %c1 : f32
    %r4 = arith.mulf %r3, %c2 : f32
    %r5 = arith.mulf %r4, %c1 : f32
    %r6 = arith.mulf %r5, %c2 : f32
    %r7 = arith.addf %r6, %v : f32
    affine.store %r7, %B[%i] : memref<64xf32>
  }
  affine.for %i = 0 to 64 {
    %v = affine.load %A[%i] : memref<64xf32>
    %r1 = arith.mulf %v, %c2 : f32
    %r2 = arith.mulf %r1, %c1 : f32
    %r3 = arith.mulf %r2, %c2 : f32
    %r4 = arith.mulf %r3, %c1 : f32
    %r5 = arith.mulf %r4, %c2 : f32
    %r6 = arith.mulf %r5, %c1 : f32
    %r7 = arith.addf %r6, %v : f32
    affine.store %r7, %C[%i] : memref<64xf32>
  }
  return
}

// =============================================================================
// TEST 6: Sibling Fusion Rejected - Memory Pressure
// Both loops share array A but each also loads from two additional independent
// arrays, so no deduplication occurs for those loads. The accumulated load and
// compute operations in the fused kernel raise res_mii beyond either individual,
// making fusion unprofitable.
// Expected: Two separate tasks remain.
// =============================================================================

// CHECK-LABEL: func.func @test_sibling_rejected_memory
// CHECK: taskflow.task @Task_0
// CHECK: taskflow.task @Task_1
// CHECK-NOT: fused_sibling
// CHECK: return

func.func @test_sibling_rejected_memory(%A: memref<64xf32>,
                                        %B: memref<64xf32>,
                                        %C: memref<64xf32>,
                                        %D: memref<64xf32>,
                                        %E: memref<64xf32>,
                                        %F: memref<64xf32>,
                                        %G: memref<64xf32>) {
  affine.for %i = 0 to 64 {
    %va = affine.load %A[%i] : memref<64xf32>
    %vb = affine.load %B[%i] : memref<64xf32>
    %vc = affine.load %C[%i] : memref<64xf32>
    %s1 = arith.addf %va, %vb : f32
    %s2 = arith.mulf %s1, %vc : f32
    %s3 = arith.mulf %s2, %va : f32
    %s4 = arith.addf %s3, %vb : f32
    %s5 = arith.mulf %s4, %vc : f32
    %s6 = arith.addf %s5, %va : f32
    affine.store %s6, %D[%i] : memref<64xf32>
  }
  affine.for %i = 0 to 64 {
    %va = affine.load %A[%i] : memref<64xf32>
    %ve = affine.load %E[%i] : memref<64xf32>
    %vf = affine.load %F[%i] : memref<64xf32>
    %s1 = arith.addf %va, %ve : f32
    %s2 = arith.mulf %s1, %vf : f32
    %s3 = arith.mulf %s2, %va : f32
    %s4 = arith.addf %s3, %ve : f32
    %s5 = arith.mulf %s4, %vf : f32
    %s6 = arith.addf %s5, %va : f32
    affine.store %s6, %G[%i] : memref<64xf32>
  }
  return
}

}
diff --git a/tools/mlir-neura-opt/mlir-neura-opt.cpp b/tools/mlir-neura-opt/mlir-neura-opt.cpp
index a809f6f3..587ab050 100644
--- a/tools/mlir-neura-opt/mlir-neura-opt.cpp
+++ b/tools/mlir-neura-opt/mlir-neura-opt.cpp
@@ -34,6 +34,9 @@ using mlir::neura::util::ArchParser;
 // Global variable to store architecture spec file path
 static std::string architecture_spec_file;
 
+// Global variable to store latency spec file path
+static std::string latency_spec_file;
+
 const Architecture &mlir::neura::getArchitecture() {
   static Architecture instance = []() {
     auto arch_parser = ArchParser(architecture_spec_file);
@@ -46,6 +49,10 @@ const Architecture &mlir::neura::getArchitecture() {
   return instance;
 }
+const std::string &mlir::neura::getLatencySpecFile() { + return latency_spec_file; +} + int main(int argc, char **argv) { // Manually scan and strip --architecture-spec from argv, keep others for // MlirOptMain. @@ -68,6 +75,15 @@ int main(int argc, char **argv) { architecture_spec_file = arg_ref.substr(strlen("--architecture-spec=")).str(); continue; + } else if (arg_ref == "--latency-spec") { + if (i + 1 < argc) { + latency_spec_file = argv[i + 1]; + ++i; // skip value + continue; + } + } else if (arg_ref.starts_with("--latency-spec=")) { + latency_spec_file = arg_ref.substr(strlen("--latency-spec=")).str(); + continue; } forwarded_args.push_back(argv[i]); }