diff --git a/include/NeuraDialect/Architecture/ArchitectureSpec.h b/include/NeuraDialect/Architecture/ArchitectureSpec.h index 2df350f7..1a63244c 100644 --- a/include/NeuraDialect/Architecture/ArchitectureSpec.h +++ b/include/NeuraDialect/Architecture/ArchitectureSpec.h @@ -72,6 +72,10 @@ struct LinkOverride { // This is set by the command line tool when a YAML file is provided. std::string getArchitectureSpecFile(); +// Function for getting the latency specification file path. +// This is set by the command line tool when a YAML file is provided. +std::string getLatencySpecFile(); + // Function for getting tile defaults configuration. TileDefaults getTileDefaults(); diff --git a/include/NeuraDialect/Mapping/MappingState.h b/include/NeuraDialect/Mapping/MappingState.h index a43cea04..99dc737f 100644 --- a/include/NeuraDialect/Mapping/MappingState.h +++ b/include/NeuraDialect/Mapping/MappingState.h @@ -10,6 +10,13 @@ namespace mlir { namespace neura { +// Occupy status for multi-cycle pipeline support. +// These states define how a tile/FU is occupied at a given time step. +#define SINGLE_OCCUPY 0 // A single-cycle op is in the FU (exclusive) +#define START_PIPE_OCCUPY 1 // A multi-cycle op starts in the FU +#define END_PIPE_OCCUPY 2 // A multi-cycle op ends in the FU +#define IN_PIPE_OCCUPY 3 // A multi-cycle op is occupying the FU (pipelined) + // Represents a spatial-temporal location: (resource, time_step) struct MappingLoc { BasicResource *resource; @@ -54,9 +61,20 @@ namespace neura { class MappingState { public: MappingState(const Architecture &arch, int II, bool is_spatial_only); - // Binds a (tile/link, time_step) location to an operation. + // Binds a (tile/link, time_step) location to an operation with default + // SINGLE_OCCUPY status. bool bindOp(const MappingLoc &loc, Operation *op); + // Binds a (tile/link, time_step) location to an operation with specified + // occupy status for multi-cycle pipeline support. 
+ bool bindOp(const MappingLoc &loc, Operation *op, int occupy_status); + + // Binds multiple locations for a multi-cycle operation. + // This sets START_PIPE_OCCUPY at start_time, IN_PIPE_OCCUPY for intermediate + // times, and END_PIPE_OCCUPY at end_time-1. + bool bindMultiCycleOp(BasicResource *resource, int start_time, int latency, + Operation *op); + // Unbinds an operation from its (tile/link, time_step) location, // which is useful for backtracking. void unbindOp(Operation *op); @@ -67,6 +85,19 @@ class MappingState { // it will check (tile 2, step 1), (tile 2, step 5), (tile 2, step 9), etc. bool isAvailableAcrossTime(const MappingLoc &loc) const; + // Checks if a location is available for a specific occupy status. + // This implements the pipeline-aware availability checking: + // - SINGLE_OCCUPY: only available if location is completely free + // - START_PIPE_OCCUPY: available if free or IN_PIPE_OCCUPY or END_PIPE_OCCUPY + // - END_PIPE_OCCUPY: available if free or IN_PIPE_OCCUPY or START_PIPE_OCCUPY + // - IN_PIPE_OCCUPY: always available (can pipeline with any status) + bool isAvailableForOccupyStatus(const MappingLoc &loc, + int new_occupy_status) const; + + // Gets the occupy status at a specific location across time domain. + // Returns -1 if the location is not occupied. + int getOccupyStatusAcrossTime(const MappingLoc &loc) const; + // Checks if a hardware resource is available across a time range. // This function leverages the isAvailableAcrossTime function in each // time step. @@ -111,7 +142,8 @@ class MappingState { void dumpOpToLocs(llvm::raw_ostream &os = llvm::errs()) const; // Getters for state information. - const std::set &getOccupiedLocs() const { + const std::map>> & + getOccupiedLocs() const { return this->occupied_locs; } const std::map &getLocToOp() const { @@ -122,7 +154,9 @@ class MappingState { } // Setters for state information. 
- void setOccupiedLocs(const std::set &locs) { + void setOccupiedLocs( + const std::map>> + &locs) { this->occupied_locs = locs; } void setLocToOp(const std::map &loc_to_op) { @@ -139,7 +173,9 @@ class MappingState { bool is_spatial_only; static constexpr int kMaxSteps = 10; - std::set occupied_locs; + // Maps location to a list of (occupy_status, operation) pairs. + // Multiple ops can occupy the same location with compatible pipeline states. + std::map>> occupied_locs; std::map loc_to_op; std::map> op_to_locs; }; @@ -160,7 +196,7 @@ class MappingStateSnapshot { } private: - std::set occupied_locs; + std::map>> occupied_locs; std::map loc_to_op; std::map> op_to_locs; }; diff --git a/include/NeuraDialect/Mapping/mapping_util.h b/include/NeuraDialect/Mapping/mapping_util.h index 0a36d476..dfe7ca4d 100644 --- a/include/NeuraDialect/Mapping/mapping_util.h +++ b/include/NeuraDialect/Mapping/mapping_util.h @@ -116,5 +116,12 @@ bool canReachLocInTime(const std::vector &producers, Register *getAvailableRegister(const MappingState &mapping_state, Tile *tile, int start_time, int exclusive_end_time); +// Gets the execution latency of an operation from its "latency" attribute. +// Returns 1 (single-cycle) if the attribute is not present. +int getOpLatency(Operation *op); + +// Checks if an operation is a multi-cycle operation (latency > 1). 
+bool isMultiCycleOp(Operation *op); + } // namespace neura } // namespace mlir diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index 95aa70c8..340886ed 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -38,6 +38,7 @@ std::unique_ptr createWrapLoopInKernelPass(); // Hardware specific optimization passes std::unique_ptr createFuseLoopControlPass(); std::unique_ptr createFusePatternPass(); +std::unique_ptr createFuseKernelPass(); // Hardware agnostic optimization passes std::unique_ptr createFoldConstantPass(); @@ -49,6 +50,7 @@ std::unique_ptr createInitPatternPass(); // Hardware optimization passes std::unique_ptr createHardwareMergePass(); +std::unique_ptr createInitExecLatencyPass(); #define GEN_PASS_REGISTRATION #include "NeuraDialect/NeuraPasses.h.inc" diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td index ec0df60b..90a5d5e3 100644 --- a/include/NeuraDialect/NeuraPasses.td +++ b/include/NeuraDialect/NeuraPasses.td @@ -20,6 +20,21 @@ def FusePattern : Pass<"fuse-pattern", "ModuleOp"> { let constructor = "neura::createFusePatternPass()"; } +def FuseKernel : Pass<"fuse-kernel", "ModuleOp"> { + let summary = "Fuses kernel operations in the Neura dialect"; + let description = [{ + This pass fuses neura.kernel operations using producer-consumer and sibling + fusion strategies, inspired by MLIR's linalg and affine loop fusion. + + Producer-Consumer Fusion: Fuses a producer kernel into its consumer when + the producer's output is only used by the consumer. + + Sibling Fusion: Fuses kernels that share the same input operands and have + no data dependencies between them. 
+ }]; + let constructor = "neura::createFuseKernelPass()"; +} + def InsertDataMov : Pass<"insert-data-mov", "ModuleOp"> { let summary = "Inserts data move operations in the Neura dialect"; let description = @@ -194,4 +209,12 @@ def HardwareMerge : Pass<"hardware-merge", "ModuleOp"> { }]; let constructor = "neura::createHardwareMergePass()"; } + +def InitExecLatency : Pass<"init-exec-latency", "ModuleOp"> { + let summary = "Initialize execution latency information"; + let description = [{ + This pass initializes execution latency information. + }]; + let constructor = "neura::createInitExecLatencyPass()"; +} #endif // NEURA_PASSES_TD \ No newline at end of file diff --git a/lib/NeuraDialect/Mapping/MappingState.cpp b/lib/NeuraDialect/Mapping/MappingState.cpp index 110d1976..d537eeea 100644 --- a/lib/NeuraDialect/Mapping/MappingState.cpp +++ b/lib/NeuraDialect/Mapping/MappingState.cpp @@ -3,6 +3,7 @@ #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "llvm/Support/raw_ostream.h" +#include using namespace mlir; using namespace mlir::neura; @@ -30,14 +31,62 @@ MappingState::MappingState(const Architecture &arch, int II, : II(II), is_spatial_only(is_spatial_only) {} bool MappingState::bindOp(const MappingLoc &loc, Operation *op) { + // Default to SINGLE_OCCUPY for backward compatibility + return bindOp(loc, op, SINGLE_OCCUPY); +} + +bool MappingState::bindOp(const MappingLoc &loc, Operation *op, + int occupy_status) { + // Check if the location is available for the specified occupy status + if (!isAvailableForOccupyStatus(loc, occupy_status)) { + return false; + } + loc_to_op[loc] = op; - occupied_locs.insert(loc); + occupied_locs[loc].push_back({occupy_status, op}); auto it = op_to_locs.find(op); assert(it == op_to_locs.end() && "Operation already has reserved locations"); op_to_locs[op].push_back(loc); return true; } +bool MappingState::bindMultiCycleOp(BasicResource *resource, int start_time, + int latency, Operation *op) { + // First 
check if all locations are available + for (int t = start_time; t < start_time + latency; ++t) { + MappingLoc check_loc = {resource, t}; + int status; + if (t == start_time) { + status = START_PIPE_OCCUPY; + } else if (t == start_time + latency - 1) { + status = END_PIPE_OCCUPY; + } else { + status = IN_PIPE_OCCUPY; + } + if (!isAvailableForOccupyStatus(check_loc, status)) { + return false; + } + } + + // Now bind all locations + for (int t = start_time; t < start_time + latency; ++t) { + MappingLoc loc = {resource, t}; + int status; + if (t == start_time) { + status = START_PIPE_OCCUPY; + } else if (t == start_time + latency - 1) { + status = END_PIPE_OCCUPY; + } else { + status = IN_PIPE_OCCUPY; + } + + loc_to_op[loc] = op; + occupied_locs[loc].push_back({status, op}); + op_to_locs[op].push_back(loc); + } + return true; +} + void MappingState::unbindOp(Operation *op) { auto it = op_to_locs.find(op); if (it == op_to_locs.end()) { @@ -46,7 +95,21 @@ void MappingState::unbindOp(Operation *op) { for (const MappingLoc &loc : it->second) { loc_to_op.erase(loc); - occupied_locs.erase(loc); + // Remove entries for this op from occupied_locs + auto occ_it = occupied_locs.find(loc); + if (occ_it != occupied_locs.end()) { + auto &entries = occ_it->second; + entries.erase( + std::remove_if(entries.begin(), entries.end(), + [op](const std::pair &entry) { + return entry.second == op; + }), + entries.end()); + // Remove the location entirely if no more entries + if (entries.empty()) { + occupied_locs.erase(occ_it); + } + } } op_to_locs.erase(it); @@ -57,21 +120,128 @@ bool MappingState::isAvailableAcrossTime(const MappingLoc &loc) const { if (this->is_spatial_only) { for (int t = 0; t < II * kMaxSteps; ++t) { MappingLoc check_loc = {loc.resource, t}; - if (occupied_locs.find(check_loc) != occupied_locs.end()) { - return false; + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end()) { + // Check if all existing occupy statuses allow new single-cycle op + for 
(const auto &entry : it->second) { + if (entry.first != IN_PIPE_OCCUPY) { + return false; + } + } } } return true; } else { - // Checks the availability across time domain. for (int t = loc.time_step % II; t < II * kMaxSteps; t += II) { MappingLoc check_loc = {loc.resource, t}; - if (occupied_locs.find(check_loc) != occupied_locs.end()) { + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end()) { + // Check if all existing occupy statuses allow new single-cycle op + for (const auto &entry : it->second) { + if (entry.first != IN_PIPE_OCCUPY) { + return false; + } + } + } + } + return true; + } +} + +bool MappingState::isAvailableForOccupyStatus(const MappingLoc &loc, + int new_occupy_status) const { + // Helper lambda to check a single location against all existing entries + auto checkSingleLoc = [this, new_occupy_status](const MappingLoc &check_loc) -> bool { + auto it = occupied_locs.find(check_loc); + if (it == occupied_locs.end() || it->second.empty()) { + // Location is free, always available + return true; + } + + // Check against all existing entries at this location + for (const auto &entry : it->second) { + int existing_status = entry.first; + + // Implement the pipeline-aware availability rules: + // - SINGLE_OCCUPY (0): exclusive, no other op can share + // - START_PIPE_OCCUPY (1): cannot coexist with SINGLE or another START + // - END_PIPE_OCCUPY (2): cannot coexist with SINGLE or another END + // - IN_PIPE_OCCUPY (3): can coexist with any status except SINGLE + + if (existing_status == SINGLE_OCCUPY) { + // SINGLE_OCCUPY blocks everything + return false; + } + + if (new_occupy_status == SINGLE_OCCUPY) { + // SINGLE_OCCUPY cannot be placed if anything is there return false; } + + if (new_occupy_status == START_PIPE_OCCUPY) { + // START cannot coexist with another START + if (existing_status == START_PIPE_OCCUPY) { + return false; + } + } + + if (new_occupy_status == END_PIPE_OCCUPY) { + // END cannot coexist with another END + if 
(existing_status == END_PIPE_OCCUPY) { + return false; + } + } + + // IN_PIPE_OCCUPY can coexist with START, END, or other IN_PIPE } return true; + }; + + // For spatial mapping, check all time steps + if (this->is_spatial_only) { + for (int t = 0; t < II * kMaxSteps; ++t) { + MappingLoc check_loc = {loc.resource, t}; + if (!checkSingleLoc(check_loc)) { + return false; + } + } + return true; + } else { + // Check across time domain (modulo II) + for (int t = loc.time_step % II; t < II * kMaxSteps; t += II) { + MappingLoc check_loc = {loc.resource, t}; + if (!checkSingleLoc(check_loc)) { + return false; + } + } + return true; + } +} + +int MappingState::getOccupyStatusAcrossTime(const MappingLoc &loc) const { + // For spatial mapping, check all time steps + if (this->is_spatial_only) { + for (int t = 0; t < II * kMaxSteps; ++t) { + MappingLoc check_loc = {loc.resource, t}; + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end() && !it->second.empty()) { + // Return the first status found (most restrictive) + return it->second[0].first; + } + } + return -1; + } else { + // Check across time domain (modulo II) + for (int t = loc.time_step % II; t < II * kMaxSteps; t += II) { + MappingLoc check_loc = {loc.resource, t}; + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end() && !it->second.empty()) { + // Return the first status found (most restrictive) + return it->second[0].first; + } + } + return -1; } } @@ -202,12 +372,9 @@ void MappingState::reserveRoute(Operation *op, ArrayRef path) { op_to_locs[op] = std::vector(path.begin(), path.end()); for (const MappingLoc &loc : path) { - assert(occupied_locs.find(loc) == occupied_locs.end() && - "Mapping location already occupied"); loc_to_op[loc] = op; - assert(occupied_locs.find(loc) == occupied_locs.end() && - "Mapping location already occupied in occupied_locs"); - occupied_locs.insert(loc); + // Use SINGLE_OCCUPY for route reservations (links/registers) + 
occupied_locs[loc].push_back({SINGLE_OCCUPY, op}); } } @@ -221,7 +388,21 @@ void MappingState::releaseRoute(Operation *op) { for (const MappingLoc &loc : route) { loc_to_op.erase(loc); - occupied_locs.erase(loc); + // Remove entries for this op from occupied_locs + auto occ_it = occupied_locs.find(loc); + if (occ_it != occupied_locs.end()) { + auto &entries = occ_it->second; + entries.erase( + std::remove_if(entries.begin(), entries.end(), + [op](const std::pair &entry) { + return entry.second == op; + }), + entries.end()); + // Remove the location entirely if no more entries + if (entries.empty()) { + occupied_locs.erase(occ_it); + } + } } op_to_locs.erase(it); diff --git a/lib/NeuraDialect/Mapping/mapping_util.cpp b/lib/NeuraDialect/Mapping/mapping_util.cpp index f5b7a86d..65abaab6 100644 --- a/lib/NeuraDialect/Mapping/mapping_util.cpp +++ b/lib/NeuraDialect/Mapping/mapping_util.cpp @@ -4,6 +4,7 @@ #include "NeuraDialect/Mapping/mapping_util.h" #include "NeuraDialect/NeuraOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Operation.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -1112,13 +1113,37 @@ llvm::SmallVector mlir::neura::getCtrlMovUsers(Operation *op) { bool mlir::neura::placeAndRoute(Operation *op, const MappingLoc &target_loc, MappingState &mapping_state) { - if (mapping_state.bindOp(target_loc, op)) { + // Get the latency of the operation to determine if it's multi-cycle + int latency = getOpLatency(op); + bool is_multi_cycle = latency > 1; + + bool bind_success = false; + if (is_multi_cycle) { + // For multi-cycle ops, bind across multiple time steps with pipeline status + bind_success = mapping_state.bindMultiCycleOp( + target_loc.resource, target_loc.time_step, latency, op); + if (bind_success) { + llvm::errs() << "[DEBUG] Bound multi-cycle op (latency=" << latency + << ") " << *op << " onto loc: " + << target_loc.resource->getType() << "#" + << target_loc.resource->getId() 
+ << " @t=" << target_loc.time_step << " to t=" + << (target_loc.time_step + latency - 1) << "\n"; + } + } else { + // For single-cycle ops, use default SINGLE_OCCUPY binding + bind_success = mapping_state.bindOp(target_loc, op); + if (bind_success) { + llvm::errs() << "[DEBUG] Schedule op " << *op + << " onto loc: " << target_loc.resource->getType() << "#" + << target_loc.resource->getId() + << " @t=" << target_loc.time_step << "\n"; + } + } + + if (bind_success) { std::vector routed_operands; std::vector routed_ctrl_movs; - llvm::errs() << "[DEBUG] Schedule op " << *op - << " onto loc: " << target_loc.resource->getType() << "#" - << target_loc.resource->getId() - << " @t=" << target_loc.time_step << "\n"; // Tries to route the data move operations. for (Value operand : op->getOperands()) { llvm::errs() << "Processing operand: " << operand << "\n"; @@ -1219,4 +1244,17 @@ bool mlir::neura::placeAndRoute(Operation *op, const MappingLoc &target_loc, return true; } return false; +} + +int mlir::neura::getOpLatency(Operation *op) { + // Try to get the latency attribute from the operation + if (auto latency_attr = op->getAttrOfType("latency")) { + return latency_attr.getInt(); + } + // Default to single-cycle if no latency attribute is present + return 1; +} + +bool mlir::neura::isMultiCycleOp(Operation *op) { + return getOpLatency(op) > 1; } \ No newline at end of file diff --git a/lib/NeuraDialect/Transforms/CMakeLists.txt b/lib/NeuraDialect/Transforms/CMakeLists.txt index 85200b48..da52fc00 100644 --- a/lib/NeuraDialect/Transforms/CMakeLists.txt +++ b/lib/NeuraDialect/Transforms/CMakeLists.txt @@ -5,6 +5,7 @@ add_mlir_library( InsertDataMovPass.cpp InsertCtrlMovPass.cpp FusePatternPass.cpp + FuseKernelPass.cpp AssignAcceleratorPass.cpp TransformCtrlToDataFlowPass.cpp LeveragePredicatedValuePass.cpp @@ -18,6 +19,7 @@ add_mlir_library( TransformToSteerControlPass.cpp RemovePredicatedTypePass.cpp HardwareMergePass.cpp + InitExecLatencyPass.cpp 
GraphMining/HardwareTemplate.cpp WrapLoopInKernelPass.cpp diff --git a/lib/NeuraDialect/Transforms/FuseKernelPass.cpp b/lib/NeuraDialect/Transforms/FuseKernelPass.cpp new file mode 100644 index 00000000..611c5787 --- /dev/null +++ b/lib/NeuraDialect/Transforms/FuseKernelPass.cpp @@ -0,0 +1,592 @@ +//===- FuseKernelPass.cpp - Kernel Fusion Pass for Neura Dialect ----------===// +// +// This pass implements kernel fusion for the Neura dialect: +// 1. Producer-Consumer Fusion: Fuses a producer kernel into its consumer. +// 2. Sibling Fusion: Fuses kernels that share inputs without data dependency. +// +//===----------------------------------------------------------------------===// + +#include "NeuraDialect/NeuraOps.h" +#include "NeuraDialect/NeuraPasses.h" +#include "NeuraDialect/Architecture/Architecture.h" +#include "NeuraDialect/Mapping/mapping_util.h" +#include "Conversion/ConversionPasses.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/Dominance.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" + +using namespace mlir; + +#define GEN_PASS_DEF_FUSEKERNEL +#include "NeuraDialect/NeuraPasses.h.inc" + +namespace { + +// Represents metrics for evaluating fusion profitability. +struct FusionMetrics { + int rec_mii = 1; + int res_mii = 1; + int max_fanout = 0; + int num_ops = 0; +}; + +// Calculates the maximum fanout in a block. 
+int calculateMaxFanoutInBlock(Block &block) { + int max_fanout = 0; + for (Operation &op : block) { + for (Value result : op.getResults()) { + int fanout = std::distance(result.use_begin(), result.use_end()); + max_fanout = std::max(max_fanout, fanout); + } + } + return max_fanout; +} + +// Runs the neura transformation pipeline on a cloned module and computes MII metrics. +FusionMetrics computeRealMetrics(ModuleOp test_module, const neura::Architecture &architecture) { + FusionMetrics metrics; + auto cloned_module = test_module.clone(); + + PassManager pm(cloned_module.getContext()); + pm.addPass(mlir::neura::createAssignAcceleratorPass()); + pm.addPass(mlir::createLowerArithToNeuraPass()); + pm.addPass(neura::createCanonicalizeReturnPass()); + pm.addPass(neura::createCanonicalizeCastPass()); + pm.addPass(neura::createPromoteFuncArgToConstPass()); + pm.addPass(neura::createCanonicalizeLiveInPass()); + pm.addPass(neura::createLeveragePredicatedValuePass()); + pm.addPass(neura::createTransformCtrlToDataFlowPass()); + pm.enableVerifier(true); + + if (failed(pm.run(cloned_module))) { + metrics.rec_mii = 100; + metrics.res_mii = 100; + cloned_module.erase(); + return metrics; + } + + cloned_module.walk([&](func::FuncOp func_op) { + if (func_op.getName() != "test_fused_kernel") { + return; + } + metrics.res_mii = neura::calculateResMii(func_op, architecture); + auto cycles = neura::collectRecurrenceCycles(func_op); + metrics.rec_mii = 1; + for (const auto &cycle : cycles) { + metrics.rec_mii = std::max(metrics.rec_mii, cycle.length); + } + int num_ops = 0; + func_op.walk([&](Operation *op) { + if (!isa(op) && !op->hasTrait()) { + ++num_ops; + } + }); + metrics.num_ops = num_ops; + if (!func_op.getBody().empty()) { + metrics.max_fanout = calculateMaxFanoutInBlock(func_op.getBody().front()); + } + }); + + cloned_module.erase(); + return metrics; +} + +// Clones operations from a kernel block, collecting yield values. 
+void cloneKernelBlockOps(Block &source_block, OpBuilder &builder, IRMapping &mapping, SmallVectorImpl &yield_values) { + for (Operation &op : source_block) { + if (auto yield_op = dyn_cast(&op)) { + for (Value v : yield_op.getOperands()) { + yield_values.push_back(mapping.lookup(v)); + } + continue; + } + builder.clone(op, mapping); + } +} + +// Creates a test function from a kernel's body and returns the function. +func::FuncOp cloneKernelToTestFunction(neura::KernelOp kernel, OpBuilder &builder, Location loc) { + Block &kernel_block = kernel.getBody().front(); + + SmallVector input_types; + for (auto arg : kernel_block.getArguments()) { + input_types.push_back(arg.getType()); + } + SmallVector output_types(kernel.getResultTypes()); + + if (input_types.empty()) { + input_types.push_back(builder.getI64Type()); + } + if (output_types.empty()) { + output_types.push_back(builder.getI64Type()); + } + + auto func_type = builder.getFunctionType(input_types, output_types); + auto func_op = builder.create(loc, "test_fused_kernel", func_type); + func_op->setAttr("accelerator", builder.getStringAttr("neura")); + + Block *entry_block = func_op.addEntryBlock(); + builder.setInsertionPointToStart(entry_block); + + IRMapping mapping; + for (auto [kernel_arg, func_arg] : llvm::zip(kernel_block.getArguments(), entry_block->getArguments())) { + mapping.map(kernel_arg, func_arg); + } + + SmallVector yield_values; + cloneKernelBlockOps(kernel_block, builder, mapping, yield_values); + + if (yield_values.empty()) { + yield_values.push_back(entry_block->getArgument(0)); + } + auto return_op = builder.create(loc, yield_values); + return_op->setAttr("return_type", builder.getStringAttr("value")); + + return func_op; +} + +// Computes metrics for a single kernel by creating a test module. 
+FusionMetrics computeSingleKernelMetrics(neura::KernelOp kernel, const neura::Architecture &architecture) { + MLIRContext *ctx = kernel.getContext(); + OpBuilder builder(ctx); + + auto module = ModuleOp::create(builder.getUnknownLoc()); + builder.setInsertionPointToStart(module.getBody()); + cloneKernelToTestFunction(kernel, builder, builder.getUnknownLoc()); + + FusionMetrics metrics = computeRealMetrics(module, architecture); + module.erase(); + return metrics; +} + +// Computes metrics for fused kernels by directly merging kernel bodies into a test function. +FusionMetrics computeFusedKernelMetrics(neura::KernelOp kernel1, neura::KernelOp kernel2, bool is_producer_consumer, Value fused_value, const neura::Architecture &architecture) { + MLIRContext *ctx = kernel1.getContext(); + OpBuilder builder(ctx); + Location loc = builder.getUnknownLoc(); + + auto module = ModuleOp::create(loc); + builder.setInsertionPointToStart(module.getBody()); + + Block &k1_block = kernel1.getBody().front(); + Block &k2_block = kernel2.getBody().front(); + + // Collects input types from both kernels. + SmallVector input_types; + for (auto arg : k1_block.getArguments()) { + input_types.push_back(arg.getType()); + } + + // Finds which kernel2 arg corresponds to fused_value for producer-consumer fusion. + int fused_value_arg_idx = -1; + if (is_producer_consumer && fused_value) { + for (auto [idx, input] : llvm::enumerate(kernel2.getInputs())) { + if (input == fused_value) { + fused_value_arg_idx = idx; + break; + } + } + } + + for (auto [idx, arg] : llvm::enumerate(k2_block.getArguments())) { + if (static_cast(idx) != fused_value_arg_idx) { + input_types.push_back(arg.getType()); + } + } + + // Determines output types based on fusion type. 
+ SmallVector output_types; + if (is_producer_consumer) { + output_types.append(kernel2.getResultTypes().begin(), kernel2.getResultTypes().end()); + } else { + output_types.append(kernel1.getResultTypes().begin(), kernel1.getResultTypes().end()); + output_types.append(kernel2.getResultTypes().begin(), kernel2.getResultTypes().end()); + } + if (input_types.empty()) { + input_types.push_back(builder.getI64Type()); + } + if (output_types.empty()) { + output_types.push_back(builder.getI64Type()); + } + + // Creates test function. + auto func_type = builder.getFunctionType(input_types, output_types); + auto func_op = builder.create(loc, "test_fused_kernel", func_type); + func_op->setAttr("accelerator", builder.getStringAttr("neura")); + Block *entry_block = func_op.addEntryBlock(); + builder.setInsertionPointToStart(entry_block); + + // Maps kernel1's block arguments to function arguments. + IRMapping mapping; + unsigned func_arg_idx = 0; + for (auto k1_arg : k1_block.getArguments()) { + mapping.map(k1_arg, entry_block->getArgument(func_arg_idx++)); + } + + // Clones kernel1's operations. + SmallVector k1_yields; + cloneKernelBlockOps(k1_block, builder, mapping, k1_yields); + + // Maps kernel2's block arguments. + for (auto [idx, k2_arg] : llvm::enumerate(k2_block.getArguments())) { + if (is_producer_consumer && static_cast(idx) == fused_value_arg_idx) { + if (!k1_yields.empty()) { + mapping.map(k2_arg, k1_yields[0]); + } + } else { + mapping.map(k2_arg, entry_block->getArgument(func_arg_idx++)); + } + } + + // Clones kernel2's operations. + SmallVector k2_yields; + cloneKernelBlockOps(k2_block, builder, mapping, k2_yields); + + // Creates return with appropriate yields. 
+ SmallVector return_values; + if (is_producer_consumer) { + return_values = k2_yields; + } else { + return_values.append(k1_yields.begin(), k1_yields.end()); + return_values.append(k2_yields.begin(), k2_yields.end()); + } + if (return_values.empty()) { + return_values.push_back(entry_block->getArgument(0)); + } + auto return_op = builder.create(loc, return_values); + return_op->setAttr("return_type", builder.getStringAttr("value")); + + FusionMetrics metrics = computeRealMetrics(module, architecture); + module.erase(); + return metrics; +} + +int estimateMII(const FusionMetrics &metrics, int total_ops, int total_tiles) { + const float alpha = 0.5; + const float beta = 0.5; + int mii = std::max(metrics.rec_mii, metrics.res_mii); + return std::ceil((1.0 + alpha * (total_ops / float(total_tiles))) * (1 + beta * std::max(metrics.max_fanout - 4, 0)) * mii); +} + +// Checks if fusion is profitable based on MII and fanout metrics. +bool isFusionProfitable(neura::KernelOp kernel1, neura::KernelOp kernel2, bool is_producer_consumer, Value fused_value = nullptr) { + neura::Architecture architecture(1, 1, neura::BaseTopology::MESH, 4, 4, neura::BaseTopology::MESH); + + FusionMetrics m1 = computeSingleKernelMetrics(kernel1, architecture); + FusionMetrics m2 = computeSingleKernelMetrics(kernel2, architecture); + FusionMetrics fused = computeFusedKernelMetrics(kernel1, kernel2, is_producer_consumer, fused_value, architecture); + + return estimateMII(fused, fused.num_ops, architecture.getNumTiles()) <= std::max(estimateMII(m1, m1.num_ops, architecture.getNumTiles()), estimateMII(m2, m2.num_ops, architecture.getNumTiles())); + +} + +// Checks if two kernels can be fused (same block, producer before consumer). 
+bool canFuseKernels(neura::KernelOp producer, neura::KernelOp consumer) { + if (!producer || !consumer || producer == consumer) { + return false; + } + if (producer->getBlock() != consumer->getBlock()) { + return false; + } + return producer->isBeforeInBlock(consumer); +} + +// Returns true if consumer uses any of producer's results. +bool hasProducerConsumerRelation(neura::KernelOp producer, neura::KernelOp consumer) { + for (Value result : producer.getOutputs()) { + for (Value input : consumer.getInputs()) { + if (result == input) { + return true; + } + } + } + return false; +} + +// Checks if two kernels are siblings (share inputs but no data dependency). +bool areSiblingKernels(neura::KernelOp kernel1, neura::KernelOp kernel2) { + llvm::SmallPtrSet kernel1_inputs(kernel1.getInputs().begin(), kernel1.getInputs().end()); + bool share_input = llvm::any_of(kernel2.getInputs(), [&](Value input) { + return kernel1_inputs.contains(input); + }); + return share_input && !hasProducerConsumerRelation(kernel1, kernel2) && !hasProducerConsumerRelation(kernel2, kernel1); +} + +// Checks if any operation between producer and consumer uses producer's results. +bool hasInterveningUses(neura::KernelOp producer, neura::KernelOp consumer) { + llvm::SmallPtrSet producer_results(producer.getOutputs().begin(), producer.getOutputs().end()); + bool in_range = false; + for (Operation &op : *producer->getBlock()) { + if (&op == producer.getOperation()) { + in_range = true; + continue; + } + if (&op == consumer.getOperation()) { + break; + } + if (in_range) { + for (Value operand : op.getOperands()) { + if (producer_results.contains(operand)) { + return true; + } + } + } + } + return false; +} + +// Collects inputs from two kernels, avoiding duplicates. 
+void collectFusedInputs(OperandRange inputs1, OperandRange inputs2, SmallVectorImpl &fused_inputs, SmallVectorImpl &fused_input_types, llvm::SmallDenseMap &input_index_map) { + for (Value input : inputs1) { + input_index_map[input] = fused_inputs.size(); + fused_inputs.push_back(input); + fused_input_types.push_back(input.getType()); + } + for (Value input : inputs2) { + if (!input_index_map.count(input)) { + input_index_map[input] = fused_inputs.size(); + fused_inputs.push_back(input); + fused_input_types.push_back(input.getType()); + } + } +} + +// Clones operations from a kernel block with input index mapping for sibling fusion. +void cloneKernelOpsWithIndexMap(Block &source_block, Block *fused_block, OpBuilder &builder, IRMapping &mapping, OperandRange kernel_inputs, const llvm::SmallDenseMap &input_index_map, SmallVectorImpl *yield_values) { + for (auto [idx, old_arg] : llvm::enumerate(source_block.getArguments())) { + Value original_input = kernel_inputs[idx]; + mapping.map(old_arg, fused_block->getArgument(input_index_map.lookup(original_input))); + } + for (Operation &op : source_block) { + if (auto yield_op = dyn_cast(&op)) { + if (yield_values) { + for (Value v : yield_op.getOperands()) { + yield_values->push_back(mapping.lookup(v)); + } + } + continue; + } + builder.clone(op, mapping); + } +} + +// Fuses a producer kernel into its consumer and returns the fused kernel. 
+neura::KernelOp fuseProducerConsumerKernels(neura::KernelOp producer, neura::KernelOp consumer, Value fused_value, OpBuilder &builder) { + Location loc = consumer.getLoc(); + + SmallVector fused_inputs; + SmallVector fused_input_types; + for (Value input : producer.getInputs()) { + fused_inputs.push_back(input); + fused_input_types.push_back(input.getType()); + } + for (Value input : consumer.getInputs()) { + if (input != fused_value) { + fused_inputs.push_back(input); + fused_input_types.push_back(input.getType()); + } + } + + SmallVector fused_output_types(consumer.getResultTypes()); + auto fused_kernel = builder.create(loc, fused_output_types, fused_inputs, consumer.getCgraIdAttr(), builder.getStringAttr("fused_producer_consumer"), consumer.getAcceleratorAttr()); + + Block *fused_block = builder.createBlock(&fused_kernel.getBody()); + for (Type t : fused_input_types) { + fused_block->addArgument(t, loc); + } + + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(fused_block); + + // Maps and clones producer's operations. + IRMapping producer_mapping; + Block &producer_block = producer.getBody().front(); + for (auto [old_arg, new_arg] : llvm::zip(producer_block.getArguments(), fused_block->getArguments().take_front(producer.getInputs().size()))) { + producer_mapping.map(old_arg, new_arg); + } + SmallVector producer_yields; + cloneKernelBlockOps(producer_block, builder, producer_mapping, producer_yields); + + // Maps and clones consumer's operations with fused value mapped to producer's output. + IRMapping consumer_mapping; + Block &consumer_block = consumer.getBody().front(); + unsigned consumer_input_idx = producer.getInputs().size(); + for (auto [idx, old_arg] : llvm::enumerate(consumer_block.getArguments())) { + Value original_input = consumer.getInputs()[idx]; + if (original_input == fused_value) { + consumer_mapping.map(old_arg, producer_yields.empty() ? 
Value() : producer_yields[0]); + } else { + consumer_mapping.map(old_arg, fused_block->getArgument(consumer_input_idx++)); + } + } + SmallVector consumer_yields; + cloneKernelBlockOps(consumer_block, builder, consumer_mapping, consumer_yields); + + builder.create(loc, consumer_yields); + return fused_kernel; +} + +// Fuses two sibling kernels and returns the fused kernel. +neura::KernelOp fuseSiblingKernels(neura::KernelOp kernel1, neura::KernelOp kernel2, OpBuilder &builder) { + Location loc = kernel1.getLoc(); + + SmallVector fused_inputs; + SmallVector fused_input_types; + llvm::SmallDenseMap input_index_map; + collectFusedInputs(kernel1.getInputs(), kernel2.getInputs(), fused_inputs, fused_input_types, input_index_map); + + SmallVector fused_output_types(kernel1.getResultTypes()); + fused_output_types.append(kernel2.getResultTypes().begin(), kernel2.getResultTypes().end()); + + auto fused_kernel = builder.create(loc, fused_output_types, fused_inputs, kernel1.getCgraIdAttr(), builder.getStringAttr("fused_sibling"), kernel1.getAcceleratorAttr()); + + Block *fused_block = builder.createBlock(&fused_kernel.getBody()); + for (Type t : fused_input_types) { + fused_block->addArgument(t, loc); + } + + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(fused_block); + + IRMapping mapping1; + Block &block1 = kernel1.getBody().front(); + SmallVector kernel1_yields; + cloneKernelOpsWithIndexMap(block1, fused_block, builder, mapping1, kernel1.getInputs(), input_index_map, &kernel1_yields); + + IRMapping mapping2; + Block &block2 = kernel2.getBody().front(); + SmallVector kernel2_yields; + cloneKernelOpsWithIndexMap(block2, fused_block, builder, mapping2, kernel2.getInputs(), input_index_map, &kernel2_yields); + + SmallVector all_yields(kernel1_yields); + all_yields.append(kernel2_yields); + builder.create(loc, all_yields); + + return fused_kernel; +} + +// Pattern that fuses a producer kernel into its consumer. 
+struct ProducerConsumerFusion : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(neura::KernelOp consumer, PatternRewriter &rewriter) const override { + neura::KernelOp producer = nullptr; + Value fused_value; + + for (Value input : consumer.getInputs()) { + auto def_op = input.getDefiningOp(); + if (!canFuseKernels(def_op, consumer)) { + continue; + } + bool has_only_one_use = llvm::all_of(def_op.getOutputs(), [](Value result) { + return result.hasOneUse() || result.use_empty(); + }); + if (!has_only_one_use || hasInterveningUses(def_op, consumer)) { + continue; + } + if (!isFusionProfitable(def_op, consumer, true, input)) { + continue; + } + producer = def_op; + fused_value = input; + break; + } + + if (!producer) { + return failure(); + } + + auto fused_kernel = fuseProducerConsumerKernels(producer, consumer, fused_value, rewriter); + rewriter.replaceOp(consumer, fused_kernel.getOutputs()); + rewriter.eraseOp(producer); + return success(); + } +}; + +// Pattern that fuses kernels sharing the same inputs without data dependencies. 
+struct SiblingFusion : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(neura::KernelOp kernel1, PatternRewriter &rewriter) const override { + neura::KernelOp kernel2 = nullptr; + + for (Operation *op = kernel1->getNextNode(); op; op = op->getNextNode()) { + if (auto next_kernel = dyn_cast(op)) { + if (areSiblingKernels(kernel1, next_kernel) && canFuseKernels(kernel1, next_kernel) && isFusionProfitable(kernel1, next_kernel, false)) { + kernel2 = next_kernel; + break; + } + } + } + + if (!kernel2) { + return failure(); + } + + auto fused_kernel = fuseSiblingKernels(kernel1, kernel2, rewriter); + + SmallVector kernel1_results, kernel2_results; + for (unsigned i = 0; i < kernel1.getNumResults(); ++i) { + kernel1_results.push_back(fused_kernel.getResult(i)); + } + for (unsigned i = 0; i < kernel2.getNumResults(); ++i) { + kernel2_results.push_back(fused_kernel.getResult(kernel1.getNumResults() + i)); + } + + rewriter.replaceOp(kernel1, kernel1_results); + rewriter.replaceOp(kernel2, kernel2_results); + return success(); + } +}; + +// Pass that fuses neura.kernel operations using producer-consumer and sibling fusion. 
+struct FuseKernelPass : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(FuseKernelPass) + + StringRef getArgument() const override { return "fuse-kernel"; } + StringRef getDescription() const override { return "Fuses neura.kernel operations using producer-consumer and sibling fusion."; } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + } + + void runOnOperation() override { + ModuleOp module = getOperation(); + + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext(), 10); + patterns.add(&getContext(), 5); + + FrozenRewritePatternSet frozen(std::move(patterns)); + module.walk([&](func::FuncOp func_op) { + if (failed(applyPatternsGreedily(func_op, frozen))) { + signalPassFailure(); + } + }); + + unsigned num_kernels = 0; + module.walk([&](neura::KernelOp) { ++num_kernels; }); + llvm::outs() << "[FuseKernelPass] Remaining kernels after fusion: " << num_kernels << "\n"; + } +}; + +} // namespace + +namespace mlir::neura { +std::unique_ptr createFuseKernelPass() { + return std::make_unique(); +} +} // namespace mlir::neura diff --git a/lib/NeuraDialect/Transforms/GenerateCodePass.cpp b/lib/NeuraDialect/Transforms/GenerateCodePass.cpp index 18766349..6c383bc6 100644 --- a/lib/NeuraDialect/Transforms/GenerateCodePass.cpp +++ b/lib/NeuraDialect/Transforms/GenerateCodePass.cpp @@ -85,6 +85,7 @@ static bool isCtrlMov(Operation *op) { return dyn_cast(op) != nullptr static bool isPhiStart(Operation *op) { return dyn_cast(op) != nullptr; } static bool isReserve(Operation *op) { return dyn_cast(op) != nullptr; } static bool isConstant(Operation *op) { return dyn_cast(op) != nullptr; } +static bool isFusedOp(Operation *op) { return dyn_cast(op) != nullptr; } // ---- Constant for phi_start operation ----. 
static constexpr unsigned kReserveOpIndex = 1; @@ -484,24 +485,29 @@ struct GenerateCodePass SmallVector &ctrl_movs, DenseMap &reserve_to_phi_map) { function.walk([&](Operation *op) { - // placement for every op (even for mov/reserve). + // Skips operations inside fused_op regions. + if (op->getParentOp() && isFusedOp(op->getParentOp())) { + return; + } + + // Records placement for every op (even for mov/reserve). operation_placements[op] = getTileLocation(op); - // build reserve -> phi mapping. + // Builds reserve -> phi mapping for loop-carried dependencies. if (isPhiStart(op)) { if (Value reserve = getReserveOperand(op)) { reserve_to_phi_map[reserve] = op; } } - // collect forwarders. + // Collects forwarders for later expansion. if (isDataMov(op)) { data_movs.push_back(op); return; } if (isCtrlMov(op)) { ctrl_movs.push_back(op); return; } - // skip Reserve from materialization. + // Skips Reserve from materialization. if (isReserve(op)) return; - // materialize all other ops placed on tiles (compute/phi/const/etc.). + // Materializes all other ops placed on tiles (compute/phi/const/fused_op/etc.). TileLocation placement = operation_placements[op]; if (!placement.has_tile) return; @@ -831,6 +837,31 @@ struct GenerateCodePass const DenseMap &reserve2phi) { if (!validateForwarderShape(forwarder)) return; + // Checks if this data_mov/ctrl_mov has mapping_locs assigned by MapToAcceleratorPass. + auto mapping_locs = getMappingLocations(forwarder); + if (!mapping_locs || mapping_locs.empty()) { + // Skips this mov operation - it will be handled by its consumer or does not need routing. + // This is expected for data_mov that only feeds into ctrl_mov. + if constexpr (!IsCtrl) { + // For data_mov without mapping, verifies if it is only used by ctrl_mov. 
+ bool only_ctrl_mov_users = true; + for (OpOperand &use : forwarder->getResult(0).getUses()) { + if (!isa(use.getOwner())) { + only_ctrl_mov_users = false; + break; + } + } + if (only_ctrl_mov_users) { + // This is expected - ctrl_mov handles this data transfer implicitly. + return; + } else { + // This data_mov has non-ctrl_mov users but no mapping - this is an error. + forwarder->emitWarning("data_mov without mapping_locs has non-ctrl_mov users"); + } + } + return; + } + MovBasics basics = buildMovBasics(forwarder, topo); emitMovRoutingInstructions(forwarder, basics, topo); @@ -1029,6 +1060,10 @@ struct GenerateCodePass if (operation == func.getOperation()) return; // Skips function itself. if (isReserve(operation)) return; // Skips reserve nodes entirely (bypass later). if (isa(operation)) return; // Skips yield nodes entirely (bypass later). + // Skips operations inside fused_op regions - they are handled by hardware + if (operation->getParentOp() && isFusedOp(operation->getParentOp())) { + return; + } int dfg_id = getDfgId(operation); if (dfg_id < 0) { diff --git a/lib/NeuraDialect/Transforms/InitExecLatencyPass.cpp b/lib/NeuraDialect/Transforms/InitExecLatencyPass.cpp new file mode 100644 index 00000000..c50023bc --- /dev/null +++ b/lib/NeuraDialect/Transforms/InitExecLatencyPass.cpp @@ -0,0 +1,184 @@ +//===- InitExecLatencyPass.cpp - Initialize Execution Latency --------------===// +// +// This pass initializes execution latency information. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "NeuraDialect/Architecture/ArchitectureSpec.h" +#include "NeuraDialect/NeuraDialect.h" +#include "NeuraDialect/NeuraOps.h" +#include "NeuraDialect/NeuraPasses.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/YAMLParser.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace mlir; + +#define GEN_PASS_DEF_INITEXECLATENCY +#include "NeuraDialect/NeuraPasses.h.inc" + +namespace { + +// Helper function to parse YAML scalar to integer +static bool parseYamlScalarInt(const llvm::yaml::Node *node, int &result) { + auto *scalar = llvm::dyn_cast_or_null(node); + if (!scalar) + return false; + llvm::SmallString<64> value_string; + llvm::StringRef value_ref = scalar->getValue(value_string); + long long temp_value = 0; + if (value_ref.getAsInteger(10, temp_value)) + return false; + result = static_cast(temp_value); + return true; +} + +// Helper function to parse YAML scalar to string +static bool parseYamlScalarString(const llvm::yaml::Node *node, + std::string &result) { + auto *scalar = llvm::dyn_cast_or_null(node); + if (!scalar) + return false; + llvm::SmallString<64> value_string; + llvm::StringRef value_ref = scalar->getValue(value_string); + result = value_ref.str(); + return true; +} + +// Parse latency YAML file: expects a mapping of operation names to latency values +static bool parseLatencyYaml(const std::string &file_path, + std::map &latency_map) { + llvm::ErrorOr> buffer_or_err = + llvm::MemoryBuffer::getFile(file_path); + if (!buffer_or_err) { + llvm::errs() << "[InitExecLatencyPass] Failed to open latency specification file: " + << file_path << "\n"; + return false; + } + + llvm::SourceMgr sm; + sm.AddNewSourceBuffer(std::move(*buffer_or_err), llvm::SMLoc()); + llvm::yaml::Stream yaml_stream( + 
sm.getMemoryBuffer(sm.getMainFileID())->getBuffer(), sm); + + llvm::yaml::Document &yaml_doc = *yaml_stream.begin(); + if (yaml_stream.failed()) { + llvm::errs() << "[InitExecLatencyPass] YAML parse error in: " << file_path << "\n"; + return false; + } + + auto *root = yaml_doc.getRoot(); + if (!root) { + llvm::errs() << "[InitExecLatencyPass] Empty YAML document\n"; + return false; + } + + auto *root_map = llvm::dyn_cast(root); + if (!root_map) { + llvm::errs() << "[InitExecLatencyPass] YAML root is not a mapping\n"; + return false; + } + + for (auto &key_value_pair : *root_map) { + auto *key_node = + llvm::dyn_cast_or_null(key_value_pair.getKey()); + if (!key_node) + continue; + + std::string op_name; + if (!parseYamlScalarString(key_node, op_name)) + continue; + + int latency_value = 0; + if (!parseYamlScalarInt(key_value_pair.getValue(), latency_value)) + continue; + + latency_map[op_name] = latency_value; + } + + return true; +} + +void SetLatency(Operation *op, std::map &latency_map) { + // Get operation name and look up latency + std::string op_name = op->getName().getStringRef().str(); + if (op_name.compare("neura.fused_op") == 0) { + op_name = op->getAttrOfType("pattern_name").getValue().str(); + } + op_name = op_name.substr(op_name.find_last_of(".") + 1); // remove neura. 
prefix if exists + auto it = latency_map.find(op_name); + if (it != latency_map.end()) { + op->setAttr("latency", + IntegerAttr::get(IntegerType::get(op->getContext(), 32), it->second)); + } + else { + op->setAttr("latency", + IntegerAttr::get(IntegerType::get(op->getContext(), 32), 1)); + } +} + +struct InitExecLatencyPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(InitExecLatencyPass) + + InitExecLatencyPass() = default; + InitExecLatencyPass(const InitExecLatencyPass &pass) + : PassWrapper>(pass) {} + + StringRef getArgument() const override { return "init-exec-latency"; } + StringRef getDescription() const override { + return "Initialize execution latency information."; + } + + void runOnOperation() override { + + ModuleOp module_op = getOperation(); + llvm::errs() << "[InitExecLatencyPass] Running init-exec-latency pass\n"; + // Get latency spec file from global function (set by command line) + std::string latency_file = mlir::neura::getLatencySpecFile(); + if (latency_file.empty()) { + latency_file = "latency_map.yaml"; // default file name + } + + llvm::errs() << "[InitExecLatencyPass] Latency file: " << latency_file << "\n"; + // Builds a map of operation name to latency + std::map latency_map; + if (!parseLatencyYaml(latency_file, latency_map)) { + llvm::errs() << "[InitExecLatencyPass] Failed to parse latency specification file: " << latency_file << "\n"; + return; + } + + // Apply latency values to operations + module_op.walk([&](Operation *op) { + if (!op->getRegions().empty()) { + for (Region ®ion : op->getRegions()) { + region.walk([&](Operation *inner_op) { + // Skip operations inside fused_op regions + if (inner_op->getParentOp() && isa(inner_op->getParentOp())) { + return; + } + + if (inner_op->getName().getStringRef().str() == "neura.data_mov" || inner_op->getName().getStringRef().str() == "neura.reserve") { + return; + } + + SetLatency(inner_op, latency_map); + }); + } + } + }); + } +}; + +} // namespace + +namespace 
mlir::neura { +std::unique_ptr createInitExecLatencyPass() { + return std::make_unique(); +} +} // namespace mlir::neura diff --git a/test/neura/kernel_fusion/kernel.cpp b/test/neura/kernel_fusion/kernel.cpp new file mode 100644 index 00000000..6e8e44d2 --- /dev/null +++ b/test/neura/kernel_fusion/kernel.cpp @@ -0,0 +1,150 @@ +// Test cases for FuseKernelPass +// +// Build workflow using Polygeist: +// 1. cgeist kernel_fusion.cpp -S -O2 -> SCF loops (kernel_fusion_scf.mlir) +// 2. polygeist-opt --raise-scf-to-affine -> Affine loops (kernel_fusion_affine.mlir) +// 3. mlir-neura-opt --wrap-loop-in-kernel -> neura.kernel ops (kernel_fusion_wrapped.mlir) +// 4. mlir-neura-opt --fuse-kernel -> Fused kernels (kernel_fusion_fused.mlir) + +#define N 64 + +float A[N], B[N], C[N], D[N], E[N], F[N], G[N], H[N], X[N], Y[N]; + +// Producer-Consumer Fusion: kernel0 -> kernel1 +// kernel0: C[i] = A[i] + B[i] +// kernel1: D[i] = C[i] * 2.0 +void test_producer_consumer_fusion(float A[], float B[], float C[], float D[]) { + for (int i = 0; i < N; i++) { + C[i] = A[i] + B[i]; + } + + for (int i = 0; i < N; i++) { + D[i] = C[i] * 2.0f; + } +} + +// Multiple Consumers: kernel0 -> kernel1, kernel0 -> kernel2 +// kernel0: C[i] = A[i] + B[i] +// kernel1: D[i] = C[i] * 2.0 +// kernel2: E[i] = C[i] + 1.0 +void test_multiple_consumers(float A[], float B[], float C[], float D[], float E[]) { + for (int i = 0; i < N; i++) { + C[i] = A[i] + B[i]; + } + + for (int i = 0; i < N; i++) { + D[i] = C[i] * 2.0f; + } + + for (int i = 0; i < N; i++) { + E[i] = C[i] + 1.0f; + } +} + +// Sibling Fusion: kernel0 || kernel1 (share input A) +// kernel0: E[i] = A[i] * 3.0 +// kernel1: F[i] = A[i] + 1.0 +void test_sibling_fusion(float A[], float E[], float F[]) { + for (int i = 0; i < N; i++) { + E[i] = A[i] * 3.0f; + } + + for (int i = 0; i < N; i++) { + F[i] = A[i] + 1.0f; + } +} + +// No Shared Input: kernel0, kernel1 (no fusion - different inputs) +// kernel0: G[i] = X[i] * 2.0 +// kernel1: H[i] = Y[i] + 
3.0 +void test_no_shared_input(float X[], float Y[], float G[], float H[]) { + for (int i = 0; i < N; i++) { + G[i] = X[i] * 2.0f; + } + + for (int i = 0; i < N; i++) { + H[i] = Y[i] + 3.0f; + } +} + +// Chain Fusion: kernel0 -> kernel1 -> kernel2 +// kernel0: C[i] = A[i] + B[i] +// kernel1: D[i] = C[i] * 2.0 +// kernel2: E[i] = D[i] + 1.0 +void test_chain_fusion(float A[], float B[], float C[], float D[], float E[]) { + for (int i = 0; i < N; i++) { + C[i] = A[i] + B[i]; + } + + for (int i = 0; i < N; i++) { + D[i] = C[i] * 2.0f; + } + + for (int i = 0; i < N; i++) { + E[i] = D[i] + 1.0f; + } +} + +// Complex Sibling: (kernel0 || kernel1 || kernel2), kernel3 +// kernel0: C[i] = A[i] * 2.0 +// kernel1: D[i] = A[i] + 1.0 } siblings (share A) +// kernel2: E[i] = A[i] - 1.0 +// kernel3: F[i] = B[i] * 3.0 (independent) +void test_complex_sibling(float A[], float B[], float C[], float D[], float E[], float F[]) { + for (int i = 0; i < N; i++) { + C[i] = A[i] * 2.0f; + } + + for (int i = 0; i < N; i++) { + D[i] = A[i] + 1.0f; + } + + for (int i = 0; i < N; i++) { + E[i] = A[i] - 1.0f; + } + + for (int i = 0; i < N; i++) { + F[i] = B[i] * 3.0f; + } +} + +// Mixed Patterns: (kernel0 -> kernel3) || (kernel1 || kernel2) +// kernel0: C[i] = A[i] + B[i] ─┐ +// kernel1: D[i] = A[i] * 2.0 ├─ siblings (share A) +// kernel2: E[i] = A[i] + 3.0 ─┘ +// kernel3: F[i] = C[i] * 2.0 (consumer of kernel0) +void test_mixed_patterns(float A[], float B[], float C[], float D[], float E[], float F[]) { + for (int i = 0; i < N; i++) { + C[i] = A[i] + B[i]; + } + + for (int i = 0; i < N; i++) { + D[i] = A[i] * 2.0f; + } + + for (int i = 0; i < N; i++) { + E[i] = A[i] + 3.0f; + } + + for (int i = 0; i < N; i++) { + F[i] = C[i] * 2.0f; + } +} + +int main() { + for (int i = 0; i < N; i++) { + A[i] = (float)i; + B[i] = (float)(i * 2); + X[i] = (float)(i + 1); + Y[i] = (float)(i - 1); + } + + test_producer_consumer_fusion(A, B, C, D); + test_sibling_fusion(A, E, F); + test_no_shared_input(X, Y, G, 
H); + test_chain_fusion(A, B, C, D, E); + test_complex_sibling(A, B, C, D, E, F); + test_mixed_patterns(A, B, C, D, E, F); + + return 0; +} diff --git a/test/neura/kernel_fusion/test.mlir b/test/neura/kernel_fusion/test.mlir new file mode 100644 index 00000000..3693ec24 --- /dev/null +++ b/test/neura/kernel_fusion/test.mlir @@ -0,0 +1,210 @@ +// RUN: mlir-neura-opt --wrap-loop-in-kernel --fuse-kernel %s 2>&1 | FileCheck %s + +// ============================================================================= +// TEST 1: Producer-Consumer Fusion +// Expected: Both loops should be fused into a single kernel. +// ============================================================================= + +// CHECK-LABEL: func.func @test_producer_consumer_fusion(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref) { +// CHECK: neura.kernel ins(%arg0, %arg1, %arg2, %cst, %arg3 : memref, memref, memref, f32, memref) attributes {kernel_name = "fused_sibling"} { +// CHECK: affine.for +// CHECK: arith.addf +// CHECK: affine.for +// CHECK: arith.mulf +// CHECK-NOT: neura.kernel +// CHECK: return + +func.func @test_producer_consumer_fusion(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref) { + %cst = arith.constant 2.000000e+00 : f32 + affine.for %arg4 = 0 to 64 { + %0 = memref.load %arg0[%arg4] : memref + %1 = memref.load %arg1[%arg4] : memref + %2 = arith.addf %0, %1 : f32 + memref.store %2, %arg2[%arg4] : memref + } + affine.for %arg4 = 0 to 64 { + %0 = memref.load %arg2[%arg4] : memref + %1 = arith.mulf %0, %cst : f32 + memref.store %1, %arg3[%arg4] : memref + } + return +} + +// ============================================================================= +// TEST 2: Sibling Fusion +// Expected: Both loops should be fused into a single kernel. 
+// ============================================================================= + +// CHECK-LABEL: func.func @test_sibling_fusion(%arg0: memref, %arg1: memref, %arg2: memref) { +// CHECK: neura.kernel ins(%arg0, %cst_0, %arg1, %cst, %arg2 : memref, f32, memref, f32, memref) attributes {kernel_name = "fused_sibling"} { +// CHECK: affine.for +// CHECK: arith.mulf +// CHECK: affine.for +// CHECK: arith.addf +// CHECK-NOT: neura.kernel +// CHECK: return + +func.func @test_sibling_fusion(%arg0: memref, %arg1: memref, %arg2: memref) { + %cst = arith.constant 1.000000e+00 : f32 + %cst_0 = arith.constant 3.000000e+00 : f32 + affine.for %arg3 = 0 to 64 { + %0 = memref.load %arg0[%arg3] : memref + %1 = arith.mulf %0, %cst_0 : f32 + memref.store %1, %arg1[%arg3] : memref + } + affine.for %arg3 = 0 to 64 { + %0 = memref.load %arg0[%arg3] : memref + %1 = arith.addf %0, %cst : f32 + memref.store %1, %arg2[%arg3] : memref + } + return +} + +// ============================================================================= +// TEST 3: No Shared Input (No Fusion) +// Expected: Kernels should NOT be fused as siblings since they don't share input. 
+// ============================================================================= +// CHECK-LABEL: func.func @test_no_shared_input(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref) { +// CHECK: neura.kernel +// CHECK-SAME: kernel_name = "kernel_0" +// CHECK: neura.kernel +// CHECK-SAME: kernel_name = "kernel_1" +// CHECK: return + +func.func @test_no_shared_input(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref) { + %cst = arith.constant 3.000000e+00 : f32 + %cst_0 = arith.constant 2.000000e+00 : f32 + affine.for %arg4 = 0 to 64 { + %0 = memref.load %arg0[%arg4] : memref + %1 = arith.mulf %0, %cst_0 : f32 + memref.store %1, %arg2[%arg4] : memref + } + affine.for %arg4 = 0 to 64 { + %0 = memref.load %arg1[%arg4] : memref + %1 = arith.addf %0, %cst : f32 + memref.store %1, %arg3[%arg4] : memref + } + return +} + +// ============================================================================= +// TEST 4: Chain fusion: A -> B -> C +// Expected: All kernels should be fused into a single kernel. 
+// ============================================================================= +// CHECK-LABEL: func.func @test_chain_fusion(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref) { +// CHECK: neura.kernel ins(%arg0, %arg1, %arg2, %cst_0, %arg3, %cst, %arg4 : memref, memref, memref, f32, memref, f32, memref) attributes {kernel_name = "fused_sibling"} { +// CHECK: affine.for +// CHECK: arith.addf +// CHECK: affine.for +// CHECK: arith.mulf +// CHECK: affine.for +// CHECK: arith.addf +// CHECK-NOT: neura.kernel +// CHECK: return + +func.func @test_chain_fusion(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref) { + %cst = arith.constant 1.000000e+00 : f32 + %cst_0 = arith.constant 2.000000e+00 : f32 + affine.for %arg5 = 0 to 64 { + %0 = memref.load %arg0[%arg5] : memref + %1 = memref.load %arg1[%arg5] : memref + %2 = arith.addf %0, %1 : f32 + memref.store %2, %arg2[%arg5] : memref + } + affine.for %arg5 = 0 to 64 { + %0 = memref.load %arg2[%arg5] : memref + %1 = arith.mulf %0, %cst_0 : f32 + memref.store %1, %arg3[%arg5] : memref + } + affine.for %arg5 = 0 to 64 { + %0 = memref.load %arg3[%arg5] : memref + %1 = arith.addf %0, %cst : f32 + memref.store %1, %arg4[%arg5] : memref + } + return +} + +// ============================================================================= +// TEST 5: Complex Sibling Fusion +// Expected: Siblings that share inputs should be fused, but kernel_3 should remain as a separate kernel. 
+// ============================================================================= + +// CHECK-LABEL: func.func @test_complex_sibling(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref) { +// CHECK: neura.kernel ins(%arg0, %cst_1, %arg2, %cst_0, %arg3, %arg4 : memref, f32, memref, f32, memref, memref) attributes {kernel_name = "fused_sibling"} { +// CHECK: affine.for +// CHECK: arith.mulf +// CHECK: affine.for +// CHECK: arith.addf +// CHECK: affine.for +// CHECK: arith.subf +// CHECK: neura.kernel +// CHECK-SAME: kernel_name = "kernel_3" +// CHECK: return + +func.func @test_complex_sibling(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref) { + %cst = arith.constant 3.000000e+00 : f32 + %cst_0 = arith.constant 1.000000e+00 : f32 + %cst_1 = arith.constant 2.000000e+00 : f32 + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg0[%arg6] : memref + %1 = arith.mulf %0, %cst_1 : f32 + memref.store %1, %arg2[%arg6] : memref + } + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg0[%arg6] : memref + %1 = arith.addf %0, %cst_0 : f32 + memref.store %1, %arg3[%arg6] : memref + } + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg0[%arg6] : memref + %1 = arith.subf %0, %cst_0 : f32 + memref.store %1, %arg4[%arg6] : memref + } + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg1[%arg6] : memref + %1 = arith.mulf %0, %cst : f32 + memref.store %1, %arg5[%arg6] : memref + } + return +} + +// ============================================================================= +// TEST 6: Mixed Patterns +// Expected: All four loops should be fused into a single kernel. 
+// ============================================================================= + +// CHECK-LABEL: func.func @test_mixed_patterns(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref) { +// CHECK: neura.kernel ins(%arg0, %arg1, %arg2, %cst_0, %arg3, %cst, %arg4, %arg5 : memref, memref, memref, f32, memref, f32, memref, memref) attributes {kernel_name = "fused_sibling"} { +// CHECK: affine.for +// CHECK: affine.for +// CHECK: affine.for +// CHECK: affine.for +// CHECK-NOT: neura.kernel +// CHECK: return + +func.func @test_mixed_patterns(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref) { + %cst = arith.constant 3.000000e+00 : f32 + %cst_0 = arith.constant 2.000000e+00 : f32 + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg0[%arg6] : memref + %1 = memref.load %arg1[%arg6] : memref + %2 = arith.addf %0, %1 : f32 + memref.store %2, %arg2[%arg6] : memref + } + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg0[%arg6] : memref + %1 = arith.mulf %0, %cst_0 : f32 + memref.store %1, %arg3[%arg6] : memref + } + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg0[%arg6] : memref + %1 = arith.addf %0, %cst : f32 + memref.store %1, %arg4[%arg6] : memref + } + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg2[%arg6] : memref + %1 = arith.mulf %0, %cst_0 : f32 + memref.store %1, %arg5[%arg6] : memref + } + return +} diff --git a/tools/mlir-neura-opt/mlir-neura-opt.cpp b/tools/mlir-neura-opt/mlir-neura-opt.cpp index f7569960..fd2a7ca8 100644 --- a/tools/mlir-neura-opt/mlir-neura-opt.cpp +++ b/tools/mlir-neura-opt/mlir-neura-opt.cpp @@ -31,12 +31,18 @@ // Global variable to store architecture spec file path static std::string architecture_spec_file; static mlir::neura::TileDefaults tile_defaults; +static std::string latency_spec_file; // Function to get the architecture spec file path std::string mlir::neura::getArchitectureSpecFile() { return architecture_spec_file; } +// Function to get 
the latency spec file path +std::string mlir::neura::getLatencySpecFile() { + return latency_spec_file; +} + // Function to get tile defaults configuration mlir::neura::TileDefaults mlir::neura::getTileDefaults() { return tile_defaults; @@ -60,6 +66,15 @@ int main(int argc, char **argv) { architecture_spec_file = arg_ref.substr(strlen("--architecture-spec=")).str(); continue; + } else if (arg_ref == "--latency-spec") { + if (i + 1 < argc) { + latency_spec_file = argv[i + 1]; + ++i; // skip value + continue; + } + } else if (arg_ref.starts_with("--latency-spec=")) { + latency_spec_file = arg_ref.substr(strlen("--latency-spec=")).str(); + continue; } forwarded_args.push_back(argv[i]); }