diff --git a/include/NeuraDialect/Architecture/Architecture.h b/include/NeuraDialect/Architecture/Architecture.h index 7052e6d0..4f8e5cc2 100644 --- a/include/NeuraDialect/Architecture/Architecture.h +++ b/include/NeuraDialect/Architecture/Architecture.h @@ -494,6 +494,23 @@ class Architecture { // Checks if the architecture supports counter operations. bool canSupportCounter() const; + // Clones the architecture but with new per-cgra dimensions. + // The provided tile_overrides will be appended to the existing ones. + // + // Example — create an 8×4 tile array (2×1 CGRA rectangle) with all tiles + // present: + // auto arch_2x1 = getArchitecture().cloneWithNewDimensions(8, 4); + // + // Example — create a 12×8 bounding box for a T-shape (4 CGRAs) where only + // specific tiles are valid: + // std::vector overrides; + // // First mark all tiles as non-existent, then mark valid ones existent. + // // (see MapToAcceleratorPass for the full valid_tiles parsing logic) + // auto arch_T = getArchitecture().cloneWithNewDimensions(8, 12, overrides); + std::unique_ptr cloneWithNewDimensions( + int new_per_cgra_rows, int new_per_cgra_columns, + const std::vector &additional_overrides = {}) const; + private: // Helper methods for constructor initialization. void initializeTiles(int rows, int columns); @@ -532,6 +549,13 @@ class Architecture { int per_cgra_rows_; int per_cgra_columns_; int max_ctrl_mem_items_; + + BaseTopology multi_cgra_base_topology_; + BaseTopology per_cgra_base_topology_; + TileDefaults tile_defaults_; + std::vector tile_overrides_; + LinkDefaults link_defaults_; + std::vector link_overrides_; }; // Function for getting the architecture object. 
diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index 75ddbd24..56a9e785 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -23,7 +23,10 @@ std::unique_ptr createInsertCtrlMovPass(); std::unique_ptr createAssignAcceleratorPass(); std::unique_ptr createTransformCtrlToDataFlowPass(); std::unique_ptr createLeveragePredicatedValuePass(); -std::unique_ptr createMapToAcceleratorPass(); +// Creates the MapToAccelerator pass. Tile dimensions default to 0 (use +// architecture singleton) when not specified via options. +std::unique_ptr createMapToAcceleratorPass( + const MapToAcceleratorOptions &options = MapToAcceleratorOptions{}); std::unique_ptr createGenerateCodePass(); std::unique_ptr createCanonicalizeReturnPass(); std::unique_ptr createCanonicalizeLiveInPass(); diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td index 123bf1c8..f7fc06a3 100644 --- a/include/NeuraDialect/NeuraPasses.td +++ b/include/NeuraDialect/NeuraPasses.td @@ -54,7 +54,41 @@ def MapToAccelerator : Pass<"map-to-accelerator", "ModuleOp"> { let summary = "Map Neura operations onto a given accelerator"; let description = [{ This pass performs mapping from Neura operations to accelerator. + + x-tiles and y-tiles specify the **tile** dimensions of the target array + (not the CGRA count). Each CGRA contains a per_cgra_rows × per_cgra_cols + tile grid (currently 4×4). So for a single CGRA, x-tiles=4 y-tiles=4; + for a 1×2 rectangular pair, x-tiles=8 y-tiles=4; etc. + + When x-tiles=0 and y-tiles=0 (the default), the global Architecture + singleton determines the tile grid — this is equivalent to a single CGRA. 
+ + Examples: + Single CGRA (default): + --map-to-accelerator + 1×3 rectangular (3 CGRAs in a row): + --map-to-accelerator x-tiles=12 y-tiles=4 + T-shape (4 CGRAs: top row of 3 + centre below): + --map-to-accelerator x-tiles=12 y-tiles=8 \ + valid-tiles="0_0,1_0,2_0,3_0,4_0,5_0,6_0,7_0,8_0,9_0,10_0,11_0,\ + 4_1,5_1,6_1,7_1,4_4,5_4,6_4,7_4,4_5,5_5,6_5,7_5" }]; + let options = [ + Option<"x_tiles", "x-tiles", "int", /*default=*/"0", + "Total number of tiles in the X dimension of the target array " + "(not the number of CGRAs). Each CGRA contributes per_cgra_cols " + "tiles. 0 means use the global Architecture singleton (1 CGRA).">, + Option<"y_tiles", "y-tiles", "int", /*default=*/"0", + "Total number of tiles in the Y dimension of the target array " + "(not the number of CGRAs). Each CGRA contributes per_cgra_rows " + "tiles. 0 means use the global Architecture singleton (1 CGRA).">, + Option<"valid_tiles", "valid-tiles", "std::string", /*default=*/"\"\"", + "Comma-separated list of tile coordinates (x_y) that are actually " + "present in the array, used for non-rectangular CGRA shapes such as " + "L-blocks or T-blocks. Empty string means all tiles in the " + "x-tiles x y-tiles rectangle are valid. 
" + "Example: 0_0,1_0,0_1 selects three tiles forming an L-shape."> + ]; let constructor = "neura::createMapToAcceleratorPass()"; } diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index a407b37f..a23c5b02 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -29,6 +29,7 @@ std::unique_ptr createMapTaskOnCgraPass(); std::unique_ptr createAffineLoopTreeSerializationPass(); std::unique_ptr createAffineLoopPerfectionPass(); std::unique_ptr createMemoryAccessStreamingFusionPass(); +std::unique_ptr createResourceAwareTaskOptimizationPass(); #define GEN_PASS_REGISTRATION #include "TaskflowDialect/TaskflowPasses.h.inc" diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 04d40bc4..8d765498 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -86,4 +86,48 @@ def MemoryAccessStreamingFusion : Pass<"memory-access-streaming-fusion", "func:: }]; let constructor = "taskflow::createMemoryAccessStreamingFusionPass()"; } + +def ResourceAwareTaskOptimization : Pass<"resource-aware-task-optimization", "func::FuncOp"> { + let summary = "Balances pipeline latency and fuses independent tasks for CGRA utilization."; + let description = [{ + Two-phase optimization: + 1. Utilization Fusion: merges independent (no-edge) tasks, selecting pairs + that minimize |trip_count_a - trip_count_b| for balanced utilization. + 2. Pipeline Balance: allocates extra CGRAs to critical-path bottleneck tasks. + More CGRAs combine tile arrays into larger arrays for mapping, potentially + lowering compiled_ii. Latency model: II * (trip_count - 1) + steps. + Targets a 4x4 CGRA grid (16 CGRAs total, one CGRA per cell). + Currently a single task may be allocated at most 4 CGRAs. + Supported CGRA array shapes for a task (all fit within the 4×4 grid): + - rect : a perfect rectangle, e.g. 
1×1, 1×2, 2×1, 1×3, 3×1, 2×2, 1×4, 4×1. + - L : an L-shaped block of 3 or 4 CGRAs, e.g. + 3 CGRAs: (0,0)(1,0)(0,1) — two in a row + one below-left. + 4 CGRAs: (0,0)(0,1)(0,2)(1,2) — three in a column + one offset. + - T : a T-shaped block of 4 CGRAs, e.g. + (0,0)(1,0)(2,0)(1,1) — three in a row + one below centre. + Non-rectangular shapes are represented by their bounding box plus an + explicit tile list that enumerates only the occupied CGRA positions. + Compiled_ii must come from the downstream Neura pipeline (asserts on failure). + + Use --estimation-mode to control how task II/steps are estimated: + compiled (default): runs the full Neura lowering + mapping pipeline + for accurate compiled_ii and steps. + analytical : uses only ResMII/RecMII analytical estimates without + running the mapper — much faster but less accurate. + Useful for rapid design-space exploration. + Example: + --resource-aware-task-optimization estimation-mode=analytical + }]; + let options = [ + Option<"estimationMode", "estimation-mode", "std::string", + /*default=*/"\"compiled\"", + "Profiling estimation mode: 'compiled' (default) runs the full " + "Neura lowering + mapping pipeline; 'analytical' uses only " + "ResMII/RecMII analytical estimates (faster but less accurate)."> + ]; + let constructor = "taskflow::createResourceAwareTaskOptimizationPass()"; + let dependentDialects = [ + "mlir::affine::AffineDialect", + "mlir::func::FuncDialect"]; +} #endif // TASKFLOW_PASSES_TD \ No newline at end of file diff --git a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp index a6e68ef9..089af123 100644 --- a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp +++ b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp @@ -300,6 +300,74 @@ struct ArithIndexCastToNeuraCast } }; +struct ArithMinimumFToNeuraFCmpSel + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::MinimumFOp op, + 
PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + Location loc = op.getLoc(); + + // minimumf(a, b) → sel(fcmp(a, b, "olt"), a, b) + // "olt" = ordered less-than: true when a < b (false if either is NaN). + Value cmp = rewriter.create( + loc, result_type, lhs, rhs, rewriter.getStringAttr("olt")); + rewriter.replaceOpWithNewOp(op, result_type, cmp, lhs, rhs); + return success(); + } +}; + +struct ArithMaximumFToNeuraFCmpSel + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::MaximumFOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + Location loc = op.getLoc(); + + // maximumf(a, b) → sel(fcmp(a, b, "ogt"), a, b) + // "ogt" = ordered greater-than: true when a > b (false if either is NaN). + Value cmp = rewriter.create( + loc, result_type, lhs, rhs, rewriter.getStringAttr("ogt")); + rewriter.replaceOpWithNewOp(op, result_type, cmp, lhs, rhs); + return success(); + } +}; + +// arith.andi(a, b) → neura.and(a, b) +struct ArithAndIToNeuraAnd : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::AndIOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs); + return success(); + } +}; + +// arith.ori(a, b) → neura.or(a, b) +struct ArithOrIToNeuraOr : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::OrIOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs); + return success(); + } +}; + struct LowerArithToNeuraPass : public PassWrapper> { 
@@ -322,7 +390,9 @@ struct LowerArithToNeuraPass ArithExtUIToNeuraCast, ArithIndexCastToNeuraCast, ArithFDivToNeuraFDiv, ArithExtfToNeuraCast, ArithMulFToNeuraFMul, ArithSubIToNeuraSub, ArithSubFToNeuraFSub, ArithMulIToNeuraMul, - ArithDivSIToNeuraDiv, ArithRemSIToNeuraOp>(context); + ArithDivSIToNeuraDiv, ArithRemSIToNeuraOp, + ArithMinimumFToNeuraFCmpSel, ArithMaximumFToNeuraFCmpSel, + ArithAndIToNeuraAnd, ArithOrIToNeuraOr>(context); return patterns; } diff --git a/lib/NeuraDialect/Architecture/Architecture.cpp b/lib/NeuraDialect/Architecture/Architecture.cpp index 7141cb46..4e2737a3 100644 --- a/lib/NeuraDialect/Architecture/Architecture.cpp +++ b/lib/NeuraDialect/Architecture/Architecture.cpp @@ -561,12 +561,15 @@ Architecture::Architecture(int multi_cgra_rows, int multi_cgra_columns, const std::vector &link_overrides) { this->multi_cgra_rows_ = multi_cgra_rows; this->multi_cgra_columns_ = multi_cgra_columns; - // TODO: Support multi-CGRA topology in the future: - // https://github.com/coredac/dataflow/issues/163. - // this->multi_cgra_base_topology_ = multi_cgra_base_topology; + this->multi_cgra_base_topology_ = multi_cgra_base_topology; this->per_cgra_rows_ = per_cgra_rows; this->per_cgra_columns_ = per_cgra_columns; + this->per_cgra_base_topology_ = per_cgra_base_topology; this->max_ctrl_mem_items_ = max_ctrl_mem_items; + this->tile_defaults_ = tile_defaults; + this->tile_overrides_ = tile_overrides; + this->link_defaults_ = link_defaults; + this->link_overrides_ = link_overrides; // Initializes architecture components using helper methods. 
initializeTiles(per_cgra_rows, per_cgra_columns); @@ -576,6 +579,20 @@ Architecture::Architecture(int multi_cgra_rows, int multi_cgra_columns, applyLinkOverrides(link_overrides); } +std::unique_ptr Architecture::cloneWithNewDimensions( + int new_per_cgra_rows, int new_per_cgra_columns, + const std::vector &additional_overrides) const { + + std::vector merged_overrides = tile_overrides_; + merged_overrides.insert(merged_overrides.end(), additional_overrides.begin(), additional_overrides.end()); + + return std::make_unique( + multi_cgra_rows_, multi_cgra_columns_, multi_cgra_base_topology_, + new_per_cgra_rows, new_per_cgra_columns, max_ctrl_mem_items_, + per_cgra_base_topology_, tile_defaults_, merged_overrides, + link_defaults_, link_overrides_); +} + Tile *Architecture::getTile(int id) { auto it = id_to_tile_.find(id); assert(it != id_to_tile_.end() && "Tile with given ID not found"); diff --git a/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp b/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp index 1c887a67..b8a70018 100644 --- a/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp +++ b/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp @@ -23,8 +23,20 @@ struct InsertDataMovForNeuraOps : public RewritePattern { LogicalResult matchAndRewrite(Operation *op, PatternRewriter &rewriter) const override { + // Only processes operations from the neura dialect. Operations from + // other dialects (arith, math, etc.) should have been lowered to neura + // ops by earlier passes (LowerArithToNeura, etc.) before this pass runs. if (op->getDialect()->getNamespace() != accel::kNeuraTarget || - isa(op)) { + isa(op) || + // ReserveOp creates a loop-carried placeholder in the dataflow + // recurrence cycle: %v = neura.reserve; neura.ctrl_mov %next -> %v. + // Its result must NOT be wrapped in DataMovOp, because ctrl_mov needs + // a direct reference to the same SSA value used by phi_start. 
+ // Inserting a DataMovOp between reserve and its consumers would break + // the ctrl_mov→reserve back-edge and corrupt the recurrence cycle. + isa(op) || + isa(op) || + isa(op)) { return failure(); } @@ -91,8 +103,9 @@ struct InsertDataMovForNeuraOps : public RewritePattern { for (Value operand : op->getOperands()) { Operation *producer = operand.getDefiningOp(); - // Skips adding mov for any operand that comes from a reserve op or - // already from data_mov. + // Does NOT wrap operands that come from reserve: the reserve result + // is the recurrence back-edge target for ctrl_mov. Wrapping it would + // produce a new SSA value, breaking the ctrl_mov→reserve cycle. if (producer && (isa(producer) || isa(producer))) { new_operands.push_back(operand); diff --git a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp index 9b5ee423..f6166968 100644 --- a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp +++ b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp @@ -46,6 +46,11 @@ struct MapToAcceleratorPass } MapToAcceleratorPass() = default; + MapToAcceleratorPass(const MapToAcceleratorOptions &options) : MapToAcceleratorPass() { + this->x_tiles = options.x_tiles; + this->y_tiles = options.y_tiles; + this->valid_tiles = options.valid_tiles; + } MapToAcceleratorPass(const MapToAcceleratorPass &pass) : PassWrapper>(pass) {} Option mappingStrategy{ @@ -72,6 +77,18 @@ struct MapToAcceleratorPass llvm::cl::desc( "Dump the resource allocation table after mapping (default: true)"), llvm::cl::init(true)}; + Option x_tiles{ + *this, "x-tiles", + llvm::cl::desc("Override number of tiles in X dimension (0 = default)."), + llvm::cl::init(0)}; + Option y_tiles{ + *this, "y-tiles", + llvm::cl::desc("Override number of tiles in Y dimension (0 = default)."), + llvm::cl::init(0)}; + Option valid_tiles{ + *this, "valid-tiles", + llvm::cl::desc("Comma separated list of valid tile coords x_y,x_y to support non-rectangular 
shapes."), + llvm::cl::init("")}; // Configures mapping strategy and mode based on command-line options. bool configureMappingStrategy(StringRef mapping_strategy_opt, @@ -239,15 +256,15 @@ struct MapToAcceleratorPass } // Filters out operations inside fused_op regions. - // Only map the fused_op itself, not the operations within its region + // Only maps the fused_op itself, not the operations within its region. std::vector filtered_ops; int skipped_count = 0; for (Operation *op : topologically_sorted_ops) { Operation *parent_op = op->getParentOp(); - // Check if parent is a fused_op by checking operation name + // Checks if the parent is a fused_op by inspecting the operation name. if (parent_op && parent_op->getName().getStringRef().contains(attr::val::kOpFused)) { - // Skip operations inside fused_op region + // Skips operations inside a fused_op region. llvm::outs() << "[MapToAcceleratorPass] Skipping op inside fused_op: " << *op << "\n"; skipped_count++; @@ -290,9 +307,9 @@ struct MapToAcceleratorPass MappingState mapping_state(architecture, ii, is_spatial_only); if (mapping_strategy->map(sorted_ops_with_alap_levels, critical_ops, architecture, mapping_state)) { - // success + // Success. if (dumpMappingTable) { - // logs to stderr + // Logs to stderr. mapping_state.dumpOpToLocs(); } mapping_state.encodeMappingState(); @@ -355,7 +372,53 @@ struct MapToAcceleratorPass return; } - const Architecture &architecture = mlir::neura::getArchitecture(); + const Architecture &global_arch = mlir::neura::getArchitecture(); + std::unique_ptr custom_arch; + const Architecture *target_arch = &global_arch; + + if (x_tiles.getValue() > 0 && y_tiles.getValue() > 0) { + std::vector additional_overrides; + if (!valid_tiles.getValue().empty()) { + llvm::SmallVector coords; + llvm::StringRef(valid_tiles.getValue()).split(coords, ','); + + // Default: mark all tiles as non-existent first if valid_tiles provided. 
+ for (int y = 0; y < y_tiles.getValue(); ++y) { + for (int x = 0; x < x_tiles.getValue(); ++x) { + TileOverride to; + to.tile_x = x; + to.tile_y = y; + to.existence = false; + additional_overrides.push_back(to); + } + } + + // Then mark the valid ones as existent. + for (llvm::StringRef coord : coords) { + auto pair = coord.split('_'); + int x, y; + if (!pair.first.getAsInteger(10, x) && !pair.second.getAsInteger(10, y)) { + TileOverride to; + to.tile_x = x; + to.tile_y = y; + to.existence = true; + additional_overrides.push_back(to); + } + } + } + + // Builds a custom architecture with the requested tile dimensions. + // For non-rectangular shapes, tiles marked existence=false are removed + // before inter-tile links are created, so no boundary links connect to + // absent tiles. + custom_arch = global_arch.cloneWithNewDimensions( + y_tiles.getValue(), x_tiles.getValue(), additional_overrides); + target_arch = custom_arch.get(); + llvm::errs() << "[MapToAcceleratorPass] Overriding architecture dimensions to " + << y_tiles.getValue() << "x" << x_tiles.getValue() << " tiles.\n"; + } + + const Architecture &architecture = *target_arch; // Maps kernels. 
module.walk([&](neura::KernelOp kernel_op) { @@ -398,8 +461,9 @@ struct MapToAcceleratorPass namespace mlir::neura { -std::unique_ptr createMapToAcceleratorPass() { - return std::make_unique(); +std::unique_ptr createMapToAcceleratorPass( + const MapToAcceleratorOptions &options) { + return std::make_unique(options); } } // namespace mlir::neura diff --git a/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt b/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt index 9f56a1f3..9e7faebc 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt @@ -4,17 +4,23 @@ add_mlir_conversion_library(MLIRTaskflowOptimization AffineLoopTreeSerializationPass.cpp AffineLoopPerfectionPass.cpp MemoryAccessStreamingFusion.cpp + ResourceAwareTaskOptimizationPass.cpp DEPENDS MLIRTaskflowTransformsIncGen LINK_LIBS PUBLIC MLIRTaskflow + MLIRTaskflowTransforms + MLIRAffineDialect MLIRArithDialect MLIRFuncDialect MLIRLinalgDialect MLIRIR MLIRPass MLIRTransforms + MLIRNeura + MLIRNeuraTransforms + MLIRConversion MLIRSupport ) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp new file mode 100644 index 00000000..c5052b83 --- /dev/null +++ b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp @@ -0,0 +1,1884 @@ +//===- ResourceAwareTaskOptimizationPass.cpp - Pipeline Balance & Fusion --===// +// This pass performs two-phase optimization on the task graph: +// 1. Utilization Fusion: merges independent (no-edge) tasks, selecting pairs +// that minimize |trip_count_a - trip_count_b| for balanced utilization. +// 2. Pipeline Balance: allocates extra CGRAs to critical-path bottleneck tasks. +// More CGRAs combine tile arrays into larger arrays for mapping, potentially +// lowering compiled_ii. 
Latency model: II * (trip_count - 1) + steps. +// +// Targets a 4x4 CGRA grid (16 CGRAs total). Each task may use up to 4 CGRAs. +// Supported per-task shapes: rect (1×1..4×1/1×4/2×2), L (3 or 4 CGRAs), T (4 CGRAs). +// Compiled_ii must come from the downstream pipeline (asserts on failure). +// +//===----------------------------------------------------------------------===// + +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowPasses.h" + +#include "NeuraDialect/Architecture/Architecture.h" +#include "NeuraDialect/Mapping/mapping_util.h" +#include "NeuraDialect/NeuraAttributes.h" +#include "NeuraDialect/NeuraDialect.h" +#include "NeuraDialect/NeuraOps.h" +#include "NeuraDialect/NeuraPasses.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include + + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { + +//===----------------------------------------------------------------------===// +// Constants +//===----------------------------------------------------------------------===// + +constexpr int kCgraGridRows = 4; +constexpr int kCgraGridCols = 4; +constexpr int kTotalCGRAs = kCgraGridRows * kCgraGridCols; // 16 +constexpr int kMaxBalanceIterations = 100; +constexpr int kMaxCgrasPerTask = 4; // Max CGRAs allocatable to a single task. + +// Sentinel value: 0 means "not yet profiled". After profileTask() runs, +// both steps and ii MUST be > 0. An assert fires if profiling fails. 
+constexpr int64_t kUnprofiled = 0; + +//===----------------------------------------------------------------------===// +// CGRA Shape Utilities +//===----------------------------------------------------------------------===// + +// Represents a CGRA allocation shape on the grid. +// +// For rectangular shapes: rows × cols == cgra_count, and `cgra_positions` +// is empty (all cells in the bounding box are used). +// +// For non-rectangular shapes (L, T): `cgra_positions` stores the explicit +// (col, row) coordinates of the occupied CGRAs. `rows`/`cols` give the +// bounding box so that tile-level x_tiles/y_tiles can be computed. +struct CgraShape { + int rows; // Bounding-box CGRA rows. + int cols; // Bounding-box CGRA columns. + bool is_rectangular; // True if all cells in the bbox are used. + // Explicit CGRA positions for non-rectangular shapes. + // Each pair is (col, row) in CGRA coordinates. Empty for rectangles. + SmallVector> cgra_positions; + + int area() const { return rows * cols; } + + // Returns a human-readable description for log messages only (not IR). + std::string describe(int cgra_count) const { + std::string s = std::to_string(rows) + "x" + std::to_string(cols); + if (!is_rectangular) { + s += "(non-rect, " + std::to_string(cgra_count) + " CGRAs:"; + for (auto &[c, r] : cgra_positions) + s += " (" + std::to_string(c) + "," + std::to_string(r) + ")"; + s += ")"; + } + return s; + } + + // Returns the shape string written into the IR tile_shape attribute. + // For rectangular shapes: "NxM" (e.g. "2x2"). + // For non-rectangular shapes: "NxM[(c0,r0)(c1,r1)...]" listing only the + // occupied CGRA positions so that downstream passes can reconstruct the + // exact valid tile set for multi-CGRA mapping. 
+ std::string irAttr() const { + std::string s = std::to_string(rows) + "x" + std::to_string(cols); + if (!is_rectangular && !cgra_positions.empty()) { + s += "["; + for (auto &[c, r] : cgra_positions) + s += "(" + std::to_string(c) + "," + std::to_string(r) + ")"; + s += "]"; + } + return s; + } +}; + +// Returns all valid rectangular shapes for `cgra_count` CGRAs. +static SmallVector getRectangularShapes(int cgra_count) { + SmallVector shapes; + for (int r = 1; r <= kCgraGridRows; ++r) { + for (int c = 1; c <= kCgraGridCols; ++c) { + if (r * c == cgra_count) { + shapes.push_back({r, c, /*is_rectangular=*/true, /*cgra_positions=*/{}}); + } + } + } + return shapes; +} + +// Returns true if `cgra_count` CGRAs can fit on the grid and does not +// exceed the per-task limit. +static bool canFitOnGrid(int cgra_count) { + return cgra_count >= 1 && cgra_count <= kMaxCgrasPerTask; +} + +// Returns the set of non-rectangular shapes for `cgra_count` CGRAs. +// Currently defined for cgra_count == 3 (L-shape) and cgra_count == 4 +// (L-shape and T-shape variants). Each shape's coordinates are chosen +// so the bounding box is as small as possible. +static SmallVector getNonRectangularShapes(int cgra_count) { + SmallVector shapes; + + if (cgra_count == 3) { + // L-shape 3 CGRAs: (0,0)(1,0)(0,1) — bbox 2×2 + shapes.push_back({2, 2, false, {{0,0},{1,0},{0,1}}}); + } + + if (cgra_count == 4) { + // T-shape: three in a row + one below centre + // (0,0)(1,0)(2,0)(1,1) — bbox 2×3 + shapes.push_back({2, 3, false, {{0,0},{1,0},{2,0},{1,1}}}); + + // L-shape: three in a column + one offset + // (0,0)(0,1)(0,2)(1,2) — bbox 3×2 + shapes.push_back({3, 2, false, {{0,0},{0,1},{0,2},{1,2}}}); + } + + return shapes; +} + +// Picks the best shape for display/profiling. +// We prefer shapes with the most compact physical layout (smallest maximum distance +// between nodes) to minimize communication latency. 
In cases of identical bounding +// box area, we prefer more square-like bounds over long rectangles. +// +// TODO: This function only picks a localized shape for an idealized single task mapping. +// Global placement and conflict resolution across multiple tasks is legitimately deferred +// to downstream map-on-cgra pass, as speculative profiling assumes unconstrained placement. +static CgraShape pickBestShape(int cgra_count) { + // For cgra_count == 3, the 2x2 L-shape has a smaller maximum physical routing distance + // (dist=2) compared to a 1x3 rectangle (dist=3), despite having a larger bounding box. + // We explicitly prefer the more compact L-shape here for better speculative latency. + if (cgra_count == 3) { + auto non_rect_shapes = getNonRectangularShapes(3); + if (!non_rect_shapes.empty()) { + return non_rect_shapes.front(); + } + } + + SmallVector candidates = getRectangularShapes(cgra_count); + for (const auto &s : getNonRectangularShapes(cgra_count)) { + candidates.push_back(s); + } + + if (!candidates.empty()) { + return *std::min_element(candidates.begin(), candidates.end(), + [](const CgraShape &a, const CgraShape &b) { + int area_a = a.area(); + int area_b = b.area(); + if (area_a != area_b) return area_a < area_b; + return std::abs(a.rows - a.cols) < std::abs(b.rows - b.cols); + }); + } + + // Fallback: smallest bounding box (should not be reached for 1..4 CGRAs). 
+  CgraShape best = {kCgraGridRows, kCgraGridCols, false, {}};
+  for (int r = 1; r <= kCgraGridRows; ++r) {
+    for (int c = 1; c <= kCgraGridCols; ++c) {
+      if (r * c >= cgra_count && r * c < best.area()) {
+        best = {r, c, false, {}};
+      }
+    }
+  }
+  return best;
+}
+
+//===----------------------------------------------------------------------===//
+// Task Dependency Graph
+//===----------------------------------------------------------------------===//
+
+struct TaskGraphNode {
+  size_t id;
+  TaskflowTaskOp op;
+  int64_t trip_count = 1;
+  int64_t steps = kUnprofiled;
+  int64_t ii = kUnprofiled;
+  int cgra_count = 1;
+  CgraShape shape = {1, 1, true};
+
+  // Dependency edges (both SSA and memory).
+  SmallVector<TaskGraphNode *> predecessors;
+  SmallVector<TaskGraphNode *> successors;
+
+  TaskGraphNode(size_t id, TaskflowTaskOp op) : id(id), op(op) {}
+
+  // Returns estimated task latency using the pipelined execution model:
+  //   latency = II * (trip_count - 1) + steps.
+  int64_t estimatedLatency() const {
+    return ii * (trip_count - 1) + steps;
+  }
+};
+
+class TaskDependencyGraph {
+public:
+  SmallVector<std::unique_ptr<TaskGraphNode>> nodes;
+  DenseMap<Operation *, TaskGraphNode *> op_to_node;
+
+  void build(func::FuncOp func, bool skip_mapper = false) {
+    // 1. Creates TaskGraphNodes.
+    size_t task_id = 0;
+    func.walk([&](TaskflowTaskOp task) {
+      auto node = std::make_unique<TaskGraphNode>(task_id++, task);
+
+      // If the task already has profiling attributes (e.g., from fusion),
+      // skip expensive speculative lowering and use those directly.
+      bool has_precomputed = task->hasAttr("compiled_ii") && task->hasAttr("steps");
+      if (!has_precomputed) {
+        // Speculative lowering to Neura to get real metrics.
+        profileTask(node.get(), task, skip_mapper);
+      }
+
+      // Reads existing trip_count attribute if set by fusion.
+      if (auto attr = task->getAttrOfType<IntegerAttr>("trip_count")) {
+        node->trip_count = attr.getInt();
+      } else {
+        node->trip_count = computeTripCount(task);
+      }
+
+      // Overrides with explicit attributes if present.
+ if (auto attr = task->getAttrOfType("steps")) { + node->steps = attr.getInt(); + } + if (auto attr = task->getAttrOfType("compiled_ii")) { + node->ii = attr.getInt(); + } + if (auto attr = task->getAttrOfType("cgra_count")) { + node->cgra_count = attr.getInt(); + } + + op_to_node[task] = node.get(); + nodes.push_back(std::move(node)); + }); + + // 2. Builds SSA edges (value dependencies between tasks). + for (auto &consumer : nodes) { + for (Value operand : consumer->op.getValueInputs()) { + if (auto producer_op = operand.getDefiningOp()) { + if (auto *producer = op_to_node[producer_op.getOperation()]) { + addEdge(producer, consumer.get()); + } + } + } + } + + // 3. Builds memory edges. + for (auto &consumer : nodes) { + // RAW: producer wrote a memref that this task reads. + for (Value memref : consumer->op.getReadMemrefs()) { + if (auto producer_op = memref.getDefiningOp()) { + if (auto *producer = op_to_node[producer_op.getOperation()]) { + addEdge(producer, consumer.get()); + } + } + } + // WAW: producer wrote a memref that this task also writes. + for (Value memref : consumer->op.getWriteMemrefs()) { + if (auto producer_op = memref.getDefiningOp()) { + if (auto *producer = op_to_node[producer_op.getOperation()]) { + addEdge(producer, consumer.get()); + } + } + } + } + + llvm::errs() << "TaskDependencyGraph: " << nodes.size() + << " tasks\n"; + for (auto &n : nodes) { + llvm::errs() << " Task " << n->id << " (" + << n->op.getTaskName().str() << "): trip_count=" + << n->trip_count << ", ii=" << n->ii + << ", steps=" << n->steps + << ", preds=" << n->predecessors.size() + << ", succs=" << n->successors.size() << "\n"; + } + } + + // Returns true if there is any (direct or transitive) dependency from + // source_node to dest_node. 
+  bool hasDependency(TaskGraphNode *source_node,
+                     TaskGraphNode *dest_node) const {
+    if (source_node == dest_node) return true;
+    DenseSet<TaskGraphNode *> visited;
+    SmallVector<TaskGraphNode *> worklist;
+    worklist.push_back(source_node);
+    while (!worklist.empty()) {
+      auto *current = worklist.pop_back_val();
+      if (current == dest_node) return true;
+      if (!visited.insert(current).second) continue;
+      for (auto *succ : current->successors) {
+        worklist.push_back(succ);
+      }
+    }
+    return false;
+  }
+
+  // Returns true if a and b are completely independent (no path in either
+  // direction).
+  bool areIndependent(TaskGraphNode *a, TaskGraphNode *b) const {
+    return !hasDependency(a, b) && !hasDependency(b, a);
+  }
+
+  // Returns total CGRAs allocated.
+  int getTotalAllocatedCGRAs() const {
+    int total = 0;
+    for (auto &node : nodes) {
+      total += node->cgra_count;
+    }
+    return total;
+  }
+
+  // Public wrapper for profileTask: used by UtilizationFuser to re-profile
+  // fused tasks with the real downstream Neura pipeline.
+  // When skip_mapper=true, only ResMII/RecMII analytical estimates are used
+  // (no MapToAcceleratorPass). This is safe for speculative balance checks
+  // where the mapper may backtrack indefinitely on larger tile arrays.
+  void profileTaskPublic(TaskGraphNode *node, TaskflowTaskOp task,
+                         bool skip_mapper = false) {
+    profileTask(node, task, skip_mapper);
+  }
+
+private:
+  llvm::DenseSet<std::pair<TaskGraphNode *, TaskGraphNode *>> edge_set;
+
+  void addEdge(TaskGraphNode *from, TaskGraphNode *to) {
+    auto key = std::make_pair(from, to);
+    if (edge_set.insert(key).second) {
+      from->successors.push_back(to);
+      to->predecessors.push_back(from);
+    }
+  }
+
+  // Profiles a single TaskflowTaskOp: clones the task, wraps the kernel in a
+  // standalone func, and runs InsertDataMov + MapToAcceleratorPass to obtain
+  // ii. skip_mapper: use only ResMII/RecMII analytical estimates.
+ void profileTask(TaskGraphNode *node, TaskflowTaskOp task, + bool skip_mapper = false) { + MLIRContext *ctx = task.getContext(); + OpBuilder builder(ctx); + Location loc = task.getLoc(); + + auto parent_func = task->getParentOfType(); + assert(parent_func && + "[profileTask] FATAL: task has no parent func::FuncOp. " + "compiled_ii must come from downstream pipeline."); + + // Verifies exactly one neura.kernel per task (post-lowering invariant). + neura::KernelOp the_kernel; + task.walk([&](neura::KernelOp k) { + assert(!the_kernel && "task has more than one neura.kernel op"); + the_kernel = k; + }); + assert(the_kernel && "task has no neura.kernel op"); + + // Clones the task into a temporary module so we don't mutate the real IR. + auto tmp_mod = ModuleOp::create(loc); + neura::KernelOp cloned_kernel; + { + OpBuilder b(tmp_mod.getBodyRegion()); + IRMapping mapping; + Operation *cloned_task = b.clone(*task.getOperation(), mapping); + cast(cloned_task).walk([&](neura::KernelOp k) { + cloned_kernel = k; + }); + } + + // Computes tile dimensions for the target CGRA shape. + int per_cgra_cols = neura::getArchitecture().getPerCgraColumns(); + int per_cgra_rows = neura::getArchitecture().getPerCgraRows(); + int x_tiles = node->shape.cols * per_cgra_cols; + int y_tiles = node->shape.rows * per_cgra_rows; + std::string valid_tiles; + if (!node->shape.is_rectangular) { + // Enumerates individual tile coordinates for non-rectangular shapes + // so the mapper knows exactly which tiles are valid. + llvm::raw_string_ostream os(valid_tiles); + for (auto &[cgra_c, cgra_r] : node->shape.cgra_positions) { + for (int tr = 0; tr < per_cgra_rows; ++tr) { + for (int tc = 0; tc < per_cgra_cols; ++tc) { + if (!os.str().empty()) os << ","; + os << (cgra_c * per_cgra_cols + tc) + << "_" + << (cgra_r * per_cgra_rows + tr); + } + } + } + } + + // Runs Neura pipeline on the kernel to get compiled_ii and steps. 
+ auto phase2_module = ModuleOp::create(loc); + int compiled_ii = 0; + int cp_depth = 1; + + if (succeeded( + runNeuraPipelineOnKernel(ctx, cloned_kernel, phase2_module, + compiled_ii, cp_depth, + x_tiles, y_tiles, valid_tiles, + skip_mapper))) { + llvm::errs() << "[profileTask] kernel in " << task.getTaskName() + << ": compiled_ii=" << compiled_ii + << ", cp_depth=" << cp_depth << "\n"; + } else { + llvm::errs() << "[profileTask] Phase 2 failed for kernel in " + << task.getTaskName() << ", extracting partial\n"; + extractMetricsFromPartialIR(phase2_module, compiled_ii, cp_depth, + x_tiles, y_tiles); + } + phase2_module.erase(); + + assert(compiled_ii > 0 && + "[profileTask] FATAL: compiled_ii is 0 after downstream pipeline."); + node->ii = compiled_ii; + node->steps = std::max(cp_depth, 1); + + llvm::errs() << "[profileTask] " << task.getTaskName() + << ": compiled_ii=" << node->ii + << ", steps=" << node->steps << "\n"; + + // Erases the temporary module. + tmp_mod.erase(); + } + + // Wraps a neura.kernel into a standalone func in dst_module, runs + // InsertDataMov + mapper, and returns compiled_ii / cp_depth. + // x_tiles/y_tiles: multi-CGRA tile grid dimensions. + // valid_tiles: explicit tile list for non-rectangular shapes (empty = full). + // skip_mapper: skip MapToAcceleratorPass, use ResMII/RecMII only. + LogicalResult runNeuraPipelineOnKernel(MLIRContext *ctx, + neura::KernelOp kernel, + ModuleOp dst_module, + int &compiled_ii, + int &cp_depth, + int x_tiles = 0, + int y_tiles = 0, + const std::string &valid_tiles = "", + bool skip_mapper = false) { + Location loc = kernel.getLoc(); + OpBuilder builder(ctx); + builder.setInsertionPointToStart(dst_module.getBody()); + + // Builds function signature: all kernel inputs + iter_args as arguments. 
+ Region &kernel_body = kernel.getBody(); + if (kernel_body.empty()) + return failure(); + + Block &entry = kernel_body.front(); + SmallVector arg_types; + for (BlockArgument arg : entry.getArguments()) + arg_types.push_back(arg.getType()); + + // Result types from the kernel op. + SmallVector result_types(kernel.getResultTypes()); + + auto func_type = builder.getFunctionType(arg_types, result_types); + auto wrapper_func = builder.create( + loc, "__speculative_kernel__", func_type); + + // Tags as neura accelerator — all downstream passes check this. + wrapper_func->setAttr("accelerator", + builder.getStringAttr("neura")); + + // Clones the entire kernel region (all blocks) into the func body. + Region &func_region = wrapper_func.getBody(); + IRMapping mapping; + kernel_body.cloneInto(&func_region, mapping); + + // The cloned region now contains a copy of every block from the kernel. + // Walks through and replaces neura.yield terminators with func.return. + for (Block &block : func_region) { + if (auto yield = dyn_cast(block.getTerminator())) { + builder.setInsertionPoint(yield); + SmallVector return_vals; + for (Value v : yield.getResults()) { + return_vals.push_back(v); + } + builder.create(loc, return_vals); + yield.erase(); + } + } + + // The kernel body is already in neura dataflow IR (all lowering passes + // completed before this pass). Only InsertDataMov is needed before mapper. + PassManager pm(ctx); + pm.enableVerifier(false); + + // InsertDataMov: wraps operands with neura.data_mov for the mapper. + pm.addPass(neura::createInsertDataMovPass()); + + if (failed(pm.run(dst_module))) { + // Pre-mapper pipeline failed — extract best-effort metrics from partial + // Neura IR using ResMII/RecMII analysis with the correct multi-CGRA arch. + extractMetricsFromPartialIR(dst_module, compiled_ii, cp_depth, + x_tiles, y_tiles); + return failure(); + } + + // Computes ResMII/RecMII as analytical lower-bound (fallback when mapper + // is skipped or fails). 
Uses a custom arch sized to the actual tile array. + { + std::unique_ptr custom_arch; + const neura::Architecture *arch_ptr = &neura::getArchitecture(); + if (x_tiles > 0 && y_tiles > 0) { + custom_arch = neura::getArchitecture().cloneWithNewDimensions( + y_tiles, x_tiles); + arch_ptr = custom_arch.get(); + } + const neura::Architecture &architecture = *arch_ptr; + + dst_module.walk([&](func::FuncOp fn) { + if (!fn->hasAttr("accelerator")) return; + Region ®ion = fn.getBody(); + if (region.empty()) return; + int res_mii = neura::calculateResMii(region, architecture); + auto cycles = neura::collectRecurrenceCycles(region); + int rec_mii = 1; + for (auto &cycle : cycles) + rec_mii = std::max(rec_mii, cycle.length); + compiled_ii = std::max({compiled_ii, res_mii, rec_mii}); + // Derives cp_depth from ALAP (As-Late-As-Possible) scheduling levels. + std::set critical_ops; + for (auto &cycle : cycles) + for (Operation *op : cycle.operations) critical_ops.insert(op); + auto sorted_ops = neura::getTopologicallySortedOps(region); + if (!sorted_ops.empty()) { + auto level_buckets = neura::getOpsInAlapLevels(sorted_ops, critical_ops); + cp_depth = std::max(cp_depth, (int)level_buckets.size()); + } + llvm::errs() << "[profileTask] analytical fallback: res_mii=" << res_mii + << " rec_mii=" << rec_mii + << " tiles=" << architecture.getNumTiles() << "\n"; + }); + } + + // Optionally run MapToAcceleratorPass to get the true compiled_ii. + // + // Guards: + // 1. skip_mapper=true: caller explicitly requests analytical-only (e.g. + // speculative balance probes where the mapper may loop indefinitely). + // 2. All non-Reserve operand producers must be DataMovOp (mapper crashes + // otherwise on unsupported ops like arith.minimumf). + // 3. Kernel must be small enough (<= kMapperOpLimit ops) to avoid + // exponential backtracking blowup during speculative profiling. 
+ // + // If any guard fires, the ResMII/RecMII values computed above serve as + // the analytical lower-bound estimate (under-estimates true II on smaller + // arrays, but are safe and instant). + if (skip_mapper) { + llvm::errs() << "[profileTask] Skipping mapper (analytical-only mode). " + << "Using analytical compiled_ii=" << compiled_ii << "\n"; + return success(); + } + + constexpr int kMapperOpLimit = 150; + bool all_data_movs_ok = true; + int total_mapped_ops = 0; + dst_module.walk([&](func::FuncOp fn) { + if (!fn->hasAttr("accelerator")) return; + fn.walk([&](Operation *op) { + if (isa(op)) return; + total_mapped_ops++; + if (isa(op)) + return; + for (Value operand : op->getOperands()) { + Operation *producer = operand.getDefiningOp(); + if (!producer) continue; + if (!isa(producer)) + all_data_movs_ok = false; + } + }); + }); + + llvm::errs() << "[profileTask] mapper guard: total_ops=" << total_mapped_ops + << " all_data_movs=" << all_data_movs_ok + << " limit=" << kMapperOpLimit << "\n"; + + if (all_data_movs_ok && total_mapped_ops <= kMapperOpLimit) { + // Runs MapToAcceleratorPass in a fresh pass manager on the already-lowered + // dst_module (pre-mapper pipeline already ran above). + // Passes the correct tile dimensions so the mapper uses the right array. + PassManager pm2(ctx); + pm2.enableVerifier(false); + if (x_tiles > 0 && y_tiles > 0) { + neura::MapToAcceleratorOptions map_options; + map_options.x_tiles = x_tiles; + map_options.y_tiles = y_tiles; + map_options.valid_tiles = valid_tiles; + pm2.addPass(neura::createMapToAcceleratorPass(map_options)); + } else { + pm2.addPass(neura::createMapToAcceleratorPass()); + } + + if (succeeded(pm2.run(dst_module))) { + // Reads true compiled_ii from mapping_info; overrides analytical estimate. 
+ dst_module.walk([&](func::FuncOp fn) { + if (!fn->hasAttr("accelerator")) return; + if (auto mapping_info = + fn->getAttrOfType(neura::attr::kMappingInfo)) { + if (auto ii_attr = + mapping_info.getAs(neura::attr::kCompiledII)) { + compiled_ii = (int)ii_attr.getInt(); // authoritative value + llvm::errs() << "[profileTask] mapper returned real II=" + << compiled_ii << "\n"; + } + } + }); + return success(); + } + // Mapper failed for all II values — keep ResMII/RecMII from above. + llvm::errs() << "[profileTask] WARNING: MapToAcceleratorPass failed, " + << "keeping analytical fallback compiled_ii=" << compiled_ii + << "\n"; + } else { + llvm::errs() << "[profileTask] Skipping mapper (too large or DataMov " + << "check failed). Using analytical compiled_ii=" + << compiled_ii << "\n"; + } + + // Fallback already computed via ResMII/RecMII above; nothing more to do. + return success(); + } + + // Extracts ResMII/RecMII from partially-lowered IR when the full pipeline + // fails. Uses custom arch sized to x_tiles × y_tiles if provided. + void extractMetricsFromPartialIR(ModuleOp tmp_module, + int &out_ii, int &out_cp_depth, + int x_tiles = 0, int y_tiles = 0) { + // Builds architecture: uses custom tile dimensions if provided. + std::unique_ptr custom_arch; + const neura::Architecture *arch_ptr = &neura::getArchitecture(); + if (x_tiles > 0 && y_tiles > 0) { + custom_arch = neura::getArchitecture().cloneWithNewDimensions( + y_tiles, x_tiles); + arch_ptr = custom_arch.get(); + } + const neura::Architecture &architecture = *arch_ptr; + + int res_mii = 1; + int rec_mii = 1; + int cp_depth = 1; + + // Tries func-level analysis on partially-lowered funcs. 
+ tmp_module.walk([&](func::FuncOp fn) { + if (!fn->hasAttr("accelerator")) + return; + Region ®ion = fn.getBody(); + if (region.empty()) + return; + + int local_res = neura::calculateResMii(region, architecture); + res_mii = std::max(res_mii, local_res); + + auto cycles = neura::collectRecurrenceCycles(region); + std::set critical_ops; + for (auto &cycle : cycles) { + rec_mii = std::max(rec_mii, (int)cycle.length); + for (Operation *op : cycle.operations) + critical_ops.insert(op); + } + + auto sorted_ops = neura::getTopologicallySortedOps(region); + if (!sorted_ops.empty()) { + auto level_buckets = + neura::getOpsInAlapLevels(sorted_ops, critical_ops); + cp_depth = std::max(cp_depth, (int)level_buckets.size()); + } + }); + + out_ii = std::max(res_mii, rec_mii); + out_cp_depth = std::max(cp_depth, 1); + + llvm::errs() << "[profileTask] (partial) ii=" << out_ii + << " (res_mii=" << res_mii + << ", rec_mii=" << rec_mii + << "), steps=" << out_cp_depth << "\n"; + } + + // Computes total trip count for a task. + // + // The trip count is extracted from the taskflow.counter chain in the task + // body. Each counter has lower_bound, upper_bound, and step attributes. + // The trip count of a single counter is: + // ceil((upper_bound - lower_bound) / step) + // + // Counters form chains (root -> relay -> leaf). The trip count of a chain + // is the product of each counter's individual trip count. + // + // Multiple independent counter chains execute concurrently on the CGRA, + // so the total trip count is max(chain_product) across chains. + static int64_t computeTripCount(TaskflowTaskOp task) { + // Collects all taskflow.counter ops in the task body. + SmallVector counters; + for (Operation &op : task.getBody().front()) { + if (auto counter = dyn_cast(&op)) + counters.push_back(counter); + } + + if (counters.empty()) { + // Defensive fallback: try neura.counter ops inside kernels. 
+ int64_t total = 1; + task.walk([&](neura::KernelOp kernel) { + int64_t kernel_product = 1; + kernel.walk([&](Operation *op) { + if (op->getName().getStringRef() == "neura.counter") { + auto lb = op->getAttrOfType("lower_bound"); + auto ub = op->getAttrOfType("upper_bound"); + auto st = op->getAttrOfType("step"); + if (lb && ub && st && st.getInt() > 0) { + int64_t range = ub.getInt() - lb.getInt(); + int64_t step = st.getInt(); + int64_t tc = (range + step - 1) / step; + if (tc > 0) kernel_product *= tc; + } + } + }); + total = std::max(total, kernel_product); + }); + return (total > 0) ? total : 1; + } + + // Builds counter chains from taskflow.counter ops. + // A root counter has no parent_index. A relay/leaf counter has a + // parent_index that is the result of another counter. + // Finds root counters (no parent). + SmallVector roots; + for (auto counter : counters) { + if (!counter.getParentIndex()) + roots.push_back(counter); + } + + // Builds a map from parent counter result -> child counters. + DenseMap> parent_to_children; + for (auto counter : counters) { + if (auto parent = counter.getParentIndex()) + parent_to_children[parent].push_back(counter); + } + + // Computes trip count for a single counter. + auto counterTripCount = [](TaskflowCounterOp counter) -> int64_t { + int64_t lb = counter.getLowerBound().getSExtValue(); + int64_t ub = counter.getUpperBound().getSExtValue(); + int64_t step = counter.getStep().getSExtValue(); + if (step <= 0) return 1; + int64_t range = ub - lb; + return (range > 0) ? ((range + step - 1) / step) : 1; + }; + + // DFS from each root, accumulating the product along the chain. + // Independent chains are concurrent -> take max across chains. + int64_t total = 1; + for (auto root : roots) { + // Follows chain: root -> children -> grandchildren ... + // Chain product = product of all counters in this chain. 
+ int64_t chain_product = 1; + SmallVector worklist; + worklist.push_back(root); + while (!worklist.empty()) { + auto cur = worklist.pop_back_val(); + chain_product *= counterTripCount(cur); + auto it = parent_to_children.find(cur.getCounterIndex()); + if (it != parent_to_children.end()) { + for (auto child : it->second) + worklist.push_back(child); + } + } + total = std::max(total, chain_product); + } + + return (total > 0) ? total : 1; + } + +}; + +//===----------------------------------------------------------------------===// +// Pipeline Balancer +//===----------------------------------------------------------------------===// +// Identifies critical-path bottlenecks and allocates extra CGRAs. + +class PipelineBalancer { +public: + using ProfileFn = std::function; + + // Runs pipeline balance on the graph. + // + // For each iteration, speculatively increments the bottleneck task's + // cgra_count by 1 and re-profiles it via profile_fn. If the new estimated + // latency is lower, the change is accepted; otherwise it is reverted and + // the node is marked saturated (no further CGRA additions help it). + // + // This avoids blindly assigning more CGRAs without checking whether the + // larger array actually produces a better compiled_ii. + // + // Returns true if any changes were accepted. + bool balance(TaskDependencyGraph &graph, ProfileFn profile_fn) { + bool changed = false; + // Tracks nodes for which adding one more CGRA did not reduce latency. + // These are skipped in subsequent iterations. + llvm::DenseSet saturated_nodes; + + for (int iter = 0; iter < kMaxBalanceIterations; ++iter) { + int total_cgras = graph.getTotalAllocatedCGRAs(); + if (total_cgras >= kTotalCGRAs) { + break; + } + + // Recomputes critical path each iteration (may shift after rebalance). 
+ TaskGraphNode *bottleneck = findBottleneck(graph, saturated_nodes); + if (!bottleneck) { + break; + } + + int old_cgra_count = bottleneck->cgra_count; + int new_cgra_count = old_cgra_count + 1; + + // Check if incrementing cgra_count is feasible on the 4×4 grid. + // TODO: This currently only checks the capacity (total CGRA count). Ideally, + // we should invoke a global placement pass (aka MapTaskOnCgraPass) here to + // verify if the speculatively increased CGRA count and its proposed shape + // actually fit on the 4x4 grid alongside other previously allocated tasks. + // + // Currently, MapTaskOnCgraPass does not support multi-CGRA task placement. + // Once it does, we should call it here; if global placement fails for the + // "best" shape, we should backtrack and try alternative shapes before + // saturating the node. + if (!canFitOnGrid(new_cgra_count)) { + saturated_nodes.insert(bottleneck); + continue; + } + + // Saves state for potential rollback. + int64_t old_latency = bottleneck->estimatedLatency(); + int64_t old_ii = bottleneck->ii; + int64_t old_steps = bottleneck->steps; + CgraShape old_shape = bottleneck->shape; + + // Speculatively applies the new CGRA count and re-profiles. 
+ bottleneck->cgra_count = new_cgra_count; + bottleneck->shape = pickBestShape(new_cgra_count); + + llvm::errs() + << " Balance: trying Task " << bottleneck->id << " (" + << bottleneck->op.getTaskName().str() + << ") cgra_count=" << old_cgra_count << "->" << new_cgra_count + << ", shape=" << bottleneck->shape.describe(new_cgra_count) + << ", tile_array=" + << (bottleneck->shape.rows * neura::getArchitecture().getPerCgraRows()) + << "x" + << (bottleneck->shape.cols * neura::getArchitecture().getPerCgraColumns()) + << ", old_ii=" << old_ii << ", old_lat=" << old_latency << "\n"; + + profile_fn(bottleneck, bottleneck->op); + + int64_t new_latency = bottleneck->estimatedLatency(); + + if (new_latency < old_latency) { + // Accepted: the larger array produces a measurably better latency. + changed = true; + llvm::errs() + << " Balance: ACCEPTED Task " << bottleneck->id << " (" + << bottleneck->op.getTaskName().str() + << ") cgra_count=" << new_cgra_count + << ", ii=" << old_ii << "->" << bottleneck->ii + << ", lat=" << old_latency << "->" << new_latency + << ", total_cgras=" << graph.getTotalAllocatedCGRAs() << "\n"; + } else { + // Rejected: no latency improvement — roll back and mark saturated. + llvm::errs() + << " Balance: REJECTED Task " << bottleneck->id + << " (ii=" << bottleneck->ii << ", lat=" << new_latency + << " >= old_lat=" << old_latency << "). Reverting.\n"; + bottleneck->cgra_count = old_cgra_count; + bottleneck->shape = old_shape; + bottleneck->ii = old_ii; + bottleneck->steps = old_steps; + saturated_nodes.insert(bottleneck); + } + } + + return changed; + } + + private: + // Computes the weighted critical path length from a given node to any sink. 
+  // NOTE(review): cache map template arguments restored — extraction had
+  // stripped them to bare `DenseMap &cache`. Key/value types follow from the
+  // call sites (`llvm::DenseMap` caches of node -> int64_t path lengths).
+  int64_t computeCriticalPathFrom(TaskGraphNode *node,
+                                  DenseMap<TaskGraphNode *, int64_t> &cache) {
+    auto it = cache.find(node);
+    if (it != cache.end()) {
+      return it->second;
+    }
+
+    int64_t max_successor_path = 0;
+    for (auto *succ : node->successors) {
+      max_successor_path =
+          std::max(max_successor_path, computeCriticalPathFrom(succ, cache));
+    }
+
+    int64_t path = node->estimatedLatency() + max_successor_path;
+    cache[node] = path;
+    return path;
+  }
+
+  // Computes the longest path from any source to the given node
+  // (depth_from_source). Uses dynamic programming with memoization.
+  int64_t computeDepthFromSource(TaskGraphNode *node,
+                                 DenseMap<TaskGraphNode *, int64_t> &cache) {
+    auto it = cache.find(node);
+    if (it != cache.end()) {
+      return it->second;
+    }
+
+    int64_t max_predecessor_depth = 0;
+    for (auto *pred : node->predecessors) {
+      max_predecessor_depth =
+          std::max(max_predecessor_depth,
+                   computeDepthFromSource(pred, cache));
+    }
+
+    // depth_from_source(node) = max(depth_from_source(pred) for all preds)
+    // + node's own latency.
+    int64_t depth = max_predecessor_depth + node->estimatedLatency();
+    cache[node] = depth;
+    return depth;
+  }
+
+  // Finds the bottleneck node on the critical path using full slack analysis.
+  //
+  // For each node, slack is defined as:
+  //   slack(node) = global_critical_path
+  //                 - depth_from_source(node)
+  //                 - depth_to_sink(node)
+  //                 + node->estimatedLatency()
+  //
+  // where depth_from_source includes the node's own latency, and
+  // depth_to_sink (computeCriticalPathFrom) also includes the node's own
+  // latency, so we add it back once to avoid double-counting.
+  //
+  // A node is on the critical path iff slack == 0.
+  // Among critical-path nodes, the one with highest individual latency
+  // is the bottleneck (reducing its latency most benefits the pipeline).
+ TaskGraphNode *findBottleneck(TaskDependencyGraph &graph, + const llvm::DenseSet &ignored) { + llvm::DenseMap to_sink_cache; + llvm::DenseMap from_source_cache; + + // Computes depth_to_sink for all nodes (via computeCriticalPathFrom). + int64_t global_critical_path = 0; + for (auto &node : graph.nodes) { + int64_t cp = computeCriticalPathFrom(node.get(), to_sink_cache); + global_critical_path = std::max(global_critical_path, cp); + } + + // Computes depth_from_source for all nodes. + for (auto &node : graph.nodes) { + computeDepthFromSource(node.get(), from_source_cache); + } + + // Finds the critical-path node with highest individual latency. + TaskGraphNode *bottleneck = nullptr; + int64_t max_latency = -1; + + for (auto &node : graph.nodes) { + if (ignored.count(node.get())) continue; + if (node->cgra_count >= node->trip_count) continue; + // Per-task CGRA limit: no point trying to add more. + if (node->cgra_count >= kMaxCgrasPerTask) continue; + + int64_t depth_from = from_source_cache[node.get()]; + int64_t depth_to = to_sink_cache[node.get()]; + + // slack = global_cp - depth_from - depth_to + node_latency + // (because both depth_from and depth_to include node's own latency). + int64_t slack = global_critical_path - depth_from - depth_to + + node->estimatedLatency(); + + if (slack != 0) continue; // Not on the critical path. + + if (node->estimatedLatency() > max_latency) { + max_latency = node->estimatedLatency(); + bottleneck = node.get(); + } + } + return bottleneck; + } + +}; + +//===----------------------------------------------------------------------===// +// Utilization Fusion +//===----------------------------------------------------------------------===// +// Merges independent tasks (no edge in either direction) into a single task +// to reduce total CGRA count. Fusion candidates are chosen to minimize +// |trip_count_a - trip_count_b| for balanced utilization. 
+
+class UtilizationFuser {
+public:
+  // NOTE(review): ProfileFn's signature was stripped to `std::function;` by
+  // extraction; reconstructed from the balancer's `profile_fn(bottleneck,
+  // bottleneck->op)` call site and profileTaskPublic's parameter list.
+  using ProfileFn = std::function<void(TaskGraphNode *, TaskflowTaskOp)>;
+
+  // Runs utilization fusion. Returns true if any fusions occurred.
+  // Only performs ONE fusion per call — the caller should rebuild the graph
+  // and call again if more fusions are desired.
+  bool fuse(func::FuncOp func, TaskDependencyGraph &graph,
+            ProfileFn profile_fn) {
+    auto pair = findBestFusionCandidate(graph);
+    if (!pair) {
+      return false;
+    }
+
+    auto [node_a, node_b] = *pair;
+
+    llvm::errs()
+        << " Fuse: Task " << node_a->id << " ("
+        << node_a->op.getTaskName().str() << ") + Task " << node_b->id
+        << " (" << node_b->op.getTaskName().str() << ")\n";
+
+    return performFusion(func, node_a, node_b, graph, profile_fn);
+  }
+
+private:
+  // Finds the best pair of independent tasks to fuse.
+  // Selects the pair with the most balanced trip_count (minimizes
+  // |trip_count_a - trip_count_b|) to avoid wasting computation when
+  // the fused task executes both loop nests concurrently on the shared array.
+  std::optional<std::pair<TaskGraphNode *, TaskGraphNode *>>
+  findBestFusionCandidate(TaskDependencyGraph &graph) {
+    TaskGraphNode *best_a = nullptr;
+    TaskGraphNode *best_b = nullptr;
+    int64_t best_cost = INT64_MAX;
+
+    for (size_t i = 0; i < graph.nodes.size(); ++i) {
+      for (size_t j = i + 1; j < graph.nodes.size(); ++j) {
+        auto *a = graph.nodes[i].get();
+        auto *b = graph.nodes[j].get();
+
+        if (!graph.areIndependent(a, b)) {
+          continue;
+        }
+
+        // Fusion requires single-block task bodies (counter-mode tasks).
+        if (!a->op.getBody().hasOneBlock() ||
+            !b->op.getBody().hasOneBlock()) {
+          continue;
+        }
+
+        // Legality: checks no intermediate task depends on a or b.
+        if (!canSafelyFuse(a, b, graph)) {
+          continue;
+        }
+
+        // Utilization metric: minimize |trip_count_a - trip_count_b|.
+        // Balanced trip counts mean less wasted computation when fused
+        // tasks execute concurrently on the shared tile array.
+        int64_t cost = std::abs(a->trip_count - b->trip_count);
+        if (cost < best_cost) {
+          best_cost = cost;
+          best_a = a;
+          best_b = b;
+        }
+      }
+    }
+
+    if (!best_a || !best_b) {
+      return std::nullopt;
+    }
+    return std::make_pair(best_a, best_b);
+  }
+
+  // Checks whether fusing tasks a and b is safe w.r.t. dominance.
+  // Returns false if any other task positioned between a and b in the IR
+  // has a dependency (edge) on either a or b — because moving the fused
+  // task would break that intermediate dependency.
+  bool canSafelyFuse(TaskGraphNode *a, TaskGraphNode *b,
+                     TaskDependencyGraph &graph) {
+    auto *task_a = a->op.getOperation();
+    auto *task_b = b->op.getOperation();
+
+    if (task_a->getBlock() != task_b->getBlock()) return false;
+
+    // Ensures task_a is before task_b.
+    if (!task_a->isBeforeInBlock(task_b)) {
+      std::swap(task_a, task_b);
+      std::swap(a, b);
+    }
+
+    // Check: no other task between a and b should have an edge from/to a or b.
+    for (auto &node : graph.nodes) {
+      if (node.get() == a || node.get() == b) continue;
+
+      auto *other_op = node->op.getOperation();
+      if (other_op->getBlock() != task_a->getBlock()) continue;
+
+      // Is this node between task_a and task_b?
+      if (task_a->isBeforeInBlock(other_op) &&
+          other_op->isBeforeInBlock(task_b)) {
+        // Checks if this intermediate task has any dependency on a or b.
+        if (!graph.areIndependent(a, node.get()) ||
+            !graph.areIndependent(b, node.get())) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  // Performs IR-level fusion of two independent tasks.
+  //
+  // DFG-Level Fusion:
+  // Since this pass runs post-lowering, each task body is single-block
+  // containing counter ops, one neura.kernel op, and a taskflow.yield.
+  // Fusion concatenates both DFGs into a single neura.kernel (they are
+  // independent, so just placed side-by-side). The fused task is then
+  // profiled through InsertDataMov + mapper to get accurate compiled_ii.
+ bool performFusion(func::FuncOp func, TaskGraphNode *node_a, + TaskGraphNode *node_b, TaskDependencyGraph &graph, + ProfileFn profile_fn) { + auto task_a = node_a->op; + auto task_b = node_b->op; + + // Safety: both tasks must be in the same block. + if (task_a->getBlock() != task_b->getBlock()) { + llvm::errs() << " [Fuse] Skipping: tasks in different blocks\n"; + return false; + } + + // Safety: fusion requires single-block task bodies. + if (!task_a.getBody().hasOneBlock() || !task_b.getBody().hasOneBlock()) { + llvm::errs() << " [Fuse] Skipping: multi-block task body\n"; + return false; + } + + // Ensures task_a comes before task_b in the IR for correct dominance. + if (!task_a->isBeforeInBlock(task_b)) { + std::swap(task_a, task_b); + std::swap(node_a, node_b); + } + + llvm::errs() << " [Fuse] Merging " << task_a.getTaskName() << " + " + << task_b.getTaskName() << "\n"; + + // Computes the correct insertion point: must be after all operands of + // both tasks are defined, but before any consumer of either task's + // results. We find the latest-positioned operand definition and insert + // right after it. + Operation *latest_def = task_a.getOperation(); + auto updateLatest = [&](ValueRange operands) { + for (Value v : operands) { + if (auto *def_op = v.getDefiningOp()) { + if (def_op->getBlock() == task_a->getBlock() && + latest_def->isBeforeInBlock(def_op)) { + latest_def = def_op; + } + } + } + }; + updateLatest(task_a.getReadMemrefs()); + updateLatest(task_a.getWriteMemrefs()); + updateLatest(task_a.getValueInputs()); + updateLatest(task_b.getReadMemrefs()); + updateLatest(task_b.getWriteMemrefs()); + updateLatest(task_b.getValueInputs()); + + // Inserts right after the latest operand definition. + OpBuilder builder(latest_def->getBlock(), + std::next(Block::iterator(latest_def))); + + // Step 1: Builds merged operand lists. 
+ SmallVector merged_read_memrefs; + SmallVector merged_write_memrefs; + SmallVector merged_value_inputs; + SmallVector merged_original_read_memrefs; + SmallVector merged_original_write_memrefs; + + // Deduplicates values when merging operand lists from both tasks. + auto addUnique = [](SmallVector &target, ValueRange source) { + for (Value v : source) { + if (llvm::find(target, v) == target.end()) { + target.push_back(v); + } + } + }; + + addUnique(merged_read_memrefs, task_a.getReadMemrefs()); + addUnique(merged_read_memrefs, task_b.getReadMemrefs()); + addUnique(merged_write_memrefs, task_a.getWriteMemrefs()); + addUnique(merged_write_memrefs, task_b.getWriteMemrefs()); + addUnique(merged_value_inputs, task_a.getValueInputs()); + addUnique(merged_value_inputs, task_b.getValueInputs()); + addUnique(merged_original_read_memrefs, task_a.getOriginalReadMemrefs()); + addUnique(merged_original_read_memrefs, task_b.getOriginalReadMemrefs()); + addUnique(merged_original_write_memrefs, task_a.getOriginalWriteMemrefs()); + addUnique(merged_original_write_memrefs, task_b.getOriginalWriteMemrefs()); + + // Step 2: Builds result types. + SmallVector write_output_types; + for (Value v : merged_write_memrefs) { + write_output_types.push_back(v.getType()); + } + SmallVector value_output_types; + for (Value v : task_a.getValueOutputs()) { + value_output_types.push_back(v.getType()); + } + for (Value v : task_b.getValueOutputs()) { + value_output_types.push_back(v.getType()); + } + + // Step 3: Creates fused task name. + std::string fused_name = task_a.getTaskName().str() + "_" + + task_b.getTaskName().str() + "_utilfused"; + + // Step 4: Creates the fused task op. 
+ auto fused_task = builder.create( + task_a.getLoc(), write_output_types, value_output_types, + merged_read_memrefs, merged_write_memrefs, merged_value_inputs, + fused_name, merged_original_read_memrefs, + merged_original_write_memrefs); + + // ================================================================ + // Region-Level Fusion (single-block task bodies) + // ================================================================ + + // Step 5: Clones both task regions into the fused task body. + // Maps source task's block args to fused task's block args. + auto buildTaskArgMapping = + [&](TaskflowTaskOp orig_task, Region &fused_region, + IRMapping &mapping) { + Block &src_entry = orig_task.getBody().front(); + unsigned src_idx = 0; + unsigned read_count = orig_task.getReadMemrefs().size(); + unsigned write_count = orig_task.getWriteMemrefs().size(); + + for (unsigned i = 0; i < read_count; ++i) { + Value orig_memref = orig_task.getReadMemrefs()[i]; + auto it = llvm::find(merged_read_memrefs, orig_memref); + assert(it != merged_read_memrefs.end()); + unsigned fused_idx = std::distance(merged_read_memrefs.begin(), it); + mapping.map(src_entry.getArgument(src_idx + i), + fused_region.front().getArgument(fused_idx)); + } + src_idx += read_count; + + for (unsigned i = 0; i < write_count; ++i) { + Value orig_memref = orig_task.getWriteMemrefs()[i]; + auto it = llvm::find(merged_write_memrefs, orig_memref); + assert(it != merged_write_memrefs.end()); + unsigned fused_idx = merged_read_memrefs.size() + + std::distance(merged_write_memrefs.begin(), it); + mapping.map(src_entry.getArgument(src_idx + i), + fused_region.front().getArgument(fused_idx)); + } + src_idx += write_count; + + for (unsigned i = 0; i < orig_task.getValueInputs().size(); ++i) { + Value orig_val = orig_task.getValueInputs()[i]; + auto it = llvm::find(merged_value_inputs, orig_val); + assert(it != merged_value_inputs.end()); + unsigned fused_idx = merged_read_memrefs.size() + + 
merged_write_memrefs.size() + + std::distance(merged_value_inputs.begin(), it); + mapping.map(src_entry.getArgument(src_idx + i), + fused_region.front().getArgument(fused_idx)); + } + }; + + // Creates the fused task's entry block with merged block args. + Block *entry_block = new Block(); + fused_task.getBody().push_back(entry_block); + for (Value v : merged_read_memrefs) + entry_block->addArgument(v.getType(), fused_task.getLoc()); + for (Value v : merged_write_memrefs) + entry_block->addArgument(v.getType(), fused_task.getLoc()); + for (Value v : merged_value_inputs) + entry_block->addArgument(v.getType(), fused_task.getLoc()); + + // Clones non-yield ops from task_a and task_b into fused entry block. + IRMapping mapping_a; + buildTaskArgMapping(task_a, fused_task.getBody(), mapping_a); + + IRMapping mapping_b; + buildTaskArgMapping(task_b, fused_task.getBody(), mapping_b); + + // Clones all non-yield ops from task_a's body into the fused entry block. + { + OpBuilder ob = OpBuilder::atBlockEnd(entry_block); + for (Operation &op : task_a.getBody().front()) { + if (isa(&op)) continue; + ob.clone(op, mapping_a); + } + } + + // Clones all non-yield ops from task_b's body into the fused entry block. + { + OpBuilder ob = OpBuilder::atBlockEnd(entry_block); + for (Operation &op : task_b.getBody().front()) { + if (isa(&op)) continue; + ob.clone(op, mapping_b); + } + } + + // Identifies the two cloned kernels in the fused entry block. + neura::KernelOp cloned_kernel_a, cloned_kernel_b; + { + SmallVector fused_kernels; + fused_task.walk([&](neura::KernelOp k) { fused_kernels.push_back(k); }); + assert(fused_kernels.size() == 2 && + "[performFusion] expected exactly 2 cloned kernels"); + cloned_kernel_a = fused_kernels[0]; + cloned_kernel_b = fused_kernels[1]; + } + + // Merges the two cloned kernels into one fused kernel. 
+ SmallVector merged_kernel_inputs; + auto addKernelInputs = [&](neura::KernelOp kernel) { + for (Value inp : kernel.getInputs()) { + if (llvm::find(merged_kernel_inputs, inp) == + merged_kernel_inputs.end()) { + merged_kernel_inputs.push_back(inp); + } + } + }; + addKernelInputs(cloned_kernel_a); + addKernelInputs(cloned_kernel_b); + + // Concatenates iter_args from both kernels (kernel_a first, then kernel_b). + SmallVector merged_iter_args; + for (Value v : cloned_kernel_a.getIterArgsInit()) + merged_iter_args.push_back(v); + for (Value v : cloned_kernel_b.getIterArgsInit()) + merged_iter_args.push_back(v); + + // Concatenates result types from both kernels. + SmallVector merged_kernel_results; + for (Type t : cloned_kernel_a.getResultTypes()) + merged_kernel_results.push_back(t); + for (Type t : cloned_kernel_b.getResultTypes()) + merged_kernel_results.push_back(t); + + // Creates the fused kernel op right before cloned_kernel_a. + OpBuilder fused_kb(cloned_kernel_a); + auto fused_kernel = fused_kb.create( + task_a.getLoc(), merged_kernel_results, merged_kernel_inputs, + merged_iter_args, + /*cgra_id=*/nullptr, /*kernel_name=*/nullptr, + /*accelerator=*/builder.getStringAttr("neura")); + fused_kernel->setAttr("dataflow_mode", + builder.getStringAttr("predicate")); + + // Builds kernel entry block and block-arg mappings. + Region &fused_kernel_region = fused_kernel.getBody(); + Block *kernel_body = builder.createBlock(&fused_kernel_region); + for (Value v : merged_kernel_inputs) + kernel_body->addArgument(v.getType(), task_a.getLoc()); + for (Value v : merged_iter_args) + kernel_body->addArgument(v.getType(), task_a.getLoc()); + + // Maps each original kernel's block args to the fused kernel's block args. + // iter_offset tracks where this kernel's iter_args start in the merged list. 
+ auto buildKernelArgMapping = + [&](neura::KernelOp kernel, unsigned iter_offset) -> IRMapping { + IRMapping km; + Block &src_entry = kernel.getBody().front(); + unsigned src_idx = 0; + + // Maps kernel input args. + for (Value inp : kernel.getInputs()) { + auto it = llvm::find(merged_kernel_inputs, inp); + assert(it != merged_kernel_inputs.end()); + unsigned fused_idx = std::distance(merged_kernel_inputs.begin(), it); + km.map(src_entry.getArgument(src_idx), + kernel_body->getArgument(fused_idx)); + src_idx++; + } + + // Maps iter_args. + for (unsigned i = 0; i < kernel.getIterArgsInit().size(); ++i) { + km.map(src_entry.getArgument(src_idx + i), + kernel_body->getArgument( + merged_kernel_inputs.size() + iter_offset + i)); + } + + return km; + }; + + IRMapping kernel_mapping_a = buildKernelArgMapping( + cloned_kernel_a, 0); + IRMapping kernel_mapping_b = buildKernelArgMapping( + cloned_kernel_b, cloned_kernel_a.getIterArgsInit().size()); + + // Clones DFG ops from both kernels and creates the combined neura.yield. + { + OpBuilder kb = OpBuilder::atBlockEnd(kernel_body); + for (auto &op : cloned_kernel_a.getBody().front().getOperations()) { + if (isa(&op)) continue; + kb.clone(op, kernel_mapping_a); + } + for (auto &op : cloned_kernel_b.getBody().front().getOperations()) { + if (isa(&op)) continue; + kb.clone(op, kernel_mapping_b); + } + + // Collects yield operands from both kernels' original yields. 
+ SmallVector merged_iter_args_next; + SmallVector merged_results; + if (auto yield_a = dyn_cast( + cloned_kernel_a.getBody().front().getTerminator())) { + for (Value v : yield_a.getIterArgsNext()) + merged_iter_args_next.push_back( + kernel_mapping_a.lookupOrDefault(v)); + for (Value v : yield_a.getResults()) + merged_results.push_back(kernel_mapping_a.lookupOrDefault(v)); + } + if (auto yield_b = dyn_cast( + cloned_kernel_b.getBody().front().getTerminator())) { + for (Value v : yield_b.getIterArgsNext()) + merged_iter_args_next.push_back( + kernel_mapping_b.lookupOrDefault(v)); + for (Value v : yield_b.getResults()) + merged_results.push_back(kernel_mapping_b.lookupOrDefault(v)); + } + + // Creates the combined neura.yield and preserves yield_type from kernel_a. + auto fused_yield = kb.create( + task_a.getLoc(), merged_iter_args_next, merged_results); + if (auto yield_a = dyn_cast( + cloned_kernel_a.getBody().front().getTerminator())) { + if (auto attr = yield_a->getAttr("yield_type")) + fused_yield->setAttr("yield_type", attr); + } + } + + // Replaces uses of cloned kernels with fused kernel results, then erases. + { + unsigned result_idx = 0; + for (unsigned i = 0; i < cloned_kernel_a.getNumResults(); ++i) { + cloned_kernel_a.getResult(i).replaceAllUsesWith( + fused_kernel.getResult(result_idx++)); + } + for (unsigned i = 0; i < cloned_kernel_b.getNumResults(); ++i) { + cloned_kernel_b.getResult(i).replaceAllUsesWith( + fused_kernel.getResult(result_idx++)); + } + cloned_kernel_a.erase(); + cloned_kernel_b.erase(); + } + + // Builds and inserts the merged taskflow.yield. + { + // Writes outputs pass through the entry block's write-memref args. + SmallVector yield_writes; + for (size_t i = 0; i < merged_write_memrefs.size(); ++i) { + yield_writes.push_back( + entry_block->getArgument(merged_read_memrefs.size() + i)); + } + + // Value outputs come from the fused kernel's results. 
+ SmallVector yield_values; + unsigned val_idx = 0; + for (unsigned i = 0; i < task_a.getValueOutputs().size(); ++i) + yield_values.push_back(fused_kernel.getResult(val_idx++)); + for (unsigned i = 0; i < task_b.getValueOutputs().size(); ++i) + yield_values.push_back(fused_kernel.getResult(val_idx++)); + + // Erases auto-inserted yield and creates the merged one. + if (!entry_block->empty()) { + if (auto existing_yield = dyn_cast( + entry_block->back())) { + existing_yield.erase(); + } + } + OpBuilder tb = OpBuilder::atBlockEnd(entry_block); + tb.create(fused_task.getLoc(), yield_writes, + yield_values); + } + + // Step 6: Sets fused trip_count (max of both independent tasks). + int64_t fused_trip = std::max(node_a->trip_count, node_b->trip_count); + fused_task->setAttr("trip_count", + OpBuilder(fused_task).getI64IntegerAttr(fused_trip)); + + // Profiles the fused task to obtain its compiled_ii and steps. + { + TaskGraphNode fused_node(/*id=*/0, fused_task); + fused_node.trip_count = fused_trip; + profile_fn(&fused_node, fused_task); + fused_task->setAttr("steps", + OpBuilder(fused_task).getI64IntegerAttr(fused_node.steps)); + fused_task->setAttr("compiled_ii", + OpBuilder(fused_task).getI64IntegerAttr(fused_node.ii)); + } + + + // Step 7: Replaces uses of original tasks' results. + // Value outputs are ordered: task_a's value outputs first, then task_b's. + unsigned val_offset_a = 0; + unsigned val_offset_b = task_a.getValueOutputs().size(); + replaceTaskResults(task_a, fused_task, merged_write_memrefs, val_offset_a); + replaceTaskResults(task_b, fused_task, merged_write_memrefs, val_offset_b); + + // Step 8: Erases original tasks. + // Verifies no remaining uses before erasing. 
+ auto verifyNoUses = [](TaskflowTaskOp task, StringRef label) { + for (Value result : task->getResults()) { + if (!result.use_empty()) { + llvm::errs() << "[performFusion] ERROR: " << label + << " result #" << result.cast().getResultNumber() + << " still has uses:\n"; + for (auto &use : result.getUses()) { + llvm::errs() << " used by: "; + use.getOwner()->print(llvm::errs()); + llvm::errs() << "\n"; + } + } + } + }; + verifyNoUses(task_a, "task_a"); + verifyNoUses(task_b, "task_b"); + task_a.erase(); + task_b.erase(); + + return true; + } + + // Finds the index of a value in a list. + unsigned findOperandIndex(const SmallVector &list, Value v) { + for (unsigned i = 0; i < list.size(); ++i) { + if (list[i] == v) return i; + } + llvm_unreachable("Value not found in operand list"); + } + + // Replaces results of an original task with corresponding results from the + // fused task. Handles both write outputs (memrefs) and value outputs + // (reductions, iter_args). + void replaceTaskResults(TaskflowTaskOp orig_task, TaskflowTaskOp fused_task, + const SmallVector &merged_write_memrefs, + unsigned value_output_offset) { + // Writes outputs: maps by matching the original write memref to its + // position in the merged write memrefs list. + for (unsigned i = 0; i < orig_task.getWriteOutputs().size(); ++i) { + Value orig_result = orig_task.getWriteOutputs()[i]; + Value orig_write = orig_task.getWriteMemrefs()[i]; + unsigned fused_idx = findOperandIndex(merged_write_memrefs, orig_write); + orig_result.replaceAllUsesWith(fused_task.getWriteOutputs()[fused_idx]); + } + // Value outputs: each original task's value_output[i] maps to + // fused_task.getValueOutputs()[value_output_offset + i]. 
+ for (unsigned i = 0; i < orig_task.getValueOutputs().size(); ++i) { + Value orig_val = orig_task.getValueOutputs()[i]; + orig_val.replaceAllUsesWith( + fused_task.getValueOutputs()[value_output_offset + i]); + } + } +}; + +//===----------------------------------------------------------------------===// +// Pass Definition +//===----------------------------------------------------------------------===// + +struct ResourceAwareTaskOptimizationPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( + ResourceAwareTaskOptimizationPass) + + ResourceAwareTaskOptimizationPass() = default; + ResourceAwareTaskOptimizationPass(const ResourceAwareTaskOptimizationPass &other) + : PassWrapper(other) {} + + StringRef getArgument() const override { + return "resource-aware-task-optimization"; + } + + StringRef getDescription() const override { + return "Balances pipeline latency and fuses independent tasks for CGRA " + "utilization"; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + } + + // Estimation mode for profiling task II / steps. + // "compiled" (default): runs the full Neura lowering + mapping pipeline + // to obtain accurate compiled_ii and steps from MapToAcceleratorPass. + // "analytical": uses only ResMII / RecMII analytical estimates without + // running the mapper. Much faster but less accurate — useful for + // rapid design-space exploration or when the mapper is unavailable. + Option estimationMode{ + *this, "estimation-mode", + llvm::cl::desc( + "Profiling estimation mode: 'compiled' (default) runs the full " + "Neura lowering + mapping pipeline for accurate II/steps; " + "'analytical' uses only ResMII/RecMII analytical estimates " + "(faster but less accurate)."), + llvm::cl::init("compiled")}; + + // Controls whether the balance phase skips the mapper during speculative + // profiling. 
Default is true (analytical-only) for speed — the mapper can + // backtrack indefinitely on larger tile arrays. Set to false to run the + // real mapper during balance probes for accurate compiled_ii at the cost + // of longer compile times. + Option balanceSkipMapper{ + *this, "balance-skip-mapper", + llvm::cl::desc( + "Whether balance probes skip the mapper and use only analytical " + "ResMII/RecMII estimates (default: true). Set to false for " + "accurate compiled_ii during balance at the cost of compile time."), + llvm::cl::init(true)}; + + void runOnOperation() override { + func::FuncOp func = getOperation(); + + bool use_analytical = (estimationMode.getValue() == "analytical"); + + llvm::errs() << "=== ResourceAwareTaskOptimization on " + << func.getName() + << " (estimation-mode=" << estimationMode.getValue() + << ") ===\n"; + + constexpr int kMaxOuterIterations = 10; + + for (int outer = 0; outer < kMaxOuterIterations; ++outer) { + // Rebuilds graph from current IR state. + TaskDependencyGraph graph; + graph.build(func, use_analytical); + + if (graph.nodes.empty()) { + return; + } + + int num_tasks = graph.nodes.size(); + + // Asserts that initial tasks fit in the grid. + assert(num_tasks <= kTotalCGRAs && + "Number of tasks exceeds 4x4 CGRA grid capacity! " + "Reduce task count via streaming fusion or increase grid size."); + + llvm::errs() << "[ResourceAware] Iteration " << outer << ": " + << num_tasks << " tasks\n"; + for (auto &node : graph.nodes) { + llvm::errs() << " Task " << node->id << " (" + << node->op.getTaskName() << "): trip_count=" + << node->trip_count << ", cgra_count=" << node->cgra_count + << ", est_latency=" << node->estimatedLatency() << "\n"; + } + + // Phase 1: Utilization Fusion. + // Fuses independent tasks to free up CGRA budget for balance. + UtilizationFuser fuser; + // Exposes TaskDependencyGraph::profileTask to UtilizationFuser via a + // lambda so fused tasks get real profiling. 
In analytical mode, the + // mapper is skipped entirely (only ResMII/RecMII estimates are used). + auto profile_fn = [&graph, use_analytical](TaskGraphNode *node, + TaskflowTaskOp task) { + graph.profileTaskPublic(node, task, /*skip_mapper=*/use_analytical); + }; + bool fuse_changed = fuser.fuse(func, graph, profile_fn); + + llvm::errs() << "[ResourceAware] After fusion: total_cgras=" + << graph.getTotalAllocatedCGRAs() << "\n"; + + // Rebuilds graph after fusion (tasks may have been erased/created). + if (fuse_changed) { + graph = TaskDependencyGraph(); + graph.build(func, use_analytical); + } + + // Phase 2: Latency-Aware Pipeline Balance. + // Balance probes use analytical-only profiling by default. + bool balance_skip = use_analytical || balanceSkipMapper.getValue(); + auto balance_profile_fn = [&graph, balance_skip](TaskGraphNode *node, + TaskflowTaskOp task) { + graph.profileTaskPublic(node, task, /*skip_mapper=*/balance_skip); + }; + PipelineBalancer balancer; + bool balance_changed = balancer.balance(graph, balance_profile_fn); + + // Writes back attributes so the next iteration sees them. 
+ if (balance_changed || fuse_changed) { + for (auto &node : graph.nodes) { + OpBuilder b(node->op); + node->op->setAttr( + "cgra_count", b.getI32IntegerAttr(node->cgra_count)); + if (node->ii != kUnprofiled) { + node->op->setAttr("compiled_ii", b.getI32IntegerAttr(node->ii)); + } + if (node->steps != kUnprofiled) { + node->op->setAttr("steps", b.getI32IntegerAttr(node->steps)); + } + if (node->trip_count > 0) { + node->op->setAttr("trip_count", + b.getI32IntegerAttr(node->trip_count)); + } + if (balance_changed && node->cgra_count > 1) { + llvm::errs() << " [Balance] " << node->op.getTaskName() + << " -> cgra_count=" << node->cgra_count + << ", est_latency=" << node->estimatedLatency() + << "\n"; + } + } + } + + llvm::errs() << "[ResourceAware] After balance: total_cgras=" + << graph.getTotalAllocatedCGRAs() << "\n"; + + if (!balance_changed && !fuse_changed) { + // Converged — writes ALL attributes (cgra_count, ii, steps) to IR + // for every task. Non-fused tasks only got cgra_count written during + // intermediate iterations; ii, steps, and trip_count live only in the + // graph node and must be persisted here. + // + // Note: no re-profiling is done here. When balance-skip-mapper=true + // (the default), the balance phase uses analytical estimates; those + // are the values written to the final IR. When + // balance-skip-mapper=false, the balance phase already ran the real + // mapper for each speculative probe, so the graph already contains + // accurate compiled_ii / steps values. Either way, the converged + // graph state is authoritative and written directly to IR. 
+ + for (auto &node : graph.nodes) { + OpBuilder b(node->op); + node->shape = pickBestShape(node->cgra_count); + node->op->setAttr("cgra_count", + b.getI32IntegerAttr(node->cgra_count)); + node->op->setAttr("compiled_ii", + b.getI32IntegerAttr(node->ii)); + node->op->setAttr("steps", + b.getI32IntegerAttr(node->steps)); + node->op->setAttr("trip_count", + b.getI32IntegerAttr(node->trip_count)); + // Writes tile_shape attribute: simple "NxM" bounding-box string. + // The detailed occupancy diagram is printed in the summary below. + std::string shape_str = node->shape.irAttr(); + node->op->setAttr("tile_shape", b.getStringAttr(shape_str)); + } + break; + } + } + + // Performs final validation and tile occupation summary with visual 4x4 grid. + { + TaskDependencyGraph final_graph; + final_graph.build(func, use_analytical); + int final_total = final_graph.getTotalAllocatedCGRAs(); + + // Assigns each task a single character label for the combined grid. + // Tasks are labelled '0','1','2',... ; free cells shown as '.'. + // grid[row][col] == -1 means free. + std::vector> combined_grid( + kCgraGridRows, std::vector(kCgraGridCols, -1)); + + // Packs tasks onto the grid left-to-right, top-to-bottom. + int next_col = 0, next_row = 0; + int task_idx = 0; + + llvm::errs() << "\n=== Tile Occupation Summary (4x" << kCgraGridCols + << " CGRA Grid) ===\n"; + + for (auto &node : final_graph.nodes) { + auto shape = pickBestShape(node->cgra_count); + int tile_rows = shape.rows * neura::getArchitecture().getPerCgraRows(); + int tile_cols = shape.cols * neura::getArchitecture().getPerCgraColumns(); + + // Per-task grid (shape.rows x shape.cols bbox, filled up to cgra_count). 
+ llvm::errs() << "\n [" << task_idx << "] " << node->op.getTaskName() + << " cgra_count=" << node->cgra_count + << " shape=" << shape.describe(node->cgra_count) + << " tile_array=" << tile_rows << "x" << tile_cols + << " ii=" << node->ii + << " steps=" << node->steps + << " trip_count=" << node->trip_count << "\n"; + + // Draws a per-task bounding-box grid (shape.rows x shape.cols). + int remaining = node->cgra_count; + llvm::errs() << " +" ; + for (int c = 0; c < shape.cols; ++c) llvm::errs() << "---+"; + llvm::errs() << "\n"; + for (int r = 0; r < shape.rows; ++r) { + llvm::errs() << " |"; + for (int c = 0; c < shape.cols; ++c) { + if (remaining > 0) { + llvm::errs() << " # |"; + --remaining; + } else { + llvm::errs() << " |"; + } + } + llvm::errs() << "\n"; + llvm::errs() << " +"; + for (int c = 0; c < shape.cols; ++c) llvm::errs() << "---+"; + llvm::errs() << "\n"; + } + + // Places onto combined grid (pack sequentially). + int placed = 0; + for (int r = next_row; r < kCgraGridRows && placed < node->cgra_count; ++r) { + for (int c = (r == next_row ? next_col : 0); + c < kCgraGridCols && placed < node->cgra_count; ++c) { + combined_grid[r][c] = task_idx; + next_row = r; + next_col = c + 1; + if (next_col >= kCgraGridCols) { next_col = 0; next_row = r + 1; } + ++placed; + } + } + ++task_idx; + } + + // Prints combined 4xN grid. + llvm::errs() << "\n Combined 4x" << kCgraGridCols << " Grid" + << " (" << final_total << "/" << kTotalCGRAs << " used):\n"; + llvm::errs() << " +"; + for (int c = 0; c < kCgraGridCols; ++c) llvm::errs() << "---+"; + llvm::errs() << "\n"; + for (int r = 0; r < kCgraGridRows; ++r) { + llvm::errs() << " |"; + for (int c = 0; c < kCgraGridCols; ++c) { + int t = combined_grid[r][c]; + if (t < 0) + llvm::errs() << " . 
|"; + else + llvm::errs() << " " << (char)('0' + t) << " |"; + } + llvm::errs() << "\n"; + llvm::errs() << " +"; + for (int c = 0; c < kCgraGridCols; ++c) llvm::errs() << "---+"; + llvm::errs() << "\n"; + } + llvm::errs() << " (" << (kTotalCGRAs - final_total) << " free)\n"; + llvm::errs() << "================================================\n"; + + llvm::errs() << "[ResourceAware] Final: " << final_graph.nodes.size() + << " tasks, " << final_total << " CGRAs\n"; + assert(final_total <= kTotalCGRAs && + "Total CGRA allocation exceeds 4x4 grid after optimization!"); + } + } +}; + +} // namespace + +//===----------------------------------------------------------------------===// +// Pass Registration +//===----------------------------------------------------------------------===// + +std::unique_ptr +mlir::taskflow::createResourceAwareTaskOptimizationPass() { + return std::make_unique(); +} diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 2a4eb496..364bcadc 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -33,6 +33,32 @@ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --affine-loop-perfection \ +// RUN: --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: 
--leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: '--resource-aware-task-optimization=balance-skip-mapper=false' \ +// RUN: --architecture-spec=%S/../../../arch_spec/architecture_with_counter.yaml \ +// RUN: -o %t.resopt.mlir +// RUN: FileCheck %s --input-file=%t.resopt.mlir --check-prefixes=RESOPT + #set = affine_set<(d0, d1) : (d0 - 3 == 0, d1 - 7 == 0)> module attributes {} { func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { @@ -355,3 +381,20 @@ module attributes {} { // PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locations = [], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]} // PLACEMENT: taskflow.task @Task_2 // PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]} + +// CGRA Tile Occupation after RESOPT (4x4 grid, col x row): +// +---+---+---+---+ +// | 0 | 1 | . | . | Task_0_Task_1_utilfused (1x1, cgra_count=1) +// +---+---+---+---+ Task_2 (1x1, cgra_count=1) +// | . | . | . | . | +// +---+---+---+---+ +// | . | . | . | . | +// +---+---+---+---+ +// | . | . | . | . 
| +// +---+---+---+---+ +// 0=Task_0_Task_1_utilfused, 1=Task_2; 2/16 CGRAs used + +// RESOPT: taskflow.task @Task_0_Task_1_utilfused +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 3 : i32, steps = 5 : i32, tile_shape = "1x1", trip_count = 32 : i32} +// RESOPT: taskflow.task @Task_2 +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 7 : i32, tile_shape = "1x1", trip_count = 32 : i32} diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index bcdbbe86..42f99361 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -18,6 +18,32 @@ // RUN: -o %t.stream.mlir // RUN: FileCheck %s --input-file=%t.stream.mlir --check-prefixes=STREAM +// RUN: mlir-neura-opt %t.stream.mlir \ +// RUN: --affine-loop-tree-serialization \ +// RUN: --affine-loop-perfection \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: '--resource-aware-task-optimization=balance-skip-mapper=false' \ +// RUN: --architecture-spec=%S/../../../arch_spec/architecture_with_counter.yaml \ +// RUN: -o %t.resopt.mlir +// RUN: FileCheck %s --input-file=%t.resopt.mlir --check-prefixes=RESOPT + // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ @@ -505,4 +531,24 @@ module attributes {} { 
// PLACEMENT: taskflow.task @Task_3 // PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 1 : i32}]} // PLACEMENT: taskflow.task @Task_4 -// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}, {col = 2 : i32, row = 1 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]} \ No newline at end of file +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}, {col = 2 : i32, row = 1 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]} + +// RESOPT: taskflow.task @Task_1 +// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x1", trip_count = 160 : i32 +// RESOPT: taskflow.task @Task_0_Task_2_fused_Task_3_utilfused +// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 5 : i32, tile_shape = "1x1", trip_count = 192 : i32 +// RESOPT: taskflow.task @Task_4 +// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x1", trip_count = 36 : i32 +// RESOPT: return + +// CGRA Tile Occupation after RESOPT (4x4 grid, col x row): +// +---+---+---+---+ +// | 0 | 1 | 2 | . | Task_1 (1x1, cgra_count=1) +// +---+---+---+---+ Task_0_Task_2_fused_Task_3_utilfused (1x1, cgra_count=1) +// | . | . | . | . | Task_4 (1x1, cgra_count=1) +// +---+---+---+---+ +// | . | . | . | . | +// +---+---+---+---+ +// | . | . | . | . 
| +// +---+---+---+---+ +// 0=Task_1, 1=Task_0_Task_2_fused_Task_3_utilfused, 2=Task_4; 3/16 CGRAs used \ No newline at end of file diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index 6c0cd57b..3d63f767 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -7,6 +7,32 @@ // RUN: -o %t.taskflow.mlir // RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --affine-loop-perfection \ +// RUN: --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: '--resource-aware-task-optimization=balance-skip-mapper=false' \ +// RUN: --architecture-spec=%S/../../../arch_spec/architecture_with_counter.yaml \ +// RUN: -o %t.resopt.mlir +// RUN: FileCheck %s --input-file=%t.resopt.mlir --check-prefixes=RESOPT + // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ @@ -132,4 +158,20 @@ module { // PLACEMENT: taskflow.task @Task_0 // PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locations = [{col = 0 : i32, row = 0 : i32}], write_sram_locations = [{col = 0 : i32, row = 0 : i32}]} // 
PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 0 : i32}]} \ No newline at end of file +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 0 : i32}]} + +// RESOPT: taskflow.task @Task_0_Task_1_utilfused +// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x1", trip_count = 64 : i32 +// RESOPT: return + +// CGRA Tile Occupation after RESOPT (4x4 grid, col x row): +// +---+---+---+---+ +// | 0 | . | . | . | row=0: Task_0_Task_1_utilfused (1x1, cgra_count=1) +// +---+---+---+---+ +// | . | . | . | . | +// +---+---+---+---+ +// | . | . | . | . | +// +---+---+---+---+ +// | . | . | . | . 
| +// +---+---+---+---+ +// 0=Task_0_Task_1_utilfused; 1/16 CGRAs used \ No newline at end of file diff --git a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir index 83dcb02a..f1741b0a 100644 --- a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir +++ b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir @@ -15,6 +15,33 @@ // RUN: -o %t.stream.mlir // RUN: FileCheck %s --input-file=%t.stream.mlir --check-prefixes=STREAM +// RUN: mlir-neura-opt %t.stream.mlir \ +// RUN: --affine-loop-tree-serialization \ +// RUN: --affine-loop-perfection \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: '--resource-aware-task-optimization=balance-skip-mapper=false' \ +// RUN: --architecture-spec=%S/../../../arch_spec/architecture_with_counter.yaml \ +// RUN: -o %t.resopt.mlir +// RUN: FileCheck %s --input-file=%t.resopt.mlir --check-prefixes=RESOPT + + module attributes {torch.debug_module_name = "SimpleResNetBlock"} { func.func @forward(%arg0: tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> { %0 = "tosa.const"() <{value = dense<"0x7BEEA13C"> : tensor<64x64x3x3xf32>}> : () -> tensor<64x64x3x3xf32> @@ -675,3 +702,32 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: } + +// RESOPT: taskflow.task @Task_1_Task_0_Task_2_utilfused_utilfused +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 4 : 
i32, steps = 3 : i32, tile_shape = "1x1", trip_count = 6400 : i32} +// RESOPT: taskflow.task @Task_3 +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 6 : i32, tile_shape = "1x1", trip_count = 2359296 : i32} +// RESOPT: taskflow.task @Task_4_Task_5_fused_Task_7_utilfused +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 7 : i32, tile_shape = "1x1", trip_count = 6400 : i32} +// RESOPT: taskflow.task @Task_6_Task_8_utilfused +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 3 : i32, tile_shape = "1x1", trip_count = 4096 : i32} +// RESOPT: taskflow.task @Task_9 +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 6 : i32, tile_shape = "1x1", trip_count = 2359296 : i32} +// RESOPT: taskflow.task @Task_10_Task_11_Task_12_fused_fused +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 8 : i32, tile_shape = "1x1", trip_count = 4096 : i32} +// RESOPT: return + + +// CGRA Tile Occupation after RESOPT (4x4 grid, col x row): +// +---+---+---+---+ +// | 0 | 1 | 2 | 3 | row=0: 0=Task_1_..._utilfused, 1=Task_3, 2=Task_4_..._utilfused, 3=Task_6_Task_8_utilfused +// +---+---+---+---+ +// | 4 | 5 | . | . | row=1: 4=Task_9, 5=Task_10_..._fused_fused +// +---+---+---+---+ +// | . | . | . | . | +// +---+---+---+---+ +// | . | . | . | . 
| +// +---+---+---+---+ +// 0=Task_1_Task_0_Task_2_utilfused_utilfused, 1=Task_3, 2=Task_4_Task_5_fused_Task_7_utilfused +// 3=Task_6_Task_8_utilfused, 4=Task_9, 5=Task_10_Task_11_Task_12_fused_fused +// 6/16 CGRAs used \ No newline at end of file diff --git a/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir b/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir new file mode 100644 index 00000000..ffc37f2d --- /dev/null +++ b/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir @@ -0,0 +1,208 @@ +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: -o %t.serialized.mlir +// RUN: FileCheck %s --input-file=%t.serialized.mlir --check-prefixes=SERIALIZED + +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ +// RUN: -o %t.taskflow.mlir +// RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW + +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --affine-loop-perfection \ +// RUN: --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: '--resource-aware-task-optimization' \ +// RUN: --architecture-spec=%S/../../../arch_spec/architecture_with_counter.yaml \ +// RUN: -o %t.resopt.mlir +// RUN: FileCheck %s --input-file=%t.resopt.mlir --check-prefixes=RESOPT + +module { + // Example: Stereo image disparity preprocessing — a real computer vision kernel. 
+ // + // This models a common pattern in stereo vision pipelines where for each pixel + // pair from left/right cameras, we compute multiple cost metrics (SAD variants) + // plus gradient-based features. The heavy per-pixel compute makes res_mii > 1 + // on a single 4×4 CGRA (16 tiles), forcing the balance pass to allocate + // multiple CGRAs. + // + // Task 0 (heavy): Stereo cost computation — per pixel computes: + // - Left/right scale+bias normalization (6 channels: R,G,B × 2 views) + // - Channel-wise absolute differences + // - Weighted sum of absolute differences + // - Gradient features (horizontal differences) + // Total: ~40+ materialized Neura ops → res_mii=3 on 16 tiles. + // On 32 tiles (2 CGRAs), res_mii drops to 2, enabling II reduction. + // + // Task 1 (light): Simple post-processing (few ops, res_mii=1). + func.func @stereo_cost_computation( + %L_R: memref<64xf32>, %L_G: memref<64xf32>, %L_B: memref<64xf32>, + %R_R: memref<64xf32>, %R_G: memref<64xf32>, %R_B: memref<64xf32>, + %cost: memref<64xf32>, %grad: memref<64xf32>, + %w1: f32, %w2: f32, %w3: f32, + %scale: f32, %bias: f32, + %aux_in: memref<64xf32>, %aux_out: memref<64xf32>) { + + // Task 0: Stereo matching cost with multi-feature extraction + affine.for %i = 0 to 64 { + // Load left view RGB + %lr = affine.load %L_R[%i] : memref<64xf32> + %lg = affine.load %L_G[%i] : memref<64xf32> + %lb = affine.load %L_B[%i] : memref<64xf32> + + // Load right view RGB + %rr = affine.load %R_R[%i] : memref<64xf32> + %rg = affine.load %R_G[%i] : memref<64xf32> + %rb = affine.load %R_B[%i] : memref<64xf32> + + // Normalize left: l_ch = L_ch * scale + bias (6 ops: 3 fmul + 3 fadd) + %lr_s = arith.mulf %lr, %scale : f32 + %lr_n = arith.addf %lr_s, %bias : f32 + %lg_s = arith.mulf %lg, %scale : f32 + %lg_n = arith.addf %lg_s, %bias : f32 + %lb_s = arith.mulf %lb, %scale : f32 + %lb_n = arith.addf %lb_s, %bias : f32 + + // Normalize right: r_ch = R_ch * scale + bias (6 ops: 3 fmul + 3 fadd) + %rr_s = arith.mulf 
%rr, %scale : f32 + %rr_n = arith.addf %rr_s, %bias : f32 + %rg_s = arith.mulf %rg, %scale : f32 + %rg_n = arith.addf %rg_s, %bias : f32 + %rb_s = arith.mulf %rb, %scale : f32 + %rb_n = arith.addf %rb_s, %bias : f32 + + // Per-channel differences (3 ops: 3 fsub) + %dr = arith.subf %lr_n, %rr_n : f32 + %dg = arith.subf %lg_n, %rg_n : f32 + %db = arith.subf %lb_n, %rb_n : f32 + + // Squared differences for SSD cost (3 ops: 3 fmul) + %dr2 = arith.mulf %dr, %dr : f32 + %dg2 = arith.mulf %dg, %dg : f32 + %db2 = arith.mulf %db, %db : f32 + + // Weighted SSD: cost = w1*dr² + w2*dg² + w3*db² (5 ops: 3 fmul + 2 fadd) + %wdr = arith.mulf %dr2, %w1 : f32 + %wdg = arith.mulf %dg2, %w2 : f32 + %wdb = arith.mulf %db2, %w3 : f32 + %sum_rg = arith.addf %wdr, %wdg : f32 + %cost_val = arith.addf %sum_rg, %wdb : f32 + + // Gradient feature: horizontal gradient = (lr-lb)*w1 + (rr-rb)*w2 + // (4 ops: 2 fsub + 2 fmul) + %gl = arith.subf %lr_n, %lb_n : f32 + %gr = arith.subf %rr_n, %rb_n : f32 + %gls = arith.mulf %gl, %w1 : f32 + %grs = arith.mulf %gr, %w2 : f32 + + // Combined gradient (1 op: 1 fadd) + %grad_val = arith.addf %gls, %grs : f32 + + // Store results + affine.store %cost_val, %cost[%i] : memref<64xf32> + affine.store %grad_val, %grad[%i] : memref<64xf32> + } + + // Task 1: Simple post-processing — bias addition (light, stays on 1 CGRA) + affine.for %j = 0 to 64 { + %a = affine.load %aux_in[%j] : memref<64xf32> + %b2 = arith.addf %a, %bias : f32 + affine.store %b2, %aux_out[%j] : memref<64xf32> + } + + return + } +} + +// SERIALIZED: module { +// SERIALIZED-NEXT: func.func @stereo_cost_computation(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>, %arg3: memref<64xf32>, %arg4: memref<64xf32>, %arg5: memref<64xf32>, %arg6: memref<64xf32>, %arg7: memref<64xf32>, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: memref<64xf32>, %arg14: memref<64xf32>) { +// SERIALIZED-NEXT: affine.for %arg15 = 0 to 64 { +// SERIALIZED-NEXT: %0 = affine.load 
%arg0[%arg15] : memref<64xf32> +// SERIALIZED-NEXT: %1 = affine.load %arg1[%arg15] : memref<64xf32> +// SERIALIZED-NEXT: %2 = affine.load %arg2[%arg15] : memref<64xf32> +// SERIALIZED-NEXT: %3 = affine.load %arg3[%arg15] : memref<64xf32> +// SERIALIZED-NEXT: %4 = affine.load %arg4[%arg15] : memref<64xf32> +// SERIALIZED-NEXT: %5 = affine.load %arg5[%arg15] : memref<64xf32> +// SERIALIZED-NEXT: %6 = arith.mulf %0, %arg11 : f32 +// SERIALIZED-NEXT: %7 = arith.addf %6, %arg12 : f32 +// SERIALIZED-NEXT: %8 = arith.mulf %1, %arg11 : f32 +// SERIALIZED-NEXT: %9 = arith.addf %8, %arg12 : f32 +// SERIALIZED-NEXT: %10 = arith.mulf %2, %arg11 : f32 +// SERIALIZED-NEXT: %11 = arith.addf %10, %arg12 : f32 +// SERIALIZED-NEXT: %12 = arith.mulf %3, %arg11 : f32 +// SERIALIZED-NEXT: %13 = arith.addf %12, %arg12 : f32 +// SERIALIZED-NEXT: %14 = arith.mulf %4, %arg11 : f32 +// SERIALIZED-NEXT: %15 = arith.addf %14, %arg12 : f32 +// SERIALIZED-NEXT: %16 = arith.mulf %5, %arg11 : f32 +// SERIALIZED-NEXT: %17 = arith.addf %16, %arg12 : f32 +// SERIALIZED-NEXT: %18 = arith.subf %7, %13 : f32 +// SERIALIZED-NEXT: %19 = arith.subf %9, %15 : f32 +// SERIALIZED-NEXT: %20 = arith.subf %11, %17 : f32 +// SERIALIZED-NEXT: %21 = arith.mulf %18, %18 : f32 +// SERIALIZED-NEXT: %22 = arith.mulf %19, %19 : f32 +// SERIALIZED-NEXT: %23 = arith.mulf %20, %20 : f32 +// SERIALIZED-NEXT: %24 = arith.mulf %21, %arg8 : f32 +// SERIALIZED-NEXT: %25 = arith.mulf %22, %arg9 : f32 +// SERIALIZED-NEXT: %26 = arith.mulf %23, %arg10 : f32 +// SERIALIZED-NEXT: %27 = arith.addf %24, %25 : f32 +// SERIALIZED-NEXT: %28 = arith.addf %27, %26 : f32 +// SERIALIZED-NEXT: %29 = arith.subf %7, %11 : f32 +// SERIALIZED-NEXT: %30 = arith.subf %13, %17 : f32 +// SERIALIZED-NEXT: %31 = arith.mulf %29, %arg8 : f32 +// SERIALIZED-NEXT: %32 = arith.mulf %30, %arg9 : f32 +// SERIALIZED-NEXT: %33 = arith.addf %31, %32 : f32 +// SERIALIZED-NEXT: affine.store %28, %arg6[%arg15] : memref<64xf32> +// SERIALIZED-NEXT: affine.store 
%33, %arg7[%arg15] : memref<64xf32> +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg15 = 0 to 64 { +// SERIALIZED-NEXT: %0 = affine.load %arg13[%arg15] : memref<64xf32> +// SERIALIZED-NEXT: %1 = arith.addf %0, %arg12 : f32 +// SERIALIZED-NEXT: affine.store %1, %arg14[%arg15] : memref<64xf32> +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: return +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } + +// TASKFLOW: module { +// TASKFLOW-NEXT: func.func @stereo_cost_computation +// TASKFLOW: %write_outputs:2 = taskflow.task @Task_0 +// TASKFLOW: affine.for %arg28 = 0 to 64 { +// TASKFLOW: } +// TASKFLOW: taskflow.yield +// TASKFLOW: %write_outputs_0 = taskflow.task @Task_1 +// TASKFLOW: affine.for %arg18 = 0 to 64 { +// TASKFLOW: } +// TASKFLOW: taskflow.yield +// TASKFLOW: return + +// RESOPT: taskflow.task @Task_0_Task_1_utilfused +// RESOPT-SAME: {cgra_count = 3 : i32, compiled_ii = 1 : i32, steps = 10 : i32, tile_shape = "2x2[(0,0)(1,0)(0,1)]", trip_count = 64 : i32} +// RESOPT: return + +// CGRA Tile Occupation after RESOPT (4x4 grid, col x row): +// +---+---+---+---+ +// | 0 | 0 | . | . | row=0: Task_0_Task_1_utilfused occupies 3 CGRAs +// +---+---+---+---+ in a 2x2 non-rectangular layout: +// | 0 | . | . | . | (0,0), (1,0), (0,1) +// +---+---+---+---+ +// | . | . | . | . | Tile bounding box: 8x8 = 64 tiles, of which 3 CGRAs × 16 tiles = 48 are valid +// +---+---+---+---+ +// | . | . | . | . | res_mii=3 (16 tiles) → 2 (32 tiles) → 1 (48 tiles) +// +---+---+---+---+ +// 0=Task_0_Task_1_utilfused (cgra_count=3); 3/16 CGRAs used \ No newline at end of file