Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
06a625b
feat: Implement resource-aware task optimization pass with pipeline b…
guosran Feb 13, 2026
842e61e
refactor: reorder to fusion-first, update latency model to II*(tc-1)+…
guosran Feb 16, 2026
2292f26
refactor: remove steps, convert LLVM_DEBUG to llvm::errs()
guosran Feb 16, 2026
991c917
refactor: implement full slack analysis in findBottleneck
guosran Feb 16, 2026
476ab1e
feat: Implement ResourceAwareTaskOptimizationPass with pipeline balan…
guosran Feb 17, 2026
5becdb3
make cgra_count=1 explicit in IR output
guosran Feb 17, 2026
8ff000e
feat: ResourceAwareTaskOptimizationPass — critical review fixes & cov…
guosran Feb 17, 2026
588737d
removed excessive files
guosran Feb 17, 2026
963bf79
clean up: remove debug.log
guosran Feb 17, 2026
b5fa1a1
fix: restore Zeonica_Testbench submodule to main branch pointer
guosran Feb 17, 2026
fa9c4a2
fix: remove duplicate RESOPT test block and cache profiling attrs acr…
guosran Feb 17, 2026
37bf4bc
feat(resource-aware-opt): implement PR review fixes for multi-CGRA op…
guosran Feb 26, 2026
8c3c86b
fix(resource-aware-opt): prevent hyperblock assert on fused tasks, re…
guosran Feb 26, 2026
6e91448
Revert "fix(resource-aware-opt): prevent hyperblock assert on fused t…
guosran Feb 26, 2026
92f1214
fix(resource-aware-opt): restore multi-CGRA optimization and update t…
guosran Feb 26, 2026
38b1293
Fix comment formatting issues
guosran Feb 26, 2026
25ed5d5
Format comments correctly in ResourceAwareTaskOptimizationPass (Doxyg…
guosran Feb 26, 2026
49d870f
Update comment verbs to third-person singular (Builds, Runs, etc)
guosran Feb 26, 2026
d380586
Rename hasPath to hasDependency per review feedback
guosran Feb 26, 2026
57cb983
feat(resource-aware-opt): support value-output tasks in utilization f…
guosran Feb 26, 2026
d25095b
refactor: address PR review round 2 — shapes, naming, docs, namespace
guosran Feb 27, 2026
d50bd04
refactor(ResourceAwareTaskOptimizationPass): address reviewer feedback
guosran Feb 27, 2026
3f058e5
refactor: address PR review round 3 — naming, options, cleanup
guosran Feb 27, 2026
0b74d66
refactor: replace /// with // in comments
guosran Feb 28, 2026
d9aaa51
Fix IR corruption during resource-aware task optimization profiling
guosran Feb 28, 2026
a7e511f
refactor: implement kernel-level task profiling and document architec…
guosran Feb 28, 2026
67da762
refactor: resolve kernel-level fusion bottlenecks and clarify shape h…
guosran Feb 28, 2026
8cee68c
Refactor: remove affine-related logic from ResOpt pass and standardiz…
guosran Mar 1, 2026
8ee3421
fix(resource-aware-opt): restore affine/scf fallback in computeTripCo…
guosran Mar 1, 2026
099466f
fix: resolve crash in performFusion for multi-block task bodies
guosran Mar 2, 2026
c672e3a
fix(resopt): compute correct trip_count from post-CF-lowered IR; add …
guosran Mar 3, 2026
9257a5d
Fix multi-block fusion yield logic; chain control flows with llvm.br
guosran Mar 3, 2026
b610950
feat: resource-aware task optimization with balance-skip-mapper option
guosran Mar 3, 2026
36bdf2b
Add multi-CGRA resource-heavy test case and fix convergence re-profiling
guosran Mar 3, 2026
0207d82
remove excessive docs
guosran Mar 3, 2026
9744dd8
refactor: clean up redundant code, unify comment style, restore expla…
guosran Mar 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions include/NeuraDialect/Architecture/Architecture.h
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,23 @@ class Architecture {
// Checks if the architecture supports counter operations.
bool canSupportCounter() const;

// Clones the architecture but with new per-cgra dimensions.
// Arguments are ordered (rows, columns); every other stored configuration
// field (topologies, defaults, link overrides) is carried over unchanged.
// The provided tile_overrides will be appended to the existing ones, so
// later entries can refine or invalidate tiles in the enlarged grid.
//
// Example — create an 8-row × 4-column tile array (a 2×1 CGRA rectangle,
// rows×columns) with all tiles present:
// auto arch_2x1 = getArchitecture().cloneWithNewDimensions(8, 4);
//
// Example — create an 8-row × 12-column bounding box (12 tiles wide, 8
// tall) for a T-shape (4 CGRAs) where only specific tiles are valid:
// std::vector<TileOverride> overrides;
// // First mark all tiles as non-existent, then mark valid ones existent.
// // (see MapToAcceleratorPass for the full valid_tiles parsing logic)
// auto arch_T = getArchitecture().cloneWithNewDimensions(8, 12, overrides);
std::unique_ptr<Architecture> cloneWithNewDimensions(
int new_per_cgra_rows, int new_per_cgra_columns,
const std::vector<TileOverride> &additional_overrides = {}) const;

private:
// Helper methods for constructor initialization.
void initializeTiles(int rows, int columns);
Expand Down Expand Up @@ -532,6 +549,13 @@ class Architecture {
int per_cgra_rows_;
int per_cgra_columns_;
int max_ctrl_mem_items_;

BaseTopology multi_cgra_base_topology_;
BaseTopology per_cgra_base_topology_;
TileDefaults tile_defaults_;
std::vector<TileOverride> tile_overrides_;
LinkDefaults link_defaults_;
std::vector<LinkOverride> link_overrides_;
};

// Function for getting the architecture object.
Expand Down
5 changes: 4 additions & 1 deletion include/NeuraDialect/NeuraPasses.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ std::unique_ptr<mlir::Pass> createInsertCtrlMovPass();
std::unique_ptr<mlir::Pass> createAssignAcceleratorPass();
std::unique_ptr<mlir::Pass> createTransformCtrlToDataFlowPass();
std::unique_ptr<mlir::Pass> createLeveragePredicatedValuePass();
std::unique_ptr<mlir::Pass> createMapToAcceleratorPass();
// Creates the MapToAccelerator pass. Tile dimensions default to 0 (use
// architecture singleton) when not specified via options.
std::unique_ptr<mlir::Pass> createMapToAcceleratorPass(
const MapToAcceleratorOptions &options = MapToAcceleratorOptions{});
std::unique_ptr<mlir::Pass> createGenerateCodePass();
std::unique_ptr<mlir::Pass> createCanonicalizeReturnPass();
std::unique_ptr<mlir::Pass> createCanonicalizeLiveInPass();
Expand Down
34 changes: 34 additions & 0 deletions include/NeuraDialect/NeuraPasses.td
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,41 @@ def MapToAccelerator : Pass<"map-to-accelerator", "ModuleOp"> {
let summary = "Map Neura operations onto a given accelerator";
let description = [{
This pass performs mapping from Neura operations to accelerator.

x-tiles and y-tiles specify the **tile** dimensions of the target array
(not the CGRA count). Each CGRA contains a per_cgra_rows × per_cgra_cols
tile grid (currently 4×4). So for a single CGRA, x-tiles=4 y-tiles=4;
for a 1×2 rectangular pair, x-tiles=8 y-tiles=4; etc.

When x-tiles=0 and y-tiles=0 (the default), the global Architecture
singleton determines the tile grid — this is equivalent to a single CGRA.

Examples:
Single CGRA (default):
--map-to-accelerator
1×3 rectangular (3 CGRAs in a row):
--map-to-accelerator x-tiles=12 y-tiles=4
T-shape (4 CGRAs: top row of 3 + centre below):
--map-to-accelerator x-tiles=12 y-tiles=8 \
valid-tiles="0_0,1_0,2_0,3_0,4_0,5_0,6_0,7_0,8_0,9_0,10_0,11_0,\
4_1,5_1,6_1,7_1,4_4,5_4,6_4,7_4,4_5,5_5,6_5,7_5"
}];
let options = [
Option<"x_tiles", "x-tiles", "int", /*default=*/"0",
"Total number of tiles in the X dimension of the target array "
"(not the number of CGRAs). Each CGRA contributes per_cgra_cols "
"tiles. 0 means use the global Architecture singleton (1 CGRA).">,
Option<"y_tiles", "y-tiles", "int", /*default=*/"0",
"Total number of tiles in the Y dimension of the target array "
"(not the number of CGRAs). Each CGRA contributes per_cgra_rows "
"tiles. 0 means use the global Architecture singleton (1 CGRA).">,
Option<"valid_tiles", "valid-tiles", "std::string", /*default=*/"\"\"",
"Comma-separated list of tile coordinates (x_y) that are actually "
"present in the array, used for non-rectangular CGRA shapes such as "
"L-blocks or T-blocks. Empty string means all tiles in the "
"x-tiles x y-tiles rectangle are valid. "
"Example: 0_0,1_0,0_1 selects three tiles forming an L-shape.">
];
let constructor = "neura::createMapToAcceleratorPass()";
}

Expand Down
1 change: 1 addition & 0 deletions include/TaskflowDialect/TaskflowPasses.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ std::unique_ptr<mlir::Pass> createMapTaskOnCgraPass();
std::unique_ptr<mlir::Pass> createAffineLoopTreeSerializationPass();
std::unique_ptr<mlir::Pass> createAffineLoopPerfectionPass();
std::unique_ptr<mlir::Pass> createMemoryAccessStreamingFusionPass();
std::unique_ptr<mlir::Pass> createResourceAwareTaskOptimizationPass();

#define GEN_PASS_REGISTRATION
#include "TaskflowDialect/TaskflowPasses.h.inc"
Expand Down
44 changes: 44 additions & 0 deletions include/TaskflowDialect/TaskflowPasses.td
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,48 @@ def MemoryAccessStreamingFusion : Pass<"memory-access-streaming-fusion", "func::
}];
let constructor = "taskflow::createMemoryAccessStreamingFusionPass()";
}

def ResourceAwareTaskOptimization : Pass<"resource-aware-task-optimization", "func::FuncOp"> {
let summary = "Balances pipeline latency and fuses independent tasks for CGRA utilization.";
let description = [{
Two-phase optimization:
1. Utilization Fusion: merges independent (no-edge) tasks, selecting pairs
that minimize |trip_count_a - trip_count_b| for balanced utilization.
2. Pipeline Balance: allocates extra CGRAs to critical-path bottleneck tasks.
More CGRAs combine tile arrays into larger arrays for mapping, potentially
lowering compiled_ii. Latency model: II * (trip_count - 1) + steps.
Targets a 4x4 CGRA grid (16 CGRAs total, one CGRA per cell).
Currently a single task may be allocated at most 4 CGRAs.
Supported CGRA array shapes for a task (all fit within the 4×4 grid):
- rect : a perfect rectangle, e.g. 1×1, 1×2, 2×1, 1×3, 3×1, 2×2, 1×4, 4×1.
- L : an L-shaped block of 3 or 4 CGRAs, e.g.
3 CGRAs: (0,0)(1,0)(0,1) — two in a row + one below-left.
4 CGRAs: (0,0)(0,1)(0,2)(1,2) — three in a column + one offset.
- T : a T-shaped block of 4 CGRAs, e.g.
(0,0)(1,0)(2,0)(1,1) — three in a row + one below centre.
Non-rectangular shapes are represented by their bounding box plus an
explicit tile list that enumerates only the occupied CGRA positions.
Compiled_ii must come from the downstream Neura pipeline (asserts on failure).

Use --estimation-mode to control how task II/steps are estimated:
compiled (default): runs the full Neura lowering + mapping pipeline
for accurate compiled_ii and steps.
analytical : uses only ResMII/RecMII analytical estimates without
running the mapper — much faster but less accurate.
Useful for rapid design-space exploration.
Example:
--resource-aware-task-optimization estimation-mode=analytical
}];
let options = [
Option<"estimationMode", "estimation-mode", "std::string",
/*default=*/"\"compiled\"",
"Profiling estimation mode: 'compiled' (default) runs the full "
"Neura lowering + mapping pipeline; 'analytical' uses only "
"ResMII/RecMII analytical estimates (faster but less accurate).">
];
let constructor = "taskflow::createResourceAwareTaskOptimizationPass()";
let dependentDialects = [
"mlir::affine::AffineDialect",
"mlir::func::FuncDialect"];
}
#endif // TASKFLOW_PASSES_TD
72 changes: 71 additions & 1 deletion lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,74 @@ struct ArithIndexCastToNeuraCast
}
};

// Lowers arith.minimumf to a compare-and-select sequence on the neura
// dialect:
//   minimumf(a, b) → sel(fcmp(a, b, "olt"), a, b)
struct ArithMinimumFToNeuraFCmpSel
    : public OpRewritePattern<mlir::arith::MinimumFOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(arith::MinimumFOp op,
                                PatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    Type result_type = op.getType();
    Value lhs = op.getLhs();
    Value rhs = op.getRhs();

    // "olt" = ordered less-than: true when a < b (false if either is NaN).
    // NOTE(review): when either input is NaN the select yields b, whereas
    // arith.minimumf is specified to propagate NaN — confirm this lossy
    // lowering is acceptable for neura targets.
    // NOTE(review): the fcmp is created with the float result type rather
    // than an i1 — confirm neura::FCmpOp expects this convention.
    Value is_less = rewriter.create<neura::FCmpOp>(
        loc, result_type, lhs, rhs, rewriter.getStringAttr("olt"));
    rewriter.replaceOpWithNewOp<neura::SelOp>(op, result_type, is_less, lhs,
                                              rhs);
    return success();
  }
};

// Lowers arith.maximumf to a compare-and-select sequence on the neura
// dialect:
//   maximumf(a, b) → sel(fcmp(a, b, "ogt"), a, b)
struct ArithMaximumFToNeuraFCmpSel
    : public OpRewritePattern<mlir::arith::MaximumFOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(arith::MaximumFOp op,
                                PatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    Type result_type = op.getType();
    Value lhs = op.getLhs();
    Value rhs = op.getRhs();

    // "ogt" = ordered greater-than: true when a > b (false if either is NaN).
    // NOTE(review): when either input is NaN the select yields b, whereas
    // arith.maximumf is specified to propagate NaN — confirm this lossy
    // lowering is acceptable for neura targets.
    // NOTE(review): the fcmp is created with the float result type rather
    // than an i1 — confirm neura::FCmpOp expects this convention.
    Value is_greater = rewriter.create<neura::FCmpOp>(
        loc, result_type, lhs, rhs, rewriter.getStringAttr("ogt"));
    rewriter.replaceOpWithNewOp<neura::SelOp>(op, result_type, is_greater, lhs,
                                              rhs);
    return success();
  }
};

// arith.andi(a, b) → neura.and(a, b)
// One-to-one replacement: operands and result type are carried over verbatim.
struct ArithAndIToNeuraAnd : public OpRewritePattern<mlir::arith::AndIOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(arith::AndIOp op,
                                PatternRewriter &rewriter) const override {
    rewriter.replaceOpWithNewOp<neura::AndOp>(op, op.getType(), op.getLhs(),
                                              op.getRhs());
    return success();
  }
};

// arith.ori(a, b) → neura.or(a, b)
// One-to-one replacement: operands and result type are carried over verbatim.
struct ArithOrIToNeuraOr : public OpRewritePattern<mlir::arith::OrIOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(arith::OrIOp op,
                                PatternRewriter &rewriter) const override {
    rewriter.replaceOpWithNewOp<neura::OrOp>(op, op.getType(), op.getLhs(),
                                             op.getRhs());
    return success();
  }
};

struct LowerArithToNeuraPass
: public PassWrapper<LowerArithToNeuraPass, OperationPass<ModuleOp>> {

Expand All @@ -322,7 +390,9 @@ struct LowerArithToNeuraPass
ArithExtUIToNeuraCast, ArithIndexCastToNeuraCast,
ArithFDivToNeuraFDiv, ArithExtfToNeuraCast, ArithMulFToNeuraFMul,
ArithSubIToNeuraSub, ArithSubFToNeuraFSub, ArithMulIToNeuraMul,
ArithDivSIToNeuraDiv, ArithRemSIToNeuraOp>(context);
ArithDivSIToNeuraDiv, ArithRemSIToNeuraOp,
ArithMinimumFToNeuraFCmpSel, ArithMaximumFToNeuraFCmpSel,
ArithAndIToNeuraAnd, ArithOrIToNeuraOr>(context);
return patterns;
}

Expand Down
23 changes: 20 additions & 3 deletions lib/NeuraDialect/Architecture/Architecture.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -561,12 +561,15 @@ Architecture::Architecture(int multi_cgra_rows, int multi_cgra_columns,
const std::vector<LinkOverride> &link_overrides) {
this->multi_cgra_rows_ = multi_cgra_rows;
this->multi_cgra_columns_ = multi_cgra_columns;
// TODO: Support multi-CGRA topology in the future:
// https://github.com/coredac/dataflow/issues/163.
// this->multi_cgra_base_topology_ = multi_cgra_base_topology;
this->multi_cgra_base_topology_ = multi_cgra_base_topology;
this->per_cgra_rows_ = per_cgra_rows;
this->per_cgra_columns_ = per_cgra_columns;
this->per_cgra_base_topology_ = per_cgra_base_topology;
this->max_ctrl_mem_items_ = max_ctrl_mem_items;
this->tile_defaults_ = tile_defaults;
this->tile_overrides_ = tile_overrides;
this->link_defaults_ = link_defaults;
this->link_overrides_ = link_overrides;

// Initializes architecture components using helper methods.
initializeTiles(per_cgra_rows, per_cgra_columns);
Expand All @@ -576,6 +579,20 @@ Architecture::Architecture(int multi_cgra_rows, int multi_cgra_columns,
applyLinkOverrides(link_overrides);
}

std::unique_ptr<Architecture> Architecture::cloneWithNewDimensions(
int new_per_cgra_rows, int new_per_cgra_columns,
const std::vector<TileOverride> &additional_overrides) const {

std::vector<TileOverride> merged_overrides = tile_overrides_;
merged_overrides.insert(merged_overrides.end(), additional_overrides.begin(), additional_overrides.end());

return std::make_unique<Architecture>(
multi_cgra_rows_, multi_cgra_columns_, multi_cgra_base_topology_,
new_per_cgra_rows, new_per_cgra_columns, max_ctrl_mem_items_,
per_cgra_base_topology_, tile_defaults_, merged_overrides,
link_defaults_, link_overrides_);
}

Tile *Architecture::getTile(int id) {
auto it = id_to_tile_.find(id);
assert(it != id_to_tile_.end() && "Tile with given ID not found");
Expand Down
19 changes: 16 additions & 3 deletions lib/NeuraDialect/Transforms/InsertDataMovPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,20 @@ struct InsertDataMovForNeuraOps : public RewritePattern {

LogicalResult matchAndRewrite(Operation *op,
PatternRewriter &rewriter) const override {
// Only processes operations from the neura dialect. Operations from
// other dialects (arith, math, etc.) should have been lowered to neura
// ops by earlier passes (LowerArithToNeura, etc.) before this pass runs.
if (op->getDialect()->getNamespace() != accel::kNeuraTarget ||
isa<neura::DataMovOp>(op)) {
isa<neura::DataMovOp>(op) ||
// ReserveOp creates a loop-carried placeholder in the dataflow
// recurrence cycle: %v = neura.reserve; neura.ctrl_mov %next -> %v.
// Its result must NOT be wrapped in DataMovOp, because ctrl_mov needs
// a direct reference to the same SSA value used by phi_start.
// Inserting a DataMovOp between reserve and its consumers would break
// the ctrl_mov→reserve back-edge and corrupt the recurrence cycle.
isa<neura::ReserveOp>(op) ||
isa<neura::KernelOp>(op) ||
isa<neura::FusedOp>(op)) {
return failure();
}

Expand Down Expand Up @@ -91,8 +103,9 @@ struct InsertDataMovForNeuraOps : public RewritePattern {
for (Value operand : op->getOperands()) {
Operation *producer = operand.getDefiningOp();

// Skips adding mov for any operand that comes from a reserve op or
// already from data_mov.
// Does NOT wrap operands that come from reserve: the reserve result
// is the recurrence back-edge target for ctrl_mov. Wrapping it would
// produce a new SSA value, breaking the ctrl_mov→reserve cycle.
if (producer && (isa<neura::ReserveOp>(producer) ||
isa<neura::DataMovOp>(producer))) {
new_operands.push_back(operand);
Expand Down
Loading