diff --git a/include/TaskflowDialect/Allocation/allocation_utils.h b/include/TaskflowDialect/Allocation/allocation_utils.h
new file mode 100644
index 00000000..f7f9b7de
--- /dev/null
+++ b/include/TaskflowDialect/Allocation/allocation_utils.h
@@ -0,0 +1,112 @@
+//===- allocation_utils.h - Shared CGRA allocation utilities --------------===//
+//
+// Shared utility types and functions used by AllocateCgraToTaskPass and
+// ResourceAwareTaskOptimizationPass for CGRA grid placement feasibility
+// checks and task-to-CGRA mapping.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TASKFLOW_ALLOCATION_UTILS_H
+#define TASKFLOW_ALLOCATION_UTILS_H
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+
+#include <string>
+#include <utility>
+
+namespace mlir {
+namespace taskflow {
+
+//===----------------------------------------------------------------------===//
+// Grid constants
+//===----------------------------------------------------------------------===//
+
+constexpr int kCgraGridRows = 4;
+constexpr int kCgraGridCols = 4;
+
+//===----------------------------------------------------------------------===//
+// CgraShape
+//===----------------------------------------------------------------------===//
+
+// Represents a CGRA allocation shape on the grid.
+//
+// For rectangular shapes: rows × cols == cgra_count, and `cgra_positions`
+// is empty (all cells in the bounding box are used).
+//
+// For non-rectangular shapes (L, T): `cgra_positions` stores the explicit
+// (col, row) coordinates of the occupied CGRAs. `rows`/`cols` give the
+// bounding box so that tile-level x_tiles/y_tiles can be computed.
+struct CgraShape {
+  int rows;            // Bounding-box CGRA rows.
+  int cols;            // Bounding-box CGRA columns.
+  bool is_rectangular; // True if all cells in the bbox are used.
+  // Explicit CGRA positions for non-rectangular shapes.
+  // Each pair is (col, row) in CGRA coordinates. Empty for rectangles.
+  llvm::SmallVector<std::pair<int, int>> cgra_positions;
+
+  // Returns the bounding-box area (rows * cols). For rectangular shapes this
+  // equals cgra_count; for non-rectangular shapes it is larger than cgra_count
+  // (some cells in the bbox are unoccupied). Used only for shape sorting
+  // (prefer smaller bounding boxes), not for counting occupied CGRAs.
+  int area() const { return rows * cols; }
+
+  // Returns a human-readable description for log messages only (not IR).
+  std::string describe(int cgra_count) const;
+
+  // Returns the shape string written into the IR tile_shape attribute.
+  // For rectangular shapes: "NxM" (e.g. "2x2").
+  // For non-rectangular shapes: "NxM[(c0,r0)(c1,r1)...]" listing only the
+  // occupied CGRA positions so that downstream passes can reconstruct the
+  // exact valid tile set for multi-CGRA mapping.
+  std::string irAttr() const;
+};
+
+//===----------------------------------------------------------------------===//
+// Shape Enumeration Utilities
+//===----------------------------------------------------------------------===//
+
+// Generates all placement-candidate shapes for `cgra_count` CGRAs, including
+// rotations. Rectangular shapes include both orientations (rows×cols and
+// cols×rows, deduplicated for squares). Non-rectangular shapes include all
+// four 90° rotations.
+//
+// Ordering (tried first to last):
+//   1. Rectangular shapes, sorted by squareness (e.g. 2×2 before 1×4),
+//      with smaller bounding-box area as tiebreaker.
+//   2. Non-rectangular shapes (L, T, etc.) in all unique rotations.
+llvm::SmallVector<CgraShape> getAllPlacementShapes(int cgra_count);
+
+//===----------------------------------------------------------------------===//
+// Global Placement Feasibility
+//===----------------------------------------------------------------------===//
+
+// Simulates greedy placement of all tasks' shapes on the kCgraGridRows ×
+// kCgraGridCols grid to verify that they physically fit without overlap.
+//
+// For each task, all valid shapes (including rotations) are tried. Rectangular
+// shapes prefer square-like orientations (e.g. 2×2 over 1×4). Non-rectangular
+// shapes are tried in all four 90° rotations.
+//
+// `task_cgra_counts` contains the cgra_count for every task in the graph
+// (including the speculatively modified one).
+//
+// Returns true if all tasks can be placed without overlap.
+bool canAllTasksFitOnGrid(llvm::ArrayRef<int> task_cgra_counts);
+
+//===----------------------------------------------------------------------===//
+// Direct Pass Invocation
+//===----------------------------------------------------------------------===//
+
+// Runs the CGRA task placement logic directly on a function, producing
+// `task_mapping_info` attributes with global grid placement that respects
+// multi-CGRA shapes.
+//
+// grid_rows/grid_cols default to 4x4 (kCgraGridRows/kCgraGridCols).
+//
+// Defined in lib/TaskflowDialect/Allocation/allocation_utils_mapper.cpp.
+void runAllocateCgraToTask(mlir::func::FuncOp func,
+                           int grid_rows = kCgraGridRows,
+                           int grid_cols = kCgraGridCols);
+
+} // namespace taskflow
+} // namespace mlir
+
+#endif // TASKFLOW_ALLOCATION_UTILS_H
diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
index 92393d7c..041cdbc7 100644
--- a/include/TaskflowDialect/TaskflowPasses.h
+++ b/include/TaskflowDialect/TaskflowPasses.h
@@ -5,6 +5,8 @@
 #include "TaskflowDialect/TaskflowDialect.h"
 #include "TaskflowDialect/TaskflowOps.h"
+#include "TaskflowDialect/Allocation/allocation_utils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassRegistry.h"
 
@@ -21,7 +23,7 @@ void registerTosaToAffineConversionPassPipeline();
 #include "TaskflowDialect/TaskflowPasses.h.inc"
 std::unique_ptr<Pass> createConstructHyperblockFromTaskPass();
 std::unique_ptr<Pass> createClassifyCountersPass();
-std::unique_ptr<Pass> createMapTaskOnCgraPass();
+std::unique_ptr<Pass>
createAllocateCgraToTaskPass();
 std::unique_ptr<Pass> createFuseTaskPass();
 
 //=========================================================//
diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td
index 5cf07cd7..701e8246 100644
--- a/include/TaskflowDialect/TaskflowPasses.td
+++ b/include/TaskflowDialect/TaskflowPasses.td
@@ -63,16 +63,16 @@ def ClassifyCounters : Pass<"classify-counters", "ModuleOp"> {
   let constructor = "taskflow::createClassifyCountersPass()";
 }
 
-def MapTaskOnCgra : Pass<"map-task-on-cgra", "func::FuncOp"> {
+def AllocateCgraToTask : Pass<"allocate-cgra-to-task", "func::FuncOp"> {
   let summary = "Maps Taskflow tasks onto a 2D CGRA grid array";
   let description = [{
     This pass maps Taskflow tasks onto a 2D CGRA grid array.
     Fusion candidates (same-header SSA dependencies) are placed on
     adjacent CGRAs to enable direct data forwarding.
-    Uses a default 3x3 CGRA grid.
+    Uses a default 4x4 CGRA grid.
   }];
-  let constructor = "taskflow::createMapTaskOnCgraPass()";
+  let constructor = "taskflow::createAllocateCgraToTaskPass()";
 }
 
 def FuseTask : Pass<"fuse-task", "func::FuncOp"> {
diff --git a/lib/TaskflowDialect/Allocation/CMakeLists.txt b/lib/TaskflowDialect/Allocation/CMakeLists.txt
new file mode 100644
index 00000000..3f48ebf9
--- /dev/null
+++ b/lib/TaskflowDialect/Allocation/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_mlir_library(MLIRTaskflowAllocation
+  allocation_utils.cpp
+  allocation_utils_mapper.cpp
+
+  DEPENDS
+  MLIRTaskflowTransformsIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRPass
+  MLIRSupport
+  MLIRFuncDialect
+  MLIRTaskflow
+  LLVMSupport
+)
diff --git a/lib/TaskflowDialect/Allocation/allocation_utils.cpp b/lib/TaskflowDialect/Allocation/allocation_utils.cpp
new file mode 100644
index 00000000..b8e842c2
--- /dev/null
+++ b/lib/TaskflowDialect/Allocation/allocation_utils.cpp
@@ -0,0 +1,278 @@
+//===- allocation_utils.cpp - Shared CGRA allocation utilities ------------===//
+//
+// Implements shared utility functions for CGRA grid placement used by
+// AllocateCgraToTaskPass and ResourceAwareTaskOptimizationPass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TaskflowDialect/Allocation/allocation_utils.h"
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+#include <climits>
+#include <cstdlib>
+#include <string>
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
+//===----------------------------------------------------------------------===//
+// CgraShape member implementations
+//===----------------------------------------------------------------------===//
+
+std::string CgraShape::describe(int cgra_count) const {
+  std::string s = std::to_string(rows) + "x" + std::to_string(cols);
+  if (!is_rectangular) {
+    s += "(non-rect, " + std::to_string(cgra_count) + " CGRAs:";
+    for (auto &[c, r] : cgra_positions)
+      s += " (" + std::to_string(c) + "," + std::to_string(r) + ")";
+    s += ")";
+  }
+  return s;
+}
+
+std::string CgraShape::irAttr() const {
+  std::string s = std::to_string(rows) + "x" + std::to_string(cols);
+  if (!is_rectangular && !cgra_positions.empty()) {
+    s += "[";
+    for (auto &[c, r] : cgra_positions)
+      s += "(" + std::to_string(c) + "," + std::to_string(r) + ")";
+    s += "]";
+  }
+  return s;
+}
+
+//===----------------------------------------------------------------------===//
+// Internal helpers
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+// Returns all valid rectangular shapes for `cgra_count` CGRAs. The double
+// loop over (r, c) yields each orientation exactly once (both 1×4 and 4×1;
+// squares like 2×2 once), so no explicit deduplication is required.
+SmallVector<CgraShape> getRectangularShapes(int cgra_count) {
+  SmallVector<CgraShape> shapes;
+  for (int r = 1; r <= kCgraGridRows; ++r) {
+    for (int c = 1; c <= kCgraGridCols; ++c) {
+      if (r * c == cgra_count)
+        shapes.push_back(
+            {r, c, /*is_rectangular=*/true, /*cgra_positions=*/{}});
+    }
+  }
+  return shapes;
+}
+
+// Returns the set of non-rectangular shapes for `cgra_count` CGRAs.
+// Currently defined for cgra_count == 3 (L-shape) and cgra_count == 4
+// (L-shape and T-shape variants).
+SmallVector<CgraShape> getNonRectangularShapes(int cgra_count) {
+  SmallVector<CgraShape> shapes;
+
+  if (cgra_count == 3) {
+    // L-shape 3 CGRAs: (0,0)(1,0)(0,1) — bbox 2×2
+    shapes.push_back({2, 2, false, {{0, 0}, {1, 0}, {0, 1}}});
+  }
+
+  if (cgra_count == 4) {
+    // T-shape: three in a row + one below centre
+    // (0,0)(1,0)(2,0)(1,1) — bbox 2×3
+    shapes.push_back({2, 3, false, {{0, 0}, {1, 0}, {2, 0}, {1, 1}}});
+
+    // L-shape: three in a column + one offset
+    // (0,0)(0,1)(0,2)(1,2) — bbox 3×2
+    shapes.push_back({3, 2, false, {{0, 0}, {0, 1}, {0, 2}, {1, 2}}});
+  }
+
+  return shapes;
+}
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// getAllPlacementShapes
+//===----------------------------------------------------------------------===//
+
+llvm::SmallVector<CgraShape>
+mlir::taskflow::getAllPlacementShapes(int cgra_count) {
+  // 1. Rectangular shapes with both orientations (the enumeration in
+  //    getRectangularShapes already yields each orientation exactly once).
+  SmallVector<CgraShape> shapes = getRectangularShapes(cgra_count);
+
+  // Sorts rectangles: prefer more square-like (smaller |rows-cols|), then
+  // smaller bounding-box area as tiebreaker.
+  llvm::sort(shapes, [](const CgraShape &lhs, const CgraShape &rhs) {
+    int squareness_lhs = std::abs(lhs.rows - lhs.cols);
+    int squareness_rhs = std::abs(rhs.rows - rhs.cols);
+    if (squareness_lhs != squareness_rhs)
+      return squareness_lhs < squareness_rhs;
+    return lhs.area() < rhs.area();
+  });
+
+  // 2. Non-rectangular shapes with all four 90° rotations.
+  auto base_non_rect = getNonRectangularShapes(cgra_count);
+  for (const auto &base : base_non_rect) {
+    // Generates 4 rotations of the cgra_positions list.
+    // Rotation by 90° CW: (col, row) -> (row, -col).
+    // Each rotation is normalised so that offsets start from (0, 0).
+    SmallVector<SmallVector<std::pair<int, int>>, 4> rotation_variants;
+    rotation_variants.push_back(
+        SmallVector<std::pair<int, int>>(base.cgra_positions));
+
+    auto prev_positions = base.cgra_positions;
+    for (int rotation_idx = 0; rotation_idx < 3; ++rotation_idx) {
+      SmallVector<std::pair<int, int>> rotated_positions;
+      for (auto &[col_off, row_off] : prev_positions)
+        rotated_positions.push_back(
+            {row_off, -col_off}); // 90° CW in (col, row) space
+
+      // Normalises to non-negative offsets starting from (0, 0).
+      int min_col = INT_MAX, min_row = INT_MAX;
+      for (auto &[col_off, row_off] : rotated_positions) {
+        min_col = std::min(min_col, col_off);
+        min_row = std::min(min_row, row_off);
+      }
+      for (auto &[col_off, row_off] : rotated_positions) {
+        col_off -= min_col;
+        row_off -= min_row;
+      }
+      rotation_variants.push_back(rotated_positions);
+      prev_positions = rotated_positions;
+    }
+
+    // Deduplicates rotations that produce the same position set.
+    // Hash parameters: multiplier 131 and positional weight 17 are chosen to
+    // give low collision rates for small integer coordinate sets.
+    llvm::DenseSet<int64_t> seen_hashes;
+    for (auto &positions : rotation_variants) {
+      auto sorted_positions = positions;
+      llvm::sort(sorted_positions); // std::pair already orders lexicographically
+      int64_t hash = 0;
+      for (auto &[col_off, row_off] : sorted_positions)
+        hash = hash * 131 + col_off * 17 + row_off;
+      if (!seen_hashes.insert(hash).second) {
+        continue;
+      }
+      // Computes bounding box for this rotation.
+      int max_col = 0, max_row = 0;
+      for (auto &[col_off, row_off] : positions) {
+        max_col = std::max(max_col, col_off);
+        max_row = std::max(max_row, row_off);
+      }
+      shapes.push_back({max_row + 1, max_col + 1, false, std::move(positions)});
+    }
+  }
+
+  return shapes;
+}
+
+//===----------------------------------------------------------------------===//
+// canAllTasksFitOnGrid
+//===----------------------------------------------------------------------===//
+
+bool mlir::taskflow::canAllTasksFitOnGrid(ArrayRef<int> task_cgra_counts) {
+  constexpr int kTotalCGRAs = kCgraGridRows * kCgraGridCols;
+
+  // Quick capacity check: total CGRAs must not exceed grid size.
+  int total_cgras = 0;
+  for (int count : task_cgra_counts)
+    total_cgras += count;
+  if (total_cgras > kTotalCGRAs) {
+    return false;
+  }
+
+  // Simulates placement on a grid.
+  bool occupied[kCgraGridRows][kCgraGridCols] = {};
+
+  // Sorts tasks by descending cgra_count for better packing (largest-first
+  // decreasing, a standard bin-packing heuristic). Each task may have a
+  // different cgra_count because the balance phase only increments one
+  // bottleneck at a time; this array reflects the heterogeneous allocation
+  // across all tasks in the current trial configuration.
+  SmallVector<int> sorted_counts(task_cgra_counts.begin(),
+                                 task_cgra_counts.end());
+  llvm::sort(sorted_counts, [](int lhs, int rhs) { return lhs > rhs; });
+
+  for (int cgra_count : sorted_counts) {
+    SmallVector<CgraShape> candidates = getAllPlacementShapes(cgra_count);
+    bool placed = false;
+
+    for (const auto &shape : candidates) {
+      if (placed)
+        break;
+
+      if (shape.is_rectangular) {
+        // Rectangular: tries every origin where the rows×cols bbox fits.
+        for (int origin_row = 0;
+             origin_row <= kCgraGridRows - shape.rows && !placed;
+             ++origin_row) {
+          for (int origin_col = 0;
+               origin_col <= kCgraGridCols - shape.cols && !placed;
+               ++origin_col) {
+            bool fits = true;
+            for (int delta_row = 0; delta_row < shape.rows && fits; ++delta_row)
+              for (int delta_col = 0; delta_col < shape.cols && fits;
+                   ++delta_col)
+                if (occupied[origin_row + delta_row][origin_col + delta_col])
+                  fits = false;
+            if (fits) {
+              for (int delta_row = 0; delta_row < shape.rows; ++delta_row)
+                for (int delta_col = 0; delta_col < shape.cols; ++delta_col)
+                  occupied[origin_row + delta_row][origin_col + delta_col] =
+                      true;
+              placed = true;
+            }
+          }
+        }
+      } else {
+        // Non-rectangular: cgra_positions stores (col, row) offsets.
+        for (int origin_row = 0; origin_row < kCgraGridRows && !placed;
+             ++origin_row) {
+          for (int origin_col = 0; origin_col < kCgraGridCols && !placed;
+               ++origin_col) {
+            bool fits = true;
+            for (auto &[col_off, row_off] : shape.cgra_positions) {
+              int abs_row = origin_row + row_off;
+              int abs_col = origin_col + col_off;
+              if (abs_row < 0 || abs_row >= kCgraGridRows || abs_col < 0 ||
+                  abs_col >= kCgraGridCols || occupied[abs_row][abs_col]) {
+                fits = false;
+                break;
+              }
+            }
+            if (fits) {
+              for (auto &[col_off, row_off] : shape.cgra_positions)
+                occupied[origin_row + row_off][origin_col + col_off] = true;
+              placed = true;
+            }
+          }
+        }
+      }
+    }
+
+    if (!placed) {
+      return false;
+    }
+  }
+  return true;
+}
diff --git a/lib/TaskflowDialect/Allocation/allocation_utils_mapper.cpp b/lib/TaskflowDialect/Allocation/allocation_utils_mapper.cpp
new file mode 100644
index 00000000..e23d2ae8
--- /dev/null
+++ b/lib/TaskflowDialect/Allocation/allocation_utils_mapper.cpp
@@ -0,0 +1,728 @@
+//===- allocation_utils_mapper.cpp - Task-to-CGRA mapping implementation --===//
+//
+// Implements runAllocateCgraToTask and the internal TaskMapper used by
+// AllocateCgraToTaskPass. Kept under TaskflowDialect/Allocation per code review.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TaskflowDialect/TaskflowDialect.h"
+#include "TaskflowDialect/TaskflowOps.h"
+#include "TaskflowDialect/Allocation/allocation_utils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <set>
+#include <vector>
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// CGRA Grid Position
+//===----------------------------------------------------------------------===//
+/// Represents a position on the 2D CGRA grid.
+struct CgraPosition {
+  int row;
+  int col;
+
+  bool operator==(const CgraPosition &other) const {
+    return row == other.row && col == other.col;
+  }
+
+  bool operator!=(const CgraPosition &other) const { return !(*this == other); }
+
+  int manhattanDistance(const CgraPosition &other) const {
+    return std::abs(row - other.row) + std::abs(col - other.col);
+  }
+
+  /// Returns true if the two positions are directly adjacent (Manhattan
+  /// distance == 1), i.e. share an edge on the grid.
+  bool isAdjacent(const CgraPosition &other) const {
+    return manhattanDistance(other) == 1;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Task Placement Info
+//===----------------------------------------------------------------------===//
+/// Stores the placement result for a task: the set of CGRAs assigned to it.
+/// A task can span one or more contiguous CGRAs (rectangular or non-rect).
+struct TaskPlacement {
+  SmallVector<CgraPosition> cgra_positions; // CGRAs assigned to this task.
+
+  /// Returns the primary (first) CGRA position.
+  CgraPosition primary() const {
+    return cgra_positions.empty() ? CgraPosition{-1, -1} : cgra_positions[0];
+  }
+
+  /// Returns the number of CGRAs assigned to this task.
+  size_t cgraCount() const { return cgra_positions.size(); }
+
+  /// Returns true if any CGRA in this task is grid-adjacent to any CGRA
+  /// in `other`, indicating that direct data forwarding between tasks is
+  /// possible without going through the network.
+  bool hasTaskAdjacentCgra(const TaskPlacement &other) const {
+    for (const auto &pos : cgra_positions) {
+      for (const auto &other_pos : other.cgra_positions) {
+        if (pos.isAdjacent(other_pos)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Task-Memory Graph
+//===----------------------------------------------------------------------===//
+
+struct MemoryNode;
+
+/// Represents a Task node in the dependency graph.
+struct TaskNode {
+  size_t id;
+  TaskflowTaskOp op;
+  int dependency_depth = 0; // Longest path to any sink in the dependency graph.
+
+  // Edges based on original (pre-streaming-fusion) memory accesses.
+  SmallVector<MemoryNode *> read_memrefs;  // MemoryNodes this task reads.
+  SmallVector<MemoryNode *> write_memrefs; // MemoryNodes this task writes.
+  // SSA value edges between tasks.
+  SmallVector<TaskNode *> ssa_users;    // Tasks that consume this task's output.
+  SmallVector<TaskNode *> ssa_operands; // Tasks whose output this task consumes.
+
+  // Placement result — populated by TaskMapper::place().
+  SmallVector<CgraPosition> placement;
+
+  TaskNode(size_t id, TaskflowTaskOp op) : id(id), op(op) {}
+};
+
+/// Represents a MemRef node in the dependency graph.
+struct MemoryNode {
+  Value memref;
+
+  // Access edges.
+  SmallVector<TaskNode *> readers; // Tasks that read this memref.
+  SmallVector<TaskNode *> writers; // Tasks that write this memref.
+
+  // SRAM assignment result — populated by TaskMapper::assignAllSrams().
+  std::optional<CgraPosition> assigned_sram_pos;
+
+  MemoryNode(Value memref) : memref(memref) {}
+};
+
+class TaskMemoryGraph {
+public:
+  SmallVector<std::unique_ptr<TaskNode>> task_nodes;
+  SmallVector<std::unique_ptr<MemoryNode>> memory_nodes;
+  DenseMap<Value, MemoryNode *> memref_to_node;
+  DenseMap<Operation *, TaskNode *> op_to_node;
+
+  void build(func::FuncOp func) {
+    // Phase 1: Create a TaskNode for every TaskflowTaskOp in the function.
+    size_t task_id = 0;
+    func.walk([&](TaskflowTaskOp task) {
+      auto node = std::make_unique<TaskNode>(task_id++, task);
+      op_to_node[task] = node.get();
+      task_nodes.push_back(std::move(node));
+    });
+
+    // Phase 2: Create MemoryNodes using ORIGINAL memrefs (canonical identity).
+    // Uses original_read_memrefs / original_write_memrefs so that aliased
+    // memories (created by streaming-fusion) share the same MemoryNode.
+    for (auto &t_node : task_nodes) {
+      // Uses original_read_memrefs for canonical memory identity.
+      for (Value orig_memref : t_node->op.getOriginalReadMemrefs()) {
+        MemoryNode *m_node = getOrCreateMemoryNode(orig_memref);
+        t_node->read_memrefs.push_back(m_node);
+        m_node->readers.push_back(t_node.get());
+      }
+      // Uses original_write_memrefs for canonical memory identity.
+      for (Value orig_memref : t_node->op.getOriginalWriteMemrefs()) {
+        MemoryNode *m_node = getOrCreateMemoryNode(orig_memref);
+        t_node->write_memrefs.push_back(m_node);
+        m_node->writers.push_back(t_node.get());
+      }
+    }
+
+    // Phase 3: Build SSA edges (inter-task value dependencies).
+    // A consumer task directly uses a value produced by a producer task.
+    for (auto &consumer_node : task_nodes) {
+      // Walks the task's declared value inputs; every operand defined by
+      // another task op becomes an SSA edge.
+      for (Value operand : consumer_node->op.getValueInputs()) {
+        if (Operation *producer_op = operand.getDefiningOp()) {
+          // lookup() (not operator[]) avoids inserting null entries for
+          // producers that are not task ops.
+          if (TaskNode *producer_node = op_to_node.lookup(producer_op)) {
+            producer_node->ssa_users.push_back(consumer_node.get());
+            consumer_node->ssa_operands.push_back(producer_node);
+          }
+        }
+      }
+    }
+  }
+
+private:
+  MemoryNode *getOrCreateMemoryNode(Value memref) {
+    // Single hash lookup; count()+operator[] would probe the map twice.
+    if (MemoryNode *existing = memref_to_node.lookup(memref)) {
+      return existing;
+    }
+
+    auto node = std::make_unique<MemoryNode>(memref);
+    MemoryNode *ptr = node.get();
+    memref_to_node[memref] = ptr;
+    memory_nodes.push_back(std::move(node));
+    return ptr;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+/// Maps a task-memory graph onto a 2D CGRA grid.
+///
+/// Uses a two-phase fixed-point iteration:
+///   Phase 1: Place tasks on the grid (scoring by SSA + memory proximity).
+///   Phase 2: Assign each MemRef to the nearest SRAM given task positions.
+/// Iterates until SRAM assignments converge (critical-path-first ordering).
+class TaskMapper {
+public:
+  TaskMapper(int grid_rows, int grid_cols)
+      : grid_rows_(grid_rows), grid_cols_(grid_cols) {
+    occupied_.resize(grid_rows_);
+    for (auto &row : occupied_) {
+      row.resize(grid_cols_, false);
+    }
+  }
+
+  /// Places all tasks and performs iterative SRAM assignment for `func`.
+  void place(func::FuncOp func) {
+    SmallVector<TaskflowTaskOp> tasks;
+    func.walk([&](TaskflowTaskOp task) { tasks.push_back(task); });
+
+    if (tasks.empty()) {
+      llvm::errs() << "No tasks to place.\n";
+      return;
+    }
+
+    // Builds Task-Memory Graph.
+    TaskMemoryGraph graph;
+    graph.build(func);
+
+    if (graph.task_nodes.empty()) {
+      llvm::errs() << "No tasks to place.\n";
+      return;
+    }
+
+    // Computes dependency depth for each task.
+    // Dependency depth = longest path from this node to any sink node in the
+    // dependency graph (via SSA or memory edges). Tasks with higher depth
+    // have longer dependent chains after them; placing them first gives their
+    // successors the best chance of landing on adjacent grid cells.
+    computeDependencyDepth(graph);
+
+    // Sorts tasks by dependency depth (Critical Path First).
+    SmallVector<TaskNode *> sorted_tasks;
+    for (auto &node : graph.task_nodes)
+      sorted_tasks.push_back(node.get());
+
+    std::stable_sort(sorted_tasks.begin(), sorted_tasks.end(),
+                     [](TaskNode *a, TaskNode *b) {
+                       return a->dependency_depth > b->dependency_depth;
+                     });
+
+    // Fixed-point iteration: task placement scoring depends on SRAM
+    // positions (memory proximity), and SRAM assignment depends on task
+    // positions (centroid of accessing tasks). Each iteration re-places
+    // all tasks using the latest SRAM assignments, then re-assigns SRAMs.
+    // Converges when SRAM assignments stabilise (no change between iters).
+    constexpr int kMaxIterations = 10;
+
+    for (int iter = 0; iter < kMaxIterations; ++iter) {
+      if (iter > 0) {
+        resetTaskPlacements(graph);
+      }
+
+      // Phase 1: Place tasks (scoring uses current SRAM assignments).
+      for (TaskNode *task_node : sorted_tasks) {
+        int cgra_count = 1;
+        if (auto attr =
+                task_node->op->getAttrOfType<IntegerAttr>("cgra_count")) {
+          cgra_count = attr.getInt();
+        }
+
+        TaskPlacement placement =
+            findBestPlacement(task_node, cgra_count, graph);
+
+        assert(!placement.cgra_positions.empty() &&
+               "findBestPlacement must succeed: cgra_count should be "
+               "validated by the upstream resource-aware optimization pass "
+               "or manually assigned resource binding attributes");
+
+        // Commits placement and marks occupied grid cells.
+        for (const auto &pos : placement.cgra_positions) {
+          task_node->placement.push_back(pos);
+          if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 &&
+              pos.col < grid_cols_) {
+            occupied_[pos.row][pos.col] = true;
+          }
+        }
+      }
+
+      // Phase 2: Assign SRAMs (assuming fixed task positions).
+      // If nothing moved, task scores won't change → convergence reached.
+      bool sram_moved = assignAllSrams(graph);
+
+      if (iter > 0 && !sram_moved) {
+        break;
+      }
+    }
+
+    // Annotates result: writes task_mapping_info attribute to each task op.
+    OpBuilder builder(func.getContext());
+    for (auto &task_node : graph.task_nodes) {
+      if (task_node->placement.empty()) {
+        continue;
+      }
+
+      SmallVector<NamedAttribute> mapping_attrs;
+
+      // 1. CGRA positions.
+      SmallVector<Attribute> pos_attrs;
+      for (const auto &pos : task_node->placement) {
+        SmallVector<NamedAttribute> coord_attrs;
+        coord_attrs.push_back(
+            NamedAttribute(StringAttr::get(func.getContext(), "row"),
+                           builder.getI32IntegerAttr(pos.row)));
+        coord_attrs.push_back(
+            NamedAttribute(StringAttr::get(func.getContext(), "col"),
+                           builder.getI32IntegerAttr(pos.col)));
+        pos_attrs.push_back(
+            DictionaryAttr::get(func.getContext(), coord_attrs));
+      }
+      mapping_attrs.push_back(
+          NamedAttribute(StringAttr::get(func.getContext(), "cgra_positions"),
+                         builder.getArrayAttr(pos_attrs)));
+
+      // 2. Read SRAM locations.
+      SmallVector<Attribute> read_sram_attrs;
+      for (MemoryNode *mem : task_node->read_memrefs) {
+        if (mem->assigned_sram_pos) {
+          SmallVector<NamedAttribute> sram_coord;
+          sram_coord.push_back(NamedAttribute(
+              StringAttr::get(func.getContext(), "row"),
+              builder.getI32IntegerAttr(mem->assigned_sram_pos->row)));
+          sram_coord.push_back(NamedAttribute(
+              StringAttr::get(func.getContext(), "col"),
+              builder.getI32IntegerAttr(mem->assigned_sram_pos->col)));
+          read_sram_attrs.push_back(
+              DictionaryAttr::get(func.getContext(), sram_coord));
+        }
+      }
+      mapping_attrs.push_back(NamedAttribute(
+          StringAttr::get(func.getContext(), "read_sram_locations"),
+          builder.getArrayAttr(read_sram_attrs)));
+
+      // 3. Write SRAM locations.
+      SmallVector<Attribute> write_sram_attrs;
+      for (MemoryNode *mem : task_node->write_memrefs) {
+        if (mem->assigned_sram_pos) {
+          SmallVector<NamedAttribute> sram_coord;
+          sram_coord.push_back(NamedAttribute(
+              StringAttr::get(func.getContext(), "row"),
+              builder.getI32IntegerAttr(mem->assigned_sram_pos->row)));
+          sram_coord.push_back(NamedAttribute(
+              StringAttr::get(func.getContext(), "col"),
+              builder.getI32IntegerAttr(mem->assigned_sram_pos->col)));
+
+          write_sram_attrs.push_back(
+              DictionaryAttr::get(func.getContext(), sram_coord));
+        }
+      }
+      mapping_attrs.push_back(NamedAttribute(
+          StringAttr::get(func.getContext(), "write_sram_locations"),
+          builder.getArrayAttr(write_sram_attrs)));
+
+      // Sets task_mapping_info attribute on the task op.
+      task_node->op->setAttr(
+          "task_mapping_info",
+          DictionaryAttr::get(func.getContext(), mapping_attrs));
+    }
+  }
+
+private:
+  /// Clears all task placements and resets the occupied-cell grid.
+  void resetTaskPlacements(TaskMemoryGraph &graph) {
+    for (auto &task : graph.task_nodes) {
+      task->placement.clear();
+    }
+    // Clears grid.
+    for (int r = 0; r < grid_rows_; ++r) {
+      std::fill(occupied_[r].begin(), occupied_[r].end(), false);
+    }
+  }
+
+  /// Assigns each MemoryNode to the SRAM at the centroid of all CGRAs that
+  /// access it (readers + writers). Returns true if any assignment changed,
+  /// which is used as the convergence criterion for the outer iteration loop.
+  bool assignAllSrams(TaskMemoryGraph &graph) {
+    bool changed = false;
+    for (auto &mem_node : graph.memory_nodes) {
+      int total_row = 0, total_col = 0, count = 0;
+      // Computes centroid of all tasks that read this memory.
+      for (TaskNode *reader : mem_node->readers) {
+        for (const CgraPosition &pos : reader->placement) {
+          total_row += pos.row;
+          total_col += pos.col;
+          count++;
+        }
+      }
+      // Computes centroid of all tasks that write this memory.
+      for (TaskNode *writer : mem_node->writers) {
+        for (const CgraPosition &pos : writer->placement) {
+          total_row += pos.row;
+          total_col += pos.col;
+          count++;
+        }
+      }
+
+      std::optional<CgraPosition> new_sram_pos;
+      if (count > 0) {
+        // Rounds to the nearest integer (round-half-up).
+        int avg_row = (total_row + count / 2) / count;
+        int avg_col = (total_col + count / 2) / count;
+        new_sram_pos = CgraPosition{avg_row, avg_col};
+      }
+
+      if (mem_node->assigned_sram_pos != new_sram_pos) {
+        mem_node->assigned_sram_pos = new_sram_pos;
+        changed = true;
+      }
+    }
+    return changed;
+  }
+
+  // Parses a tile_shape string like "2x2" or "2x2[(0,0)(1,0)(0,1)]".
+  // Returns (col, row) offsets relative to the placement origin.
+  // Reserved for IR-driven tile shapes; placement currently uses implicit
+  // rectangular enumeration in findBestPlacement.
+  SmallVector<std::pair<int, int>> parseTileShapeOffsets(StringRef tile_shape,
+                                                         int cgra_count) {
+    SmallVector<std::pair<int, int>> offsets;
+
+    if (tile_shape.empty() || cgra_count <= 1) {
+      offsets.push_back({0, 0});
+      return offsets;
+    }
+
+    size_t bracket_pos = tile_shape.find('[');
+    if (bracket_pos != StringRef::npos) {
+      StringRef positions_str = tile_shape.substr(bracket_pos);
+      size_t pos = 0;
+      while (pos < positions_str.size()) {
+        size_t open = positions_str.find('(', pos);
+        if (open == StringRef::npos)
+          break;
+        size_t close = positions_str.find(')', open);
+        if (close == StringRef::npos)
+          break;
+        StringRef pair_str = positions_str.slice(open + 1, close);
+        auto [col_str, row_str] = pair_str.split(',');
+        int col_off = 0, row_off = 0;
+        col_str.getAsInteger(10, col_off);
+        row_str.getAsInteger(10, row_off);
+        offsets.push_back({col_off, row_off});
+        pos = close + 1;
+      }
+    } else {
+      auto [rows_str, cols_str] = tile_shape.split('x');
+      int rows = 1, cols = 1;
+      rows_str.getAsInteger(10, rows);
+      cols_str.getAsInteger(10, cols);
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          offsets.push_back({c, r});
+        }
+      }
+    }
+
+    assert(!offsets.empty() && "tile_shape parsing yielded empty offsets");
+    return offsets;
+  }
+
+  // Finds the best placement for `task_node` requiring exactly `cgra_count`
+  // CGRAs. Strategy:
+  //   1. Rectangular: tries all (rows × cols) factorizations of cgra_count,
+  //      preferring square-like shapes (lower |rows-cols|). For each shape,
+  //      sweeps every origin on the grid and picks the highest-scoring free
+  //      position.
+  //   2. Non-rectangular fallback: if no rectangle fits (fragmented grid),
+  //      runs a polyomino DFS (tryNonRectShapes) to find any connected
+  //      k-CGRA cluster.
+  // Returns an empty TaskPlacement only if the grid is completely full
+  // (should not happen if cgra_count was validated upstream).
+  TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count,
+                                  TaskMemoryGraph &graph) {
+    // Phase 1: try all rectangular factorizations of cgra_count, most
+    // square-like first (smallest |rows - cols|) so the documented
+    // square-preference actually holds — iterating rows = 1..n directly
+    // would try the most elongated 1×n strip first.
+    SmallVector<std::pair<int, int>> factorizations;
+    for (int rows = 1; rows <= cgra_count; ++rows) {
+      if (cgra_count % rows == 0) {
+        factorizations.push_back({rows, cgra_count / rows});
+      }
+    }
+    std::stable_sort(factorizations.begin(), factorizations.end(),
+                     [](const std::pair<int, int> &lhs,
+                        const std::pair<int, int> &rhs) {
+                       return std::abs(lhs.first - lhs.second) <
+                              std::abs(rhs.first - rhs.second);
+                     });
+
+    for (auto &[rows, cols] : factorizations) {
+      // Builds the list of (col_offset, row_offset) cells relative to the
+      // top-left origin for this rows×cols rectangle.
+      SmallVector<std::pair<int, int>> shape_offsets;
+      for (int shape_row = 0; shape_row < rows; ++shape_row) {
+        for (int shape_col = 0; shape_col < cols; ++shape_col) {
+          shape_offsets.push_back({shape_col, shape_row});
+        }
+      }
+
+      // Sweeps every valid origin on the grid and keeps the highest-scoring
+      // non-overlapping placement for this shape.
+      int best_score = INT_MIN;
+      TaskPlacement best_placement;
+      for (int origin_row = 0; origin_row < grid_rows_; ++origin_row) {
+        for (int origin_col = 0; origin_col < grid_cols_; ++origin_col) {
+          // Checks that every cell of the rectangle is within bounds and free.
+          bool valid = true;
+          TaskPlacement candidate;
+          for (auto &[col_off, row_off] : shape_offsets) {
+            int abs_row = origin_row + row_off;
+            int abs_col = origin_col + col_off;
+            if (abs_row < 0 || abs_row >= grid_rows_ || abs_col < 0 ||
+                abs_col >= grid_cols_ || occupied_[abs_row][abs_col]) {
+              valid = false;
+              break;
+            }
+            candidate.cgra_positions.push_back({abs_row, abs_col});
+          }
+          if (!valid) {
+            continue;
+          }
+          // Scores the candidate by proximity to dependent tasks and SRAMs.
+          int score = computeScore(task_node, candidate, graph);
+          if (score > best_score) {
+            best_score = score;
+            best_placement = candidate;
+          }
+        }
+      }
+      // Returns the best placement found for this shape, if any.
+      if (!best_placement.cgra_positions.empty()) {
+        return best_placement;
+      }
+    }
+
+    // Phase 2: no rectangle fit — try non-rectangular connected shapes via DFS.
+    if (cgra_count > 1) {
+      TaskPlacement p = tryNonRectShapes(task_node, cgra_count, graph);
+      if (!p.cgra_positions.empty()) {
+        return p;
+      }
+    }
+
+    return {};
+  }
+
+  TaskPlacement tryNonRectShapes(TaskNode *task_node, int k,
+                                 TaskMemoryGraph &graph) {
+    std::set<uint64_t> visited_masks;
+    int best_score = INT_MIN;
+    TaskPlacement best_placement;
+
+    std::function<void(SmallVector<CgraPosition> &, uint64_t)> search =
+        [&](SmallVector<CgraPosition> &current, uint64_t mask) {
+          if ((int)current.size() == k) {
+            if (visited_masks.insert(mask).second) {
+              TaskPlacement candidate;
+              candidate.cgra_positions = current;
+              int score = computeScore(task_node, candidate, graph);
+              if (score > best_score) {
+                best_score = score;
+                best_placement = candidate;
+              }
+            }
+            return;
+          }
+          // Explores all 4-connected neighbours of every cell already in the
+          // current polyomino. delta_row/delta_col encode the four cardinal
+          // directions: up, down, left, right.
+ constexpr int delta_row[] = {-1, 1, 0, 0}; + constexpr int delta_col[] = {0, 0, -1, 1}; + for (size_t i = 0; i < current.size(); ++i) { + const CgraPosition &cell = current[i]; + for (int dir = 0; dir < 4; ++dir) { + int next_row = cell.row + delta_row[dir]; + int next_col = cell.col + delta_col[dir]; + if (next_row >= 0 && next_row < grid_rows_ && next_col >= 0 && + next_col < grid_cols_ && !occupied_[next_row][next_col]) { + uint64_t bit = 1ULL << (next_row * grid_cols_ + next_col); + if ((mask & bit) == 0) { + current.push_back({next_row, next_col}); + search(current, mask | bit); + current.pop_back(); + } + } + } + } + }; + + // Seeds the DFS from every free cell on the grid. + for (int seed_row = 0; seed_row < grid_rows_; ++seed_row) { + for (int seed_col = 0; seed_col < grid_cols_; ++seed_col) { + if (!occupied_[seed_row][seed_col]) { + SmallVector start = {{seed_row, seed_col}}; + search(start, 1ULL << (seed_row * grid_cols_ + seed_col)); + } + } + } + return best_placement; + } + + /// Computes the placement score for `task_node` at `placement`. + /// + /// Score = α·SSA_Dist + β·Mem_Dist. + /// SSA_Dist : sum of distances to already-placed SSA predecessors and + /// successors (negative; penalises far-away neighbours). + /// Mem_Dist : sum of distances to assigned SRAMs for read/write memrefs + /// (negative; memory proximity is weighted more heavily). + /// + /// Higher score is better; 0 means all neighbours are co-located. + int computeScore(TaskNode *task_node, const TaskPlacement &placement, + TaskMemoryGraph &graph) { + // Weight constants (tunable). + constexpr int kAlpha = 10; // SSA proximity weight. + constexpr int kBeta = 50; // Memory proximity weight (high priority). 
+ + int ssa_score = 0; + int mem_score = 0; + + auto minDistToPlacement = + [&](const SmallVector &other) -> int { + int min_dist = INT_MAX; + for (const auto &pos : placement.cgra_positions) { + for (const auto &opos : other) { + min_dist = std::min(min_dist, pos.manhattanDistance(opos)); + } + } + return min_dist; + }; + + auto minDistToTarget = [&](const CgraPosition &target) -> int { + int min_dist = INT_MAX; + for (const auto &pos : placement.cgra_positions) { + min_dist = std::min(min_dist, pos.manhattanDistance(target)); + } + return min_dist; + }; + + // 1. SSA proximity — penalise distance to producers and consumers. + for (TaskNode *producer : task_node->ssa_operands) { + if (!producer->placement.empty()) { + // Uses negative distance: closer = higher score. + int dist = minDistToPlacement(producer->placement); + ssa_score -= dist; + } + } + for (TaskNode *consumer : task_node->ssa_users) { + if (!consumer->placement.empty()) { + int dist = minDistToPlacement(consumer->placement); + ssa_score -= dist; + } + } + + // 2. Memory proximity — penalise distance to assigned SRAMs. + // For read memrefs (data sources). + for (MemoryNode *mem : task_node->read_memrefs) { + if (mem->assigned_sram_pos) { + int dist = minDistToTarget(*mem->assigned_sram_pos); + mem_score -= dist; + } + } + // For write memrefs: if the SRAM is already assigned (e.g. read by a + // previous task), we want to be close to it too. + for (MemoryNode *mem : task_node->write_memrefs) { + if (mem->assigned_sram_pos) { + int dist = minDistToTarget(*mem->assigned_sram_pos); + mem_score -= dist; + } + } + + return kAlpha * ssa_score + kBeta * mem_score; + } + + /// Computes dependency depth for every task in the graph. + /// + /// Dependency depth = longest path from a node to any sink in the dependency + /// graph (traversing both SSA and write→read memory edges). + /// + /// Tasks with higher dependency depth have longer chains of dependent tasks + /// downstream. 
Placing them first (critical-path-first) ensures that: + /// 1. They receive priority access to good grid positions. + /// 2. Their dependent tasks can later be placed adjacent, minimising + /// inter-task communication distance. + void computeDependencyDepth(TaskMemoryGraph &graph) { + DenseMap depth_cache; + for (auto &node : graph.task_nodes) { + node->dependency_depth = calculateDepth(node.get(), depth_cache); + } + } + + /// Recursively calculates dependency depth for a single task (memoised). + int calculateDepth(TaskNode *node, DenseMap &depth_cache) { + if (depth_cache.count(node)) { + return depth_cache[node]; + } + + int max_child_depth = 0; + // SSA dependencies: tasks that consume this task's output values. + for (TaskNode *child : node->ssa_users) { + max_child_depth = + std::max(max_child_depth, calculateDepth(child, depth_cache) + 1); + } + + // Memory dependencies: Producer → Mem → Consumer write-after-read chains. + for (MemoryNode *mem : node->write_memrefs) { + for (TaskNode *reader : mem->readers) { + if (reader != node) { + max_child_depth = std::max(max_child_depth, + calculateDepth(reader, depth_cache) + 1); + } + } + } + + return depth_cache[node] = max_child_depth; + } + + int grid_rows_; + int grid_cols_; + std::vector> occupied_; +}; + +} // namespace + +namespace mlir { +namespace taskflow { + +void runAllocateCgraToTask(func::FuncOp func, int grid_rows, int grid_cols) { + TaskMapper mapper(grid_rows, grid_cols); + mapper.place(func); +} + +} // namespace taskflow +} // namespace mlir diff --git a/lib/TaskflowDialect/CMakeLists.txt b/lib/TaskflowDialect/CMakeLists.txt index 49d60c57..cdb02d55 100644 --- a/lib/TaskflowDialect/CMakeLists.txt +++ b/lib/TaskflowDialect/CMakeLists.txt @@ -13,5 +13,5 @@ add_mlir_dialect_library(MLIRTaskflow MLIRInferTypeOpInterface ) -add_subdirectory(Transforms) -add_subdirectory(Transforms/Optimizations) \ No newline at end of file +add_subdirectory(Allocation) +add_subdirectory(Transforms) \ No newline at 
end of file diff --git a/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp b/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp new file mode 100644 index 00000000..cf298fce --- /dev/null +++ b/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp @@ -0,0 +1,52 @@ +//===- AllocateCgraToTaskPass.cpp - Task to CGRA Mapping Pass -===// +// +// This pass maps Taskflow tasks onto a 2D CGRA grid array: +// 1. Places tasks with SSA dependencies on adjacent CGRAs. +// 2. Assigns memrefs to SRAMs (each MemRef is assigned to exactly one SRAM, +// determined by proximity to the task that first accesses it). +// +// Implementation lives in lib/TaskflowDialect/Allocation/allocation_utils_mapper.cpp +// (runAllocateCgraToTask). +// +//===----------------------------------------------------------------------===// + +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowPasses.h" +#include "TaskflowDialect/Allocation/allocation_utils.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { + +struct AllocateCgraToTaskPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AllocateCgraToTaskPass) + + AllocateCgraToTaskPass() = default; + + StringRef getArgument() const override { return "allocate-cgra-to-task"; } + + StringRef getDescription() const override { + return "Maps Taskflow tasks onto a 2D CGRA grid with adjacency " + "optimization and memory mapping."; + } + + void runOnOperation() override { + runAllocateCgraToTask(getOperation(), kCgraGridRows, kCgraGridCols); + } +}; + +} // namespace + +namespace mlir { +namespace taskflow { + +std::unique_ptr createAllocateCgraToTaskPass() { + return std::make_unique(); +} + +} // namespace taskflow +} // namespace mlir diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index 60078298..ae722f21 100644 --- 
a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -3,7 +3,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp ClassifyCountersPass.cpp - MapTaskOnCgraPass.cpp + AllocateCgraToTaskPass.cpp FuseTaskPass.cpp DEPENDS @@ -15,9 +15,12 @@ add_mlir_library(MLIRTaskflowTransforms MLIRSupport MLIRTransforms MLIRTaskflow + MLIRTaskflowAllocation MLIRNeura MLIRNeuraTransforms MLIRConversion ${dialect_libs} LLVMSupport -) \ No newline at end of file +) + +add_subdirectory(Optimizations) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp deleted file mode 100644 index d8225ece..00000000 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ /dev/null @@ -1,593 +0,0 @@ -//===- MapTaskOnCgraPass.cpp - Task to CGRA Mapping Pass ----------------===// -// -// This pass maps Taskflow tasks onto a 2D CGRA grid array: -// 1. Places tasks with SSA dependencies on adjacent CGRAs. -// 2. Assigns memrefs to SRAMs (each MemRef is assigned to exactly one SRAM, -// determined by proximity to the task that first accesses it). 
-// -//===----------------------------------------------------------------------===// - -#include "TaskflowDialect/TaskflowDialect.h" -#include "TaskflowDialect/TaskflowOps.h" -#include "TaskflowDialect/TaskflowPasses.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/IR/Builders.h" -#include "mlir/Pass/Pass.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Support/raw_ostream.h" - -#include -#include -#include -#include -#include -#include - -using namespace mlir; -using namespace mlir::taskflow; - -namespace { - -//===----------------------------------------------------------------------===// -// CGRA Grid Position -//===----------------------------------------------------------------------===// -/// Represents a position on the 2D CGRA grid. -struct CGRAPosition { - int row; - int col; - - bool operator==(const CGRAPosition &other) const { - return row == other.row && col == other.col; - } - - bool operator!=(const CGRAPosition &other) const { return !(*this == other); } - - /// Computes Manhattan distance to another position. - int manhattanDistance(const CGRAPosition &other) const { - return std::abs(row - other.row) + std::abs(col - other.col); - } - - /// Checks if adjacent (Manhattan distance = 1). - bool isAdjacent(const CGRAPosition &other) const { - return manhattanDistance(other) == 1; - } -}; - -//===----------------------------------------------------------------------===// -// Task Placement Info -//===----------------------------------------------------------------------===// -/// Stores placement info for a task: can span multiple combined CGRAs. -struct TaskPlacement { - SmallVector cgra_positions; // CGRAs assigned to this task. - - /// Returns the primary (first) position. - CGRAPosition primary() const { - return cgra_positions.empty() ? 
CGRAPosition{-1, -1} : cgra_positions[0]; - } - - /// Returns the number of CGRAs assigned. - size_t cgraCount() const { return cgra_positions.size(); } - - /// Checks if any CGRA in this task is adjacent to any in other task. - bool hasAdjacentCGRA(const TaskPlacement &other) const { - for (const auto &pos : cgra_positions) { - for (const auto &other_pos : other.cgra_positions) { - if (pos.isAdjacent(other_pos)) { - return true; - } - } - } - return false; - } -}; - -//===----------------------------------------------------------------------===// -// Task-Memory Graph -//===----------------------------------------------------------------------===// - -struct MemoryNode; - -/// Represents a Task node in the graph. -struct TaskNode { - size_t id; - TaskflowTaskOp op; - int dependency_depth = 0; // Longest path to any sink in the dependency graph. - - // Edges based on original memory access. - SmallVector read_memrefs; // Original read memrefs. - SmallVector write_memrefs; // Original write memrefs. - SmallVector ssa_users; - SmallVector ssa_operands; - - // Placement result - SmallVector placement; - - TaskNode(size_t id, TaskflowTaskOp op) : id(id), op(op) {} -}; - -/// Represents a Memory node (MemRef) in the graph. -struct MemoryNode { - Value memref; - - // Edges. - SmallVector readers; - SmallVector writers; - - // Mapping result. - std::optional assigned_sram_pos; - - MemoryNode(Value memref) : memref(memref) {} -}; - -/// The Task-Memory Dependency Graph. -class TaskMemoryGraph { -public: - SmallVector> task_nodes; - SmallVector> memory_nodes; - DenseMap memref_to_node; - DenseMap op_to_node; - - void build(func::FuncOp func) { - // 1. Creates TaskNodes. - size_t task_id = 0; - func.walk([&](TaskflowTaskOp task) { - auto node = std::make_unique(task_id++, task); - op_to_node[task] = node.get(); - task_nodes.push_back(std::move(node)); - }); - - // 2. Creates MemoryNodes using ORIGINAL memrefs (canonical identity). 
- // Uses original_read_memrefs/original_write_memrefs to ensure aliased - // memories share the same MemoryNode. - for (auto &t_node : task_nodes) { - // Uses original_read_memrefs for canonical memory identity. - for (Value orig_memref : t_node->op.getOriginalReadMemrefs()) { - MemoryNode *m_node = getOrCreateMemoryNode(orig_memref); - t_node->read_memrefs.push_back(m_node); - m_node->readers.push_back(t_node.get()); - } - - // Uses original_write_memrefs for canonical memory identity. - for (Value orig_memref : t_node->op.getOriginalWriteMemrefs()) { - MemoryNode *m_node = getOrCreateMemoryNode(orig_memref); - t_node->write_memrefs.push_back(m_node); - m_node->writers.push_back(t_node.get()); - } - } - - // 3. Builds SSA Edges (Inter-Task Value Dependencies). - // Identifies if a task uses a value produced by another task. - for (auto &consumer_node : task_nodes) { - // Iterates all operands for now to be safe. - for (Value operand : consumer_node->op.getValueInputs()) { - if (auto producer_op = operand.getDefiningOp()) { - if (auto *producer_node = op_to_node[producer_op]) { - producer_node->ssa_users.push_back(consumer_node.get()); - consumer_node->ssa_operands.push_back(producer_node); - } - } - } - } - } - -private: - MemoryNode *getOrCreateMemoryNode(Value memref) { - if (memref_to_node.count(memref)) { - return memref_to_node[memref]; - } - - auto node = std::make_unique(memref); - MemoryNode *ptr = node.get(); - memref_to_node[memref] = ptr; - memory_nodes.push_back(std::move(node)); - return ptr; - } -}; - -//===----------------------------------------------------------------------===// -// Task Mapper -//===----------------------------------------------------------------------===// -/// Maps a task-memory graph onto a 2D CGRA grid. 
- -class TaskMapper { -public: - TaskMapper(int grid_rows, int grid_cols) - : grid_rows_(grid_rows), grid_cols_(grid_cols) { - occupied_.resize(grid_rows_); - for (auto &row : occupied_) { - row.resize(grid_cols_, false); - } - } - - /// Places all tasks and performs memory mapping. - void place(func::FuncOp func) { - SmallVector tasks; - func.walk([&](TaskflowTaskOp task) { tasks.push_back(task); }); - - if (tasks.empty()) { - llvm::errs() << "No tasks to place.\n"; - return; - } - - // Builds Task-Memory Graph. - TaskMemoryGraph graph; - graph.build(func); - - if (graph.task_nodes.empty()) { - llvm::errs() << "No tasks to place.\n"; - return; - } - - // Computes Dependency Depth for each task. - // Dependency depth = longest path from this node to any sink in the - // dependency graph (considering both SSA and memory edges). Tasks with - // higher depth are more "critical" and are placed first to ensure their - // dependent chains have good locality. - computeDependencyDepth(graph); - - // Sorts tasks by dependency depth (Critical Path First). - SmallVector sorted_tasks; - for (auto &node : graph.task_nodes) - sorted_tasks.push_back(node.get()); - - std::stable_sort(sorted_tasks.begin(), sorted_tasks.end(), - [](TaskNode *a, TaskNode *b) { - return a->dependency_depth > b->dependency_depth; - }); - - // Critical-path-first placement: - // 1. Computes dependency depth for each task (longest path to sink). - // 2. Sorts tasks by dependency depth (higher = more critical). - // 3. Places tasks in sorted order with heuristic scoring. - // Iterative Refinement Loop (Coordinate Descent). - // Alternates between Task Placement (Phase 1) and SRAM Assignment (Phase - // 2). - constexpr int kMaxIterations = 10; - - for (int iter = 0; iter < kMaxIterations; ++iter) { - // Phase 1: Place Tasks (assuming fixed SRAMs). 
- if (iter > 0) { - resetTaskPlacements(graph); - } - - for (TaskNode *task_node : sorted_tasks) { - int cgra_count = 1; - if (auto attr = - task_node->op->getAttrOfType("cgra_count")) { - cgra_count = attr.getInt(); - } - - // Finds best placement using SRAM positions from previous iter (or - // -1/default). - TaskPlacement placement = - findBestPlacement(task_node, cgra_count, graph); - - // Commits Placement. - task_node->placement.push_back(placement.primary()); - // Handles mapping one task on multi-CGRAs. - // TODO: Introduce explicit multi-CGRA binding logic. - for (size_t i = 1; i < placement.cgra_positions.size(); ++i) { - task_node->placement.push_back(placement.cgra_positions[i]); - } - - // Marks occupied. - for (const auto &pos : placement.cgra_positions) { - if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && - pos.col < grid_cols_) { - occupied_[pos.row][pos.col] = true; - } - } - } - - // Phase 2: Assign SRAMs (assuming fixed tasks). - bool sram_moved = assignAllSRAMs(graph); - - // Convergence Check. - // If SRAMs didn't move, it means task placement based on them likely - // won't change either. - if (iter > 0 && !sram_moved) { - break; - } - } - - // Annotates result. - OpBuilder builder(func.getContext()); - for (auto &task_node : graph.task_nodes) { - if (task_node->placement.empty()) { - continue; - } - - SmallVector mapping_attrs; - - // 1. CGRA positions. - SmallVector pos_attrs; - for (const auto &pos : task_node->placement) { - SmallVector coord_attrs; - coord_attrs.push_back( - NamedAttribute(StringAttr::get(func.getContext(), "row"), - builder.getI32IntegerAttr(pos.row))); - coord_attrs.push_back( - NamedAttribute(StringAttr::get(func.getContext(), "col"), - builder.getI32IntegerAttr(pos.col))); - pos_attrs.push_back( - DictionaryAttr::get(func.getContext(), coord_attrs)); - } - mapping_attrs.push_back( - NamedAttribute(StringAttr::get(func.getContext(), "cgra_positions"), - builder.getArrayAttr(pos_attrs))); - - // 2. 
Reads SRAM Locations. - SmallVector read_sram_attrs; - for (MemoryNode *mem : task_node->read_memrefs) { - if (mem->assigned_sram_pos) { - SmallVector sram_coord; - sram_coord.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "row"), - builder.getI32IntegerAttr(mem->assigned_sram_pos->row))); - sram_coord.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "col"), - builder.getI32IntegerAttr(mem->assigned_sram_pos->col))); - read_sram_attrs.push_back( - DictionaryAttr::get(func.getContext(), sram_coord)); - } - } - mapping_attrs.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "read_sram_locations"), - builder.getArrayAttr(read_sram_attrs))); - - // 3. Writes SRAM Locations. - SmallVector write_sram_attrs; - for (MemoryNode *mem : task_node->write_memrefs) { - if (mem->assigned_sram_pos) { - SmallVector sram_coord; - sram_coord.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "row"), - builder.getI32IntegerAttr(mem->assigned_sram_pos->row))); - sram_coord.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "col"), - builder.getI32IntegerAttr(mem->assigned_sram_pos->col))); - - write_sram_attrs.push_back( - DictionaryAttr::get(func.getContext(), sram_coord)); - } - } - mapping_attrs.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "write_sram_locations"), - builder.getArrayAttr(write_sram_attrs))); - - // Sets Attribute. - task_node->op->setAttr( - "task_mapping_info", - DictionaryAttr::get(func.getContext(), mapping_attrs)); - } - } - -private: - /// Clears task placement and occupied grid. - void resetTaskPlacements(TaskMemoryGraph &graph) { - for (auto &task : graph.task_nodes) { - task->placement.clear(); - } - // Clears grid. - for (int r = 0; r < grid_rows_; ++r) { - std::fill(occupied_[r].begin(), occupied_[r].end(), false); - } - } - - /// Assigns all memory nodes to SRAMs based on centroid of accessing tasks. - /// Returns true if any SRAM assignment changed. 
- bool assignAllSRAMs(TaskMemoryGraph &graph) { - bool changed = false; - for (auto &mem_node : graph.memory_nodes) { - // Computes centroid of all tasks that access this memory. - int total_row = 0, total_col = 0, count = 0; - for (TaskNode *reader : mem_node->readers) { - if (!reader->placement.empty()) { - total_row += reader->placement[0].row; - total_col += reader->placement[0].col; - count++; - } - } - for (TaskNode *writer : mem_node->writers) { - if (!writer->placement.empty()) { - total_row += writer->placement[0].row; - total_col += writer->placement[0].col; - count++; - } - } - - std::optional new_sram_pos; - if (count > 0) { - // Rounds to the nearest integer. - int avg_row = (total_row + count / 2) / count; - int avg_col = (total_col + count / 2) / count; - new_sram_pos = CGRAPosition{avg_row, avg_col}; - } - - if (mem_node->assigned_sram_pos != new_sram_pos) { - mem_node->assigned_sram_pos = new_sram_pos; - changed = true; - } - } - return changed; - } - - /// Finds best placement for a task. - /// TODO: Currently defaults to single-CGRA placement. Multi-CGRA binding - /// logic (cgra_count > 1) is experimental/placeholder and should ideally be - /// handled by an upstream resource binding pass. - TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count, - TaskMemoryGraph &graph) { - int best_score = INT_MIN; - TaskPlacement best_placement; - - // Baseline: For cgra_count=1, finds single best position. - for (int r = 0; r < grid_rows_; ++r) { - for (int c = 0; c < grid_cols_; ++c) { - if (occupied_[r][c]) { - continue; - } - - TaskPlacement candidate; - candidate.cgra_positions.push_back({r, c}); - - int score = computeScore(task_node, candidate, graph); - if (score > best_score) { - best_score = score; - best_placement = candidate; - } - } - } - - // Error handling: No available position found (grid over-subscribed). 
- if (best_placement.cgra_positions.empty()) { - assert(false && - "No available CGRA position found (grid over-subscribed)."); - } - - return best_placement; - } - - /// Computes placement score based on Task-Memory Graph. - /// TODO: Introduce explicit 'direct_wires' attributes in the IR for - /// downstream hardware generators to configure fast bypass paths between - /// adjacent PEs with dependencies. - /// - /// Score = α·SSA_Dist + β·Mem_Dist. - /// - /// SSA_Dist: Minimize distance to placed SSA predecessors (ssa_operands). - /// Mem_Dist: Minimize distance to assigned SRAMs for read/write memrefs. - int computeScore(TaskNode *task_node, const TaskPlacement &placement, - TaskMemoryGraph &graph) { - // Weight constants (tunable). - constexpr int kAlpha = 10; // SSA proximity weight. - constexpr int kBeta = 50; // Memory proximity weight (high priority). - - int ssa_score = 0; - int mem_score = 0; - - CGRAPosition current_pos = placement.primary(); - - // 1. SSA proximity (predecessors & successors). - for (TaskNode *producer : task_node->ssa_operands) { - if (!producer->placement.empty()) { - int dist = current_pos.manhattanDistance(producer->placement[0]); - // Uses negative distance to penalize far-away placements. - ssa_score -= dist; - } - } - for (TaskNode *consumer : task_node->ssa_users) { - if (!consumer->placement.empty()) { - int dist = current_pos.manhattanDistance(consumer->placement[0]); - ssa_score -= dist; - } - } - - // 2. Memory proximity. - // For read memrefs. - for (MemoryNode *mem : task_node->read_memrefs) { - if (mem->assigned_sram_pos) { - int dist = current_pos.manhattanDistance(*mem->assigned_sram_pos); - mem_score -= dist; - } - } - // For write memrefs. - // If we write to a memory that is already assigned (e.g. read by previous - // task), we want to be close to it too. 
- for (MemoryNode *mem : task_node->write_memrefs) { - if (mem->assigned_sram_pos) { - int dist = current_pos.manhattanDistance(*mem->assigned_sram_pos); - mem_score -= dist; - } - } - - return kAlpha * ssa_score + kBeta * mem_score; - } - - /// Computes dependency depth for all tasks in the graph. - /// - /// Dependency depth = longest path from this node to any sink node in the - /// dependency graph (via SSA or memory edges). - /// - /// Tasks with higher dependency depth have longer chains of dependent tasks - /// after them. By placing these tasks first: - /// 1. They get priority access to good grid positions. - /// 2. Their dependent tasks can then be positioned adjacent to them, - /// minimizing inter-task communication distance. - void computeDependencyDepth(TaskMemoryGraph &graph) { - DenseMap depth_cache; - for (auto &node : graph.task_nodes) { - node->dependency_depth = calculateDepth(node.get(), depth_cache); - } - } - - /// Recursively calculates dependency depth for a single task. - int calculateDepth(TaskNode *node, DenseMap &depth_cache) { - if (depth_cache.count(node)) { - return depth_cache[node]; - } - - int max_child_depth = 0; - // SSA dependencies. - for (TaskNode *child : node->ssa_users) { - max_child_depth = - std::max(max_child_depth, calculateDepth(child, depth_cache) + 1); - } - - // Memory dependencies (Producer -> Mem -> Consumer). 
- for (MemoryNode *mem : node->write_memrefs) { - for (TaskNode *reader : mem->readers) { - if (reader != node) { - max_child_depth = std::max(max_child_depth, - calculateDepth(reader, depth_cache) + 1); - } - } - } - - return depth_cache[node] = max_child_depth; - } - - int grid_rows_; - int grid_cols_; - std::vector> occupied_; -}; - -//===----------------------------------------------------------------------===// -// Pass Definition -//===----------------------------------------------------------------------===// -struct MapTaskOnCgraPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapTaskOnCgraPass) - - MapTaskOnCgraPass() = default; - - StringRef getArgument() const override { return "map-task-on-cgra"; } - - StringRef getDescription() const override { - return "Maps Taskflow tasks onto a 2D CGRA grid with adjacency " - "optimization and memory mapping."; - } - - void runOnOperation() override { - func::FuncOp func = getOperation(); - constexpr int kDefaultGridRows = 3; - constexpr int kDefaultGridCols = 3; - TaskMapper mapper(kDefaultGridRows, kDefaultGridCols); - mapper.place(func); - } -}; - -} // namespace - -namespace mlir { -namespace taskflow { - -std::unique_ptr createMapTaskOnCgraPass() { - return std::make_unique(); -} - -} // namespace taskflow -} // namespace mlir diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 0e18b971..4d4892f5 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -29,7 +29,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-task-on-cgra \ +// RUN: --allocate-cgra-to-task \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT diff --git 
a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index fdbe54da..ece77e32 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -61,7 +61,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-task-on-cgra \ +// RUN: --allocate-cgra-to-task \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT @@ -524,7 +524,7 @@ module attributes {} { // PLACEMENT: module { // PLACEMENT-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// PLACEMENT-NEXT: %dependency_read_out, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0 : memref) dependency_write_in(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locations = [{col = 0 : i32, row = 0 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]}} : (memref, memref) -> (memref, memref) { +// PLACEMENT-NEXT: %dependency_read_out, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0 : memref) dependency_write_in(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locations = [{col = 0 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 0 : i32}]}} : (memref, memref) -> (memref, memref) { // PLACEMENT-NEXT: ^bb0(%arg10: memref, %arg11: memref): // PLACEMENT-NEXT: %1 = 
taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // PLACEMENT-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -537,7 +537,7 @@ module attributes {} { // PLACEMENT-NEXT: }) : (index, index, index) -> () // PLACEMENT-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // PLACEMENT-NEXT: } -// PLACEMENT-NEXT: %dependency_read_out_0:2, %dependency_write_out_1 = taskflow.task @Task_1 dependency_read_in(%arg1, %arg2 : memref, memref) dependency_write_in(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]}} : (memref, memref, memref) -> (memref, memref, memref) { +// PLACEMENT-NEXT: %dependency_read_out_0:2, %dependency_write_out_1 = taskflow.task @Task_1 dependency_read_in(%arg1, %arg2 : memref, memref) dependency_write_in(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 0 : i32}]}} : (memref, memref, memref) -> (memref, memref, memref) { // PLACEMENT-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // PLACEMENT-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // PLACEMENT-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -552,7 +552,7 @@ module attributes {} { // PLACEMENT-NEXT: }) : (index, index, index) -> () // 
PLACEMENT-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // PLACEMENT-NEXT: } -// PLACEMENT-NEXT: %dependency_read_out_2:3, %dependency_write_out_3 = taskflow.task @Task_2 dependency_read_in(%dependency_write_out, %dependency_write_out_1, %arg9 : memref, memref, memref) dependency_write_in(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locations = [{col = 0 : i32, row = 1 : i32}, {col = 1 : i32, row = 1 : i32}, {col = 0 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]}} : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { +// PLACEMENT-NEXT: %dependency_read_out_2:3, %dependency_write_out_3 = taskflow.task @Task_2 dependency_read_in(%dependency_write_out, %dependency_write_out_1, %arg9 : memref, memref, memref) dependency_write_in(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] {task_mapping_info = {cgra_positions = [{col = 3 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}, {col = 2 : i32, row = 0 : i32}, {col = 3 : i32, row = 0 : i32}], write_sram_locations = [{col = 3 : i32, row = 0 : i32}]}} : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { // PLACEMENT-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): // PLACEMENT-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // PLACEMENT-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -571,7 +571,7 @@ module attributes {} { // PLACEMENT-NEXT: }) : (index) -> () // PLACEMENT-NEXT: taskflow.yield reads(%arg10, %arg11, %arg13 : memref, memref, memref) writes(%arg13 : 
memref) // PLACEMENT-NEXT: } -// PLACEMENT-NEXT: %dependency_read_out_4, %dependency_write_out_5 = taskflow.task @Task_3 dependency_read_in(%arg3 : memref) dependency_write_in(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] {task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 1 : i32}]}} : (memref, memref) -> (memref, memref) { +// PLACEMENT-NEXT: %dependency_read_out_4, %dependency_write_out_5 = taskflow.task @Task_3 dependency_read_in(%arg3 : memref) dependency_write_in(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] {task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]}} : (memref, memref) -> (memref, memref) { // PLACEMENT-NEXT: ^bb0(%arg10: memref, %arg11: memref): // PLACEMENT-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // PLACEMENT-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 7 : index} : index @@ -583,7 +583,7 @@ module attributes {} { // PLACEMENT-NEXT: }) : (index, index) -> () // PLACEMENT-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // PLACEMENT-NEXT: } -// PLACEMENT-NEXT: %dependency_read_out_6:2, %dependency_write_out_7 = taskflow.task @Task_4 dependency_read_in(%arg4, %dependency_write_out_5 : memref, memref) dependency_write_in(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}, {col = 2 : i32, row = 1 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : 
i32}]}} : (memref, memref, memref) -> (memref, memref, memref) { +// PLACEMENT-NEXT: %dependency_read_out_6:2, %dependency_write_out_7 = taskflow.task @Task_4 dependency_read_in(%arg4, %dependency_write_out_5 : memref, memref) dependency_write_in(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locations = [{col = 0 : i32, row = 1 : i32}, {col = 1 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]}} : (memref, memref, memref) -> (memref, memref, memref) { // PLACEMENT-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // PLACEMENT-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // PLACEMENT-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 9 : index} : index diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index fa1135ad..ea42d03e 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -42,7 +42,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-task-on-cgra \ +// RUN: --allocate-cgra-to-task \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT