From 31911d228f1b4e2613cf0a260da35c8bf95078a5 Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Sun, 1 Feb 2026 05:46:33 +0800
Subject: [PATCH 01/15] feat(taskflow): implement graph-based ACT placement and
 memory management framework

---
 include/TaskflowDialect/TaskflowPasses.h      |   4 +
 include/TaskflowDialect/TaskflowPasses.td     |  15 +
 lib/TaskflowDialect/Transforms/CMakeLists.txt |   4 +
 .../Transforms/PlaceACTOnCGRAPass.cpp         | 481 ++++++++++++++++++
 .../irregular-loop/irregular-loop.mlir        |  13 +
 .../taskflow/multi-nested/multi-nested.mlir   |  20 +-
 .../parallel-nested/parallel-nested.mlir      |  14 +-
 7 files changed, 549 insertions(+), 2 deletions(-)
 create mode 100644 lib/TaskflowDialect/Transforms/PlaceACTOnCGRAPass.cpp

diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
index c0007ce1..61073053 100644
--- a/include/TaskflowDialect/TaskflowPasses.h
+++ b/include/TaskflowDialect/TaskflowPasses.h
@@ -17,7 +17,11 @@ namespace taskflow {
 #include "TaskflowDialect/TaskflowPasses.h.inc"
 std::unique_ptr<mlir::Pass> createConstructHyperblockFromTaskPass();
 std::unique_ptr<mlir::Pass> createCanonicalizeTaskPass();
+<<<<<<< HEAD
 std::unique_ptr<mlir::Pass> createClassifyCountersPass();
+=======
+std::unique_ptr<mlir::Pass> createPlaceACTOnCGRAPass();
+>>>>>>> a0f7fc7 (feat(taskflow): implement graph-based ACT placement and memory management framework)
 
 #define GEN_PASS_REGISTRATION
 #include "TaskflowDialect/TaskflowPasses.h.inc"
diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td
index 4fc2137f..c01d8f0d 100644
--- a/include/TaskflowDialect/TaskflowPasses.td
+++ b/include/TaskflowDialect/TaskflowPasses.td
@@ -30,6 +30,7 @@ def CanonicalizeTask: Pass<"canonicalize-task", "func::FuncOp">{
   let constructor = "taskflow::createCanonicalizeTaskPass()";
 }
 
+<<<<<<< HEAD
 def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{
   let summary = "Classifies counters as root/relay/leaf";
   let description = [{
@@ -42,5 +43,19 @@ def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{
     Leaf counters are mapped to CGRA tile arrays.
   }];
   let constructor = "taskflow::createClassifyCountersPass()";
+=======
+
+
+def PlaceACTOnCGRA : Pass<"place-act-on-cgra", "func::FuncOp"> {
+  let summary = "Places ACTs onto a 2D CGRA grid with adjacency optimization";
+  let description = [{
+    This pass places Atomic Canonical Tasks (ACTs) onto a 2D CGRA grid.
+    Fusion candidates (same-header SSA dependencies) are placed on adjacent
+    CGRAs to enable direct data forwarding.
+
+    Uses a default 4x4 CGRA grid.
+  }];
+  let constructor = "taskflow::createPlaceACTOnCGRAPass()";
+>>>>>>> a0f7fc7 (feat(taskflow): implement graph-based ACT placement and memory management framework)
 }
 #endif // TASKFLOW_PASSES_TD
\ No newline at end of file
diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt
index e44401d8..5177f4ae 100644
--- a/lib/TaskflowDialect/Transforms/CMakeLists.txt
+++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt
@@ -3,7 +3,11 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 add_mlir_library(MLIRTaskflowTransforms
     ConstructHyperblockFromTaskPass.cpp
     CanonicalizeTaskPass.cpp
+<<<<<<< HEAD
     ClassifyCountersPass.cpp
+=======
+    PlaceACTOnCGRAPass.cpp
+>>>>>>> a0f7fc7 (feat(taskflow): implement graph-based ACT placement and memory management framework)
 
     DEPENDS
     MLIRTaskflowTransformsIncGen
diff --git a/lib/TaskflowDialect/Transforms/PlaceACTOnCGRAPass.cpp b/lib/TaskflowDialect/Transforms/PlaceACTOnCGRAPass.cpp
new file mode 100644
index 00000000..e0d67c78
--- /dev/null
+++ b/lib/TaskflowDialect/Transforms/PlaceACTOnCGRAPass.cpp
@@ -0,0 +1,481 @@
+//===- PlaceACTOnCGRAPass.cpp - ACT to CGRA Placement Pass ----------------===//
+//
+// This pass places Atomic Canonical Tasks (ACTs) onto a 2D CGRA grid：
+// 1. SSA use-def placement: Tasks with SSA dependencies placed on adjacent CGRAs.
+// 2. Memory mapping: Assigns memrefs to SRAMs (single-SRAM constraint per data).
+//
+//===----------------------------------------------------------------------===//
+
+#include "TaskflowDialect/TaskflowDialect.h"
+#include "TaskflowDialect/TaskflowOps.h"
+#include "TaskflowDialect/TaskflowPasses.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+#include <climits>
+#include <cmath>
+#include <string>
+#include <vector>
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// CGRA Grid Position
+//===----------------------------------------------------------------------===//
+/// Represents a position on the 2D CGRA grid.
+struct CGRAPosition {
+  int row;
+  int col;
+
+  bool operator==(const CGRAPosition &other) const {
+    return row == other.row && col == other.col;
+  }
+
+  /// Computes Manhattan distance to another position.
+  int manhattanDistance(const CGRAPosition &other) const {
+    return std::abs(row - other.row) + std::abs(col - other.col);
+  }
+
+  /// Checks if adjacent (Manhattan distance = 1).
+  bool isAdjacent(const CGRAPosition &other) const {
+    return manhattanDistance(other) == 1;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Task Placement Info
+//===----------------------------------------------------------------------===//
+/// Stores placement info for a task: can span multiple combined CGRAs.
+struct TaskPlacement {
+  SmallVector<CGRAPosition> cgra_positions; // CGRAs assigned to this task.
+
+  /// Returns the primary (first) position.
+  CGRAPosition primary() const {
+    return cgra_positions.empty() ? CGRAPosition{-1, -1} : cgra_positions[0];
+  }
+
+  /// Returns the number of CGRAs assigned.
+  size_t cgraCount() const { return cgra_positions.size(); }
+
+  /// Checks if any CGRA in this task is adjacent to any in other task.
+  bool hasAdjacentCGRA(const TaskPlacement &other) const {
+    for (const auto &pos : cgra_positions) {
+      for (const auto &other_pos : other.cgra_positions) {
+        if (pos.isAdjacent(other_pos)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Task-Memory Graph
+//===----------------------------------------------------------------------===//
+
+struct MemoryNode;
+
+/// Represents a Task node in the graph.
+struct TaskNode {
+  size_t id;
+  TaskflowTaskOp op;
+  int alap_level = 0;
+  
+  // Edges
+  SmallVector<MemoryNode *> read_memrefs;
+  SmallVector<MemoryNode *> write_memrefs;
+  SmallVector<TaskNode *> ssa_users;
+  SmallVector<TaskNode *> ssa_operands;
+
+  // Placement result
+  SmallVector<CGRAPosition> placement;
+
+  TaskNode(size_t id, TaskflowTaskOp op) : id(id), op(op) {}
+};
+
+/// Represents a Memory node (MemRef) in the graph.
+struct MemoryNode {
+  Value memref;
+  
+  // Edges
+  SmallVector<TaskNode *> readers;
+  SmallVector<TaskNode *> writers;
+
+  // Mapping result
+  int assigned_sram_id = -1;
+
+  MemoryNode(Value memref) : memref(memref) {}
+};
+
+/// The Task-Memory Dependency Graph.
+class TaskMemoryGraph {
+public:
+  SmallVector<std::unique_ptr<TaskNode>> task_nodes;
+  SmallVector<std::unique_ptr<MemoryNode>> memory_nodes;
+  DenseMap<Value, MemoryNode *> memref_to_node;
+  DenseMap<Operation *, TaskNode *> op_to_node;
+
+  void build(func::FuncOp func) {
+    // 1. Creates TaskNodes.
+    size_t task_id = 0;
+    func.walk([&](TaskflowTaskOp task) {
+      auto node = std::make_unique<TaskNode>(task_id++, task);
+      op_to_node[task] = node.get();
+      task_nodes.push_back(std::move(node));
+    });
+
+    // 2. Creates MemoryNodes and defines Edges.
+    for (auto &t_node : task_nodes) {
+      // Memory Inputs (Reads)
+      for (Value input : t_node->op.getMemoryInputs()) {
+        MemoryNode *m_node = getOrCreateMemoryNode(input);
+        t_node->read_memrefs.push_back(m_node);
+        m_node->readers.push_back(t_node.get());
+      }
+
+      // Memory Outputs (Writes)
+      for (Value output : t_node->op.getMemoryOutputs()) {
+        MemoryNode *m_node = getOrCreateMemoryNode(output);
+        t_node->write_memrefs.push_back(m_node);
+        m_node->writers.push_back(t_node.get());
+      }
+    }
+
+    // 3. Builds SSA Edges (Inter-Task Value Dependencies).
+    // Identifies if a task uses a value produced by another task.
+    for (auto &consumer_node : task_nodes) {
+        // Iterate all operands for now to be safe.
+        for (Value operand : consumer_node->op->getOperands()) {
+            if (auto producer_op = operand.getDefiningOp<TaskflowTaskOp>()) {
+                if (auto *producer_node = op_to_node[producer_op]) {
+                    producer_node->ssa_users.push_back(consumer_node.get());
+                    consumer_node->ssa_operands.push_back(producer_node);
+                }
+            }
+        }
+    }
+  }
+
+private:
+  MemoryNode *getOrCreateMemoryNode(Value memref) {
+    if (memref_to_node.count(memref))
+      return memref_to_node[memref];
+    
+    auto node = std::make_unique<MemoryNode>(memref);
+    MemoryNode *ptr = node.get();
+    memref_to_node[memref] = ptr;
+    memory_nodes.push_back(std::move(node));
+    return ptr;
+  }
+};
+
+
+//===----------------------------------------------------------------------===//
+// CGRA Placer
+//===----------------------------------------------------------------------===//
+/// Places ACTs onto a 2D CGRA grid with memory mapping.
+class CGRAPlacer {
+public:
+  CGRAPlacer(int grid_rows, int grid_cols)
+      : grid_rows_(grid_rows), grid_cols_(grid_cols) {
+    occupied_.resize(grid_rows_);
+    for (auto &row : occupied_) {
+      row.resize(grid_cols_, false);
+    }
+  }
+
+  /// Places all tasks and performs memory mapping.
+  void place(func::FuncOp func) {
+    SmallVector<TaskflowTaskOp> tasks;
+    func.walk([&](TaskflowTaskOp task) { tasks.push_back(task); });
+
+    if (tasks.empty()) {
+      llvm::errs() << "No tasks to place.\n";
+      return;
+    }
+
+
+    // Extracts counter chains and builds dependency graph.
+    // Builds Task-Memory Graph.
+    TaskMemoryGraph graph;
+    graph.build(func);
+
+    if (graph.task_nodes.empty()) {
+      llvm::errs() << "No tasks to place.\n";
+      return;
+    }
+
+    // Computes ALAP Levels on the Task Graph.
+    computeALAP(graph);
+
+    // Sorts tasks by ALAP level (Critical Path First).
+    SmallVector<TaskNode *> sorted_tasks;
+    for (auto &node : graph.task_nodes) sorted_tasks.push_back(node.get());
+    
+    std::stable_sort(sorted_tasks.begin(), sorted_tasks.end(), 
+        [](TaskNode *a, TaskNode *b) {
+            return a->alap_level > b->alap_level; 
+        });
+
+    // Critical path priority placement:
+    // 1. Computes ALAP level for each task (longest path to sink).
+    // 2. Sorts tasks by: (a) ALAP level, (b) criticality, (c) degree.
+    // 3. Places tasks in sorted order with heuristic scoring.
+    // Placement Loop.
+    for (TaskNode *task_node : sorted_tasks) {
+      int cgra_count = 1;
+      if (auto attr = task_node->op->getAttrOfType<IntegerAttr>("cgra_count")) {
+        cgra_count = attr.getInt();
+      }
+
+      // Finds Best Placement.
+      // Heuristic: Minimizes distance to:
+      // 1. SSA Producers (that are already placed).
+      // 2. SRAMs of Input MemRefs (if already assigned).
+      TaskPlacement placement = findBestPlacement(task_node, cgra_count, graph);
+      
+      // Commits Placement.
+      task_node->placement.push_back(placement.primary());
+      // Handles multi-cgra if needed.
+      for (size_t i = 1; i < placement.cgra_positions.size(); ++i) {
+         task_node->placement.push_back(placement.cgra_positions[i]);
+      }
+
+      // Marks Occupied.
+      for (const auto &pos : placement.cgra_positions) {
+        if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && pos.col < grid_cols_)
+            occupied_[pos.row][pos.col] = true;
+      }
+
+      // Maps Associated Memory Nodes.
+      // For each MemRef this task touches, if not yet assigned to SRAM, assign to nearest.
+      for (MemoryNode *mem_node : task_node->read_memrefs) {
+        if (mem_node->assigned_sram_id == -1) {
+            mem_node->assigned_sram_id = assignSRAM(mem_node, placement);
+        }
+      }
+      for (MemoryNode *mem_node : task_node->write_memrefs) {
+        if (mem_node->assigned_sram_id == -1) {
+            mem_node->assigned_sram_id = assignSRAM(mem_node, placement);
+        }
+      }
+    }
+
+    // Annotates Result.
+    OpBuilder builder(func.getContext());
+    for (auto &task_node : graph.task_nodes) {
+        if (task_node->placement.empty()) continue;
+        CGRAPosition pos = task_node->placement[0];
+        task_node->op->setAttr("cgra_row", builder.getI32IntegerAttr(pos.row));
+        task_node->op->setAttr("cgra_col", builder.getI32IntegerAttr(pos.col));
+        task_node->op->setAttr("cgra_count", builder.getI32IntegerAttr(task_node->placement.size()));
+    }
+  }
+
+private:
+  /// Assigns a memref to the closest SRAM near the given task position.
+  /// TODO: Integrate with Arch Spec to map logical SRAM IDs (row*100 + col) to
+  /// physical hardware Block IDs, especially for shared or asymmetric SRAMs.
+  int assignSRAM(MemoryNode *mem_node, const TaskPlacement &placement) {
+    if (mem_node->assigned_sram_id != -1)
+      return mem_node->assigned_sram_id;
+
+    // Assigns to a new SRAM near the task's primary CGRA.
+    CGRAPosition pos = placement.primary();
+    int sram_id = pos.row * 100 + pos.col; // Simple encoding: row*100 + col.
+    mem_node->assigned_sram_id = sram_id;
+    return sram_id;
+  }
+
+private:
+  /// Finds best placement for a task requiring cgra_count CGRAs.
+  /// TODO: Implement a block-search algorithm for tasks with cgra_count > 1 to
+  /// find contiguous rectangular regions instead of single tiles.
+  TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count,
+                                  TaskMemoryGraph &graph) {
+    int best_score = INT_MIN;
+    TaskPlacement best_placement;
+
+    // Baseline: For cgra_count=1, finds single best position.
+    for (int r = 0; r < grid_rows_; ++r) {
+      for (int c = 0; c < grid_cols_; ++c) {
+        if (occupied_[r][c])
+          continue;
+
+        TaskPlacement candidate;
+        candidate.cgra_positions.push_back({r, c});
+
+        int score = computeScore(task_node, candidate, graph);
+        if (score > best_score) {
+          best_score = score;
+          best_placement = candidate;
+        }
+      }
+    }
+
+    // Error handling: No available position found (grid over-subscribed).
+    if (best_placement.cgra_positions.empty()) {
+      llvm::errs() << "Warning: No available CGRA position for task "
+                   << task_node->id << ". Grid is over-subscribed (" << grid_rows_
+                   << "x" << grid_cols_ << " grid with all cells occupied).\n";
+      // Fallback: Assign to position (0,0) with a warning.
+      best_placement.cgra_positions.push_back({0, 0});
+    }
+
+    return best_placement;
+  }
+
+  /// Computes placement score based on Task-Memory Graph.
+  /// TODO: Introduce explicit 'direct_wires' attributes in the IR for
+  /// downstream hardware generators to configure fast bypass paths between
+  /// adjacent PEs with dependencies.
+  ///
+  /// Score = α·SSA_Dist + β·Mem_Dist + γ·Balance
+  ///
+  /// SSA_Dist: Minimize distance to placed SSA predecessors (ssa_operands).
+  /// Mem_Dist: Minimize distance to assigned SRAMs for read/write memrefs.
+  int computeScore(TaskNode *task_node, const TaskPlacement &placement,
+                   TaskMemoryGraph &graph) {
+    // Weight constants (tunable).
+    constexpr int kAlpha = 10;   // SSA proximity weight
+    constexpr int kBeta = 50;    // Memory proximity weight (High priority)
+    constexpr int kGamma = 20;   // Load balance weight
+
+    int ssa_score = 0;
+    int mem_score = 0;
+    int bal_score = 0;
+    
+    CGRAPosition current_pos = placement.primary();
+
+    // 1. SSA Proximity (Predecessors)
+    for (TaskNode *producer : task_node->ssa_operands) {
+        if (!producer->placement.empty()) {
+            int dist = current_pos.manhattanDistance(producer->placement[0]);
+            // Uses negative distance to penalize far-away placements.
+            ssa_score -= dist;
+        }
+    }
+
+    // 2. Memory Proximity
+    // For Read MemRefs
+    for (MemoryNode *mem : task_node->read_memrefs) {
+        if (mem->assigned_sram_id != -1) {
+            // SRAM ID encoding: row*100 + col
+            int sram_r = mem->assigned_sram_id / 100;
+            int sram_c = mem->assigned_sram_id % 100;
+            CGRAPosition sram_pos{sram_r, sram_c};
+            int dist = current_pos.manhattanDistance(sram_pos);
+            mem_score -= dist;
+        }
+    }
+    // For Write MemRefs
+    // If we write to a memory that is already assigned (e.g. read by previous task),
+    // we want to be close to it too.
+    for (MemoryNode *mem : task_node->write_memrefs) {
+         if (mem->assigned_sram_id != -1) {
+            int sram_r = mem->assigned_sram_id / 100;
+            int sram_c = mem->assigned_sram_id % 100;
+            CGRAPosition sram_pos{sram_r, sram_c};
+            int dist = current_pos.manhattanDistance(sram_pos);
+            mem_score -= dist;
+        }
+    }
+
+    // 3. Load Balance
+    // Prefers less crowded rows/cols.
+    int row_count = 0, col_count = 0;
+    for (int c = 0; c < grid_cols_; ++c) { if (occupied_[current_pos.row][c]) row_count++; }
+    for (int r = 0; r < grid_rows_; ++r) { if (occupied_[r][current_pos.col]) col_count++; }
+    bal_score = (grid_cols_ - row_count) + (grid_rows_ - col_count);
+
+    return kAlpha * ssa_score + kBeta * mem_score + kGamma * bal_score;
+  }
+
+  /// Computes ALAP levels for efficient scheduling order.
+  void computeALAP(TaskMemoryGraph &graph) {
+    // 1. Calculates in-degrees for topological sort simulation.
+    DenseMap<TaskNode*, int> in_degree;
+    for (auto &node : graph.task_nodes) {
+        for (TaskNode *user : node->ssa_users) in_degree[user]++;
+    }
+    
+    // 2. DFS for longest path from node to any sink (ALAP Level).
+    DenseMap<TaskNode*, int> memo;
+    for (auto &node : graph.task_nodes) {
+        node->alap_level = calculateLevel(node.get(), memo);
+    }
+  }
+
+  int calculateLevel(TaskNode *node, DenseMap<TaskNode*, int> &memo) {
+    if (memo.count(node)) return memo[node];
+
+    int max_child_level = 0;
+    for (TaskNode *child : node->ssa_users) {
+        max_child_level = std::max(max_child_level, calculateLevel(child, memo) + 1);
+    }
+
+    // Check memory dependencies too (Producer -> Mem -> Consumer)
+    for (MemoryNode *mem : node->write_memrefs) {
+        for (TaskNode *reader : mem->readers) {
+            if (reader != node)
+                max_child_level = std::max(max_child_level, calculateLevel(reader, memo) + 1);
+        }
+    }
+
+    return memo[node] = max_child_level;
+  }
+
+
+
+  int grid_rows_;
+  int grid_cols_;
+  std::vector<std::vector<bool>> occupied_;
+};
+
+//===----------------------------------------------------------------------===//
+// Pass Definition
+//===----------------------------------------------------------------------===//
+struct PlaceACTOnCGRAPass
+    : public PassWrapper<PlaceACTOnCGRAPass, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PlaceACTOnCGRAPass)
+
+  PlaceACTOnCGRAPass() = default;
+
+  StringRef getArgument() const override { return "place-act-on-cgra"; }
+
+  StringRef getDescription() const override {
+    return "Places ACTs onto a 2D CGRA grid with adjacency optimization and "
+           "memory mapping.";
+  }
+
+  void runOnOperation() override {
+    func::FuncOp func = getOperation();
+    constexpr int kDefaultGridRows = 4;
+    constexpr int kDefaultGridCols = 4;
+    CGRAPlacer placer(kDefaultGridRows, kDefaultGridCols);
+    placer.place(func);
+  }
+};
+
+} // namespace
+
+namespace mlir {
+namespace taskflow {
+
+std::unique_ptr<Pass> createPlaceACTOnCGRAPass() {
+  return std::make_unique<PlaceACTOnCGRAPass>();
+}
+
+} // namespace taskflow
+} // namespace mlir
diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
index 9d1e6f46..7a8aa882 100644
--- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
+++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
@@ -13,6 +13,13 @@
 // RUN: -o %t.canonicalized.mlir
 // RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE
 
+// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
+// RUN: --construct-hyperblock-from-task \
+// RUN: --canonicalize-task \
+// RUN: --place-act-on-cgra \
+// RUN: -o %t.placement.mlir
+// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
+
 #set = affine_set<(d0, d1) : (d0 - 3 == 0, d1 - 7 == 0)>
 module attributes {} {
   func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
@@ -216,3 +223,9 @@ module attributes {} {
 // CANONICALIZE-NEXT:   }
 // CANONICALIZE-NEXT: }
 
+// PLACEMENT: task_name = "Task_0"
+// PLACEMENT: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT: task_name = "Task_1"
+// PLACEMENT: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
+// PLACEMENT: task_name = "Task_2"
+// PLACEMENT: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
index c5f75f28..c878bbec 100644
--- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
+++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
@@ -13,6 +13,13 @@
 // RUN: -o %t.canonicalized.mlir
 // RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE
 
+// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
+// RUN: --construct-hyperblock-from-task \
+// RUN: --canonicalize-task \
+// RUN: --place-act-on-cgra \
+// RUN: -o %t.placement.mlir
+// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
+
 module attributes {} {
   func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref<?x8x6xi32>, %arg1: memref<?x8x5xi32>, %arg2: memref<?x8x5xi32>, %arg3: memref<?x7xi32>, %arg4: memref<?x9xi32>, %arg5: memref<?xi32>, %arg6: memref<?xi32>, %arg7: memref<?xi32>, %arg8: memref<?xi32>, %arg9: memref<?xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
     affine.for %arg10 = 0 to 4 {
@@ -231,4 +238,15 @@ module attributes {} {
 // CANONICALIZE-NEXT:     %0 = affine.load %memory_outputs_1[0] : memref<?xi32>
 // CANONICALIZE-NEXT:     return %0 : i32
 // CANONICALIZE-NEXT:   }
-// CANONICALIZE-NEXT: }
\ No newline at end of file
+// CANONICALIZE-NEXT: }
+
+// PLACEMENT: task_name = "Task_0"
+// PLACEMENT: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT: task_name = "Task_1"
+// PLACEMENT: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
+// PLACEMENT: task_name = "Task_2"
+// PLACEMENT: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT: task_name = "Task_3"
+// PLACEMENT: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 2 : i32
+// PLACEMENT: task_name = "Task_4"
+// PLACEMENT: cgra_col = 3 : i32, cgra_count = 1 : i32, cgra_row = 2 : i32
\ No newline at end of file
diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
index ee37c831..d5c32cf7 100644
--- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
+++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
@@ -13,6 +13,13 @@
 // RUN: -o %t.canonicalized.mlir
 // RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE
 
+// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
+// RUN: --construct-hyperblock-from-task \
+// RUN: --canonicalize-task \
+// RUN: --place-act-on-cgra \
+// RUN: -o %t.placement.mlir
+// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
+
 module {
   // Example: Parallel nested loops scenario
   // Task 0: Single-level loop (vector scaling)
@@ -133,4 +140,9 @@ module {
 // CANONICALIZE-NEXT:     }) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> memref<8x8xf32>
 // CANONICALIZE-NEXT:     return
 // CANONICALIZE-NEXT:   }
-// CANONICALIZE-NEXT: }
\ No newline at end of file
+// CANONICALIZE-NEXT: }
+
+// PLACEMENT: task_name = "Task_0"
+// PLACEMENT: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT: task_name = "Task_1"
+// PLACEMENT: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
\ No newline at end of file

From 4ead25c38c4e164407fe4bd86bf2f71bd0be7298 Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Sun, 1 Feb 2026 06:10:21 +0800
Subject: [PATCH 02/15] chore: trigger PR sync and terminology update

---
 include/TaskflowDialect/TaskflowPasses.h      |  5 +-
 include/TaskflowDialect/TaskflowPasses.td     | 13 ++--
 lib/TaskflowDialect/Transforms/CMakeLists.txt |  5 +-
 ...nCGRAPass.cpp => MapCTOnCGRAArrayPass.cpp} | 75 +++++++++++--------
 test/e2e/bicg/bicg_int_kernel.mlir            |  2 +-
 .../irregular-loop/irregular-loop.mlir        |  2 +-
 .../taskflow/multi-nested/multi-nested.mlir   |  2 +-
 .../parallel-nested/parallel-nested.mlir      |  2 +-
 8 files changed, 53 insertions(+), 53 deletions(-)
 rename lib/TaskflowDialect/Transforms/{PlaceACTOnCGRAPass.cpp => MapCTOnCGRAArrayPass.cpp} (86%)

diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
index 61073053..f766ceee 100644
--- a/include/TaskflowDialect/TaskflowPasses.h
+++ b/include/TaskflowDialect/TaskflowPasses.h
@@ -17,11 +17,8 @@ namespace taskflow {
 #include "TaskflowDialect/TaskflowPasses.h.inc"
 std::unique_ptr<mlir::Pass> createConstructHyperblockFromTaskPass();
 std::unique_ptr<mlir::Pass> createCanonicalizeTaskPass();
-<<<<<<< HEAD
 std::unique_ptr<mlir::Pass> createClassifyCountersPass();
-=======
-std::unique_ptr<mlir::Pass> createPlaceACTOnCGRAPass();
->>>>>>> a0f7fc7 (feat(taskflow): implement graph-based ACT placement and memory management framework)
+std::unique_ptr<mlir::Pass> createMapCTOnCGRAArrayPass();
 
 #define GEN_PASS_REGISTRATION
 #include "TaskflowDialect/TaskflowPasses.h.inc"
diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td
index c01d8f0d..c7de281b 100644
--- a/include/TaskflowDialect/TaskflowPasses.td
+++ b/include/TaskflowDialect/TaskflowPasses.td
@@ -30,7 +30,6 @@ def CanonicalizeTask: Pass<"canonicalize-task", "func::FuncOp">{
   let constructor = "taskflow::createCanonicalizeTaskPass()";
 }
 
-<<<<<<< HEAD
 def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{
   let summary = "Classifies counters as root/relay/leaf";
   let description = [{
@@ -43,19 +42,17 @@ def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{
     Leaf counters are mapped to CGRA tile arrays.
   }];
   let constructor = "taskflow::createClassifyCountersPass()";
-=======
-
+}
 
-def PlaceACTOnCGRA : Pass<"place-act-on-cgra", "func::FuncOp"> {
-  let summary = "Places ACTs onto a 2D CGRA grid with adjacency optimization";
+def MapCTOnCGRAArray : Pass<"map-ct-on-cgra-array", "func::FuncOp"> {
+  let summary = "Maps Canonical Tasks (CTs) onto a 2D CGRA grid array";
   let description = [{
-    This pass places Atomic Canonical Tasks (ACTs) onto a 2D CGRA grid.
+    This pass maps Canonical Tasks (CTs) onto a 2D CGRA grid array.
     Fusion candidates (same-header SSA dependencies) are placed on adjacent
     CGRAs to enable direct data forwarding.
 
     Uses a default 4x4 CGRA grid.
   }];
-  let constructor = "taskflow::createPlaceACTOnCGRAPass()";
->>>>>>> a0f7fc7 (feat(taskflow): implement graph-based ACT placement and memory management framework)
+  let constructor = "taskflow::createMapCTOnCGRAArrayPass()";
 }
 #endif // TASKFLOW_PASSES_TD
\ No newline at end of file
diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt
index 5177f4ae..d777396e 100644
--- a/lib/TaskflowDialect/Transforms/CMakeLists.txt
+++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt
@@ -3,11 +3,8 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 add_mlir_library(MLIRTaskflowTransforms
     ConstructHyperblockFromTaskPass.cpp
     CanonicalizeTaskPass.cpp
-<<<<<<< HEAD
     ClassifyCountersPass.cpp
-=======
-    PlaceACTOnCGRAPass.cpp
->>>>>>> a0f7fc7 (feat(taskflow): implement graph-based ACT placement and memory management framework)
+    MapCTOnCGRAArrayPass.cpp
 
     DEPENDS
     MLIRTaskflowTransformsIncGen
diff --git a/lib/TaskflowDialect/Transforms/PlaceACTOnCGRAPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
similarity index 86%
rename from lib/TaskflowDialect/Transforms/PlaceACTOnCGRAPass.cpp
rename to lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
index e0d67c78..2b478927 100644
--- a/lib/TaskflowDialect/Transforms/PlaceACTOnCGRAPass.cpp
+++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
@@ -1,8 +1,9 @@
-//===- PlaceACTOnCGRAPass.cpp - ACT to CGRA Placement Pass ----------------===//
+//===- MapCTOnCGRAArrayPass.cpp - CT to CGRA Mapping Pass ----------------===//
 //
-// This pass places Atomic Canonical Tasks (ACTs) onto a 2D CGRA grid：
-// 1. SSA use-def placement: Tasks with SSA dependencies placed on adjacent CGRAs.
-// 2. Memory mapping: Assigns memrefs to SRAMs (single-SRAM constraint per data).
+// This pass maps Canonical Tasks (CTs) onto a 2D CGRA grid array:
+// 1. Places tasks with SSA dependencies on adjacent CGRAs.
+// 2. Assigns memrefs to SRAMs (each MemRef is assigned to exactly one SRAM,
+//    determined by proximity to the task that first accesses it).
 //
 //===----------------------------------------------------------------------===//
 
@@ -92,9 +93,9 @@ struct TaskNode {
   TaskflowTaskOp op;
   int alap_level = 0;
   
-  // Edges
-  SmallVector<MemoryNode *> read_memrefs;
-  SmallVector<MemoryNode *> write_memrefs;
+  // Edges (Note: read/write naming refers to taskflow memory_inputs/outputs)
+  SmallVector<MemoryNode *> read_memrefs;  // taskflow.task memory_inputs (readiness triggers)
+  SmallVector<MemoryNode *> write_memrefs; // taskflow.task memory_outputs (produce readiness)
   SmallVector<TaskNode *> ssa_users;
   SmallVector<TaskNode *> ssa_operands;
 
@@ -137,16 +138,29 @@ class TaskMemoryGraph {
 
     // 2. Creates MemoryNodes and defines Edges.
     for (auto &t_node : task_nodes) {
-      // Memory Inputs (Reads)
+      // Memory Inputs (dependency for readiness)
       for (Value input : t_node->op.getMemoryInputs()) {
         MemoryNode *m_node = getOrCreateMemoryNode(input);
         t_node->read_memrefs.push_back(m_node);
         m_node->readers.push_back(t_node.get());
       }
 
-      // Memory Outputs (Writes)
+      // Memory Outputs (produced readiness)
+      Operation *terminator = t_node->op.getBody().front().getTerminator();
       for (Value output : t_node->op.getMemoryOutputs()) {
-        MemoryNode *m_node = getOrCreateMemoryNode(output);
+        unsigned res_idx = mlir::cast<OpResult>(output).getResultNumber();
+        MemoryNode *m_node = nullptr;
+
+        // Ensures the output corresponds to a yield operand.
+        assert(res_idx < terminator->getNumOperands() && "Invalid yield operand index");
+        
+        Value source = terminator->getOperand(res_idx);
+        // Gets (or create if alloc inside) the node for the source value.
+        m_node = getOrCreateMemoryNode(source);
+        
+        // Maps the Output Result to the SAME node.
+        memref_to_node[output] = m_node;
+
         t_node->write_memrefs.push_back(m_node);
         m_node->writers.push_back(t_node.get());
       }
@@ -155,7 +169,7 @@ class TaskMemoryGraph {
     // 3. Builds SSA Edges (Inter-Task Value Dependencies).
     // Identifies if a task uses a value produced by another task.
     for (auto &consumer_node : task_nodes) {
-        // Iterate all operands for now to be safe.
+        // Interates all operands for now to be safe.
         for (Value operand : consumer_node->op->getOperands()) {
             if (auto producer_op = operand.getDefiningOp<TaskflowTaskOp>()) {
                 if (auto *producer_node = op_to_node[producer_op]) {
@@ -298,10 +312,11 @@ class CGRAPlacer {
     return sram_id;
   }
 
-private:
-  /// Finds best placement for a task requiring cgra_count CGRAs.
-  /// TODO: Implement a block-search algorithm for tasks with cgra_count > 1 to
-  /// find contiguous rectangular regions instead of single tiles.
+
+  /// Finds best placement for a task.
+  /// TODO: Currently defaults to single-CGRA placement. Multi-CGRA binding logic
+  /// (cgra_count > 1) is experimental/placeholder and should ideally be handled 
+  /// by an upstream resource binding pass.
   TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count,
                                   TaskMemoryGraph &graph) {
     int best_score = INT_MIN;
@@ -402,15 +417,9 @@ class CGRAPlacer {
     return kAlpha * ssa_score + kBeta * mem_score + kGamma * bal_score;
   }
 
-  /// Computes ALAP levels for efficient scheduling order.
+  /// Computes ALAP levels considering both SSA and memory dependencies.
   void computeALAP(TaskMemoryGraph &graph) {
-    // 1. Calculates in-degrees for topological sort simulation.
-    DenseMap<TaskNode*, int> in_degree;
-    for (auto &node : graph.task_nodes) {
-        for (TaskNode *user : node->ssa_users) in_degree[user]++;
-    }
-    
-    // 2. DFS for longest path from node to any sink (ALAP Level).
+    // DFS for longest path from node to any sink (ALAP Level).
     DenseMap<TaskNode*, int> memo;
     for (auto &node : graph.task_nodes) {
         node->alap_level = calculateLevel(node.get(), memo);
@@ -425,7 +434,7 @@ class CGRAPlacer {
         max_child_level = std::max(max_child_level, calculateLevel(child, memo) + 1);
     }
 
-    // Check memory dependencies too (Producer -> Mem -> Consumer)
+    // Checks memory dependencies too (Producer -> Mem -> Consumer).
     for (MemoryNode *mem : node->write_memrefs) {
         for (TaskNode *reader : mem->readers) {
             if (reader != node)
@@ -446,17 +455,17 @@ class CGRAPlacer {
 //===----------------------------------------------------------------------===//
 // Pass Definition
 //===----------------------------------------------------------------------===//
-struct PlaceACTOnCGRAPass
-    : public PassWrapper<PlaceACTOnCGRAPass, OperationPass<func::FuncOp>> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PlaceACTOnCGRAPass)
+struct MapCTOnCGRAArrayPass
+    : public PassWrapper<MapCTOnCGRAArrayPass, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapCTOnCGRAArrayPass)
 
-  PlaceACTOnCGRAPass() = default;
+  MapCTOnCGRAArrayPass() = default;
 
-  StringRef getArgument() const override { return "place-act-on-cgra"; }
+  StringRef getArgument() const override { return "map-ct-on-cgra-array"; }
 
   StringRef getDescription() const override {
-    return "Places ACTs onto a 2D CGRA grid with adjacency optimization and "
-           "memory mapping.";
+    return "Maps Canonical Tasks (CTs) onto a 2D CGRA grid with adjacency "
+           "optimization and memory mapping.";
   }
 
   void runOnOperation() override {
@@ -473,8 +482,8 @@ struct PlaceACTOnCGRAPass
 namespace mlir {
 namespace taskflow {
 
-std::unique_ptr<Pass> createPlaceACTOnCGRAPass() {
-  return std::make_unique<PlaceACTOnCGRAPass>();
+std::unique_ptr<Pass> createMapCTOnCGRAArrayPass() {
+  return std::make_unique<MapCTOnCGRAArrayPass>();
 }
 
 } // namespace taskflow
diff --git a/test/e2e/bicg/bicg_int_kernel.mlir b/test/e2e/bicg/bicg_int_kernel.mlir
index 32f17705..f9aa4d3d 100644
--- a/test/e2e/bicg/bicg_int_kernel.mlir
+++ b/test/e2e/bicg/bicg_int_kernel.mlir
@@ -11,7 +11,7 @@
 // RUN: mlir-neura-opt %t-kernel.mlir \
 // RUN:   --assign-accelerator \
 // RUN:   --lower-llvm-to-neura \
-// RUN:   --promote-func-arg-to-const \
+// RUN:   --promote-input-arg-to-const \
 // RUN:   --fold-constant \
 // RUN:   --canonicalize-return \
 // RUN:   --canonicalize-live-in \
diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
index 7a8aa882..06d09b92 100644
--- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
+++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
@@ -16,7 +16,7 @@
 // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
 // RUN: --construct-hyperblock-from-task \
 // RUN: --canonicalize-task \
-// RUN: --place-act-on-cgra \
+// RUN: --map-ct-on-cgra-array \
 // RUN: -o %t.placement.mlir
 // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
 
diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
index c878bbec..3f885541 100644
--- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
+++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
@@ -16,7 +16,7 @@
 // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
 // RUN: --construct-hyperblock-from-task \
 // RUN: --canonicalize-task \
-// RUN: --place-act-on-cgra \
+// RUN: --map-ct-on-cgra-array \
 // RUN: -o %t.placement.mlir
 // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
 
diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
index d5c32cf7..8e05d825 100644
--- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
+++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
@@ -16,7 +16,7 @@
 // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
 // RUN: --construct-hyperblock-from-task \
 // RUN: --canonicalize-task \
-// RUN: --place-act-on-cgra \
+// RUN: --map-ct-on-cgra-array \
 // RUN: -o %t.placement.mlir
 // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
 

From 4251f080b1b25c0aa9b52da43e3914437c4e6f12 Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Wed, 4 Feb 2026 05:27:38 +0800
Subject: [PATCH 03/15] Optimize CGRA placement pass: handle memory aliasing
 with original memrefs, iterative 3x3 grid placement, and centroid-based SRAM
 allocation

---
 .../Transforms/MapCTOnCGRAArrayPass.cpp       | 559 ++++++++++++++++++
 .../irregular-loop/irregular-loop.mlir        |  14 +
 .../taskflow/multi-nested/multi-nested.mlir   |  21 +-
 .../parallel-nested/parallel-nested.mlir      |  14 +-
 4 files changed, 606 insertions(+), 2 deletions(-)
 create mode 100644 lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp

diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
new file mode 100644
index 00000000..8c3fd34e
--- /dev/null
+++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
@@ -0,0 +1,559 @@
+//===- MapCTOnCGRAArrayPass.cpp - CT to CGRA Mapping Pass ----------------===//
+//
+// This pass maps Canonical Tasks (CTs) onto a 2D CGRA grid array:
+// 1. Places tasks with SSA dependencies on adjacent CGRAs.
+// 2. Assigns memrefs to SRAMs (each MemRef is assigned to exactly one SRAM,
+//    determined by proximity to the task that first accesses it).
+//
+//===----------------------------------------------------------------------===//
+
+#include "TaskflowDialect/TaskflowDialect.h"
+#include "TaskflowDialect/TaskflowOps.h"
+#include "TaskflowDialect/TaskflowPasses.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+#include <climits>
+#include <cmath>
+#include <string>
+#include <vector>
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// CGRA Grid Position
+//===----------------------------------------------------------------------===//
+/// Represents a position on the 2D CGRA grid.
+struct CGRAPosition {
+  int row;
+  int col;
+
+  bool operator==(const CGRAPosition &other) const {
+    return row == other.row && col == other.col;
+  }
+
+  /// Computes Manhattan distance to another position.
+  int manhattanDistance(const CGRAPosition &other) const {
+    return std::abs(row - other.row) + std::abs(col - other.col);
+  }
+
+  /// Checks if adjacent (Manhattan distance = 1).
+  bool isAdjacent(const CGRAPosition &other) const {
+    return manhattanDistance(other) == 1;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Task Placement Info
+//===----------------------------------------------------------------------===//
+/// Stores placement info for a task: can span multiple combined CGRAs.
+struct TaskPlacement {
+  SmallVector<CGRAPosition> cgra_positions; // CGRAs assigned to this task.
+
+  /// Returns the primary (first) position.
+  CGRAPosition primary() const {
+    return cgra_positions.empty() ? CGRAPosition{-1, -1} : cgra_positions[0];
+  }
+
+  /// Returns the number of CGRAs assigned.
+  size_t cgraCount() const { return cgra_positions.size(); }
+
+  /// Checks if any CGRA in this task is adjacent to any in other task.
+  bool hasAdjacentCGRA(const TaskPlacement &other) const {
+    for (const auto &pos : cgra_positions) {
+      for (const auto &other_pos : other.cgra_positions) {
+        if (pos.isAdjacent(other_pos)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Task-Memory Graph
+//===----------------------------------------------------------------------===//
+
+struct MemoryNode;
+
+/// Represents a Task node in the graph.
+struct TaskNode {
+  size_t id;
+  TaskflowTaskOp op;
+  int alap_level = 0;
+  
+  // Edges (Note: read/write naming refers to taskflow memory_inputs/outputs)
+  SmallVector<MemoryNode *> read_memrefs;  // taskflow.task memory_inputs (readiness triggers)
+  SmallVector<MemoryNode *> write_memrefs; // taskflow.task memory_outputs (produce readiness)
+  SmallVector<TaskNode *> ssa_users;
+  SmallVector<TaskNode *> ssa_operands;
+
+  // Placement result
+  SmallVector<CGRAPosition> placement;
+
+  TaskNode(size_t id, TaskflowTaskOp op) : id(id), op(op) {}
+};
+
+/// Represents a Memory node (MemRef) in the graph.
+struct MemoryNode {
+  Value memref;
+  
+  // Edges
+  SmallVector<TaskNode *> readers;
+  SmallVector<TaskNode *> writers;
+
+  // Mapping result
+  int assigned_sram_id = -1;
+
+  MemoryNode(Value memref) : memref(memref) {}
+};
+
+/// The Task-Memory Dependency Graph.
+class TaskMemoryGraph {
+public:
+  SmallVector<std::unique_ptr<TaskNode>> task_nodes;
+  SmallVector<std::unique_ptr<MemoryNode>> memory_nodes;
+  DenseMap<Value, MemoryNode *> memref_to_node;
+  DenseMap<Operation *, TaskNode *> op_to_node;
+
+  void build(func::FuncOp func) {
+    // 1. Creates TaskNodes.
+    size_t task_id = 0;
+    func.walk([&](TaskflowTaskOp task) {
+      auto node = std::make_unique<TaskNode>(task_id++, task);
+      op_to_node[task] = node.get();
+      task_nodes.push_back(std::move(node));
+    });
+
+    // 2. Creates MemoryNodes using ORIGINAL memrefs (canonical identity).
+    // Uses original_read_memrefs/original_write_memrefs to ensure aliased
+    // memories share the same MemoryNode.
+    for (auto &t_node : task_nodes) {
+      // Uses original_read_memrefs for canonical memory identity.
+      for (Value orig_memref : t_node->op.getOriginalReadMemrefs()) {
+        MemoryNode *m_node = getOrCreateMemoryNode(orig_memref);
+        t_node->read_memrefs.push_back(m_node);
+        m_node->readers.push_back(t_node.get());
+      }
+
+      // Uses original_write_memrefs for canonical memory identity.
+      for (Value orig_memref : t_node->op.getOriginalWriteMemrefs()) {
+        MemoryNode *m_node = getOrCreateMemoryNode(orig_memref);
+        t_node->write_memrefs.push_back(m_node);
+        m_node->writers.push_back(t_node.get());
+      }
+    }
+
+    // 3. Builds SSA Edges (Inter-Task Value Dependencies).
+    // Identifies if a task uses a value produced by another task.
+    for (auto &consumer_node : task_nodes) {
+        // Interates all operands for now to be safe.
+        for (Value operand : consumer_node->op->getOperands()) {
+            if (auto producer_op = operand.getDefiningOp<TaskflowTaskOp>()) {
+                if (auto *producer_node = op_to_node[producer_op]) {
+                    producer_node->ssa_users.push_back(consumer_node.get());
+                    consumer_node->ssa_operands.push_back(producer_node);
+                }
+            }
+        }
+    }
+  }
+
+private:
+  MemoryNode *getOrCreateMemoryNode(Value memref) {
+    if (memref_to_node.count(memref))
+      return memref_to_node[memref];
+    
+    auto node = std::make_unique<MemoryNode>(memref);
+    MemoryNode *ptr = node.get();
+    memref_to_node[memref] = ptr;
+    memory_nodes.push_back(std::move(node));
+    return ptr;
+  }
+};
+
+/// Prints the Task-Memory graph in DOT format for visualization.
+void printGraphDOT(TaskMemoryGraph &graph, llvm::raw_ostream &os) {
+  os << "digraph TaskMemGraph {\n";
+  os << "  rankdir=TB;\n";
+  // Task nodes (circles).
+  for (auto &t : graph.task_nodes) {
+    os << "  T" << t->id << " [shape=circle, label=\"" << t->id << "\"];\n";
+  }
+  // Memory nodes (rectangles).
+  for (size_t i = 0; i < graph.memory_nodes.size(); ++i) {
+    os << "  M" << i << " [shape=box, label=\"mem" << i << "\"];\n";
+  }
+  // Edges: Task -> Memory (write) and Memory -> Task (read).
+  for (auto &t : graph.task_nodes) {
+    for (size_t i = 0; i < graph.memory_nodes.size(); ++i) {
+      MemoryNode *m = graph.memory_nodes[i].get();
+      for (auto *writer : m->writers) {
+        if (writer == t.get()) {
+          os << "  T" << t->id << " -> M" << i << ";\n";
+        }
+      }
+      for (auto *reader : m->readers) {
+        if (reader == t.get()) {
+          os << "  M" << i << " -> T" << t->id << ";\n";
+        }
+      }
+    }
+  }
+  os << "}\n";
+}
+
+
+//===----------------------------------------------------------------------===//
+// CGRA Placer
+//===----------------------------------------------------------------------===//
+/// Places ACTs onto a 2D CGRA grid with memory mapping.
+class CGRAPlacer {
+public:
+  CGRAPlacer(int grid_rows, int grid_cols)
+      : grid_rows_(grid_rows), grid_cols_(grid_cols) {
+    occupied_.resize(grid_rows_);
+    for (auto &row : occupied_) {
+      row.resize(grid_cols_, false);
+    }
+  }
+
+  /// Places all tasks and performs memory mapping.
+  void place(func::FuncOp func) {
+    SmallVector<TaskflowTaskOp> tasks;
+    func.walk([&](TaskflowTaskOp task) { tasks.push_back(task); });
+
+    if (tasks.empty()) {
+      llvm::errs() << "No tasks to place.\n";
+      return;
+    }
+
+
+    // Extracts counter chains and builds dependency graph.
+    // Builds Task-Memory Graph.
+    TaskMemoryGraph graph;
+    graph.build(func);
+
+    // Prints graph visualization to stderr for debugging.
+    // llvm::errs() << "\n=== Task-Memory Graph (DOT format) ===\n";
+    // printGraphDOT(graph, llvm::errs());
+    // llvm::errs() << "=== Graph Stats: " << graph.task_nodes.size() << " tasks, "
+    //              << graph.memory_nodes.size() << " memories ===\n\n";
+
+    if (graph.task_nodes.empty()) {
+      llvm::errs() << "No tasks to place.\n";
+      return;
+    }
+
+    // Computes ALAP Levels on the Task Graph.
+    computeALAP(graph);
+
+    // Sorts tasks by ALAP level (Critical Path First).
+    SmallVector<TaskNode *> sorted_tasks;
+    for (auto &node : graph.task_nodes) sorted_tasks.push_back(node.get());
+    
+    std::stable_sort(sorted_tasks.begin(), sorted_tasks.end(), 
+        [](TaskNode *a, TaskNode *b) {
+            return a->alap_level > b->alap_level; 
+        });
+
+    // Critical path priority placement:
+    // 1. Computes ALAP level for each task (longest path to sink).
+    // 2. Sorts tasks by: (a) ALAP level, (b) criticality, (c) degree.
+    // 3. Places tasks in sorted order with heuristic scoring.
+    // Iterative Refinement Loop (Coordinate Descent).
+    // Alternates between Task Placement (Phase 1) and SRAM Assignment (Phase 2).
+    constexpr int kMaxIterations = 10;
+    
+    llvm::errs() << "\n=== Starting Iterative Placement (Max " << kMaxIterations << ") ===\n";
+
+    for (int iter = 0; iter < kMaxIterations; ++iter) {
+        // Phase 1: Place Tasks (assuming fixed SRAMs).
+        if (iter > 0) resetTaskPlacements(graph);
+
+        for (TaskNode *task_node : sorted_tasks) {
+          int cgra_count = 1;
+          if (auto attr = task_node->op->getAttrOfType<IntegerAttr>("cgra_count")) {
+            cgra_count = attr.getInt();
+          }
+
+          // Finds Best Placement using SRAM positions from previous iter (or -1/default).
+          TaskPlacement placement = findBestPlacement(task_node, cgra_count, graph);
+          
+          // Commits Placement.
+          task_node->placement.push_back(placement.primary());
+          // Handles multi-cgra if needed.
+          for (size_t i = 1; i < placement.cgra_positions.size(); ++i) {
+             task_node->placement.push_back(placement.cgra_positions[i]);
+          }
+
+          // Marks Occupied.
+          for (const auto &pos : placement.cgra_positions) {
+            if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && pos.col < grid_cols_)
+                occupied_[pos.row][pos.col] = true;
+          }
+        }
+
+        // Phase 2: Assign SRAMs (assuming fixed Tasks).
+        bool sram_moved = assignAllSRAMs(graph);
+        
+        llvm::errs() << "Iter " << iter << ": SRAMs moved = " << (sram_moved ? "Yes" : "No") << "\n";
+
+        // Convergence Check.
+        // If SRAMs didn't move, it means task placement based on them likely won't change either.
+        if (iter > 0 && !sram_moved) {
+            llvm::errs() << "Converged at iteration " << iter << ".\n";
+            break; 
+        }
+    }
+
+    // Annotates Result.
+    OpBuilder builder(func.getContext());
+    for (auto &task_node : graph.task_nodes) {
+        if (task_node->placement.empty()) continue;
+        CGRAPosition pos = task_node->placement[0];
+        task_node->op->setAttr("cgra_row", builder.getI32IntegerAttr(pos.row));
+        task_node->op->setAttr("cgra_col", builder.getI32IntegerAttr(pos.col));
+        task_node->op->setAttr("cgra_count", builder.getI32IntegerAttr(task_node->placement.size()));
+    }
+  }
+
+private:
+  /// Clears task placement and occupied grid.
+  void resetTaskPlacements(TaskMemoryGraph &graph) {
+    for (auto &task : graph.task_nodes) {
+        task->placement.clear();
+    }
+    // Clears grid.
+    for (int r = 0; r < grid_rows_; ++r) {
+        std::fill(occupied_[r].begin(), occupied_[r].end(), false);
+    }
+  }
+
+  /// Assigns all memory nodes to SRAMs based on centroid of accessing tasks.
+  /// Returns true if any SRAM assignment changed.
+  bool assignAllSRAMs(TaskMemoryGraph &graph) {
+    bool changed = false;
+    for (auto &mem_node : graph.memory_nodes) {
+      // Computes centroid of all tasks that access this memory.
+      int total_row = 0, total_col = 0, count = 0;
+      for (TaskNode *reader : mem_node->readers) {
+        if (!reader->placement.empty()) {
+          total_row += reader->placement[0].row;
+          total_col += reader->placement[0].col;
+          count++;
+        }
+      }
+      for (TaskNode *writer : mem_node->writers) {
+        if (!writer->placement.empty()) {
+          total_row += writer->placement[0].row;
+          total_col += writer->placement[0].col;
+          count++;
+        }
+      }
+      
+      int new_sram_id = 0;
+      if (count > 0) {
+        // Rounds to the nearest integer.
+        int avg_row = (total_row + count / 2) / count;
+        int avg_col = (total_col + count / 2) / count;
+        new_sram_id = avg_row * 100 + avg_col;
+      } else {
+        new_sram_id = 0; // Default fallback
+      }
+
+      if (mem_node->assigned_sram_id != new_sram_id) {
+        mem_node->assigned_sram_id = new_sram_id;
+        changed = true;
+      }
+    }
+    return changed;
+  }
+
+
+  /// Finds best placement for a task.
+  /// TODO: Currently defaults to single-CGRA placement. Multi-CGRA binding logic
+  /// (cgra_count > 1) is experimental/placeholder and should ideally be handled 
+  /// by an upstream resource binding pass.
+  TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count,
+                                  TaskMemoryGraph &graph) {
+    int best_score = INT_MIN;
+    TaskPlacement best_placement;
+
+    // Baseline: For cgra_count=1, finds single best position.
+    for (int r = 0; r < grid_rows_; ++r) {
+      for (int c = 0; c < grid_cols_; ++c) {
+        if (occupied_[r][c])
+          continue;
+
+        TaskPlacement candidate;
+        candidate.cgra_positions.push_back({r, c});
+
+        int score = computeScore(task_node, candidate, graph);
+        if (score > best_score) {
+          best_score = score;
+          best_placement = candidate;
+        }
+      }
+    }
+
+    // Error handling: No available position found (grid over-subscribed).
+    if (best_placement.cgra_positions.empty()) {
+      llvm::errs() << "Warning: No available CGRA position for task "
+                   << task_node->id << ". Grid is over-subscribed (" << grid_rows_
+                   << "x" << grid_cols_ << " grid with all cells occupied).\n";
+      // Fallback: Assign to position (0,0) with a warning.
+      best_placement.cgra_positions.push_back({0, 0});
+    }
+
+    return best_placement;
+  }
+
+  /// Computes placement score based on Task-Memory Graph.
+  /// TODO: Introduce explicit 'direct_wires' attributes in the IR for
+  /// downstream hardware generators to configure fast bypass paths between
+  /// adjacent PEs with dependencies.
+  ///
+  /// Score = α·SSA_Dist + β·Mem_Dist + γ·Balance
+  ///
+  /// SSA_Dist: Minimize distance to placed SSA predecessors (ssa_operands).
+  /// Mem_Dist: Minimize distance to assigned SRAMs for read/write memrefs.
+  int computeScore(TaskNode *task_node, const TaskPlacement &placement,
+                   TaskMemoryGraph &graph) {
+    // Weight constants (tunable).
+    constexpr int kAlpha = 10;   // SSA proximity weight
+    constexpr int kBeta = 50;    // Memory proximity weight (High priority)
+    constexpr int kGamma = 20;   // Load balance weight
+
+    int ssa_score = 0;
+    int mem_score = 0;
+    int bal_score = 0;
+    
+    CGRAPosition current_pos = placement.primary();
+
+    // 1. SSA Proximity (Predecessors)
+    for (TaskNode *producer : task_node->ssa_operands) {
+        if (!producer->placement.empty()) {
+            int dist = current_pos.manhattanDistance(producer->placement[0]);
+            // Uses negative distance to penalize far-away placements.
+            ssa_score -= dist;
+        }
+    }
+
+    // 2. Memory Proximity
+    // For Read MemRefs
+    for (MemoryNode *mem : task_node->read_memrefs) {
+        if (mem->assigned_sram_id != -1) {
+            // SRAM ID encoding: row*100 + col
+            int sram_r = mem->assigned_sram_id / 100;
+            int sram_c = mem->assigned_sram_id % 100;
+            CGRAPosition sram_pos{sram_r, sram_c};
+            int dist = current_pos.manhattanDistance(sram_pos);
+            mem_score -= dist;
+        }
+    }
+    // For Write MemRefs
+    // If we write to a memory that is already assigned (e.g. read by previous task),
+    // we want to be close to it too.
+    for (MemoryNode *mem : task_node->write_memrefs) {
+         if (mem->assigned_sram_id != -1) {
+            int sram_r = mem->assigned_sram_id / 100;
+            int sram_c = mem->assigned_sram_id % 100;
+            CGRAPosition sram_pos{sram_r, sram_c};
+            int dist = current_pos.manhattanDistance(sram_pos);
+            mem_score -= dist;
+        }
+    }
+
+    // 3. Load Balance
+    // Prefers less crowded rows/cols.
+    int row_count = 0, col_count = 0;
+    for (int c = 0; c < grid_cols_; ++c) { if (occupied_[current_pos.row][c]) row_count++; }
+    for (int r = 0; r < grid_rows_; ++r) { if (occupied_[r][current_pos.col]) col_count++; }
+    bal_score = (grid_cols_ - row_count) + (grid_rows_ - col_count);
+
+    return kAlpha * ssa_score + kBeta * mem_score + kGamma * bal_score;
+  }
+
+  /// Computes ALAP levels considering both SSA and memory dependencies.
+  void computeALAP(TaskMemoryGraph &graph) {
+    // DFS for longest path from node to any sink (ALAP Level).
+    DenseMap<TaskNode*, int> memo;
+    for (auto &node : graph.task_nodes) {
+        node->alap_level = calculateLevel(node.get(), memo);
+    }
+  }
+
+  int calculateLevel(TaskNode *node, DenseMap<TaskNode*, int> &memo) {
+    if (memo.count(node)) return memo[node];
+
+    int max_child_level = 0;
+    for (TaskNode *child : node->ssa_users) {
+        max_child_level = std::max(max_child_level, calculateLevel(child, memo) + 1);
+    }
+
+    // Checks memory dependencies too (Producer -> Mem -> Consumer).
+    for (MemoryNode *mem : node->write_memrefs) {
+        for (TaskNode *reader : mem->readers) {
+            if (reader != node)
+                max_child_level = std::max(max_child_level, calculateLevel(reader, memo) + 1);
+        }
+    }
+
+    return memo[node] = max_child_level;
+  }
+
+
+
+  int grid_rows_;
+  int grid_cols_;
+  std::vector<std::vector<bool>> occupied_;
+};
+
+//===----------------------------------------------------------------------===//
+// Pass Definition
+//===----------------------------------------------------------------------===//
+struct MapCTOnCGRAArrayPass
+    : public PassWrapper<MapCTOnCGRAArrayPass, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapCTOnCGRAArrayPass)
+
+  MapCTOnCGRAArrayPass() = default;
+
+  StringRef getArgument() const override { return "map-ct-on-cgra-array"; }
+
+  StringRef getDescription() const override {
+    return "Maps Canonical Tasks (CTs) onto a 2D CGRA grid with adjacency "
+           "optimization and memory mapping.";
+  }
+
+  void runOnOperation() override {
+    func::FuncOp func = getOperation();
+    constexpr int kDefaultGridRows = 3;
+    constexpr int kDefaultGridCols = 3;
+    CGRAPlacer placer(kDefaultGridRows, kDefaultGridCols);
+    placer.place(func);
+  }
+};
+
+} // namespace
+
+namespace mlir {
+namespace taskflow {
+
+std::unique_ptr<Pass> createMapCTOnCGRAArrayPass() {
+  return std::make_unique<MapCTOnCGRAArrayPass>();
+}
+
+} // namespace taskflow
+} // namespace mlir
diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
index 515322ee..2510b28e 100644
--- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
+++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
@@ -19,6 +19,13 @@
 // RUN: -o %t.hyperblock.mlir
 // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK
 
+// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \
+// RUN: --convert-affine-to-taskflow \
+// RUN: --construct-hyperblock-from-task \
+// RUN: --map-ct-on-cgra-array \
+// RUN: -o %t.placement.mlir
+// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
+
 #set = affine_set<(d0, d1) : (d0 - 3 == 0, d1 - 7 == 0)>
 module attributes {} {
   func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
@@ -301,3 +308,10 @@ module attributes {} {
 // HYPERBLOCK-NEXT:   }
 // HYPERBLOCK-NEXT: }
 
+// PLACEMENT:      taskflow.task @Task_0
+// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT:      taskflow.task @Task_1
+// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT:      taskflow.task @Task_2
+// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
+
diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
index 7de1eb55..9cd26dc6 100644
--- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
+++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
@@ -19,6 +19,13 @@
 // RUN: -o %t.hyperblock.mlir
 // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK
 
+// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \
+// RUN: --convert-affine-to-taskflow \
+// RUN: --construct-hyperblock-from-task \
+// RUN: --map-ct-on-cgra-array \
+// RUN: -o %t.placement.mlir
+// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
+
 module attributes {} {
   func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref<?x8x6xi32>, %arg1: memref<?x8x5xi32>, %arg2: memref<?x8x5xi32>, %arg3: memref<?x7xi32>, %arg4: memref<?x9xi32>, %arg5: memref<?xi32>, %arg6: memref<?xi32>, %arg7: memref<?xi32>, %arg8: memref<?xi32>, %arg9: memref<?xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
     affine.for %arg10 = 0 to 4 {
@@ -366,4 +373,16 @@ module attributes {} {
 // HYPERBLOCK-NEXT:    %0 = affine.load %write_outputs_1[0] : memref<?xi32>
 // HYPERBLOCK-NEXT:    return %0 : i32
 // HYPERBLOCK-NEXT:  }
-// HYPERBLOCK-NEXT:}
\ No newline at end of file
+
+// HYPERBLOCK-NEXT:}
+
+// PLACEMENT:      taskflow.task @Task_0
+// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT:      taskflow.task @Task_1
+// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
+// PLACEMENT:      taskflow.task @Task_2
+// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT:      taskflow.task @Task_3
+// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 2 : i32
+// PLACEMENT:      taskflow.task @Task_4
+// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
\ No newline at end of file
diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
index 7655769e..d43363f6 100644
--- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
+++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
@@ -13,6 +13,13 @@
 // RUN: -o %t.hyperblock.mlir
 // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK
 
+// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \
+// RUN: --convert-affine-to-taskflow \
+// RUN: --construct-hyperblock-from-task \
+// RUN: --map-ct-on-cgra-array \
+// RUN: -o %t.placement.mlir
+// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
+
 module {
   // Example: Parallel nested loops scenario
   // Task 0: Single-level loop (vector scaling)
@@ -120,4 +127,9 @@ module {
 // HYPERBLOCK-NEXT:     }
 // HYPERBLOCK-NEXT:     return
 // HYPERBLOCK-NEXT:   }
-// HYPERBLOCK-NEXT: }
\ No newline at end of file
+// HYPERBLOCK-NEXT: }
+
+// PLACEMENT:      taskflow.task @Task_0
+// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT:      taskflow.task @Task_1
+// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
\ No newline at end of file

From a690d4fb266a1c7b0e558df94e1c4cd05db068fe Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Wed, 4 Feb 2026 05:27:38 +0800
Subject: [PATCH 04/15] Optimize CGRA placement pass: handle memory aliasing
 with original memrefs, iterative 3x3 grid placement, and centroid-based SRAM
 allocation

---
 .../Transforms/MapCTOnCGRAArrayPass.cpp       | 207 ++++++++++++------
 .../irregular-loop/irregular-loop.mlir        |  14 ++
 .../taskflow/multi-nested/multi-nested.mlir   |  21 +-
 .../parallel-nested/parallel-nested.mlir      |  14 +-
 4 files changed, 185 insertions(+), 71 deletions(-)

diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
index 2b478927..8c3fd34e 100644
--- a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
+++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
@@ -136,31 +136,20 @@ class TaskMemoryGraph {
       task_nodes.push_back(std::move(node));
     });
 
-    // 2. Creates MemoryNodes and defines Edges.
+    // 2. Creates MemoryNodes using ORIGINAL memrefs (canonical identity).
+    // Uses original_read_memrefs/original_write_memrefs to ensure aliased
+    // memories share the same MemoryNode.
     for (auto &t_node : task_nodes) {
-      // Memory Inputs (dependency for readiness)
-      for (Value input : t_node->op.getMemoryInputs()) {
-        MemoryNode *m_node = getOrCreateMemoryNode(input);
+      // Uses original_read_memrefs for canonical memory identity.
+      for (Value orig_memref : t_node->op.getOriginalReadMemrefs()) {
+        MemoryNode *m_node = getOrCreateMemoryNode(orig_memref);
         t_node->read_memrefs.push_back(m_node);
         m_node->readers.push_back(t_node.get());
       }
 
-      // Memory Outputs (produced readiness)
-      Operation *terminator = t_node->op.getBody().front().getTerminator();
-      for (Value output : t_node->op.getMemoryOutputs()) {
-        unsigned res_idx = mlir::cast<OpResult>(output).getResultNumber();
-        MemoryNode *m_node = nullptr;
-
-        // Ensures the output corresponds to a yield operand.
-        assert(res_idx < terminator->getNumOperands() && "Invalid yield operand index");
-        
-        Value source = terminator->getOperand(res_idx);
-        // Gets (or create if alloc inside) the node for the source value.
-        m_node = getOrCreateMemoryNode(source);
-        
-        // Maps the Output Result to the SAME node.
-        memref_to_node[output] = m_node;
-
+      // Uses original_write_memrefs for canonical memory identity.
+      for (Value orig_memref : t_node->op.getOriginalWriteMemrefs()) {
+        MemoryNode *m_node = getOrCreateMemoryNode(orig_memref);
         t_node->write_memrefs.push_back(m_node);
         m_node->writers.push_back(t_node.get());
       }
@@ -194,6 +183,37 @@ class TaskMemoryGraph {
   }
 };
 
+/// Prints the Task-Memory graph in DOT format for visualization.
+void printGraphDOT(TaskMemoryGraph &graph, llvm::raw_ostream &os) {
+  os << "digraph TaskMemGraph {\n";
+  os << "  rankdir=TB;\n";
+  // Task nodes (circles).
+  for (auto &t : graph.task_nodes) {
+    os << "  T" << t->id << " [shape=circle, label=\"" << t->id << "\"];\n";
+  }
+  // Memory nodes (rectangles).
+  for (size_t i = 0; i < graph.memory_nodes.size(); ++i) {
+    os << "  M" << i << " [shape=box, label=\"mem" << i << "\"];\n";
+  }
+  // Edges: Task -> Memory (write) and Memory -> Task (read).
+  for (auto &t : graph.task_nodes) {
+    for (size_t i = 0; i < graph.memory_nodes.size(); ++i) {
+      MemoryNode *m = graph.memory_nodes[i].get();
+      for (auto *writer : m->writers) {
+        if (writer == t.get()) {
+          os << "  T" << t->id << " -> M" << i << ";\n";
+        }
+      }
+      for (auto *reader : m->readers) {
+        if (reader == t.get()) {
+          os << "  M" << i << " -> T" << t->id << ";\n";
+        }
+      }
+    }
+  }
+  os << "}\n";
+}
+
 
 //===----------------------------------------------------------------------===//
 // CGRA Placer
@@ -225,6 +245,12 @@ class CGRAPlacer {
     TaskMemoryGraph graph;
     graph.build(func);
 
+    // Prints graph visualization to stderr for debugging.
+    // llvm::errs() << "\n=== Task-Memory Graph (DOT format) ===\n";
+    // printGraphDOT(graph, llvm::errs());
+    // llvm::errs() << "=== Graph Stats: " << graph.task_nodes.size() << " tasks, "
+    //              << graph.memory_nodes.size() << " memories ===\n\n";
+
     if (graph.task_nodes.empty()) {
       llvm::errs() << "No tasks to place.\n";
       return;
@@ -246,44 +272,50 @@ class CGRAPlacer {
     // 1. Computes ALAP level for each task (longest path to sink).
     // 2. Sorts tasks by: (a) ALAP level, (b) criticality, (c) degree.
     // 3. Places tasks in sorted order with heuristic scoring.
-    // Placement Loop.
-    for (TaskNode *task_node : sorted_tasks) {
-      int cgra_count = 1;
-      if (auto attr = task_node->op->getAttrOfType<IntegerAttr>("cgra_count")) {
-        cgra_count = attr.getInt();
-      }
-
-      // Finds Best Placement.
-      // Heuristic: Minimizes distance to:
-      // 1. SSA Producers (that are already placed).
-      // 2. SRAMs of Input MemRefs (if already assigned).
-      TaskPlacement placement = findBestPlacement(task_node, cgra_count, graph);
-      
-      // Commits Placement.
-      task_node->placement.push_back(placement.primary());
-      // Handles multi-cgra if needed.
-      for (size_t i = 1; i < placement.cgra_positions.size(); ++i) {
-         task_node->placement.push_back(placement.cgra_positions[i]);
-      }
+    // Iterative Refinement Loop (Coordinate Descent).
+    // Alternates between Task Placement (Phase 1) and SRAM Assignment (Phase 2).
+    constexpr int kMaxIterations = 10;
+    
+    llvm::errs() << "\n=== Starting Iterative Placement (Max " << kMaxIterations << ") ===\n";
+
+    for (int iter = 0; iter < kMaxIterations; ++iter) {
+        // Phase 1: Place Tasks (assuming fixed SRAMs).
+        if (iter > 0) resetTaskPlacements(graph);
+
+        for (TaskNode *task_node : sorted_tasks) {
+          int cgra_count = 1;
+          if (auto attr = task_node->op->getAttrOfType<IntegerAttr>("cgra_count")) {
+            cgra_count = attr.getInt();
+          }
+
+          // Finds Best Placement using SRAM positions from previous iter (or -1/default).
+          TaskPlacement placement = findBestPlacement(task_node, cgra_count, graph);
+          
+          // Commits Placement.
+          task_node->placement.push_back(placement.primary());
+          // Handles multi-cgra if needed.
+          for (size_t i = 1; i < placement.cgra_positions.size(); ++i) {
+             task_node->placement.push_back(placement.cgra_positions[i]);
+          }
+
+          // Marks Occupied.
+          for (const auto &pos : placement.cgra_positions) {
+            if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && pos.col < grid_cols_)
+                occupied_[pos.row][pos.col] = true;
+          }
+        }
 
-      // Marks Occupied.
-      for (const auto &pos : placement.cgra_positions) {
-        if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && pos.col < grid_cols_)
-            occupied_[pos.row][pos.col] = true;
-      }
+        // Phase 2: Assign SRAMs (assuming fixed Tasks).
+        bool sram_moved = assignAllSRAMs(graph);
+        
+        llvm::errs() << "Iter " << iter << ": SRAMs moved = " << (sram_moved ? "Yes" : "No") << "\n";
 
-      // Maps Associated Memory Nodes.
-      // For each MemRef this task touches, if not yet assigned to SRAM, assign to nearest.
-      for (MemoryNode *mem_node : task_node->read_memrefs) {
-        if (mem_node->assigned_sram_id == -1) {
-            mem_node->assigned_sram_id = assignSRAM(mem_node, placement);
-        }
-      }
-      for (MemoryNode *mem_node : task_node->write_memrefs) {
-        if (mem_node->assigned_sram_id == -1) {
-            mem_node->assigned_sram_id = assignSRAM(mem_node, placement);
+        // Convergence Check.
+        // If SRAMs didn't move, it means task placement based on them likely won't change either.
+        if (iter > 0 && !sram_moved) {
+            llvm::errs() << "Converged at iteration " << iter << ".\n";
+            break; 
         }
-      }
     }
 
     // Annotates Result.
@@ -298,18 +330,55 @@ class CGRAPlacer {
   }
 
 private:
-  /// Assigns a memref to the closest SRAM near the given task position.
-  /// TODO: Integrate with Arch Spec to map logical SRAM IDs (row*100 + col) to
-  /// physical hardware Block IDs, especially for shared or asymmetric SRAMs.
-  int assignSRAM(MemoryNode *mem_node, const TaskPlacement &placement) {
-    if (mem_node->assigned_sram_id != -1)
-      return mem_node->assigned_sram_id;
-
-    // Assigns to a new SRAM near the task's primary CGRA.
-    CGRAPosition pos = placement.primary();
-    int sram_id = pos.row * 100 + pos.col; // Simple encoding: row*100 + col.
-    mem_node->assigned_sram_id = sram_id;
-    return sram_id;
+  /// Clears task placement and occupied grid.
+  void resetTaskPlacements(TaskMemoryGraph &graph) {
+    for (auto &task : graph.task_nodes) {
+        task->placement.clear();
+    }
+    // Clears grid.
+    for (int r = 0; r < grid_rows_; ++r) {
+        std::fill(occupied_[r].begin(), occupied_[r].end(), false);
+    }
+  }
+
+  /// Assigns all memory nodes to SRAMs based on centroid of accessing tasks.
+  /// Returns true if any SRAM assignment changed.
+  bool assignAllSRAMs(TaskMemoryGraph &graph) {
+    bool changed = false;
+    for (auto &mem_node : graph.memory_nodes) {
+      // Computes centroid of all tasks that access this memory.
+      int total_row = 0, total_col = 0, count = 0;
+      for (TaskNode *reader : mem_node->readers) {
+        if (!reader->placement.empty()) {
+          total_row += reader->placement[0].row;
+          total_col += reader->placement[0].col;
+          count++;
+        }
+      }
+      for (TaskNode *writer : mem_node->writers) {
+        if (!writer->placement.empty()) {
+          total_row += writer->placement[0].row;
+          total_col += writer->placement[0].col;
+          count++;
+        }
+      }
+      
+      int new_sram_id = 0;
+      if (count > 0) {
+        // Rounds to the nearest integer.
+        int avg_row = (total_row + count / 2) / count;
+        int avg_col = (total_col + count / 2) / count;
+        new_sram_id = avg_row * 100 + avg_col;
+      } else {
+        new_sram_id = 0; // Default fallback
+      }
+
+      if (mem_node->assigned_sram_id != new_sram_id) {
+        mem_node->assigned_sram_id = new_sram_id;
+        changed = true;
+      }
+    }
+    return changed;
   }
 
 
@@ -470,8 +539,8 @@ struct MapCTOnCGRAArrayPass
 
   void runOnOperation() override {
     func::FuncOp func = getOperation();
-    constexpr int kDefaultGridRows = 4;
-    constexpr int kDefaultGridCols = 4;
+    constexpr int kDefaultGridRows = 3;
+    constexpr int kDefaultGridCols = 3;
     CGRAPlacer placer(kDefaultGridRows, kDefaultGridCols);
     placer.place(func);
   }
diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
index 515322ee..2510b28e 100644
--- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
+++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
@@ -19,6 +19,13 @@
 // RUN: -o %t.hyperblock.mlir
 // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK
 
+// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \
+// RUN: --convert-affine-to-taskflow \
+// RUN: --construct-hyperblock-from-task \
+// RUN: --map-ct-on-cgra-array \
+// RUN: -o %t.placement.mlir
+// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
+
 #set = affine_set<(d0, d1) : (d0 - 3 == 0, d1 - 7 == 0)>
 module attributes {} {
   func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
@@ -301,3 +308,10 @@ module attributes {} {
 // HYPERBLOCK-NEXT:   }
 // HYPERBLOCK-NEXT: }
 
+// PLACEMENT:      taskflow.task @Task_0
+// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT:      taskflow.task @Task_1
+// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT:      taskflow.task @Task_2
+// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
+
diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
index 7de1eb55..9cd26dc6 100644
--- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
+++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
@@ -19,6 +19,13 @@
 // RUN: -o %t.hyperblock.mlir
 // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK
 
+// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \
+// RUN: --convert-affine-to-taskflow \
+// RUN: --construct-hyperblock-from-task \
+// RUN: --map-ct-on-cgra-array \
+// RUN: -o %t.placement.mlir
+// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
+
 module attributes {} {
   func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref<?x8x6xi32>, %arg1: memref<?x8x5xi32>, %arg2: memref<?x8x5xi32>, %arg3: memref<?x7xi32>, %arg4: memref<?x9xi32>, %arg5: memref<?xi32>, %arg6: memref<?xi32>, %arg7: memref<?xi32>, %arg8: memref<?xi32>, %arg9: memref<?xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
     affine.for %arg10 = 0 to 4 {
@@ -366,4 +373,16 @@ module attributes {} {
 // HYPERBLOCK-NEXT:    %0 = affine.load %write_outputs_1[0] : memref<?xi32>
 // HYPERBLOCK-NEXT:    return %0 : i32
 // HYPERBLOCK-NEXT:  }
-// HYPERBLOCK-NEXT:}
\ No newline at end of file
+
+// HYPERBLOCK-NEXT:}
+
+// PLACEMENT:      taskflow.task @Task_0
+// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT:      taskflow.task @Task_1
+// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
+// PLACEMENT:      taskflow.task @Task_2
+// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT:      taskflow.task @Task_3
+// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 2 : i32
+// PLACEMENT:      taskflow.task @Task_4
+// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
\ No newline at end of file
diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
index 7655769e..d43363f6 100644
--- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
+++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
@@ -13,6 +13,13 @@
 // RUN: -o %t.hyperblock.mlir
 // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK
 
+// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \
+// RUN: --convert-affine-to-taskflow \
+// RUN: --construct-hyperblock-from-task \
+// RUN: --map-ct-on-cgra-array \
+// RUN: -o %t.placement.mlir
+// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
+
 module {
   // Example: Parallel nested loops scenario
   // Task 0: Single-level loop (vector scaling)
@@ -120,4 +127,9 @@ module {
 // HYPERBLOCK-NEXT:     }
 // HYPERBLOCK-NEXT:     return
 // HYPERBLOCK-NEXT:   }
-// HYPERBLOCK-NEXT: }
\ No newline at end of file
+// HYPERBLOCK-NEXT: }
+
+// PLACEMENT:      taskflow.task @Task_0
+// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT:      taskflow.task @Task_1
+// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
\ No newline at end of file

From ca8fdf1d28f83989394d88dc5bbdd99f61d7a0ca Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Wed, 4 Feb 2026 05:40:59 +0800
Subject: [PATCH 05/15] chore: update dot visualization labels with coordinates
 and comment out debug logs

---
 .../Transforms/MapCTOnCGRAArrayPass.cpp       | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
index 8c3fd34e..fbabdb44 100644
--- a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
+++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
@@ -189,11 +189,22 @@ void printGraphDOT(TaskMemoryGraph &graph, llvm::raw_ostream &os) {
   os << "  rankdir=TB;\n";
   // Task nodes (circles).
   for (auto &t : graph.task_nodes) {
-    os << "  T" << t->id << " [shape=circle, label=\"" << t->id << "\"];\n";
+    std::string pos_str = "";
+    if (!t->placement.empty()) {
+        pos_str = "\\n(" + std::to_string(t->placement[0].row) + "," + std::to_string(t->placement[0].col) + ")";
+    }
+    os << "  T" << t->id << " [shape=circle, label=\"Task" << t->id << pos_str << "\"];\n";
   }
   // Memory nodes (rectangles).
   for (size_t i = 0; i < graph.memory_nodes.size(); ++i) {
-    os << "  M" << i << " [shape=box, label=\"mem" << i << "\"];\n";
+    MemoryNode *m = graph.memory_nodes[i].get();
+    std::string sram_str = "";
+    if (m->assigned_sram_id != -1) {
+        int r = m->assigned_sram_id / 100;
+        int c = m->assigned_sram_id % 100;
+        sram_str = "\\nSRAM(" + std::to_string(r) + "," + std::to_string(c) + ")";
+    }
+    os << "  M" << i << " [shape=box, label=\"Mem" << i << sram_str << "\"];\n";
   }
   // Edges: Task -> Memory (write) and Memory -> Task (read).
   for (auto &t : graph.task_nodes) {
@@ -318,6 +329,11 @@ class CGRAPlacer {
         }
     }
 
+    // Prints final graph visualization to stderr.
+    // llvm::errs() << "\n=== Final Task-Memory Mapping (DOT format) ===\n";
+    // printGraphDOT(graph, llvm::errs());
+    // llvm::errs() << "===============================================\n\n";
+
     // Annotates Result.
     OpBuilder builder(func.getContext());
     for (auto &task_node : graph.task_nodes) {

From 82d26b029760778a66d126f503ffcc47da1cf0d5 Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Wed, 4 Feb 2026 05:46:52 +0800
Subject: [PATCH 06/15] clean: remove all visualization and debug log code from
 MapCTOnCGRAArrayPass

---
 .../Transforms/MapCTOnCGRAArrayPass.cpp       | 58 ++-----------------
 1 file changed, 5 insertions(+), 53 deletions(-)

diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
index fbabdb44..c56613af 100644
--- a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
+++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
@@ -183,47 +183,6 @@ class TaskMemoryGraph {
   }
 };
 
-/// Prints the Task-Memory graph in DOT format for visualization.
-void printGraphDOT(TaskMemoryGraph &graph, llvm::raw_ostream &os) {
-  os << "digraph TaskMemGraph {\n";
-  os << "  rankdir=TB;\n";
-  // Task nodes (circles).
-  for (auto &t : graph.task_nodes) {
-    std::string pos_str = "";
-    if (!t->placement.empty()) {
-        pos_str = "\\n(" + std::to_string(t->placement[0].row) + "," + std::to_string(t->placement[0].col) + ")";
-    }
-    os << "  T" << t->id << " [shape=circle, label=\"Task" << t->id << pos_str << "\"];\n";
-  }
-  // Memory nodes (rectangles).
-  for (size_t i = 0; i < graph.memory_nodes.size(); ++i) {
-    MemoryNode *m = graph.memory_nodes[i].get();
-    std::string sram_str = "";
-    if (m->assigned_sram_id != -1) {
-        int r = m->assigned_sram_id / 100;
-        int c = m->assigned_sram_id % 100;
-        sram_str = "\\nSRAM(" + std::to_string(r) + "," + std::to_string(c) + ")";
-    }
-    os << "  M" << i << " [shape=box, label=\"Mem" << i << sram_str << "\"];\n";
-  }
-  // Edges: Task -> Memory (write) and Memory -> Task (read).
-  for (auto &t : graph.task_nodes) {
-    for (size_t i = 0; i < graph.memory_nodes.size(); ++i) {
-      MemoryNode *m = graph.memory_nodes[i].get();
-      for (auto *writer : m->writers) {
-        if (writer == t.get()) {
-          os << "  T" << t->id << " -> M" << i << ";\n";
-        }
-      }
-      for (auto *reader : m->readers) {
-        if (reader == t.get()) {
-          os << "  M" << i << " -> T" << t->id << ";\n";
-        }
-      }
-    }
-  }
-  os << "}\n";
-}
 
 
 //===----------------------------------------------------------------------===//
@@ -256,11 +215,7 @@ class CGRAPlacer {
     TaskMemoryGraph graph;
     graph.build(func);
 
-    // Prints graph visualization to stderr for debugging.
-    // llvm::errs() << "\n=== Task-Memory Graph (DOT format) ===\n";
-    // printGraphDOT(graph, llvm::errs());
-    // llvm::errs() << "=== Graph Stats: " << graph.task_nodes.size() << " tasks, "
-    //              << graph.memory_nodes.size() << " memories ===\n\n";
+
 
     if (graph.task_nodes.empty()) {
       llvm::errs() << "No tasks to place.\n";
@@ -287,7 +242,7 @@ class CGRAPlacer {
     // Alternates between Task Placement (Phase 1) and SRAM Assignment (Phase 2).
     constexpr int kMaxIterations = 10;
     
-    llvm::errs() << "\n=== Starting Iterative Placement (Max " << kMaxIterations << ") ===\n";
+
 
     for (int iter = 0; iter < kMaxIterations; ++iter) {
         // Phase 1: Place Tasks (assuming fixed SRAMs).
@@ -319,20 +274,17 @@ class CGRAPlacer {
         // Phase 2: Assign SRAMs (assuming fixed Tasks).
         bool sram_moved = assignAllSRAMs(graph);
         
-        llvm::errs() << "Iter " << iter << ": SRAMs moved = " << (sram_moved ? "Yes" : "No") << "\n";
+
 
         // Convergence Check.
         // If SRAMs didn't move, it means task placement based on them likely won't change either.
         if (iter > 0 && !sram_moved) {
-            llvm::errs() << "Converged at iteration " << iter << ".\n";
+
             break; 
         }
     }
 
-    // Prints final graph visualization to stderr.
-    // llvm::errs() << "\n=== Final Task-Memory Mapping (DOT format) ===\n";
-    // printGraphDOT(graph, llvm::errs());
-    // llvm::errs() << "===============================================\n\n";
+
 
     // Annotates Result.
     OpBuilder builder(func.getContext());

From 16310bdec898a2848e174b8e40b5ef33b87ef49d Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Wed, 4 Feb 2026 05:48:51 +0800
Subject: [PATCH 07/15] test: fix placement coordinates in irregular-loop.mlir

---
 test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
index 2510b28e..3550e14d 100644
--- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
+++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
@@ -311,7 +311,7 @@ module attributes {} {
 // PLACEMENT:      taskflow.task @Task_0
 // PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
 // PLACEMENT:      taskflow.task @Task_1
-// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
-// PLACEMENT:      taskflow.task @Task_2
 // PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
+// PLACEMENT:      taskflow.task @Task_2
+// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
 

From 8b59105b32dda5be1238ca5b07bf0f082c4358ec Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Thu, 5 Feb 2026 01:21:08 +0800
Subject: [PATCH 08/15] refactor(cgra-placement): resolve review comments

- Fix SRAM ID encoding for large grids\n- Add successors to SSA proximity score\n- Remove load balance metric\n- Improve error handling with assert\n- Fix comments and variable naming\n- Ensure SSA dependencies use value inputs only\n- Update tests for new placement logic
---
 .../Transforms/MapCTOnCGRAArrayPass.cpp       | 75 +++++++++----------
 .../irregular-loop/irregular-loop.mlir        |  2 +-
 .../taskflow/multi-nested/multi-nested.mlir   |  8 +-
 .../parallel-nested/parallel-nested.mlir      |  2 +-
 4 files changed, 43 insertions(+), 44 deletions(-)

diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
index c56613af..1465d086 100644
--- a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
+++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
@@ -93,9 +93,9 @@ struct TaskNode {
   TaskflowTaskOp op;
   int alap_level = 0;
   
-  // Edges (Note: read/write naming refers to taskflow memory_inputs/outputs)
-  SmallVector<MemoryNode *> read_memrefs;  // taskflow.task memory_inputs (readiness triggers)
-  SmallVector<MemoryNode *> write_memrefs; // taskflow.task memory_outputs (produce readiness)
+  // Edges based on original memory access.
+  SmallVector<MemoryNode *> read_memrefs;  // original_read_memrefs
+  SmallVector<MemoryNode *> write_memrefs; // original_write_memrefs
   SmallVector<TaskNode *> ssa_users;
   SmallVector<TaskNode *> ssa_operands;
 
@@ -159,7 +159,7 @@ class TaskMemoryGraph {
     // Identifies if a task uses a value produced by another task.
     for (auto &consumer_node : task_nodes) {
         // Interates all operands for now to be safe.
-        for (Value operand : consumer_node->op->getOperands()) {
+        for (Value operand : consumer_node->op.getValueInputs()) {
             if (auto producer_op = operand.getDefiningOp<TaskflowTaskOp>()) {
                 if (auto *producer_node = op_to_node[producer_op]) {
                     producer_node->ssa_users.push_back(consumer_node.get());
@@ -172,8 +172,9 @@ class TaskMemoryGraph {
 
 private:
   MemoryNode *getOrCreateMemoryNode(Value memref) {
-    if (memref_to_node.count(memref))
+    if (memref_to_node.count(memref)){
       return memref_to_node[memref];
+    }
     
     auto node = std::make_unique<MemoryNode>(memref);
     MemoryNode *ptr = node.get();
@@ -186,9 +187,10 @@ class TaskMemoryGraph {
 
 
 //===----------------------------------------------------------------------===//
-// CGRA Placer
+// Task Mapper
 //===----------------------------------------------------------------------===//
-/// Places ACTs onto a 2D CGRA grid with memory mapping.
+/// Maps a task-memory graph onto a 2D CGRA grid.
+
 class CGRAPlacer {
 public:
   CGRAPlacer(int grid_rows, int grid_cols)
@@ -210,7 +212,6 @@ class CGRAPlacer {
     }
 
 
-    // Extracts counter chains and builds dependency graph.
     // Builds Task-Memory Graph.
     TaskMemoryGraph graph;
     graph.build(func);
@@ -223,6 +224,9 @@ class CGRAPlacer {
     }
 
     // Computes ALAP Levels on the Task Graph.
+    // We use ALAP (As Late As Possible) to identify the critical path and
+    // prioritize tasks that are closer to the sink, which helps in minimizing
+    // the overall dependency latency during placement.
     computeALAP(graph);
 
     // Sorts tasks by ALAP level (Critical Path First).
@@ -336,7 +340,8 @@ class CGRAPlacer {
         // Rounds to the nearest integer.
         int avg_row = (total_row + count / 2) / count;
         int avg_col = (total_col + count / 2) / count;
-        new_sram_id = avg_row * 100 + avg_col;
+        // SRAM ID encoding: (row << 16) | col
+        new_sram_id = (avg_row << 16) | (avg_col & 0xFFFF);
       } else {
         new_sram_id = 0; // Default fallback
       }
@@ -378,11 +383,7 @@ class CGRAPlacer {
 
     // Error handling: No available position found (grid over-subscribed).
     if (best_placement.cgra_positions.empty()) {
-      llvm::errs() << "Warning: No available CGRA position for task "
-                   << task_node->id << ". Grid is over-subscribed (" << grid_rows_
-                   << "x" << grid_cols_ << " grid with all cells occupied).\n";
-      // Fallback: Assign to position (0,0) with a warning.
-      best_placement.cgra_positions.push_back({0, 0});
+      assert(false && "No available CGRA position found (grid over-subscribed).");
     }
 
     return best_placement;
@@ -393,7 +394,7 @@ class CGRAPlacer {
   /// downstream hardware generators to configure fast bypass paths between
   /// adjacent PEs with dependencies.
   ///
-  /// Score = α·SSA_Dist + β·Mem_Dist + γ·Balance
+  /// Score = α·SSA_Dist + β·Mem_Dist
   ///
   /// SSA_Dist: Minimize distance to placed SSA predecessors (ssa_operands).
   /// Mem_Dist: Minimize distance to assigned SRAMs for read/write memrefs.
@@ -402,15 +403,13 @@ class CGRAPlacer {
     // Weight constants (tunable).
     constexpr int kAlpha = 10;   // SSA proximity weight
     constexpr int kBeta = 50;    // Memory proximity weight (High priority)
-    constexpr int kGamma = 20;   // Load balance weight
 
     int ssa_score = 0;
     int mem_score = 0;
-    int bal_score = 0;
     
     CGRAPosition current_pos = placement.primary();
 
-    // 1. SSA Proximity (Predecessors)
+    // 1. SSA Proximity (Predecessors & Successors)
     for (TaskNode *producer : task_node->ssa_operands) {
         if (!producer->placement.empty()) {
             int dist = current_pos.manhattanDistance(producer->placement[0]);
@@ -418,14 +417,20 @@ class CGRAPlacer {
             ssa_score -= dist;
         }
     }
+    for (TaskNode *consumer : task_node->ssa_users) {
+        if (!consumer->placement.empty()) {
+            int dist = current_pos.manhattanDistance(consumer->placement[0]);
+            ssa_score -= dist;
+        }
+    }
 
     // 2. Memory Proximity
     // For Read MemRefs
     for (MemoryNode *mem : task_node->read_memrefs) {
         if (mem->assigned_sram_id != -1) {
-            // SRAM ID encoding: row*100 + col
-            int sram_r = mem->assigned_sram_id / 100;
-            int sram_c = mem->assigned_sram_id % 100;
+            // SRAM ID encoding: (row << 16) | col
+            int sram_r = mem->assigned_sram_id >> 16;
+            int sram_c = mem->assigned_sram_id & 0xFFFF;
             CGRAPosition sram_pos{sram_r, sram_c};
             int dist = current_pos.manhattanDistance(sram_pos);
             mem_score -= dist;
@@ -436,50 +441,44 @@ class CGRAPlacer {
     // we want to be close to it too.
     for (MemoryNode *mem : task_node->write_memrefs) {
          if (mem->assigned_sram_id != -1) {
-            int sram_r = mem->assigned_sram_id / 100;
-            int sram_c = mem->assigned_sram_id % 100;
+            // SRAM ID encoding: (row << 16) | col
+            int sram_r = mem->assigned_sram_id >> 16;
+            int sram_c = mem->assigned_sram_id & 0xFFFF;
             CGRAPosition sram_pos{sram_r, sram_c};
             int dist = current_pos.manhattanDistance(sram_pos);
             mem_score -= dist;
         }
     }
 
-    // 3. Load Balance
-    // Prefers less crowded rows/cols.
-    int row_count = 0, col_count = 0;
-    for (int c = 0; c < grid_cols_; ++c) { if (occupied_[current_pos.row][c]) row_count++; }
-    for (int r = 0; r < grid_rows_; ++r) { if (occupied_[r][current_pos.col]) col_count++; }
-    bal_score = (grid_cols_ - row_count) + (grid_rows_ - col_count);
-
-    return kAlpha * ssa_score + kBeta * mem_score + kGamma * bal_score;
+    return kAlpha * ssa_score + kBeta * mem_score;
   }
 
   /// Computes ALAP levels considering both SSA and memory dependencies.
   void computeALAP(TaskMemoryGraph &graph) {
     // DFS for longest path from node to any sink (ALAP Level).
-    DenseMap<TaskNode*, int> memo;
+    DenseMap<TaskNode*, int> node_alap_cache;
     for (auto &node : graph.task_nodes) {
-        node->alap_level = calculateLevel(node.get(), memo);
+        node->alap_level = calculateLevel(node.get(), node_alap_cache);
     }
   }
 
-  int calculateLevel(TaskNode *node, DenseMap<TaskNode*, int> &memo) {
-    if (memo.count(node)) return memo[node];
+  int calculateLevel(TaskNode *node, DenseMap<TaskNode*, int> &node_alap_cache) {
+    if (node_alap_cache.count(node)) return node_alap_cache[node];
 
     int max_child_level = 0;
     for (TaskNode *child : node->ssa_users) {
-        max_child_level = std::max(max_child_level, calculateLevel(child, memo) + 1);
+        max_child_level = std::max(max_child_level, calculateLevel(child, node_alap_cache) + 1);
     }
 
     // Checks memory dependencies too (Producer -> Mem -> Consumer).
     for (MemoryNode *mem : node->write_memrefs) {
         for (TaskNode *reader : mem->readers) {
             if (reader != node)
-                max_child_level = std::max(max_child_level, calculateLevel(reader, memo) + 1);
+                max_child_level = std::max(max_child_level, calculateLevel(reader, node_alap_cache) + 1);
         }
     }
 
-    return memo[node] = max_child_level;
+    return node_alap_cache[node] = max_child_level;
   }
 
 
diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
index 3550e14d..b6c57dd1 100644
--- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
+++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
@@ -313,5 +313,5 @@ module attributes {} {
 // PLACEMENT:      taskflow.task @Task_1
 // PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
 // PLACEMENT:      taskflow.task @Task_2
-// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
 
diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
index 9cd26dc6..43164d50 100644
--- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
+++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
@@ -379,10 +379,10 @@ module attributes {} {
 // PLACEMENT:      taskflow.task @Task_0
 // PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
 // PLACEMENT:      taskflow.task @Task_1
-// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
-// PLACEMENT:      taskflow.task @Task_2
 // PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT:      taskflow.task @Task_2
+// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
 // PLACEMENT:      taskflow.task @Task_3
-// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 2 : i32
+// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
 // PLACEMENT:      taskflow.task @Task_4
-// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
\ No newline at end of file
+// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
\ No newline at end of file
diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
index d43363f6..0950619a 100644
--- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
+++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
@@ -132,4 +132,4 @@ module {
 // PLACEMENT:      taskflow.task @Task_0
 // PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
 // PLACEMENT:      taskflow.task @Task_1
-// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
\ No newline at end of file
+// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
\ No newline at end of file

From 846ea2d68d0e6ace66aa7e90028efbd2f2e6722e Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Thu, 5 Feb 2026 01:30:29 +0800
Subject: [PATCH 09/15] feat(cgra-placement): wrap mapping info in attribute
 and update tests

---
 .../Transforms/MapCTOnCGRAArrayPass.cpp       | 46 +++++++++++++++++--
 relu_branch.mlir                              |  3 --
 relu_main.mlir                                |  3 --
 .../irregular-loop/irregular-loop.mlir        |  6 +--
 .../taskflow/multi-nested/multi-nested.mlir   | 10 ++--
 .../parallel-nested/parallel-nested.mlir      |  4 +-
 6 files changed, 51 insertions(+), 21 deletions(-)
 delete mode 100644 relu_branch.mlir
 delete mode 100644 relu_main.mlir

diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
index 1465d086..e1783d1d 100644
--- a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
+++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
@@ -263,7 +263,8 @@ class CGRAPlacer {
           
           // Commits Placement.
           task_node->placement.push_back(placement.primary());
-          // Handles multi-cgra if needed.
+          // Handles mapping one task on multi-CGRAs.
+          // TODO: Introduce explicit multi-CGRA binding logic.
           for (size_t i = 1; i < placement.cgra_positions.size(); ++i) {
              task_node->placement.push_back(placement.cgra_positions[i]);
           }
@@ -294,10 +295,45 @@ class CGRAPlacer {
     OpBuilder builder(func.getContext());
     for (auto &task_node : graph.task_nodes) {
         if (task_node->placement.empty()) continue;
-        CGRAPosition pos = task_node->placement[0];
-        task_node->op->setAttr("cgra_row", builder.getI32IntegerAttr(pos.row));
-        task_node->op->setAttr("cgra_col", builder.getI32IntegerAttr(pos.col));
-        task_node->op->setAttr("cgra_count", builder.getI32IntegerAttr(task_node->placement.size()));
+        
+        SmallVector<NamedAttribute, 4> mapping_attrs;
+
+        // 1. CGRA Positions
+        SmallVector<Attribute> pos_attrs;
+        for (const auto &pos : task_node->placement) {
+            SmallVector<NamedAttribute, 2> coord_attrs;
+            coord_attrs.push_back(NamedAttribute(
+                StringAttr::get(func.getContext(), "row"),
+                builder.getI32IntegerAttr(pos.row)));
+            coord_attrs.push_back(NamedAttribute(
+                StringAttr::get(func.getContext(), "col"),
+                builder.getI32IntegerAttr(pos.col)));
+            pos_attrs.push_back(DictionaryAttr::get(func.getContext(), coord_attrs));
+        }
+        mapping_attrs.push_back(NamedAttribute(
+            StringAttr::get(func.getContext(), "cgra_positions"),
+            builder.getArrayAttr(pos_attrs)));
+
+        // 2. Read SRAM IDs
+        SmallVector<Attribute> read_sram_attrs;
+        for (MemoryNode *mem : task_node->read_memrefs) {
+            read_sram_attrs.push_back(builder.getI32IntegerAttr(mem->assigned_sram_id));
+        }
+        mapping_attrs.push_back(NamedAttribute(
+            StringAttr::get(func.getContext(), "read_sram_ids"),
+            builder.getArrayAttr(read_sram_attrs)));
+
+        // 3. Write SRAM IDs
+        SmallVector<Attribute> write_sram_attrs;
+        for (MemoryNode *mem : task_node->write_memrefs) {
+            write_sram_attrs.push_back(builder.getI32IntegerAttr(mem->assigned_sram_id));
+        }
+        mapping_attrs.push_back(NamedAttribute(
+            StringAttr::get(func.getContext(), "write_sram_ids"),
+            builder.getArrayAttr(write_sram_attrs)));
+
+        // Set Attribute
+        task_node->op->setAttr("mapping_info", DictionaryAttr::get(func.getContext(), mapping_attrs));
     }
   }
 
diff --git a/relu_branch.mlir b/relu_branch.mlir
deleted file mode 100644
index 0a1e5150..00000000
--- a/relu_branch.mlir
+++ /dev/null
@@ -1,3 +0,0 @@
-module {
-}
-
diff --git a/relu_main.mlir b/relu_main.mlir
deleted file mode 100644
index 0a1e5150..00000000
--- a/relu_main.mlir
+++ /dev/null
@@ -1,3 +0,0 @@
-module {
-}
-
diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
index b6c57dd1..e5cca889 100644
--- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
+++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
@@ -309,9 +309,9 @@ module attributes {} {
 // HYPERBLOCK-NEXT: }
 
 // PLACEMENT:      taskflow.task @Task_0
-// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_ids = [], write_sram_ids = []}
 // PLACEMENT:      taskflow.task @Task_1
-// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
+// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_ids = [], write_sram_ids = [65537 : i32]}
 // PLACEMENT:      taskflow.task @Task_2
-// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
+// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_ids = [65537 : i32], write_sram_ids = [65536 : i32]}
 
diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
index 43164d50..2d158b5d 100644
--- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
+++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
@@ -377,12 +377,12 @@ module attributes {} {
 // HYPERBLOCK-NEXT:}
 
 // PLACEMENT:      taskflow.task @Task_0
-// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_ids = [0 : i32], write_sram_ids = [65536 : i32]}
 // PLACEMENT:      taskflow.task @Task_1
-// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_ids = [1 : i32, 1 : i32], write_sram_ids = [65537 : i32]}
 // PLACEMENT:      taskflow.task @Task_2
-// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
+// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_ids = [65536 : i32, 65537 : i32, 65536 : i32], write_sram_ids = [65536 : i32]}
 // PLACEMENT:      taskflow.task @Task_3
-// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_ids = [2 : i32], write_sram_ids = [65538 : i32]}
 // PLACEMENT:      taskflow.task @Task_4
-// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32
\ No newline at end of file
+// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_ids = [65537 : i32, 65538 : i32], write_sram_ids = [65537 : i32]}
\ No newline at end of file
diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
index 0950619a..ec98f6a1 100644
--- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
+++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
@@ -130,6 +130,6 @@ module {
 // HYPERBLOCK-NEXT: }
 
 // PLACEMENT:      taskflow.task @Task_0
-// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
+// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_ids = [0 : i32], write_sram_ids = [0 : i32]}
 // PLACEMENT:      taskflow.task @Task_1
-// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32
\ No newline at end of file
+// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_ids = [1 : i32, 1 : i32], write_sram_ids = [1 : i32]}
\ No newline at end of file

From 1ae4b13be4de50a2e7581150e57cb207c7cff7a2 Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Fri, 6 Feb 2026 02:13:52 +0800
Subject: [PATCH 10/15] refactor(cgra-placement): update task mapping info for
 better readability and structure

- Rename CGRAPlacer to TaskMapper\n- Rename mapping_info to task_mapping_info\n- Use readable SRAM locations {row, col} in task_mapping_info\n- Provide full task lines in test cases
---
 .../Transforms/MapCTOnCGRAArrayPass.cpp       | 40 ++++++++++++++-----
 .../irregular-loop/irregular-loop.mlir        |  6 +--
 .../taskflow/multi-nested/multi-nested.mlir   | 10 ++---
 .../parallel-nested/parallel-nested.mlir      |  4 +-
 4 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
index e1783d1d..44fb41e6 100644
--- a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
+++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
@@ -191,9 +191,9 @@ class TaskMemoryGraph {
 //===----------------------------------------------------------------------===//
 /// Maps a task-memory graph onto a 2D CGRA grid.
 
-class CGRAPlacer {
+class TaskMapper {
 public:
-  CGRAPlacer(int grid_rows, int grid_cols)
+  TaskMapper(int grid_rows, int grid_cols)
       : grid_rows_(grid_rows), grid_cols_(grid_cols) {
     occupied_.resize(grid_rows_);
     for (auto &row : occupied_) {
@@ -314,26 +314,44 @@ class CGRAPlacer {
             StringAttr::get(func.getContext(), "cgra_positions"),
             builder.getArrayAttr(pos_attrs)));
 
-        // 2. Read SRAM IDs
+        // 2. Read SRAM Locations
         SmallVector<Attribute> read_sram_attrs;
         for (MemoryNode *mem : task_node->read_memrefs) {
-            read_sram_attrs.push_back(builder.getI32IntegerAttr(mem->assigned_sram_id));
+            int row = mem->assigned_sram_id >> 16;
+            int col = mem->assigned_sram_id & 0xFFFF;
+            SmallVector<NamedAttribute, 2> coord_attrs;
+            coord_attrs.push_back(NamedAttribute(
+                StringAttr::get(func.getContext(), "row"),
+                builder.getI32IntegerAttr(row)));
+            coord_attrs.push_back(NamedAttribute(
+                StringAttr::get(func.getContext(), "col"),
+                builder.getI32IntegerAttr(col)));
+            read_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), coord_attrs));
         }
         mapping_attrs.push_back(NamedAttribute(
-            StringAttr::get(func.getContext(), "read_sram_ids"),
+            StringAttr::get(func.getContext(), "read_sram_locs"),
             builder.getArrayAttr(read_sram_attrs)));
 
-        // 3. Write SRAM IDs
+        // 3. Write SRAM Locations
         SmallVector<Attribute> write_sram_attrs;
         for (MemoryNode *mem : task_node->write_memrefs) {
-            write_sram_attrs.push_back(builder.getI32IntegerAttr(mem->assigned_sram_id));
+            int row = mem->assigned_sram_id >> 16;
+            int col = mem->assigned_sram_id & 0xFFFF;
+            SmallVector<NamedAttribute, 2> coord_attrs;
+            coord_attrs.push_back(NamedAttribute(
+                StringAttr::get(func.getContext(), "row"),
+                builder.getI32IntegerAttr(row)));
+            coord_attrs.push_back(NamedAttribute(
+                StringAttr::get(func.getContext(), "col"),
+                builder.getI32IntegerAttr(col)));
+            write_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), coord_attrs));
         }
         mapping_attrs.push_back(NamedAttribute(
-            StringAttr::get(func.getContext(), "write_sram_ids"),
+            StringAttr::get(func.getContext(), "write_sram_locs"),
             builder.getArrayAttr(write_sram_attrs)));
 
         // Set Attribute
-        task_node->op->setAttr("mapping_info", DictionaryAttr::get(func.getContext(), mapping_attrs));
+        task_node->op->setAttr("task_mapping_info", DictionaryAttr::get(func.getContext(), mapping_attrs));
     }
   }
 
@@ -544,8 +562,8 @@ struct MapCTOnCGRAArrayPass
     func::FuncOp func = getOperation();
     constexpr int kDefaultGridRows = 3;
     constexpr int kDefaultGridCols = 3;
-    CGRAPlacer placer(kDefaultGridRows, kDefaultGridCols);
-    placer.place(func);
+    TaskMapper mapper(kDefaultGridRows, kDefaultGridCols);
+    mapper.place(func);
   }
 };
 
diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
index e5cca889..2d614b98 100644
--- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
+++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
@@ -309,9 +309,9 @@ module attributes {} {
 // HYPERBLOCK-NEXT: }
 
 // PLACEMENT:      taskflow.task @Task_0
-// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_ids = [], write_sram_ids = []}
+// PLACEMENT-SAME: value_inputs(%c0_i32 : i32) {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locs = [], write_sram_locs = []}}
 // PLACEMENT:      taskflow.task @Task_1
-// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_ids = [], write_sram_ids = [65537 : i32]}
+// PLACEMENT-SAME: write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0 : memref<4x8xi32>)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locs = [], write_sram_locs = [{col = 1 : i32, row = 1 : i32}]}}
 // PLACEMENT:      taskflow.task @Task_2
-// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_ids = [65537 : i32], write_sram_ids = [65536 : i32]}
+// PLACEMENT-SAME: read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref<i32>) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref<i32>)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locs = [{col = 1 : i32, row = 1 : i32}], write_sram_locs = [{col = 0 : i32, row = 1 : i32}]}}
 
diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
index 2d158b5d..ce3d4071 100644
--- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
+++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
@@ -377,12 +377,12 @@ module attributes {} {
 // HYPERBLOCK-NEXT:}
 
 // PLACEMENT:      taskflow.task @Task_0
-// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_ids = [0 : i32], write_sram_ids = [65536 : i32]}
+// PLACEMENT-SAME: read_memrefs(%arg0 : memref<?x8x6xi32>) write_memrefs(%arg5 : memref<?xi32>) [original_read_memrefs(%arg0 : memref<?x8x6xi32>), original_write_memrefs(%arg5 : memref<?xi32>)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locs = [{col = 0 : i32, row = 0 : i32}], write_sram_locs = [{col = 0 : i32, row = 1 : i32}]}}
 // PLACEMENT:      taskflow.task @Task_1
-// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_ids = [1 : i32, 1 : i32], write_sram_ids = [65537 : i32]}
+// PLACEMENT-SAME: read_memrefs(%arg1, %arg2 : memref<?x8x5xi32>, memref<?x8x5xi32>) write_memrefs(%arg6 : memref<?xi32>) [original_read_memrefs(%arg1, %arg2 : memref<?x8x5xi32>, memref<?x8x5xi32>), original_write_memrefs(%arg6 : memref<?xi32>)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locs = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locs = [{col = 1 : i32, row = 1 : i32}]}}
 // PLACEMENT:      taskflow.task @Task_2
-// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_ids = [65536 : i32, 65537 : i32, 65536 : i32], write_sram_ids = [65536 : i32]}
+// PLACEMENT-SAME: read_memrefs(%write_outputs, %write_outputs_0, %arg9 : memref<?xi32>, memref<?xi32>, memref<?xi32>) write_memrefs(%arg9 : memref<?xi32>) [original_read_memrefs(%arg5, %arg6, %arg9 : memref<?xi32>, memref<?xi32>, memref<?xi32>), original_write_memrefs(%arg9 : memref<?xi32>)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locs = [{col = 0 : i32, row = 1 : i32}, {col = 1 : i32, row = 1 : i32}, {col = 0 : i32, row = 1 : i32}], write_sram_locs = [{col = 0 : i32, row = 1 : i32}]}}
 // PLACEMENT:      taskflow.task @Task_3
-// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_ids = [2 : i32], write_sram_ids = [65538 : i32]}
+// PLACEMENT-SAME: read_memrefs(%arg3 : memref<?x7xi32>) write_memrefs(%arg7 : memref<?xi32>) [original_read_memrefs(%arg3 : memref<?x7xi32>), original_write_memrefs(%arg7 : memref<?xi32>)] {task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locs = [{col = 2 : i32, row = 0 : i32}], write_sram_locs = [{col = 2 : i32, row = 1 : i32}]}}
 // PLACEMENT:      taskflow.task @Task_4
-// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_ids = [65537 : i32, 65538 : i32], write_sram_ids = [65537 : i32]}
\ No newline at end of file
+// PLACEMENT-SAME: read_memrefs(%arg4, %write_outputs_2 : memref<?x9xi32>, memref<?xi32>) write_memrefs(%arg8 : memref<?xi32>) [original_read_memrefs(%arg4, %arg7 : memref<?x9xi32>, memref<?xi32>), original_write_memrefs(%arg8 : memref<?xi32>)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locs = [{col = 1 : i32, row = 1 : i32}, {col = 2 : i32, row = 1 : i32}], write_sram_locs = [{col = 1 : i32, row = 1 : i32}]}}
\ No newline at end of file
diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
index ec98f6a1..a91737f3 100644
--- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
+++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
@@ -130,6 +130,6 @@ module {
 // HYPERBLOCK-NEXT: }
 
 // PLACEMENT:      taskflow.task @Task_0
-// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_ids = [0 : i32], write_sram_ids = [0 : i32]}
+// PLACEMENT-SAME: read_memrefs(%arg0 : memref<16xf32>) write_memrefs(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0 : memref<16xf32>), original_write_memrefs(%arg0 : memref<16xf32>)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locs = [{col = 0 : i32, row = 0 : i32}], write_sram_locs = [{col = 0 : i32, row = 0 : i32}]}}
 // PLACEMENT:      taskflow.task @Task_1
-// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_ids = [1 : i32, 1 : i32], write_sram_ids = [1 : i32]}
\ No newline at end of file
+// PLACEMENT-SAME: read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) write_memrefs(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>), original_write_memrefs(%arg3 : memref<8x8xf32>)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locs = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locs = [{col = 1 : i32, row = 0 : i32}]}}
\ No newline at end of file

From a6603029246f2a1dd937de91fbfd11d0b710d66c Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Fri, 6 Feb 2026 02:36:27 +0800
Subject: [PATCH 11/15] Refactor CGRA placement: rename pass and classes,
 address reviewer comments on mapping info readability and encoding

---
 include/TaskflowDialect/TaskflowPasses.h      |   2 +-
 include/TaskflowDialect/TaskflowPasses.td     |  10 +-
 lib/TaskflowDialect/Transforms/CMakeLists.txt |   2 +-
 ...GRAArrayPass.cpp => MapTaskOnCgraPass.cpp} | 109 ++++++++----------
 .../irregular-loop/irregular-loop.mlir        |   9 +-
 .../taskflow/multi-nested/multi-nested.mlir   |  13 +--
 .../parallel-nested/parallel-nested.mlir      |   6 +-
 7 files changed, 70 insertions(+), 81 deletions(-)
 rename lib/TaskflowDialect/Transforms/{MapCTOnCGRAArrayPass.cpp => MapTaskOnCgraPass.cpp} (84%)

diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
index 33a8199e..c50544c9 100644
--- a/include/TaskflowDialect/TaskflowPasses.h
+++ b/include/TaskflowDialect/TaskflowPasses.h
@@ -17,7 +17,7 @@ namespace taskflow {
 #include "TaskflowDialect/TaskflowPasses.h.inc"
 std::unique_ptr<mlir::Pass> createConstructHyperblockFromTaskPass();
 std::unique_ptr<mlir::Pass> createClassifyCountersPass();
-std::unique_ptr<mlir::Pass> createMapCTOnCGRAArrayPass();
+std::unique_ptr<mlir::Pass> createMapTaskOnCgraPass();
 
 //=========================================================//
 // Optimization Passes
diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td
index dd7f4db2..0b37631c 100644
--- a/include/TaskflowDialect/TaskflowPasses.td
+++ b/include/TaskflowDialect/TaskflowPasses.td
@@ -46,15 +46,15 @@ def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{
   let constructor = "taskflow::createClassifyCountersPass()";
 }
 
-def MapCTOnCGRAArray : Pass<"map-ct-on-cgra-array", "func::FuncOp"> {
-  let summary = "Maps Canonical Tasks (CTs) onto a 2D CGRA grid array";
+def MapTaskOnCgra : Pass<"map-task-on-cgra", "func::FuncOp"> {
+  let summary = "Maps Taskflow tasks onto a 2D CGRA grid array";
   let description = [{
-    This pass maps Canonical Tasks (CTs) onto a 2D CGRA grid array.
+    This pass maps Taskflow tasks onto a 2D CGRA grid array.
     Fusion candidates (same-header SSA dependencies) are placed on adjacent
     CGRAs to enable direct data forwarding.
 
-    Uses a default 4x4 CGRA grid.
+    Uses a default 3x3 CGRA grid.
   }];
-  let constructor = "taskflow::createMapCTOnCGRAArrayPass()";
+  let constructor = "taskflow::createMapTaskOnCgraPass()";
 }
 #endif // TASKFLOW_PASSES_TD
\ No newline at end of file
diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt
index d2254cf7..5dcb6736 100644
--- a/lib/TaskflowDialect/Transforms/CMakeLists.txt
+++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt
@@ -3,7 +3,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 add_mlir_library(MLIRTaskflowTransforms
     ConstructHyperblockFromTaskPass.cpp
     ClassifyCountersPass.cpp
-    MapCTOnCGRAArrayPass.cpp
+    MapTaskOnCgraPass.cpp
 
     DEPENDS
     MLIRTaskflowTransformsIncGen
diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
similarity index 84%
rename from lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
rename to lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
index 44fb41e6..adf845d9 100644
--- a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp
+++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
@@ -1,4 +1,4 @@
-//===- MapCTOnCGRAArrayPass.cpp - CT to CGRA Mapping Pass ----------------===//
+//===- MapTaskOnCgraPass.cpp - Task to CGRA Mapping Pass ----------------===//
 //
 // This pass maps Canonical Tasks (CTs) onto a 2D CGRA grid array:
 // 1. Places tasks with SSA dependencies on adjacent CGRAs.
@@ -22,6 +22,7 @@
 #include <algorithm>
 #include <climits>
 #include <cmath>
+#include <optional>
 #include <string>
 #include <vector>
 
@@ -42,6 +43,10 @@ struct CGRAPosition {
     return row == other.row && col == other.col;
   }
 
+  bool operator!=(const CGRAPosition &other) const {
+    return !(*this == other);
+  }
+
   /// Computes Manhattan distance to another position.
   int manhattanDistance(const CGRAPosition &other) const {
     return std::abs(row - other.row) + std::abs(col - other.col);
@@ -114,7 +119,7 @@ struct MemoryNode {
   SmallVector<TaskNode *> writers;
 
   // Mapping result
-  int assigned_sram_id = -1;
+  std::optional<CGRAPosition> assigned_sram_pos;
 
   MemoryNode(Value memref) : memref(memref) {}
 };
@@ -315,40 +320,37 @@ class TaskMapper {
             builder.getArrayAttr(pos_attrs)));
 
         // 2. Read SRAM Locations
-        SmallVector<Attribute> read_sram_attrs;
-        for (MemoryNode *mem : task_node->read_memrefs) {
-            int row = mem->assigned_sram_id >> 16;
-            int col = mem->assigned_sram_id & 0xFFFF;
-            SmallVector<NamedAttribute, 2> coord_attrs;
-            coord_attrs.push_back(NamedAttribute(
-                StringAttr::get(func.getContext(), "row"),
-                builder.getI32IntegerAttr(row)));
-            coord_attrs.push_back(NamedAttribute(
-                StringAttr::get(func.getContext(), "col"),
-                builder.getI32IntegerAttr(col)));
-            read_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), coord_attrs));
+        if (!task_node->read_memrefs.empty()) {
+            SmallVector<Attribute> read_sram_attrs;
+            for (MemoryNode *mem : task_node->read_memrefs) {
+                if (mem->assigned_sram_pos) {
+                    SmallVector<NamedAttribute, 2> sram_coord;
+                    sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "row"), builder.getI32IntegerAttr(mem->assigned_sram_pos->row)));
+                    sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "col"), builder.getI32IntegerAttr(mem->assigned_sram_pos->col)));
+                    read_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), sram_coord));
+                }
+            }
+            mapping_attrs.push_back(NamedAttribute(
+                StringAttr::get(func.getContext(), "read_sram_locations"),
+                builder.getArrayAttr(read_sram_attrs)));
         }
-        mapping_attrs.push_back(NamedAttribute(
-            StringAttr::get(func.getContext(), "read_sram_locs"),
-            builder.getArrayAttr(read_sram_attrs)));
 
         // 3. Write SRAM Locations
-        SmallVector<Attribute> write_sram_attrs;
-        for (MemoryNode *mem : task_node->write_memrefs) {
-            int row = mem->assigned_sram_id >> 16;
-            int col = mem->assigned_sram_id & 0xFFFF;
-            SmallVector<NamedAttribute, 2> coord_attrs;
-            coord_attrs.push_back(NamedAttribute(
-                StringAttr::get(func.getContext(), "row"),
-                builder.getI32IntegerAttr(row)));
-            coord_attrs.push_back(NamedAttribute(
-                StringAttr::get(func.getContext(), "col"),
-                builder.getI32IntegerAttr(col)));
-            write_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), coord_attrs));
+        if (!task_node->write_memrefs.empty()) {
+            SmallVector<Attribute> write_sram_attrs;
+            for (MemoryNode *mem : task_node->write_memrefs) {
+                if (mem->assigned_sram_pos) {
+                  SmallVector<NamedAttribute, 2> sram_coord;
+                  sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "row"), builder.getI32IntegerAttr(mem->assigned_sram_pos->row)));
+                  sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "col"), builder.getI32IntegerAttr(mem->assigned_sram_pos->col)));
+
+                  write_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), sram_coord));
+                }
+            }
+            mapping_attrs.push_back(NamedAttribute(
+                StringAttr::get(func.getContext(), "write_sram_locations"),
+                builder.getArrayAttr(write_sram_attrs)));
         }
-        mapping_attrs.push_back(NamedAttribute(
-            StringAttr::get(func.getContext(), "write_sram_locs"),
-            builder.getArrayAttr(write_sram_attrs)));
 
         // Set Attribute
         task_node->op->setAttr("task_mapping_info", DictionaryAttr::get(func.getContext(), mapping_attrs));
@@ -389,19 +391,16 @@ class TaskMapper {
         }
       }
       
-      int new_sram_id = 0;
+      std::optional<CGRAPosition> new_sram_pos;
       if (count > 0) {
         // Rounds to the nearest integer.
         int avg_row = (total_row + count / 2) / count;
         int avg_col = (total_col + count / 2) / count;
-        // SRAM ID encoding: (row << 16) | col
-        new_sram_id = (avg_row << 16) | (avg_col & 0xFFFF);
-      } else {
-        new_sram_id = 0; // Default fallback
+        new_sram_pos = CGRAPosition{avg_row, avg_col};
       }
 
-      if (mem_node->assigned_sram_id != new_sram_id) {
-        mem_node->assigned_sram_id = new_sram_id;
+      if (mem_node->assigned_sram_pos != new_sram_pos) {
+        mem_node->assigned_sram_pos = new_sram_pos;
         changed = true;
       }
     }
@@ -481,12 +480,8 @@ class TaskMapper {
     // 2. Memory Proximity
     // For Read MemRefs
     for (MemoryNode *mem : task_node->read_memrefs) {
-        if (mem->assigned_sram_id != -1) {
-            // SRAM ID encoding: (row << 16) | col
-            int sram_r = mem->assigned_sram_id >> 16;
-            int sram_c = mem->assigned_sram_id & 0xFFFF;
-            CGRAPosition sram_pos{sram_r, sram_c};
-            int dist = current_pos.manhattanDistance(sram_pos);
+        if (mem->assigned_sram_pos) {
+            int dist = current_pos.manhattanDistance(*mem->assigned_sram_pos);
             mem_score -= dist;
         }
     }
@@ -494,12 +489,8 @@ class TaskMapper {
     // If we write to a memory that is already assigned (e.g. read by previous task),
     // we want to be close to it too.
     for (MemoryNode *mem : task_node->write_memrefs) {
-         if (mem->assigned_sram_id != -1) {
-            // SRAM ID encoding: (row << 16) | col
-            int sram_r = mem->assigned_sram_id >> 16;
-            int sram_c = mem->assigned_sram_id & 0xFFFF;
-            CGRAPosition sram_pos{sram_r, sram_c};
-            int dist = current_pos.manhattanDistance(sram_pos);
+         if (mem->assigned_sram_pos) {
+            int dist = current_pos.manhattanDistance(*mem->assigned_sram_pos);
             mem_score -= dist;
         }
     }
@@ -545,16 +536,16 @@ class TaskMapper {
 //===----------------------------------------------------------------------===//
 // Pass Definition
 //===----------------------------------------------------------------------===//
-struct MapCTOnCGRAArrayPass
-    : public PassWrapper<MapCTOnCGRAArrayPass, OperationPass<func::FuncOp>> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapCTOnCGRAArrayPass)
+struct MapTaskOnCgraPass
+    : public PassWrapper<MapTaskOnCgraPass, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapTaskOnCgraPass)
 
-  MapCTOnCGRAArrayPass() = default;
+  MapTaskOnCgraPass() = default;
 
-  StringRef getArgument() const override { return "map-ct-on-cgra-array"; }
+  StringRef getArgument() const override { return "map-task-on-cgra"; }
 
   StringRef getDescription() const override {
-    return "Maps Canonical Tasks (CTs) onto a 2D CGRA grid with adjacency "
+    return "Maps Taskflow tasks onto a 2D CGRA grid with adjacency "
            "optimization and memory mapping.";
   }
 
@@ -572,8 +563,8 @@ struct MapCTOnCGRAArrayPass
 namespace mlir {
 namespace taskflow {
 
-std::unique_ptr<Pass> createMapCTOnCGRAArrayPass() {
-  return std::make_unique<MapCTOnCGRAArrayPass>();
+std::unique_ptr<Pass> createMapTaskOnCgraPass() {
+  return std::make_unique<MapTaskOnCgraPass>();
 }
 
 } // namespace taskflow
diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
index 2d614b98..057d6c15 100644
--- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
+++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
@@ -22,7 +22,7 @@
 // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \
 // RUN: --convert-affine-to-taskflow \
 // RUN: --construct-hyperblock-from-task \
-// RUN: --map-ct-on-cgra-array \
+// RUN: --map-task-on-cgra \
 // RUN: -o %t.placement.mlir
 // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
 
@@ -309,9 +309,8 @@ module attributes {} {
 // HYPERBLOCK-NEXT: }
 
 // PLACEMENT:      taskflow.task @Task_0
-// PLACEMENT-SAME: value_inputs(%c0_i32 : i32) {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locs = [], write_sram_locs = []}}
+// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}]}
 // PLACEMENT:      taskflow.task @Task_1
-// PLACEMENT-SAME: write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0 : memref<4x8xi32>)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locs = [], write_sram_locs = [{col = 1 : i32, row = 1 : i32}]}}
+// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]}
 // PLACEMENT:      taskflow.task @Task_2
-// PLACEMENT-SAME: read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref<i32>) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref<i32>)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locs = [{col = 1 : i32, row = 1 : i32}], write_sram_locs = [{col = 0 : i32, row = 1 : i32}]}}
-
+// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]}
diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
index ce3d4071..1bfb2bb2 100644
--- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
+++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
@@ -22,7 +22,7 @@
 // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \
 // RUN: --convert-affine-to-taskflow \
 // RUN: --construct-hyperblock-from-task \
-// RUN: --map-ct-on-cgra-array \
+// RUN: --map-task-on-cgra \
 // RUN: -o %t.placement.mlir
 // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
 
@@ -373,16 +373,15 @@ module attributes {} {
 // HYPERBLOCK-NEXT:    %0 = affine.load %write_outputs_1[0] : memref<?xi32>
 // HYPERBLOCK-NEXT:    return %0 : i32
 // HYPERBLOCK-NEXT:  }
-
 // HYPERBLOCK-NEXT:}
 
 // PLACEMENT:      taskflow.task @Task_0
-// PLACEMENT-SAME: read_memrefs(%arg0 : memref<?x8x6xi32>) write_memrefs(%arg5 : memref<?xi32>) [original_read_memrefs(%arg0 : memref<?x8x6xi32>), original_write_memrefs(%arg5 : memref<?xi32>)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locs = [{col = 0 : i32, row = 0 : i32}], write_sram_locs = [{col = 0 : i32, row = 1 : i32}]}}
+// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locations = [{col = 0 : i32, row = 0 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]}
 // PLACEMENT:      taskflow.task @Task_1
-// PLACEMENT-SAME: read_memrefs(%arg1, %arg2 : memref<?x8x5xi32>, memref<?x8x5xi32>) write_memrefs(%arg6 : memref<?xi32>) [original_read_memrefs(%arg1, %arg2 : memref<?x8x5xi32>, memref<?x8x5xi32>), original_write_memrefs(%arg6 : memref<?xi32>)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locs = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locs = [{col = 1 : i32, row = 1 : i32}]}}
+// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]}
 // PLACEMENT:      taskflow.task @Task_2
-// PLACEMENT-SAME: read_memrefs(%write_outputs, %write_outputs_0, %arg9 : memref<?xi32>, memref<?xi32>, memref<?xi32>) write_memrefs(%arg9 : memref<?xi32>) [original_read_memrefs(%arg5, %arg6, %arg9 : memref<?xi32>, memref<?xi32>, memref<?xi32>), original_write_memrefs(%arg9 : memref<?xi32>)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locs = [{col = 0 : i32, row = 1 : i32}, {col = 1 : i32, row = 1 : i32}, {col = 0 : i32, row = 1 : i32}], write_sram_locs = [{col = 0 : i32, row = 1 : i32}]}}
+// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locations = [{col = 0 : i32, row = 1 : i32}, {col = 1 : i32, row = 1 : i32}, {col = 0 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]}
 // PLACEMENT:      taskflow.task @Task_3
-// PLACEMENT-SAME: read_memrefs(%arg3 : memref<?x7xi32>) write_memrefs(%arg7 : memref<?xi32>) [original_read_memrefs(%arg3 : memref<?x7xi32>), original_write_memrefs(%arg7 : memref<?xi32>)] {task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locs = [{col = 2 : i32, row = 0 : i32}], write_sram_locs = [{col = 2 : i32, row = 1 : i32}]}}
+// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 1 : i32}]}
 // PLACEMENT:      taskflow.task @Task_4
-// PLACEMENT-SAME: read_memrefs(%arg4, %write_outputs_2 : memref<?x9xi32>, memref<?xi32>) write_memrefs(%arg8 : memref<?xi32>) [original_read_memrefs(%arg4, %arg7 : memref<?x9xi32>, memref<?xi32>), original_write_memrefs(%arg8 : memref<?xi32>)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locs = [{col = 1 : i32, row = 1 : i32}, {col = 2 : i32, row = 1 : i32}], write_sram_locs = [{col = 1 : i32, row = 1 : i32}]}}
\ No newline at end of file
+// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}, {col = 2 : i32, row = 1 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]}
\ No newline at end of file
diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
index a91737f3..6c0cd57b 100644
--- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
+++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir
@@ -16,7 +16,7 @@
 // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \
 // RUN: --convert-affine-to-taskflow \
 // RUN: --construct-hyperblock-from-task \
-// RUN: --map-ct-on-cgra-array \
+// RUN: --map-task-on-cgra \
 // RUN: -o %t.placement.mlir
 // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT
 
@@ -130,6 +130,6 @@ module {
 // HYPERBLOCK-NEXT: }
 
 // PLACEMENT:      taskflow.task @Task_0
-// PLACEMENT-SAME: read_memrefs(%arg0 : memref<16xf32>) write_memrefs(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0 : memref<16xf32>), original_write_memrefs(%arg0 : memref<16xf32>)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locs = [{col = 0 : i32, row = 0 : i32}], write_sram_locs = [{col = 0 : i32, row = 0 : i32}]}}
+// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locations = [{col = 0 : i32, row = 0 : i32}], write_sram_locations = [{col = 0 : i32, row = 0 : i32}]}
 // PLACEMENT:      taskflow.task @Task_1
-// PLACEMENT-SAME: read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) write_memrefs(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>), original_write_memrefs(%arg3 : memref<8x8xf32>)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locs = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locs = [{col = 1 : i32, row = 0 : i32}]}}
\ No newline at end of file
+// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 0 : i32}]}
\ No newline at end of file

From 47ea607e948950fadd54dfdf2573e23de91f4f35 Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Fri, 6 Feb 2026 03:11:58 +0800
Subject: [PATCH 12/15] Refactor CGRA placement: rename pass to MapTaskOnCgra,
 update dependency terminology to dependency_depth, and improve SRAM mapping
 readability

---
 .../Transforms/MapTaskOnCgraPass.cpp          | 102 +++++++++---------
 .../irregular-loop/irregular-loop.mlir        |   4 +-
 2 files changed, 56 insertions(+), 50 deletions(-)

diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
index adf845d9..7d859e6d 100644
--- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
+++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
@@ -96,7 +96,7 @@ struct MemoryNode;
 struct TaskNode {
   size_t id;
   TaskflowTaskOp op;
-  int alap_level = 0;
+  int dependency_depth = 0;  // Longest path to any sink in the dependency graph.
   
   // Edges based on original memory access.
   SmallVector<MemoryNode *> read_memrefs;  // original_read_memrefs
@@ -228,24 +228,25 @@ class TaskMapper {
       return;
     }
 
-    // Computes ALAP Levels on the Task Graph.
-    // We use ALAP (As Late As Possible) to identify the critical path and
-    // prioritize tasks that are closer to the sink, which helps in minimizing
-    // the overall dependency latency during placement.
-    computeALAP(graph);
+    // Computes Dependency Depth for each task.
+    // Dependency depth = longest path from this node to any sink in the
+    // dependency graph (considering both SSA and memory edges). Tasks with
+    // higher depth are more "critical" and are placed first to ensure their
+    // dependent chains have good locality.
+    computeDependencyDepth(graph);
 
-    // Sorts tasks by ALAP level (Critical Path First).
+    // Sorts tasks by dependency depth (Critical Path First).
     SmallVector<TaskNode *> sorted_tasks;
     for (auto &node : graph.task_nodes) sorted_tasks.push_back(node.get());
     
     std::stable_sort(sorted_tasks.begin(), sorted_tasks.end(), 
         [](TaskNode *a, TaskNode *b) {
-            return a->alap_level > b->alap_level; 
+            return a->dependency_depth > b->dependency_depth; 
         });
 
-    // Critical path priority placement:
-    // 1. Computes ALAP level for each task (longest path to sink).
-    // 2. Sorts tasks by: (a) ALAP level, (b) criticality, (c) degree.
+    // Critical-path-first placement:
+    // 1. Computes dependency depth for each task (longest path to sink).
+    // 2. Sorts tasks by dependency depth (higher = more critical).
     // 3. Places tasks in sorted order with heuristic scoring.
     // Iterative Refinement Loop (Coordinate Descent).
     // Alternates between Task Placement (Phase 1) and SRAM Assignment (Phase 2).
@@ -320,37 +321,33 @@ class TaskMapper {
             builder.getArrayAttr(pos_attrs)));
 
         // 2. Read SRAM Locations
-        if (!task_node->read_memrefs.empty()) {
-            SmallVector<Attribute> read_sram_attrs;
-            for (MemoryNode *mem : task_node->read_memrefs) {
-                if (mem->assigned_sram_pos) {
-                    SmallVector<NamedAttribute, 2> sram_coord;
-                    sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "row"), builder.getI32IntegerAttr(mem->assigned_sram_pos->row)));
-                    sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "col"), builder.getI32IntegerAttr(mem->assigned_sram_pos->col)));
-                    read_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), sram_coord));
-                }
+        SmallVector<Attribute> read_sram_attrs;
+        for (MemoryNode *mem : task_node->read_memrefs) {
+            if (mem->assigned_sram_pos) {
+                SmallVector<NamedAttribute, 2> sram_coord;
+                sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "row"), builder.getI32IntegerAttr(mem->assigned_sram_pos->row)));
+                sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "col"), builder.getI32IntegerAttr(mem->assigned_sram_pos->col)));
+                read_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), sram_coord));
             }
-            mapping_attrs.push_back(NamedAttribute(
-                StringAttr::get(func.getContext(), "read_sram_locations"),
-                builder.getArrayAttr(read_sram_attrs)));
         }
+        mapping_attrs.push_back(NamedAttribute(
+            StringAttr::get(func.getContext(), "read_sram_locations"),
+            builder.getArrayAttr(read_sram_attrs)));
 
         // 3. Write SRAM Locations
-        if (!task_node->write_memrefs.empty()) {
-            SmallVector<Attribute> write_sram_attrs;
-            for (MemoryNode *mem : task_node->write_memrefs) {
-                if (mem->assigned_sram_pos) {
-                  SmallVector<NamedAttribute, 2> sram_coord;
-                  sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "row"), builder.getI32IntegerAttr(mem->assigned_sram_pos->row)));
-                  sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "col"), builder.getI32IntegerAttr(mem->assigned_sram_pos->col)));
-
-                  write_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), sram_coord));
-                }
+        SmallVector<Attribute> write_sram_attrs;
+        for (MemoryNode *mem : task_node->write_memrefs) {
+            if (mem->assigned_sram_pos) {
+              SmallVector<NamedAttribute, 2> sram_coord;
+              sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "row"), builder.getI32IntegerAttr(mem->assigned_sram_pos->row)));
+              sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "col"), builder.getI32IntegerAttr(mem->assigned_sram_pos->col)));
+
+              write_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), sram_coord));
             }
-            mapping_attrs.push_back(NamedAttribute(
-                StringAttr::get(func.getContext(), "write_sram_locations"),
-                builder.getArrayAttr(write_sram_attrs)));
         }
+        mapping_attrs.push_back(NamedAttribute(
+            StringAttr::get(func.getContext(), "write_sram_locations"),
+            builder.getArrayAttr(write_sram_attrs)));
 
         // Set Attribute
         task_node->op->setAttr("task_mapping_info", DictionaryAttr::get(func.getContext(), mapping_attrs));
@@ -498,32 +495,41 @@ class TaskMapper {
     return kAlpha * ssa_score + kBeta * mem_score;
   }
 
-  /// Computes ALAP levels considering both SSA and memory dependencies.
-  void computeALAP(TaskMemoryGraph &graph) {
-    // DFS for longest path from node to any sink (ALAP Level).
-    DenseMap<TaskNode*, int> node_alap_cache;
+  /// Computes dependency depth for all tasks in the graph.
+  ///
+  /// Dependency depth = longest path from this node to any sink node in the
+  /// dependency graph (via SSA or memory edges).
+  ///
+  /// Tasks with higher dependency depth have longer chains of dependent tasks
+  /// after them. By placing these tasks first:
+  /// 1. They get priority access to good grid positions.
+  /// 2. Their dependent tasks can then be positioned adjacent to them,
+  ///    minimizing inter-task communication distance.
+  void computeDependencyDepth(TaskMemoryGraph &graph) {
+    DenseMap<TaskNode*, int> depth_cache;
     for (auto &node : graph.task_nodes) {
-        node->alap_level = calculateLevel(node.get(), node_alap_cache);
+        node->dependency_depth = calculateDepth(node.get(), depth_cache);
     }
   }
 
-  int calculateLevel(TaskNode *node, DenseMap<TaskNode*, int> &node_alap_cache) {
-    if (node_alap_cache.count(node)) return node_alap_cache[node];
+  int calculateDepth(TaskNode *node, DenseMap<TaskNode*, int> &depth_cache) {
+    if (depth_cache.count(node)) return depth_cache[node];
 
-    int max_child_level = 0;
+    int max_child_depth = 0;
+    // SSA dependencies.
     for (TaskNode *child : node->ssa_users) {
-        max_child_level = std::max(max_child_level, calculateLevel(child, node_alap_cache) + 1);
+        max_child_depth = std::max(max_child_depth, calculateDepth(child, depth_cache) + 1);
     }
 
-    // Checks memory dependencies too (Producer -> Mem -> Consumer).
+    // Memory dependencies (Producer -> Mem -> Consumer).
     for (MemoryNode *mem : node->write_memrefs) {
         for (TaskNode *reader : mem->readers) {
             if (reader != node)
-                max_child_level = std::max(max_child_level, calculateLevel(reader, node_alap_cache) + 1);
+                max_child_depth = std::max(max_child_depth, calculateDepth(reader, depth_cache) + 1);
         }
     }
 
-    return node_alap_cache[node] = max_child_level;
+    return depth_cache[node] = max_child_depth;
   }
 
 
diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
index 057d6c15..a8b9d8c2 100644
--- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
+++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
@@ -309,8 +309,8 @@ module attributes {} {
 // HYPERBLOCK-NEXT: }
 
 // PLACEMENT:      taskflow.task @Task_0
-// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}]}
+// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locations = [], write_sram_locations = []}
 // PLACEMENT:      taskflow.task @Task_1
-// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]}
+// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locations = [], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]}
 // PLACEMENT:      taskflow.task @Task_2
 // PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]}

From db107d35e42749dbe148f5cfda1bd583f4ba157e Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Fri, 6 Feb 2026 03:19:58 +0800
Subject: [PATCH 13/15] Style polish: add braces to if statements, update
 header comments, and clean up whitespace

---
 .../Transforms/MapTaskOnCgraPass.cpp          | 34 +++++++++++--------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
index 7d859e6d..0cd2c719 100644
--- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
+++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
@@ -1,6 +1,6 @@
 //===- MapTaskOnCgraPass.cpp - Task to CGRA Mapping Pass ----------------===//
 //
-// This pass maps Canonical Tasks (CTs) onto a 2D CGRA grid array:
+// This pass maps Taskflow tasks onto a 2D CGRA grid array:
 // 1. Places tasks with SSA dependencies on adjacent CGRAs.
 // 2. Assigns memrefs to SRAMs (each MemRef is assigned to exactly one SRAM,
 //    determined by proximity to the task that first accesses it).
@@ -251,12 +251,12 @@ class TaskMapper {
     // Iterative Refinement Loop (Coordinate Descent).
     // Alternates between Task Placement (Phase 1) and SRAM Assignment (Phase 2).
     constexpr int kMaxIterations = 10;
-    
-
-
+  
     for (int iter = 0; iter < kMaxIterations; ++iter) {
         // Phase 1: Place Tasks (assuming fixed SRAMs).
-        if (iter > 0) resetTaskPlacements(graph);
+        if (iter > 0) {
+            resetTaskPlacements(graph);
+        }
 
         for (TaskNode *task_node : sorted_tasks) {
           int cgra_count = 1;
@@ -277,8 +277,9 @@ class TaskMapper {
 
           // Marks Occupied.
           for (const auto &pos : placement.cgra_positions) {
-            if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && pos.col < grid_cols_)
+            if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && pos.col < grid_cols_) {
                 occupied_[pos.row][pos.col] = true;
+            }
           }
         }
 
@@ -290,7 +291,6 @@ class TaskMapper {
         // Convergence Check.
         // If SRAMs didn't move, it means task placement based on them likely won't change either.
         if (iter > 0 && !sram_moved) {
-
             break; 
         }
     }
@@ -300,7 +300,9 @@ class TaskMapper {
     // Annotates Result.
     OpBuilder builder(func.getContext());
     for (auto &task_node : graph.task_nodes) {
-        if (task_node->placement.empty()) continue;
+        if (task_node->placement.empty()) {
+            continue;
+        }
         
         SmallVector<NamedAttribute, 4> mapping_attrs;
 
@@ -320,7 +322,7 @@ class TaskMapper {
             StringAttr::get(func.getContext(), "cgra_positions"),
             builder.getArrayAttr(pos_attrs)));
 
-        // 2. Read SRAM Locations
+        // 2. Reads SRAM Locations.
         SmallVector<Attribute> read_sram_attrs;
         for (MemoryNode *mem : task_node->read_memrefs) {
             if (mem->assigned_sram_pos) {
@@ -334,7 +336,7 @@ class TaskMapper {
             StringAttr::get(func.getContext(), "read_sram_locations"),
             builder.getArrayAttr(read_sram_attrs)));
 
-        // 3. Write SRAM Locations
+        // 3. Writes SRAM Locations.
         SmallVector<Attribute> write_sram_attrs;
         for (MemoryNode *mem : task_node->write_memrefs) {
             if (mem->assigned_sram_pos) {
@@ -349,7 +351,7 @@ class TaskMapper {
             StringAttr::get(func.getContext(), "write_sram_locations"),
             builder.getArrayAttr(write_sram_attrs)));
 
-        // Set Attribute
+        // Sets Attribute.
         task_node->op->setAttr("task_mapping_info", DictionaryAttr::get(func.getContext(), mapping_attrs));
     }
   }
@@ -417,8 +419,9 @@ class TaskMapper {
     // Baseline: For cgra_count=1, finds single best position.
     for (int r = 0; r < grid_rows_; ++r) {
       for (int c = 0; c < grid_cols_; ++c) {
-        if (occupied_[r][c])
+        if (occupied_[r][c]) {
           continue;
+        }
 
         TaskPlacement candidate;
         candidate.cgra_positions.push_back({r, c});
@@ -513,7 +516,9 @@ class TaskMapper {
   }
 
   int calculateDepth(TaskNode *node, DenseMap<TaskNode*, int> &depth_cache) {
-    if (depth_cache.count(node)) return depth_cache[node];
+    if (depth_cache.count(node)) {
+        return depth_cache[node];
+    }
 
     int max_child_depth = 0;
     // SSA dependencies.
@@ -524,8 +529,9 @@ class TaskMapper {
     // Memory dependencies (Producer -> Mem -> Consumer).
     for (MemoryNode *mem : node->write_memrefs) {
         for (TaskNode *reader : mem->readers) {
-            if (reader != node)
+            if (reader != node) {
                 max_child_depth = std::max(max_child_depth, calculateDepth(reader, depth_cache) + 1);
+            }
         }
     }
 

From 09eed47236679d7c4284a0537fdf2b5e6eb90627 Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Fri, 6 Feb 2026 12:15:04 +0800
Subject: [PATCH 14/15] Cleanup: fix comment formatting and typos according to
 reviewer feedback

---
 .../Transforms/MapTaskOnCgraPass.cpp          | 28 +++++++++----------
 test/benchmark/CGRA-Bench                     |  2 +-
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
index 0cd2c719..54367af1 100644
--- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
+++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
@@ -99,8 +99,8 @@ struct TaskNode {
   int dependency_depth = 0;  // Longest path to any sink in the dependency graph.
   
   // Edges based on original memory access.
-  SmallVector<MemoryNode *> read_memrefs;  // original_read_memrefs
-  SmallVector<MemoryNode *> write_memrefs; // original_write_memrefs
+  SmallVector<MemoryNode *> read_memrefs;  // Original read memrefs.
+  SmallVector<MemoryNode *> write_memrefs; // Original write memrefs.
   SmallVector<TaskNode *> ssa_users;
   SmallVector<TaskNode *> ssa_operands;
 
@@ -114,11 +114,11 @@ struct TaskNode {
 struct MemoryNode {
   Value memref;
   
-  // Edges
+  // Edges.
   SmallVector<TaskNode *> readers;
   SmallVector<TaskNode *> writers;
-
-  // Mapping result
+  
+  // Mapping result.
   std::optional<CGRAPosition> assigned_sram_pos;
 
   MemoryNode(Value memref) : memref(memref) {}
@@ -163,7 +163,7 @@ class TaskMemoryGraph {
     // 3. Builds SSA Edges (Inter-Task Value Dependencies).
     // Identifies if a task uses a value produced by another task.
     for (auto &consumer_node : task_nodes) {
-        // Interates all operands for now to be safe.
+        // Iterates all operands for now to be safe.
         for (Value operand : consumer_node->op.getValueInputs()) {
             if (auto producer_op = operand.getDefiningOp<TaskflowTaskOp>()) {
                 if (auto *producer_node = op_to_node[producer_op]) {
@@ -306,7 +306,7 @@ class TaskMapper {
         
         SmallVector<NamedAttribute, 4> mapping_attrs;
 
-        // 1. CGRA Positions
+        // 1. CGRA positions.
         SmallVector<Attribute> pos_attrs;
         for (const auto &pos : task_node->placement) {
             SmallVector<NamedAttribute, 2> coord_attrs;
@@ -447,22 +447,22 @@ class TaskMapper {
   /// downstream hardware generators to configure fast bypass paths between
   /// adjacent PEs with dependencies.
   ///
-  /// Score = α·SSA_Dist + β·Mem_Dist
+  /// Score = α·SSA_Dist + β·Mem_Dist.
   ///
   /// SSA_Dist: Minimize distance to placed SSA predecessors (ssa_operands).
   /// Mem_Dist: Minimize distance to assigned SRAMs for read/write memrefs.
   int computeScore(TaskNode *task_node, const TaskPlacement &placement,
                    TaskMemoryGraph &graph) {
     // Weight constants (tunable).
-    constexpr int kAlpha = 10;   // SSA proximity weight
-    constexpr int kBeta = 50;    // Memory proximity weight (High priority)
+    constexpr int kAlpha = 10;   // SSA proximity weight.
+    constexpr int kBeta = 50;    // Memory proximity weight (high priority).
 
     int ssa_score = 0;
     int mem_score = 0;
     
     CGRAPosition current_pos = placement.primary();
 
-    // 1. SSA Proximity (Predecessors & Successors)
+    // 1. SSA proximity (predecessors & successors).
     for (TaskNode *producer : task_node->ssa_operands) {
         if (!producer->placement.empty()) {
             int dist = current_pos.manhattanDistance(producer->placement[0]);
@@ -477,15 +477,15 @@ class TaskMapper {
         }
     }
 
-    // 2. Memory Proximity
-    // For Read MemRefs
+    // 2. Memory proximity.
+    // For read memrefs.
     for (MemoryNode *mem : task_node->read_memrefs) {
         if (mem->assigned_sram_pos) {
             int dist = current_pos.manhattanDistance(*mem->assigned_sram_pos);
             mem_score -= dist;
         }
     }
-    // For Write MemRefs
+    // For write memrefs.
     // If we write to a memory that is already assigned (e.g. read by previous task),
     // we want to be close to it too.
     for (MemoryNode *mem : task_node->write_memrefs) {
diff --git a/test/benchmark/CGRA-Bench b/test/benchmark/CGRA-Bench
index 2b5e78b2..2beecc59 160000
--- a/test/benchmark/CGRA-Bench
+++ b/test/benchmark/CGRA-Bench
@@ -1 +1 @@
-Subproject commit 2b5e78b24d481c8465c82672a8d5177a86119aed
+Subproject commit 2beecc599bd268f8665344ba2271f48c97db7aa0

From d016323ba122e0f10e2808861ba0c6d5434d23e6 Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Fri, 6 Feb 2026 12:17:30 +0800
Subject: [PATCH 15/15] Cleanup: fix comment formatting and typos according to
 reviewer feedback

---
 lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
index 54367af1..c04df0b7 100644
--- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
+++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp
@@ -177,7 +177,7 @@ class TaskMemoryGraph {
 
 private:
   MemoryNode *getOrCreateMemoryNode(Value memref) {
-    if (memref_to_node.count(memref)){
+    if (memref_to_node.count(memref)) {
       return memref_to_node[memref];
     }
     
@@ -264,7 +264,7 @@ class TaskMapper {
             cgra_count = attr.getInt();
           }
 
-          // Finds Best Placement using SRAM positions from previous iter (or -1/default).
+          // Finds best placement using SRAM positions from previous iter (or -1/default).
           TaskPlacement placement = findBestPlacement(task_node, cgra_count, graph);
           
           // Commits Placement.
@@ -275,7 +275,7 @@ class TaskMapper {
              task_node->placement.push_back(placement.cgra_positions[i]);
           }
 
-          // Marks Occupied.
+          // Marks occupied.
           for (const auto &pos : placement.cgra_positions) {
             if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && pos.col < grid_cols_) {
                 occupied_[pos.row][pos.col] = true;
@@ -283,7 +283,7 @@ class TaskMapper {
           }
         }
 
-        // Phase 2: Assign SRAMs (assuming fixed Tasks).
+        // Phase 2: Assign SRAMs (assuming fixed tasks).
         bool sram_moved = assignAllSRAMs(graph);
         
 
@@ -297,7 +297,7 @@ class TaskMapper {
 
 
 
-    // Annotates Result.
+    // Annotates result.
     OpBuilder builder(func.getContext());
     for (auto &task_node : graph.task_nodes) {
         if (task_node->placement.empty()) {
@@ -515,6 +515,7 @@ class TaskMapper {
     }
   }
 
+  /// Recursively calculates dependency depth for a single task.
   int calculateDepth(TaskNode *node, DenseMap<TaskNode*, int> &depth_cache) {
     if (depth_cache.count(node)) {
         return depth_cache[node];