From 31911d228f1b4e2613cf0a260da35c8bf95078a5 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Sun, 1 Feb 2026 05:46:33 +0800 Subject: [PATCH 01/15] feat(taskflow): implement graph-based ACT placement and memory management framework --- include/TaskflowDialect/TaskflowPasses.h | 4 + include/TaskflowDialect/TaskflowPasses.td | 15 + lib/TaskflowDialect/Transforms/CMakeLists.txt | 4 + .../Transforms/PlaceACTOnCGRAPass.cpp | 481 ++++++++++++++++++ .../irregular-loop/irregular-loop.mlir | 13 + .../taskflow/multi-nested/multi-nested.mlir | 20 +- .../parallel-nested/parallel-nested.mlir | 14 +- 7 files changed, 549 insertions(+), 2 deletions(-) create mode 100644 lib/TaskflowDialect/Transforms/PlaceACTOnCGRAPass.cpp diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index c0007ce1..61073053 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -17,7 +17,11 @@ namespace taskflow { #include "TaskflowDialect/TaskflowPasses.h.inc" std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createCanonicalizeTaskPass(); +<<<<<<< HEAD std::unique_ptr createClassifyCountersPass(); +======= +std::unique_ptr createPlaceACTOnCGRAPass(); +>>>>>>> a0f7fc7 (feat(taskflow): implement graph-based ACT placement and memory management framework) #define GEN_PASS_REGISTRATION #include "TaskflowDialect/TaskflowPasses.h.inc" diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 4fc2137f..c01d8f0d 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -30,6 +30,7 @@ def CanonicalizeTask: Pass<"canonicalize-task", "func::FuncOp">{ let constructor = "taskflow::createCanonicalizeTaskPass()"; } +<<<<<<< HEAD def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{ let summary = "Classifies counters as root/relay/leaf"; let description = [{ @@ -42,5 +43,19 @@ def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{ Leaf counters are mapped to CGRA tile arrays. }]; let constructor = "taskflow::createClassifyCountersPass()"; +======= + + +def PlaceACTOnCGRA : Pass<"place-act-on-cgra", "func::FuncOp"> { + let summary = "Places ACTs onto a 2D CGRA grid with adjacency optimization"; + let description = [{ + This pass places Atomic Canonical Tasks (ACTs) onto a 2D CGRA grid. + Fusion candidates (same-header SSA dependencies) are placed on adjacent + CGRAs to enable direct data forwarding. + + Uses a default 4x4 CGRA grid. + }]; + let constructor = "taskflow::createPlaceACTOnCGRAPass()"; +>>>>>>> a0f7fc7 (feat(taskflow): implement graph-based ACT placement and memory management framework) } #endif // TASKFLOW_PASSES_TD \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index e44401d8..5177f4ae 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -3,7 +3,11 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp CanonicalizeTaskPass.cpp +<<<<<<< HEAD ClassifyCountersPass.cpp +======= + PlaceACTOnCGRAPass.cpp +>>>>>>> a0f7fc7 (feat(taskflow): implement graph-based ACT placement and memory management framework) DEPENDS MLIRTaskflowTransformsIncGen diff --git a/lib/TaskflowDialect/Transforms/PlaceACTOnCGRAPass.cpp b/lib/TaskflowDialect/Transforms/PlaceACTOnCGRAPass.cpp new file mode 100644 index 00000000..e0d67c78 --- /dev/null +++ b/lib/TaskflowDialect/Transforms/PlaceACTOnCGRAPass.cpp @@ -0,0 +1,481 @@ +//===- PlaceACTOnCGRAPass.cpp - ACT to CGRA Placement Pass ----------------===// +// +// This pass places Atomic Canonical Tasks (ACTs) onto a 2D CGRA grid: +// 1. SSA use-def placement: Tasks with SSA dependencies placed on adjacent CGRAs. +// 2. Memory mapping: Assigns memrefs to SRAMs (single-SRAM constraint per data). +// +//===----------------------------------------------------------------------===// + +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowPasses.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Builders.h" +#include "mlir/Pass/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include +#include +#include + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { + +//===----------------------------------------------------------------------===// +// CGRA Grid Position +//===----------------------------------------------------------------------===// +/// Represents a position on the 2D CGRA grid. +struct CGRAPosition { + int row; + int col; + + bool operator==(const CGRAPosition &other) const { + return row == other.row && col == other.col; + } + + /// Computes Manhattan distance to another position. + int manhattanDistance(const CGRAPosition &other) const { + return std::abs(row - other.row) + std::abs(col - other.col); + } + + /// Checks if adjacent (Manhattan distance = 1). + bool isAdjacent(const CGRAPosition &other) const { + return manhattanDistance(other) == 1; + } +}; + +//===----------------------------------------------------------------------===// +// Task Placement Info +//===----------------------------------------------------------------------===// +/// Stores placement info for a task: can span multiple combined CGRAs. +struct TaskPlacement { + SmallVector cgra_positions; // CGRAs assigned to this task. + + /// Returns the primary (first) position. + CGRAPosition primary() const { + return cgra_positions.empty() ? CGRAPosition{-1, -1} : cgra_positions[0]; + } + + /// Returns the number of CGRAs assigned. + size_t cgraCount() const { return cgra_positions.size(); } + + /// Checks if any CGRA in this task is adjacent to any in other task. + bool hasAdjacentCGRA(const TaskPlacement &other) const { + for (const auto &pos : cgra_positions) { + for (const auto &other_pos : other.cgra_positions) { + if (pos.isAdjacent(other_pos)) { + return true; + } + } + } + return false; + } +}; + +//===----------------------------------------------------------------------===// +// Task-Memory Graph +//===----------------------------------------------------------------------===// + +struct MemoryNode; + +/// Represents a Task node in the graph. +struct TaskNode { + size_t id; + TaskflowTaskOp op; + int alap_level = 0; + + // Edges + SmallVector read_memrefs; + SmallVector write_memrefs; + SmallVector ssa_users; + SmallVector ssa_operands; + + // Placement result + SmallVector placement; + + TaskNode(size_t id, TaskflowTaskOp op) : id(id), op(op) {} +}; + +/// Represents a Memory node (MemRef) in the graph. +struct MemoryNode { + Value memref; + + // Edges + SmallVector readers; + SmallVector writers; + + // Mapping result + int assigned_sram_id = -1; + + MemoryNode(Value memref) : memref(memref) {} +}; + +/// The Task-Memory Dependency Graph. +class TaskMemoryGraph { +public: + SmallVector> task_nodes; + SmallVector> memory_nodes; + DenseMap memref_to_node; + DenseMap op_to_node; + + void build(func::FuncOp func) { + // 1. Creates TaskNodes. + size_t task_id = 0; + func.walk([&](TaskflowTaskOp task) { + auto node = std::make_unique(task_id++, task); + op_to_node[task] = node.get(); + task_nodes.push_back(std::move(node)); + }); + + // 2. Creates MemoryNodes and defines Edges. + for (auto &t_node : task_nodes) { + // Memory Inputs (Reads) + for (Value input : t_node->op.getMemoryInputs()) { + MemoryNode *m_node = getOrCreateMemoryNode(input); + t_node->read_memrefs.push_back(m_node); + m_node->readers.push_back(t_node.get()); + } + + // Memory Outputs (Writes) + for (Value output : t_node->op.getMemoryOutputs()) { + MemoryNode *m_node = getOrCreateMemoryNode(output); + t_node->write_memrefs.push_back(m_node); + m_node->writers.push_back(t_node.get()); + } + } + + // 3. Builds SSA Edges (Inter-Task Value Dependencies). + // Identifies if a task uses a value produced by another task. + for (auto &consumer_node : task_nodes) { + // Iterate all operands for now to be safe. + for (Value operand : consumer_node->op->getOperands()) { + if (auto producer_op = operand.getDefiningOp()) { + if (auto *producer_node = op_to_node[producer_op]) { + producer_node->ssa_users.push_back(consumer_node.get()); + consumer_node->ssa_operands.push_back(producer_node); + } + } + } + } + } + +private: + MemoryNode *getOrCreateMemoryNode(Value memref) { + if (memref_to_node.count(memref)) + return memref_to_node[memref]; + + auto node = std::make_unique(memref); + MemoryNode *ptr = node.get(); + memref_to_node[memref] = ptr; + memory_nodes.push_back(std::move(node)); + return ptr; + } +}; + + +//===----------------------------------------------------------------------===// +// CGRA Placer +//===----------------------------------------------------------------------===// +/// Places ACTs onto a 2D CGRA grid with memory mapping. +class CGRAPlacer { +public: + CGRAPlacer(int grid_rows, int grid_cols) + : grid_rows_(grid_rows), grid_cols_(grid_cols) { + occupied_.resize(grid_rows_); + for (auto &row : occupied_) { + row.resize(grid_cols_, false); + } + } + + /// Places all tasks and performs memory mapping. + void place(func::FuncOp func) { + SmallVector tasks; + func.walk([&](TaskflowTaskOp task) { tasks.push_back(task); }); + + if (tasks.empty()) { + llvm::errs() << "No tasks to place.\n"; + return; + } + + + // Extracts counter chains and builds dependency graph. + // Builds Task-Memory Graph. + TaskMemoryGraph graph; + graph.build(func); + + if (graph.task_nodes.empty()) { + llvm::errs() << "No tasks to place.\n"; + return; + } + + // Computes ALAP Levels on the Task Graph. + computeALAP(graph); + + // Sorts tasks by ALAP level (Critical Path First). + SmallVector sorted_tasks; + for (auto &node : graph.task_nodes) sorted_tasks.push_back(node.get()); + + std::stable_sort(sorted_tasks.begin(), sorted_tasks.end(), + [](TaskNode *a, TaskNode *b) { + return a->alap_level > b->alap_level; + }); + + // Critical path priority placement: + // 1. Computes ALAP level for each task (longest path to sink). + // 2. Sorts tasks by: (a) ALAP level, (b) criticality, (c) degree. + // 3. Places tasks in sorted order with heuristic scoring. + // Placement Loop. + for (TaskNode *task_node : sorted_tasks) { + int cgra_count = 1; + if (auto attr = task_node->op->getAttrOfType("cgra_count")) { + cgra_count = attr.getInt(); + } + + // Finds Best Placement. + // Heuristic: Minimizes distance to: + // 1. SSA Producers (that are already placed). + // 2. SRAMs of Input MemRefs (if already assigned). + TaskPlacement placement = findBestPlacement(task_node, cgra_count, graph); + + // Commits Placement. + task_node->placement.push_back(placement.primary()); + // Handles multi-cgra if needed. + for (size_t i = 1; i < placement.cgra_positions.size(); ++i) { + task_node->placement.push_back(placement.cgra_positions[i]); + } + + // Marks Occupied. + for (const auto &pos : placement.cgra_positions) { + if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && pos.col < grid_cols_) + occupied_[pos.row][pos.col] = true; + } + + // Maps Associated Memory Nodes. + // For each MemRef this task touches, if not yet assigned to SRAM, assign to nearest. + for (MemoryNode *mem_node : task_node->read_memrefs) { + if (mem_node->assigned_sram_id == -1) { + mem_node->assigned_sram_id = assignSRAM(mem_node, placement); + } + } + for (MemoryNode *mem_node : task_node->write_memrefs) { + if (mem_node->assigned_sram_id == -1) { + mem_node->assigned_sram_id = assignSRAM(mem_node, placement); + } + } + } + + // Annotates Result. + OpBuilder builder(func.getContext()); + for (auto &task_node : graph.task_nodes) { + if (task_node->placement.empty()) continue; + CGRAPosition pos = task_node->placement[0]; + task_node->op->setAttr("cgra_row", builder.getI32IntegerAttr(pos.row)); + task_node->op->setAttr("cgra_col", builder.getI32IntegerAttr(pos.col)); + task_node->op->setAttr("cgra_count", builder.getI32IntegerAttr(task_node->placement.size())); + } + } + +private: + /// Assigns a memref to the closest SRAM near the given task position. + /// TODO: Integrate with Arch Spec to map logical SRAM IDs (row*100 + col) to + /// physical hardware Block IDs, especially for shared or asymmetric SRAMs. + int assignSRAM(MemoryNode *mem_node, const TaskPlacement &placement) { + if (mem_node->assigned_sram_id != -1) + return mem_node->assigned_sram_id; + + // Assigns to a new SRAM near the task's primary CGRA. + CGRAPosition pos = placement.primary(); + int sram_id = pos.row * 100 + pos.col; // Simple encoding: row*100 + col. + mem_node->assigned_sram_id = sram_id; + return sram_id; + } + +private: + /// Finds best placement for a task requiring cgra_count CGRAs. + /// TODO: Implement a block-search algorithm for tasks with cgra_count > 1 to + /// find contiguous rectangular regions instead of single tiles. + TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count, + TaskMemoryGraph &graph) { + int best_score = INT_MIN; + TaskPlacement best_placement; + + // Baseline: For cgra_count=1, finds single best position. + for (int r = 0; r < grid_rows_; ++r) { + for (int c = 0; c < grid_cols_; ++c) { + if (occupied_[r][c]) + continue; + + TaskPlacement candidate; + candidate.cgra_positions.push_back({r, c}); + + int score = computeScore(task_node, candidate, graph); + if (score > best_score) { + best_score = score; + best_placement = candidate; + } + } + } + + // Error handling: No available position found (grid over-subscribed). + if (best_placement.cgra_positions.empty()) { + llvm::errs() << "Warning: No available CGRA position for task " + << task_node->id << ". Grid is over-subscribed (" << grid_rows_ + << "x" << grid_cols_ << " grid with all cells occupied).\n"; + // Fallback: Assign to position (0,0) with a warning. + best_placement.cgra_positions.push_back({0, 0}); + } + + return best_placement; + } + + /// Computes placement score based on Task-Memory Graph. + /// TODO: Introduce explicit 'direct_wires' attributes in the IR for + /// downstream hardware generators to configure fast bypass paths between + /// adjacent PEs with dependencies. + /// + /// Score = α·SSA_Dist + β·Mem_Dist + γ·Balance + /// + /// SSA_Dist: Minimize distance to placed SSA predecessors (ssa_operands). + /// Mem_Dist: Minimize distance to assigned SRAMs for read/write memrefs. + int computeScore(TaskNode *task_node, const TaskPlacement &placement, + TaskMemoryGraph &graph) { + // Weight constants (tunable). + constexpr int kAlpha = 10; // SSA proximity weight + constexpr int kBeta = 50; // Memory proximity weight (High priority) + constexpr int kGamma = 20; // Load balance weight + + int ssa_score = 0; + int mem_score = 0; + int bal_score = 0; + + CGRAPosition current_pos = placement.primary(); + + // 1. SSA Proximity (Predecessors) + for (TaskNode *producer : task_node->ssa_operands) { + if (!producer->placement.empty()) { + int dist = current_pos.manhattanDistance(producer->placement[0]); + // Uses negative distance to penalize far-away placements. + ssa_score -= dist; + } + } + + // 2. Memory Proximity + // For Read MemRefs + for (MemoryNode *mem : task_node->read_memrefs) { + if (mem->assigned_sram_id != -1) { + // SRAM ID encoding: row*100 + col + int sram_r = mem->assigned_sram_id / 100; + int sram_c = mem->assigned_sram_id % 100; + CGRAPosition sram_pos{sram_r, sram_c}; + int dist = current_pos.manhattanDistance(sram_pos); + mem_score -= dist; + } + } + // For Write MemRefs + // If we write to a memory that is already assigned (e.g. read by previous task), + // we want to be close to it too. + for (MemoryNode *mem : task_node->write_memrefs) { + if (mem->assigned_sram_id != -1) { + int sram_r = mem->assigned_sram_id / 100; + int sram_c = mem->assigned_sram_id % 100; + CGRAPosition sram_pos{sram_r, sram_c}; + int dist = current_pos.manhattanDistance(sram_pos); + mem_score -= dist; + } + } + + // 3. Load Balance + // Prefers less crowded rows/cols. + int row_count = 0, col_count = 0; + for (int c = 0; c < grid_cols_; ++c) { if (occupied_[current_pos.row][c]) row_count++; } + for (int r = 0; r < grid_rows_; ++r) { if (occupied_[r][current_pos.col]) col_count++; } + bal_score = (grid_cols_ - row_count) + (grid_rows_ - col_count); + + return kAlpha * ssa_score + kBeta * mem_score + kGamma * bal_score; + } + + /// Computes ALAP levels for efficient scheduling order. + void computeALAP(TaskMemoryGraph &graph) { + // 1. Calculates in-degrees for topological sort simulation. + DenseMap in_degree; + for (auto &node : graph.task_nodes) { + for (TaskNode *user : node->ssa_users) in_degree[user]++; + } + + // 2. DFS for longest path from node to any sink (ALAP Level). + DenseMap memo; + for (auto &node : graph.task_nodes) { + node->alap_level = calculateLevel(node.get(), memo); + } + } + + int calculateLevel(TaskNode *node, DenseMap &memo) { + if (memo.count(node)) return memo[node]; + + int max_child_level = 0; + for (TaskNode *child : node->ssa_users) { + max_child_level = std::max(max_child_level, calculateLevel(child, memo) + 1); + } + + // Check memory dependencies too (Producer -> Mem -> Consumer) + for (MemoryNode *mem : node->write_memrefs) { + for (TaskNode *reader : mem->readers) { + if (reader != node) + max_child_level = std::max(max_child_level, calculateLevel(reader, memo) + 1); + } + } + + return memo[node] = max_child_level; + } + + + + int grid_rows_; + int grid_cols_; + std::vector> occupied_; +}; + +//===----------------------------------------------------------------------===// +// Pass Definition +//===----------------------------------------------------------------------===// +struct PlaceACTOnCGRAPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PlaceACTOnCGRAPass) + + PlaceACTOnCGRAPass() = default; + + StringRef getArgument() const override { return "place-act-on-cgra"; } + + StringRef getDescription() const override { + return "Places ACTs onto a 2D CGRA grid with adjacency optimization and " + "memory mapping."; + } + + void runOnOperation() override { + func::FuncOp func = getOperation(); + constexpr int kDefaultGridRows = 4; + constexpr int kDefaultGridCols = 4; + CGRAPlacer placer(kDefaultGridRows, kDefaultGridCols); + placer.place(func); + } +}; + +} // namespace + +namespace mlir { +namespace taskflow { + +std::unique_ptr createPlaceACTOnCGRAPass() { + return std::make_unique(); +} + +} // namespace taskflow +} // namespace mlir diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 9d1e6f46..7a8aa882 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -13,6 +13,13 @@ // RUN: -o %t.canonicalized.mlir // RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --place-act-on-cgra \ +// RUN: -o %t.placement.mlir +// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT + #set = affine_set<(d0, d1) : (d0 - 3 == 0, d1 - 7 == 0)> module attributes {} { func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { @@ -216,3 +223,9 @@ module attributes {} { // CANONICALIZE-NEXT: } // CANONICALIZE-NEXT: } +// PLACEMENT: task_name = "Task_0" +// PLACEMENT: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: task_name = "Task_1" +// PLACEMENT: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 +// PLACEMENT: task_name = "Task_2" +// PLACEMENT: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index c5f75f28..c878bbec 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -13,6 +13,13 @@ // RUN: -o %t.canonicalized.mlir // RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --place-act-on-cgra \ +// RUN: -o %t.placement.mlir +// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT + module attributes {} { func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { affine.for %arg10 = 0 to 4 { @@ -231,4 +238,15 @@ module attributes {} { // CANONICALIZE-NEXT: %0 = affine.load %memory_outputs_1[0] : memref // CANONICALIZE-NEXT: return %0 : i32 // CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: } \ No newline at end of file +// CANONICALIZE-NEXT: } + +// PLACEMENT: task_name = "Task_0" +// PLACEMENT: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: task_name = "Task_1" +// PLACEMENT: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 +// PLACEMENT: task_name = "Task_2" +// PLACEMENT: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: task_name = "Task_3" +// PLACEMENT: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 2 : i32 +// PLACEMENT: task_name = "Task_4" +// PLACEMENT: cgra_col = 3 : i32, cgra_count = 1 : i32, cgra_row = 2 : i32 \ No newline at end of file diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index ee37c831..d5c32cf7 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -13,6 +13,13 @@ // RUN: -o %t.canonicalized.mlir // RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --place-act-on-cgra \ +// RUN: -o %t.placement.mlir +// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT + module { // Example: Parallel nested loops scenario // Task 0: Single-level loop (vector scaling) @@ -133,4 +140,9 @@ module { // CANONICALIZE-NEXT: }) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> memref<8x8xf32> // CANONICALIZE-NEXT: return // CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: } \ No newline at end of file +// CANONICALIZE-NEXT: } + +// PLACEMENT: task_name = "Task_0" +// PLACEMENT: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: task_name = "Task_1" +// PLACEMENT: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 \ No newline at end of file From 4ead25c38c4e164407fe4bd86bf2f71bd0be7298 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Sun, 1 Feb 2026 06:10:21 +0800 Subject: [PATCH 02/15] chore: trigger PR sync and terminology update --- include/TaskflowDialect/TaskflowPasses.h | 5 +- include/TaskflowDialect/TaskflowPasses.td | 13 ++-- lib/TaskflowDialect/Transforms/CMakeLists.txt | 5 +- ...nCGRAPass.cpp => MapCTOnCGRAArrayPass.cpp} | 75 +++++++++++-------- test/e2e/bicg/bicg_int_kernel.mlir | 2 +- .../irregular-loop/irregular-loop.mlir | 2 +- .../taskflow/multi-nested/multi-nested.mlir | 2 +- .../parallel-nested/parallel-nested.mlir | 2 +- 8 files changed, 53 insertions(+), 53 deletions(-) rename lib/TaskflowDialect/Transforms/{PlaceACTOnCGRAPass.cpp => MapCTOnCGRAArrayPass.cpp} (86%) diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index 61073053..f766ceee 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -17,11 +17,8 @@ namespace taskflow { #include "TaskflowDialect/TaskflowPasses.h.inc" std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createCanonicalizeTaskPass(); -<<<<<<< HEAD std::unique_ptr createClassifyCountersPass(); -======= -std::unique_ptr createPlaceACTOnCGRAPass(); ->>>>>>> a0f7fc7 (feat(taskflow): implement graph-based ACT placement and memory management framework) +std::unique_ptr createMapCTOnCGRAArrayPass(); #define GEN_PASS_REGISTRATION #include "TaskflowDialect/TaskflowPasses.h.inc" diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index c01d8f0d..c7de281b 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -30,7 +30,6 @@ def CanonicalizeTask: Pass<"canonicalize-task", "func::FuncOp">{ let constructor = "taskflow::createCanonicalizeTaskPass()"; } -<<<<<<< HEAD def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{ let summary = "Classifies counters as root/relay/leaf"; let description = [{ @@ -43,19 +42,17 @@ def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{ Leaf counters are mapped to CGRA tile arrays. }]; let constructor = "taskflow::createClassifyCountersPass()"; -======= - +} -def PlaceACTOnCGRA : Pass<"place-act-on-cgra", "func::FuncOp"> { - let summary = "Places ACTs onto a 2D CGRA grid with adjacency optimization"; +def MapCTOnCGRAArray : Pass<"map-ct-on-cgra-array", "func::FuncOp"> { + let summary = "Maps Canonical Tasks (CTs) onto a 2D CGRA grid array"; let description = [{ - This pass places Atomic Canonical Tasks (ACTs) onto a 2D CGRA grid. + This pass maps Canonical Tasks (CTs) onto a 2D CGRA grid array. Fusion candidates (same-header SSA dependencies) are placed on adjacent CGRAs to enable direct data forwarding. Uses a default 4x4 CGRA grid. }]; - let constructor = "taskflow::createPlaceACTOnCGRAPass()"; ->>>>>>> a0f7fc7 (feat(taskflow): implement graph-based ACT placement and memory management framework) + let constructor = "taskflow::createMapCTOnCGRAArrayPass()"; } #endif // TASKFLOW_PASSES_TD \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index 5177f4ae..d777396e 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -3,11 +3,8 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp CanonicalizeTaskPass.cpp -<<<<<<< HEAD ClassifyCountersPass.cpp -======= - PlaceACTOnCGRAPass.cpp ->>>>>>> a0f7fc7 (feat(taskflow): implement graph-based ACT placement and memory management framework) + MapCTOnCGRAArrayPass.cpp DEPENDS MLIRTaskflowTransformsIncGen diff --git a/lib/TaskflowDialect/Transforms/PlaceACTOnCGRAPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp similarity index 86% rename from lib/TaskflowDialect/Transforms/PlaceACTOnCGRAPass.cpp rename to lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp index e0d67c78..2b478927 100644 --- a/lib/TaskflowDialect/Transforms/PlaceACTOnCGRAPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp @@ -1,8 +1,9 @@ -//===- PlaceACTOnCGRAPass.cpp - ACT to CGRA Placement Pass ----------------===// +//===- MapCTOnCGRAArrayPass.cpp - CT to CGRA Mapping Pass ----------------===// // -// This pass places Atomic Canonical Tasks (ACTs) onto a 2D CGRA grid: -// 1. SSA use-def placement: Tasks with SSA dependencies placed on adjacent CGRAs. -// 2. Memory mapping: Assigns memrefs to SRAMs (single-SRAM constraint per data). +// This pass maps Canonical Tasks (CTs) onto a 2D CGRA grid array: +// 1. Places tasks with SSA dependencies on adjacent CGRAs. +// 2. Assigns memrefs to SRAMs (each MemRef is assigned to exactly one SRAM, +// determined by proximity to the task that first accesses it). // //===----------------------------------------------------------------------===// @@ -92,9 +93,9 @@ struct TaskNode { TaskflowTaskOp op; int alap_level = 0; - // Edges - SmallVector read_memrefs; - SmallVector write_memrefs; + // Edges (Note: read/write naming refers to taskflow memory_inputs/outputs) + SmallVector read_memrefs; // taskflow.task memory_inputs (readiness triggers) + SmallVector write_memrefs; // taskflow.task memory_outputs (produce readiness) SmallVector ssa_users; SmallVector ssa_operands; @@ -137,16 +138,29 @@ class TaskMemoryGraph { // 2. Creates MemoryNodes and defines Edges. for (auto &t_node : task_nodes) { - // Memory Inputs (Reads) + // Memory Inputs (dependency for readiness) for (Value input : t_node->op.getMemoryInputs()) { MemoryNode *m_node = getOrCreateMemoryNode(input); t_node->read_memrefs.push_back(m_node); m_node->readers.push_back(t_node.get()); } - // Memory Outputs (Writes) + // Memory Outputs (produced readiness) + Operation *terminator = t_node->op.getBody().front().getTerminator(); for (Value output : t_node->op.getMemoryOutputs()) { - MemoryNode *m_node = getOrCreateMemoryNode(output); + unsigned res_idx = mlir::cast(output).getResultNumber(); + MemoryNode *m_node = nullptr; + + // Ensures the output corresponds to a yield operand. + assert(res_idx < terminator->getNumOperands() && "Invalid yield operand index"); + + Value source = terminator->getOperand(res_idx); + // Gets (or create if alloc inside) the node for the source value. + m_node = getOrCreateMemoryNode(source); + + // Maps the Output Result to the SAME node. + memref_to_node[output] = m_node; + t_node->write_memrefs.push_back(m_node); m_node->writers.push_back(t_node.get()); } @@ -155,7 +169,7 @@ class TaskMemoryGraph { // 3. Builds SSA Edges (Inter-Task Value Dependencies). // Identifies if a task uses a value produced by another task. for (auto &consumer_node : task_nodes) { - // Iterate all operands for now to be safe. + // Interates all operands for now to be safe. for (Value operand : consumer_node->op->getOperands()) { if (auto producer_op = operand.getDefiningOp()) { if (auto *producer_node = op_to_node[producer_op]) { @@ -298,10 +312,11 @@ class CGRAPlacer { return sram_id; } -private: - /// Finds best placement for a task requiring cgra_count CGRAs. - /// TODO: Implement a block-search algorithm for tasks with cgra_count > 1 to - /// find contiguous rectangular regions instead of single tiles. + + /// Finds best placement for a task. + /// TODO: Currently defaults to single-CGRA placement. Multi-CGRA binding logic + /// (cgra_count > 1) is experimental/placeholder and should ideally be handled + /// by an upstream resource binding pass. TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count, TaskMemoryGraph &graph) { int best_score = INT_MIN; @@ -402,15 +417,9 @@ class CGRAPlacer { return kAlpha * ssa_score + kBeta * mem_score + kGamma * bal_score; } - /// Computes ALAP levels for efficient scheduling order. + /// Computes ALAP levels considering both SSA and memory dependencies. void computeALAP(TaskMemoryGraph &graph) { - // 1. Calculates in-degrees for topological sort simulation. - DenseMap in_degree; - for (auto &node : graph.task_nodes) { - for (TaskNode *user : node->ssa_users) in_degree[user]++; - } - - // 2. DFS for longest path from node to any sink (ALAP Level). + // DFS for longest path from node to any sink (ALAP Level). DenseMap memo; for (auto &node : graph.task_nodes) { node->alap_level = calculateLevel(node.get(), memo); @@ -425,7 +434,7 @@ class CGRAPlacer { max_child_level = std::max(max_child_level, calculateLevel(child, memo) + 1); } - // Check memory dependencies too (Producer -> Mem -> Consumer) + // Checks memory dependencies too (Producer -> Mem -> Consumer). for (MemoryNode *mem : node->write_memrefs) { for (TaskNode *reader : mem->readers) { if (reader != node) @@ -446,17 +455,17 @@ class CGRAPlacer { //===----------------------------------------------------------------------===// // Pass Definition //===----------------------------------------------------------------------===// -struct PlaceACTOnCGRAPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PlaceACTOnCGRAPass) +struct MapCTOnCGRAArrayPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapCTOnCGRAArrayPass) - PlaceACTOnCGRAPass() = default; + MapCTOnCGRAArrayPass() = default; - StringRef getArgument() const override { return "place-act-on-cgra"; } + StringRef getArgument() const override { return "map-ct-on-cgra-array"; } StringRef getDescription() const override { - return "Places ACTs onto a 2D CGRA grid with adjacency optimization and " - "memory mapping."; + return "Maps Canonical Tasks (CTs) onto a 2D CGRA grid with adjacency " + "optimization and memory mapping."; } void runOnOperation() override { @@ -473,8 +482,8 @@ struct PlaceACTOnCGRAPass namespace mlir { namespace taskflow { -std::unique_ptr createPlaceACTOnCGRAPass() { - return std::make_unique(); +std::unique_ptr createMapCTOnCGRAArrayPass() { + return std::make_unique(); } } // namespace taskflow diff --git a/test/e2e/bicg/bicg_int_kernel.mlir b/test/e2e/bicg/bicg_int_kernel.mlir index 32f17705..f9aa4d3d 100644 --- a/test/e2e/bicg/bicg_int_kernel.mlir +++ b/test/e2e/bicg/bicg_int_kernel.mlir @@ -11,7 +11,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 7a8aa882..06d09b92 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -16,7 +16,7 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ // RUN: --canonicalize-task \ -// RUN: --place-act-on-cgra \ +// RUN: --map-ct-on-cgra-array \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index c878bbec..3f885541 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -16,7 +16,7 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ // RUN: --canonicalize-task \ -// RUN: --place-act-on-cgra \ +// RUN: --map-ct-on-cgra-array \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index d5c32cf7..8e05d825 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -16,7 +16,7 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ // RUN: --canonicalize-task \ -// RUN: --place-act-on-cgra \ +// RUN: --map-ct-on-cgra-array \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT From 4251f080b1b25c0aa9b52da43e3914437c4e6f12 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Wed, 4 Feb 2026 05:27:38 +0800 Subject: [PATCH 03/15] Optimize CGRA placement pass: handle memory aliasing with original memrefs, iterative 3x3 grid placement, and centroid-based SRAM allocation --- .../Transforms/MapCTOnCGRAArrayPass.cpp | 559 ++++++++++++++++++ .../irregular-loop/irregular-loop.mlir | 14 + .../taskflow/multi-nested/multi-nested.mlir | 21 +- .../parallel-nested/parallel-nested.mlir | 14 +- 4 files changed, 606 insertions(+), 2 deletions(-) create mode 100644 lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp new file mode 100644 index 00000000..8c3fd34e --- /dev/null +++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp @@ -0,0 +1,559 @@ +//===- MapCTOnCGRAArrayPass.cpp - CT to CGRA Mapping Pass ----------------===// +// +// This pass maps Canonical Tasks (CTs) onto a 2D CGRA grid array: +// 1. Places tasks with SSA dependencies on adjacent CGRAs. +// 2. Assigns memrefs to SRAMs (each MemRef is assigned to exactly one SRAM, +// determined by proximity to the task that first accesses it). +// +//===----------------------------------------------------------------------===// + +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowPasses.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Builders.h" +#include "mlir/Pass/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include +#include +#include + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { + +//===----------------------------------------------------------------------===// +// CGRA Grid Position +//===----------------------------------------------------------------------===// +/// Represents a position on the 2D CGRA grid. +struct CGRAPosition { + int row; + int col; + + bool operator==(const CGRAPosition &other) const { + return row == other.row && col == other.col; + } + + /// Computes Manhattan distance to another position. + int manhattanDistance(const CGRAPosition &other) const { + return std::abs(row - other.row) + std::abs(col - other.col); + } + + /// Checks if adjacent (Manhattan distance = 1). + bool isAdjacent(const CGRAPosition &other) const { + return manhattanDistance(other) == 1; + } +}; + +//===----------------------------------------------------------------------===// +// Task Placement Info +//===----------------------------------------------------------------------===// +/// Stores placement info for a task: can span multiple combined CGRAs. +struct TaskPlacement { + SmallVector cgra_positions; // CGRAs assigned to this task. + + /// Returns the primary (first) position. + CGRAPosition primary() const { + return cgra_positions.empty() ? CGRAPosition{-1, -1} : cgra_positions[0]; + } + + /// Returns the number of CGRAs assigned. + size_t cgraCount() const { return cgra_positions.size(); } + + /// Checks if any CGRA in this task is adjacent to any in other task. + bool hasAdjacentCGRA(const TaskPlacement &other) const { + for (const auto &pos : cgra_positions) { + for (const auto &other_pos : other.cgra_positions) { + if (pos.isAdjacent(other_pos)) { + return true; + } + } + } + return false; + } +}; + +//===----------------------------------------------------------------------===// +// Task-Memory Graph +//===----------------------------------------------------------------------===// + +struct MemoryNode; + +/// Represents a Task node in the graph. +struct TaskNode { + size_t id; + TaskflowTaskOp op; + int alap_level = 0; + + // Edges (Note: read/write naming refers to taskflow memory_inputs/outputs) + SmallVector read_memrefs; // taskflow.task memory_inputs (readiness triggers) + SmallVector write_memrefs; // taskflow.task memory_outputs (produce readiness) + SmallVector ssa_users; + SmallVector ssa_operands; + + // Placement result + SmallVector placement; + + TaskNode(size_t id, TaskflowTaskOp op) : id(id), op(op) {} +}; + +/// Represents a Memory node (MemRef) in the graph. +struct MemoryNode { + Value memref; + + // Edges + SmallVector readers; + SmallVector writers; + + // Mapping result + int assigned_sram_id = -1; + + MemoryNode(Value memref) : memref(memref) {} +}; + +/// The Task-Memory Dependency Graph. +class TaskMemoryGraph { +public: + SmallVector> task_nodes; + SmallVector> memory_nodes; + DenseMap memref_to_node; + DenseMap op_to_node; + + void build(func::FuncOp func) { + // 1. Creates TaskNodes. + size_t task_id = 0; + func.walk([&](TaskflowTaskOp task) { + auto node = std::make_unique(task_id++, task); + op_to_node[task] = node.get(); + task_nodes.push_back(std::move(node)); + }); + + // 2. Creates MemoryNodes using ORIGINAL memrefs (canonical identity). + // Uses original_read_memrefs/original_write_memrefs to ensure aliased + // memories share the same MemoryNode. + for (auto &t_node : task_nodes) { + // Uses original_read_memrefs for canonical memory identity. + for (Value orig_memref : t_node->op.getOriginalReadMemrefs()) { + MemoryNode *m_node = getOrCreateMemoryNode(orig_memref); + t_node->read_memrefs.push_back(m_node); + m_node->readers.push_back(t_node.get()); + } + + // Uses original_write_memrefs for canonical memory identity. + for (Value orig_memref : t_node->op.getOriginalWriteMemrefs()) { + MemoryNode *m_node = getOrCreateMemoryNode(orig_memref); + t_node->write_memrefs.push_back(m_node); + m_node->writers.push_back(t_node.get()); + } + } + + // 3. Builds SSA Edges (Inter-Task Value Dependencies). + // Identifies if a task uses a value produced by another task. + for (auto &consumer_node : task_nodes) { + // Interates all operands for now to be safe. + for (Value operand : consumer_node->op->getOperands()) { + if (auto producer_op = operand.getDefiningOp()) { + if (auto *producer_node = op_to_node[producer_op]) { + producer_node->ssa_users.push_back(consumer_node.get()); + consumer_node->ssa_operands.push_back(producer_node); + } + } + } + } + } + +private: + MemoryNode *getOrCreateMemoryNode(Value memref) { + if (memref_to_node.count(memref)) + return memref_to_node[memref]; + + auto node = std::make_unique(memref); + MemoryNode *ptr = node.get(); + memref_to_node[memref] = ptr; + memory_nodes.push_back(std::move(node)); + return ptr; + } +}; + +/// Prints the Task-Memory graph in DOT format for visualization. +void printGraphDOT(TaskMemoryGraph &graph, llvm::raw_ostream &os) { + os << "digraph TaskMemGraph {\n"; + os << " rankdir=TB;\n"; + // Task nodes (circles). + for (auto &t : graph.task_nodes) { + os << " T" << t->id << " [shape=circle, label=\"" << t->id << "\"];\n"; + } + // Memory nodes (rectangles). + for (size_t i = 0; i < graph.memory_nodes.size(); ++i) { + os << " M" << i << " [shape=box, label=\"mem" << i << "\"];\n"; + } + // Edges: Task -> Memory (write) and Memory -> Task (read). + for (auto &t : graph.task_nodes) { + for (size_t i = 0; i < graph.memory_nodes.size(); ++i) { + MemoryNode *m = graph.memory_nodes[i].get(); + for (auto *writer : m->writers) { + if (writer == t.get()) { + os << " T" << t->id << " -> M" << i << ";\n"; + } + } + for (auto *reader : m->readers) { + if (reader == t.get()) { + os << " M" << i << " -> T" << t->id << ";\n"; + } + } + } + } + os << "}\n"; +} + + +//===----------------------------------------------------------------------===// +// CGRA Placer +//===----------------------------------------------------------------------===// +/// Places ACTs onto a 2D CGRA grid with memory mapping. +class CGRAPlacer { +public: + CGRAPlacer(int grid_rows, int grid_cols) + : grid_rows_(grid_rows), grid_cols_(grid_cols) { + occupied_.resize(grid_rows_); + for (auto &row : occupied_) { + row.resize(grid_cols_, false); + } + } + + /// Places all tasks and performs memory mapping. + void place(func::FuncOp func) { + SmallVector tasks; + func.walk([&](TaskflowTaskOp task) { tasks.push_back(task); }); + + if (tasks.empty()) { + llvm::errs() << "No tasks to place.\n"; + return; + } + + + // Extracts counter chains and builds dependency graph. + // Builds Task-Memory Graph. + TaskMemoryGraph graph; + graph.build(func); + + // Prints graph visualization to stderr for debugging. + // llvm::errs() << "\n=== Task-Memory Graph (DOT format) ===\n"; + // printGraphDOT(graph, llvm::errs()); + // llvm::errs() << "=== Graph Stats: " << graph.task_nodes.size() << " tasks, " + // << graph.memory_nodes.size() << " memories ===\n\n"; + + if (graph.task_nodes.empty()) { + llvm::errs() << "No tasks to place.\n"; + return; + } + + // Computes ALAP Levels on the Task Graph. + computeALAP(graph); + + // Sorts tasks by ALAP level (Critical Path First). + SmallVector sorted_tasks; + for (auto &node : graph.task_nodes) sorted_tasks.push_back(node.get()); + + std::stable_sort(sorted_tasks.begin(), sorted_tasks.end(), + [](TaskNode *a, TaskNode *b) { + return a->alap_level > b->alap_level; + }); + + // Critical path priority placement: + // 1. Computes ALAP level for each task (longest path to sink). + // 2. Sorts tasks by: (a) ALAP level, (b) criticality, (c) degree. + // 3. Places tasks in sorted order with heuristic scoring. + // Iterative Refinement Loop (Coordinate Descent). + // Alternates between Task Placement (Phase 1) and SRAM Assignment (Phase 2). + constexpr int kMaxIterations = 10; + + llvm::errs() << "\n=== Starting Iterative Placement (Max " << kMaxIterations << ") ===\n"; + + for (int iter = 0; iter < kMaxIterations; ++iter) { + // Phase 1: Place Tasks (assuming fixed SRAMs). + if (iter > 0) resetTaskPlacements(graph); + + for (TaskNode *task_node : sorted_tasks) { + int cgra_count = 1; + if (auto attr = task_node->op->getAttrOfType("cgra_count")) { + cgra_count = attr.getInt(); + } + + // Finds Best Placement using SRAM positions from previous iter (or -1/default). + TaskPlacement placement = findBestPlacement(task_node, cgra_count, graph); + + // Commits Placement. + task_node->placement.push_back(placement.primary()); + // Handles multi-cgra if needed. + for (size_t i = 1; i < placement.cgra_positions.size(); ++i) { + task_node->placement.push_back(placement.cgra_positions[i]); + } + + // Marks Occupied. + for (const auto &pos : placement.cgra_positions) { + if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && pos.col < grid_cols_) + occupied_[pos.row][pos.col] = true; + } + } + + // Phase 2: Assign SRAMs (assuming fixed Tasks). + bool sram_moved = assignAllSRAMs(graph); + + llvm::errs() << "Iter " << iter << ": SRAMs moved = " << (sram_moved ? "Yes" : "No") << "\n"; + + // Convergence Check. + // If SRAMs didn't move, it means task placement based on them likely won't change either. + if (iter > 0 && !sram_moved) { + llvm::errs() << "Converged at iteration " << iter << ".\n"; + break; + } + } + + // Annotates Result. + OpBuilder builder(func.getContext()); + for (auto &task_node : graph.task_nodes) { + if (task_node->placement.empty()) continue; + CGRAPosition pos = task_node->placement[0]; + task_node->op->setAttr("cgra_row", builder.getI32IntegerAttr(pos.row)); + task_node->op->setAttr("cgra_col", builder.getI32IntegerAttr(pos.col)); + task_node->op->setAttr("cgra_count", builder.getI32IntegerAttr(task_node->placement.size())); + } + } + +private: + /// Clears task placement and occupied grid. + void resetTaskPlacements(TaskMemoryGraph &graph) { + for (auto &task : graph.task_nodes) { + task->placement.clear(); + } + // Clears grid. + for (int r = 0; r < grid_rows_; ++r) { + std::fill(occupied_[r].begin(), occupied_[r].end(), false); + } + } + + /// Assigns all memory nodes to SRAMs based on centroid of accessing tasks. + /// Returns true if any SRAM assignment changed. + bool assignAllSRAMs(TaskMemoryGraph &graph) { + bool changed = false; + for (auto &mem_node : graph.memory_nodes) { + // Computes centroid of all tasks that access this memory. + int total_row = 0, total_col = 0, count = 0; + for (TaskNode *reader : mem_node->readers) { + if (!reader->placement.empty()) { + total_row += reader->placement[0].row; + total_col += reader->placement[0].col; + count++; + } + } + for (TaskNode *writer : mem_node->writers) { + if (!writer->placement.empty()) { + total_row += writer->placement[0].row; + total_col += writer->placement[0].col; + count++; + } + } + + int new_sram_id = 0; + if (count > 0) { + // Rounds to the nearest integer. + int avg_row = (total_row + count / 2) / count; + int avg_col = (total_col + count / 2) / count; + new_sram_id = avg_row * 100 + avg_col; + } else { + new_sram_id = 0; // Default fallback + } + + if (mem_node->assigned_sram_id != new_sram_id) { + mem_node->assigned_sram_id = new_sram_id; + changed = true; + } + } + return changed; + } + + + /// Finds best placement for a task. + /// TODO: Currently defaults to single-CGRA placement. Multi-CGRA binding logic + /// (cgra_count > 1) is experimental/placeholder and should ideally be handled + /// by an upstream resource binding pass. + TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count, + TaskMemoryGraph &graph) { + int best_score = INT_MIN; + TaskPlacement best_placement; + + // Baseline: For cgra_count=1, finds single best position. + for (int r = 0; r < grid_rows_; ++r) { + for (int c = 0; c < grid_cols_; ++c) { + if (occupied_[r][c]) + continue; + + TaskPlacement candidate; + candidate.cgra_positions.push_back({r, c}); + + int score = computeScore(task_node, candidate, graph); + if (score > best_score) { + best_score = score; + best_placement = candidate; + } + } + } + + // Error handling: No available position found (grid over-subscribed). + if (best_placement.cgra_positions.empty()) { + llvm::errs() << "Warning: No available CGRA position for task " + << task_node->id << ". Grid is over-subscribed (" << grid_rows_ + << "x" << grid_cols_ << " grid with all cells occupied).\n"; + // Fallback: Assign to position (0,0) with a warning. + best_placement.cgra_positions.push_back({0, 0}); + } + + return best_placement; + } + + /// Computes placement score based on Task-Memory Graph. + /// TODO: Introduce explicit 'direct_wires' attributes in the IR for + /// downstream hardware generators to configure fast bypass paths between + /// adjacent PEs with dependencies. + /// + /// Score = α·SSA_Dist + β·Mem_Dist + γ·Balance + /// + /// SSA_Dist: Minimize distance to placed SSA predecessors (ssa_operands). + /// Mem_Dist: Minimize distance to assigned SRAMs for read/write memrefs. + int computeScore(TaskNode *task_node, const TaskPlacement &placement, + TaskMemoryGraph &graph) { + // Weight constants (tunable). + constexpr int kAlpha = 10; // SSA proximity weight + constexpr int kBeta = 50; // Memory proximity weight (High priority) + constexpr int kGamma = 20; // Load balance weight + + int ssa_score = 0; + int mem_score = 0; + int bal_score = 0; + + CGRAPosition current_pos = placement.primary(); + + // 1. SSA Proximity (Predecessors) + for (TaskNode *producer : task_node->ssa_operands) { + if (!producer->placement.empty()) { + int dist = current_pos.manhattanDistance(producer->placement[0]); + // Uses negative distance to penalize far-away placements. + ssa_score -= dist; + } + } + + // 2. Memory Proximity + // For Read MemRefs + for (MemoryNode *mem : task_node->read_memrefs) { + if (mem->assigned_sram_id != -1) { + // SRAM ID encoding: row*100 + col + int sram_r = mem->assigned_sram_id / 100; + int sram_c = mem->assigned_sram_id % 100; + CGRAPosition sram_pos{sram_r, sram_c}; + int dist = current_pos.manhattanDistance(sram_pos); + mem_score -= dist; + } + } + // For Write MemRefs + // If we write to a memory that is already assigned (e.g. read by previous task), + // we want to be close to it too. + for (MemoryNode *mem : task_node->write_memrefs) { + if (mem->assigned_sram_id != -1) { + int sram_r = mem->assigned_sram_id / 100; + int sram_c = mem->assigned_sram_id % 100; + CGRAPosition sram_pos{sram_r, sram_c}; + int dist = current_pos.manhattanDistance(sram_pos); + mem_score -= dist; + } + } + + // 3. Load Balance + // Prefers less crowded rows/cols. + int row_count = 0, col_count = 0; + for (int c = 0; c < grid_cols_; ++c) { if (occupied_[current_pos.row][c]) row_count++; } + for (int r = 0; r < grid_rows_; ++r) { if (occupied_[r][current_pos.col]) col_count++; } + bal_score = (grid_cols_ - row_count) + (grid_rows_ - col_count); + + return kAlpha * ssa_score + kBeta * mem_score + kGamma * bal_score; + } + + /// Computes ALAP levels considering both SSA and memory dependencies. + void computeALAP(TaskMemoryGraph &graph) { + // DFS for longest path from node to any sink (ALAP Level). + DenseMap memo; + for (auto &node : graph.task_nodes) { + node->alap_level = calculateLevel(node.get(), memo); + } + } + + int calculateLevel(TaskNode *node, DenseMap &memo) { + if (memo.count(node)) return memo[node]; + + int max_child_level = 0; + for (TaskNode *child : node->ssa_users) { + max_child_level = std::max(max_child_level, calculateLevel(child, memo) + 1); + } + + // Checks memory dependencies too (Producer -> Mem -> Consumer). + for (MemoryNode *mem : node->write_memrefs) { + for (TaskNode *reader : mem->readers) { + if (reader != node) + max_child_level = std::max(max_child_level, calculateLevel(reader, memo) + 1); + } + } + + return memo[node] = max_child_level; + } + + + + int grid_rows_; + int grid_cols_; + std::vector> occupied_; +}; + +//===----------------------------------------------------------------------===// +// Pass Definition +//===----------------------------------------------------------------------===// +struct MapCTOnCGRAArrayPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapCTOnCGRAArrayPass) + + MapCTOnCGRAArrayPass() = default; + + StringRef getArgument() const override { return "map-ct-on-cgra-array"; } + + StringRef getDescription() const override { + return "Maps Canonical Tasks (CTs) onto a 2D CGRA grid with adjacency " + "optimization and memory mapping."; + } + + void runOnOperation() override { + func::FuncOp func = getOperation(); + constexpr int kDefaultGridRows = 3; + constexpr int kDefaultGridCols = 3; + CGRAPlacer placer(kDefaultGridRows, kDefaultGridCols); + placer.place(func); + } +}; + +} // namespace + +namespace mlir { +namespace taskflow { + +std::unique_ptr createMapCTOnCGRAArrayPass() { + return std::make_unique(); +} + +} // namespace taskflow +} // namespace mlir diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 515322ee..2510b28e 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -19,6 +19,13 @@ // RUN: -o %t.hyperblock.mlir // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --map-ct-on-cgra-array \ +// RUN: -o %t.placement.mlir +// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT + #set = affine_set<(d0, d1) : (d0 - 3 == 0, d1 - 7 == 0)> module attributes {} { func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { @@ -301,3 +308,10 @@ module attributes {} { // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: } +// PLACEMENT: taskflow.task @Task_0 +// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: taskflow.task @Task_1 +// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: taskflow.task @Task_2 +// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 + diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index 7de1eb55..9cd26dc6 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -19,6 +19,13 @@ // RUN: -o %t.hyperblock.mlir // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --map-ct-on-cgra-array \ +// RUN: -o %t.placement.mlir +// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT + module attributes {} { func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { affine.for %arg10 = 0 to 4 { @@ -366,4 +373,16 @@ module attributes {} { // HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_1[0] : memref // HYPERBLOCK-NEXT: return %0 : i32 // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT:} \ No newline at end of file + +// HYPERBLOCK-NEXT:} + +// PLACEMENT: taskflow.task @Task_0 +// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: taskflow.task @Task_1 +// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 +// PLACEMENT: taskflow.task @Task_2 +// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: taskflow.task @Task_3 +// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 2 : i32 +// PLACEMENT: taskflow.task @Task_4 +// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 \ No newline at end of file diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index 7655769e..d43363f6 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -13,6 +13,13 @@ // RUN: -o %t.hyperblock.mlir // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --map-ct-on-cgra-array \ +// RUN: -o %t.placement.mlir +// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT + module { // Example: Parallel nested loops scenario // Task 0: Single-level loop (vector scaling) @@ -120,4 +127,9 @@ module { // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: return // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: } \ No newline at end of file +// HYPERBLOCK-NEXT: } + +// PLACEMENT: taskflow.task @Task_0 +// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: taskflow.task @Task_1 +// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 \ No newline at end of file From a690d4fb266a1c7b0e558df94e1c4cd05db068fe Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Wed, 4 Feb 2026 05:27:38 +0800 Subject: [PATCH 04/15] Optimize CGRA placement pass: handle memory aliasing with original memrefs, iterative 3x3 grid placement, and centroid-based SRAM allocation --- .../Transforms/MapCTOnCGRAArrayPass.cpp | 207 ++++++++++++------ .../irregular-loop/irregular-loop.mlir | 14 ++ .../taskflow/multi-nested/multi-nested.mlir | 21 +- .../parallel-nested/parallel-nested.mlir | 14 +- 4 files changed, 185 insertions(+), 71 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp index 2b478927..8c3fd34e 100644 --- a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp @@ -136,31 +136,20 @@ class TaskMemoryGraph { task_nodes.push_back(std::move(node)); }); - // 2. Creates MemoryNodes and defines Edges. + // 2. Creates MemoryNodes using ORIGINAL memrefs (canonical identity). + // Uses original_read_memrefs/original_write_memrefs to ensure aliased + // memories share the same MemoryNode. for (auto &t_node : task_nodes) { - // Memory Inputs (dependency for readiness) - for (Value input : t_node->op.getMemoryInputs()) { - MemoryNode *m_node = getOrCreateMemoryNode(input); + // Uses original_read_memrefs for canonical memory identity. + for (Value orig_memref : t_node->op.getOriginalReadMemrefs()) { + MemoryNode *m_node = getOrCreateMemoryNode(orig_memref); t_node->read_memrefs.push_back(m_node); m_node->readers.push_back(t_node.get()); } - // Memory Outputs (produced readiness) - Operation *terminator = t_node->op.getBody().front().getTerminator(); - for (Value output : t_node->op.getMemoryOutputs()) { - unsigned res_idx = mlir::cast(output).getResultNumber(); - MemoryNode *m_node = nullptr; - - // Ensures the output corresponds to a yield operand. - assert(res_idx < terminator->getNumOperands() && "Invalid yield operand index"); - - Value source = terminator->getOperand(res_idx); - // Gets (or create if alloc inside) the node for the source value. - m_node = getOrCreateMemoryNode(source); - - // Maps the Output Result to the SAME node. - memref_to_node[output] = m_node; - + // Uses original_write_memrefs for canonical memory identity. + for (Value orig_memref : t_node->op.getOriginalWriteMemrefs()) { + MemoryNode *m_node = getOrCreateMemoryNode(orig_memref); t_node->write_memrefs.push_back(m_node); m_node->writers.push_back(t_node.get()); } @@ -194,6 +183,37 @@ class TaskMemoryGraph { } }; +/// Prints the Task-Memory graph in DOT format for visualization. +void printGraphDOT(TaskMemoryGraph &graph, llvm::raw_ostream &os) { + os << "digraph TaskMemGraph {\n"; + os << " rankdir=TB;\n"; + // Task nodes (circles). + for (auto &t : graph.task_nodes) { + os << " T" << t->id << " [shape=circle, label=\"" << t->id << "\"];\n"; + } + // Memory nodes (rectangles). + for (size_t i = 0; i < graph.memory_nodes.size(); ++i) { + os << " M" << i << " [shape=box, label=\"mem" << i << "\"];\n"; + } + // Edges: Task -> Memory (write) and Memory -> Task (read). + for (auto &t : graph.task_nodes) { + for (size_t i = 0; i < graph.memory_nodes.size(); ++i) { + MemoryNode *m = graph.memory_nodes[i].get(); + for (auto *writer : m->writers) { + if (writer == t.get()) { + os << " T" << t->id << " -> M" << i << ";\n"; + } + } + for (auto *reader : m->readers) { + if (reader == t.get()) { + os << " M" << i << " -> T" << t->id << ";\n"; + } + } + } + } + os << "}\n"; +} + //===----------------------------------------------------------------------===// // CGRA Placer @@ -225,6 +245,12 @@ class CGRAPlacer { TaskMemoryGraph graph; graph.build(func); + // Prints graph visualization to stderr for debugging. + // llvm::errs() << "\n=== Task-Memory Graph (DOT format) ===\n"; + // printGraphDOT(graph, llvm::errs()); + // llvm::errs() << "=== Graph Stats: " << graph.task_nodes.size() << " tasks, " + // << graph.memory_nodes.size() << " memories ===\n\n"; + if (graph.task_nodes.empty()) { llvm::errs() << "No tasks to place.\n"; return; @@ -246,44 +272,50 @@ class CGRAPlacer { // 1. Computes ALAP level for each task (longest path to sink). // 2. Sorts tasks by: (a) ALAP level, (b) criticality, (c) degree. // 3. Places tasks in sorted order with heuristic scoring. - // Placement Loop. - for (TaskNode *task_node : sorted_tasks) { - int cgra_count = 1; - if (auto attr = task_node->op->getAttrOfType("cgra_count")) { - cgra_count = attr.getInt(); - } - - // Finds Best Placement. - // Heuristic: Minimizes distance to: - // 1. SSA Producers (that are already placed). - // 2. SRAMs of Input MemRefs (if already assigned). - TaskPlacement placement = findBestPlacement(task_node, cgra_count, graph); - - // Commits Placement. - task_node->placement.push_back(placement.primary()); - // Handles multi-cgra if needed. - for (size_t i = 1; i < placement.cgra_positions.size(); ++i) { - task_node->placement.push_back(placement.cgra_positions[i]); - } + // Iterative Refinement Loop (Coordinate Descent). + // Alternates between Task Placement (Phase 1) and SRAM Assignment (Phase 2). + constexpr int kMaxIterations = 10; + + llvm::errs() << "\n=== Starting Iterative Placement (Max " << kMaxIterations << ") ===\n"; + + for (int iter = 0; iter < kMaxIterations; ++iter) { + // Phase 1: Place Tasks (assuming fixed SRAMs). + if (iter > 0) resetTaskPlacements(graph); + + for (TaskNode *task_node : sorted_tasks) { + int cgra_count = 1; + if (auto attr = task_node->op->getAttrOfType("cgra_count")) { + cgra_count = attr.getInt(); + } + + // Finds Best Placement using SRAM positions from previous iter (or -1/default). + TaskPlacement placement = findBestPlacement(task_node, cgra_count, graph); + + // Commits Placement. + task_node->placement.push_back(placement.primary()); + // Handles multi-cgra if needed. + for (size_t i = 1; i < placement.cgra_positions.size(); ++i) { + task_node->placement.push_back(placement.cgra_positions[i]); + } + + // Marks Occupied. + for (const auto &pos : placement.cgra_positions) { + if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && pos.col < grid_cols_) + occupied_[pos.row][pos.col] = true; + } + } - // Marks Occupied. - for (const auto &pos : placement.cgra_positions) { - if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && pos.col < grid_cols_) - occupied_[pos.row][pos.col] = true; - } + // Phase 2: Assign SRAMs (assuming fixed Tasks). + bool sram_moved = assignAllSRAMs(graph); + + llvm::errs() << "Iter " << iter << ": SRAMs moved = " << (sram_moved ? "Yes" : "No") << "\n"; - // Maps Associated Memory Nodes. - // For each MemRef this task touches, if not yet assigned to SRAM, assign to nearest. - for (MemoryNode *mem_node : task_node->read_memrefs) { - if (mem_node->assigned_sram_id == -1) { - mem_node->assigned_sram_id = assignSRAM(mem_node, placement); - } - } - for (MemoryNode *mem_node : task_node->write_memrefs) { - if (mem_node->assigned_sram_id == -1) { - mem_node->assigned_sram_id = assignSRAM(mem_node, placement); + // Convergence Check. + // If SRAMs didn't move, it means task placement based on them likely won't change either. + if (iter > 0 && !sram_moved) { + llvm::errs() << "Converged at iteration " << iter << ".\n"; + break; } - } } // Annotates Result. @@ -298,18 +330,55 @@ class CGRAPlacer { } private: - /// Assigns a memref to the closest SRAM near the given task position. - /// TODO: Integrate with Arch Spec to map logical SRAM IDs (row*100 + col) to - /// physical hardware Block IDs, especially for shared or asymmetric SRAMs. - int assignSRAM(MemoryNode *mem_node, const TaskPlacement &placement) { - if (mem_node->assigned_sram_id != -1) - return mem_node->assigned_sram_id; - - // Assigns to a new SRAM near the task's primary CGRA. - CGRAPosition pos = placement.primary(); - int sram_id = pos.row * 100 + pos.col; // Simple encoding: row*100 + col. - mem_node->assigned_sram_id = sram_id; - return sram_id; + /// Clears task placement and occupied grid. + void resetTaskPlacements(TaskMemoryGraph &graph) { + for (auto &task : graph.task_nodes) { + task->placement.clear(); + } + // Clears grid. + for (int r = 0; r < grid_rows_; ++r) { + std::fill(occupied_[r].begin(), occupied_[r].end(), false); + } + } + + /// Assigns all memory nodes to SRAMs based on centroid of accessing tasks. + /// Returns true if any SRAM assignment changed. + bool assignAllSRAMs(TaskMemoryGraph &graph) { + bool changed = false; + for (auto &mem_node : graph.memory_nodes) { + // Computes centroid of all tasks that access this memory. + int total_row = 0, total_col = 0, count = 0; + for (TaskNode *reader : mem_node->readers) { + if (!reader->placement.empty()) { + total_row += reader->placement[0].row; + total_col += reader->placement[0].col; + count++; + } + } + for (TaskNode *writer : mem_node->writers) { + if (!writer->placement.empty()) { + total_row += writer->placement[0].row; + total_col += writer->placement[0].col; + count++; + } + } + + int new_sram_id = 0; + if (count > 0) { + // Rounds to the nearest integer. + int avg_row = (total_row + count / 2) / count; + int avg_col = (total_col + count / 2) / count; + new_sram_id = avg_row * 100 + avg_col; + } else { + new_sram_id = 0; // Default fallback + } + + if (mem_node->assigned_sram_id != new_sram_id) { + mem_node->assigned_sram_id = new_sram_id; + changed = true; + } + } + return changed; } @@ -470,8 +539,8 @@ struct MapCTOnCGRAArrayPass void runOnOperation() override { func::FuncOp func = getOperation(); - constexpr int kDefaultGridRows = 4; - constexpr int kDefaultGridCols = 4; + constexpr int kDefaultGridRows = 3; + constexpr int kDefaultGridCols = 3; CGRAPlacer placer(kDefaultGridRows, kDefaultGridCols); placer.place(func); } diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 515322ee..2510b28e 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -19,6 +19,13 @@ // RUN: -o %t.hyperblock.mlir // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --map-ct-on-cgra-array \ +// RUN: -o %t.placement.mlir +// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT + #set = affine_set<(d0, d1) : (d0 - 3 == 0, d1 - 7 == 0)> module attributes {} { func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { @@ -301,3 +308,10 @@ module attributes {} { // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: } +// PLACEMENT: taskflow.task @Task_0 +// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: taskflow.task @Task_1 +// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: taskflow.task @Task_2 +// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 + diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index 7de1eb55..9cd26dc6 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -19,6 +19,13 @@ // RUN: -o %t.hyperblock.mlir // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --map-ct-on-cgra-array \ +// RUN: -o %t.placement.mlir +// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT + module attributes {} { func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { affine.for %arg10 = 0 to 4 { @@ -366,4 +373,16 @@ module attributes {} { // HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_1[0] : memref // HYPERBLOCK-NEXT: return %0 : i32 // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT:} \ No newline at end of file + +// HYPERBLOCK-NEXT:} + +// PLACEMENT: taskflow.task @Task_0 +// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: taskflow.task @Task_1 +// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 +// PLACEMENT: taskflow.task @Task_2 +// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: taskflow.task @Task_3 +// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 2 : i32 +// PLACEMENT: taskflow.task @Task_4 +// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 \ No newline at end of file diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index 7655769e..d43363f6 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -13,6 +13,13 @@ // RUN: -o %t.hyperblock.mlir // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --map-ct-on-cgra-array \ +// RUN: -o %t.placement.mlir +// RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT + module { // Example: Parallel nested loops scenario // Task 0: Single-level loop (vector scaling) @@ -120,4 +127,9 @@ module { // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: return // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: } \ No newline at end of file +// HYPERBLOCK-NEXT: } + +// PLACEMENT: taskflow.task @Task_0 +// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: taskflow.task @Task_1 +// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 \ No newline at end of file From ca8fdf1d28f83989394d88dc5bbdd99f61d7a0ca Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Wed, 4 Feb 2026 05:40:59 +0800 Subject: [PATCH 05/15] chore: update dot visualization labels with coordinates and comment out debug logs --- .../Transforms/MapCTOnCGRAArrayPass.cpp | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp index 8c3fd34e..fbabdb44 100644 --- a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp @@ -189,11 +189,22 @@ void printGraphDOT(TaskMemoryGraph &graph, llvm::raw_ostream &os) { os << " rankdir=TB;\n"; // Task nodes (circles). for (auto &t : graph.task_nodes) { - os << " T" << t->id << " [shape=circle, label=\"" << t->id << "\"];\n"; + std::string pos_str = ""; + if (!t->placement.empty()) { + pos_str = "\\n(" + std::to_string(t->placement[0].row) + "," + std::to_string(t->placement[0].col) + ")"; + } + os << " T" << t->id << " [shape=circle, label=\"Task" << t->id << pos_str << "\"];\n"; } // Memory nodes (rectangles). for (size_t i = 0; i < graph.memory_nodes.size(); ++i) { - os << " M" << i << " [shape=box, label=\"mem" << i << "\"];\n"; + MemoryNode *m = graph.memory_nodes[i].get(); + std::string sram_str = ""; + if (m->assigned_sram_id != -1) { + int r = m->assigned_sram_id / 100; + int c = m->assigned_sram_id % 100; + sram_str = "\\nSRAM(" + std::to_string(r) + "," + std::to_string(c) + ")"; + } + os << " M" << i << " [shape=box, label=\"Mem" << i << sram_str << "\"];\n"; } // Edges: Task -> Memory (write) and Memory -> Task (read). for (auto &t : graph.task_nodes) { @@ -318,6 +329,11 @@ class CGRAPlacer { } } + // Prints final graph visualization to stderr. + // llvm::errs() << "\n=== Final Task-Memory Mapping (DOT format) ===\n"; + // printGraphDOT(graph, llvm::errs()); + // llvm::errs() << "===============================================\n\n"; + // Annotates Result. OpBuilder builder(func.getContext()); for (auto &task_node : graph.task_nodes) { From 82d26b029760778a66d126f503ffcc47da1cf0d5 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Wed, 4 Feb 2026 05:46:52 +0800 Subject: [PATCH 06/15] clean: remove all visualization and debug log code from MapCTOnCGRAArrayPass --- .../Transforms/MapCTOnCGRAArrayPass.cpp | 58 ++----------------- 1 file changed, 5 insertions(+), 53 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp index fbabdb44..c56613af 100644 --- a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp @@ -183,47 +183,6 @@ class TaskMemoryGraph { } }; -/// Prints the Task-Memory graph in DOT format for visualization. -void printGraphDOT(TaskMemoryGraph &graph, llvm::raw_ostream &os) { - os << "digraph TaskMemGraph {\n"; - os << " rankdir=TB;\n"; - // Task nodes (circles). - for (auto &t : graph.task_nodes) { - std::string pos_str = ""; - if (!t->placement.empty()) { - pos_str = "\\n(" + std::to_string(t->placement[0].row) + "," + std::to_string(t->placement[0].col) + ")"; - } - os << " T" << t->id << " [shape=circle, label=\"Task" << t->id << pos_str << "\"];\n"; - } - // Memory nodes (rectangles). - for (size_t i = 0; i < graph.memory_nodes.size(); ++i) { - MemoryNode *m = graph.memory_nodes[i].get(); - std::string sram_str = ""; - if (m->assigned_sram_id != -1) { - int r = m->assigned_sram_id / 100; - int c = m->assigned_sram_id % 100; - sram_str = "\\nSRAM(" + std::to_string(r) + "," + std::to_string(c) + ")"; - } - os << " M" << i << " [shape=box, label=\"Mem" << i << sram_str << "\"];\n"; - } - // Edges: Task -> Memory (write) and Memory -> Task (read). - for (auto &t : graph.task_nodes) { - for (size_t i = 0; i < graph.memory_nodes.size(); ++i) { - MemoryNode *m = graph.memory_nodes[i].get(); - for (auto *writer : m->writers) { - if (writer == t.get()) { - os << " T" << t->id << " -> M" << i << ";\n"; - } - } - for (auto *reader : m->readers) { - if (reader == t.get()) { - os << " M" << i << " -> T" << t->id << ";\n"; - } - } - } - } - os << "}\n"; -} //===----------------------------------------------------------------------===// @@ -256,11 +215,7 @@ class CGRAPlacer { TaskMemoryGraph graph; graph.build(func); - // Prints graph visualization to stderr for debugging. - // llvm::errs() << "\n=== Task-Memory Graph (DOT format) ===\n"; - // printGraphDOT(graph, llvm::errs()); - // llvm::errs() << "=== Graph Stats: " << graph.task_nodes.size() << " tasks, " - // << graph.memory_nodes.size() << " memories ===\n\n"; + if (graph.task_nodes.empty()) { llvm::errs() << "No tasks to place.\n"; @@ -287,7 +242,7 @@ class CGRAPlacer { // Alternates between Task Placement (Phase 1) and SRAM Assignment (Phase 2). constexpr int kMaxIterations = 10; - llvm::errs() << "\n=== Starting Iterative Placement (Max " << kMaxIterations << ") ===\n"; + for (int iter = 0; iter < kMaxIterations; ++iter) { // Phase 1: Place Tasks (assuming fixed SRAMs). @@ -319,20 +274,17 @@ class CGRAPlacer { // Phase 2: Assign SRAMs (assuming fixed Tasks). bool sram_moved = assignAllSRAMs(graph); - llvm::errs() << "Iter " << iter << ": SRAMs moved = " << (sram_moved ? "Yes" : "No") << "\n"; + // Convergence Check. // If SRAMs didn't move, it means task placement based on them likely won't change either. if (iter > 0 && !sram_moved) { - llvm::errs() << "Converged at iteration " << iter << ".\n"; + break; } } - // Prints final graph visualization to stderr. - // llvm::errs() << "\n=== Final Task-Memory Mapping (DOT format) ===\n"; - // printGraphDOT(graph, llvm::errs()); - // llvm::errs() << "===============================================\n\n"; + // Annotates Result. OpBuilder builder(func.getContext()); From 16310bdec898a2848e174b8e40b5ef33b87ef49d Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Wed, 4 Feb 2026 05:48:51 +0800 Subject: [PATCH 07/15] test: fix placement coordinates in irregular-loop.mlir --- test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 2510b28e..3550e14d 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -311,7 +311,7 @@ module attributes {} { // PLACEMENT: taskflow.task @Task_0 // PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 -// PLACEMENT: taskflow.task @Task_2 // PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 +// PLACEMENT: taskflow.task @Task_2 +// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 From 8b59105b32dda5be1238ca5b07bf0f082c4358ec Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Thu, 5 Feb 2026 01:21:08 +0800 Subject: [PATCH 08/15] refactor(cgra-placement): resolve review comments - Fix SRAM ID encoding for large grids\n- Add successors to SSA proximity score\n- Remove load balance metric\n- Improve error handling with assert\n- Fix comments and variable naming\n- Ensure SSA dependencies use value inputs only\n- Update tests for new placement logic --- .../Transforms/MapCTOnCGRAArrayPass.cpp | 75 +++++++++---------- .../irregular-loop/irregular-loop.mlir | 2 +- .../taskflow/multi-nested/multi-nested.mlir | 8 +- .../parallel-nested/parallel-nested.mlir | 2 +- 4 files changed, 43 insertions(+), 44 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp index c56613af..1465d086 100644 --- a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp @@ -93,9 +93,9 @@ struct TaskNode { TaskflowTaskOp op; int alap_level = 0; - // Edges (Note: read/write naming refers to taskflow memory_inputs/outputs) - SmallVector read_memrefs; // taskflow.task memory_inputs (readiness triggers) - SmallVector write_memrefs; // taskflow.task memory_outputs (produce readiness) + // Edges based on original memory access. + SmallVector read_memrefs; // original_read_memrefs + SmallVector write_memrefs; // original_write_memrefs SmallVector ssa_users; SmallVector ssa_operands; @@ -159,7 +159,7 @@ class TaskMemoryGraph { // Identifies if a task uses a value produced by another task. for (auto &consumer_node : task_nodes) { // Interates all operands for now to be safe. - for (Value operand : consumer_node->op->getOperands()) { + for (Value operand : consumer_node->op.getValueInputs()) { if (auto producer_op = operand.getDefiningOp()) { if (auto *producer_node = op_to_node[producer_op]) { producer_node->ssa_users.push_back(consumer_node.get()); @@ -172,8 +172,9 @@ class TaskMemoryGraph { private: MemoryNode *getOrCreateMemoryNode(Value memref) { - if (memref_to_node.count(memref)) + if (memref_to_node.count(memref)){ return memref_to_node[memref]; + } auto node = std::make_unique(memref); MemoryNode *ptr = node.get(); @@ -186,9 +187,10 @@ class TaskMemoryGraph { //===----------------------------------------------------------------------===// -// CGRA Placer +// Task Mapper //===----------------------------------------------------------------------===// -/// Places ACTs onto a 2D CGRA grid with memory mapping. +/// Maps a task-memory graph onto a 2D CGRA grid. + class CGRAPlacer { public: CGRAPlacer(int grid_rows, int grid_cols) @@ -210,7 +212,6 @@ class CGRAPlacer { } - // Extracts counter chains and builds dependency graph. // Builds Task-Memory Graph. TaskMemoryGraph graph; graph.build(func); @@ -223,6 +224,9 @@ class CGRAPlacer { } // Computes ALAP Levels on the Task Graph. + // We use ALAP (As Late As Possible) to identify the critical path and + // prioritize tasks that are closer to the sink, which helps in minimizing + // the overall dependency latency during placement. computeALAP(graph); // Sorts tasks by ALAP level (Critical Path First). @@ -336,7 +340,8 @@ class CGRAPlacer { // Rounds to the nearest integer. int avg_row = (total_row + count / 2) / count; int avg_col = (total_col + count / 2) / count; - new_sram_id = avg_row * 100 + avg_col; + // SRAM ID encoding: (row << 16) | col + new_sram_id = (avg_row << 16) | (avg_col & 0xFFFF); } else { new_sram_id = 0; // Default fallback } @@ -378,11 +383,7 @@ class CGRAPlacer { // Error handling: No available position found (grid over-subscribed). if (best_placement.cgra_positions.empty()) { - llvm::errs() << "Warning: No available CGRA position for task " - << task_node->id << ". Grid is over-subscribed (" << grid_rows_ - << "x" << grid_cols_ << " grid with all cells occupied).\n"; - // Fallback: Assign to position (0,0) with a warning. - best_placement.cgra_positions.push_back({0, 0}); + assert(false && "No available CGRA position found (grid over-subscribed)."); } return best_placement; @@ -393,7 +394,7 @@ class CGRAPlacer { /// downstream hardware generators to configure fast bypass paths between /// adjacent PEs with dependencies. /// - /// Score = α·SSA_Dist + β·Mem_Dist + γ·Balance + /// Score = α·SSA_Dist + β·Mem_Dist /// /// SSA_Dist: Minimize distance to placed SSA predecessors (ssa_operands). /// Mem_Dist: Minimize distance to assigned SRAMs for read/write memrefs. @@ -402,15 +403,13 @@ class CGRAPlacer { // Weight constants (tunable). constexpr int kAlpha = 10; // SSA proximity weight constexpr int kBeta = 50; // Memory proximity weight (High priority) - constexpr int kGamma = 20; // Load balance weight int ssa_score = 0; int mem_score = 0; - int bal_score = 0; CGRAPosition current_pos = placement.primary(); - // 1. SSA Proximity (Predecessors) + // 1. SSA Proximity (Predecessors & Successors) for (TaskNode *producer : task_node->ssa_operands) { if (!producer->placement.empty()) { int dist = current_pos.manhattanDistance(producer->placement[0]); @@ -418,14 +417,20 @@ class CGRAPlacer { ssa_score -= dist; } } + for (TaskNode *consumer : task_node->ssa_users) { + if (!consumer->placement.empty()) { + int dist = current_pos.manhattanDistance(consumer->placement[0]); + ssa_score -= dist; + } + } // 2. Memory Proximity // For Read MemRefs for (MemoryNode *mem : task_node->read_memrefs) { if (mem->assigned_sram_id != -1) { - // SRAM ID encoding: row*100 + col - int sram_r = mem->assigned_sram_id / 100; - int sram_c = mem->assigned_sram_id % 100; + // SRAM ID encoding: (row << 16) | col + int sram_r = mem->assigned_sram_id >> 16; + int sram_c = mem->assigned_sram_id & 0xFFFF; CGRAPosition sram_pos{sram_r, sram_c}; int dist = current_pos.manhattanDistance(sram_pos); mem_score -= dist; @@ -436,50 +441,44 @@ class CGRAPlacer { // we want to be close to it too. for (MemoryNode *mem : task_node->write_memrefs) { if (mem->assigned_sram_id != -1) { - int sram_r = mem->assigned_sram_id / 100; - int sram_c = mem->assigned_sram_id % 100; + // SRAM ID encoding: (row << 16) | col + int sram_r = mem->assigned_sram_id >> 16; + int sram_c = mem->assigned_sram_id & 0xFFFF; CGRAPosition sram_pos{sram_r, sram_c}; int dist = current_pos.manhattanDistance(sram_pos); mem_score -= dist; } } - // 3. Load Balance - // Prefers less crowded rows/cols. - int row_count = 0, col_count = 0; - for (int c = 0; c < grid_cols_; ++c) { if (occupied_[current_pos.row][c]) row_count++; } - for (int r = 0; r < grid_rows_; ++r) { if (occupied_[r][current_pos.col]) col_count++; } - bal_score = (grid_cols_ - row_count) + (grid_rows_ - col_count); - - return kAlpha * ssa_score + kBeta * mem_score + kGamma * bal_score; + return kAlpha * ssa_score + kBeta * mem_score; } /// Computes ALAP levels considering both SSA and memory dependencies. void computeALAP(TaskMemoryGraph &graph) { // DFS for longest path from node to any sink (ALAP Level). - DenseMap memo; + DenseMap node_alap_cache; for (auto &node : graph.task_nodes) { - node->alap_level = calculateLevel(node.get(), memo); + node->alap_level = calculateLevel(node.get(), node_alap_cache); } } - int calculateLevel(TaskNode *node, DenseMap &memo) { - if (memo.count(node)) return memo[node]; + int calculateLevel(TaskNode *node, DenseMap &node_alap_cache) { + if (node_alap_cache.count(node)) return node_alap_cache[node]; int max_child_level = 0; for (TaskNode *child : node->ssa_users) { - max_child_level = std::max(max_child_level, calculateLevel(child, memo) + 1); + max_child_level = std::max(max_child_level, calculateLevel(child, node_alap_cache) + 1); } // Checks memory dependencies too (Producer -> Mem -> Consumer). for (MemoryNode *mem : node->write_memrefs) { for (TaskNode *reader : mem->readers) { if (reader != node) - max_child_level = std::max(max_child_level, calculateLevel(reader, memo) + 1); + max_child_level = std::max(max_child_level, calculateLevel(reader, node_alap_cache) + 1); } } - return memo[node] = max_child_level; + return node_alap_cache[node] = max_child_level; } diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 3550e14d..b6c57dd1 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -313,5 +313,5 @@ module attributes {} { // PLACEMENT: taskflow.task @Task_1 // PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 // PLACEMENT: taskflow.task @Task_2 -// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index 9cd26dc6..43164d50 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -379,10 +379,10 @@ module attributes {} { // PLACEMENT: taskflow.task @Task_0 // PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 -// PLACEMENT: taskflow.task @Task_2 // PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT: taskflow.task @Task_2 +// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 // PLACEMENT: taskflow.task @Task_3 -// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 2 : i32 +// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 // PLACEMENT: taskflow.task @Task_4 -// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 \ No newline at end of file +// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 \ No newline at end of file diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index d43363f6..0950619a 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -132,4 +132,4 @@ module { // PLACEMENT: taskflow.task @Task_0 // PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 \ No newline at end of file +// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 \ No newline at end of file From 846ea2d68d0e6ace66aa7e90028efbd2f2e6722e Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Thu, 5 Feb 2026 01:30:29 +0800 Subject: [PATCH 09/15] feat(cgra-placement): wrap mapping info in attribute and update tests --- .../Transforms/MapCTOnCGRAArrayPass.cpp | 46 +++++++++++++++++-- relu_branch.mlir | 3 -- relu_main.mlir | 3 -- .../irregular-loop/irregular-loop.mlir | 6 +-- .../taskflow/multi-nested/multi-nested.mlir | 10 ++-- .../parallel-nested/parallel-nested.mlir | 4 +- 6 files changed, 51 insertions(+), 21 deletions(-) delete mode 100644 relu_branch.mlir delete mode 100644 relu_main.mlir diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp index 1465d086..e1783d1d 100644 --- a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp @@ -263,7 +263,8 @@ class CGRAPlacer { // Commits Placement. task_node->placement.push_back(placement.primary()); - // Handles multi-cgra if needed. + // Handles mapping one task on multi-CGRAs. + // TODO: Introduce explicit multi-CGRA binding logic. for (size_t i = 1; i < placement.cgra_positions.size(); ++i) { task_node->placement.push_back(placement.cgra_positions[i]); } @@ -294,10 +295,45 @@ class CGRAPlacer { OpBuilder builder(func.getContext()); for (auto &task_node : graph.task_nodes) { if (task_node->placement.empty()) continue; - CGRAPosition pos = task_node->placement[0]; - task_node->op->setAttr("cgra_row", builder.getI32IntegerAttr(pos.row)); - task_node->op->setAttr("cgra_col", builder.getI32IntegerAttr(pos.col)); - task_node->op->setAttr("cgra_count", builder.getI32IntegerAttr(task_node->placement.size())); + + SmallVector mapping_attrs; + + // 1. CGRA Positions + SmallVector pos_attrs; + for (const auto &pos : task_node->placement) { + SmallVector coord_attrs; + coord_attrs.push_back(NamedAttribute( + StringAttr::get(func.getContext(), "row"), + builder.getI32IntegerAttr(pos.row))); + coord_attrs.push_back(NamedAttribute( + StringAttr::get(func.getContext(), "col"), + builder.getI32IntegerAttr(pos.col))); + pos_attrs.push_back(DictionaryAttr::get(func.getContext(), coord_attrs)); + } + mapping_attrs.push_back(NamedAttribute( + StringAttr::get(func.getContext(), "cgra_positions"), + builder.getArrayAttr(pos_attrs))); + + // 2. Read SRAM IDs + SmallVector read_sram_attrs; + for (MemoryNode *mem : task_node->read_memrefs) { + read_sram_attrs.push_back(builder.getI32IntegerAttr(mem->assigned_sram_id)); + } + mapping_attrs.push_back(NamedAttribute( + StringAttr::get(func.getContext(), "read_sram_ids"), + builder.getArrayAttr(read_sram_attrs))); + + // 3. Write SRAM IDs + SmallVector write_sram_attrs; + for (MemoryNode *mem : task_node->write_memrefs) { + write_sram_attrs.push_back(builder.getI32IntegerAttr(mem->assigned_sram_id)); + } + mapping_attrs.push_back(NamedAttribute( + StringAttr::get(func.getContext(), "write_sram_ids"), + builder.getArrayAttr(write_sram_attrs))); + + // Set Attribute + task_node->op->setAttr("mapping_info", DictionaryAttr::get(func.getContext(), mapping_attrs)); } } diff --git a/relu_branch.mlir b/relu_branch.mlir deleted file mode 100644 index 0a1e5150..00000000 --- a/relu_branch.mlir +++ /dev/null @@ -1,3 +0,0 @@ -module { -} - diff --git a/relu_main.mlir b/relu_main.mlir deleted file mode 100644 index 0a1e5150..00000000 --- a/relu_main.mlir +++ /dev/null @@ -1,3 +0,0 @@ -module { -} - diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index b6c57dd1..e5cca889 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -309,9 +309,9 @@ module attributes {} { // HYPERBLOCK-NEXT: } // PLACEMENT: taskflow.task @Task_0 -// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_ids = [], write_sram_ids = []} // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 +// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_ids = [], write_sram_ids = [65537 : i32]} // PLACEMENT: taskflow.task @Task_2 -// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 +// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_ids = [65537 : i32], write_sram_ids = [65536 : i32]} diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index 43164d50..2d158b5d 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -377,12 +377,12 @@ module attributes {} { // HYPERBLOCK-NEXT:} // PLACEMENT: taskflow.task @Task_0 -// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_ids = [0 : i32], write_sram_ids = [65536 : i32]} // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_ids = [1 : i32, 1 : i32], write_sram_ids = [65537 : i32]} // PLACEMENT: taskflow.task @Task_2 -// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 +// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_ids = [65536 : i32, 65537 : i32, 65536 : i32], write_sram_ids = [65536 : i32]} // PLACEMENT: taskflow.task @Task_3 -// PLACEMENT-SAME: cgra_col = 2 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_ids = [2 : i32], write_sram_ids = [65538 : i32]} // PLACEMENT: taskflow.task @Task_4 -// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 1 : i32 \ No newline at end of file +// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_ids = [65537 : i32, 65538 : i32], write_sram_ids = [65537 : i32]} \ No newline at end of file diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index 0950619a..ec98f6a1 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -130,6 +130,6 @@ module { // HYPERBLOCK-NEXT: } // PLACEMENT: taskflow.task @Task_0 -// PLACEMENT-SAME: cgra_col = 0 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 +// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_ids = [0 : i32], write_sram_ids = [0 : i32]} // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: cgra_col = 1 : i32, cgra_count = 1 : i32, cgra_row = 0 : i32 \ No newline at end of file +// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_ids = [1 : i32, 1 : i32], write_sram_ids = [1 : i32]} \ No newline at end of file From 1ae4b13be4de50a2e7581150e57cb207c7cff7a2 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Fri, 6 Feb 2026 02:13:52 +0800 Subject: [PATCH 10/15] refactor(cgra-placement): update task mapping info for better readability and structure - Rename CGRAPlacer to TaskMapper\n- Rename mapping_info to task_mapping_info\n- Use readable SRAM locations {row, col} in task_mapping_info\n- Provide full task lines in test cases --- .../Transforms/MapCTOnCGRAArrayPass.cpp | 40 ++++++++++++++----- .../irregular-loop/irregular-loop.mlir | 6 +-- .../taskflow/multi-nested/multi-nested.mlir | 10 ++--- .../parallel-nested/parallel-nested.mlir | 4 +- 4 files changed, 39 insertions(+), 21 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp index e1783d1d..44fb41e6 100644 --- a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp @@ -191,9 +191,9 @@ class TaskMemoryGraph { //===----------------------------------------------------------------------===// /// Maps a task-memory graph onto a 2D CGRA grid. -class CGRAPlacer { +class TaskMapper { public: - CGRAPlacer(int grid_rows, int grid_cols) + TaskMapper(int grid_rows, int grid_cols) : grid_rows_(grid_rows), grid_cols_(grid_cols) { occupied_.resize(grid_rows_); for (auto &row : occupied_) { @@ -314,26 +314,44 @@ class CGRAPlacer { StringAttr::get(func.getContext(), "cgra_positions"), builder.getArrayAttr(pos_attrs))); - // 2. Read SRAM IDs + // 2. Read SRAM Locations SmallVector read_sram_attrs; for (MemoryNode *mem : task_node->read_memrefs) { - read_sram_attrs.push_back(builder.getI32IntegerAttr(mem->assigned_sram_id)); + int row = mem->assigned_sram_id >> 16; + int col = mem->assigned_sram_id & 0xFFFF; + SmallVector coord_attrs; + coord_attrs.push_back(NamedAttribute( + StringAttr::get(func.getContext(), "row"), + builder.getI32IntegerAttr(row))); + coord_attrs.push_back(NamedAttribute( + StringAttr::get(func.getContext(), "col"), + builder.getI32IntegerAttr(col))); + read_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), coord_attrs)); } mapping_attrs.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "read_sram_ids"), + StringAttr::get(func.getContext(), "read_sram_locs"), builder.getArrayAttr(read_sram_attrs))); - // 3. Write SRAM IDs + // 3. Write SRAM Locations SmallVector write_sram_attrs; for (MemoryNode *mem : task_node->write_memrefs) { - write_sram_attrs.push_back(builder.getI32IntegerAttr(mem->assigned_sram_id)); + int row = mem->assigned_sram_id >> 16; + int col = mem->assigned_sram_id & 0xFFFF; + SmallVector coord_attrs; + coord_attrs.push_back(NamedAttribute( + StringAttr::get(func.getContext(), "row"), + builder.getI32IntegerAttr(row))); + coord_attrs.push_back(NamedAttribute( + StringAttr::get(func.getContext(), "col"), + builder.getI32IntegerAttr(col))); + write_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), coord_attrs)); } mapping_attrs.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "write_sram_ids"), + StringAttr::get(func.getContext(), "write_sram_locs"), builder.getArrayAttr(write_sram_attrs))); // Set Attribute - task_node->op->setAttr("mapping_info", DictionaryAttr::get(func.getContext(), mapping_attrs)); + task_node->op->setAttr("task_mapping_info", DictionaryAttr::get(func.getContext(), mapping_attrs)); } } @@ -544,8 +562,8 @@ struct MapCTOnCGRAArrayPass func::FuncOp func = getOperation(); constexpr int kDefaultGridRows = 3; constexpr int kDefaultGridCols = 3; - CGRAPlacer placer(kDefaultGridRows, kDefaultGridCols); - placer.place(func); + TaskMapper mapper(kDefaultGridRows, kDefaultGridCols); + mapper.place(func); } }; diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index e5cca889..2d614b98 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -309,9 +309,9 @@ module attributes {} { // HYPERBLOCK-NEXT: } // PLACEMENT: taskflow.task @Task_0 -// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_ids = [], write_sram_ids = []} +// PLACEMENT-SAME: value_inputs(%c0_i32 : i32) {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locs = [], write_sram_locs = []}} // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_ids = [], write_sram_ids = [65537 : i32]} +// PLACEMENT-SAME: write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0 : memref<4x8xi32>)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locs = [], write_sram_locs = [{col = 1 : i32, row = 1 : i32}]}} // PLACEMENT: taskflow.task @Task_2 -// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_ids = [65537 : i32], write_sram_ids = [65536 : i32]} +// PLACEMENT-SAME: read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locs = [{col = 1 : i32, row = 1 : i32}], write_sram_locs = [{col = 0 : i32, row = 1 : i32}]}} diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index 2d158b5d..ce3d4071 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -377,12 +377,12 @@ module attributes {} { // HYPERBLOCK-NEXT:} // PLACEMENT: taskflow.task @Task_0 -// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_ids = [0 : i32], write_sram_ids = [65536 : i32]} +// PLACEMENT-SAME: read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locs = [{col = 0 : i32, row = 0 : i32}], write_sram_locs = [{col = 0 : i32, row = 1 : i32}]}} // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_ids = [1 : i32, 1 : i32], write_sram_ids = [65537 : i32]} +// PLACEMENT-SAME: read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locs = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locs = [{col = 1 : i32, row = 1 : i32}]}} // PLACEMENT: taskflow.task @Task_2 -// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_ids = [65536 : i32, 65537 : i32, 65536 : i32], write_sram_ids = [65536 : i32]} +// PLACEMENT-SAME: read_memrefs(%write_outputs, %write_outputs_0, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locs = [{col = 0 : i32, row = 1 : i32}, {col = 1 : i32, row = 1 : i32}, {col = 0 : i32, row = 1 : i32}], write_sram_locs = [{col = 0 : i32, row = 1 : i32}]}} // PLACEMENT: taskflow.task @Task_3 -// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_ids = [2 : i32], write_sram_ids = [65538 : i32]} +// PLACEMENT-SAME: read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] {task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locs = [{col = 2 : i32, row = 0 : i32}], write_sram_locs = [{col = 2 : i32, row = 1 : i32}]}} // PLACEMENT: taskflow.task @Task_4 -// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_ids = [65537 : i32, 65538 : i32], write_sram_ids = [65537 : i32]} \ No newline at end of file +// PLACEMENT-SAME: read_memrefs(%arg4, %write_outputs_2 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locs = [{col = 1 : i32, row = 1 : i32}, {col = 2 : i32, row = 1 : i32}], write_sram_locs = [{col = 1 : i32, row = 1 : i32}]}} \ No newline at end of file diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index ec98f6a1..a91737f3 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -130,6 +130,6 @@ module { // HYPERBLOCK-NEXT: } // PLACEMENT: taskflow.task @Task_0 -// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_ids = [0 : i32], write_sram_ids = [0 : i32]} +// PLACEMENT-SAME: read_memrefs(%arg0 : memref<16xf32>) write_memrefs(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0 : memref<16xf32>), original_write_memrefs(%arg0 : memref<16xf32>)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locs = [{col = 0 : i32, row = 0 : i32}], write_sram_locs = [{col = 0 : i32, row = 0 : i32}]}} // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_ids = [1 : i32, 1 : i32], write_sram_ids = [1 : i32]} \ No newline at end of file +// PLACEMENT-SAME: read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) write_memrefs(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>), original_write_memrefs(%arg3 : memref<8x8xf32>)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locs = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locs = [{col = 1 : i32, row = 0 : i32}]}} \ No newline at end of file From a6603029246f2a1dd937de91fbfd11d0b710d66c Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Fri, 6 Feb 2026 02:36:27 +0800 Subject: [PATCH 11/15] Refactor CGRA placement: rename pass and classes, address reviewer comments on mapping info readability and encoding --- include/TaskflowDialect/TaskflowPasses.h | 2 +- include/TaskflowDialect/TaskflowPasses.td | 10 +- lib/TaskflowDialect/Transforms/CMakeLists.txt | 2 +- ...GRAArrayPass.cpp => MapTaskOnCgraPass.cpp} | 109 ++++++++---------- .../irregular-loop/irregular-loop.mlir | 9 +- .../taskflow/multi-nested/multi-nested.mlir | 13 +-- .../parallel-nested/parallel-nested.mlir | 6 +- 7 files changed, 70 insertions(+), 81 deletions(-) rename lib/TaskflowDialect/Transforms/{MapCTOnCGRAArrayPass.cpp => MapTaskOnCgraPass.cpp} (84%) diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index 33a8199e..c50544c9 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -17,7 +17,7 @@ namespace taskflow { #include "TaskflowDialect/TaskflowPasses.h.inc" std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createClassifyCountersPass(); -std::unique_ptr createMapCTOnCGRAArrayPass(); +std::unique_ptr createMapTaskOnCgraPass(); //=========================================================// // Optimization Passes diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index dd7f4db2..0b37631c 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -46,15 +46,15 @@ def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{ let constructor = "taskflow::createClassifyCountersPass()"; } -def MapCTOnCGRAArray : Pass<"map-ct-on-cgra-array", "func::FuncOp"> { - let summary = "Maps Canonical Tasks (CTs) onto a 2D CGRA grid array"; +def MapTaskOnCgra : Pass<"map-task-on-cgra", "func::FuncOp"> { + let summary = "Maps Taskflow tasks onto a 2D CGRA grid array"; let description = [{ - This pass maps Canonical Tasks (CTs) onto a 2D CGRA grid array. + This pass maps Taskflow tasks onto a 2D CGRA grid array. Fusion candidates (same-header SSA dependencies) are placed on adjacent CGRAs to enable direct data forwarding. - Uses a default 4x4 CGRA grid. + Uses a default 3x3 CGRA grid. }]; - let constructor = "taskflow::createMapCTOnCGRAArrayPass()"; + let constructor = "taskflow::createMapTaskOnCgraPass()"; } #endif // TASKFLOW_PASSES_TD \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index d2254cf7..5dcb6736 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -3,7 +3,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp ClassifyCountersPass.cpp - MapCTOnCGRAArrayPass.cpp + MapTaskOnCgraPass.cpp DEPENDS MLIRTaskflowTransformsIncGen diff --git a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp similarity index 84% rename from lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp rename to lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index 44fb41e6..adf845d9 100644 --- a/lib/TaskflowDialect/Transforms/MapCTOnCGRAArrayPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -1,4 +1,4 @@ -//===- MapCTOnCGRAArrayPass.cpp - CT to CGRA Mapping Pass ----------------===// +//===- MapTaskOnCgraPass.cpp - Task to CGRA Mapping Pass ----------------===// // // This pass maps Canonical Tasks (CTs) onto a 2D CGRA grid array: // 1. Places tasks with SSA dependencies on adjacent CGRAs. @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,10 @@ struct CGRAPosition { return row == other.row && col == other.col; } + bool operator!=(const CGRAPosition &other) const { + return !(*this == other); + } + /// Computes Manhattan distance to another position. int manhattanDistance(const CGRAPosition &other) const { return std::abs(row - other.row) + std::abs(col - other.col); @@ -114,7 +119,7 @@ struct MemoryNode { SmallVector writers; // Mapping result - int assigned_sram_id = -1; + std::optional assigned_sram_pos; MemoryNode(Value memref) : memref(memref) {} }; @@ -315,40 +320,37 @@ class TaskMapper { builder.getArrayAttr(pos_attrs))); // 2. Read SRAM Locations - SmallVector read_sram_attrs; - for (MemoryNode *mem : task_node->read_memrefs) { - int row = mem->assigned_sram_id >> 16; - int col = mem->assigned_sram_id & 0xFFFF; - SmallVector coord_attrs; - coord_attrs.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "row"), - builder.getI32IntegerAttr(row))); - coord_attrs.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "col"), - builder.getI32IntegerAttr(col))); - read_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), coord_attrs)); + if (!task_node->read_memrefs.empty()) { + SmallVector read_sram_attrs; + for (MemoryNode *mem : task_node->read_memrefs) { + if (mem->assigned_sram_pos) { + SmallVector sram_coord; + sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "row"), builder.getI32IntegerAttr(mem->assigned_sram_pos->row))); + sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "col"), builder.getI32IntegerAttr(mem->assigned_sram_pos->col))); + read_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), sram_coord)); + } + } + mapping_attrs.push_back(NamedAttribute( + StringAttr::get(func.getContext(), "read_sram_locations"), + builder.getArrayAttr(read_sram_attrs))); } - mapping_attrs.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "read_sram_locs"), - builder.getArrayAttr(read_sram_attrs))); // 3. Write SRAM Locations - SmallVector write_sram_attrs; - for (MemoryNode *mem : task_node->write_memrefs) { - int row = mem->assigned_sram_id >> 16; - int col = mem->assigned_sram_id & 0xFFFF; - SmallVector coord_attrs; - coord_attrs.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "row"), - builder.getI32IntegerAttr(row))); - coord_attrs.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "col"), - builder.getI32IntegerAttr(col))); - write_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), coord_attrs)); + if (!task_node->write_memrefs.empty()) { + SmallVector write_sram_attrs; + for (MemoryNode *mem : task_node->write_memrefs) { + if (mem->assigned_sram_pos) { + SmallVector sram_coord; + sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "row"), builder.getI32IntegerAttr(mem->assigned_sram_pos->row))); + sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "col"), builder.getI32IntegerAttr(mem->assigned_sram_pos->col))); + + write_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), sram_coord)); + } + } + mapping_attrs.push_back(NamedAttribute( + StringAttr::get(func.getContext(), "write_sram_locations"), + builder.getArrayAttr(write_sram_attrs))); } - mapping_attrs.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "write_sram_locs"), - builder.getArrayAttr(write_sram_attrs))); // Set Attribute task_node->op->setAttr("task_mapping_info", DictionaryAttr::get(func.getContext(), mapping_attrs)); @@ -389,19 +391,16 @@ class TaskMapper { } } - int new_sram_id = 0; + std::optional new_sram_pos; if (count > 0) { // Rounds to the nearest integer. int avg_row = (total_row + count / 2) / count; int avg_col = (total_col + count / 2) / count; - // SRAM ID encoding: (row << 16) | col - new_sram_id = (avg_row << 16) | (avg_col & 0xFFFF); - } else { - new_sram_id = 0; // Default fallback + new_sram_pos = CGRAPosition{avg_row, avg_col}; } - if (mem_node->assigned_sram_id != new_sram_id) { - mem_node->assigned_sram_id = new_sram_id; + if (mem_node->assigned_sram_pos != new_sram_pos) { + mem_node->assigned_sram_pos = new_sram_pos; changed = true; } } @@ -481,12 +480,8 @@ class TaskMapper { // 2. Memory Proximity // For Read MemRefs for (MemoryNode *mem : task_node->read_memrefs) { - if (mem->assigned_sram_id != -1) { - // SRAM ID encoding: (row << 16) | col - int sram_r = mem->assigned_sram_id >> 16; - int sram_c = mem->assigned_sram_id & 0xFFFF; - CGRAPosition sram_pos{sram_r, sram_c}; - int dist = current_pos.manhattanDistance(sram_pos); + if (mem->assigned_sram_pos) { + int dist = current_pos.manhattanDistance(*mem->assigned_sram_pos); mem_score -= dist; } } @@ -494,12 +489,8 @@ class TaskMapper { // If we write to a memory that is already assigned (e.g. read by previous task), // we want to be close to it too. for (MemoryNode *mem : task_node->write_memrefs) { - if (mem->assigned_sram_id != -1) { - // SRAM ID encoding: (row << 16) | col - int sram_r = mem->assigned_sram_id >> 16; - int sram_c = mem->assigned_sram_id & 0xFFFF; - CGRAPosition sram_pos{sram_r, sram_c}; - int dist = current_pos.manhattanDistance(sram_pos); + if (mem->assigned_sram_pos) { + int dist = current_pos.manhattanDistance(*mem->assigned_sram_pos); mem_score -= dist; } } @@ -545,16 +536,16 @@ class TaskMapper { //===----------------------------------------------------------------------===// // Pass Definition //===----------------------------------------------------------------------===// -struct MapCTOnCGRAArrayPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapCTOnCGRAArrayPass) +struct MapTaskOnCgraPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapTaskOnCgraPass) - MapCTOnCGRAArrayPass() = default; + MapTaskOnCgraPass() = default; - StringRef getArgument() const override { return "map-ct-on-cgra-array"; } + StringRef getArgument() const override { return "map-task-on-cgra"; } StringRef getDescription() const override { - return "Maps Canonical Tasks (CTs) onto a 2D CGRA grid with adjacency " + return "Maps Taskflow tasks onto a 2D CGRA grid with adjacency " "optimization and memory mapping."; } @@ -572,8 +563,8 @@ struct MapCTOnCGRAArrayPass namespace mlir { namespace taskflow { -std::unique_ptr createMapCTOnCGRAArrayPass() { - return std::make_unique(); +std::unique_ptr createMapTaskOnCgraPass() { + return std::make_unique(); } } // namespace taskflow diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 2d614b98..057d6c15 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -22,7 +22,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-ct-on-cgra-array \ +// RUN: --map-task-on-cgra \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT @@ -309,9 +309,8 @@ module attributes {} { // HYPERBLOCK-NEXT: } // PLACEMENT: taskflow.task @Task_0 -// PLACEMENT-SAME: value_inputs(%c0_i32 : i32) {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locs = [], write_sram_locs = []}} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}]} // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0 : memref<4x8xi32>)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locs = [], write_sram_locs = [{col = 1 : i32, row = 1 : i32}]}} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]} // PLACEMENT: taskflow.task @Task_2 -// PLACEMENT-SAME: read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locs = [{col = 1 : i32, row = 1 : i32}], write_sram_locs = [{col = 0 : i32, row = 1 : i32}]}} - +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]} diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index ce3d4071..1bfb2bb2 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -22,7 +22,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-ct-on-cgra-array \ +// RUN: --map-task-on-cgra \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT @@ -373,16 +373,15 @@ module attributes {} { // HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_1[0] : memref // HYPERBLOCK-NEXT: return %0 : i32 // HYPERBLOCK-NEXT: } - // HYPERBLOCK-NEXT:} // PLACEMENT: taskflow.task @Task_0 -// PLACEMENT-SAME: read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locs = [{col = 0 : i32, row = 0 : i32}], write_sram_locs = [{col = 0 : i32, row = 1 : i32}]}} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locations = [{col = 0 : i32, row = 0 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]} // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locs = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locs = [{col = 1 : i32, row = 1 : i32}]}} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]} // PLACEMENT: taskflow.task @Task_2 -// PLACEMENT-SAME: read_memrefs(%write_outputs, %write_outputs_0, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locs = [{col = 0 : i32, row = 1 : i32}, {col = 1 : i32, row = 1 : i32}, {col = 0 : i32, row = 1 : i32}], write_sram_locs = [{col = 0 : i32, row = 1 : i32}]}} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locations = [{col = 0 : i32, row = 1 : i32}, {col = 1 : i32, row = 1 : i32}, {col = 0 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]} // PLACEMENT: taskflow.task @Task_3 -// PLACEMENT-SAME: read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] {task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locs = [{col = 2 : i32, row = 0 : i32}], write_sram_locs = [{col = 2 : i32, row = 1 : i32}]}} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 1 : i32}]} // PLACEMENT: taskflow.task @Task_4 -// PLACEMENT-SAME: read_memrefs(%arg4, %write_outputs_2 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locs = [{col = 1 : i32, row = 1 : i32}, {col = 2 : i32, row = 1 : i32}], write_sram_locs = [{col = 1 : i32, row = 1 : i32}]}} \ No newline at end of file +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}, {col = 2 : i32, row = 1 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]} \ No newline at end of file diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index a91737f3..6c0cd57b 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -16,7 +16,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-ct-on-cgra-array \ +// RUN: --map-task-on-cgra \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT @@ -130,6 +130,6 @@ module { // HYPERBLOCK-NEXT: } // PLACEMENT: taskflow.task @Task_0 -// PLACEMENT-SAME: read_memrefs(%arg0 : memref<16xf32>) write_memrefs(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0 : memref<16xf32>), original_write_memrefs(%arg0 : memref<16xf32>)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locs = [{col = 0 : i32, row = 0 : i32}], write_sram_locs = [{col = 0 : i32, row = 0 : i32}]}} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locations = [{col = 0 : i32, row = 0 : i32}], write_sram_locations = [{col = 0 : i32, row = 0 : i32}]} // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) write_memrefs(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>), original_write_memrefs(%arg3 : memref<8x8xf32>)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locs = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locs = [{col = 1 : i32, row = 0 : i32}]}} \ No newline at end of file +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 0 : i32}]} \ No newline at end of file From 47ea607e948950fadd54dfdf2573e23de91f4f35 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Fri, 6 Feb 2026 03:11:58 +0800 Subject: [PATCH 12/15] Refactor CGRA placement: rename pass to MapTaskOnCgra, update dependency terminology to dependency_depth, and improve SRAM mapping readability --- .../Transforms/MapTaskOnCgraPass.cpp | 102 +++++++++--------- .../irregular-loop/irregular-loop.mlir | 4 +- 2 files changed, 56 insertions(+), 50 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index adf845d9..7d859e6d 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -96,7 +96,7 @@ struct MemoryNode; struct TaskNode { size_t id; TaskflowTaskOp op; - int alap_level = 0; + int dependency_depth = 0; // Longest path to any sink in the dependency graph. // Edges based on original memory access. SmallVector read_memrefs; // original_read_memrefs @@ -228,24 +228,25 @@ class TaskMapper { return; } - // Computes ALAP Levels on the Task Graph. - // We use ALAP (As Late As Possible) to identify the critical path and - // prioritize tasks that are closer to the sink, which helps in minimizing - // the overall dependency latency during placement. - computeALAP(graph); + // Computes Dependency Depth for each task. + // Dependency depth = longest path from this node to any sink in the + // dependency graph (considering both SSA and memory edges). Tasks with + // higher depth are more "critical" and are placed first to ensure their + // dependent chains have good locality. + computeDependencyDepth(graph); - // Sorts tasks by ALAP level (Critical Path First). + // Sorts tasks by dependency depth (Critical Path First). SmallVector sorted_tasks; for (auto &node : graph.task_nodes) sorted_tasks.push_back(node.get()); std::stable_sort(sorted_tasks.begin(), sorted_tasks.end(), [](TaskNode *a, TaskNode *b) { - return a->alap_level > b->alap_level; + return a->dependency_depth > b->dependency_depth; }); - // Critical path priority placement: - // 1. Computes ALAP level for each task (longest path to sink). - // 2. Sorts tasks by: (a) ALAP level, (b) criticality, (c) degree. + // Critical-path-first placement: + // 1. Computes dependency depth for each task (longest path to sink). + // 2. Sorts tasks by dependency depth (higher = more critical). // 3. Places tasks in sorted order with heuristic scoring. // Iterative Refinement Loop (Coordinate Descent). // Alternates between Task Placement (Phase 1) and SRAM Assignment (Phase 2). @@ -320,37 +321,33 @@ class TaskMapper { builder.getArrayAttr(pos_attrs))); // 2. Read SRAM Locations - if (!task_node->read_memrefs.empty()) { - SmallVector read_sram_attrs; - for (MemoryNode *mem : task_node->read_memrefs) { - if (mem->assigned_sram_pos) { - SmallVector sram_coord; - sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "row"), builder.getI32IntegerAttr(mem->assigned_sram_pos->row))); - sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "col"), builder.getI32IntegerAttr(mem->assigned_sram_pos->col))); - read_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), sram_coord)); - } + SmallVector read_sram_attrs; + for (MemoryNode *mem : task_node->read_memrefs) { + if (mem->assigned_sram_pos) { + SmallVector sram_coord; + sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "row"), builder.getI32IntegerAttr(mem->assigned_sram_pos->row))); + sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "col"), builder.getI32IntegerAttr(mem->assigned_sram_pos->col))); + read_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), sram_coord)); } - mapping_attrs.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "read_sram_locations"), - builder.getArrayAttr(read_sram_attrs))); } + mapping_attrs.push_back(NamedAttribute( + StringAttr::get(func.getContext(), "read_sram_locations"), + builder.getArrayAttr(read_sram_attrs))); // 3. Write SRAM Locations - if (!task_node->write_memrefs.empty()) { - SmallVector write_sram_attrs; - for (MemoryNode *mem : task_node->write_memrefs) { - if (mem->assigned_sram_pos) { - SmallVector sram_coord; - sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "row"), builder.getI32IntegerAttr(mem->assigned_sram_pos->row))); - sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "col"), builder.getI32IntegerAttr(mem->assigned_sram_pos->col))); - - write_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), sram_coord)); - } + SmallVector write_sram_attrs; + for (MemoryNode *mem : task_node->write_memrefs) { + if (mem->assigned_sram_pos) { + SmallVector sram_coord; + sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "row"), builder.getI32IntegerAttr(mem->assigned_sram_pos->row))); + sram_coord.push_back(NamedAttribute(StringAttr::get(func.getContext(), "col"), builder.getI32IntegerAttr(mem->assigned_sram_pos->col))); + + write_sram_attrs.push_back(DictionaryAttr::get(func.getContext(), sram_coord)); } - mapping_attrs.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "write_sram_locations"), - builder.getArrayAttr(write_sram_attrs))); } + mapping_attrs.push_back(NamedAttribute( + StringAttr::get(func.getContext(), "write_sram_locations"), + builder.getArrayAttr(write_sram_attrs))); // Set Attribute task_node->op->setAttr("task_mapping_info", DictionaryAttr::get(func.getContext(), mapping_attrs)); @@ -498,32 +495,41 @@ class TaskMapper { return kAlpha * ssa_score + kBeta * mem_score; } - /// Computes ALAP levels considering both SSA and memory dependencies. - void computeALAP(TaskMemoryGraph &graph) { - // DFS for longest path from node to any sink (ALAP Level). - DenseMap node_alap_cache; + /// Computes dependency depth for all tasks in the graph. + /// + /// Dependency depth = longest path from this node to any sink node in the + /// dependency graph (via SSA or memory edges). + /// + /// Tasks with higher dependency depth have longer chains of dependent tasks + /// after them. By placing these tasks first: + /// 1. They get priority access to good grid positions. + /// 2. Their dependent tasks can then be positioned adjacent to them, + /// minimizing inter-task communication distance. + void computeDependencyDepth(TaskMemoryGraph &graph) { + DenseMap depth_cache; for (auto &node : graph.task_nodes) { - node->alap_level = calculateLevel(node.get(), node_alap_cache); + node->dependency_depth = calculateDepth(node.get(), depth_cache); } } - int calculateLevel(TaskNode *node, DenseMap &node_alap_cache) { - if (node_alap_cache.count(node)) return node_alap_cache[node]; + int calculateDepth(TaskNode *node, DenseMap &depth_cache) { + if (depth_cache.count(node)) return depth_cache[node]; - int max_child_level = 0; + int max_child_depth = 0; + // SSA dependencies. for (TaskNode *child : node->ssa_users) { - max_child_level = std::max(max_child_level, calculateLevel(child, node_alap_cache) + 1); + max_child_depth = std::max(max_child_depth, calculateDepth(child, depth_cache) + 1); } - // Checks memory dependencies too (Producer -> Mem -> Consumer). + // Memory dependencies (Producer -> Mem -> Consumer). for (MemoryNode *mem : node->write_memrefs) { for (TaskNode *reader : mem->readers) { if (reader != node) - max_child_level = std::max(max_child_level, calculateLevel(reader, node_alap_cache) + 1); + max_child_depth = std::max(max_child_depth, calculateDepth(reader, depth_cache) + 1); } } - return node_alap_cache[node] = max_child_level; + return depth_cache[node] = max_child_depth; } diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 057d6c15..a8b9d8c2 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -309,8 +309,8 @@ module attributes {} { // HYPERBLOCK-NEXT: } // PLACEMENT: taskflow.task @Task_0 -// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}]} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locations = [], write_sram_locations = []} // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locations = [], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]} // PLACEMENT: taskflow.task @Task_2 // PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]} From db107d35e42749dbe148f5cfda1bd583f4ba157e Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Fri, 6 Feb 2026 03:19:58 +0800 Subject: [PATCH 13/15] Style polish: add braces to if statements, update header comments, and clean up whitespace --- .../Transforms/MapTaskOnCgraPass.cpp | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index 7d859e6d..0cd2c719 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -1,6 +1,6 @@ //===- MapTaskOnCgraPass.cpp - Task to CGRA Mapping Pass ----------------===// // -// This pass maps Canonical Tasks (CTs) onto a 2D CGRA grid array: +// This pass maps Taskflow tasks onto a 2D CGRA grid array: // 1. Places tasks with SSA dependencies on adjacent CGRAs. // 2. Assigns memrefs to SRAMs (each MemRef is assigned to exactly one SRAM, // determined by proximity to the task that first accesses it). @@ -251,12 +251,12 @@ class TaskMapper { // Iterative Refinement Loop (Coordinate Descent). // Alternates between Task Placement (Phase 1) and SRAM Assignment (Phase 2). constexpr int kMaxIterations = 10; - - - + for (int iter = 0; iter < kMaxIterations; ++iter) { // Phase 1: Place Tasks (assuming fixed SRAMs). - if (iter > 0) resetTaskPlacements(graph); + if (iter > 0) { + resetTaskPlacements(graph); + } for (TaskNode *task_node : sorted_tasks) { int cgra_count = 1; @@ -277,8 +277,9 @@ class TaskMapper { // Marks Occupied. for (const auto &pos : placement.cgra_positions) { - if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && pos.col < grid_cols_) + if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && pos.col < grid_cols_) { occupied_[pos.row][pos.col] = true; + } } } @@ -290,7 +291,6 @@ class TaskMapper { // Convergence Check. // If SRAMs didn't move, it means task placement based on them likely won't change either. if (iter > 0 && !sram_moved) { - break; } } @@ -300,7 +300,9 @@ class TaskMapper { // Annotates Result. OpBuilder builder(func.getContext()); for (auto &task_node : graph.task_nodes) { - if (task_node->placement.empty()) continue; + if (task_node->placement.empty()) { + continue; + } SmallVector mapping_attrs; @@ -320,7 +322,7 @@ class TaskMapper { StringAttr::get(func.getContext(), "cgra_positions"), builder.getArrayAttr(pos_attrs))); - // 2. Read SRAM Locations + // 2. Reads SRAM Locations. SmallVector read_sram_attrs; for (MemoryNode *mem : task_node->read_memrefs) { if (mem->assigned_sram_pos) { @@ -334,7 +336,7 @@ class TaskMapper { StringAttr::get(func.getContext(), "read_sram_locations"), builder.getArrayAttr(read_sram_attrs))); - // 3. Write SRAM Locations + // 3. Writes SRAM Locations. SmallVector write_sram_attrs; for (MemoryNode *mem : task_node->write_memrefs) { if (mem->assigned_sram_pos) { @@ -349,7 +351,7 @@ class TaskMapper { StringAttr::get(func.getContext(), "write_sram_locations"), builder.getArrayAttr(write_sram_attrs))); - // Set Attribute + // Sets Attribute. task_node->op->setAttr("task_mapping_info", DictionaryAttr::get(func.getContext(), mapping_attrs)); } } @@ -417,8 +419,9 @@ class TaskMapper { // Baseline: For cgra_count=1, finds single best position. for (int r = 0; r < grid_rows_; ++r) { for (int c = 0; c < grid_cols_; ++c) { - if (occupied_[r][c]) + if (occupied_[r][c]) { continue; + } TaskPlacement candidate; candidate.cgra_positions.push_back({r, c}); @@ -513,7 +516,9 @@ class TaskMapper { } int calculateDepth(TaskNode *node, DenseMap &depth_cache) { - if (depth_cache.count(node)) return depth_cache[node]; + if (depth_cache.count(node)) { + return depth_cache[node]; + } int max_child_depth = 0; // SSA dependencies. @@ -524,8 +529,9 @@ class TaskMapper { // Memory dependencies (Producer -> Mem -> Consumer). for (MemoryNode *mem : node->write_memrefs) { for (TaskNode *reader : mem->readers) { - if (reader != node) + if (reader != node) { max_child_depth = std::max(max_child_depth, calculateDepth(reader, depth_cache) + 1); + } } } From 09eed47236679d7c4284a0537fdf2b5e6eb90627 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Fri, 6 Feb 2026 12:15:04 +0800 Subject: [PATCH 14/15] Cleanup: fix comment formatting and typos according to reviewer feedback --- .../Transforms/MapTaskOnCgraPass.cpp | 28 +++++++++---------- test/benchmark/CGRA-Bench | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index 0cd2c719..54367af1 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -99,8 +99,8 @@ struct TaskNode { int dependency_depth = 0; // Longest path to any sink in the dependency graph. // Edges based on original memory access. - SmallVector read_memrefs; // original_read_memrefs - SmallVector write_memrefs; // original_write_memrefs + SmallVector read_memrefs; // Original read memrefs. + SmallVector write_memrefs; // Original write memrefs. SmallVector ssa_users; SmallVector ssa_operands; @@ -114,11 +114,11 @@ struct TaskNode { struct MemoryNode { Value memref; - // Edges + // Edges. SmallVector readers; SmallVector writers; - - // Mapping result + + // Mapping result. std::optional assigned_sram_pos; MemoryNode(Value memref) : memref(memref) {} @@ -163,7 +163,7 @@ class TaskMemoryGraph { // 3. Builds SSA Edges (Inter-Task Value Dependencies). // Identifies if a task uses a value produced by another task. for (auto &consumer_node : task_nodes) { - // Interates all operands for now to be safe. + // Iterates all operands for now to be safe. for (Value operand : consumer_node->op.getValueInputs()) { if (auto producer_op = operand.getDefiningOp()) { if (auto *producer_node = op_to_node[producer_op]) { @@ -306,7 +306,7 @@ class TaskMapper { SmallVector mapping_attrs; - // 1. CGRA Positions + // 1. CGRA positions. SmallVector pos_attrs; for (const auto &pos : task_node->placement) { SmallVector coord_attrs; @@ -447,22 +447,22 @@ class TaskMapper { /// downstream hardware generators to configure fast bypass paths between /// adjacent PEs with dependencies. /// - /// Score = α·SSA_Dist + β·Mem_Dist + /// Score = α·SSA_Dist + β·Mem_Dist. /// /// SSA_Dist: Minimize distance to placed SSA predecessors (ssa_operands). /// Mem_Dist: Minimize distance to assigned SRAMs for read/write memrefs. int computeScore(TaskNode *task_node, const TaskPlacement &placement, TaskMemoryGraph &graph) { // Weight constants (tunable). - constexpr int kAlpha = 10; // SSA proximity weight - constexpr int kBeta = 50; // Memory proximity weight (High priority) + constexpr int kAlpha = 10; // SSA proximity weight. + constexpr int kBeta = 50; // Memory proximity weight (high priority). int ssa_score = 0; int mem_score = 0; CGRAPosition current_pos = placement.primary(); - // 1. SSA Proximity (Predecessors & Successors) + // 1. SSA proximity (predecessors & successors). for (TaskNode *producer : task_node->ssa_operands) { if (!producer->placement.empty()) { int dist = current_pos.manhattanDistance(producer->placement[0]); @@ -477,15 +477,15 @@ class TaskMapper { } } - // 2. Memory Proximity - // For Read MemRefs + // 2. Memory proximity. + // For read memrefs. for (MemoryNode *mem : task_node->read_memrefs) { if (mem->assigned_sram_pos) { int dist = current_pos.manhattanDistance(*mem->assigned_sram_pos); mem_score -= dist; } } - // For Write MemRefs + // For write memrefs. // If we write to a memory that is already assigned (e.g. read by previous task), // we want to be close to it too. for (MemoryNode *mem : task_node->write_memrefs) { diff --git a/test/benchmark/CGRA-Bench b/test/benchmark/CGRA-Bench index 2b5e78b2..2beecc59 160000 --- a/test/benchmark/CGRA-Bench +++ b/test/benchmark/CGRA-Bench @@ -1 +1 @@ -Subproject commit 2b5e78b24d481c8465c82672a8d5177a86119aed +Subproject commit 2beecc599bd268f8665344ba2271f48c97db7aa0 From d016323ba122e0f10e2808861ba0c6d5434d23e6 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Fri, 6 Feb 2026 12:17:30 +0800 Subject: [PATCH 15/15] Cleanup: fix comment formatting and typos according to reviewer feedback --- lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index 54367af1..c04df0b7 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -177,7 +177,7 @@ class TaskMemoryGraph { private: MemoryNode *getOrCreateMemoryNode(Value memref) { - if (memref_to_node.count(memref)){ + if (memref_to_node.count(memref)) { return memref_to_node[memref]; } @@ -264,7 +264,7 @@ class TaskMapper { cgra_count = attr.getInt(); } - // Finds Best Placement using SRAM positions from previous iter (or -1/default). + // Finds best placement using SRAM positions from previous iter (or -1/default). TaskPlacement placement = findBestPlacement(task_node, cgra_count, graph); // Commits Placement. @@ -275,7 +275,7 @@ class TaskMapper { task_node->placement.push_back(placement.cgra_positions[i]); } - // Marks Occupied. + // Marks occupied. for (const auto &pos : placement.cgra_positions) { if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && pos.col < grid_cols_) { occupied_[pos.row][pos.col] = true; @@ -283,7 +283,7 @@ class TaskMapper { } } - // Phase 2: Assign SRAMs (assuming fixed Tasks). + // Phase 2: Assign SRAMs (assuming fixed tasks). bool sram_moved = assignAllSRAMs(graph); @@ -297,7 +297,7 @@ class TaskMapper { - // Annotates Result. + // Annotates result. OpBuilder builder(func.getContext()); for (auto &task_node : graph.task_nodes) { if (task_node->placement.empty()) { @@ -515,6 +515,7 @@ class TaskMapper { } } + /// Recursively calculates dependency depth for a single task. int calculateDepth(TaskNode *node, DenseMap &depth_cache) { if (depth_cache.count(node)) { return depth_cache[node];