diff --git a/include/TaskflowDialect/Allocation/allocation_utils.h b/include/TaskflowDialect/Allocation/allocation_utils.h
new file mode 100644
index 00000000..f7f9b7de
--- /dev/null
+++ b/include/TaskflowDialect/Allocation/allocation_utils.h
@@ -0,0 +1,112 @@
+//===- allocation_utils.h - Shared CGRA allocation utilities --------------===//
+//
+// Shared utility types and functions used by AllocateCgraToTaskPass and
+// ResourceAwareTaskOptimizationPass for CGRA grid placement feasibility
+// checks and task-to-CGRA mapping.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TASKFLOW_ALLOCATION_UTILS_H
+#define TASKFLOW_ALLOCATION_UTILS_H
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+
+#include <string>
+#include <utility>
+
+namespace mlir {
+namespace taskflow {
+
+//===----------------------------------------------------------------------===//
+// Grid constants
+//===----------------------------------------------------------------------===//
+
+constexpr int kCgraGridRows = 4;
+constexpr int kCgraGridCols = 4;
+
+//===----------------------------------------------------------------------===//
+// CgraShape
+//===----------------------------------------------------------------------===//
+
+// Represents a CGRA allocation shape on the grid.
+//
+// For rectangular shapes: rows × cols == cgra_count, and `cgra_positions`
+// is empty (all cells in the bounding box are used).
+//
+// For non-rectangular shapes (L, T): `cgra_positions` stores the explicit
+// (col, row) coordinates of the occupied CGRAs. `rows`/`cols` give the
+// bounding box so that tile-level x_tiles/y_tiles can be computed.
+struct CgraShape {
+  int rows;            // Bounding-box CGRA rows.
+  int cols;            // Bounding-box CGRA columns.
+  bool is_rectangular; // True if all cells in the bbox are used.
+  // Explicit CGRA positions for non-rectangular shapes.
+  // Each pair is (col, row) in CGRA coordinates. Empty for rectangles.
+  llvm::SmallVector<std::pair<int, int>> cgra_positions;
+
+  // Returns the bounding-box area (rows * cols). For rectangular shapes this
+  // equals cgra_count; for non-rectangular shapes it is larger than cgra_count
+  // (some cells in the bbox are unoccupied). Used only for shape sorting
+  // (prefer smaller bounding boxes), not for counting occupied CGRAs.
+  int area() const { return rows * cols; }
+
+  // Returns a human-readable description for log messages only (not IR).
+  std::string describe(int cgra_count) const;
+
+  // Returns the shape string written into the IR tile_shape attribute.
+  // For rectangular shapes: "NxM" (e.g. "2x2").
+  // For non-rectangular shapes: "NxM[(c0,r0)(c1,r1)...]" listing only the
+  // occupied CGRA positions so that downstream passes can reconstruct the
+  // exact valid tile set for multi-CGRA mapping.
+  std::string irAttr() const;
+};
+
+//===----------------------------------------------------------------------===//
+// Shape Enumeration Utilities
+//===----------------------------------------------------------------------===//
+
+// Generates all placement-candidate shapes for `cgra_count` CGRAs, including
+// rotations. Rectangular shapes include both orientations (rows×cols and
+// cols×rows, deduplicated for squares). Non-rectangular shapes include all
+// four 90° rotations.
+//
+// Ordering (tried first to last):
+//   1. Rectangular shapes, sorted by squareness (e.g. 2×2 before 1×4),
+//      with smaller bounding-box area as tiebreaker.
+//   2. Non-rectangular shapes (L, T, etc.) in all unique rotations.
+llvm::SmallVector<CgraShape> getAllPlacementShapes(int cgra_count);
+
+//===----------------------------------------------------------------------===//
+// Global Placement Feasibility
+//===----------------------------------------------------------------------===//
+
+// Simulates greedy placement of all tasks' shapes on the kCgraGridRows ×
+// kCgraGridCols grid to verify that they physically fit without overlap.
+//
+// For each task, all valid shapes (including rotations) are tried. Rectangular
+// shapes prefer square-like orientations (e.g. 2×2 over 1×4). Non-rectangular
+// shapes are tried in all four 90° rotations.
+//
+// `task_cgra_counts` contains the cgra_count for every task in the graph
+// (including the speculatively modified one).
+//
+// Returns true if all tasks can be placed without overlap.
+bool canAllTasksFitOnGrid(llvm::ArrayRef<int> task_cgra_counts);
+
+//===----------------------------------------------------------------------===//
+// Direct Pass Invocation
+//===----------------------------------------------------------------------===//
+
+// Runs the CGRA task placement logic directly on a function, producing
+// `task_mapping_info` attributes with global grid placement that respects
+// multi-CGRA shapes.
+//
+// grid_rows/grid_cols default to 4x4 (kCgraGridRows/kCgraGridCols).
+//
+// Defined in lib/TaskflowDialect/Allocation/allocation_utils_mapper.cpp.
+void runAllocateCgraToTask(mlir::func::FuncOp func,
+                           int grid_rows = kCgraGridRows,
+                           int grid_cols = kCgraGridCols);
+
+} // namespace taskflow
+} // namespace mlir
+
+#endif // TASKFLOW_ALLOCATION_UTILS_H
diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
index 92393d7c..041cdbc7 100644
--- a/include/TaskflowDialect/TaskflowPasses.h
+++ b/include/TaskflowDialect/TaskflowPasses.h
@@ -5,6 +5,8 @@
 #include "TaskflowDialect/TaskflowDialect.h"
 #include "TaskflowDialect/TaskflowOps.h"
+#include "TaskflowDialect/Allocation/allocation_utils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassRegistry.h"
 
@@ -21,7 +23,7 @@ void registerTosaToAffineConversionPassPipeline();
 #include "TaskflowDialect/TaskflowPasses.h.inc"
 std::unique_ptr<Pass> createConstructHyperblockFromTaskPass();
 std::unique_ptr<Pass> createClassifyCountersPass();
-std::unique_ptr<Pass> createMapTaskOnCgraPass();
+std::unique_ptr<Pass>
createAllocateCgraToTaskPass();
 std::unique_ptr<Pass> createFuseTaskPass();
 
 //=========================================================//
diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td
index 5cf07cd7..701e8246 100644
--- a/include/TaskflowDialect/TaskflowPasses.td
+++ b/include/TaskflowDialect/TaskflowPasses.td
@@ -63,16 +63,16 @@ def ClassifyCounters : Pass<"classify-counters", "ModuleOp"> {
   let constructor = "taskflow::createClassifyCountersPass()";
 }
 
-def MapTaskOnCgra : Pass<"map-task-on-cgra", "func::FuncOp"> {
+def AllocateCgraToTask : Pass<"allocate-cgra-to-task", "func::FuncOp"> {
   let summary = "Maps Taskflow tasks onto a 2D CGRA grid array";
   let description = [{
     This pass maps Taskflow tasks onto a 2D CGRA grid array.
     Fusion candidates (same-header SSA dependencies) are placed on
     adjacent CGRAs to enable direct data forwarding.
-    Uses a default 3x3 CGRA grid.
+    Uses a default 4x4 CGRA grid.
   }];
-  let constructor = "taskflow::createMapTaskOnCgraPass()";
+  let constructor = "taskflow::createAllocateCgraToTaskPass()";
 }
 
 def FuseTask : Pass<"fuse-task", "func::FuncOp"> {
diff --git a/lib/TaskflowDialect/Allocation/CMakeLists.txt b/lib/TaskflowDialect/Allocation/CMakeLists.txt
new file mode 100644
index 00000000..3f48ebf9
--- /dev/null
+++ b/lib/TaskflowDialect/Allocation/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_mlir_library(MLIRTaskflowAllocation
+  allocation_utils.cpp
+  allocation_utils_mapper.cpp
+
+  DEPENDS
+  MLIRTaskflowTransformsIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRPass
+  MLIRSupport
+  MLIRFuncDialect
+  MLIRTaskflow
+  LLVMSupport
+)
diff --git a/lib/TaskflowDialect/Allocation/allocation_utils.cpp b/lib/TaskflowDialect/Allocation/allocation_utils.cpp
new file mode 100644
index 00000000..b8e842c2
--- /dev/null
+++ b/lib/TaskflowDialect/Allocation/allocation_utils.cpp
@@ -0,0 +1,278 @@
+//===- allocation_utils.cpp - Shared CGRA allocation utilities ------------===//
+//
+// Implements shared utility functions for CGRA grid placement used by
+// AllocateCgraToTaskPass and ResourceAwareTaskOptimizationPass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TaskflowDialect/Allocation/allocation_utils.h"
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+#include <climits>
+#include <cstdlib>
+#include <string>
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
+//===----------------------------------------------------------------------===//
+// CgraShape member implementations
+//===----------------------------------------------------------------------===//
+
+std::string CgraShape::describe(int cgra_count) const {
+  std::string s = std::to_string(rows) + "x" + std::to_string(cols);
+  if (!is_rectangular) {
+    s += "(non-rect, " + std::to_string(cgra_count) + " CGRAs:";
+    for (auto &[c, r] : cgra_positions)
+      s += " (" + std::to_string(c) + "," + std::to_string(r) + ")";
+    s += ")";
+  }
+  return s;
+}
+
+std::string CgraShape::irAttr() const {
+  std::string s = std::to_string(rows) + "x" + std::to_string(cols);
+  if (!is_rectangular && !cgra_positions.empty()) {
+    s += "[";
+    for (auto &[c, r] : cgra_positions)
+      s += "(" + std::to_string(c) + "," + std::to_string(r) + ")";
+    s += "]";
+  }
+  return s;
+}
+
+//===----------------------------------------------------------------------===//
+// Internal helpers
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+// Returns all valid rectangular shapes for `cgra_count` CGRAs. The double
+// loop over (r, c) yields each orientation exactly once (both 1×4 and 4×1;
+// squares like 2×2 once), so no explicit deduplication is required.
+SmallVector<CgraShape> getRectangularShapes(int cgra_count) {
+  SmallVector<CgraShape> shapes;
+  for (int r = 1; r <= kCgraGridRows; ++r) {
+    for (int c = 1; c <= kCgraGridCols; ++c) {
+      if (r * c == cgra_count)
+        shapes.push_back(
+            {r, c, /*is_rectangular=*/true, /*cgra_positions=*/{}});
+    }
+  }
+  return shapes;
+}
+
+// Returns the set of non-rectangular shapes for `cgra_count` CGRAs.
+// Currently defined for cgra_count == 3 (L-shape) and cgra_count == 4
+// (L-shape and T-shape variants).
+SmallVector<CgraShape> getNonRectangularShapes(int cgra_count) {
+  SmallVector<CgraShape> shapes;
+
+  if (cgra_count == 3) {
+    // L-shape 3 CGRAs: (0,0)(1,0)(0,1) — bbox 2×2
+    shapes.push_back({2, 2, false, {{0, 0}, {1, 0}, {0, 1}}});
+  }
+
+  if (cgra_count == 4) {
+    // T-shape: three in a row + one below centre
+    // (0,0)(1,0)(2,0)(1,1) — bbox 2×3
+    shapes.push_back({2, 3, false, {{0, 0}, {1, 0}, {2, 0}, {1, 1}}});
+
+    // L-shape: three in a column + one offset
+    // (0,0)(0,1)(0,2)(1,2) — bbox 3×2
+    shapes.push_back({3, 2, false, {{0, 0}, {0, 1}, {0, 2}, {1, 2}}});
+  }
+
+  return shapes;
+}
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// getAllPlacementShapes
+//===----------------------------------------------------------------------===//
+
+llvm::SmallVector<CgraShape>
+mlir::taskflow::getAllPlacementShapes(int cgra_count) {
+  // 1. Rectangular shapes with both orientations (the enumeration in
+  //    getRectangularShapes already yields each orientation exactly once).
+  SmallVector<CgraShape> shapes = getRectangularShapes(cgra_count);
+
+  // Sorts rectangles: prefer more square-like (smaller |rows-cols|), then
+  // smaller bounding-box area as tiebreaker.
+  llvm::sort(shapes, [](const CgraShape &lhs, const CgraShape &rhs) {
+    int squareness_lhs = std::abs(lhs.rows - lhs.cols);
+    int squareness_rhs = std::abs(rhs.rows - rhs.cols);
+    if (squareness_lhs != squareness_rhs)
+      return squareness_lhs < squareness_rhs;
+    return lhs.area() < rhs.area();
+  });
+
+  // 2. Non-rectangular shapes with all four 90° rotations.
+  auto base_non_rect = getNonRectangularShapes(cgra_count);
+  for (const auto &base : base_non_rect) {
+    // Generates 4 rotations of the cgra_positions list.
+    // Rotation by 90° CW: (col, row) -> (row, -col).
+    // Each rotation is normalised so that offsets start from (0, 0).
+    SmallVector<SmallVector<std::pair<int, int>>, 4> rotation_variants;
+    rotation_variants.push_back(
+        SmallVector<std::pair<int, int>>(base.cgra_positions));
+
+    auto prev_positions = base.cgra_positions;
+    for (int rotation_idx = 0; rotation_idx < 3; ++rotation_idx) {
+      SmallVector<std::pair<int, int>> rotated_positions;
+      for (auto &[col_off, row_off] : prev_positions)
+        rotated_positions.push_back(
+            {row_off, -col_off}); // 90° CW in (col, row) space
+
+      // Normalises to non-negative offsets starting from (0, 0).
+      int min_col = INT_MAX, min_row = INT_MAX;
+      for (auto &[col_off, row_off] : rotated_positions) {
+        min_col = std::min(min_col, col_off);
+        min_row = std::min(min_row, row_off);
+      }
+      for (auto &[col_off, row_off] : rotated_positions) {
+        col_off -= min_col;
+        row_off -= min_row;
+      }
+      rotation_variants.push_back(rotated_positions);
+      prev_positions = rotated_positions;
+    }
+
+    // Deduplicates rotations that produce the same position set.
+    // Hash parameters: multiplier 131 and positional weight 17 are chosen to
+    // give low collision rates for small integer coordinate sets.
+    llvm::DenseSet<int64_t> seen_hashes;
+    for (auto &positions : rotation_variants) {
+      auto sorted_positions = positions;
+      llvm::sort(sorted_positions); // std::pair already orders lexicographically
+      int64_t hash = 0;
+      for (auto &[col_off, row_off] : sorted_positions)
+        hash = hash * 131 + col_off * 17 + row_off;
+      if (!seen_hashes.insert(hash).second) {
+        continue;
+      }
+      // Computes bounding box for this rotation.
+      int max_col = 0, max_row = 0;
+      for (auto &[col_off, row_off] : positions) {
+        max_col = std::max(max_col, col_off);
+        max_row = std::max(max_row, row_off);
+      }
+      shapes.push_back({max_row + 1, max_col + 1, false, std::move(positions)});
+    }
+  }
+
+  return shapes;
+}
+
+//===----------------------------------------------------------------------===//
+// canAllTasksFitOnGrid
+//===----------------------------------------------------------------------===//
+
+bool mlir::taskflow::canAllTasksFitOnGrid(ArrayRef<int> task_cgra_counts) {
+  constexpr int kTotalCGRAs = kCgraGridRows * kCgraGridCols;
+
+  // Quick capacity check: total CGRAs must not exceed grid size.
+  int total_cgras = 0;
+  for (int count : task_cgra_counts)
+    total_cgras += count;
+  if (total_cgras > kTotalCGRAs) {
+    return false;
+  }
+
+  // Simulates placement on a grid.
+  bool occupied[kCgraGridRows][kCgraGridCols] = {};
+
+  // Sorts tasks by descending cgra_count for better packing (largest-first
+  // decreasing, a standard bin-packing heuristic). Each task may have a
+  // different cgra_count because the balance phase only increments one
+  // bottleneck at a time; this array reflects the heterogeneous allocation
+  // across all tasks in the current trial configuration.
+  SmallVector<int> sorted_counts(task_cgra_counts.begin(),
+                                 task_cgra_counts.end());
+  llvm::sort(sorted_counts, [](int lhs, int rhs) { return lhs > rhs; });
+
+  for (int cgra_count : sorted_counts) {
+    SmallVector<CgraShape> candidates = getAllPlacementShapes(cgra_count);
+    bool placed = false;
+
+    for (const auto &shape : candidates) {
+      if (placed)
+        break;
+
+      if (shape.is_rectangular) {
+        // Rectangular: tries every origin where the rows×cols bbox fits.
+        for (int origin_row = 0;
+             origin_row <= kCgraGridRows - shape.rows && !placed;
+             ++origin_row) {
+          for (int origin_col = 0;
+               origin_col <= kCgraGridCols - shape.cols && !placed;
+               ++origin_col) {
+            bool fits = true;
+            for (int delta_row = 0; delta_row < shape.rows && fits; ++delta_row)
+              for (int delta_col = 0; delta_col < shape.cols && fits;
+                   ++delta_col)
+                if (occupied[origin_row + delta_row][origin_col + delta_col])
+                  fits = false;
+            if (fits) {
+              for (int delta_row = 0; delta_row < shape.rows; ++delta_row)
+                for (int delta_col = 0; delta_col < shape.cols; ++delta_col)
+                  occupied[origin_row + delta_row][origin_col + delta_col] =
+                      true;
+              placed = true;
+            }
+          }
+        }
+      } else {
+        // Non-rectangular: cgra_positions stores (col, row) offsets.
+        for (int origin_row = 0; origin_row < kCgraGridRows && !placed;
+             ++origin_row) {
+          for (int origin_col = 0; origin_col < kCgraGridCols && !placed;
+               ++origin_col) {
+            bool fits = true;
+            for (auto &[col_off, row_off] : shape.cgra_positions) {
+              int abs_row = origin_row + row_off;
+              int abs_col = origin_col + col_off;
+              if (abs_row < 0 || abs_row >= kCgraGridRows || abs_col < 0 ||
+                  abs_col >= kCgraGridCols || occupied[abs_row][abs_col]) {
+                fits = false;
+                break;
+              }
+            }
+            if (fits) {
+              for (auto &[col_off, row_off] : shape.cgra_positions)
+                occupied[origin_row + row_off][origin_col + col_off] = true;
+              placed = true;
+            }
+          }
+        }
+      }
+    }
+
+    if (!placed) {
+      return false;
+    }
+  }
+  return true;
+}
diff --git a/lib/TaskflowDialect/Allocation/allocation_utils_mapper.cpp b/lib/TaskflowDialect/Allocation/allocation_utils_mapper.cpp
new file mode 100644
index 00000000..e23d2ae8
--- /dev/null
+++ b/lib/TaskflowDialect/Allocation/allocation_utils_mapper.cpp
@@ -0,0 +1,728 @@
+//===- allocation_utils_mapper.cpp - Task-to-CGRA mapping implementation --===//
+//
+// Implements runAllocateCgraToTask and the internal TaskMapper used by
+// AllocateCgraToTaskPass. Kept under TaskflowDialect/Allocation per code review.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TaskflowDialect/TaskflowDialect.h"
+#include "TaskflowDialect/TaskflowOps.h"
+#include "TaskflowDialect/Allocation/allocation_utils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <set>
+#include <vector>
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// CGRA Grid Position
+//===----------------------------------------------------------------------===//
+/// Represents a position on the 2D CGRA grid.
+struct CgraPosition {
+  int row;
+  int col;
+
+  bool operator==(const CgraPosition &other) const {
+    return row == other.row && col == other.col;
+  }
+
+  bool operator!=(const CgraPosition &other) const { return !(*this == other); }
+
+  int manhattanDistance(const CgraPosition &other) const {
+    return std::abs(row - other.row) + std::abs(col - other.col);
+  }
+
+  /// Returns true if the two positions are directly adjacent (Manhattan
+  /// distance == 1), i.e. share an edge on the grid.
+  bool isAdjacent(const CgraPosition &other) const {
+    return manhattanDistance(other) == 1;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Task Placement Info
+//===----------------------------------------------------------------------===//
+/// Stores the placement result for a task: the set of CGRAs assigned to it.
+/// A task can span one or more contiguous CGRAs (rectangular or non-rect).
+struct TaskPlacement {
+  SmallVector<CgraPosition> cgra_positions; // CGRAs assigned to this task.
+
+  /// Returns the primary (first) CGRA position.
+  CgraPosition primary() const {
+    return cgra_positions.empty() ? CgraPosition{-1, -1} : cgra_positions[0];
+  }
+
+  /// Returns the number of CGRAs assigned to this task.
+  size_t cgraCount() const { return cgra_positions.size(); }
+
+  /// Returns true if any CGRA in this task is grid-adjacent to any CGRA
+  /// in `other`, indicating that direct data forwarding between tasks is
+  /// possible without going through the network.
+  bool hasTaskAdjacentCgra(const TaskPlacement &other) const {
+    for (const auto &pos : cgra_positions) {
+      for (const auto &other_pos : other.cgra_positions) {
+        if (pos.isAdjacent(other_pos)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Task-Memory Graph
+//===----------------------------------------------------------------------===//
+
+struct MemoryNode;
+
+/// Represents a Task node in the dependency graph.
+struct TaskNode {
+  size_t id;
+  TaskflowTaskOp op;
+  int dependency_depth = 0; // Longest path to any sink in the dependency graph.
+
+  // Edges based on original (pre-streaming-fusion) memory accesses.
+  SmallVector<MemoryNode *> read_memrefs;  // MemoryNodes this task reads.
+  SmallVector<MemoryNode *> write_memrefs; // MemoryNodes this task writes.
+  // SSA value edges between tasks.
+  SmallVector<TaskNode *> ssa_users;    // Tasks that consume this task's output.
+  SmallVector<TaskNode *> ssa_operands; // Tasks whose output this task consumes.
+
+  // Placement result — populated by TaskMapper::place().
+  SmallVector<CgraPosition> placement;
+
+  TaskNode(size_t id, TaskflowTaskOp op) : id(id), op(op) {}
+};
+
+/// Represents a MemRef node in the dependency graph.
+struct MemoryNode {
+  Value memref;
+
+  // Access edges.
+  SmallVector<TaskNode *> readers; // Tasks that read this memref.
+  SmallVector<TaskNode *> writers; // Tasks that write this memref.
+
+  // SRAM assignment result — populated by TaskMapper::assignAllSrams().
+  std::optional<CgraPosition> assigned_sram_pos;
+
+  MemoryNode(Value memref) : memref(memref) {}
+};
+
+class TaskMemoryGraph {
+public:
+  SmallVector<std::unique_ptr<TaskNode>> task_nodes;
+  SmallVector<std::unique_ptr<MemoryNode>> memory_nodes;
+  DenseMap<Value, MemoryNode *> memref_to_node;
+  DenseMap<Operation *, TaskNode *> op_to_node;
+
+  void build(func::FuncOp func) {
+    // Phase 1: Create a TaskNode for every TaskflowTaskOp in the function.
+    size_t task_id = 0;
+    func.walk([&](TaskflowTaskOp task) {
+      auto node = std::make_unique<TaskNode>(task_id++, task);
+      op_to_node[task] = node.get();
+      task_nodes.push_back(std::move(node));
+    });
+
+    // Phase 2: Create MemoryNodes using ORIGINAL memrefs (canonical identity).
+    // Uses original_read_memrefs / original_write_memrefs so that aliased
+    // memories (created by streaming-fusion) share the same MemoryNode.
+    for (auto &t_node : task_nodes) {
+      // Uses original_read_memrefs for canonical memory identity.
+      for (Value orig_memref : t_node->op.getOriginalReadMemrefs()) {
+        MemoryNode *m_node = getOrCreateMemoryNode(orig_memref);
+        t_node->read_memrefs.push_back(m_node);
+        m_node->readers.push_back(t_node.get());
+      }
+      // Uses original_write_memrefs for canonical memory identity.
+      for (Value orig_memref : t_node->op.getOriginalWriteMemrefs()) {
+        MemoryNode *m_node = getOrCreateMemoryNode(orig_memref);
+        t_node->write_memrefs.push_back(m_node);
+        m_node->writers.push_back(t_node.get());
+      }
+    }
+
+    // Phase 3: Build SSA edges (inter-task value dependencies).
+    // A consumer task directly uses a value produced by a producer task.
+    for (auto &consumer_node : task_nodes) {
+      // Walks the task's declared value inputs; every operand defined by
+      // another task op becomes an SSA edge.
+      for (Value operand : consumer_node->op.getValueInputs()) {
+        if (Operation *producer_op = operand.getDefiningOp()) {
+          // lookup() (not operator[]) avoids inserting null entries for
+          // producers that are not task ops.
+          if (TaskNode *producer_node = op_to_node.lookup(producer_op)) {
+            producer_node->ssa_users.push_back(consumer_node.get());
+            consumer_node->ssa_operands.push_back(producer_node);
+          }
+        }
+      }
+    }
+  }
+
+private:
+  MemoryNode *getOrCreateMemoryNode(Value memref) {
+    // Single hash lookup; count()+operator[] would probe the map twice.
+    if (MemoryNode *existing = memref_to_node.lookup(memref)) {
+      return existing;
+    }
+
+    auto node = std::make_unique<MemoryNode>(memref);
+    MemoryNode *ptr = node.get();
+    memref_to_node[memref] = ptr;
+    memory_nodes.push_back(std::move(node));
+    return ptr;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+/// Maps a task-memory graph onto a 2D CGRA grid.
+///
+/// Uses a two-phase fixed-point iteration:
+///   Phase 1: Place tasks on the grid (scoring by SSA + memory proximity).
+///   Phase 2: Assign each MemRef to the nearest SRAM given task positions.
+/// Iterates until SRAM assignments converge (critical-path-first ordering).
+class TaskMapper {
+public:
+  TaskMapper(int grid_rows, int grid_cols)
+      : grid_rows_(grid_rows), grid_cols_(grid_cols) {
+    occupied_.resize(grid_rows_);
+    for (auto &row : occupied_) {
+      row.resize(grid_cols_, false);
+    }
+  }
+
+  /// Places all tasks and performs iterative SRAM assignment for `func`.
+  void place(func::FuncOp func) {
+    SmallVector<TaskflowTaskOp> tasks;
+    func.walk([&](TaskflowTaskOp task) { tasks.push_back(task); });
+
+    if (tasks.empty()) {
+      llvm::errs() << "No tasks to place.\n";
+      return;
+    }
+
+    // Builds Task-Memory Graph.
+    TaskMemoryGraph graph;
+    graph.build(func);
+
+    if (graph.task_nodes.empty()) {
+      llvm::errs() << "No tasks to place.\n";
+      return;
+    }
+
+    // Computes dependency depth for each task.
+    // Dependency depth = longest path from this node to any sink node in the
+    // dependency graph (via SSA or memory edges). Tasks with higher depth
+    // have longer dependent chains after them; placing them first gives their
+    // successors the best chance of landing on adjacent grid cells.
+    computeDependencyDepth(graph);
+
+    // Sorts tasks by dependency depth (Critical Path First).
+    SmallVector<TaskNode *> sorted_tasks;
+    for (auto &node : graph.task_nodes)
+      sorted_tasks.push_back(node.get());
+
+    std::stable_sort(sorted_tasks.begin(), sorted_tasks.end(),
+                     [](TaskNode *a, TaskNode *b) {
+                       return a->dependency_depth > b->dependency_depth;
+                     });
+
+    // Fixed-point iteration: task placement scoring depends on SRAM
+    // positions (memory proximity), and SRAM assignment depends on task
+    // positions (centroid of accessing tasks). Each iteration re-places
+    // all tasks using the latest SRAM assignments, then re-assigns SRAMs.
+    // Converges when SRAM assignments stabilise (no change between iters).
+    constexpr int kMaxIterations = 10;
+
+    for (int iter = 0; iter < kMaxIterations; ++iter) {
+      if (iter > 0) {
+        resetTaskPlacements(graph);
+      }
+
+      // Phase 1: Place tasks (scoring uses current SRAM assignments).
+      for (TaskNode *task_node : sorted_tasks) {
+        int cgra_count = 1;
+        if (auto attr =
+                task_node->op->getAttrOfType<IntegerAttr>("cgra_count")) {
+          cgra_count = attr.getInt();
+        }
+
+        TaskPlacement placement =
+            findBestPlacement(task_node, cgra_count, graph);
+
+        assert(!placement.cgra_positions.empty() &&
+               "findBestPlacement must succeed: cgra_count should be "
+               "validated by the upstream resource-aware optimization pass "
+               "or manually assigned resource binding attributes");
+
+        // Commits placement and marks occupied grid cells.
+        for (const auto &pos : placement.cgra_positions) {
+          task_node->placement.push_back(pos);
+          if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 &&
+              pos.col < grid_cols_) {
+            occupied_[pos.row][pos.col] = true;
+          }
+        }
+      }
+
+      // Phase 2: Assign SRAMs (assuming fixed task positions).
+      // If nothing moved, task scores won't change → convergence reached.
+      bool sram_moved = assignAllSrams(graph);
+
+      if (iter > 0 && !sram_moved) {
+        break;
+      }
+    }
+
+    // Annotates result: writes task_mapping_info attribute to each task op.
+    OpBuilder builder(func.getContext());
+    for (auto &task_node : graph.task_nodes) {
+      if (task_node->placement.empty()) {
+        continue;
+      }
+
+      SmallVector<NamedAttribute> mapping_attrs;
+
+      // 1. CGRA positions.
+      SmallVector<Attribute> pos_attrs;
+      for (const auto &pos : task_node->placement) {
+        SmallVector<NamedAttribute> coord_attrs;
+        coord_attrs.push_back(
+            NamedAttribute(StringAttr::get(func.getContext(), "row"),
+                           builder.getI32IntegerAttr(pos.row)));
+        coord_attrs.push_back(
+            NamedAttribute(StringAttr::get(func.getContext(), "col"),
+                           builder.getI32IntegerAttr(pos.col)));
+        pos_attrs.push_back(
+            DictionaryAttr::get(func.getContext(), coord_attrs));
+      }
+      mapping_attrs.push_back(
+          NamedAttribute(StringAttr::get(func.getContext(), "cgra_positions"),
+                         builder.getArrayAttr(pos_attrs)));
+
+      // 2. Read SRAM locations.
+      SmallVector<Attribute> read_sram_attrs;
+      for (MemoryNode *mem : task_node->read_memrefs) {
+        if (mem->assigned_sram_pos) {
+          SmallVector<NamedAttribute> sram_coord;
+          sram_coord.push_back(NamedAttribute(
+              StringAttr::get(func.getContext(), "row"),
+              builder.getI32IntegerAttr(mem->assigned_sram_pos->row)));
+          sram_coord.push_back(NamedAttribute(
+              StringAttr::get(func.getContext(), "col"),
+              builder.getI32IntegerAttr(mem->assigned_sram_pos->col)));
+          read_sram_attrs.push_back(
+              DictionaryAttr::get(func.getContext(), sram_coord));
+        }
+      }
+      mapping_attrs.push_back(NamedAttribute(
+          StringAttr::get(func.getContext(), "read_sram_locations"),
+          builder.getArrayAttr(read_sram_attrs)));
+
+      // 3. Write SRAM locations.
+      SmallVector<Attribute> write_sram_attrs;
+      for (MemoryNode *mem : task_node->write_memrefs) {
+        if (mem->assigned_sram_pos) {
+          SmallVector<NamedAttribute> sram_coord;
+          sram_coord.push_back(NamedAttribute(
+              StringAttr::get(func.getContext(), "row"),
+              builder.getI32IntegerAttr(mem->assigned_sram_pos->row)));
+          sram_coord.push_back(NamedAttribute(
+              StringAttr::get(func.getContext(), "col"),
+              builder.getI32IntegerAttr(mem->assigned_sram_pos->col)));
+
+          write_sram_attrs.push_back(
+              DictionaryAttr::get(func.getContext(), sram_coord));
+        }
+      }
+      mapping_attrs.push_back(NamedAttribute(
+          StringAttr::get(func.getContext(), "write_sram_locations"),
+          builder.getArrayAttr(write_sram_attrs)));
+
+      // Sets task_mapping_info attribute on the task op.
+      task_node->op->setAttr(
+          "task_mapping_info",
+          DictionaryAttr::get(func.getContext(), mapping_attrs));
+    }
+  }
+
+private:
+  /// Clears all task placements and resets the occupied-cell grid.
+  void resetTaskPlacements(TaskMemoryGraph &graph) {
+    for (auto &task : graph.task_nodes) {
+      task->placement.clear();
+    }
+    // Clears grid.
+    for (int r = 0; r < grid_rows_; ++r) {
+      std::fill(occupied_[r].begin(), occupied_[r].end(), false);
+    }
+  }
+
+  /// Assigns each MemoryNode to the SRAM at the centroid of all CGRAs that
+  /// access it (readers + writers). Returns true if any assignment changed,
+  /// which is used as the convergence criterion for the outer iteration loop.
+  bool assignAllSrams(TaskMemoryGraph &graph) {
+    bool changed = false;
+    for (auto &mem_node : graph.memory_nodes) {
+      int total_row = 0, total_col = 0, count = 0;
+      // Computes centroid of all tasks that read this memory.
+      for (TaskNode *reader : mem_node->readers) {
+        for (const CgraPosition &pos : reader->placement) {
+          total_row += pos.row;
+          total_col += pos.col;
+          count++;
+        }
+      }
+      // Computes centroid of all tasks that write this memory.
+      for (TaskNode *writer : mem_node->writers) {
+        for (const CgraPosition &pos : writer->placement) {
+          total_row += pos.row;
+          total_col += pos.col;
+          count++;
+        }
+      }
+
+      std::optional<CgraPosition> new_sram_pos;
+      if (count > 0) {
+        // Rounds to the nearest integer (round-half-up).
+        int avg_row = (total_row + count / 2) / count;
+        int avg_col = (total_col + count / 2) / count;
+        new_sram_pos = CgraPosition{avg_row, avg_col};
+      }
+
+      if (mem_node->assigned_sram_pos != new_sram_pos) {
+        mem_node->assigned_sram_pos = new_sram_pos;
+        changed = true;
+      }
+    }
+    return changed;
+  }
+
+  // Parses a tile_shape string like "2x2" or "2x2[(0,0)(1,0)(0,1)]".
+  // Returns (col, row) offsets relative to the placement origin.
+  // Reserved for IR-driven tile shapes; placement currently uses implicit
+  // rectangular enumeration in findBestPlacement.
+  SmallVector<std::pair<int, int>> parseTileShapeOffsets(StringRef tile_shape,
+                                                         int cgra_count) {
+    SmallVector<std::pair<int, int>> offsets;
+
+    if (tile_shape.empty() || cgra_count <= 1) {
+      offsets.push_back({0, 0});
+      return offsets;
+    }
+
+    size_t bracket_pos = tile_shape.find('[');
+    if (bracket_pos != StringRef::npos) {
+      StringRef positions_str = tile_shape.substr(bracket_pos);
+      size_t pos = 0;
+      while (pos < positions_str.size()) {
+        size_t open = positions_str.find('(', pos);
+        if (open == StringRef::npos)
+          break;
+        size_t close = positions_str.find(')', open);
+        if (close == StringRef::npos)
+          break;
+        StringRef pair_str = positions_str.slice(open + 1, close);
+        auto [col_str, row_str] = pair_str.split(',');
+        int col_off = 0, row_off = 0;
+        col_str.getAsInteger(10, col_off);
+        row_str.getAsInteger(10, row_off);
+        offsets.push_back({col_off, row_off});
+        pos = close + 1;
+      }
+    } else {
+      auto [rows_str, cols_str] = tile_shape.split('x');
+      int rows = 1, cols = 1;
+      rows_str.getAsInteger(10, rows);
+      cols_str.getAsInteger(10, cols);
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          offsets.push_back({c, r});
+        }
+      }
+    }
+
+    assert(!offsets.empty() && "tile_shape parsing yielded empty offsets");
+    return offsets;
+  }
+
+  // Finds the best placement for `task_node` requiring exactly `cgra_count`
+  // CGRAs. Strategy:
+  //   1. Rectangular: tries all (rows × cols) factorizations of cgra_count,
+  //      preferring square-like shapes (lower |rows-cols|). For each shape,
+  //      sweeps every origin on the grid and picks the highest-scoring free
+  //      position.
+  //   2. Non-rectangular fallback: if no rectangle fits (fragmented grid),
+  //      runs a polyomino DFS (tryNonRectShapes) to find any connected
+  //      k-CGRA cluster.
+  // Returns an empty TaskPlacement only if the grid is completely full
+  // (should not happen if cgra_count was validated upstream).
+  TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count,
+                                  TaskMemoryGraph &graph) {
+    // Phase 1: try all rectangular factorizations of cgra_count, most
+    // square-like first (smallest |rows - cols|) so the documented
+    // square-preference actually holds — iterating rows = 1..n directly
+    // would try the most elongated 1×n strip first.
+    SmallVector<std::pair<int, int>> factorizations;
+    for (int rows = 1; rows <= cgra_count; ++rows) {
+      if (cgra_count % rows == 0) {
+        factorizations.push_back({rows, cgra_count / rows});
+      }
+    }
+    std::stable_sort(factorizations.begin(), factorizations.end(),
+                     [](const std::pair<int, int> &lhs,
+                        const std::pair<int, int> &rhs) {
+                       return std::abs(lhs.first - lhs.second) <
+                              std::abs(rhs.first - rhs.second);
+                     });
+
+    for (auto &[rows, cols] : factorizations) {
+      // Builds the list of (col_offset, row_offset) cells relative to the
+      // top-left origin for this rows×cols rectangle.
+      SmallVector<std::pair<int, int>> shape_offsets;
+      for (int shape_row = 0; shape_row < rows; ++shape_row) {
+        for (int shape_col = 0; shape_col < cols; ++shape_col) {
+          shape_offsets.push_back({shape_col, shape_row});
+        }
+      }
+
+      // Sweeps every valid origin on the grid and keeps the highest-scoring
+      // non-overlapping placement for this shape.
+      int best_score = INT_MIN;
+      TaskPlacement best_placement;
+      for (int origin_row = 0; origin_row < grid_rows_; ++origin_row) {
+        for (int origin_col = 0; origin_col < grid_cols_; ++origin_col) {
+          // Checks that every cell of the rectangle is within bounds and free.
+          bool valid = true;
+          TaskPlacement candidate;
+          for (auto &[col_off, row_off] : shape_offsets) {
+            int abs_row = origin_row + row_off;
+            int abs_col = origin_col + col_off;
+            if (abs_row < 0 || abs_row >= grid_rows_ || abs_col < 0 ||
+                abs_col >= grid_cols_ || occupied_[abs_row][abs_col]) {
+              valid = false;
+              break;
+            }
+            candidate.cgra_positions.push_back({abs_row, abs_col});
+          }
+          if (!valid) {
+            continue;
+          }
+          // Scores the candidate by proximity to dependent tasks and SRAMs.
+          int score = computeScore(task_node, candidate, graph);
+          if (score > best_score) {
+            best_score = score;
+            best_placement = candidate;
+          }
+        }
+      }
+      // Returns the best placement found for this shape, if any.
+      if (!best_placement.cgra_positions.empty()) {
+        return best_placement;
+      }
+    }
+
+    // Phase 2: no rectangle fit — try non-rectangular connected shapes via DFS.
+    if (cgra_count > 1) {
+      TaskPlacement p = tryNonRectShapes(task_node, cgra_count, graph);
+      if (!p.cgra_positions.empty()) {
+        return p;
+      }
+    }
+
+    return {};
+  }
+
+  TaskPlacement tryNonRectShapes(TaskNode *task_node, int k,
+                                 TaskMemoryGraph &graph) {
+    std::set<uint64_t> visited_masks;
+    int best_score = INT_MIN;
+    TaskPlacement best_placement;
+
+    std::function<void(SmallVector<CgraPosition> &, uint64_t)> search =
+        [&](SmallVector<CgraPosition> &current, uint64_t mask) {
+          if ((int)current.size() == k) {
+            if (visited_masks.insert(mask).second) {
+              TaskPlacement candidate;
+              candidate.cgra_positions = current;
+              int score = computeScore(task_node, candidate, graph);
+              if (score > best_score) {
+                best_score = score;
+                best_placement = candidate;
+              }
+            }
+            return;
+          }
+          // Explores all 4-connected neighbours of every cell already in the
+          // current polyomino. delta_row/delta_col encode the four cardinal
+          // directions: up, down, left, right.
+ constexpr int delta_row[] = {-1, 1, 0, 0}; + constexpr int delta_col[] = {0, 0, -1, 1}; + for (size_t i = 0; i < current.size(); ++i) { + const CgraPosition &cell = current[i]; + for (int dir = 0; dir < 4; ++dir) { + int next_row = cell.row + delta_row[dir]; + int next_col = cell.col + delta_col[dir]; + if (next_row >= 0 && next_row < grid_rows_ && next_col >= 0 && + next_col < grid_cols_ && !occupied_[next_row][next_col]) { + uint64_t bit = 1ULL << (next_row * grid_cols_ + next_col); + if ((mask & bit) == 0) { + current.push_back({next_row, next_col}); + search(current, mask | bit); + current.pop_back(); + } + } + } + } + }; + + // Seeds the DFS from every free cell on the grid. + for (int seed_row = 0; seed_row < grid_rows_; ++seed_row) { + for (int seed_col = 0; seed_col < grid_cols_; ++seed_col) { + if (!occupied_[seed_row][seed_col]) { + SmallVector start = {{seed_row, seed_col}}; + search(start, 1ULL << (seed_row * grid_cols_ + seed_col)); + } + } + } + return best_placement; + } + + /// Computes the placement score for `task_node` at `placement`. + /// + /// Score = α·SSA_Dist + β·Mem_Dist. + /// SSA_Dist : sum of distances to already-placed SSA predecessors and + /// successors (negative; penalises far-away neighbours). + /// Mem_Dist : sum of distances to assigned SRAMs for read/write memrefs + /// (negative; memory proximity is weighted more heavily). + /// + /// Higher score is better; 0 means all neighbours are co-located. + int computeScore(TaskNode *task_node, const TaskPlacement &placement, + TaskMemoryGraph &graph) { + // Weight constants (tunable). + constexpr int kAlpha = 10; // SSA proximity weight. + constexpr int kBeta = 50; // Memory proximity weight (high priority). 
+ + int ssa_score = 0; + int mem_score = 0; + + auto minDistToPlacement = + [&](const SmallVector &other) -> int { + int min_dist = INT_MAX; + for (const auto &pos : placement.cgra_positions) { + for (const auto &opos : other) { + min_dist = std::min(min_dist, pos.manhattanDistance(opos)); + } + } + return min_dist; + }; + + auto minDistToTarget = [&](const CgraPosition &target) -> int { + int min_dist = INT_MAX; + for (const auto &pos : placement.cgra_positions) { + min_dist = std::min(min_dist, pos.manhattanDistance(target)); + } + return min_dist; + }; + + // 1. SSA proximity — penalise distance to producers and consumers. + for (TaskNode *producer : task_node->ssa_operands) { + if (!producer->placement.empty()) { + // Uses negative distance: closer = higher score. + int dist = minDistToPlacement(producer->placement); + ssa_score -= dist; + } + } + for (TaskNode *consumer : task_node->ssa_users) { + if (!consumer->placement.empty()) { + int dist = minDistToPlacement(consumer->placement); + ssa_score -= dist; + } + } + + // 2. Memory proximity — penalise distance to assigned SRAMs. + // For read memrefs (data sources). + for (MemoryNode *mem : task_node->read_memrefs) { + if (mem->assigned_sram_pos) { + int dist = minDistToTarget(*mem->assigned_sram_pos); + mem_score -= dist; + } + } + // For write memrefs: if the SRAM is already assigned (e.g. read by a + // previous task), we want to be close to it too. + for (MemoryNode *mem : task_node->write_memrefs) { + if (mem->assigned_sram_pos) { + int dist = minDistToTarget(*mem->assigned_sram_pos); + mem_score -= dist; + } + } + + return kAlpha * ssa_score + kBeta * mem_score; + } + + /// Computes dependency depth for every task in the graph. + /// + /// Dependency depth = longest path from a node to any sink in the dependency + /// graph (traversing both SSA and write→read memory edges). + /// + /// Tasks with higher dependency depth have longer chains of dependent tasks + /// downstream. 
Placing them first (critical-path-first) ensures that: + /// 1. They receive priority access to good grid positions. + /// 2. Their dependent tasks can later be placed adjacent, minimising + /// inter-task communication distance. + void computeDependencyDepth(TaskMemoryGraph &graph) { + DenseMap depth_cache; + for (auto &node : graph.task_nodes) { + node->dependency_depth = calculateDepth(node.get(), depth_cache); + } + } + + /// Recursively calculates dependency depth for a single task (memoised). + int calculateDepth(TaskNode *node, DenseMap &depth_cache) { + if (depth_cache.count(node)) { + return depth_cache[node]; + } + + int max_child_depth = 0; + // SSA dependencies: tasks that consume this task's output values. + for (TaskNode *child : node->ssa_users) { + max_child_depth = + std::max(max_child_depth, calculateDepth(child, depth_cache) + 1); + } + + // Memory dependencies: Producer → Mem → Consumer write-after-read chains. + for (MemoryNode *mem : node->write_memrefs) { + for (TaskNode *reader : mem->readers) { + if (reader != node) { + max_child_depth = std::max(max_child_depth, + calculateDepth(reader, depth_cache) + 1); + } + } + } + + return depth_cache[node] = max_child_depth; + } + + int grid_rows_; + int grid_cols_; + std::vector> occupied_; +}; + +} // namespace + +namespace mlir { +namespace taskflow { + +void runAllocateCgraToTask(func::FuncOp func, int grid_rows, int grid_cols) { + TaskMapper mapper(grid_rows, grid_cols); + mapper.place(func); +} + +} // namespace taskflow +} // namespace mlir diff --git a/lib/TaskflowDialect/CMakeLists.txt b/lib/TaskflowDialect/CMakeLists.txt index 49d60c57..cdb02d55 100644 --- a/lib/TaskflowDialect/CMakeLists.txt +++ b/lib/TaskflowDialect/CMakeLists.txt @@ -13,5 +13,5 @@ add_mlir_dialect_library(MLIRTaskflow MLIRInferTypeOpInterface ) -add_subdirectory(Transforms) -add_subdirectory(Transforms/Optimizations) \ No newline at end of file +add_subdirectory(Allocation) +add_subdirectory(Transforms) \ No newline at 
end of file diff --git a/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp b/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp new file mode 100644 index 00000000..cf298fce --- /dev/null +++ b/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp @@ -0,0 +1,52 @@ +//===- AllocateCgraToTaskPass.cpp - Task to CGRA Mapping Pass -===// +// +// This pass maps Taskflow tasks onto a 2D CGRA grid array: +// 1. Places tasks with SSA dependencies on adjacent CGRAs. +// 2. Assigns memrefs to SRAMs (each MemRef is assigned to exactly one SRAM, +// determined by proximity to the task that first accesses it). +// +// Implementation lives in lib/TaskflowDialect/Allocation/allocation_utils_mapper.cpp +// (runAllocateCgraToTask). +// +//===----------------------------------------------------------------------===// + +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowPasses.h" +#include "TaskflowDialect/Allocation/allocation_utils.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { + +struct AllocateCgraToTaskPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AllocateCgraToTaskPass) + + AllocateCgraToTaskPass() = default; + + StringRef getArgument() const override { return "allocate-cgra-to-task"; } + + StringRef getDescription() const override { + return "Maps Taskflow tasks onto a 2D CGRA grid with adjacency " + "optimization and memory mapping."; + } + + void runOnOperation() override { + runAllocateCgraToTask(getOperation(), kCgraGridRows, kCgraGridCols); + } +}; + +} // namespace + +namespace mlir { +namespace taskflow { + +std::unique_ptr createAllocateCgraToTaskPass() { + return std::make_unique(); +} + +} // namespace taskflow +} // namespace mlir diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index 60078298..ae722f21 100644 --- 
a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -3,7 +3,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp ClassifyCountersPass.cpp - MapTaskOnCgraPass.cpp + AllocateCgraToTaskPass.cpp FuseTaskPass.cpp DEPENDS @@ -15,9 +15,12 @@ add_mlir_library(MLIRTaskflowTransforms MLIRSupport MLIRTransforms MLIRTaskflow + MLIRTaskflowAllocation MLIRNeura MLIRNeuraTransforms MLIRConversion ${dialect_libs} LLVMSupport -) \ No newline at end of file +) + +add_subdirectory(Optimizations) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp deleted file mode 100644 index d8225ece..00000000 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ /dev/null @@ -1,593 +0,0 @@ -//===- MapTaskOnCgraPass.cpp - Task to CGRA Mapping Pass ----------------===// -// -// This pass maps Taskflow tasks onto a 2D CGRA grid array: -// 1. Places tasks with SSA dependencies on adjacent CGRAs. -// 2. Assigns memrefs to SRAMs (each MemRef is assigned to exactly one SRAM, -// determined by proximity to the task that first accesses it). 
-// -//===----------------------------------------------------------------------===// - -#include "TaskflowDialect/TaskflowDialect.h" -#include "TaskflowDialect/TaskflowOps.h" -#include "TaskflowDialect/TaskflowPasses.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/IR/Builders.h" -#include "mlir/Pass/Pass.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Support/raw_ostream.h" - -#include -#include -#include -#include -#include -#include - -using namespace mlir; -using namespace mlir::taskflow; - -namespace { - -//===----------------------------------------------------------------------===// -// CGRA Grid Position -//===----------------------------------------------------------------------===// -/// Represents a position on the 2D CGRA grid. -struct CGRAPosition { - int row; - int col; - - bool operator==(const CGRAPosition &other) const { - return row == other.row && col == other.col; - } - - bool operator!=(const CGRAPosition &other) const { return !(*this == other); } - - /// Computes Manhattan distance to another position. - int manhattanDistance(const CGRAPosition &other) const { - return std::abs(row - other.row) + std::abs(col - other.col); - } - - /// Checks if adjacent (Manhattan distance = 1). - bool isAdjacent(const CGRAPosition &other) const { - return manhattanDistance(other) == 1; - } -}; - -//===----------------------------------------------------------------------===// -// Task Placement Info -//===----------------------------------------------------------------------===// -/// Stores placement info for a task: can span multiple combined CGRAs. -struct TaskPlacement { - SmallVector cgra_positions; // CGRAs assigned to this task. - - /// Returns the primary (first) position. - CGRAPosition primary() const { - return cgra_positions.empty() ? 
CGRAPosition{-1, -1} : cgra_positions[0]; - } - - /// Returns the number of CGRAs assigned. - size_t cgraCount() const { return cgra_positions.size(); } - - /// Checks if any CGRA in this task is adjacent to any in other task. - bool hasAdjacentCGRA(const TaskPlacement &other) const { - for (const auto &pos : cgra_positions) { - for (const auto &other_pos : other.cgra_positions) { - if (pos.isAdjacent(other_pos)) { - return true; - } - } - } - return false; - } -}; - -//===----------------------------------------------------------------------===// -// Task-Memory Graph -//===----------------------------------------------------------------------===// - -struct MemoryNode; - -/// Represents a Task node in the graph. -struct TaskNode { - size_t id; - TaskflowTaskOp op; - int dependency_depth = 0; // Longest path to any sink in the dependency graph. - - // Edges based on original memory access. - SmallVector read_memrefs; // Original read memrefs. - SmallVector write_memrefs; // Original write memrefs. - SmallVector ssa_users; - SmallVector ssa_operands; - - // Placement result - SmallVector placement; - - TaskNode(size_t id, TaskflowTaskOp op) : id(id), op(op) {} -}; - -/// Represents a Memory node (MemRef) in the graph. -struct MemoryNode { - Value memref; - - // Edges. - SmallVector readers; - SmallVector writers; - - // Mapping result. - std::optional assigned_sram_pos; - - MemoryNode(Value memref) : memref(memref) {} -}; - -/// The Task-Memory Dependency Graph. -class TaskMemoryGraph { -public: - SmallVector> task_nodes; - SmallVector> memory_nodes; - DenseMap memref_to_node; - DenseMap op_to_node; - - void build(func::FuncOp func) { - // 1. Creates TaskNodes. - size_t task_id = 0; - func.walk([&](TaskflowTaskOp task) { - auto node = std::make_unique(task_id++, task); - op_to_node[task] = node.get(); - task_nodes.push_back(std::move(node)); - }); - - // 2. Creates MemoryNodes using ORIGINAL memrefs (canonical identity). 
- // Uses original_read_memrefs/original_write_memrefs to ensure aliased - // memories share the same MemoryNode. - for (auto &t_node : task_nodes) { - // Uses original_read_memrefs for canonical memory identity. - for (Value orig_memref : t_node->op.getOriginalReadMemrefs()) { - MemoryNode *m_node = getOrCreateMemoryNode(orig_memref); - t_node->read_memrefs.push_back(m_node); - m_node->readers.push_back(t_node.get()); - } - - // Uses original_write_memrefs for canonical memory identity. - for (Value orig_memref : t_node->op.getOriginalWriteMemrefs()) { - MemoryNode *m_node = getOrCreateMemoryNode(orig_memref); - t_node->write_memrefs.push_back(m_node); - m_node->writers.push_back(t_node.get()); - } - } - - // 3. Builds SSA Edges (Inter-Task Value Dependencies). - // Identifies if a task uses a value produced by another task. - for (auto &consumer_node : task_nodes) { - // Iterates all operands for now to be safe. - for (Value operand : consumer_node->op.getValueInputs()) { - if (auto producer_op = operand.getDefiningOp()) { - if (auto *producer_node = op_to_node[producer_op]) { - producer_node->ssa_users.push_back(consumer_node.get()); - consumer_node->ssa_operands.push_back(producer_node); - } - } - } - } - } - -private: - MemoryNode *getOrCreateMemoryNode(Value memref) { - if (memref_to_node.count(memref)) { - return memref_to_node[memref]; - } - - auto node = std::make_unique(memref); - MemoryNode *ptr = node.get(); - memref_to_node[memref] = ptr; - memory_nodes.push_back(std::move(node)); - return ptr; - } -}; - -//===----------------------------------------------------------------------===// -// Task Mapper -//===----------------------------------------------------------------------===// -/// Maps a task-memory graph onto a 2D CGRA grid. 
- -class TaskMapper { -public: - TaskMapper(int grid_rows, int grid_cols) - : grid_rows_(grid_rows), grid_cols_(grid_cols) { - occupied_.resize(grid_rows_); - for (auto &row : occupied_) { - row.resize(grid_cols_, false); - } - } - - /// Places all tasks and performs memory mapping. - void place(func::FuncOp func) { - SmallVector tasks; - func.walk([&](TaskflowTaskOp task) { tasks.push_back(task); }); - - if (tasks.empty()) { - llvm::errs() << "No tasks to place.\n"; - return; - } - - // Builds Task-Memory Graph. - TaskMemoryGraph graph; - graph.build(func); - - if (graph.task_nodes.empty()) { - llvm::errs() << "No tasks to place.\n"; - return; - } - - // Computes Dependency Depth for each task. - // Dependency depth = longest path from this node to any sink in the - // dependency graph (considering both SSA and memory edges). Tasks with - // higher depth are more "critical" and are placed first to ensure their - // dependent chains have good locality. - computeDependencyDepth(graph); - - // Sorts tasks by dependency depth (Critical Path First). - SmallVector sorted_tasks; - for (auto &node : graph.task_nodes) - sorted_tasks.push_back(node.get()); - - std::stable_sort(sorted_tasks.begin(), sorted_tasks.end(), - [](TaskNode *a, TaskNode *b) { - return a->dependency_depth > b->dependency_depth; - }); - - // Critical-path-first placement: - // 1. Computes dependency depth for each task (longest path to sink). - // 2. Sorts tasks by dependency depth (higher = more critical). - // 3. Places tasks in sorted order with heuristic scoring. - // Iterative Refinement Loop (Coordinate Descent). - // Alternates between Task Placement (Phase 1) and SRAM Assignment (Phase - // 2). - constexpr int kMaxIterations = 10; - - for (int iter = 0; iter < kMaxIterations; ++iter) { - // Phase 1: Place Tasks (assuming fixed SRAMs). 
- if (iter > 0) { - resetTaskPlacements(graph); - } - - for (TaskNode *task_node : sorted_tasks) { - int cgra_count = 1; - if (auto attr = - task_node->op->getAttrOfType("cgra_count")) { - cgra_count = attr.getInt(); - } - - // Finds best placement using SRAM positions from previous iter (or - // -1/default). - TaskPlacement placement = - findBestPlacement(task_node, cgra_count, graph); - - // Commits Placement. - task_node->placement.push_back(placement.primary()); - // Handles mapping one task on multi-CGRAs. - // TODO: Introduce explicit multi-CGRA binding logic. - for (size_t i = 1; i < placement.cgra_positions.size(); ++i) { - task_node->placement.push_back(placement.cgra_positions[i]); - } - - // Marks occupied. - for (const auto &pos : placement.cgra_positions) { - if (pos.row >= 0 && pos.row < grid_rows_ && pos.col >= 0 && - pos.col < grid_cols_) { - occupied_[pos.row][pos.col] = true; - } - } - } - - // Phase 2: Assign SRAMs (assuming fixed tasks). - bool sram_moved = assignAllSRAMs(graph); - - // Convergence Check. - // If SRAMs didn't move, it means task placement based on them likely - // won't change either. - if (iter > 0 && !sram_moved) { - break; - } - } - - // Annotates result. - OpBuilder builder(func.getContext()); - for (auto &task_node : graph.task_nodes) { - if (task_node->placement.empty()) { - continue; - } - - SmallVector mapping_attrs; - - // 1. CGRA positions. - SmallVector pos_attrs; - for (const auto &pos : task_node->placement) { - SmallVector coord_attrs; - coord_attrs.push_back( - NamedAttribute(StringAttr::get(func.getContext(), "row"), - builder.getI32IntegerAttr(pos.row))); - coord_attrs.push_back( - NamedAttribute(StringAttr::get(func.getContext(), "col"), - builder.getI32IntegerAttr(pos.col))); - pos_attrs.push_back( - DictionaryAttr::get(func.getContext(), coord_attrs)); - } - mapping_attrs.push_back( - NamedAttribute(StringAttr::get(func.getContext(), "cgra_positions"), - builder.getArrayAttr(pos_attrs))); - - // 2. 
Reads SRAM Locations. - SmallVector read_sram_attrs; - for (MemoryNode *mem : task_node->read_memrefs) { - if (mem->assigned_sram_pos) { - SmallVector sram_coord; - sram_coord.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "row"), - builder.getI32IntegerAttr(mem->assigned_sram_pos->row))); - sram_coord.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "col"), - builder.getI32IntegerAttr(mem->assigned_sram_pos->col))); - read_sram_attrs.push_back( - DictionaryAttr::get(func.getContext(), sram_coord)); - } - } - mapping_attrs.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "read_sram_locations"), - builder.getArrayAttr(read_sram_attrs))); - - // 3. Writes SRAM Locations. - SmallVector write_sram_attrs; - for (MemoryNode *mem : task_node->write_memrefs) { - if (mem->assigned_sram_pos) { - SmallVector sram_coord; - sram_coord.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "row"), - builder.getI32IntegerAttr(mem->assigned_sram_pos->row))); - sram_coord.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "col"), - builder.getI32IntegerAttr(mem->assigned_sram_pos->col))); - - write_sram_attrs.push_back( - DictionaryAttr::get(func.getContext(), sram_coord)); - } - } - mapping_attrs.push_back(NamedAttribute( - StringAttr::get(func.getContext(), "write_sram_locations"), - builder.getArrayAttr(write_sram_attrs))); - - // Sets Attribute. - task_node->op->setAttr( - "task_mapping_info", - DictionaryAttr::get(func.getContext(), mapping_attrs)); - } - } - -private: - /// Clears task placement and occupied grid. - void resetTaskPlacements(TaskMemoryGraph &graph) { - for (auto &task : graph.task_nodes) { - task->placement.clear(); - } - // Clears grid. - for (int r = 0; r < grid_rows_; ++r) { - std::fill(occupied_[r].begin(), occupied_[r].end(), false); - } - } - - /// Assigns all memory nodes to SRAMs based on centroid of accessing tasks. - /// Returns true if any SRAM assignment changed. 
- bool assignAllSRAMs(TaskMemoryGraph &graph) { - bool changed = false; - for (auto &mem_node : graph.memory_nodes) { - // Computes centroid of all tasks that access this memory. - int total_row = 0, total_col = 0, count = 0; - for (TaskNode *reader : mem_node->readers) { - if (!reader->placement.empty()) { - total_row += reader->placement[0].row; - total_col += reader->placement[0].col; - count++; - } - } - for (TaskNode *writer : mem_node->writers) { - if (!writer->placement.empty()) { - total_row += writer->placement[0].row; - total_col += writer->placement[0].col; - count++; - } - } - - std::optional new_sram_pos; - if (count > 0) { - // Rounds to the nearest integer. - int avg_row = (total_row + count / 2) / count; - int avg_col = (total_col + count / 2) / count; - new_sram_pos = CGRAPosition{avg_row, avg_col}; - } - - if (mem_node->assigned_sram_pos != new_sram_pos) { - mem_node->assigned_sram_pos = new_sram_pos; - changed = true; - } - } - return changed; - } - - /// Finds best placement for a task. - /// TODO: Currently defaults to single-CGRA placement. Multi-CGRA binding - /// logic (cgra_count > 1) is experimental/placeholder and should ideally be - /// handled by an upstream resource binding pass. - TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count, - TaskMemoryGraph &graph) { - int best_score = INT_MIN; - TaskPlacement best_placement; - - // Baseline: For cgra_count=1, finds single best position. - for (int r = 0; r < grid_rows_; ++r) { - for (int c = 0; c < grid_cols_; ++c) { - if (occupied_[r][c]) { - continue; - } - - TaskPlacement candidate; - candidate.cgra_positions.push_back({r, c}); - - int score = computeScore(task_node, candidate, graph); - if (score > best_score) { - best_score = score; - best_placement = candidate; - } - } - } - - // Error handling: No available position found (grid over-subscribed). 
- if (best_placement.cgra_positions.empty()) { - assert(false && - "No available CGRA position found (grid over-subscribed)."); - } - - return best_placement; - } - - /// Computes placement score based on Task-Memory Graph. - /// TODO: Introduce explicit 'direct_wires' attributes in the IR for - /// downstream hardware generators to configure fast bypass paths between - /// adjacent PEs with dependencies. - /// - /// Score = α·SSA_Dist + β·Mem_Dist. - /// - /// SSA_Dist: Minimize distance to placed SSA predecessors (ssa_operands). - /// Mem_Dist: Minimize distance to assigned SRAMs for read/write memrefs. - int computeScore(TaskNode *task_node, const TaskPlacement &placement, - TaskMemoryGraph &graph) { - // Weight constants (tunable). - constexpr int kAlpha = 10; // SSA proximity weight. - constexpr int kBeta = 50; // Memory proximity weight (high priority). - - int ssa_score = 0; - int mem_score = 0; - - CGRAPosition current_pos = placement.primary(); - - // 1. SSA proximity (predecessors & successors). - for (TaskNode *producer : task_node->ssa_operands) { - if (!producer->placement.empty()) { - int dist = current_pos.manhattanDistance(producer->placement[0]); - // Uses negative distance to penalize far-away placements. - ssa_score -= dist; - } - } - for (TaskNode *consumer : task_node->ssa_users) { - if (!consumer->placement.empty()) { - int dist = current_pos.manhattanDistance(consumer->placement[0]); - ssa_score -= dist; - } - } - - // 2. Memory proximity. - // For read memrefs. - for (MemoryNode *mem : task_node->read_memrefs) { - if (mem->assigned_sram_pos) { - int dist = current_pos.manhattanDistance(*mem->assigned_sram_pos); - mem_score -= dist; - } - } - // For write memrefs. - // If we write to a memory that is already assigned (e.g. read by previous - // task), we want to be close to it too. 
- for (MemoryNode *mem : task_node->write_memrefs) { - if (mem->assigned_sram_pos) { - int dist = current_pos.manhattanDistance(*mem->assigned_sram_pos); - mem_score -= dist; - } - } - - return kAlpha * ssa_score + kBeta * mem_score; - } - - /// Computes dependency depth for all tasks in the graph. - /// - /// Dependency depth = longest path from this node to any sink node in the - /// dependency graph (via SSA or memory edges). - /// - /// Tasks with higher dependency depth have longer chains of dependent tasks - /// after them. By placing these tasks first: - /// 1. They get priority access to good grid positions. - /// 2. Their dependent tasks can then be positioned adjacent to them, - /// minimizing inter-task communication distance. - void computeDependencyDepth(TaskMemoryGraph &graph) { - DenseMap depth_cache; - for (auto &node : graph.task_nodes) { - node->dependency_depth = calculateDepth(node.get(), depth_cache); - } - } - - /// Recursively calculates dependency depth for a single task. - int calculateDepth(TaskNode *node, DenseMap &depth_cache) { - if (depth_cache.count(node)) { - return depth_cache[node]; - } - - int max_child_depth = 0; - // SSA dependencies. - for (TaskNode *child : node->ssa_users) { - max_child_depth = - std::max(max_child_depth, calculateDepth(child, depth_cache) + 1); - } - - // Memory dependencies (Producer -> Mem -> Consumer). 
- for (MemoryNode *mem : node->write_memrefs) { - for (TaskNode *reader : mem->readers) { - if (reader != node) { - max_child_depth = std::max(max_child_depth, - calculateDepth(reader, depth_cache) + 1); - } - } - } - - return depth_cache[node] = max_child_depth; - } - - int grid_rows_; - int grid_cols_; - std::vector> occupied_; -}; - -//===----------------------------------------------------------------------===// -// Pass Definition -//===----------------------------------------------------------------------===// -struct MapTaskOnCgraPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapTaskOnCgraPass) - - MapTaskOnCgraPass() = default; - - StringRef getArgument() const override { return "map-task-on-cgra"; } - - StringRef getDescription() const override { - return "Maps Taskflow tasks onto a 2D CGRA grid with adjacency " - "optimization and memory mapping."; - } - - void runOnOperation() override { - func::FuncOp func = getOperation(); - constexpr int kDefaultGridRows = 3; - constexpr int kDefaultGridCols = 3; - TaskMapper mapper(kDefaultGridRows, kDefaultGridCols); - mapper.place(func); - } -}; - -} // namespace - -namespace mlir { -namespace taskflow { - -std::unique_ptr createMapTaskOnCgraPass() { - return std::make_unique(); -} - -} // namespace taskflow -} // namespace mlir diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 0e18b971..4d4892f5 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -29,7 +29,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-task-on-cgra \ +// RUN: --allocate-cgra-to-task \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT diff --git 
a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index fdbe54da..ece77e32 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -61,7 +61,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-task-on-cgra \ +// RUN: --allocate-cgra-to-task \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT @@ -524,7 +524,7 @@ module attributes {} { // PLACEMENT: module { // PLACEMENT-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// PLACEMENT-NEXT: %dependency_read_out, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0 : memref) dependency_write_in(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locations = [{col = 0 : i32, row = 0 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]}} : (memref, memref) -> (memref, memref) { +// PLACEMENT-NEXT: %dependency_read_out, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0 : memref) dependency_write_in(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locations = [{col = 0 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 0 : i32}]}} : (memref, memref) -> (memref, memref) { // PLACEMENT-NEXT: ^bb0(%arg10: memref, %arg11: memref): // PLACEMENT-NEXT: %1 = 
taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // PLACEMENT-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -537,7 +537,7 @@ module attributes {} { // PLACEMENT-NEXT: }) : (index, index, index) -> () // PLACEMENT-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // PLACEMENT-NEXT: } -// PLACEMENT-NEXT: %dependency_read_out_0:2, %dependency_write_out_1 = taskflow.task @Task_1 dependency_read_in(%arg1, %arg2 : memref, memref) dependency_write_in(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]}} : (memref, memref, memref) -> (memref, memref, memref) { +// PLACEMENT-NEXT: %dependency_read_out_0:2, %dependency_write_out_1 = taskflow.task @Task_1 dependency_read_in(%arg1, %arg2 : memref, memref) dependency_write_in(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 0 : i32}]}} : (memref, memref, memref) -> (memref, memref, memref) { // PLACEMENT-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // PLACEMENT-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // PLACEMENT-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -552,7 +552,7 @@ module attributes {} { // PLACEMENT-NEXT: }) : (index, index, index) -> () // 
PLACEMENT-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // PLACEMENT-NEXT: } -// PLACEMENT-NEXT: %dependency_read_out_2:3, %dependency_write_out_3 = taskflow.task @Task_2 dependency_read_in(%dependency_write_out, %dependency_write_out_1, %arg9 : memref, memref, memref) dependency_write_in(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locations = [{col = 0 : i32, row = 1 : i32}, {col = 1 : i32, row = 1 : i32}, {col = 0 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]}} : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { +// PLACEMENT-NEXT: %dependency_read_out_2:3, %dependency_write_out_3 = taskflow.task @Task_2 dependency_read_in(%dependency_write_out, %dependency_write_out_1, %arg9 : memref, memref, memref) dependency_write_in(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] {task_mapping_info = {cgra_positions = [{col = 3 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}, {col = 2 : i32, row = 0 : i32}, {col = 3 : i32, row = 0 : i32}], write_sram_locations = [{col = 3 : i32, row = 0 : i32}]}} : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { // PLACEMENT-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): // PLACEMENT-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // PLACEMENT-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -571,7 +571,7 @@ module attributes {} { // PLACEMENT-NEXT: }) : (index) -> () // PLACEMENT-NEXT: taskflow.yield reads(%arg10, %arg11, %arg13 : memref, memref, memref) writes(%arg13 : 
memref) // PLACEMENT-NEXT: } -// PLACEMENT-NEXT: %dependency_read_out_4, %dependency_write_out_5 = taskflow.task @Task_3 dependency_read_in(%arg3 : memref) dependency_write_in(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] {task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 1 : i32}]}} : (memref, memref) -> (memref, memref) { +// PLACEMENT-NEXT: %dependency_read_out_4, %dependency_write_out_5 = taskflow.task @Task_3 dependency_read_in(%arg3 : memref) dependency_write_in(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] {task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]}} : (memref, memref) -> (memref, memref) { // PLACEMENT-NEXT: ^bb0(%arg10: memref, %arg11: memref): // PLACEMENT-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // PLACEMENT-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 7 : index} : index @@ -583,7 +583,7 @@ module attributes {} { // PLACEMENT-NEXT: }) : (index, index) -> () // PLACEMENT-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // PLACEMENT-NEXT: } -// PLACEMENT-NEXT: %dependency_read_out_6:2, %dependency_write_out_7 = taskflow.task @Task_4 dependency_read_in(%arg4, %dependency_write_out_5 : memref, memref) dependency_write_in(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] {task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}, {col = 2 : i32, row = 1 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : 
i32}]}} : (memref, memref, memref) -> (memref, memref, memref) { +// PLACEMENT-NEXT: %dependency_read_out_6:2, %dependency_write_out_7 = taskflow.task @Task_4 dependency_read_in(%arg4, %dependency_write_out_5 : memref, memref) dependency_write_in(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] {task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locations = [{col = 0 : i32, row = 1 : i32}, {col = 1 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]}} : (memref, memref, memref) -> (memref, memref, memref) { // PLACEMENT-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // PLACEMENT-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // PLACEMENT-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 9 : index} : index diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index fa1135ad..ea42d03e 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -42,7 +42,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-task-on-cgra \ +// RUN: --allocate-cgra-to-task \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT