Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
06a625b
feat: Implement resource-aware task optimization pass with pipeline b…
guosran Feb 13, 2026
842e61e
refactor: reorder to fusion-first, update latency model to II*(tc-1)+…
guosran Feb 16, 2026
2292f26
refactor: remove steps, convert LLVM_DEBUG to llvm::errs()
guosran Feb 16, 2026
991c917
refactor: implement full slack analysis in findBottleneck
guosran Feb 16, 2026
476ab1e
feat: Implement ResourceAwareTaskOptimizationPass with pipeline balan…
guosran Feb 17, 2026
5becdb3
make cgra_count=1 explicit in IR output
guosran Feb 17, 2026
8ff000e
feat: ResourceAwareTaskOptimizationPass — critical review fixes & cov…
guosran Feb 17, 2026
588737d
removed excessive files
guosran Feb 17, 2026
963bf79
clean up: remove debug.log
guosran Feb 17, 2026
b5fa1a1
fix: restore Zeonica_Testbench submodule to main branch pointer
guosran Feb 17, 2026
fa9c4a2
fix: remove duplicate RESOPT test block and cache profiling attrs acr…
guosran Feb 17, 2026
37bf4bc
feat(resource-aware-opt): implement PR review fixes for multi-CGRA op…
guosran Feb 26, 2026
8c3c86b
fix(resource-aware-opt): prevent hyperblock assert on fused tasks, re…
guosran Feb 26, 2026
6e91448
Revert "fix(resource-aware-opt): prevent hyperblock assert on fused t…
guosran Feb 26, 2026
92f1214
fix(resource-aware-opt): restore multi-CGRA optimization and update t…
guosran Feb 26, 2026
38b1293
Fix comment formatting issues
guosran Feb 26, 2026
25ed5d5
Format comments correctly in ResourceAwareTaskOptimizationPass (Doxyg…
guosran Feb 26, 2026
49d870f
Update comment verbs to third-person singular (Builds, Runs, etc)
guosran Feb 26, 2026
d380586
Rename hasPath to hasDependency per review feedback
guosran Feb 26, 2026
57cb983
feat(resource-aware-opt): support value-output tasks in utilization f…
guosran Feb 26, 2026
d25095b
refactor: address PR review round 2 — shapes, naming, docs, namespace
guosran Feb 27, 2026
d50bd04
refactor(ResourceAwareTaskOptimizationPass): address reviewer feedback
guosran Feb 27, 2026
3f058e5
refactor: address PR review round 3 — naming, options, cleanup
guosran Feb 27, 2026
0b74d66
refactor: replace /// with // in comments
guosran Feb 28, 2026
d9aaa51
Fix IR corruption during resource-aware task optimization profiling
guosran Feb 28, 2026
a7e511f
refactor: implement kernel-level task profiling and document architec…
guosran Feb 28, 2026
67da762
refactor: resolve kernel-level fusion bottlenecks and clarify shape h…
guosran Feb 28, 2026
8cee68c
Refactor: remove affine-related logic from ResOpt pass and standardiz…
guosran Mar 1, 2026
8ee3421
fix(resource-aware-opt): restore affine/scf fallback in computeTripCo…
guosran Mar 1, 2026
099466f
fix: resolve crash in performFusion for multi-block task bodies
guosran Mar 2, 2026
c672e3a
fix(resopt): compute correct trip_count from post-CF-lowered IR; add …
guosran Mar 3, 2026
9257a5d
Fix multi-block fusion yield logic; chain control flows with llvm.br
guosran Mar 3, 2026
b610950
feat: resource-aware task optimization with balance-skip-mapper option
guosran Mar 3, 2026
36bdf2b
Add multi-CGRA resource-heavy test case and fix convergence re-profiling
guosran Mar 3, 2026
0207d82
remove excessive docs
guosran Mar 3, 2026
9744dd8
refactor: clean up redundant code, unify comment style, restore expla…
guosran Mar 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions include/NeuraDialect/Architecture/Architecture.h
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,23 @@ class Architecture {
// Checks if the architecture supports counter operations.
bool canSupportCounter() const;

// Clones the architecture but with new per-cgra dimensions.
// Arguments are ordered (rows, columns); every other stored configuration
// field (topologies, defaults, link overrides) is carried over unchanged.
// The provided tile_overrides will be appended to the existing ones, so
// later entries can refine or invalidate tiles in the enlarged grid.
//
// Example — create an 8-row × 4-column tile array (a 2×1 CGRA rectangle,
// rows×columns) with all tiles present:
// auto arch_2x1 = getArchitecture().cloneWithNewDimensions(8, 4);
//
// Example — create an 8-row × 12-column bounding box (12 tiles wide, 8
// tall) for a T-shape (4 CGRAs) where only specific tiles are valid:
// std::vector<TileOverride> overrides;
// // First mark all tiles as non-existent, then mark valid ones existent.
// // (see MapToAcceleratorPass for the full valid_tiles parsing logic)
// auto arch_T = getArchitecture().cloneWithNewDimensions(8, 12, overrides);
std::unique_ptr<Architecture> cloneWithNewDimensions(
int new_per_cgra_rows, int new_per_cgra_columns,
const std::vector<TileOverride> &additional_overrides = {}) const;

private:
// Helper methods for constructor initialization.
void initializeTiles(int rows, int columns);
Expand Down Expand Up @@ -532,6 +549,13 @@ class Architecture {
int per_cgra_rows_;
int per_cgra_columns_;
int max_ctrl_mem_items_;

BaseTopology multi_cgra_base_topology_;
BaseTopology per_cgra_base_topology_;
TileDefaults tile_defaults_;
std::vector<TileOverride> tile_overrides_;
LinkDefaults link_defaults_;
std::vector<LinkOverride> link_overrides_;
};

// Function for getting the architecture object.
Expand Down
5 changes: 4 additions & 1 deletion include/NeuraDialect/NeuraPasses.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ std::unique_ptr<mlir::Pass> createInsertCtrlMovPass();
std::unique_ptr<mlir::Pass> createAssignAcceleratorPass();
std::unique_ptr<mlir::Pass> createTransformCtrlToDataFlowPass();
std::unique_ptr<mlir::Pass> createLeveragePredicatedValuePass();
std::unique_ptr<mlir::Pass> createMapToAcceleratorPass();
// Creates the MapToAccelerator pass. Tile dimensions default to 0 (use
// architecture singleton) when not specified via options.
std::unique_ptr<mlir::Pass> createMapToAcceleratorPass(
const MapToAcceleratorOptions &options = MapToAcceleratorOptions{});
std::unique_ptr<mlir::Pass> createGenerateCodePass();
std::unique_ptr<mlir::Pass> createCanonicalizeReturnPass();
std::unique_ptr<mlir::Pass> createCanonicalizeLiveInPass();
Expand Down
34 changes: 34 additions & 0 deletions include/NeuraDialect/NeuraPasses.td
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,41 @@ def MapToAccelerator : Pass<"map-to-accelerator", "ModuleOp"> {
let summary = "Map Neura operations onto a given accelerator";
let description = [{
This pass performs mapping from Neura operations to accelerator.

x-tiles and y-tiles specify the **tile** dimensions of the target array
(not the CGRA count). Each CGRA contains a per_cgra_rows × per_cgra_cols
tile grid (currently 4×4). So for a single CGRA, x-tiles=4 y-tiles=4;
for a 1×2 rectangular pair, x-tiles=8 y-tiles=4; etc.

When x-tiles=0 and y-tiles=0 (the default), the global Architecture
singleton determines the tile grid — this is equivalent to a single CGRA.

Examples:
Single CGRA (default):
--map-to-accelerator
1×3 rectangular (3 CGRAs in a row):
--map-to-accelerator x-tiles=12 y-tiles=4
T-shape (4 CGRAs: top row of 3 + centre below):
--map-to-accelerator x-tiles=12 y-tiles=8 \
valid-tiles="0_0,1_0,2_0,3_0,4_0,5_0,6_0,7_0,8_0,9_0,10_0,11_0,\
4_1,5_1,6_1,7_1,4_4,5_4,6_4,7_4,4_5,5_5,6_5,7_5"
}];
let options = [
Option<"x_tiles", "x-tiles", "int", /*default=*/"0",
"Total number of tiles in the X dimension of the target array "
"(not the number of CGRAs). Each CGRA contributes per_cgra_cols "
"tiles. 0 means use the global Architecture singleton (1 CGRA).">,
Option<"y_tiles", "y-tiles", "int", /*default=*/"0",
"Total number of tiles in the Y dimension of the target array "
"(not the number of CGRAs). Each CGRA contributes per_cgra_rows "
"tiles. 0 means use the global Architecture singleton (1 CGRA).">,
Option<"valid_tiles", "valid-tiles", "std::string", /*default=*/"\"\"",
"Comma-separated list of tile coordinates (x_y) that are actually "
"present in the array, used for non-rectangular CGRA shapes such as "
"L-blocks or T-blocks. Empty string means all tiles in the "
"x-tiles x y-tiles rectangle are valid. "
"Example: 0_0,1_0,0_1 selects three tiles forming an L-shape.">
];
let constructor = "neura::createMapToAcceleratorPass()";
}

Expand Down
1 change: 1 addition & 0 deletions include/TaskflowDialect/TaskflowPasses.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ std::unique_ptr<mlir::Pass> createMapTaskOnCgraPass();
std::unique_ptr<mlir::Pass> createAffineLoopTreeSerializationPass();
std::unique_ptr<mlir::Pass> createAffineLoopPerfectionPass();
std::unique_ptr<mlir::Pass> createMemoryAccessStreamingFusionPass();
std::unique_ptr<mlir::Pass> createResourceAwareTaskOptimizationPass();

#define GEN_PASS_REGISTRATION
#include "TaskflowDialect/TaskflowPasses.h.inc"
Expand Down
44 changes: 44 additions & 0 deletions include/TaskflowDialect/TaskflowPasses.td
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,48 @@ def MemoryAccessStreamingFusion : Pass<"memory-access-streaming-fusion", "func::
}];
let constructor = "taskflow::createMemoryAccessStreamingFusionPass()";
}

def ResourceAwareTaskOptimization : Pass<"resource-aware-task-optimization", "func::FuncOp"> {
let summary = "Balances pipeline latency and fuses independent tasks for CGRA utilization.";
let description = [{
Two-phase optimization:
1. Utilization Fusion: merges independent (no-edge) tasks, selecting pairs
that minimize |trip_count_a - trip_count_b| for balanced utilization.
2. Pipeline Balance: allocates extra CGRAs to critical-path bottleneck tasks.
More CGRAs combine tile arrays into larger arrays for mapping, potentially
lowering compiled_ii. Latency model: II * (trip_count - 1) + steps.
Targets a 4x4 CGRA grid (16 CGRAs total, one CGRA per cell).
Currently a single task may be allocated at most 4 CGRAs.
Supported CGRA array shapes for a task (all fit within the 4×4 grid):
- rect : a perfect rectangle, e.g. 1×1, 1×2, 2×1, 1×3, 3×1, 2×2, 1×4, 4×1.
- L : an L-shaped block of 3 or 4 CGRAs, e.g.
3 CGRAs: (0,0)(1,0)(0,1) — two in a row + one below-left.
4 CGRAs: (0,0)(0,1)(0,2)(1,2) — three in a column + one offset.
- T : a T-shaped block of 4 CGRAs, e.g.
(0,0)(1,0)(2,0)(1,1) — three in a row + one below centre.
Non-rectangular shapes are represented by their bounding box plus an
explicit tile list that enumerates only the occupied CGRA positions.
Compiled_ii must come from the downstream Neura pipeline (asserts on failure).

Use --estimation-mode to control how task II/steps are estimated:
compiled (default): runs the full Neura lowering + mapping pipeline
for accurate compiled_ii and steps.
analytical : uses only ResMII/RecMII analytical estimates without
running the mapper — much faster but less accurate.
Useful for rapid design-space exploration.
Example:
--resource-aware-task-optimization estimation-mode=analytical
}];
let options = [
Option<"estimationMode", "estimation-mode", "std::string",
/*default=*/"\"compiled\"",
"Profiling estimation mode: 'compiled' (default) runs the full "
"Neura lowering + mapping pipeline; 'analytical' uses only "
"ResMII/RecMII analytical estimates (faster but less accurate).">
];
let constructor = "taskflow::createResourceAwareTaskOptimizationPass()";
let dependentDialects = [
"mlir::affine::AffineDialect",
"mlir::func::FuncDialect"];
}
#endif // TASKFLOW_PASSES_TD
72 changes: 71 additions & 1 deletion lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,74 @@ struct ArithIndexCastToNeuraCast
}
};

// Lowers arith.minimumf to a compare-and-select sequence on the neura
// dialect:
//   minimumf(a, b) → sel(fcmp(a, b, "olt"), a, b)
struct ArithMinimumFToNeuraFCmpSel
    : public OpRewritePattern<mlir::arith::MinimumFOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(arith::MinimumFOp op,
                                PatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    Type result_type = op.getType();
    Value lhs = op.getLhs();
    Value rhs = op.getRhs();

    // "olt" = ordered less-than: true when a < b (false if either is NaN).
    // NOTE(review): when either input is NaN the select yields b, whereas
    // arith.minimumf is specified to propagate NaN — confirm this lossy
    // lowering is acceptable for neura targets.
    // NOTE(review): the fcmp is created with the float result type rather
    // than an i1 — confirm neura::FCmpOp expects this convention.
    Value is_less = rewriter.create<neura::FCmpOp>(
        loc, result_type, lhs, rhs, rewriter.getStringAttr("olt"));
    rewriter.replaceOpWithNewOp<neura::SelOp>(op, result_type, is_less, lhs,
                                              rhs);
    return success();
  }
};

// Lowers arith.maximumf to a compare-and-select sequence on the neura
// dialect:
//   maximumf(a, b) → sel(fcmp(a, b, "ogt"), a, b)
struct ArithMaximumFToNeuraFCmpSel
    : public OpRewritePattern<mlir::arith::MaximumFOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(arith::MaximumFOp op,
                                PatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    Type result_type = op.getType();
    Value lhs = op.getLhs();
    Value rhs = op.getRhs();

    // "ogt" = ordered greater-than: true when a > b (false if either is NaN).
    // NOTE(review): when either input is NaN the select yields b, whereas
    // arith.maximumf is specified to propagate NaN — confirm this lossy
    // lowering is acceptable for neura targets.
    // NOTE(review): the fcmp is created with the float result type rather
    // than an i1 — confirm neura::FCmpOp expects this convention.
    Value is_greater = rewriter.create<neura::FCmpOp>(
        loc, result_type, lhs, rhs, rewriter.getStringAttr("ogt"));
    rewriter.replaceOpWithNewOp<neura::SelOp>(op, result_type, is_greater, lhs,
                                              rhs);
    return success();
  }
};

// arith.andi(a, b) → neura.and(a, b)
// One-to-one replacement: operands and result type are carried over verbatim.
struct ArithAndIToNeuraAnd : public OpRewritePattern<mlir::arith::AndIOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(arith::AndIOp op,
                                PatternRewriter &rewriter) const override {
    rewriter.replaceOpWithNewOp<neura::AndOp>(op, op.getType(), op.getLhs(),
                                              op.getRhs());
    return success();
  }
};

// arith.ori(a, b) → neura.or(a, b)
// One-to-one replacement: operands and result type are carried over verbatim.
struct ArithOrIToNeuraOr : public OpRewritePattern<mlir::arith::OrIOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(arith::OrIOp op,
                                PatternRewriter &rewriter) const override {
    rewriter.replaceOpWithNewOp<neura::OrOp>(op, op.getType(), op.getLhs(),
                                             op.getRhs());
    return success();
  }
};

struct LowerArithToNeuraPass
: public PassWrapper<LowerArithToNeuraPass, OperationPass<ModuleOp>> {

Expand All @@ -322,7 +390,9 @@ struct LowerArithToNeuraPass
ArithExtUIToNeuraCast, ArithIndexCastToNeuraCast,
ArithFDivToNeuraFDiv, ArithExtfToNeuraCast, ArithMulFToNeuraFMul,
ArithSubIToNeuraSub, ArithSubFToNeuraFSub, ArithMulIToNeuraMul,
ArithDivSIToNeuraDiv, ArithRemSIToNeuraOp>(context);
ArithDivSIToNeuraDiv, ArithRemSIToNeuraOp,
ArithMinimumFToNeuraFCmpSel, ArithMaximumFToNeuraFCmpSel,
ArithAndIToNeuraAnd, ArithOrIToNeuraOr>(context);
return patterns;
}

Expand Down
23 changes: 20 additions & 3 deletions lib/NeuraDialect/Architecture/Architecture.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -561,12 +561,15 @@ Architecture::Architecture(int multi_cgra_rows, int multi_cgra_columns,
const std::vector<LinkOverride> &link_overrides) {
this->multi_cgra_rows_ = multi_cgra_rows;
this->multi_cgra_columns_ = multi_cgra_columns;
// TODO: Support multi-CGRA topology in the future:
// https://github.com/coredac/dataflow/issues/163.
// this->multi_cgra_base_topology_ = multi_cgra_base_topology;
this->multi_cgra_base_topology_ = multi_cgra_base_topology;
this->per_cgra_rows_ = per_cgra_rows;
this->per_cgra_columns_ = per_cgra_columns;
this->per_cgra_base_topology_ = per_cgra_base_topology;
this->max_ctrl_mem_items_ = max_ctrl_mem_items;
this->tile_defaults_ = tile_defaults;
this->tile_overrides_ = tile_overrides;
this->link_defaults_ = link_defaults;
this->link_overrides_ = link_overrides;

// Initializes architecture components using helper methods.
initializeTiles(per_cgra_rows, per_cgra_columns);
Expand All @@ -576,6 +579,20 @@ Architecture::Architecture(int multi_cgra_rows, int multi_cgra_columns,
applyLinkOverrides(link_overrides);
}

std::unique_ptr<Architecture> Architecture::cloneWithNewDimensions(
int new_per_cgra_rows, int new_per_cgra_columns,
const std::vector<TileOverride> &additional_overrides) const {

std::vector<TileOverride> merged_overrides = tile_overrides_;
merged_overrides.insert(merged_overrides.end(), additional_overrides.begin(), additional_overrides.end());

return std::make_unique<Architecture>(
multi_cgra_rows_, multi_cgra_columns_, multi_cgra_base_topology_,
new_per_cgra_rows, new_per_cgra_columns, max_ctrl_mem_items_,
per_cgra_base_topology_, tile_defaults_, merged_overrides,
link_defaults_, link_overrides_);
}

Tile *Architecture::getTile(int id) {
auto it = id_to_tile_.find(id);
assert(it != id_to_tile_.end() && "Tile with given ID not found");
Expand Down
19 changes: 16 additions & 3 deletions lib/NeuraDialect/Transforms/InsertDataMovPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,20 @@ struct InsertDataMovForNeuraOps : public RewritePattern {

LogicalResult matchAndRewrite(Operation *op,
PatternRewriter &rewriter) const override {
// Only processes operations from the neura dialect. Operations from
// other dialects (arith, math, etc.) should have been lowered to neura
// ops by earlier passes (LowerArithToNeura, etc.) before this pass runs.
if (op->getDialect()->getNamespace() != accel::kNeuraTarget ||
isa<neura::DataMovOp>(op)) {
isa<neura::DataMovOp>(op) ||
// ReserveOp creates a loop-carried placeholder in the dataflow
// recurrence cycle: %v = neura.reserve; neura.ctrl_mov %next -> %v.
// Its result must NOT be wrapped in DataMovOp, because ctrl_mov needs
// a direct reference to the same SSA value used by phi_start.
// Inserting a DataMovOp between reserve and its consumers would break
// the ctrl_mov→reserve back-edge and corrupt the recurrence cycle.
isa<neura::ReserveOp>(op) ||
isa<neura::KernelOp>(op) ||
isa<neura::FusedOp>(op)) {
return failure();
}

Expand Down Expand Up @@ -91,8 +103,9 @@ struct InsertDataMovForNeuraOps : public RewritePattern {
for (Value operand : op->getOperands()) {
Operation *producer = operand.getDefiningOp();

// Skips adding mov for any operand that comes from a reserve op or
// already from data_mov.
// Does NOT wrap operands that come from reserve: the reserve result
// is the recurrence back-edge target for ctrl_mov. Wrapping it would
// produce a new SSA value, breaking the ctrl_mov→reserve cycle.
if (producer && (isa<neura::ReserveOp>(producer) ||
isa<neura::DataMovOp>(producer))) {
new_operands.push_back(operand);
Expand Down
Loading