diff --git a/include/NeuraDialect/Architecture/Architecture.h b/include/NeuraDialect/Architecture/Architecture.h index 4f8e5cc2..80b4d1b5 100644 --- a/include/NeuraDialect/Architecture/Architecture.h +++ b/include/NeuraDialect/Architecture/Architecture.h @@ -560,6 +560,9 @@ class Architecture { // Function for getting the architecture object. const Architecture &getArchitecture(); + +// Function for getting the latency specification file path. +const std::string &getLatencySpecFile(); } // namespace neura } // namespace mlir diff --git a/include/NeuraDialect/Mapping/MappingState.h b/include/NeuraDialect/Mapping/MappingState.h index a43cea04..99dc737f 100644 --- a/include/NeuraDialect/Mapping/MappingState.h +++ b/include/NeuraDialect/Mapping/MappingState.h @@ -10,6 +10,13 @@ namespace mlir { namespace neura { +// Occupy status for multi-cycle pipeline support. +// These states define how a tile/FU is occupied at a given time step. +#define SINGLE_OCCUPY 0 // A single-cycle op is in the FU (exclusive) +#define START_PIPE_OCCUPY 1 // A multi-cycle op starts in the FU +#define END_PIPE_OCCUPY 2 // A multi-cycle op ends in the FU +#define IN_PIPE_OCCUPY 3 // A multi-cycle op is occupying the FU (pipelined) + // Represents a spatial-temporal location: (resource, time_step) struct MappingLoc { BasicResource *resource; @@ -54,9 +61,20 @@ namespace neura { class MappingState { public: MappingState(const Architecture &arch, int II, bool is_spatial_only); - // Binds a (tile/link, time_step) location to an operation. + // Binds a (tile/link, time_step) location to an operation with default + // SINGLE_OCCUPY status. bool bindOp(const MappingLoc &loc, Operation *op); + // Binds a (tile/link, time_step) location to an operation with specified + // occupy status for multi-cycle pipeline support. + bool bindOp(const MappingLoc &loc, Operation *op, int occupy_status); + + // Binds multiple locations for a multi-cycle operation. 
+ // This sets START_PIPE_OCCUPY at start_time, IN_PIPE_OCCUPY for intermediate + // times, and END_PIPE_OCCUPY at end_time-1. + bool bindMultiCycleOp(BasicResource *resource, int start_time, int latency, + Operation *op); + // Unbinds an operation from its (tile/link, time_step) location, // which is useful for backtracking. void unbindOp(Operation *op); @@ -67,6 +85,19 @@ class MappingState { // it will check (tile 2, step 1), (tile 2, step 5), (tile 2, step 9), etc. bool isAvailableAcrossTime(const MappingLoc &loc) const; + // Checks if a location is available for a specific occupy status. + // This implements the pipeline-aware availability checking: + // - SINGLE_OCCUPY: only available if location is completely free + // - START_PIPE_OCCUPY: available if free or IN_PIPE_OCCUPY or END_PIPE_OCCUPY + // - END_PIPE_OCCUPY: available if free or IN_PIPE_OCCUPY or START_PIPE_OCCUPY + // - IN_PIPE_OCCUPY: always available (can pipeline with any status) + bool isAvailableForOccupyStatus(const MappingLoc &loc, + int new_occupy_status) const; + + // Gets the occupy status at a specific location across time domain. + // Returns -1 if the location is not occupied. + int getOccupyStatusAcrossTime(const MappingLoc &loc) const; + // Checks if a hardware resource is available across a time range. // This function leverages the isAvailableAcrossTime function in each // time step. @@ -111,7 +142,8 @@ class MappingState { void dumpOpToLocs(llvm::raw_ostream &os = llvm::errs()) const; // Getters for state information. - const std::set &getOccupiedLocs() const { + const std::map>> & + getOccupiedLocs() const { return this->occupied_locs; } const std::map &getLocToOp() const { @@ -122,7 +154,9 @@ class MappingState { } // Setters for state information. 
- void setOccupiedLocs(const std::set &locs) { + void setOccupiedLocs( + const std::map>> + &locs) { this->occupied_locs = locs; } void setLocToOp(const std::map &loc_to_op) { @@ -139,7 +173,9 @@ class MappingState { bool is_spatial_only; static constexpr int kMaxSteps = 10; - std::set occupied_locs; + // Maps location to a list of (occupy_status, operation) pairs. + // Multiple ops can occupy the same location with compatible pipeline states. + std::map>> occupied_locs; std::map loc_to_op; std::map> op_to_locs; }; @@ -160,7 +196,7 @@ class MappingStateSnapshot { } private: - std::set occupied_locs; + std::map>> occupied_locs; std::map loc_to_op; std::map> op_to_locs; }; diff --git a/include/NeuraDialect/Mapping/mapping_util.h b/include/NeuraDialect/Mapping/mapping_util.h index ee6fcefc..fd2f2329 100644 --- a/include/NeuraDialect/Mapping/mapping_util.h +++ b/include/NeuraDialect/Mapping/mapping_util.h @@ -116,5 +116,12 @@ bool canReachLocInTime(const std::vector &producers, Register *getAvailableRegister(const MappingState &mapping_state, Tile *tile, int start_time, int exclusive_end_time); +// Gets the execution latency of an operation from its "latency" attribute. +// Returns 1 (single-cycle) if the attribute is not present. +int getOpLatency(Operation *op); + +// Checks if an operation is a multi-cycle operation (latency > 1). 
+bool isMultiCycleOp(Operation *op); + } // namespace neura } // namespace mlir diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index 56a9e785..c3a03f35 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -51,6 +51,7 @@ std::unique_ptr createInitPatternPass(); // Hardware optimization passes std::unique_ptr createHardwareMergePass(); +std::unique_ptr createInitExecLatencyPass(); #define GEN_PASS_REGISTRATION #include "NeuraDialect/NeuraPasses.h.inc" diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td index f7fc06a3..cb0e8646 100644 --- a/include/NeuraDialect/NeuraPasses.td +++ b/include/NeuraDialect/NeuraPasses.td @@ -220,4 +220,12 @@ def HardwareMerge : Pass<"hardware-merge", "ModuleOp"> { }]; let constructor = "neura::createHardwareMergePass()"; } + +def InitExecLatency : Pass<"init-exec-latency", "ModuleOp"> { + let summary = "Initialize execution latency information"; + let description = [{ + This pass initializes execution latency information. 
+ }]; + let constructor = "neura::createInitExecLatencyPass()"; +} #endif // NEURA_PASSES_TD \ No newline at end of file diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index a23c5b02..92393d7c 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -22,6 +22,7 @@ void registerTosaToAffineConversionPassPipeline(); std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createClassifyCountersPass(); std::unique_ptr createMapTaskOnCgraPass(); +std::unique_ptr createFuseTaskPass(); //=========================================================// // Optimization Passes diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 8d765498..960f44c6 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -73,6 +73,27 @@ def MapTaskOnCgra : Pass<"map-task-on-cgra", "func::FuncOp"> { let constructor = "taskflow::createMapTaskOnCgraPass()"; } +def FuseTask : Pass<"fuse-task", "func::FuncOp"> { + let summary = "Fuses Taskflow tasks using producer-consumer and sibling strategies"; + let description = [{ + Fuses taskflow.task operations using producer-consumer and sibling + fusion strategies. Uses Neura-level MII metrics for profitability analysis. + + Producer-Consumer Fusion: Fuses a producer task into its consumer when + the producer's memory output feeds directly into the consumer. + + Sibling Fusion: Fuses tasks that share inputs without data dependency. 
+ }]; + let constructor = "taskflow::createFuseTaskPass()"; + let dependentDialects = [ + "mlir::LLVM::LLVMDialect", + "mlir::func::FuncDialect", + "mlir::arith::ArithDialect", + "mlir::memref::MemRefDialect", + "mlir::neura::NeuraDialect", + "mlir::taskflow::TaskflowDialect"]; +} + def MemoryAccessStreamingFusion : Pass<"memory-access-streaming-fusion", "func::FuncOp"> { let summary = "Fuses tasks connected by memory dependencies for streaming execution"; let description = [{ diff --git a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp index 80fe8e0c..36c83f22 100644 --- a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp +++ b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp @@ -67,10 +67,9 @@ struct HyperblockToKernelPattern } // Asserts that each task contains only one hyperblock. + // (Fused tasks may contain multiple hyperblocks, which is valid.) int hyperblock_count = 0; task_op.walk([&](TaskflowHyperblockOp op) { hyperblock_count++; }); - assert(hyperblock_count == 1 && - "Each taskflow.task should contain only one hyperblock"); Block &hb_block = hyperblock_op.getBody().front(); Block &task_block = task_op.getBody().front(); diff --git a/lib/NeuraDialect/Mapping/MappingState.cpp b/lib/NeuraDialect/Mapping/MappingState.cpp index 110d1976..d537eeea 100644 --- a/lib/NeuraDialect/Mapping/MappingState.cpp +++ b/lib/NeuraDialect/Mapping/MappingState.cpp @@ -3,6 +3,7 @@ #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "llvm/Support/raw_ostream.h" +#include using namespace mlir; using namespace mlir::neura; @@ -30,14 +31,62 @@ MappingState::MappingState(const Architecture &arch, int II, : II(II), is_spatial_only(is_spatial_only) {} bool MappingState::bindOp(const MappingLoc &loc, Operation *op) { + // Default to SINGLE_OCCUPY for backward compatibility + return bindOp(loc, op, SINGLE_OCCUPY); +} + +bool MappingState::bindOp(const MappingLoc &loc, Operation 
*op, + int occupy_status) { + // Check if the location is available for the specified occupy status + if (!isAvailableForOccupyStatus(loc, occupy_status)) { + return false; + } + loc_to_op[loc] = op; - occupied_locs.insert(loc); + occupied_locs[loc].push_back({occupy_status, op}); auto it = op_to_locs.find(op); assert(it == op_to_locs.end() && "Operation already has reserved locations"); op_to_locs[op].push_back(loc); return true; } +bool MappingState::bindMultiCycleOp(BasicResource *resource, int start_time, + int latency, Operation *op) { + // First check if all locations are available + for (int t = start_time; t < start_time + latency; ++t) { + MappingLoc check_loc = {resource, t}; + int status; + if (t == start_time) { + status = START_PIPE_OCCUPY; + } else if (t == start_time + latency - 1) { + status = END_PIPE_OCCUPY; + } else { + status = IN_PIPE_OCCUPY; + } + if (!isAvailableForOccupyStatus(check_loc, status)) { + return false; + } + } + + // Now bind all locations + for (int t = start_time; t < start_time + latency; ++t) { + MappingLoc loc = {resource, t}; + int status; + if (t == start_time) { + status = START_PIPE_OCCUPY; + } else if (t == start_time + latency - 1) { + status = END_PIPE_OCCUPY; + } else { + status = IN_PIPE_OCCUPY; + } + + loc_to_op[loc] = op; + occupied_locs[loc].push_back({status, op}); + op_to_locs[op].push_back(loc); + } + return true; +} + void MappingState::unbindOp(Operation *op) { auto it = op_to_locs.find(op); if (it == op_to_locs.end()) { @@ -46,7 +95,21 @@ void MappingState::unbindOp(Operation *op) { for (const MappingLoc &loc : it->second) { loc_to_op.erase(loc); - occupied_locs.erase(loc); + // Remove entries for this op from occupied_locs + auto occ_it = occupied_locs.find(loc); + if (occ_it != occupied_locs.end()) { + auto &entries = occ_it->second; + entries.erase( + std::remove_if(entries.begin(), entries.end(), + [op](const std::pair &entry) { + return entry.second == op; + }), + entries.end()); + // Remove the 
location entirely if no more entries + if (entries.empty()) { + occupied_locs.erase(occ_it); + } + } } op_to_locs.erase(it); @@ -57,21 +120,128 @@ bool MappingState::isAvailableAcrossTime(const MappingLoc &loc) const { if (this->is_spatial_only) { for (int t = 0; t < II * kMaxSteps; ++t) { MappingLoc check_loc = {loc.resource, t}; - if (occupied_locs.find(check_loc) != occupied_locs.end()) { - return false; + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end()) { + // Check if all existing occupy statuses allow new single-cycle op + for (const auto &entry : it->second) { + if (entry.first != IN_PIPE_OCCUPY) { + return false; + } + } } } return true; } else { - // Checks the availability across time domain. for (int t = loc.time_step % II; t < II * kMaxSteps; t += II) { MappingLoc check_loc = {loc.resource, t}; - if (occupied_locs.find(check_loc) != occupied_locs.end()) { + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end()) { + // Check if all existing occupy statuses allow new single-cycle op + for (const auto &entry : it->second) { + if (entry.first != IN_PIPE_OCCUPY) { + return false; + } + } + } + } + return true; + } +} + +bool MappingState::isAvailableForOccupyStatus(const MappingLoc &loc, + int new_occupy_status) const { + // Helper lambda to check a single location against all existing entries + auto checkSingleLoc = [this, new_occupy_status](const MappingLoc &check_loc) -> bool { + auto it = occupied_locs.find(check_loc); + if (it == occupied_locs.end() || it->second.empty()) { + // Location is free, always available + return true; + } + + // Check against all existing entries at this location + for (const auto &entry : it->second) { + int existing_status = entry.first; + + // Implement the pipeline-aware availability rules: + // - SINGLE_OCCUPY (0): exclusive, no other op can share + // - START_PIPE_OCCUPY (1): cannot coexist with SINGLE or another START + // - END_PIPE_OCCUPY (2): cannot coexist with SINGLE or 
another END + // - IN_PIPE_OCCUPY (3): can coexist with any status except SINGLE + + if (existing_status == SINGLE_OCCUPY) { + // SINGLE_OCCUPY blocks everything + return false; + } + + if (new_occupy_status == SINGLE_OCCUPY) { + // SINGLE_OCCUPY cannot be placed if anything is there return false; } + + if (new_occupy_status == START_PIPE_OCCUPY) { + // START cannot coexist with another START + if (existing_status == START_PIPE_OCCUPY) { + return false; + } + } + + if (new_occupy_status == END_PIPE_OCCUPY) { + // END cannot coexist with another END + if (existing_status == END_PIPE_OCCUPY) { + return false; + } + } + + // IN_PIPE_OCCUPY can coexist with START, END, or other IN_PIPE } return true; + }; + + // For spatial mapping, check all time steps + if (this->is_spatial_only) { + for (int t = 0; t < II * kMaxSteps; ++t) { + MappingLoc check_loc = {loc.resource, t}; + if (!checkSingleLoc(check_loc)) { + return false; + } + } + return true; + } else { + // Check across time domain (modulo II) + for (int t = loc.time_step % II; t < II * kMaxSteps; t += II) { + MappingLoc check_loc = {loc.resource, t}; + if (!checkSingleLoc(check_loc)) { + return false; + } + } + return true; + } +} + +int MappingState::getOccupyStatusAcrossTime(const MappingLoc &loc) const { + // For spatial mapping, check all time steps + if (this->is_spatial_only) { + for (int t = 0; t < II * kMaxSteps; ++t) { + MappingLoc check_loc = {loc.resource, t}; + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end() && !it->second.empty()) { + // Return the first status found (most restrictive) + return it->second[0].first; + } + } + return -1; + } else { + // Check across time domain (modulo II) + for (int t = loc.time_step % II; t < II * kMaxSteps; t += II) { + MappingLoc check_loc = {loc.resource, t}; + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end() && !it->second.empty()) { + // Return the first status found (most restrictive) + return it->second[0].first; 
+ } + } + return -1; } } @@ -202,12 +372,9 @@ void MappingState::reserveRoute(Operation *op, ArrayRef path) { op_to_locs[op] = std::vector(path.begin(), path.end()); for (const MappingLoc &loc : path) { - assert(occupied_locs.find(loc) == occupied_locs.end() && - "Mapping location already occupied"); loc_to_op[loc] = op; - assert(occupied_locs.find(loc) == occupied_locs.end() && - "Mapping location already occupied in occupied_locs"); - occupied_locs.insert(loc); + // Use SINGLE_OCCUPY for route reservations (links/registers) + occupied_locs[loc].push_back({SINGLE_OCCUPY, op}); } } @@ -221,7 +388,21 @@ void MappingState::releaseRoute(Operation *op) { for (const MappingLoc &loc : route) { loc_to_op.erase(loc); - occupied_locs.erase(loc); + // Remove entries for this op from occupied_locs + auto occ_it = occupied_locs.find(loc); + if (occ_it != occupied_locs.end()) { + auto &entries = occ_it->second; + entries.erase( + std::remove_if(entries.begin(), entries.end(), + [op](const std::pair &entry) { + return entry.second == op; + }), + entries.end()); + // Remove the location entirely if no more entries + if (entries.empty()) { + occupied_locs.erase(occ_it); + } + } } op_to_locs.erase(it); diff --git a/lib/NeuraDialect/Mapping/mapping_util.cpp b/lib/NeuraDialect/Mapping/mapping_util.cpp index 814c59a3..c007a9c3 100644 --- a/lib/NeuraDialect/Mapping/mapping_util.cpp +++ b/lib/NeuraDialect/Mapping/mapping_util.cpp @@ -4,6 +4,7 @@ #include "NeuraDialect/Mapping/mapping_util.h" #include "NeuraDialect/NeuraOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Operation.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -1115,13 +1116,37 @@ llvm::SmallVector mlir::neura::getCtrlMovUsers(Operation *op) { bool mlir::neura::placeAndRoute(Operation *op, const MappingLoc &target_loc, MappingState &mapping_state) { - if (mapping_state.bindOp(target_loc, op)) { + // Get the latency of the operation to determine 
if it's multi-cycle + int latency = getOpLatency(op); + bool is_multi_cycle = latency > 1; + + bool bind_success = false; + if (is_multi_cycle) { + // For multi-cycle ops, bind across multiple time steps with pipeline status + bind_success = mapping_state.bindMultiCycleOp( + target_loc.resource, target_loc.time_step, latency, op); + if (bind_success) { + llvm::errs() << "[DEBUG] Bound multi-cycle op (latency=" << latency + << ") " << *op << " onto loc: " + << target_loc.resource->getType() << "#" + << target_loc.resource->getId() + << " @t=" << target_loc.time_step << " to t=" + << (target_loc.time_step + latency - 1) << "\n"; + } + } else { + // For single-cycle ops, use default SINGLE_OCCUPY binding + bind_success = mapping_state.bindOp(target_loc, op); + if (bind_success) { + llvm::errs() << "[DEBUG] Schedule op " << *op + << " onto loc: " << target_loc.resource->getType() << "#" + << target_loc.resource->getId() + << " @t=" << target_loc.time_step << "\n"; + } + } + + if (bind_success) { std::vector routed_operands; std::vector routed_ctrl_movs; - llvm::errs() << "[DEBUG] Schedule op " << *op - << " onto loc: " << target_loc.resource->getType() << "#" - << target_loc.resource->getId() - << " @t=" << target_loc.time_step << "\n"; // Tries to route the data move operations. 
for (Value operand : op->getOperands()) { llvm::errs() << "Processing operand: " << operand << "\n"; @@ -1222,4 +1247,17 @@ bool mlir::neura::placeAndRoute(Operation *op, const MappingLoc &target_loc, return true; } return false; +} + +int mlir::neura::getOpLatency(Operation *op) { + // Try to get the latency attribute from the operation + if (auto latency_attr = op->getAttrOfType("latency")) { + return latency_attr.getInt(); + } + // Default to single-cycle if no latency attribute is present + return 1; +} + +bool mlir::neura::isMultiCycleOp(Operation *op) { + return getOpLatency(op) > 1; } \ No newline at end of file diff --git a/lib/NeuraDialect/Transforms/CMakeLists.txt b/lib/NeuraDialect/Transforms/CMakeLists.txt index 010fc3c7..ce23714a 100644 --- a/lib/NeuraDialect/Transforms/CMakeLists.txt +++ b/lib/NeuraDialect/Transforms/CMakeLists.txt @@ -18,6 +18,7 @@ add_mlir_library( TransformToSteerControlPass.cpp RemovePredicatedTypePass.cpp HardwareMergePass.cpp + InitExecLatencyPass.cpp GraphMining/HardwareTemplate.cpp DEPENDS diff --git a/lib/NeuraDialect/Transforms/GenerateCodePass.cpp b/lib/NeuraDialect/Transforms/GenerateCodePass.cpp index 3c2a7699..69a3d50c 100644 --- a/lib/NeuraDialect/Transforms/GenerateCodePass.cpp +++ b/lib/NeuraDialect/Transforms/GenerateCodePass.cpp @@ -85,6 +85,7 @@ static bool isCtrlMov(Operation *op) { return dyn_cast(op) != nullptr static bool isPhiStart(Operation *op) { return dyn_cast(op) != nullptr; } static bool isReserve(Operation *op) { return dyn_cast(op) != nullptr; } static bool isConstant(Operation *op) { return dyn_cast(op) != nullptr; } +static bool isFusedOp(Operation *op) { return dyn_cast(op) != nullptr; } // ---- Constant for phi_start operation ----. static constexpr unsigned kReserveOpIndex = 1; @@ -477,24 +478,29 @@ struct GenerateCodePass SmallVector &ctrl_movs, DenseMap &reserve_to_phi_map) { function.walk([&](Operation *op) { - // placement for every op (even for mov/reserve). 
+ // Skips operations inside fused_op regions. + if (op->getParentOp() && isFusedOp(op->getParentOp())) { + return; + } + + // Records placement for every op (even for mov/reserve). operation_placements[op] = getTileLocation(op); - // build reserve -> phi mapping. + // Builds reserve -> phi mapping for loop-carried dependencies. if (isPhiStart(op)) { if (Value reserve = getReserveOperand(op)) { reserve_to_phi_map[reserve] = op; } } - // collect forwarders. + // Collects forwarders for later expansion. if (isDataMov(op)) { data_movs.push_back(op); return; } if (isCtrlMov(op)) { ctrl_movs.push_back(op); return; } - // skip Reserve from materialization. + // Skips Reserve from materialization. if (isReserve(op)) return; - // materialize all other ops placed on tiles (compute/phi/const/etc.). + // Materializes all other ops placed on tiles (compute/phi/const/fused_op/etc.). TileLocation placement = operation_placements[op]; if (!placement.has_tile) return; @@ -824,6 +830,31 @@ struct GenerateCodePass const DenseMap &reserve2phi) { if (!validateForwarderShape(forwarder)) return; + // Checks if this data_mov/ctrl_mov has mapping_locs assigned by MapToAcceleratorPass. + auto mapping_locs = getMappingLocations(forwarder); + if (!mapping_locs || mapping_locs.empty()) { + // Skips this mov operation - it will be handled by its consumer or does not need routing. + // This is expected for data_mov that only feeds into ctrl_mov. + if constexpr (!IsCtrl) { + // For data_mov without mapping, verifies if it is only used by ctrl_mov. + bool only_ctrl_mov_users = true; + for (OpOperand &use : forwarder->getResult(0).getUses()) { + if (!isa(use.getOwner())) { + only_ctrl_mov_users = false; + break; + } + } + if (only_ctrl_mov_users) { + // This is expected - ctrl_mov handles this data transfer implicitly. + return; + } else { + // This data_mov has non-ctrl_mov users but no mapping - this is an error. 
+ forwarder->emitWarning("data_mov without mapping_locs has non-ctrl_mov users"); + } + } + return; + } + MovBasics basics = buildMovBasics(forwarder, topo); emitMovRoutingInstructions(forwarder, basics, topo); @@ -1022,6 +1053,10 @@ struct GenerateCodePass if (operation == func.getOperation()) return; // Skips function itself. if (isReserve(operation)) return; // Skips reserve nodes entirely (bypass later). if (isa(operation)) return; // Skips yield nodes entirely (bypass later). + // Skips operations inside fused_op regions - they are handled by hardware + if (operation->getParentOp() && isFusedOp(operation->getParentOp())) { + return; + } int dfg_id = getDfgId(operation); if (dfg_id < 0) { diff --git a/lib/NeuraDialect/Transforms/InitExecLatencyPass.cpp b/lib/NeuraDialect/Transforms/InitExecLatencyPass.cpp new file mode 100644 index 00000000..0a2e8ef2 --- /dev/null +++ b/lib/NeuraDialect/Transforms/InitExecLatencyPass.cpp @@ -0,0 +1,185 @@ +//===- InitExecLatencyPass.cpp - Initialize Execution Latency --------------===// +// +// This pass initializes execution latency information. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "NeuraDialect/Architecture/ArchitectureSpec.h" +#include "NeuraDialect/Architecture/Architecture.h" +#include "NeuraDialect/NeuraDialect.h" +#include "NeuraDialect/NeuraOps.h" +#include "NeuraDialect/NeuraPasses.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/YAMLParser.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace mlir; + +#define GEN_PASS_DEF_INITEXECLATENCY +#include "NeuraDialect/NeuraPasses.h.inc" + +namespace { + +// Helper function to parse YAML scalar to integer +static bool parseYamlScalarInt(const llvm::yaml::Node *node, int &result) { + auto *scalar = llvm::dyn_cast_or_null(node); + if (!scalar) + return false; + llvm::SmallString<64> value_string; + llvm::StringRef value_ref = scalar->getValue(value_string); + long long temp_value = 0; + if (value_ref.getAsInteger(10, temp_value)) + return false; + result = static_cast(temp_value); + return true; +} + +// Helper function to parse YAML scalar to string +static bool parseYamlScalarString(const llvm::yaml::Node *node, + std::string &result) { + auto *scalar = llvm::dyn_cast_or_null(node); + if (!scalar) + return false; + llvm::SmallString<64> value_string; + llvm::StringRef value_ref = scalar->getValue(value_string); + result = value_ref.str(); + return true; +} + +// Parse latency YAML file: expects a mapping of operation names to latency values +static bool parseLatencyYaml(const std::string &file_path, + std::map &latency_map) { + llvm::ErrorOr> buffer_or_err = + llvm::MemoryBuffer::getFile(file_path); + if (!buffer_or_err) { + llvm::errs() << "[InitExecLatencyPass] Failed to open latency specification file: " + << file_path << "\n"; + return false; + } + + llvm::SourceMgr sm; + sm.AddNewSourceBuffer(std::move(*buffer_or_err), 
llvm::SMLoc()); + llvm::yaml::Stream yaml_stream( + sm.getMemoryBuffer(sm.getMainFileID())->getBuffer(), sm); + + llvm::yaml::Document &yaml_doc = *yaml_stream.begin(); + if (yaml_stream.failed()) { + llvm::errs() << "[InitExecLatencyPass] YAML parse error in: " << file_path << "\n"; + return false; + } + + auto *root = yaml_doc.getRoot(); + if (!root) { + llvm::errs() << "[InitExecLatencyPass] Empty YAML document\n"; + return false; + } + + auto *root_map = llvm::dyn_cast(root); + if (!root_map) { + llvm::errs() << "[InitExecLatencyPass] YAML root is not a mapping\n"; + return false; + } + + for (auto &key_value_pair : *root_map) { + auto *key_node = + llvm::dyn_cast_or_null(key_value_pair.getKey()); + if (!key_node) + continue; + + std::string op_name; + if (!parseYamlScalarString(key_node, op_name)) + continue; + + int latency_value = 0; + if (!parseYamlScalarInt(key_value_pair.getValue(), latency_value)) + continue; + + latency_map[op_name] = latency_value; + } + + return true; +} + +void SetLatency(Operation *op, std::map &latency_map) { + // Get operation name and look up latency + std::string op_name = op->getName().getStringRef().str(); + if (op_name.compare("neura.fused_op") == 0) { + op_name = op->getAttrOfType("pattern_name").getValue().str(); + } + op_name = op_name.substr(op_name.find_last_of(".") + 1); // remove neura. 
prefix if exists + auto it = latency_map.find(op_name); + if (it != latency_map.end()) { + op->setAttr("latency", + IntegerAttr::get(IntegerType::get(op->getContext(), 32), it->second)); + } + else { + op->setAttr("latency", + IntegerAttr::get(IntegerType::get(op->getContext(), 32), 1)); + } +} + +struct InitExecLatencyPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(InitExecLatencyPass) + + InitExecLatencyPass() = default; + InitExecLatencyPass(const InitExecLatencyPass &pass) + : PassWrapper>(pass) {} + + StringRef getArgument() const override { return "init-exec-latency"; } + StringRef getDescription() const override { + return "Initialize execution latency information."; + } + + void runOnOperation() override { + + ModuleOp module_op = getOperation(); + llvm::errs() << "[InitExecLatencyPass] Running init-exec-latency pass\n"; + // Get latency spec file from global function (set by command line) + std::string latency_file = mlir::neura::getLatencySpecFile(); + if (latency_file.empty()) { + latency_file = "latency_map.yaml"; // default file name + } + + llvm::errs() << "[InitExecLatencyPass] Latency file: " << latency_file << "\n"; + // Builds a map of operation name to latency + std::map latency_map; + if (!parseLatencyYaml(latency_file, latency_map)) { + llvm::errs() << "[InitExecLatencyPass] Failed to parse latency specification file: " << latency_file << "\n"; + return; + } + + // Apply latency values to operations + module_op.walk([&](Operation *op) { + if (!op->getRegions().empty()) { + for (Region ®ion : op->getRegions()) { + region.walk([&](Operation *inner_op) { + // Skip operations inside fused_op regions + if (inner_op->getParentOp() && isa(inner_op->getParentOp())) { + return; + } + + if (inner_op->getName().getStringRef().str() == "neura.data_mov" || inner_op->getName().getStringRef().str() == "neura.reserve") { + return; + } + + SetLatency(inner_op, latency_map); + }); + } + } + }); + } +}; + +} // namespace + +namespace 
mlir::neura { +std::unique_ptr createInitExecLatencyPass() { + return std::make_unique(); +} +} // namespace mlir::neura diff --git a/lib/TaskflowDialect/CMakeLists.txt b/lib/TaskflowDialect/CMakeLists.txt index d8e5d7ff..49d60c57 100644 --- a/lib/TaskflowDialect/CMakeLists.txt +++ b/lib/TaskflowDialect/CMakeLists.txt @@ -13,4 +13,5 @@ add_mlir_dialect_library(MLIRTaskflow MLIRInferTypeOpInterface ) -add_subdirectory(Transforms) \ No newline at end of file +add_subdirectory(Transforms) +add_subdirectory(Transforms/Optimizations) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index 5dcb6736..60078298 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp ClassifyCountersPass.cpp MapTaskOnCgraPass.cpp + FuseTaskPass.cpp DEPENDS MLIRTaskflowTransformsIncGen @@ -14,8 +15,9 @@ add_mlir_library(MLIRTaskflowTransforms MLIRSupport MLIRTransforms MLIRTaskflow + MLIRNeura + MLIRNeuraTransforms + MLIRConversion ${dialect_libs} LLVMSupport -) - -add_subdirectory(Optimizations) \ No newline at end of file +) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/FuseTaskPass.cpp b/lib/TaskflowDialect/Transforms/FuseTaskPass.cpp new file mode 100644 index 00000000..d55a7540 --- /dev/null +++ b/lib/TaskflowDialect/Transforms/FuseTaskPass.cpp @@ -0,0 +1,1101 @@ +//===- FuseTaskPass.cpp - Task fusion for Taskflow dialect ----------------===// +// +// Fuses taskflow.task ops by merging counter chains and hyperblock bodies. +// Supports producer-consumer fusion (with intermediate store/load elimination) +// and sibling fusion (with shared input deduplication). 
+// +//===----------------------------------------------------------------------===// + +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowPasses.h" + +#include "NeuraDialect/NeuraOps.h" +#include "NeuraDialect/NeuraPasses.h" +#include "NeuraDialect/Architecture/Architecture.h" +#include "NeuraDialect/Mapping/mapping_util.h" +#include "Conversion/ConversionPasses.h" + +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Conversion/Passes.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Parser/Parser.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { + +//===----------------------------------------------------------------------===// +// Operand helpers +//===----------------------------------------------------------------------===// + +// Collects unique values from two ranges into a deduped vector with index map. 
+static void collectUnique(ValueRange a, ValueRange b, + SmallVectorImpl &result, + llvm::SmallDenseMap &idx_map) { + for (Value v : a) { idx_map[v] = result.size(); result.push_back(v); } + for (Value v : b) + if (!idx_map.count(v)) { idx_map[v] = result.size(); result.push_back(v); } +} + +//===----------------------------------------------------------------------===// +// Counter chain & hyperblock utilities +//===----------------------------------------------------------------------===// + +// Extracts all TaskflowCounterOps from a task body in definition order. +static SmallVector +extractCounterChain(TaskflowTaskOp task) { + SmallVector chain; + for (Operation &op : task.getBody().front()) + if (auto c = dyn_cast(&op)) + chain.push_back(c); + return chain; +} + +// Returns true if two counter chains have identical bounds and steps. +static bool counterChainsMatch(SmallVectorImpl &a, + SmallVectorImpl &b) { + if (a.size() != b.size()) return false; + for (unsigned i = 0; i < a.size(); ++i) { + auto get_int = [](Operation *op, StringRef name) -> int64_t { + return op->getAttrOfType(name).getInt(); + }; + if (get_int(a[i], "lower_bound") != get_int(b[i], "lower_bound") || + get_int(a[i], "upper_bound") != get_int(b[i], "upper_bound") || + get_int(a[i], "step") != get_int(b[i], "step")) + return false; + } + return true; +} + +// Finds the single TaskflowHyperblockOp in a task body. Returns nullptr +// if none or multiple exist. +static TaskflowHyperblockOp findHyperblock(TaskflowTaskOp task) { + TaskflowHyperblockOp result = nullptr; + for (Operation &op : task.getBody().front()) { + if (auto hb = dyn_cast(&op)) { + if (result) return nullptr; // multiple hyperblocks + result = hb; + } + } + return result; +} + +// Finds the single scf::ForOp in a hyperblock body. Returns nullptr +// if none or multiple exist. 
+static scf::ForOp findInnerForOp(TaskflowHyperblockOp hb) { + scf::ForOp result = nullptr; + for (auto &op : hb.getBody().front()) { + if (auto f = dyn_cast(&op)) { + if (result) return nullptr; + result = f; + } + } + return result; +} + +// Returns true if two scf::ForOp loops have identical constant bounds +// and step. +static bool scfForBoundsMatch(scf::ForOp a, scf::ForOp b) { + auto get_const = [](Value v) -> std::optional { + if (auto cst = v.getDefiningOp()) + return cst.value(); + return std::nullopt; + }; + auto al = get_const(a.getLowerBound()), bl = get_const(b.getLowerBound()); + auto au = get_const(a.getUpperBound()), bu = get_const(b.getUpperBound()); + auto as = get_const(a.getStep()), bs = get_const(b.getStep()); + return al && bl && au && bu && as && bs && + *al == *bl && *au == *bu && *as == *bs; +} + +//===----------------------------------------------------------------------===// +// Legality checks +//===----------------------------------------------------------------------===// + +// Returns true if task1 dominates task2 in the same block. +static bool canFuseTasks(TaskflowTaskOp t1, TaskflowTaskOp t2) { + return t1 && t2 && t1 != t2 && + t1->getBlock() == t2->getBlock() && + t1->isBeforeInBlock(t2); +} + +// Returns true if any write output of the producer feeds the consumer. +static bool hasProducerConsumerRelation(TaskflowTaskOp producer, + TaskflowTaskOp consumer) { + for (Value r : producer.getWriteOutputs()) { + for (Value in : consumer.getReadMemrefs()) if (r == in) return true; + for (Value in : consumer.getWriteMemrefs()) if (r == in) return true; + } + for (Value r : producer.getValueOutputs()) + for (Value in : consumer.getValueInputs()) if (r == in) return true; + return false; +} + +// Returns true if tasks share at least one input with no data dependency. 
+static bool areSiblingTasks(TaskflowTaskOp t1, TaskflowTaskOp t2) { + llvm::SmallPtrSet t1_mem; + for (Value v : t1.getReadMemrefs()) t1_mem.insert(v); + for (Value v : t1.getWriteMemrefs()) t1_mem.insert(v); + llvm::SmallPtrSet t1_val(t1.getValueInputs().begin(), + t1.getValueInputs().end()); + bool share = false; + for (Value v : t2.getReadMemrefs()) if (t1_mem.count(v)) { share = true; break; } + if (!share) + for (Value v : t2.getWriteMemrefs()) if (t1_mem.count(v)) { share = true; break; } + if (!share) + for (Value v : t2.getValueInputs()) if (t1_val.count(v)) { share = true; break; } + return share && !hasProducerConsumerRelation(t1, t2) && + !hasProducerConsumerRelation(t2, t1); +} + +// Returns true if any op between producer and consumer uses producer's results. +static bool hasInterveningUses(TaskflowTaskOp producer, + TaskflowTaskOp consumer) { + llvm::SmallPtrSet results; + for (Value v : producer.getWriteOutputs()) results.insert(v); + for (Value v : producer.getValueOutputs()) results.insert(v); + bool in_range = false; + for (Operation &op : *producer->getBlock()) { + if (&op == producer.getOperation()) { in_range = true; continue; } + if (&op == consumer.getOperation()) break; + if (in_range) + for (Value v : op.getOperands()) + if (results.count(v)) return true; + } + return false; +} + +// Returns true if all outputs of the task have at most one use. +static bool hasOnlySingleUseOutputs(TaskflowTaskOp task) { + for (Value r : task.getWriteOutputs()) + if (!r.hasOneUse() && !r.use_empty()) return false; + for (Value r : task.getValueOutputs()) + if (!r.hasOneUse() && !r.use_empty()) return false; + return true; +} + +// Clones non-structural ops from a task body (skips counters, hyperblocks, +// and yields). 
+static void cloneTaskBodyMiscOps(Block &body, OpBuilder &builder, + IRMapping &mapping) { + for (Operation &op : body) { + if (isa(&op)) + continue; + builder.clone(op, mapping); + } +} + +// Returns true if two tasks have compatible loop structures for merging. +// Handles both counter-chain and scf.for-inside-hyperblock cases. +static bool haveMatchingLoopStructure(TaskflowTaskOp t1, TaskflowTaskOp t2) { + auto hb1 = findHyperblock(t1), hb2 = findHyperblock(t2); + if (!hb1 || !hb2) return false; + if (hb1.getIterArgs().size() > 0 || hb2.getIterArgs().size() > 0) + return false; + + auto c1 = extractCounterChain(t1), c2 = extractCounterChain(t2); + if (!c1.empty() && !c2.empty()) + return counterChainsMatch(c1, c2); + + // No counter chains: compares scf.for loops in hyperblock bodies. + if (c1.empty() && c2.empty()) { + auto f1 = findInnerForOp(hb1), f2 = findInnerForOp(hb2); + if (f1 && f2) return scfForBoundsMatch(f1, f2); + // Both have no loops: trivially compatible. + if (!f1 && !f2) return true; + } + return false; +} + +//===----------------------------------------------------------------------===// +// Producer-consumer fusion +//===----------------------------------------------------------------------===// + +// Fuses a producer into its consumer by merging their counter chains and +// hyperblock bodies into a single task with direct SSA value forwarding +// for the intermediate memref. 
+static TaskflowTaskOp +fuseProducerConsumerTasks(TaskflowTaskOp producer, TaskflowTaskOp consumer, + Value intermediate_memref, OpBuilder &builder) { + Location loc = consumer.getLoc(); + auto prod_read = producer.getReadMemrefs(); + auto prod_write = producer.getWriteMemrefs(); + auto prod_val = producer.getValueInputs(); + auto cons_read = consumer.getReadMemrefs(); + auto cons_write = consumer.getWriteMemrefs(); + auto cons_val = consumer.getValueInputs(); + + // Collects fused inputs, keeping intermediate in write_memrefs for + // block arg availability but excluding it from consumer's read side. + SmallVector fused_read, fused_write, fused_val; + for (Value v : prod_read) fused_read.push_back(v); + for (Value v : cons_read) + if (v != intermediate_memref) fused_read.push_back(v); + for (Value v : prod_write) fused_write.push_back(v); + for (Value v : cons_write) + if (v != intermediate_memref) fused_write.push_back(v); + for (Value v : prod_val) fused_val.push_back(v); + for (Value v : cons_val) fused_val.push_back(v); + + // Output types come from the consumer only. 
+ SmallVector write_out_types(consumer.getWriteOutputs().getTypes()); + SmallVector val_out_types(consumer.getValueOutputs().getTypes()); + + SmallVector orig_reads, orig_writes; + for (Value v : producer.getOriginalReadMemrefs()) orig_reads.push_back(v); + for (Value v : consumer.getOriginalReadMemrefs()) orig_reads.push_back(v); + for (Value v : producer.getOriginalWriteMemrefs()) orig_writes.push_back(v); + for (Value v : consumer.getOriginalWriteMemrefs()) orig_writes.push_back(v); + + auto fused = builder.create( + loc, write_out_types, val_out_types, + fused_read, fused_write, fused_val, + builder.getStringAttr("fused_pc"), + orig_reads, orig_writes); + + Block *body = new Block(); + fused.getBody().push_back(body); + for (Value v : fused_read) body->addArgument(v.getType(), loc); + for (Value v : fused_write) body->addArgument(v.getType(), loc); + for (Value v : fused_val) body->addArgument(v.getType(), loc); + + unsigned fused_read_n = fused_read.size(); + unsigned fused_write_n = fused_write.size(); + + // --- Maps producer block args to fused block args --- + IRMapping mapping; + Block &prod_body = producer.getBody().front(); + for (unsigned i = 0; i < prod_read.size(); ++i) + mapping.map(prod_body.getArgument(i), body->getArgument(i)); + for (unsigned i = 0; i < prod_write.size(); ++i) + mapping.map(prod_body.getArgument(prod_read.size() + i), + body->getArgument(fused_read_n + i)); + for (unsigned i = 0; i < prod_val.size(); ++i) + mapping.map(prod_body.getArgument(prod_read.size() + prod_write.size() + i), + body->getArgument(fused_read_n + fused_write_n + i)); + + // Identifies the producer's task-body block arg for the intermediate write + // memref. The intermediate_memref is the producer's write OUTPUT result; + // finds which write output index it corresponds to, then gets the matching + // write memref block arg. 
+ BlockArgument prod_intermediate_arg = nullptr; + for (unsigned i = 0; i < producer.getWriteOutputs().size(); ++i) + if (producer.getWriteOutputs()[i] == intermediate_memref) + prod_intermediate_arg = prod_body.getArgument(prod_read.size() + i); + + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(body); + + // --- Maps consumer block args to fused block args --- + Block &cons_body = consumer.getBody().front(); + unsigned cons_read_fused = prod_read.size(); + for (unsigned i = 0; i < cons_read.size(); ++i) { + if (cons_read[i] == intermediate_memref) { + if (prod_intermediate_arg) + mapping.map(cons_body.getArgument(i), + mapping.lookupOrDefault(prod_intermediate_arg)); + } else { + mapping.map(cons_body.getArgument(i), + body->getArgument(cons_read_fused++)); + } + } + unsigned cons_write_fused = fused_read_n + prod_write.size(); + for (unsigned i = 0; i < cons_write.size(); ++i) { + if (cons_write[i] == intermediate_memref) { + if (prod_intermediate_arg) + mapping.map(cons_body.getArgument(cons_read.size() + i), + mapping.lookupOrDefault(prod_intermediate_arg)); + } else { + mapping.map(cons_body.getArgument(cons_read.size() + i), + body->getArgument(cons_write_fused++)); + } + } + unsigned cons_val_fused = fused_read_n + fused_write_n + prod_val.size(); + for (unsigned i = 0; i < cons_val.size(); ++i) + mapping.map(cons_body.getArgument(cons_read.size() + cons_write.size() + i), + body->getArgument(cons_val_fused + i)); + + // Clones counter chain from producer. + auto prod_counters = extractCounterChain(producer); + for (auto ctr : prod_counters) builder.clone(*ctr, mapping); + + // Clones non-counter, non-hyperblock, non-yield ops from both task bodies + // (e.g. arith.constant for loop bounds). + cloneTaskBodyMiscOps(prod_body, builder, mapping); + cloneTaskBodyMiscOps(cons_body, builder, mapping); + + // Locates both hyperblocks. 
+ auto prod_hb = findHyperblock(producer); + auto cons_hb = findHyperblock(consumer); + Block &prod_hb_body = prod_hb.getBody().front(); + Block &cons_hb_body = cons_hb.getBody().front(); + + // Creates fused hyperblock. + SmallVector triggers; + for (Value v : prod_hb.getIndices()) + triggers.push_back(mapping.lookupOrDefault(v)); + auto fused_hb = builder.create( + loc, TypeRange{}, triggers, ValueRange{}); + Block *hb_body = &fused_hb.getBody().emplaceBlock(); + for (auto arg : prod_hb_body.getArguments()) + hb_body->addArgument(arg.getType(), loc); + + // Maps hyperblock block args. + for (unsigned i = 0; i < prod_hb_body.getNumArguments(); ++i) + mapping.map(prod_hb_body.getArgument(i), hb_body->getArgument(i)); + for (unsigned i = 0; i < cons_hb_body.getNumArguments(); ++i) + mapping.map(cons_hb_body.getArgument(i), hb_body->getArgument(i)); + + // Identifies the consumer's intermediate read block arg for load skipping. + BlockArgument cons_intermediate_arg = nullptr; + for (unsigned i = 0; i < cons_read.size(); ++i) + if (cons_read[i] == intermediate_memref) + cons_intermediate_arg = cons_body.getArgument(i); + + { + OpBuilder::InsertionGuard hb_guard(builder); + builder.setInsertionPointToStart(hb_body); + + auto prod_for = findInnerForOp(prod_hb); + auto cons_for = findInnerForOp(cons_hb); + + if (prod_for && cons_for) { + // --- SCF loop fusion: merges for-body ops into a single scf.for --- + + // Clones non-for, non-yield ops from both hyperblock bodies. + for (Operation &op : prod_hb_body) { + if (isa(&op)) continue; + builder.clone(op, mapping); + } + for (Operation &op : cons_hb_body) { + if (isa(&op)) continue; + builder.clone(op, mapping); + } + + // Creates merged scf.for with producer's bounds. 
+ Value lb = mapping.lookupOrDefault(prod_for.getLowerBound()); + Value ub = mapping.lookupOrDefault(prod_for.getUpperBound()); + Value step = mapping.lookupOrDefault(prod_for.getStep()); + auto merged_for = builder.create(loc, lb, ub, step); + mapping.map(prod_for.getInductionVar(), merged_for.getInductionVar()); + mapping.map(cons_for.getInductionVar(), merged_for.getInductionVar()); + + OpBuilder::InsertionGuard for_guard(builder); + builder.setInsertionPoint(merged_for.getBody()->getTerminator()); + + // Clones producer for-body; skips store to intermediate, records + // the forwarded value. + Value forwarded = nullptr; + for (Operation &op : *prod_for.getBody()) { + if (isa(&op)) continue; + if (auto store = dyn_cast(&op)) { + if (prod_intermediate_arg && + store.getMemRef() == prod_intermediate_arg) { + forwarded = mapping.lookupOrDefault(store.getValueToStore()); + continue; + } + } + builder.clone(op, mapping); + } + + // Clones consumer for-body; replaces loads from intermediate + // with the forwarded value. 
+ for (Operation &op : *cons_for.getBody()) { + if (isa(&op)) continue; + if (auto load = dyn_cast(&op)) { + if (cons_intermediate_arg && forwarded && + load.getMemRef() == cons_intermediate_arg) { + mapping.map(load.getResult(), forwarded); + continue; + } + } + builder.clone(op, mapping); + } + + } else { + // --- Counter-chain path: clones hyperblock bodies sequentially --- + Value forwarded = nullptr; + for (Operation &op : prod_hb_body) { + if (isa(&op)) continue; + if (auto store = dyn_cast(&op)) { + if (prod_intermediate_arg && + store.getMemRef() == prod_intermediate_arg) { + forwarded = mapping.lookupOrDefault(store.getValueToStore()); + continue; + } + } + builder.clone(op, mapping); + } + for (Operation &op : cons_hb_body) { + if (isa(&op)) continue; + if (auto load = dyn_cast(&op)) { + if (cons_intermediate_arg && forwarded && + load.getMemRef() == cons_intermediate_arg) { + mapping.map(load.getResult(), forwarded); + continue; + } + } + builder.clone(op, mapping); + } + } + + builder.create(loc); + } + + // Creates the task yield from consumer's yield operands. + auto cons_yield = cast(consumer.getBody().front().getTerminator()); + SmallVector yield_mem, yield_val; + for (Value v : cons_yield.getMemoryResults()) + yield_mem.push_back(mapping.lookupOrDefault(v)); + for (Value v : cons_yield.getValueResults()) + yield_val.push_back(mapping.lookupOrDefault(v)); + builder.create(loc, yield_mem, yield_val); + return fused; +} + +//===----------------------------------------------------------------------===// +// Sibling fusion +//===----------------------------------------------------------------------===// + +// Fuses two sibling tasks by merging their counter chains and hyperblock +// bodies into a single task with deduplicated inputs. +static TaskflowTaskOp fuseSiblingTasks(TaskflowTaskOp t1, TaskflowTaskOp t2, + OpBuilder &builder) { + Location loc = t1.getLoc(); + + // Deduplicates inputs across both tasks. 
+ SmallVector fused_read, fused_write, fused_val; + llvm::SmallDenseMap read_idx, write_idx, val_idx; + collectUnique(t1.getReadMemrefs(), t2.getReadMemrefs(), fused_read, read_idx); + collectUnique(t1.getWriteMemrefs(), t2.getWriteMemrefs(), fused_write, write_idx); + collectUnique(t1.getValueInputs(), t2.getValueInputs(), fused_val, val_idx); + + // Combined output types. + SmallVector write_out_types, val_out_types; + write_out_types.append(t1.getWriteOutputs().getTypes().begin(), + t1.getWriteOutputs().getTypes().end()); + write_out_types.append(t2.getWriteOutputs().getTypes().begin(), + t2.getWriteOutputs().getTypes().end()); + val_out_types.append(t1.getValueOutputs().getTypes().begin(), + t1.getValueOutputs().getTypes().end()); + val_out_types.append(t2.getValueOutputs().getTypes().begin(), + t2.getValueOutputs().getTypes().end()); + + SmallVector orig_reads, orig_writes; + for (Value v : t1.getOriginalReadMemrefs()) orig_reads.push_back(v); + for (Value v : t2.getOriginalReadMemrefs()) orig_reads.push_back(v); + for (Value v : t1.getOriginalWriteMemrefs()) orig_writes.push_back(v); + for (Value v : t2.getOriginalWriteMemrefs()) orig_writes.push_back(v); + + auto fused = builder.create( + loc, write_out_types, val_out_types, + fused_read, fused_write, fused_val, + builder.getStringAttr("fused_sibling"), + orig_reads, orig_writes); + + Block *body = new Block(); + fused.getBody().push_back(body); + for (Value v : fused_read) body->addArgument(v.getType(), loc); + for (Value v : fused_write) body->addArgument(v.getType(), loc); + for (Value v : fused_val) body->addArgument(v.getType(), loc); + + unsigned rn = fused_read.size(), wn = fused_write.size(); + + // Lambda to map a task's block args to fused block args via index maps. 
+ auto mapBlockArgs = [&](TaskflowTaskOp task, IRMapping &m) { + Block &tb = task.getBody().front(); + auto r = task.getReadMemrefs(); + auto w = task.getWriteMemrefs(); + auto v = task.getValueInputs(); + for (unsigned i = 0; i < r.size(); ++i) + m.map(tb.getArgument(i), body->getArgument(read_idx[r[i]])); + for (unsigned i = 0; i < w.size(); ++i) + m.map(tb.getArgument(r.size() + i), + body->getArgument(rn + write_idx[w[i]])); + for (unsigned i = 0; i < v.size(); ++i) + m.map(tb.getArgument(r.size() + w.size() + i), + body->getArgument(rn + wn + val_idx[v[i]])); + }; + + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(body); + + // Clones counter chain from t1. + IRMapping mapping; + mapBlockArgs(t1, mapping); + auto counters = extractCounterChain(t1); + for (auto ctr : counters) builder.clone(*ctr, mapping); + + // Clones non-counter, non-hyperblock, non-yield ops from both task bodies. + Block &t1_body = t1.getBody().front(); + Block &t2_body = t2.getBody().front(); + IRMapping mapping2; + mapBlockArgs(t2, mapping2); + cloneTaskBodyMiscOps(t1_body, builder, mapping); + cloneTaskBodyMiscOps(t2_body, builder, mapping2); + + // Locates both hyperblocks. + auto hb1 = findHyperblock(t1), hb2 = findHyperblock(t2); + Block &hb1_body = hb1.getBody().front(); + Block &hb2_body = hb2.getBody().front(); + + // Creates fused hyperblock. + SmallVector triggers; + for (Value v : hb1.getIndices()) + triggers.push_back(mapping.lookupOrDefault(v)); + auto fused_hb = builder.create( + loc, TypeRange{}, triggers, ValueRange{}); + Block *hb_body = &fused_hb.getBody().emplaceBlock(); + for (auto arg : hb1_body.getArguments()) + hb_body->addArgument(arg.getType(), loc); + + // Maps hyperblock block args. 
+ for (unsigned i = 0; i < hb1_body.getNumArguments(); ++i) + mapping.map(hb1_body.getArgument(i), hb_body->getArgument(i)); + for (unsigned i = 0; i < hb2_body.getNumArguments(); ++i) + mapping2.map(hb2_body.getArgument(i), hb_body->getArgument(i)); + + { + OpBuilder::InsertionGuard hb_guard(builder); + builder.setInsertionPointToStart(hb_body); + + auto for1 = findInnerForOp(hb1), for2 = findInnerForOp(hb2); + + if (for1 && for2) { + // --- SCF loop fusion: merges for-body ops into a single scf.for --- + + // Clones non-for, non-yield ops from both hyperblock bodies. + for (Operation &op : hb1_body) { + if (isa(&op)) continue; + builder.clone(op, mapping); + } + for (Operation &op : hb2_body) { + if (isa(&op)) continue; + builder.clone(op, mapping2); + } + + // Creates merged scf.for with t1's bounds. + Value lb = mapping.lookupOrDefault(for1.getLowerBound()); + Value ub = mapping.lookupOrDefault(for1.getUpperBound()); + Value step = mapping.lookupOrDefault(for1.getStep()); + auto merged_for = builder.create(loc, lb, ub, step); + mapping.map(for1.getInductionVar(), merged_for.getInductionVar()); + mapping2.map(for2.getInductionVar(), merged_for.getInductionVar()); + + OpBuilder::InsertionGuard for_guard(builder); + builder.setInsertionPoint(merged_for.getBody()->getTerminator()); + + // Clones t1 for-body. + for (Operation &op : *for1.getBody()) { + if (isa(&op)) continue; + builder.clone(op, mapping); + } + // Clones t2 for-body. + for (Operation &op : *for2.getBody()) { + if (isa(&op)) continue; + builder.clone(op, mapping2); + } + + } else { + // --- Counter-chain path: clones hyperblock bodies sequentially --- + for (Operation &op : hb1_body) { + if (isa(&op)) continue; + builder.clone(op, mapping); + } + for (Operation &op : hb2_body) { + if (isa(&op)) continue; + builder.clone(op, mapping2); + } + } + + builder.create(loc); + } + + // Creates combined task yield. 
+ auto t1_yield = cast(t1.getBody().front().getTerminator()); + auto t2_yield = cast(t2.getBody().front().getTerminator()); + SmallVector all_mem, all_val; + for (Value v : t1_yield.getMemoryResults()) + all_mem.push_back(mapping.lookupOrDefault(v)); + for (Value v : t1_yield.getValueResults()) + all_val.push_back(mapping.lookupOrDefault(v)); + for (Value v : t2_yield.getMemoryResults()) + all_mem.push_back(mapping2.lookupOrDefault(v)); + for (Value v : t2_yield.getValueResults()) + all_val.push_back(mapping2.lookupOrDefault(v)); + builder.create(loc, all_mem, all_val); + return fused; +} + +//===----------------------------------------------------------------------===// +// Profitability analysis +//===----------------------------------------------------------------------===// + +// Represents metrics for evaluating fusion profitability. +struct FusionMetrics { + int rec_mii = 1; + int res_mii = 1; + int max_fanout = 0; + int num_ops = 0; +}; + +// Calculates the maximum fanout across all ops in a region. +static int calculateMaxFanoutInRegion(Region ®ion) { + int max_fanout = 0; + region.walk([&](Operation *op) { + for (Value result : op->getResults()) { + int fanout = std::distance(result.use_begin(), result.use_end()); + max_fanout = std::max(max_fanout, fanout); + } + }); + return max_fanout; +} + +// Runs the taskflow-to-neura pipeline on a cloned module and computes +// MII metrics on the resulting "test_fused_kernel" function. +static FusionMetrics computeRealMetrics( + ModuleOp test_module, const neura::Architecture &architecture) { + FusionMetrics metrics; + auto cloned = test_module.clone(); + + // Phase 1: converts taskflow ops to neura kernels. 
+ { + PassManager pm(cloned.getContext()); + pm.addPass(taskflow::createClassifyCountersPass()); + pm.addPass(createConvertTaskflowToNeuraPass()); + pm.enableVerifier(false); + if (failed(pm.run(cloned))) { + metrics.rec_mii = 100; + metrics.res_mii = 100; + cloned.erase(); + return metrics; + } + } + + // Converts func.return to neura.return for accelerator-marked functions + // so that the subsequent neura passes can process them correctly. + cloned.walk([&](func::ReturnOp ret) { + auto func_op = ret->getParentOfType(); + if (!func_op || !func_op->getAttrOfType("accelerator")) { + return; + } + OpBuilder builder(ret); + auto neura_ret = builder.create(ret.getLoc(), ret.getOperands()); + if (ret.getNumOperands() > 0) { + neura_ret->setAttr("return_type", builder.getStringAttr("value")); + } else { + neura_ret->setAttr("return_type", builder.getStringAttr("void")); + } + ret.erase(); + }); + + // Phase 2: runs the neura compilation pipeline. + { + PassManager pm(cloned.getContext()); + pm.addPass(neura::createAssignAcceleratorPass()); + pm.addPass(createLowerArithToNeuraPass()); + pm.addPass(createLowerMemRefToNeuraPass()); + pm.addPass(neura::createCanonicalizeReturnPass()); + pm.addPass(neura::createCanonicalizeCastPass()); + pm.addPass(neura::createPromoteInputArgToConstPass()); + pm.addPass(neura::createCanonicalizeLiveInPass()); + pm.addPass(neura::createLeveragePredicatedValuePass()); + pm.addPass(neura::createTransformCtrlToDataFlowPass()); + pm.enableVerifier(false); + if (failed(pm.run(cloned))) { + metrics.rec_mii = 100; + metrics.res_mii = 100; + cloned.erase(); + return metrics; + } + } + + cloned.walk([&](func::FuncOp func_op) { + if (func_op.getName() != "test_fused_kernel") { + return; + } + metrics.res_mii = neura::calculateResMii(func_op.getBody(), architecture); + auto cycles = neura::collectRecurrenceCycles(func_op.getBody()); + metrics.rec_mii = 1; + for (const auto &cycle : cycles) { + metrics.rec_mii = std::max(metrics.rec_mii, cycle.length); 
+ } + int num_ops = 0; + func_op.walk([&](Operation *op) { + if (!isa(op) && !op->hasTrait()) { + ++num_ops; + } + }); + metrics.num_ops = num_ops; + if (!func_op.getBody().empty()) { + metrics.max_fanout = calculateMaxFanoutInRegion(func_op.getBody()); + } + }); + + cloned.erase(); + return metrics; +} + +// Creates a test module containing a single task wrapped in a function. +static ModuleOp createTestModuleForTask(TaskflowTaskOp task) { + MLIRContext *ctx = task->getContext(); + OpBuilder builder(ctx); + Location loc = builder.getUnknownLoc(); + auto module = ModuleOp::create(loc); + builder.setInsertionPointToStart(module.getBody()); + + // Collects unique operand values from the task. + llvm::SetVector unique_operands; + for (Value v : task->getOperands()) { + unique_operands.insert(v); + } + + SmallVector input_types; + for (Value v : unique_operands) { + input_types.push_back(v.getType()); + } + SmallVector output_types; + for (Value v : task->getResults()) { + output_types.push_back(v.getType()); + } + if (input_types.empty()) { + input_types.push_back(builder.getI64Type()); + } + if (output_types.empty()) { + output_types.push_back(builder.getI64Type()); + } + + auto func_type = builder.getFunctionType(input_types, output_types); + auto func_op = builder.create(loc, "test_fused_kernel", func_type); + func_op->setAttr("accelerator", builder.getStringAttr("neura")); + + Block *entry = func_op.addEntryBlock(); + builder.setInsertionPointToStart(entry); + + IRMapping mapping; + for (auto [i, v] : llvm::enumerate(unique_operands)) { + mapping.map(v, entry->getArgument(i)); + } + + auto *cloned = builder.clone(*task.getOperation(), mapping); + auto cloned_task = cast(cloned); + + SmallVector ret; + for (Value v : cloned_task->getResults()) { + ret.push_back(v); + } + if (ret.empty()) { + ret.push_back(entry->getArgument(0)); + } + builder.create(loc, ret); + return module; +} + +// Computes metrics for a single task by running the neura pipeline. 
+static FusionMetrics computeSingleTaskMetrics( + TaskflowTaskOp task, const neura::Architecture &architecture) { + auto module = createTestModuleForTask(task); + FusionMetrics metrics = computeRealMetrics(module, architecture); + module.erase(); + return metrics; +} + +// Creates a test module with two tasks fused together. +static ModuleOp createFusedTestModule( + TaskflowTaskOp task1, TaskflowTaskOp task2, + bool is_producer_consumer, Value intermediate_memref) { + MLIRContext *ctx = task1->getContext(); + OpBuilder builder(ctx); + Location loc = builder.getUnknownLoc(); + auto module = ModuleOp::create(loc); + builder.setInsertionPointToStart(module.getBody()); + + // Collects unique external values (excludes task1's results used by task2). + llvm::SetVector unique_vals; + for (Value v : task1->getOperands()) { + unique_vals.insert(v); + } + for (Value v : task2->getOperands()) { + unique_vals.insert(v); + } + for (Value v : task1->getResults()) { + unique_vals.remove(v); + } + + SmallVector input_types; + for (Value v : unique_vals) { + input_types.push_back(v.getType()); + } + if (input_types.empty()) { + input_types.push_back(builder.getI64Type()); + } + + // Placeholder output type; the fused task's results drive the actual return. + SmallVector output_types; + output_types.push_back(builder.getI64Type()); + + auto func_type = builder.getFunctionType(input_types, output_types); + auto func_op = builder.create(loc, "test_fused_kernel", func_type); + func_op->setAttr("accelerator", builder.getStringAttr("neura")); + + Block *entry = func_op.addEntryBlock(); + builder.setInsertionPointToStart(entry); + + IRMapping mapping; + for (auto [i, v] : llvm::enumerate(unique_vals)) { + mapping.map(v, entry->getArgument(i)); + } + + // Clones task1. + auto *ct1_op = builder.clone(*task1.getOperation(), mapping); + auto ct1 = cast(ct1_op); + + // Maps task1's results so task2's operands resolve correctly. 
+ for (auto [orig, cloned] : llvm::zip(task1->getResults(), ct1->getResults())) { + mapping.map(orig, cloned); + } + + // Clones task2. + auto *ct2_op = builder.clone(*task2.getOperation(), mapping); + auto ct2 = cast(ct2_op); + + // Resolves the cloned intermediate memref. + Value cloned_intermediate; + if (is_producer_consumer && intermediate_memref) { + cloned_intermediate = mapping.lookupOrDefault(intermediate_memref); + } + + // Fuses the cloned tasks. + TaskflowTaskOp fused; + if (is_producer_consumer) { + fused = fuseProducerConsumerTasks(ct1, ct2, cloned_intermediate, builder); + } else { + fused = fuseSiblingTasks(ct1, ct2, builder); + } + + // Fixes the function return type and creates a return op. + SmallVector ret; + for (Value v : fused->getResults()) { + ret.push_back(v); + } + if (ret.empty()) { + ret.push_back(entry->getArgument(0)); + } + + // Removes placeholder return type and rebuilds with actual types. + SmallVector real_output_types; + for (Value v : ret) { + real_output_types.push_back(v.getType()); + } + func_op.setFunctionType(builder.getFunctionType(input_types, real_output_types)); + builder.create(loc, ret); + + // Erases un-fused clones (consumer first to drop uses of producer results). + ct2->erase(); + ct1->erase(); + return module; +} + +// Computes metrics for the fused version of two tasks. +static FusionMetrics computeFusedTaskMetrics( + TaskflowTaskOp task1, TaskflowTaskOp task2, + bool is_producer_consumer, Value intermediate_memref, + const neura::Architecture &architecture) { + auto module = createFusedTestModule(task1, task2, is_producer_consumer, intermediate_memref); + FusionMetrics metrics = computeRealMetrics(module, architecture); + module.erase(); + return metrics; +} + +// Estimates the effective MII considering utilization and fanout penalties. 
+static int estimateMII(const FusionMetrics &metrics, int total_tiles) {
+  // Weights for the utilization and fanout penalty terms.
+  const float alpha = 0.5f;
+  const float beta = 0.5f;
+  int mii = std::max(metrics.rec_mii, metrics.res_mii);
+  // NOTE(review): <float>/<int> template args reconstructed (angle brackets
+  // were stripped in transit); the float cast is required so the op/tile
+  // ratio is not truncated by integer division.
+  float utilization_factor = 1.0f + alpha * (metrics.num_ops / static_cast<float>(total_tiles));
+  // Fanout above 4 is penalized linearly; lower fanout adds no penalty.
+  float fanout_factor = 1.0f + beta * std::max(metrics.max_fanout - 4, 0);
+  return static_cast<int>(std::ceil(utilization_factor * fanout_factor * mii));
+}
+
+// Checks if fusion is profitable by comparing estimated MII of the fused
+// task against the individual tasks. For producer-consumer fusion, the
+// unfused MII is the sum (sequential execution); for sibling fusion, it
+// is the max (independent execution).
+static bool isFusionProfitable(TaskflowTaskOp task1, TaskflowTaskOp task2,
+                               bool is_producer_consumer,
+                               Value intermediate = nullptr) {
+  // NOTE(review): a 1x1 architecture is used for the profitability model --
+  // confirm this default is intended rather than the real target size.
+  neura::Architecture architecture(1, 1);
+  int total_tiles = architecture.getNumTiles();
+
+  FusionMetrics m1 = computeSingleTaskMetrics(task1, architecture);
+  FusionMetrics m2 = computeSingleTaskMetrics(task2, architecture);
+  FusionMetrics fused = computeFusedTaskMetrics(task1, task2, is_producer_consumer, intermediate, architecture);
+
+  int mii_1 = estimateMII(m1, total_tiles);
+  int mii_2 = estimateMII(m2, total_tiles);
+  int mii_fused = estimateMII(fused, total_tiles);
+
+  // Uses raw MII (max of recurrence and resource MII) for the core
+  // comparison, since the estimateMII utilization penalty is additive
+  // and already reflected in res_mii.
+  int raw_1 = std::max(m1.rec_mii, m1.res_mii);
+  int raw_2 = std::max(m2.rec_mii, m2.res_mii);
+  int raw_fused = std::max(fused.rec_mii, fused.res_mii);
+  int unfused_mii = is_producer_consumer ?
+      (raw_1 + raw_2) : std::max(raw_1, raw_2);
+  bool profitable = raw_fused <= unfused_mii;
+
+  llvm::errs() << "[fuse-task] Profitability:"
+               << " m1(rec=" << m1.rec_mii << " res=" << m1.res_mii << " ops=" << m1.num_ops << " fan=" << m1.max_fanout << " mii=" << mii_1 << ")"
+               << " m2(rec=" << m2.rec_mii << " res=" << m2.res_mii << " ops=" << m2.num_ops << " fan=" << m2.max_fanout << " mii=" << mii_2 << ")"
+               << " fused(rec=" << fused.rec_mii << " res=" << fused.res_mii << " ops=" << fused.num_ops << " fan=" << fused.max_fanout << " mii=" << mii_fused << ")"
+               << " -> " << (profitable ? "PROFITABLE" : "REJECTED") << "\n";
+  return profitable;
+}
+
+//===----------------------------------------------------------------------===//
+// Rewrite patterns
+//===----------------------------------------------------------------------===//
+
+// Fuses a producer task into its consumer when the producer's output feeds
+// directly into the consumer and the loop structures match.
+struct ProducerConsumerTaskFusion
+    : public OpRewritePattern<TaskflowTaskOp> {
+  using OpRewritePattern<TaskflowTaskOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TaskflowTaskOp consumer,
+                                PatternRewriter &rewriter) const override {
+    TaskflowTaskOp producer = nullptr;
+    Value intermediate;
+
+    // Searches consumer's read and write memrefs for a fusible producer.
+    auto tryFindProducer = [&](ValueRange inputs) -> bool {
+      for (Value in : inputs) {
+        // NOTE(review): <TaskflowTaskOp> reconstructed -- `def` is assigned to
+        // `producer` (a TaskflowTaskOp) below, so the typed overload is needed.
+        auto def = in.getDefiningOp<TaskflowTaskOp>();
+        if (!canFuseTasks(def, consumer) ||
+            !hasOnlySingleUseOutputs(def) ||
+            hasInterveningUses(def, consumer) ||
+            !haveMatchingLoopStructure(def, consumer) ||
+            !isFusionProfitable(def, consumer, true, in))
+          continue;
+        producer = def;
+        intermediate = in;
+        return true;
+      }
+      return false;
+    };
+    if (!tryFindProducer(consumer.getReadMemrefs()) &&
+        !tryFindProducer(consumer.getWriteMemrefs()))
+      return failure();
+
+    auto fused = fuseProducerConsumerTasks(producer, consumer,
+                                           intermediate, rewriter);
+    // Redirects all consumer outputs to the fused task before erasing.
+    for (auto [o, n] : llvm::zip(consumer.getWriteOutputs(),
+                                 fused.getWriteOutputs()))
+      o.replaceAllUsesWith(n);
+    for (auto [o, n] : llvm::zip(consumer.getValueOutputs(),
+                                 fused.getValueOutputs()))
+      o.replaceAllUsesWith(n);
+    // Erases consumer first so its uses of producer results disappear.
+    rewriter.eraseOp(consumer);
+    rewriter.eraseOp(producer);
+    return success();
+  }
+};
+
+// Fuses sibling tasks that share inputs and have matching loop structures.
+struct SiblingTaskFusion : public OpRewritePattern<TaskflowTaskOp> {
+  // NOTE(review): <TaskflowTaskOp> template arguments reconstructed (angle
+  // brackets were stripped in transit); confirm against the original patch.
+  using OpRewritePattern<TaskflowTaskOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TaskflowTaskOp t1,
+                                PatternRewriter &rewriter) const override {
+    // Scans forward for the first fusible sibling task.
+    TaskflowTaskOp t2 = nullptr;
+    for (Operation *op = t1->getNextNode(); op; op = op->getNextNode()) {
+      auto next = dyn_cast<TaskflowTaskOp>(op);
+      if (!next) continue;
+      if (areSiblingTasks(t1, next) && canFuseTasks(t1, next) &&
+          haveMatchingLoopStructure(t1, next) &&
+          isFusionProfitable(t1, next, false)) {
+        t2 = next;
+        break;
+      }
+    }
+    if (!t2) return failure();
+
+    auto fused = fuseSiblingTasks(t1, t2, rewriter);
+    // Fused outputs are laid out as [t1 outputs..., t2 outputs...]; t2's
+    // outputs start at t1's output counts.
+    unsigned t1_wo = t1.getWriteOutputs().size();
+    unsigned t1_vo = t1.getValueOutputs().size();
+    for (unsigned i = 0; i < t1_wo; ++i)
+      t1.getWriteOutputs()[i].replaceAllUsesWith(fused.getWriteOutputs()[i]);
+    for (unsigned i = 0; i < t1_vo; ++i)
+      t1.getValueOutputs()[i].replaceAllUsesWith(fused.getValueOutputs()[i]);
+    for (unsigned i = 0; i < t2.getWriteOutputs().size(); ++i)
+      t2.getWriteOutputs()[i].replaceAllUsesWith(
+          fused.getWriteOutputs()[t1_wo + i]);
+    for (unsigned i = 0; i < t2.getValueOutputs().size(); ++i)
+      t2.getValueOutputs()[i].replaceAllUsesWith(
+          fused.getValueOutputs()[t1_vo + i]);
+    rewriter.eraseOp(t1);
+    rewriter.eraseOp(t2);
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Pass definition
+//===----------------------------------------------------------------------===//
+
+// Applies producer-consumer and sibling task fusion using MII-based
+// profitability analysis and loop body merging.
+struct FuseTaskPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(FuseTaskPass) + + StringRef getArgument() const override { return "fuse-task"; } + StringRef getDescription() const override { + return "Fuses taskflow.task ops via counter chain and hyperblock merging."; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext(), /*benefit=*/10); + patterns.add(&getContext(), /*benefit=*/5); + if (failed(applyPatternsGreedily(getOperation(), + FrozenRewritePatternSet(std::move(patterns))))) + signalPassFailure(); + } +}; + +} // namespace + +namespace mlir::taskflow { +std::unique_ptr createFuseTaskPass() { + return std::make_unique(); +} +} // namespace mlir::taskflow diff --git a/test/neura/kernel_fusion/kernel.cpp b/test/neura/kernel_fusion/kernel.cpp new file mode 100644 index 00000000..7b1cb617 --- /dev/null +++ b/test/neura/kernel_fusion/kernel.cpp @@ -0,0 +1,152 @@ +// Test cases for FuseTaskPass +// +// Build workflow using Polygeist: +// 1. cgeist kernel.cpp -S -O2 -> SCF loops +// 2. polygeist-opt --raise-scf-to-affine -> Affine loops +// 3. mlir-neura-opt --affine-loop-tree-serialization \ +// --convert-affine-to-taskflow \ +// --construct-hyperblock-from-task -> taskflow.task ops +// 4. 
mlir-neura-opt --fuse-task -> Fused tasks

#define N 64

// Global buffers used by main(); the test functions themselves take the
// arrays as parameters so Polygeist emits memref arguments.
float A[N], B[N], C[N], D[N], E[N], F[N], G[N], H[N], X[N], Y[N];

// Producer-Consumer Fusion: kernel0 -> kernel1
// kernel0: C[i] = A[i] + B[i]
// kernel1: D[i] = C[i] * 2.0
void test_producer_consumer_fusion(float A[], float B[], float C[], float D[]) {
  for (int i = 0; i < N; i++) {
    C[i] = A[i] + B[i];
  }

  for (int i = 0; i < N; i++) {
    D[i] = C[i] * 2.0f;
  }
}

// Multiple Consumers: kernel0 -> kernel1, kernel0 -> kernel2
// kernel0: C[i] = A[i] + B[i]
// kernel1: D[i] = C[i] * 2.0
// kernel2: E[i] = C[i] + 1.0
// NOTE(review): this function is not invoked from main() below -- confirm
// whether the call was intentionally omitted.
void test_multiple_consumers(float A[], float B[], float C[], float D[], float E[]) {
  for (int i = 0; i < N; i++) {
    C[i] = A[i] + B[i];
  }

  for (int i = 0; i < N; i++) {
    D[i] = C[i] * 2.0f;
  }

  for (int i = 0; i < N; i++) {
    E[i] = C[i] + 1.0f;
  }
}

// Sibling Fusion: kernel0 || kernel1 (share input A)
// kernel0: E[i] = A[i] * 3.0
// kernel1: F[i] = A[i] + 1.0
void test_sibling_fusion(float A[], float E[], float F[]) {
  for (int i = 0; i < N; i++) {
    E[i] = A[i] * 3.0f;
  }

  for (int i = 0; i < N; i++) {
    F[i] = A[i] + 1.0f;
  }
}

// No Shared Input: kernel0, kernel1 (no fusion - different inputs)
// kernel0: G[i] = X[i] * 2.0
// kernel1: H[i] = Y[i] + 3.0
void test_no_shared_input(float X[], float Y[], float G[], float H[]) {
  for (int i = 0; i < N; i++) {
    G[i] = X[i] * 2.0f;
  }

  for (int i = 0; i < N; i++) {
    H[i] = Y[i] + 3.0f;
  }
}

// Chain Fusion: kernel0 -> kernel1 -> kernel2
// kernel0: C[i] = A[i] + B[i]
// kernel1: D[i] = C[i] * 2.0
// kernel2: E[i] = D[i] + 1.0
void test_chain_fusion(float A[], float B[], float C[], float D[], float E[]) {
  for (int i = 0; i < N; i++) {
    C[i] = A[i] + B[i];
  }

  for (int i = 0; i < N; i++) {
    D[i] = C[i] * 2.0f;
  }

  for (int i = 0; i < N; i++) {
    E[i] = D[i] + 1.0f;
  }
}

// Complex Sibling: (kernel0 || kernel1 || kernel2), kernel3
// kernel0: C[i] = A[i] * 2.0
// kernel1: D[i] = A[i] + 1.0  -- siblings (share A)
// kernel2: E[i] = A[i] - 1.0
// kernel3: F[i] = B[i] * 3.0 (independent)
void test_complex_sibling(float A[], float B[], float C[], float D[], float E[], float F[]) {
  for (int i = 0; i < N; i++) {
    C[i] = A[i] * 2.0f;
  }

  for (int i = 0; i < N; i++) {
    D[i] = A[i] + 1.0f;
  }

  for (int i = 0; i < N; i++) {
    E[i] = A[i] - 1.0f;
  }

  for (int i = 0; i < N; i++) {
    F[i] = B[i] * 3.0f;
  }
}

// Mixed Patterns: (kernel0 -> kernel3) || (kernel1 || kernel2)
// kernel0: C[i] = A[i] + B[i] ─┐
// kernel1: D[i] = A[i] * 2.0  ├─ siblings (share A)
// kernel2: E[i] = A[i] + 3.0 ─┘
// kernel3: F[i] = C[i] * 2.0 (consumer of kernel0)
void test_mixed_patterns(float A[], float B[], float C[], float D[], float E[], float F[]) {
  for (int i = 0; i < N; i++) {
    C[i] = A[i] + B[i];
  }

  for (int i = 0; i < N; i++) {
    D[i] = A[i] * 2.0f;
  }

  for (int i = 0; i < N; i++) {
    E[i] = A[i] + 3.0f;
  }

  for (int i = 0; i < N; i++) {
    F[i] = C[i] * 2.0f;
  }
}

// Driver: initializes the global arrays and runs each fusion scenario.
// NOTE(review): test_multiple_consumers is defined above but never called
// here -- confirm whether that is intentional.
int main() {
  for (int i = 0; i < N; i++) {
    A[i] = (float)i;
    B[i] = (float)(i * 2);
    X[i] = (float)(i + 1);
    Y[i] = (float)(i - 1);
  }

  test_producer_consumer_fusion(A, B, C, D);
  test_sibling_fusion(A, E, F);
  test_no_shared_input(X, Y, G, H);
  test_chain_fusion(A, B, C, D, E);
  test_complex_sibling(A, B, C, D, E, F);
  test_mixed_patterns(A, B, C, D, E, F);

  return 0;
}
diff --git a/test/neura/kernel_fusion/test.mlir b/test/neura/kernel_fusion/test.mlir
new file mode 100644
index 00000000..ddb76a01
--- /dev/null
+++ b/test/neura/kernel_fusion/test.mlir
@@ -0,0 +1,233 @@
+// RUN: mlir-neura-opt %s \
+// RUN:   --affine-loop-tree-serialization \
+// RUN:   --convert-affine-to-taskflow \
+// RUN:   --construct-hyperblock-from-task \
+// RUN:   --fuse-task \
+// RUN:   2>&1 | FileCheck %s

module {

// =============================================================================
// TEST 1: Producer-Consumer Fusion
// Loops share
intermediate memref C: loop1 writes C, loop2 reads C.
// Expected: Fused into one task; intermediate store/load eliminated.
// =============================================================================

// CHECK-LABEL: func.func @test_producer_consumer
// CHECK: taskflow.task @fused_pc
// CHECK: arith.addf
// CHECK-NEXT: arith.mulf
// CHECK-NEXT: memref.store
// CHECK-NOT: taskflow.task
// CHECK: return

func.func @test_producer_consumer(%A: memref<64xf32>, %B: memref<64xf32>,
                                  %C: memref<64xf32>, %D: memref<64xf32>) {
  %cst = arith.constant 2.000000e+00 : f32
  affine.for %i = 0 to 64 {
    %v = affine.load %A[%i] : memref<64xf32>
    %w = affine.load %B[%i] : memref<64xf32>
    %s = arith.addf %v, %w : f32
    affine.store %s, %C[%i] : memref<64xf32>
  }
  affine.for %i = 0 to 64 {
    %v = affine.load %C[%i] : memref<64xf32>
    %r = arith.mulf %v, %cst : f32
    affine.store %r, %D[%i] : memref<64xf32>
  }
  return
}

// =============================================================================
// TEST 2: Sibling Fusion
// Both loops read from A without data dependency.
// Expected: Fused into one task with deduplicated read of A.
// =============================================================================

// CHECK-LABEL: func.func @test_sibling
// CHECK: taskflow.task @fused_sibling
// CHECK: arith.mulf
// CHECK: memref.store
// CHECK: arith.addf
// CHECK: memref.store
// CHECK-NOT: taskflow.task
// CHECK: return

func.func @test_sibling(%A: memref<64xf32>, %B: memref<64xf32>,
                        %C: memref<64xf32>) {
  %cst3 = arith.constant 3.000000e+00 : f32
  %cst1 = arith.constant 1.000000e+00 : f32
  affine.for %i = 0 to 64 {
    %v = affine.load %A[%i] : memref<64xf32>
    %r = arith.mulf %v, %cst3 : f32
    affine.store %r, %B[%i] : memref<64xf32>
  }
  affine.for %i = 0 to 64 {
    %v = affine.load %A[%i] : memref<64xf32>
    %r = arith.addf %v, %cst1 : f32
    affine.store %r, %C[%i] : memref<64xf32>
  }
  return
}

// =============================================================================
// TEST 3: No Shared Input (No Fusion)
// Loops read from different arrays (A and B) with no dependency.
// Expected: Two separate tasks remain.
// =============================================================================

// CHECK-LABEL: func.func @test_no_shared_input
// CHECK: taskflow.task @Task_0
// CHECK: taskflow.task @Task_1
// CHECK: return

func.func @test_no_shared_input(%A: memref<64xf32>, %B: memref<64xf32>,
                                %C: memref<64xf32>, %D: memref<64xf32>) {
  %cst2 = arith.constant 2.000000e+00 : f32
  %cst3 = arith.constant 3.000000e+00 : f32
  affine.for %i = 0 to 64 {
    %v = affine.load %A[%i] : memref<64xf32>
    %r = arith.mulf %v, %cst2 : f32
    affine.store %r, %C[%i] : memref<64xf32>
  }
  affine.for %i = 0 to 64 {
    %v = affine.load %B[%i] : memref<64xf32>
    %r = arith.addf %v, %cst3 : f32
    affine.store %r, %D[%i] : memref<64xf32>
  }
  return
}

// =============================================================================
// TEST 4: Chain Fusion (A -> B -> C)
// Three loops chained via intermediates C and D.
// Expected: All fused into one task; all intermediate store/load eliminated.
// (Relies on the greedy driver applying producer-consumer fusion repeatedly.)
// =============================================================================

// CHECK-LABEL: func.func @test_chain
// CHECK: taskflow.task @fused_pc
// CHECK: arith.addf
// CHECK-NEXT: arith.mulf
// CHECK-NEXT: arith.addf
// CHECK-NEXT: memref.store
// CHECK-NOT: taskflow.task
// CHECK: return

func.func @test_chain(%A: memref<64xf32>, %B: memref<64xf32>,
                      %C: memref<64xf32>, %D: memref<64xf32>,
                      %E: memref<64xf32>) {
  %cst2 = arith.constant 2.000000e+00 : f32
  %cst1 = arith.constant 1.000000e+00 : f32
  affine.for %i = 0 to 64 {
    %v = affine.load %A[%i] : memref<64xf32>
    %w = affine.load %B[%i] : memref<64xf32>
    %s = arith.addf %v, %w : f32
    affine.store %s, %C[%i] : memref<64xf32>
  }
  affine.for %i = 0 to 64 {
    %v = affine.load %C[%i] : memref<64xf32>
    %r = arith.mulf %v, %cst2 : f32
    affine.store %r, %D[%i] : memref<64xf32>
  }
  affine.for %i = 0 to 64 {
    %v = affine.load %D[%i] : memref<64xf32>
    %r = arith.addf %v, %cst1 : f32
    affine.store %r, %E[%i] : memref<64xf32>
  }
  return
}

// =============================================================================
// TEST 5: Sibling Fusion Rejected - Compute Pressure
// Both loops read from shared array A and each performs a long multiply-add
// chain. The fused kernel's operation count is large enough to push res_mii
// above the individual res_mii values, making fusion unprofitable.
// Expected: Two separate tasks remain.
// =============================================================================

// CHECK-LABEL: func.func @test_sibling_rejected_compute
// CHECK: taskflow.task @Task_0
// CHECK: taskflow.task @Task_1
// CHECK-NOT: fused_sibling
// CHECK: return

func.func @test_sibling_rejected_compute(%A: memref<64xf32>,
                                         %B: memref<64xf32>,
                                         %C: memref<64xf32>) {
  %c1 = arith.constant 1.5e+00 : f32
  %c2 = arith.constant 2.5e+00 : f32
  affine.for %i = 0 to 64 {
    %v = affine.load %A[%i] : memref<64xf32>
    %r1 = arith.mulf %v, %c1 : f32
    %r2 = arith.mulf %r1, %c2 : f32
    %r3 = arith.mulf %r2, %c1 : f32
    %r4 = arith.mulf %r3, %c2 : f32
    %r5 = arith.mulf %r4, %c1 : f32
    %r6 = arith.mulf %r5, %c2 : f32
    %r7 = arith.addf %r6, %v : f32
    affine.store %r7, %B[%i] : memref<64xf32>
  }
  affine.for %i = 0 to 64 {
    %v = affine.load %A[%i] : memref<64xf32>
    %r1 = arith.mulf %v, %c2 : f32
    %r2 = arith.mulf %r1, %c1 : f32
    %r3 = arith.mulf %r2, %c2 : f32
    %r4 = arith.mulf %r3, %c1 : f32
    %r5 = arith.mulf %r4, %c2 : f32
    %r6 = arith.mulf %r5, %c1 : f32
    %r7 = arith.addf %r6, %v : f32
    affine.store %r7, %C[%i] : memref<64xf32>
  }
  return
}

// =============================================================================
// TEST 6: Sibling Fusion Rejected - Memory Pressure
// Both loops share array A but each also loads from two additional independent
// arrays, so no deduplication occurs for those loads. The accumulated load and
// compute operations in the fused kernel raise res_mii beyond either individual,
// making fusion unprofitable.
// Expected: Two separate tasks remain.
// =============================================================================

// CHECK-LABEL: func.func @test_sibling_rejected_memory
// CHECK: taskflow.task @Task_0
// CHECK: taskflow.task @Task_1
// CHECK-NOT: fused_sibling
// CHECK: return

func.func @test_sibling_rejected_memory(%A: memref<64xf32>,
                                        %B: memref<64xf32>,
                                        %C: memref<64xf32>,
                                        %D: memref<64xf32>,
                                        %E: memref<64xf32>,
                                        %F: memref<64xf32>,
                                        %G: memref<64xf32>) {
  affine.for %i = 0 to 64 {
    %va = affine.load %A[%i] : memref<64xf32>
    %vb = affine.load %B[%i] : memref<64xf32>
    %vc = affine.load %C[%i] : memref<64xf32>
    %s1 = arith.addf %va, %vb : f32
    %s2 = arith.mulf %s1, %vc : f32
    %s3 = arith.mulf %s2, %va : f32
    %s4 = arith.addf %s3, %vb : f32
    %s5 = arith.mulf %s4, %vc : f32
    %s6 = arith.addf %s5, %va : f32
    affine.store %s6, %D[%i] : memref<64xf32>
  }
  affine.for %i = 0 to 64 {
    %va = affine.load %A[%i] : memref<64xf32>
    %ve = affine.load %E[%i] : memref<64xf32>
    %vf = affine.load %F[%i] : memref<64xf32>
    %s1 = arith.addf %va, %ve : f32
    %s2 = arith.mulf %s1, %vf : f32
    %s3 = arith.mulf %s2, %va : f32
    %s4 = arith.addf %s3, %ve : f32
    %s5 = arith.mulf %s4, %vf : f32
    %s6 = arith.addf %s5, %va : f32
    affine.store %s6, %G[%i] : memref<64xf32>
  }
  return
}

}
diff --git a/tools/mlir-neura-opt/mlir-neura-opt.cpp b/tools/mlir-neura-opt/mlir-neura-opt.cpp
index a809f6f3..587ab050 100644
--- a/tools/mlir-neura-opt/mlir-neura-opt.cpp
+++ b/tools/mlir-neura-opt/mlir-neura-opt.cpp
@@ -34,6 +34,9 @@ using mlir::neura::util::ArchParser;
 // Global variable to store architecture spec file path
 static std::string architecture_spec_file;
 
+// Global variable to store latency spec file path
+static std::string latency_spec_file;
+
 const Architecture &mlir::neura::getArchitecture() {
   static Architecture instance = []() {
     auto arch_parser = ArchParser(architecture_spec_file);
@@ -46,6 +49,10 @@ const Architecture &mlir::neura::getArchitecture() {
   return instance;
 }
+const std::string &mlir::neura::getLatencySpecFile() { + return latency_spec_file; +} + int main(int argc, char **argv) { // Manually scan and strip --architecture-spec from argv, keep others for // MlirOptMain. @@ -68,6 +75,15 @@ int main(int argc, char **argv) { architecture_spec_file = arg_ref.substr(strlen("--architecture-spec=")).str(); continue; + } else if (arg_ref == "--latency-spec") { + if (i + 1 < argc) { + latency_spec_file = argv[i + 1]; + ++i; // skip value + continue; + } + } else if (arg_ref.starts_with("--latency-spec=")) { + latency_spec_file = arg_ref.substr(strlen("--latency-spec=")).str(); + continue; } forwarded_args.push_back(argv[i]); }