diff --git a/include/NeuraDialect/Architecture/ArchitectureSpec.h b/include/NeuraDialect/Architecture/ArchitectureSpec.h index 2df350f7..1a63244c 100644 --- a/include/NeuraDialect/Architecture/ArchitectureSpec.h +++ b/include/NeuraDialect/Architecture/ArchitectureSpec.h @@ -72,6 +72,10 @@ struct LinkOverride { // This is set by the command line tool when a YAML file is provided. std::string getArchitectureSpecFile(); +// Function for getting the latency specification file path. +// This is set by the command line tool when a YAML file is provided. +std::string getLatencySpecFile(); + // Function for getting tile defaults configuration. TileDefaults getTileDefaults(); diff --git a/include/NeuraDialect/Mapping/MappingState.h b/include/NeuraDialect/Mapping/MappingState.h index a43cea04..99dc737f 100644 --- a/include/NeuraDialect/Mapping/MappingState.h +++ b/include/NeuraDialect/Mapping/MappingState.h @@ -10,6 +10,13 @@ namespace mlir { namespace neura { +// Occupy status for multi-cycle pipeline support. +// These states define how a tile/FU is occupied at a given time step. +#define SINGLE_OCCUPY 0 // A single-cycle op is in the FU (exclusive) +#define START_PIPE_OCCUPY 1 // A multi-cycle op starts in the FU +#define END_PIPE_OCCUPY 2 // A multi-cycle op ends in the FU +#define IN_PIPE_OCCUPY 3 // A multi-cycle op is occupying the FU (pipelined) + // Represents a spatial-temporal location: (resource, time_step) struct MappingLoc { BasicResource *resource; @@ -54,9 +61,20 @@ namespace neura { class MappingState { public: MappingState(const Architecture &arch, int II, bool is_spatial_only); - // Binds a (tile/link, time_step) location to an operation. + // Binds a (tile/link, time_step) location to an operation with default + // SINGLE_OCCUPY status. bool bindOp(const MappingLoc &loc, Operation *op); + // Binds a (tile/link, time_step) location to an operation with specified + // occupy status for multi-cycle pipeline support. 
+ bool bindOp(const MappingLoc &loc, Operation *op, int occupy_status); + + // Binds multiple locations for a multi-cycle operation. + // This sets START_PIPE_OCCUPY at start_time, IN_PIPE_OCCUPY for intermediate + // times, and END_PIPE_OCCUPY at end_time-1. + bool bindMultiCycleOp(BasicResource *resource, int start_time, int latency, + Operation *op); + // Unbinds an operation from its (tile/link, time_step) location, // which is useful for backtracking. void unbindOp(Operation *op); @@ -67,6 +85,19 @@ class MappingState { // it will check (tile 2, step 1), (tile 2, step 5), (tile 2, step 9), etc. bool isAvailableAcrossTime(const MappingLoc &loc) const; + // Checks if a location is available for a specific occupy status. + // This implements the pipeline-aware availability checking: + // - SINGLE_OCCUPY: only available if location is completely free + // - START_PIPE_OCCUPY: available if free or IN_PIPE_OCCUPY or END_PIPE_OCCUPY + // - END_PIPE_OCCUPY: available if free or IN_PIPE_OCCUPY or START_PIPE_OCCUPY + // - IN_PIPE_OCCUPY: always available (can pipeline with any status) + bool isAvailableForOccupyStatus(const MappingLoc &loc, + int new_occupy_status) const; + + // Gets the occupy status at a specific location across time domain. + // Returns -1 if the location is not occupied. + int getOccupyStatusAcrossTime(const MappingLoc &loc) const; + // Checks if a hardware resource is available across a time range. // This function leverages the isAvailableAcrossTime function in each // time step. @@ -111,7 +142,8 @@ class MappingState { void dumpOpToLocs(llvm::raw_ostream &os = llvm::errs()) const; // Getters for state information. - const std::set &getOccupiedLocs() const { + const std::map>> & + getOccupiedLocs() const { return this->occupied_locs; } const std::map &getLocToOp() const { @@ -122,7 +154,9 @@ class MappingState { } // Setters for state information. 
- void setOccupiedLocs(const std::set &locs) { + void setOccupiedLocs( + const std::map>> + &locs) { this->occupied_locs = locs; } void setLocToOp(const std::map &loc_to_op) { @@ -139,7 +173,9 @@ class MappingState { bool is_spatial_only; static constexpr int kMaxSteps = 10; - std::set occupied_locs; + // Maps location to a list of (occupy_status, operation) pairs. + // Multiple ops can occupy the same location with compatible pipeline states. + std::map>> occupied_locs; std::map loc_to_op; std::map> op_to_locs; }; @@ -160,7 +196,7 @@ class MappingStateSnapshot { } private: - std::set occupied_locs; + std::map>> occupied_locs; std::map loc_to_op; std::map> op_to_locs; }; diff --git a/include/NeuraDialect/Mapping/mapping_util.h b/include/NeuraDialect/Mapping/mapping_util.h index 0a36d476..dfe7ca4d 100644 --- a/include/NeuraDialect/Mapping/mapping_util.h +++ b/include/NeuraDialect/Mapping/mapping_util.h @@ -116,5 +116,12 @@ bool canReachLocInTime(const std::vector &producers, Register *getAvailableRegister(const MappingState &mapping_state, Tile *tile, int start_time, int exclusive_end_time); +// Gets the execution latency of an operation from its "latency" attribute. +// Returns 1 (single-cycle) if the attribute is not present. +int getOpLatency(Operation *op); + +// Checks if an operation is a multi-cycle operation (latency > 1). 
+bool isMultiCycleOp(Operation *op); + } // namespace neura } // namespace mlir diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index 95aa70c8..340886ed 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -38,6 +38,7 @@ std::unique_ptr createWrapLoopInKernelPass(); // Hardware specific optimization passes std::unique_ptr createFuseLoopControlPass(); std::unique_ptr createFusePatternPass(); +std::unique_ptr createFuseKernelPass(); // Hardware agnostic optimization passes std::unique_ptr createFoldConstantPass(); @@ -49,6 +50,7 @@ std::unique_ptr createInitPatternPass(); // Hardware optimization passes std::unique_ptr createHardwareMergePass(); +std::unique_ptr createInitExecLatencyPass(); #define GEN_PASS_REGISTRATION #include "NeuraDialect/NeuraPasses.h.inc" diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td index ec0df60b..90a5d5e3 100644 --- a/include/NeuraDialect/NeuraPasses.td +++ b/include/NeuraDialect/NeuraPasses.td @@ -20,6 +20,21 @@ def FusePattern : Pass<"fuse-pattern", "ModuleOp"> { let constructor = "neura::createFusePatternPass()"; } +def FuseKernel : Pass<"fuse-kernel", "ModuleOp"> { + let summary = "Fuses kernel operations in the Neura dialect"; + let description = [{ + This pass fuses neura.kernel operations using producer-consumer and sibling + fusion strategies, inspired by MLIR's linalg and affine loop fusion. + + Producer-Consumer Fusion: Fuses a producer kernel into its consumer when + the producer's output is only used by the consumer. + + Sibling Fusion: Fuses kernels that share the same input operands and have + no data dependencies between them. 
+ }]; + let constructor = "neura::createFuseKernelPass()"; +} + def InsertDataMov : Pass<"insert-data-mov", "ModuleOp"> { let summary = "Inserts data move operations in the Neura dialect"; let description = @@ -194,4 +209,12 @@ def HardwareMerge : Pass<"hardware-merge", "ModuleOp"> { }]; let constructor = "neura::createHardwareMergePass()"; } + +def InitExecLatency : Pass<"init-exec-latency", "ModuleOp"> { + let summary = "Initialize execution latency information"; + let description = [{ + This pass initializes execution latency information. + }]; + let constructor = "neura::createInitExecLatencyPass()"; +} #endif // NEURA_PASSES_TD \ No newline at end of file diff --git a/lib/NeuraDialect/Mapping/MappingState.cpp b/lib/NeuraDialect/Mapping/MappingState.cpp index 110d1976..d537eeea 100644 --- a/lib/NeuraDialect/Mapping/MappingState.cpp +++ b/lib/NeuraDialect/Mapping/MappingState.cpp @@ -3,6 +3,7 @@ #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "llvm/Support/raw_ostream.h" +#include using namespace mlir; using namespace mlir::neura; @@ -30,14 +31,62 @@ MappingState::MappingState(const Architecture &arch, int II, : II(II), is_spatial_only(is_spatial_only) {} bool MappingState::bindOp(const MappingLoc &loc, Operation *op) { + // Default to SINGLE_OCCUPY for backward compatibility + return bindOp(loc, op, SINGLE_OCCUPY); +} + +bool MappingState::bindOp(const MappingLoc &loc, Operation *op, + int occupy_status) { + // Check if the location is available for the specified occupy status + if (!isAvailableForOccupyStatus(loc, occupy_status)) { + return false; + } + loc_to_op[loc] = op; - occupied_locs.insert(loc); + occupied_locs[loc].push_back({occupy_status, op}); auto it = op_to_locs.find(op); assert(it == op_to_locs.end() && "Operation already has reserved locations"); op_to_locs[op].push_back(loc); return true; } +bool MappingState::bindMultiCycleOp(BasicResource *resource, int start_time, + int latency, Operation *op) { + // First 
check if all locations are available + for (int t = start_time; t < start_time + latency; ++t) { + MappingLoc check_loc = {resource, t}; + int status; + if (t == start_time) { + status = START_PIPE_OCCUPY; + } else if (t == start_time + latency - 1) { + status = END_PIPE_OCCUPY; + } else { + status = IN_PIPE_OCCUPY; + } + if (!isAvailableForOccupyStatus(check_loc, status)) { + return false; + } + } + + // Now bind all locations + for (int t = start_time; t < start_time + latency; ++t) { + MappingLoc loc = {resource, t}; + int status; + if (t == start_time) { + status = START_PIPE_OCCUPY; + } else if (t == start_time + latency - 1) { + status = END_PIPE_OCCUPY; + } else { + status = IN_PIPE_OCCUPY; + } + + loc_to_op[loc] = op; + occupied_locs[loc].push_back({status, op}); + op_to_locs[op].push_back(loc); + } + return true; +} + void MappingState::unbindOp(Operation *op) { auto it = op_to_locs.find(op); if (it == op_to_locs.end()) { @@ -46,7 +95,21 @@ void MappingState::unbindOp(Operation *op) { for (const MappingLoc &loc : it->second) { loc_to_op.erase(loc); - occupied_locs.erase(loc); + // Remove entries for this op from occupied_locs + auto occ_it = occupied_locs.find(loc); + if (occ_it != occupied_locs.end()) { + auto &entries = occ_it->second; + entries.erase( + std::remove_if(entries.begin(), entries.end(), + [op](const std::pair &entry) { + return entry.second == op; + }), + entries.end()); + // Remove the location entirely if no more entries + if (entries.empty()) { + occupied_locs.erase(occ_it); + } + } } op_to_locs.erase(it); @@ -57,21 +120,128 @@ bool MappingState::isAvailableAcrossTime(const MappingLoc &loc) const { if (this->is_spatial_only) { for (int t = 0; t < II * kMaxSteps; ++t) { MappingLoc check_loc = {loc.resource, t}; - if (occupied_locs.find(check_loc) != occupied_locs.end()) { - return false; + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end()) { + // Check if all existing occupy statuses allow new single-cycle op + for 
(const auto &entry : it->second) { + if (entry.first != IN_PIPE_OCCUPY) { + return false; + } + } } } return true; } else { - // Checks the availability across time domain. for (int t = loc.time_step % II; t < II * kMaxSteps; t += II) { MappingLoc check_loc = {loc.resource, t}; - if (occupied_locs.find(check_loc) != occupied_locs.end()) { + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end()) { + // Check if all existing occupy statuses allow new single-cycle op + for (const auto &entry : it->second) { + if (entry.first != IN_PIPE_OCCUPY) { + return false; + } + } + } + } + return true; + } +} + +bool MappingState::isAvailableForOccupyStatus(const MappingLoc &loc, + int new_occupy_status) const { + // Helper lambda to check a single location against all existing entries + auto checkSingleLoc = [this, new_occupy_status](const MappingLoc &check_loc) -> bool { + auto it = occupied_locs.find(check_loc); + if (it == occupied_locs.end() || it->second.empty()) { + // Location is free, always available + return true; + } + + // Check against all existing entries at this location + for (const auto &entry : it->second) { + int existing_status = entry.first; + + // Implement the pipeline-aware availability rules: + // - SINGLE_OCCUPY (0): exclusive, no other op can share + // - START_PIPE_OCCUPY (1): cannot coexist with SINGLE or another START + // - END_PIPE_OCCUPY (2): cannot coexist with SINGLE or another END + // - IN_PIPE_OCCUPY (3): can coexist with any status except SINGLE + + if (existing_status == SINGLE_OCCUPY) { + // SINGLE_OCCUPY blocks everything + return false; + } + + if (new_occupy_status == SINGLE_OCCUPY) { + // SINGLE_OCCUPY cannot be placed if anything is there return false; } + + if (new_occupy_status == START_PIPE_OCCUPY) { + // START cannot coexist with another START + if (existing_status == START_PIPE_OCCUPY) { + return false; + } + } + + if (new_occupy_status == END_PIPE_OCCUPY) { + // END cannot coexist with another END + if 
(existing_status == END_PIPE_OCCUPY) { + return false; + } + } + + // IN_PIPE_OCCUPY can coexist with START, END, or other IN_PIPE } return true; + }; + + // For spatial mapping, check all time steps + if (this->is_spatial_only) { + for (int t = 0; t < II * kMaxSteps; ++t) { + MappingLoc check_loc = {loc.resource, t}; + if (!checkSingleLoc(check_loc)) { + return false; + } + } + return true; + } else { + // Check across time domain (modulo II) + for (int t = loc.time_step % II; t < II * kMaxSteps; t += II) { + MappingLoc check_loc = {loc.resource, t}; + if (!checkSingleLoc(check_loc)) { + return false; + } + } + return true; + } +} + +int MappingState::getOccupyStatusAcrossTime(const MappingLoc &loc) const { + // For spatial mapping, check all time steps + if (this->is_spatial_only) { + for (int t = 0; t < II * kMaxSteps; ++t) { + MappingLoc check_loc = {loc.resource, t}; + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end() && !it->second.empty()) { + // Return the first status found (most restrictive) + return it->second[0].first; + } + } + return -1; + } else { + // Check across time domain (modulo II) + for (int t = loc.time_step % II; t < II * kMaxSteps; t += II) { + MappingLoc check_loc = {loc.resource, t}; + auto it = occupied_locs.find(check_loc); + if (it != occupied_locs.end() && !it->second.empty()) { + // Return the first status found (most restrictive) + return it->second[0].first; + } + } + return -1; } } @@ -202,12 +372,9 @@ void MappingState::reserveRoute(Operation *op, ArrayRef path) { op_to_locs[op] = std::vector(path.begin(), path.end()); for (const MappingLoc &loc : path) { - assert(occupied_locs.find(loc) == occupied_locs.end() && - "Mapping location already occupied"); loc_to_op[loc] = op; - assert(occupied_locs.find(loc) == occupied_locs.end() && - "Mapping location already occupied in occupied_locs"); - occupied_locs.insert(loc); + // Use SINGLE_OCCUPY for route reservations (links/registers) + 
occupied_locs[loc].push_back({SINGLE_OCCUPY, op}); } } @@ -221,7 +388,21 @@ void MappingState::releaseRoute(Operation *op) { for (const MappingLoc &loc : route) { loc_to_op.erase(loc); - occupied_locs.erase(loc); + // Remove entries for this op from occupied_locs + auto occ_it = occupied_locs.find(loc); + if (occ_it != occupied_locs.end()) { + auto &entries = occ_it->second; + entries.erase( + std::remove_if(entries.begin(), entries.end(), + [op](const std::pair &entry) { + return entry.second == op; + }), + entries.end()); + // Remove the location entirely if no more entries + if (entries.empty()) { + occupied_locs.erase(occ_it); + } + } } op_to_locs.erase(it); diff --git a/lib/NeuraDialect/Mapping/mapping_util.cpp b/lib/NeuraDialect/Mapping/mapping_util.cpp index f5b7a86d..65abaab6 100644 --- a/lib/NeuraDialect/Mapping/mapping_util.cpp +++ b/lib/NeuraDialect/Mapping/mapping_util.cpp @@ -4,6 +4,7 @@ #include "NeuraDialect/Mapping/mapping_util.h" #include "NeuraDialect/NeuraOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Operation.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -1112,13 +1113,37 @@ llvm::SmallVector mlir::neura::getCtrlMovUsers(Operation *op) { bool mlir::neura::placeAndRoute(Operation *op, const MappingLoc &target_loc, MappingState &mapping_state) { - if (mapping_state.bindOp(target_loc, op)) { + // Get the latency of the operation to determine if it's multi-cycle + int latency = getOpLatency(op); + bool is_multi_cycle = latency > 1; + + bool bind_success = false; + if (is_multi_cycle) { + // For multi-cycle ops, bind across multiple time steps with pipeline status + bind_success = mapping_state.bindMultiCycleOp( + target_loc.resource, target_loc.time_step, latency, op); + if (bind_success) { + llvm::errs() << "[DEBUG] Bound multi-cycle op (latency=" << latency + << ") " << *op << " onto loc: " + << target_loc.resource->getType() << "#" + << target_loc.resource->getId() 
+ << " @t=" << target_loc.time_step << " to t=" + << (target_loc.time_step + latency - 1) << "\n"; + } + } else { + // For single-cycle ops, use default SINGLE_OCCUPY binding + bind_success = mapping_state.bindOp(target_loc, op); + if (bind_success) { + llvm::errs() << "[DEBUG] Schedule op " << *op + << " onto loc: " << target_loc.resource->getType() << "#" + << target_loc.resource->getId() + << " @t=" << target_loc.time_step << "\n"; + } + } + + if (bind_success) { std::vector routed_operands; std::vector routed_ctrl_movs; - llvm::errs() << "[DEBUG] Schedule op " << *op - << " onto loc: " << target_loc.resource->getType() << "#" - << target_loc.resource->getId() - << " @t=" << target_loc.time_step << "\n"; // Tries to route the data move operations. for (Value operand : op->getOperands()) { llvm::errs() << "Processing operand: " << operand << "\n"; @@ -1219,4 +1244,17 @@ bool mlir::neura::placeAndRoute(Operation *op, const MappingLoc &target_loc, return true; } return false; +} + +int mlir::neura::getOpLatency(Operation *op) { + // Try to get the latency attribute from the operation + if (auto latency_attr = op->getAttrOfType("latency")) { + return latency_attr.getInt(); + } + // Default to single-cycle if no latency attribute is present + return 1; +} + +bool mlir::neura::isMultiCycleOp(Operation *op) { + return getOpLatency(op) > 1; } \ No newline at end of file diff --git a/lib/NeuraDialect/Transforms/CMakeLists.txt b/lib/NeuraDialect/Transforms/CMakeLists.txt index 85200b48..da52fc00 100644 --- a/lib/NeuraDialect/Transforms/CMakeLists.txt +++ b/lib/NeuraDialect/Transforms/CMakeLists.txt @@ -5,6 +5,7 @@ add_mlir_library( InsertDataMovPass.cpp InsertCtrlMovPass.cpp FusePatternPass.cpp + FuseKernelPass.cpp AssignAcceleratorPass.cpp TransformCtrlToDataFlowPass.cpp LeveragePredicatedValuePass.cpp @@ -18,6 +19,7 @@ add_mlir_library( TransformToSteerControlPass.cpp RemovePredicatedTypePass.cpp HardwareMergePass.cpp + InitExecLatencyPass.cpp 
GraphMining/HardwareTemplate.cpp WrapLoopInKernelPass.cpp diff --git a/lib/NeuraDialect/Transforms/FuseKernelPass.cpp b/lib/NeuraDialect/Transforms/FuseKernelPass.cpp new file mode 100644 index 00000000..611c5787 --- /dev/null +++ b/lib/NeuraDialect/Transforms/FuseKernelPass.cpp @@ -0,0 +1,592 @@ +//===- FuseKernelPass.cpp - Kernel Fusion Pass for Neura Dialect ----------===// +// +// This pass implements kernel fusion for the Neura dialect: +// 1. Producer-Consumer Fusion: Fuses a producer kernel into its consumer. +// 2. Sibling Fusion: Fuses kernels that share inputs without data dependency. +// +//===----------------------------------------------------------------------===// + +#include "NeuraDialect/NeuraOps.h" +#include "NeuraDialect/NeuraPasses.h" +#include "NeuraDialect/Architecture/Architecture.h" +#include "NeuraDialect/Mapping/mapping_util.h" +#include "Conversion/ConversionPasses.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/Dominance.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" + +using namespace mlir; + +#define GEN_PASS_DEF_FUSEKERNEL +#include "NeuraDialect/NeuraPasses.h.inc" + +namespace { + +// Represents metrics for evaluating fusion profitability. +struct FusionMetrics { + int rec_mii = 1; + int res_mii = 1; + int max_fanout = 0; + int num_ops = 0; +}; + +// Calculates the maximum fanout in a block. 
+int calculateMaxFanoutInBlock(Block &block) { + int max_fanout = 0; + for (Operation &op : block) { + for (Value result : op.getResults()) { + int fanout = std::distance(result.use_begin(), result.use_end()); + max_fanout = std::max(max_fanout, fanout); + } + } + return max_fanout; +} + +// Runs the neura transformation pipeline on a cloned module and computes MII metrics. +FusionMetrics computeRealMetrics(ModuleOp test_module, const neura::Architecture &architecture) { + FusionMetrics metrics; + auto cloned_module = test_module.clone(); + + PassManager pm(cloned_module.getContext()); + pm.addPass(mlir::neura::createAssignAcceleratorPass()); + pm.addPass(mlir::createLowerArithToNeuraPass()); + pm.addPass(neura::createCanonicalizeReturnPass()); + pm.addPass(neura::createCanonicalizeCastPass()); + pm.addPass(neura::createPromoteFuncArgToConstPass()); + pm.addPass(neura::createCanonicalizeLiveInPass()); + pm.addPass(neura::createLeveragePredicatedValuePass()); + pm.addPass(neura::createTransformCtrlToDataFlowPass()); + pm.enableVerifier(true); + + if (failed(pm.run(cloned_module))) { + metrics.rec_mii = 100; + metrics.res_mii = 100; + cloned_module.erase(); + return metrics; + } + + cloned_module.walk([&](func::FuncOp func_op) { + if (func_op.getName() != "test_fused_kernel") { + return; + } + metrics.res_mii = neura::calculateResMii(func_op, architecture); + auto cycles = neura::collectRecurrenceCycles(func_op); + metrics.rec_mii = 1; + for (const auto &cycle : cycles) { + metrics.rec_mii = std::max(metrics.rec_mii, cycle.length); + } + int num_ops = 0; + func_op.walk([&](Operation *op) { + if (!isa(op) && !op->hasTrait()) { + ++num_ops; + } + }); + metrics.num_ops = num_ops; + if (!func_op.getBody().empty()) { + metrics.max_fanout = calculateMaxFanoutInBlock(func_op.getBody().front()); + } + }); + + cloned_module.erase(); + return metrics; +} + +// Clones operations from a kernel block, collecting yield values. 
+void cloneKernelBlockOps(Block &source_block, OpBuilder &builder, IRMapping &mapping, SmallVectorImpl &yield_values) { + for (Operation &op : source_block) { + if (auto yield_op = dyn_cast(&op)) { + for (Value v : yield_op.getOperands()) { + yield_values.push_back(mapping.lookup(v)); + } + continue; + } + builder.clone(op, mapping); + } +} + +// Creates a test function from a kernel's body and returns the function. +func::FuncOp cloneKernelToTestFunction(neura::KernelOp kernel, OpBuilder &builder, Location loc) { + Block &kernel_block = kernel.getBody().front(); + + SmallVector input_types; + for (auto arg : kernel_block.getArguments()) { + input_types.push_back(arg.getType()); + } + SmallVector output_types(kernel.getResultTypes()); + + if (input_types.empty()) { + input_types.push_back(builder.getI64Type()); + } + if (output_types.empty()) { + output_types.push_back(builder.getI64Type()); + } + + auto func_type = builder.getFunctionType(input_types, output_types); + auto func_op = builder.create(loc, "test_fused_kernel", func_type); + func_op->setAttr("accelerator", builder.getStringAttr("neura")); + + Block *entry_block = func_op.addEntryBlock(); + builder.setInsertionPointToStart(entry_block); + + IRMapping mapping; + for (auto [kernel_arg, func_arg] : llvm::zip(kernel_block.getArguments(), entry_block->getArguments())) { + mapping.map(kernel_arg, func_arg); + } + + SmallVector yield_values; + cloneKernelBlockOps(kernel_block, builder, mapping, yield_values); + + if (yield_values.empty()) { + yield_values.push_back(entry_block->getArgument(0)); + } + auto return_op = builder.create(loc, yield_values); + return_op->setAttr("return_type", builder.getStringAttr("value")); + + return func_op; +} + +// Computes metrics for a single kernel by creating a test module. 
+FusionMetrics computeSingleKernelMetrics(neura::KernelOp kernel, const neura::Architecture &architecture) { + MLIRContext *ctx = kernel.getContext(); + OpBuilder builder(ctx); + + auto module = ModuleOp::create(builder.getUnknownLoc()); + builder.setInsertionPointToStart(module.getBody()); + cloneKernelToTestFunction(kernel, builder, builder.getUnknownLoc()); + + FusionMetrics metrics = computeRealMetrics(module, architecture); + module.erase(); + return metrics; +} + +// Computes metrics for fused kernels by directly merging kernel bodies into a test function. +FusionMetrics computeFusedKernelMetrics(neura::KernelOp kernel1, neura::KernelOp kernel2, bool is_producer_consumer, Value fused_value, const neura::Architecture &architecture) { + MLIRContext *ctx = kernel1.getContext(); + OpBuilder builder(ctx); + Location loc = builder.getUnknownLoc(); + + auto module = ModuleOp::create(loc); + builder.setInsertionPointToStart(module.getBody()); + + Block &k1_block = kernel1.getBody().front(); + Block &k2_block = kernel2.getBody().front(); + + // Collects input types from both kernels. + SmallVector input_types; + for (auto arg : k1_block.getArguments()) { + input_types.push_back(arg.getType()); + } + + // Finds which kernel2 arg corresponds to fused_value for producer-consumer fusion. + int fused_value_arg_idx = -1; + if (is_producer_consumer && fused_value) { + for (auto [idx, input] : llvm::enumerate(kernel2.getInputs())) { + if (input == fused_value) { + fused_value_arg_idx = idx; + break; + } + } + } + + for (auto [idx, arg] : llvm::enumerate(k2_block.getArguments())) { + if (static_cast(idx) != fused_value_arg_idx) { + input_types.push_back(arg.getType()); + } + } + + // Determines output types based on fusion type. 
+ SmallVector output_types; + if (is_producer_consumer) { + output_types.append(kernel2.getResultTypes().begin(), kernel2.getResultTypes().end()); + } else { + output_types.append(kernel1.getResultTypes().begin(), kernel1.getResultTypes().end()); + output_types.append(kernel2.getResultTypes().begin(), kernel2.getResultTypes().end()); + } + if (input_types.empty()) { + input_types.push_back(builder.getI64Type()); + } + if (output_types.empty()) { + output_types.push_back(builder.getI64Type()); + } + + // Creates test function. + auto func_type = builder.getFunctionType(input_types, output_types); + auto func_op = builder.create(loc, "test_fused_kernel", func_type); + func_op->setAttr("accelerator", builder.getStringAttr("neura")); + Block *entry_block = func_op.addEntryBlock(); + builder.setInsertionPointToStart(entry_block); + + // Maps kernel1's block arguments to function arguments. + IRMapping mapping; + unsigned func_arg_idx = 0; + for (auto k1_arg : k1_block.getArguments()) { + mapping.map(k1_arg, entry_block->getArgument(func_arg_idx++)); + } + + // Clones kernel1's operations. + SmallVector k1_yields; + cloneKernelBlockOps(k1_block, builder, mapping, k1_yields); + + // Maps kernel2's block arguments. + for (auto [idx, k2_arg] : llvm::enumerate(k2_block.getArguments())) { + if (is_producer_consumer && static_cast(idx) == fused_value_arg_idx) { + if (!k1_yields.empty()) { + mapping.map(k2_arg, k1_yields[0]); + } + } else { + mapping.map(k2_arg, entry_block->getArgument(func_arg_idx++)); + } + } + + // Clones kernel2's operations. + SmallVector k2_yields; + cloneKernelBlockOps(k2_block, builder, mapping, k2_yields); + + // Creates return with appropriate yields. 
+ SmallVector return_values; + if (is_producer_consumer) { + return_values = k2_yields; + } else { + return_values.append(k1_yields.begin(), k1_yields.end()); + return_values.append(k2_yields.begin(), k2_yields.end()); + } + if (return_values.empty()) { + return_values.push_back(entry_block->getArgument(0)); + } + auto return_op = builder.create(loc, return_values); + return_op->setAttr("return_type", builder.getStringAttr("value")); + + FusionMetrics metrics = computeRealMetrics(module, architecture); + module.erase(); + return metrics; +} + +int estimateMII(const FusionMetrics &metrics, int total_ops, int total_tiles) { + const float alpha = 0.5; + const float beta = 0.5; + int mii = std::max(metrics.rec_mii, metrics.res_mii); + return std::ceil((1.0 + alpha * (total_ops / float(total_tiles))) * (1 + beta * std::max(metrics.max_fanout - 4, 0)) * mii); +} + +// Checks if fusion is profitable based on MII and fanout metrics. +bool isFusionProfitable(neura::KernelOp kernel1, neura::KernelOp kernel2, bool is_producer_consumer, Value fused_value = nullptr) { + neura::Architecture architecture(1, 1, neura::BaseTopology::MESH, 4, 4, neura::BaseTopology::MESH); + + FusionMetrics m1 = computeSingleKernelMetrics(kernel1, architecture); + FusionMetrics m2 = computeSingleKernelMetrics(kernel2, architecture); + FusionMetrics fused = computeFusedKernelMetrics(kernel1, kernel2, is_producer_consumer, fused_value, architecture); + + return estimateMII(fused, fused.num_ops, architecture.getNumTiles()) <= std::max(estimateMII(m1, m1.num_ops, architecture.getNumTiles()), estimateMII(m2, m2.num_ops, architecture.getNumTiles())); + +} + +// Checks if two kernels can be fused (same block, producer before consumer). 
+bool canFuseKernels(neura::KernelOp producer, neura::KernelOp consumer) { + if (!producer || !consumer || producer == consumer) { + return false; + } + if (producer->getBlock() != consumer->getBlock()) { + return false; + } + return producer->isBeforeInBlock(consumer); +} + +// Returns true if consumer uses any of producer's results. +bool hasProducerConsumerRelation(neura::KernelOp producer, neura::KernelOp consumer) { + for (Value result : producer.getOutputs()) { + for (Value input : consumer.getInputs()) { + if (result == input) { + return true; + } + } + } + return false; +} + +// Checks if two kernels are siblings (share inputs but no data dependency). +bool areSiblingKernels(neura::KernelOp kernel1, neura::KernelOp kernel2) { + llvm::SmallPtrSet kernel1_inputs(kernel1.getInputs().begin(), kernel1.getInputs().end()); + bool share_input = llvm::any_of(kernel2.getInputs(), [&](Value input) { + return kernel1_inputs.contains(input); + }); + return share_input && !hasProducerConsumerRelation(kernel1, kernel2) && !hasProducerConsumerRelation(kernel2, kernel1); +} + +// Checks if any operation between producer and consumer uses producer's results. +bool hasInterveningUses(neura::KernelOp producer, neura::KernelOp consumer) { + llvm::SmallPtrSet producer_results(producer.getOutputs().begin(), producer.getOutputs().end()); + bool in_range = false; + for (Operation &op : *producer->getBlock()) { + if (&op == producer.getOperation()) { + in_range = true; + continue; + } + if (&op == consumer.getOperation()) { + break; + } + if (in_range) { + for (Value operand : op.getOperands()) { + if (producer_results.contains(operand)) { + return true; + } + } + } + } + return false; +} + +// Collects inputs from two kernels, avoiding duplicates. 
+void collectFusedInputs(OperandRange inputs1, OperandRange inputs2, SmallVectorImpl &fused_inputs, SmallVectorImpl &fused_input_types, llvm::SmallDenseMap &input_index_map) { + for (Value input : inputs1) { + input_index_map[input] = fused_inputs.size(); + fused_inputs.push_back(input); + fused_input_types.push_back(input.getType()); + } + for (Value input : inputs2) { + if (!input_index_map.count(input)) { + input_index_map[input] = fused_inputs.size(); + fused_inputs.push_back(input); + fused_input_types.push_back(input.getType()); + } + } +} + +// Clones operations from a kernel block with input index mapping for sibling fusion. +void cloneKernelOpsWithIndexMap(Block &source_block, Block *fused_block, OpBuilder &builder, IRMapping &mapping, OperandRange kernel_inputs, const llvm::SmallDenseMap &input_index_map, SmallVectorImpl *yield_values) { + for (auto [idx, old_arg] : llvm::enumerate(source_block.getArguments())) { + Value original_input = kernel_inputs[idx]; + mapping.map(old_arg, fused_block->getArgument(input_index_map.lookup(original_input))); + } + for (Operation &op : source_block) { + if (auto yield_op = dyn_cast(&op)) { + if (yield_values) { + for (Value v : yield_op.getOperands()) { + yield_values->push_back(mapping.lookup(v)); + } + } + continue; + } + builder.clone(op, mapping); + } +} + +// Fuses a producer kernel into its consumer and returns the fused kernel. 
+neura::KernelOp fuseProducerConsumerKernels(neura::KernelOp producer, neura::KernelOp consumer, Value fused_value, OpBuilder &builder) { + Location loc = consumer.getLoc(); + + SmallVector fused_inputs; + SmallVector fused_input_types; + for (Value input : producer.getInputs()) { + fused_inputs.push_back(input); + fused_input_types.push_back(input.getType()); + } + for (Value input : consumer.getInputs()) { + if (input != fused_value) { + fused_inputs.push_back(input); + fused_input_types.push_back(input.getType()); + } + } + + SmallVector fused_output_types(consumer.getResultTypes()); + auto fused_kernel = builder.create(loc, fused_output_types, fused_inputs, consumer.getCgraIdAttr(), builder.getStringAttr("fused_producer_consumer"), consumer.getAcceleratorAttr()); + + Block *fused_block = builder.createBlock(&fused_kernel.getBody()); + for (Type t : fused_input_types) { + fused_block->addArgument(t, loc); + } + + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(fused_block); + + // Maps and clones producer's operations. + IRMapping producer_mapping; + Block &producer_block = producer.getBody().front(); + for (auto [old_arg, new_arg] : llvm::zip(producer_block.getArguments(), fused_block->getArguments().take_front(producer.getInputs().size()))) { + producer_mapping.map(old_arg, new_arg); + } + SmallVector producer_yields; + cloneKernelBlockOps(producer_block, builder, producer_mapping, producer_yields); + + // Maps and clones consumer's operations with fused value mapped to producer's output. + IRMapping consumer_mapping; + Block &consumer_block = consumer.getBody().front(); + unsigned consumer_input_idx = producer.getInputs().size(); + for (auto [idx, old_arg] : llvm::enumerate(consumer_block.getArguments())) { + Value original_input = consumer.getInputs()[idx]; + if (original_input == fused_value) { + consumer_mapping.map(old_arg, producer_yields.empty() ? 
Value() : producer_yields[0]); + } else { + consumer_mapping.map(old_arg, fused_block->getArgument(consumer_input_idx++)); + } + } + SmallVector consumer_yields; + cloneKernelBlockOps(consumer_block, builder, consumer_mapping, consumer_yields); + + builder.create(loc, consumer_yields); + return fused_kernel; +} + +// Fuses two sibling kernels and returns the fused kernel. +neura::KernelOp fuseSiblingKernels(neura::KernelOp kernel1, neura::KernelOp kernel2, OpBuilder &builder) { + Location loc = kernel1.getLoc(); + + SmallVector fused_inputs; + SmallVector fused_input_types; + llvm::SmallDenseMap input_index_map; + collectFusedInputs(kernel1.getInputs(), kernel2.getInputs(), fused_inputs, fused_input_types, input_index_map); + + SmallVector fused_output_types(kernel1.getResultTypes()); + fused_output_types.append(kernel2.getResultTypes().begin(), kernel2.getResultTypes().end()); + + auto fused_kernel = builder.create(loc, fused_output_types, fused_inputs, kernel1.getCgraIdAttr(), builder.getStringAttr("fused_sibling"), kernel1.getAcceleratorAttr()); + + Block *fused_block = builder.createBlock(&fused_kernel.getBody()); + for (Type t : fused_input_types) { + fused_block->addArgument(t, loc); + } + + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(fused_block); + + IRMapping mapping1; + Block &block1 = kernel1.getBody().front(); + SmallVector kernel1_yields; + cloneKernelOpsWithIndexMap(block1, fused_block, builder, mapping1, kernel1.getInputs(), input_index_map, &kernel1_yields); + + IRMapping mapping2; + Block &block2 = kernel2.getBody().front(); + SmallVector kernel2_yields; + cloneKernelOpsWithIndexMap(block2, fused_block, builder, mapping2, kernel2.getInputs(), input_index_map, &kernel2_yields); + + SmallVector all_yields(kernel1_yields); + all_yields.append(kernel2_yields); + builder.create(loc, all_yields); + + return fused_kernel; +} + +// Pattern that fuses a producer kernel into its consumer. 
+struct ProducerConsumerFusion : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(neura::KernelOp consumer, PatternRewriter &rewriter) const override { + neura::KernelOp producer = nullptr; + Value fused_value; + + for (Value input : consumer.getInputs()) { + auto def_op = input.getDefiningOp(); + if (!canFuseKernels(def_op, consumer)) { + continue; + } + bool has_only_one_use = llvm::all_of(def_op.getOutputs(), [](Value result) { + return result.hasOneUse() || result.use_empty(); + }); + if (!has_only_one_use || hasInterveningUses(def_op, consumer)) { + continue; + } + if (!isFusionProfitable(def_op, consumer, true, input)) { + continue; + } + producer = def_op; + fused_value = input; + break; + } + + if (!producer) { + return failure(); + } + + auto fused_kernel = fuseProducerConsumerKernels(producer, consumer, fused_value, rewriter); + rewriter.replaceOp(consumer, fused_kernel.getOutputs()); + rewriter.eraseOp(producer); + return success(); + } +}; + +// Pattern that fuses kernels sharing the same inputs without data dependencies. 
+struct SiblingFusion : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(neura::KernelOp kernel1, PatternRewriter &rewriter) const override { + neura::KernelOp kernel2 = nullptr; + + for (Operation *op = kernel1->getNextNode(); op; op = op->getNextNode()) { + if (auto next_kernel = dyn_cast(op)) { + if (areSiblingKernels(kernel1, next_kernel) && canFuseKernels(kernel1, next_kernel) && isFusionProfitable(kernel1, next_kernel, false)) { + kernel2 = next_kernel; + break; + } + } + } + + if (!kernel2) { + return failure(); + } + + auto fused_kernel = fuseSiblingKernels(kernel1, kernel2, rewriter); + + SmallVector kernel1_results, kernel2_results; + for (unsigned i = 0; i < kernel1.getNumResults(); ++i) { + kernel1_results.push_back(fused_kernel.getResult(i)); + } + for (unsigned i = 0; i < kernel2.getNumResults(); ++i) { + kernel2_results.push_back(fused_kernel.getResult(kernel1.getNumResults() + i)); + } + + rewriter.replaceOp(kernel1, kernel1_results); + rewriter.replaceOp(kernel2, kernel2_results); + return success(); + } +}; + +// Pass that fuses neura.kernel operations using producer-consumer and sibling fusion. 
+struct FuseKernelPass : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(FuseKernelPass) + + StringRef getArgument() const override { return "fuse-kernel"; } + StringRef getDescription() const override { return "Fuses neura.kernel operations using producer-consumer and sibling fusion."; } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + } + + void runOnOperation() override { + ModuleOp module = getOperation(); + + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext(), 10); + patterns.add(&getContext(), 5); + + FrozenRewritePatternSet frozen(std::move(patterns)); + module.walk([&](func::FuncOp func_op) { + if (failed(applyPatternsGreedily(func_op, frozen))) { + signalPassFailure(); + } + }); + + unsigned num_kernels = 0; + module.walk([&](neura::KernelOp) { ++num_kernels; }); + llvm::outs() << "[FuseKernelPass] Remaining kernels after fusion: " << num_kernels << "\n"; + } +}; + +} // namespace + +namespace mlir::neura { +std::unique_ptr createFuseKernelPass() { + return std::make_unique(); +} +} // namespace mlir::neura diff --git a/lib/NeuraDialect/Transforms/GenerateCodePass.cpp b/lib/NeuraDialect/Transforms/GenerateCodePass.cpp index 18766349..6c383bc6 100644 --- a/lib/NeuraDialect/Transforms/GenerateCodePass.cpp +++ b/lib/NeuraDialect/Transforms/GenerateCodePass.cpp @@ -85,6 +85,7 @@ static bool isCtrlMov(Operation *op) { return dyn_cast(op) != nullptr static bool isPhiStart(Operation *op) { return dyn_cast(op) != nullptr; } static bool isReserve(Operation *op) { return dyn_cast(op) != nullptr; } static bool isConstant(Operation *op) { return dyn_cast(op) != nullptr; } +static bool isFusedOp(Operation *op) { return dyn_cast(op) != nullptr; } // ---- Constant for phi_start operation ----. 
static constexpr unsigned kReserveOpIndex = 1; @@ -484,24 +485,29 @@ struct GenerateCodePass SmallVector &ctrl_movs, DenseMap &reserve_to_phi_map) { function.walk([&](Operation *op) { - // placement for every op (even for mov/reserve). + // Skips operations inside fused_op regions. + if (op->getParentOp() && isFusedOp(op->getParentOp())) { + return; + } + + // Records placement for every op (even for mov/reserve). operation_placements[op] = getTileLocation(op); - // build reserve -> phi mapping. + // Builds reserve -> phi mapping for loop-carried dependencies. if (isPhiStart(op)) { if (Value reserve = getReserveOperand(op)) { reserve_to_phi_map[reserve] = op; } } - // collect forwarders. + // Collects forwarders for later expansion. if (isDataMov(op)) { data_movs.push_back(op); return; } if (isCtrlMov(op)) { ctrl_movs.push_back(op); return; } - // skip Reserve from materialization. + // Skips Reserve from materialization. if (isReserve(op)) return; - // materialize all other ops placed on tiles (compute/phi/const/etc.). + // Materializes all other ops placed on tiles (compute/phi/const/fused_op/etc.). TileLocation placement = operation_placements[op]; if (!placement.has_tile) return; @@ -831,6 +837,31 @@ struct GenerateCodePass const DenseMap &reserve2phi) { if (!validateForwarderShape(forwarder)) return; + // Checks if this data_mov/ctrl_mov has mapping_locs assigned by MapToAcceleratorPass. + auto mapping_locs = getMappingLocations(forwarder); + if (!mapping_locs || mapping_locs.empty()) { + // Skips this mov operation - it will be handled by its consumer or does not need routing. + // This is expected for data_mov that only feeds into ctrl_mov. + if constexpr (!IsCtrl) { + // For data_mov without mapping, verifies if it is only used by ctrl_mov. 
+ bool only_ctrl_mov_users = true; + for (OpOperand &use : forwarder->getResult(0).getUses()) { + if (!isa(use.getOwner())) { + only_ctrl_mov_users = false; + break; + } + } + if (only_ctrl_mov_users) { + // This is expected - ctrl_mov handles this data transfer implicitly. + return; + } else { + // This data_mov has non-ctrl_mov users but no mapping - this is an error. + forwarder->emitWarning("data_mov without mapping_locs has non-ctrl_mov users"); + } + } + return; + } + MovBasics basics = buildMovBasics(forwarder, topo); emitMovRoutingInstructions(forwarder, basics, topo); @@ -1029,6 +1060,10 @@ struct GenerateCodePass if (operation == func.getOperation()) return; // Skips function itself. if (isReserve(operation)) return; // Skips reserve nodes entirely (bypass later). if (isa(operation)) return; // Skips yield nodes entirely (bypass later). + // Skips operations inside fused_op regions - they are handled by hardware + if (operation->getParentOp() && isFusedOp(operation->getParentOp())) { + return; + } int dfg_id = getDfgId(operation); if (dfg_id < 0) { diff --git a/lib/NeuraDialect/Transforms/InitExecLatencyPass.cpp b/lib/NeuraDialect/Transforms/InitExecLatencyPass.cpp new file mode 100644 index 00000000..c50023bc --- /dev/null +++ b/lib/NeuraDialect/Transforms/InitExecLatencyPass.cpp @@ -0,0 +1,184 @@ +//===- InitExecLatencyPass.cpp - Initialize Execution Latency --------------===// +// +// This pass initializes execution latency information. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "NeuraDialect/Architecture/ArchitectureSpec.h" +#include "NeuraDialect/NeuraDialect.h" +#include "NeuraDialect/NeuraOps.h" +#include "NeuraDialect/NeuraPasses.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/YAMLParser.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace mlir; + +#define GEN_PASS_DEF_INITEXECLATENCY +#include "NeuraDialect/NeuraPasses.h.inc" + +namespace { + +// Helper function to parse YAML scalar to integer +static bool parseYamlScalarInt(const llvm::yaml::Node *node, int &result) { + auto *scalar = llvm::dyn_cast_or_null(node); + if (!scalar) + return false; + llvm::SmallString<64> value_string; + llvm::StringRef value_ref = scalar->getValue(value_string); + long long temp_value = 0; + if (value_ref.getAsInteger(10, temp_value)) + return false; + result = static_cast(temp_value); + return true; +} + +// Helper function to parse YAML scalar to string +static bool parseYamlScalarString(const llvm::yaml::Node *node, + std::string &result) { + auto *scalar = llvm::dyn_cast_or_null(node); + if (!scalar) + return false; + llvm::SmallString<64> value_string; + llvm::StringRef value_ref = scalar->getValue(value_string); + result = value_ref.str(); + return true; +} + +// Parse latency YAML file: expects a mapping of operation names to latency values +static bool parseLatencyYaml(const std::string &file_path, + std::map &latency_map) { + llvm::ErrorOr> buffer_or_err = + llvm::MemoryBuffer::getFile(file_path); + if (!buffer_or_err) { + llvm::errs() << "[InitExecLatencyPass] Failed to open latency specification file: " + << file_path << "\n"; + return false; + } + + llvm::SourceMgr sm; + sm.AddNewSourceBuffer(std::move(*buffer_or_err), llvm::SMLoc()); + llvm::yaml::Stream yaml_stream( + 
sm.getMemoryBuffer(sm.getMainFileID())->getBuffer(), sm); + + llvm::yaml::Document &yaml_doc = *yaml_stream.begin(); + if (yaml_stream.failed()) { + llvm::errs() << "[InitExecLatencyPass] YAML parse error in: " << file_path << "\n"; + return false; + } + + auto *root = yaml_doc.getRoot(); + if (!root) { + llvm::errs() << "[InitExecLatencyPass] Empty YAML document\n"; + return false; + } + + auto *root_map = llvm::dyn_cast(root); + if (!root_map) { + llvm::errs() << "[InitExecLatencyPass] YAML root is not a mapping\n"; + return false; + } + + for (auto &key_value_pair : *root_map) { + auto *key_node = + llvm::dyn_cast_or_null(key_value_pair.getKey()); + if (!key_node) + continue; + + std::string op_name; + if (!parseYamlScalarString(key_node, op_name)) + continue; + + int latency_value = 0; + if (!parseYamlScalarInt(key_value_pair.getValue(), latency_value)) + continue; + + latency_map[op_name] = latency_value; + } + + return true; +} + +void SetLatency(Operation *op, std::map &latency_map) { + // Get operation name and look up latency + std::string op_name = op->getName().getStringRef().str(); + if (op_name.compare("neura.fused_op") == 0) { + op_name = op->getAttrOfType("pattern_name").getValue().str(); + } + op_name = op_name.substr(op_name.find_last_of(".") + 1); // remove neura. 
prefix if exists + auto it = latency_map.find(op_name); + if (it != latency_map.end()) { + op->setAttr("latency", + IntegerAttr::get(IntegerType::get(op->getContext(), 32), it->second)); + } + else { + op->setAttr("latency", + IntegerAttr::get(IntegerType::get(op->getContext(), 32), 1)); + } +} + +struct InitExecLatencyPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(InitExecLatencyPass) + + InitExecLatencyPass() = default; + InitExecLatencyPass(const InitExecLatencyPass &pass) + : PassWrapper>(pass) {} + + StringRef getArgument() const override { return "init-exec-latency"; } + StringRef getDescription() const override { + return "Initialize execution latency information."; + } + + void runOnOperation() override { + + ModuleOp module_op = getOperation(); + llvm::errs() << "[InitExecLatencyPass] Running init-exec-latency pass\n"; + // Get latency spec file from global function (set by command line) + std::string latency_file = mlir::neura::getLatencySpecFile(); + if (latency_file.empty()) { + latency_file = "latency_map.yaml"; // default file name + } + + llvm::errs() << "[InitExecLatencyPass] Latency file: " << latency_file << "\n"; + // Builds a map of operation name to latency + std::map latency_map; + if (!parseLatencyYaml(latency_file, latency_map)) { + llvm::errs() << "[InitExecLatencyPass] Failed to parse latency specification file: " << latency_file << "\n"; + return; + } + + // Apply latency values to operations + module_op.walk([&](Operation *op) { + if (!op->getRegions().empty()) { + for (Region ®ion : op->getRegions()) { + region.walk([&](Operation *inner_op) { + // Skip operations inside fused_op regions + if (inner_op->getParentOp() && isa(inner_op->getParentOp())) { + return; + } + + if (inner_op->getName().getStringRef().str() == "neura.data_mov" || inner_op->getName().getStringRef().str() == "neura.reserve") { + return; + } + + SetLatency(inner_op, latency_map); + }); + } + } + }); + } +}; + +} // namespace + +namespace 
mlir::neura { +std::unique_ptr createInitExecLatencyPass() { + return std::make_unique(); +} +} // namespace mlir::neura diff --git a/test/neura/kernel_fusion/kernel.cpp b/test/neura/kernel_fusion/kernel.cpp new file mode 100644 index 00000000..6e8e44d2 --- /dev/null +++ b/test/neura/kernel_fusion/kernel.cpp @@ -0,0 +1,150 @@ +// Test cases for FuseKernelPass +// +// Build workflow using Polygeist: +// 1. cgeist kernel_fusion.cpp -S -O2 -> SCF loops (kernel_fusion_scf.mlir) +// 2. polygeist-opt --raise-scf-to-affine -> Affine loops (kernel_fusion_affine.mlir) +// 3. mlir-neura-opt --wrap-loop-in-kernel -> neura.kernel ops (kernel_fusion_wrapped.mlir) +// 4. mlir-neura-opt --fuse-kernel -> Fused kernels (kernel_fusion_fused.mlir) + +#define N 64 + +float A[N], B[N], C[N], D[N], E[N], F[N], G[N], H[N], X[N], Y[N]; + +// Producer-Consumer Fusion: kernel0 -> kernel1 +// kernel0: C[i] = A[i] + B[i] +// kernel1: D[i] = C[i] * 2.0 +void test_producer_consumer_fusion(float A[], float B[], float C[], float D[]) { + for (int i = 0; i < N; i++) { + C[i] = A[i] + B[i]; + } + + for (int i = 0; i < N; i++) { + D[i] = C[i] * 2.0f; + } +} + +// Multiple Consumers: kernel0 -> kernel1, kernel0 -> kernel2 +// kernel0: C[i] = A[i] + B[i] +// kernel1: D[i] = C[i] * 2.0 +// kernel2: E[i] = C[i] + 1.0 +void test_multiple_consumers(float A[], float B[], float C[], float D[], float E[]) { + for (int i = 0; i < N; i++) { + C[i] = A[i] + B[i]; + } + + for (int i = 0; i < N; i++) { + D[i] = C[i] * 2.0f; + } + + for (int i = 0; i < N; i++) { + E[i] = C[i] + 1.0f; + } +} + +// Sibling Fusion: kernel0 || kernel1 (share input A) +// kernel0: E[i] = A[i] * 3.0 +// kernel1: F[i] = A[i] + 1.0 +void test_sibling_fusion(float A[], float E[], float F[]) { + for (int i = 0; i < N; i++) { + E[i] = A[i] * 3.0f; + } + + for (int i = 0; i < N; i++) { + F[i] = A[i] + 1.0f; + } +} + +// No Shared Input: kernel0, kernel1 (no fusion - different inputs) +// kernel0: G[i] = X[i] * 2.0 +// kernel1: H[i] = Y[i] + 
3.0 +void test_no_shared_input(float X[], float Y[], float G[], float H[]) { + for (int i = 0; i < N; i++) { + G[i] = X[i] * 2.0f; + } + + for (int i = 0; i < N; i++) { + H[i] = Y[i] + 3.0f; + } +} + +// Chain Fusion: kernel0 -> kernel1 -> kernel2 +// kernel0: C[i] = A[i] + B[i] +// kernel1: D[i] = C[i] * 2.0 +// kernel2: E[i] = D[i] + 1.0 +void test_chain_fusion(float A[], float B[], float C[], float D[], float E[]) { + for (int i = 0; i < N; i++) { + C[i] = A[i] + B[i]; + } + + for (int i = 0; i < N; i++) { + D[i] = C[i] * 2.0f; + } + + for (int i = 0; i < N; i++) { + E[i] = D[i] + 1.0f; + } +} + +// Complex Sibling: (kernel0 || kernel1 || kernel2), kernel3 +// kernel0: C[i] = A[i] * 2.0 +// kernel1: D[i] = A[i] + 1.0 } siblings (share A) +// kernel2: E[i] = A[i] - 1.0 +// kernel3: F[i] = B[i] * 3.0 (independent) +void test_complex_sibling(float A[], float B[], float C[], float D[], float E[], float F[]) { + for (int i = 0; i < N; i++) { + C[i] = A[i] * 2.0f; + } + + for (int i = 0; i < N; i++) { + D[i] = A[i] + 1.0f; + } + + for (int i = 0; i < N; i++) { + E[i] = A[i] - 1.0f; + } + + for (int i = 0; i < N; i++) { + F[i] = B[i] * 3.0f; + } +} + +// Mixed Patterns: (kernel0 -> kernel3) || (kernel1 || kernel2) +// kernel0: C[i] = A[i] + B[i] ─┐ +// kernel1: D[i] = A[i] * 2.0 ├─ siblings (share A) +// kernel2: E[i] = A[i] + 3.0 ─┘ +// kernel3: F[i] = C[i] * 2.0 (consumer of kernel0) +void test_mixed_patterns(float A[], float B[], float C[], float D[], float E[], float F[]) { + for (int i = 0; i < N; i++) { + C[i] = A[i] + B[i]; + } + + for (int i = 0; i < N; i++) { + D[i] = A[i] * 2.0f; + } + + for (int i = 0; i < N; i++) { + E[i] = A[i] + 3.0f; + } + + for (int i = 0; i < N; i++) { + F[i] = C[i] * 2.0f; + } +} + +int main() { + for (int i = 0; i < N; i++) { + A[i] = (float)i; + B[i] = (float)(i * 2); + X[i] = (float)(i + 1); + Y[i] = (float)(i - 1); + } + + test_producer_consumer_fusion(A, B, C, D); + test_sibling_fusion(A, E, F); + test_no_shared_input(X, Y, G, 
H); + test_chain_fusion(A, B, C, D, E); + test_complex_sibling(A, B, C, D, E, F); + test_mixed_patterns(A, B, C, D, E, F); + + return 0; +} diff --git a/test/neura/kernel_fusion/test.mlir b/test/neura/kernel_fusion/test.mlir new file mode 100644 index 00000000..3693ec24 --- /dev/null +++ b/test/neura/kernel_fusion/test.mlir @@ -0,0 +1,210 @@ +// RUN: mlir-neura-opt --wrap-loop-in-kernel --fuse-kernel %s 2>&1 | FileCheck %s + +// ============================================================================= +// TEST 1: Producer-Consumer Fusion +// Expected: Both loops should be fused into a single kernel. +// ============================================================================= + +// CHECK-LABEL: func.func @test_producer_consumer_fusion(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref) { +// CHECK: neura.kernel ins(%arg0, %arg1, %arg2, %cst, %arg3 : memref, memref, memref, f32, memref) attributes {kernel_name = "fused_sibling"} { +// CHECK: affine.for +// CHECK: arith.addf +// CHECK: affine.for +// CHECK: arith.mulf +// CHECK-NOT: neura.kernel +// CHECK: return + +func.func @test_producer_consumer_fusion(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref) { + %cst = arith.constant 2.000000e+00 : f32 + affine.for %arg4 = 0 to 64 { + %0 = memref.load %arg0[%arg4] : memref + %1 = memref.load %arg1[%arg4] : memref + %2 = arith.addf %0, %1 : f32 + memref.store %2, %arg2[%arg4] : memref + } + affine.for %arg4 = 0 to 64 { + %0 = memref.load %arg2[%arg4] : memref + %1 = arith.mulf %0, %cst : f32 + memref.store %1, %arg3[%arg4] : memref + } + return +} + +// ============================================================================= +// TEST 2: Sibling Fusion +// Expected: Both loops should be fused into a single kernel. 
+// ============================================================================= + +// CHECK-LABEL: func.func @test_sibling_fusion(%arg0: memref, %arg1: memref, %arg2: memref) { +// CHECK: neura.kernel ins(%arg0, %cst_0, %arg1, %cst, %arg2 : memref, f32, memref, f32, memref) attributes {kernel_name = "fused_sibling"} { +// CHECK: affine.for +// CHECK: arith.mulf +// CHECK: affine.for +// CHECK: arith.addf +// CHECK-NOT: neura.kernel +// CHECK: return + +func.func @test_sibling_fusion(%arg0: memref, %arg1: memref, %arg2: memref) { + %cst = arith.constant 1.000000e+00 : f32 + %cst_0 = arith.constant 3.000000e+00 : f32 + affine.for %arg3 = 0 to 64 { + %0 = memref.load %arg0[%arg3] : memref + %1 = arith.mulf %0, %cst_0 : f32 + memref.store %1, %arg1[%arg3] : memref + } + affine.for %arg3 = 0 to 64 { + %0 = memref.load %arg0[%arg3] : memref + %1 = arith.addf %0, %cst : f32 + memref.store %1, %arg2[%arg3] : memref + } + return +} + +// ============================================================================= +// TEST 3: No Shared Input (No Fusion) +// Expected: Kernels should NOT be fused as siblings since they don't share input. 
+// ============================================================================= +// CHECK-LABEL: func.func @test_no_shared_input(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref) { +// CHECK: neura.kernel +// CHECK-SAME: kernel_name = "kernel_0" +// CHECK: neura.kernel +// CHECK-SAME: kernel_name = "kernel_1" +// CHECK: return + +func.func @test_no_shared_input(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref) { + %cst = arith.constant 3.000000e+00 : f32 + %cst_0 = arith.constant 2.000000e+00 : f32 + affine.for %arg4 = 0 to 64 { + %0 = memref.load %arg0[%arg4] : memref + %1 = arith.mulf %0, %cst_0 : f32 + memref.store %1, %arg2[%arg4] : memref + } + affine.for %arg4 = 0 to 64 { + %0 = memref.load %arg1[%arg4] : memref + %1 = arith.addf %0, %cst : f32 + memref.store %1, %arg3[%arg4] : memref + } + return +} + +// ============================================================================= +// TEST 4: Chain fusion: A -> B -> C +// Expected: All kernels should be fused into a single kernel. 
+// ============================================================================= +// CHECK-LABEL: func.func @test_chain_fusion(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref) { +// CHECK: neura.kernel ins(%arg0, %arg1, %arg2, %cst_0, %arg3, %cst, %arg4 : memref, memref, memref, f32, memref, f32, memref) attributes {kernel_name = "fused_sibling"} { +// CHECK: affine.for +// CHECK: arith.addf +// CHECK: affine.for +// CHECK: arith.mulf +// CHECK: affine.for +// CHECK: arith.addf +// CHECK-NOT: neura.kernel +// CHECK: return + +func.func @test_chain_fusion(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref) { + %cst = arith.constant 1.000000e+00 : f32 + %cst_0 = arith.constant 2.000000e+00 : f32 + affine.for %arg5 = 0 to 64 { + %0 = memref.load %arg0[%arg5] : memref + %1 = memref.load %arg1[%arg5] : memref + %2 = arith.addf %0, %1 : f32 + memref.store %2, %arg2[%arg5] : memref + } + affine.for %arg5 = 0 to 64 { + %0 = memref.load %arg2[%arg5] : memref + %1 = arith.mulf %0, %cst_0 : f32 + memref.store %1, %arg3[%arg5] : memref + } + affine.for %arg5 = 0 to 64 { + %0 = memref.load %arg3[%arg5] : memref + %1 = arith.addf %0, %cst : f32 + memref.store %1, %arg4[%arg5] : memref + } + return +} + +// ============================================================================= +// TEST 5: Complex Sibling Fusion +// Expected: Siblings that share inputs should be fused, but kernel_3 should remain as a separate kernel. 
+// ============================================================================= + +// CHECK-LABEL: func.func @test_complex_sibling(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref) { +// CHECK: neura.kernel ins(%arg0, %cst_1, %arg2, %cst_0, %arg3, %arg4 : memref, f32, memref, f32, memref, memref) attributes {kernel_name = "fused_sibling"} { +// CHECK: affine.for +// CHECK: arith.mulf +// CHECK: affine.for +// CHECK: arith.addf +// CHECK: affine.for +// CHECK: arith.subf +// CHECK: neura.kernel +// CHECK-SAME: kernel_name = "kernel_3" +// CHECK: return + +func.func @test_complex_sibling(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref) { + %cst = arith.constant 3.000000e+00 : f32 + %cst_0 = arith.constant 1.000000e+00 : f32 + %cst_1 = arith.constant 2.000000e+00 : f32 + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg0[%arg6] : memref + %1 = arith.mulf %0, %cst_1 : f32 + memref.store %1, %arg2[%arg6] : memref + } + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg0[%arg6] : memref + %1 = arith.addf %0, %cst_0 : f32 + memref.store %1, %arg3[%arg6] : memref + } + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg0[%arg6] : memref + %1 = arith.subf %0, %cst_0 : f32 + memref.store %1, %arg4[%arg6] : memref + } + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg1[%arg6] : memref + %1 = arith.mulf %0, %cst : f32 + memref.store %1, %arg5[%arg6] : memref + } + return +} + +// ============================================================================= +// TEST 6: Mixed Patterns +// Expected: All four loops should be fused into a single kernel. 
+// ============================================================================= + +// CHECK-LABEL: func.func @test_mixed_patterns(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref) { +// CHECK: neura.kernel ins(%arg0, %arg1, %arg2, %cst_0, %arg3, %cst, %arg4, %arg5 : memref, memref, memref, f32, memref, f32, memref, memref) attributes {kernel_name = "fused_sibling"} { +// CHECK: affine.for +// CHECK: affine.for +// CHECK: affine.for +// CHECK: affine.for +// CHECK-NOT: neura.kernel +// CHECK: return + +func.func @test_mixed_patterns(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref) { + %cst = arith.constant 3.000000e+00 : f32 + %cst_0 = arith.constant 2.000000e+00 : f32 + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg0[%arg6] : memref + %1 = memref.load %arg1[%arg6] : memref + %2 = arith.addf %0, %1 : f32 + memref.store %2, %arg2[%arg6] : memref + } + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg0[%arg6] : memref + %1 = arith.mulf %0, %cst_0 : f32 + memref.store %1, %arg3[%arg6] : memref + } + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg0[%arg6] : memref + %1 = arith.addf %0, %cst : f32 + memref.store %1, %arg4[%arg6] : memref + } + affine.for %arg6 = 0 to 64 { + %0 = memref.load %arg2[%arg6] : memref + %1 = arith.mulf %0, %cst_0 : f32 + memref.store %1, %arg5[%arg6] : memref + } + return +} diff --git a/tools/mlir-neura-opt/mlir-neura-opt.cpp b/tools/mlir-neura-opt/mlir-neura-opt.cpp index f7569960..fd2a7ca8 100644 --- a/tools/mlir-neura-opt/mlir-neura-opt.cpp +++ b/tools/mlir-neura-opt/mlir-neura-opt.cpp @@ -31,12 +31,18 @@ // Global variable to store architecture spec file path static std::string architecture_spec_file; static mlir::neura::TileDefaults tile_defaults; +static std::string latency_spec_file; // Function to get the architecture spec file path std::string mlir::neura::getArchitectureSpecFile() { return architecture_spec_file; } +// Function to get 
the latency spec file path +std::string mlir::neura::getLatencySpecFile() { + return latency_spec_file; +} + // Function to get tile defaults configuration mlir::neura::TileDefaults mlir::neura::getTileDefaults() { return tile_defaults; @@ -60,6 +66,15 @@ int main(int argc, char **argv) { architecture_spec_file = arg_ref.substr(strlen("--architecture-spec=")).str(); continue; + } else if (arg_ref == "--latency-spec") { + if (i + 1 < argc) { + latency_spec_file = argv[i + 1]; + ++i; // skip value + continue; + } + } else if (arg_ref.starts_with("--latency-spec=")) { + latency_spec_file = arg_ref.substr(strlen("--latency-spec=")).str(); + continue; } forwarded_args.push_back(argv[i]); }