diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h
index 36dc4c63..95aa70c8 100644
--- a/include/NeuraDialect/NeuraPasses.h
+++ b/include/NeuraDialect/NeuraPasses.h
@@ -47,6 +47,9 @@ std::unique_ptr<Pass> createCanonicalizeCastPass();
 std::unique_ptr<Pass> createIterMergePatternPass();
 std::unique_ptr<Pass> createInitPatternPass();
 
+// Hardware optimization passes
+std::unique_ptr<Pass> createHardwareMergePass();
+
 #define GEN_PASS_REGISTRATION
 #include "NeuraDialect/NeuraPasses.h.inc"
 
diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td
index de1dcc67..ec0df60b 100644
--- a/include/NeuraDialect/NeuraPasses.td
+++ b/include/NeuraDialect/NeuraPasses.td
@@ -174,4 +174,24 @@ def WrapLoopInKernelPass : Pass<"wrap-loop-in-kernel", "func::FuncOp">{
   }];
   let constructor = "neura::createWrapLoopInKernelPass()";
 }
+
+def HardwareMerge : Pass<"hardware-merge", "ModuleOp"> {
+  let summary = "Merge and optimize hardware units for pattern execution";
+  let description = [{
+    This pass analyzes patterns (fused_op regions) and designs a hardware
+    configuration that supports all patterns while minimizing hardware cost.
+    It uses a Functional Unit (FU) based design in which each FU executes
+    exactly one operation type.
+
+    Algorithm:
+    1. Extract pattern DAGs with topological structure from fused_op regions.
+    2. Sort patterns by complexity (distinct operation count, then cost).
+    3. For each pattern, try to accommodate it into an existing template by
+       reusing FUs with matching operation types.
+    4. If the accommodation cost is too high, create a new template with
+       dedicated FUs for the pattern.
+    5. Generate FU connections from pattern dependencies, applying a
+       transitive-reduction optimization.
+    6. Generate execution plans with parallel execution stages.
+    7. Write the final hardware configuration to a JSON file.
+  }];
+  let constructor = "neura::createHardwareMergePass()";
+}
 #endif // NEURA_PASSES_TD
\ No newline at end of file
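Step 2 of the algorithm above, expressed as code: patterns with more distinct operation types (and, as a tie-breaker, higher cost) are placed first, so the hardest-to-fit patterns seed the templates. A minimal standalone sketch with a stand-in `Pattern` type (the real pass uses `HardwarePattern` from the header below):

```cpp
#include <algorithm>
#include <set>
#include <string>
#include <vector>

struct Pattern { std::vector<std::string> ops; double cost; };

void sort_by_complexity(std::vector<Pattern>& patterns) {
  auto distinct = [](const Pattern& p) {
    return std::set<std::string>(p.ops.begin(), p.ops.end()).size();
  };
  std::sort(patterns.begin(), patterns.end(),
            [&](const Pattern& a, const Pattern& b) {
              size_t da = distinct(a), db = distinct(b);
              if (da != db) return da > db;  // more distinct op types first
              return a.cost > b.cost;        // then more expensive first
            });
}
```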
diff --git a/include/NeuraDialect/Transforms/GraphMining/HardwareTemplate.h b/include/NeuraDialect/Transforms/GraphMining/HardwareTemplate.h
new file mode 100644
index 00000000..586c1959
--- /dev/null
+++ b/include/NeuraDialect/Transforms/GraphMining/HardwareTemplate.h
@@ -0,0 +1,224 @@
+//===- HardwareTemplate.h - Hardware Template Data Structures and Helpers -===//
+//
+// This file contains declarations for hardware template data structures and
+// helper functions for hardware template merging.
+//
+// The hardware template system maximizes pattern coverage while minimizing
+// hardware cost through resource sharing. Key concepts:
+//
+// - FunctionalUnit (FU): A single hardware unit that executes one operation type.
+// - HardwareTemplate: A collection of FUs with connections supporting multiple patterns.
+// - HardwarePattern: A sequence of operations mapped to template FUs.
+//
+// For detailed documentation with examples and diagrams, see:
+// docs/HardwareTemplateGuide.md
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NEURA_DIALECT_TRANSFORMS_GRAPHMINING_HARDWARETEMPLATE_H
+#define NEURA_DIALECT_TRANSFORMS_GRAPHMINING_HARDWARETEMPLATE_H
+
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/BuiltinOps.h"
+#include <cstdint>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace mlir {
+namespace neura {
+class FusedOp;
+}
+}
+
+namespace mlir::neura {
+
+// A pattern extracted from a fused_op region: its operations plus the
+// dependency metadata needed to map it onto a template.
+struct HardwarePattern {
+  int64_t id;
+  std::string name;
+  int64_t freq;
+  std::vector<std::string> ops;
+  std::vector<int> op_levels;              // Topological level for each op (ops at the same level can run in parallel).
+  std::vector<std::vector<int>> op_preds;  // Predecessors for each op (dependency graph).
+  double cost;
+
+  HardwarePattern(int64_t i, const std::string& n, int64_t f);
+};
+
+//===----------------------------------------------------------------------===//
+// FunctionalUnit (FU) - A single hardware execution unit
+//===----------------------------------------------------------------------===//
+//
+// A FunctionalUnit represents a single hardware unit that can execute exactly
+// one type of operation (e.g., adder, multiplier, load unit).
+//
+// Key Properties:
+// ---------------
+// 1. SINGLE OPERATION TYPE: Each FU executes exactly one operation type.
+//    For example, an "adder" FU only executes neura.add operations.
+//
+// 2. MULTIPLE INSTANCES: A template can have multiple FUs of the same type,
+//    for example two adders to support patterns needing parallel additions.
+//
+// 3. DIRECT CONNECTIONS: FUs are connected directly to each other, forming
+//    a dataflow graph within the template.
+//
+// Example:
+// --------
+// Consider a template supporting the pattern: gep -> load -> add -> store
+//
+// Template structure:
+//   ┌─────┐     ┌──────┐     ┌─────┐     ┌───────┐
+//   │ gep │ --> │ load │ --> │ add │ --> │ store │
+//   │FU 0 │     │ FU 1 │     │FU 2 │     │ FU 3  │
+//   └─────┘     └──────┘     └─────┘     └───────┘
+//
+// For patterns with parallel operations (e.g., add + mul -> store):
+//   ┌─────┐
+//   │ add │ ──┐
+//   │FU 0 │   │    ┌───────┐
+//   └─────┘   ├──> │ store │
+//   ┌─────┐   │    │ FU 2  │
+//   │ mul │ ──┘    └───────┘
+//   │FU 1 │
+//   └─────┘
+//
+//===----------------------------------------------------------------------===//
+struct FunctionalUnit {
+  int id;               // Unique FU ID within the template.
+  std::string op_type;  // Operation type this FU executes (e.g., "neura.add").
+
+  FunctionalUnit(int i, const std::string& op);
+};
+
+// Execution stage for a pattern - contains FU indices that can execute in parallel.
+struct ExecutionStage {
+  std::vector<int> fus;          // FUs that execute in this stage (parallel).
+  std::vector<std::string> ops;  // Corresponding operations.
+};
+
+// Execution plan for a pattern on a hardware template.
+struct PatternExecutionPlan {
+  int64_t pattern_id;
+  std::string pattern_name;
+  std::vector<ExecutionStage> stages;  // Ordered stages of execution.
+};
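How the second diagram above would be represented with these structures — a sketch assuming this header is on the include path; the IDs and pattern name are illustrative only:

```cpp
#include "NeuraDialect/Transforms/GraphMining/HardwareTemplate.h"

using namespace mlir::neura;

PatternExecutionPlan make_parallel_plan() {
  ExecutionStage s0, s1;
  s0.fus = {0, 1};                      // add and mul run in parallel
  s0.ops = {"neura.add", "neura.mul"};
  s1.fus = {2};                         // store consumes both results
  s1.ops = {"neura.store"};

  PatternExecutionPlan plan;
  plan.pattern_id = 0;                  // hypothetical pattern ID
  plan.pattern_name = "add+mul->store";
  plan.stages = {s0, s1};
  return plan;
}
```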
+// Operations supported by a hardware template.
+struct TemplateSupportedOps {
+  int template_id;
+  std::set<std::string> single_ops;    // Individual ops this template can support.
+  std::vector<int64_t> composite_ops;  // Pattern IDs (composite operations).
+};
+
+class OperationCostModel {
+public:
+  OperationCostModel();
+  double get(const std::string& op) const;
+  double fu_cost(const std::string& op) const;
+  double pattern_cost(const std::vector<std::string>& ops) const;
+private:
+  std::map<std::string, double> costs;
+};
+
+//===----------------------------------------------------------------------===//
+// HardwareTemplate - A collection of FUs forming a reusable hardware block
+//===----------------------------------------------------------------------===//
+//
+// A HardwareTemplate contains multiple FunctionalUnits connected together.
+// Multiple patterns can be mapped to the same template by reusing FUs.
+//
+// Key differences from the old slot-based design:
+// - Each FU has exactly one operation type (no multiplexing within an FU).
+// - A template can have multiple FUs of the same type.
+// - Connections are between specific FU IDs, not abstract slot positions.
+//
+//===----------------------------------------------------------------------===//
+struct HardwareTemplate {
+  int id;
+  std::vector<FunctionalUnit> fus;              // All FUs in this template.
+  std::vector<int64_t> patterns;                // Pattern IDs mapped to this template.
+  std::map<int64_t, std::vector<int>> mapping;  // pattern_id -> FU id sequence.
+  std::set<std::pair<int, int>> connections;    // FU connections: (from_fu_id, to_fu_id).
+  int instances;
+
+  HardwareTemplate(int i);
+
+  // Adds a new FU with the given operation type, returns its ID.
+  int add_fu(const std::string& op_type);
+
+  // Finds an existing FU that can handle the operation, or -1 if none available.
+  int find_available_fu(const std::string& op_type, const std::set<int>& used_fus) const;
+
+  // Finds a mapping for a pattern into the existing template.
+  // Returns FU IDs for each operation, or empty if no valid mapping exists.
+  std::vector<int> find_mapping(const HardwarePattern& pat) const;
+
+  // Tries to accommodate a pattern, potentially adding new FUs.
+  // Returns true if successful, with the mapping and cost increase.
+  bool try_accommodate(const HardwarePattern& pat, const OperationCostModel& cm,
+                       std::vector<int>& out_mapping, double& out_cost_increase);
+
+  // Applies a mapping to the template.
+  void apply_mapping(const HardwarePattern& pat, const std::vector<int>& m);
+
+  // Computes the total cost of the template.
+  double compute_cost(const OperationCostModel& cm) const;
+
+  // Checks if two operations are compatible (can potentially share resources in future).
+  static bool compatible(const std::string& a, const std::string& b);
+
+private:
+  // DFS helper for finding mappings.
+  void dfs_find_mapping(const HardwarePattern& pat, size_t op_idx,
+                        std::vector<int>& cur_mapping, std::set<int>& used_fus,
+                        std::vector<int>& best_mapping, int& best_reuse_count) const;
+};
+
+// Extracts all patterns from module.
+void extract_patterns(ModuleOp module, std::vector<HardwarePattern>& patterns, OperationCostModel& cost_model);
+
+// Extracts all standalone operations from module (ops not inside FusedOp).
+void extract_all_standalone_ops(ModuleOp module, std::set<std::string>& all_ops);
+
+// Creates hardware templates from patterns.
+void create_hardware_templates(const std::vector<HardwarePattern>& patterns, std::vector<HardwareTemplate>& templates, OperationCostModel& cost_model);
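Typical use of the `HardwareTemplate` API above — a first-fit sketch (the pass itself picks the cheapest template, as shown in `create_hardware_templates` later in this diff); the 0.5 threshold matches the pass's merge heuristic:

```cpp
#include "NeuraDialect/Transforms/GraphMining/HardwareTemplate.h"

using namespace mlir::neura;

void accommodate_or_create(std::vector<HardwareTemplate>& templates,
                           const HardwarePattern& pat,
                           const OperationCostModel& cm) {
  for (auto& tmpl : templates) {
    std::vector<int> mapping;
    double cost_increase = 0;
    if (tmpl.try_accommodate(pat, cm, mapping, cost_increase) &&
        cost_increase <= pat.cost * 0.5) {  // cheap enough: merge into this template
      tmpl.apply_mapping(pat, mapping);
      return;
    }
  }
  // Too expensive to merge anywhere: build a dedicated template.
  HardwareTemplate tmpl(static_cast<int>(templates.size()));
  std::vector<int> mapping;
  for (const auto& op : pat.ops) mapping.push_back(tmpl.add_fu(op));
  tmpl.apply_mapping(pat, mapping);
  templates.push_back(tmpl);
}
```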
+// Generates FU connections for all templates based on pattern dependencies.
+void generate_connections(const std::vector<HardwarePattern>& patterns, std::vector<HardwareTemplate>& templates);
+
+// Generates optimized FU connections (removes redundant connections using transitive reachability).
+void generate_optimized_connections(const std::vector<HardwarePattern>& patterns, std::vector<HardwareTemplate>& templates);
+
+// Generates execution plans for all patterns on their assigned templates.
+void generate_execution_plans(const std::vector<HardwarePattern>& patterns,
+                              const std::vector<HardwareTemplate>& templates,
+                              std::vector<PatternExecutionPlan>& plans);
+
+// Collects supported operations (single + composite) for each template.
+void collect_supported_operations(const std::vector<HardwarePattern>& patterns,
+                                  const std::vector<HardwareTemplate>& templates,
+                                  const std::set<std::string>& all_dfg_ops,
+                                  std::vector<TemplateSupportedOps>& supported_ops);
+
+// Calculates total cost of templates.
+double calculate_total_cost(const std::vector<HardwareTemplate>& templates, const OperationCostModel& cost_model);
+
+// Writes hardware configuration to JSON file (extended version with execution plans and supported ops).
+void write_hardware_config_json(const std::string& path,
+                                const std::vector<HardwarePattern>& patterns,
+                                const std::vector<HardwareTemplate>& templates,
+                                const OperationCostModel& cost_model,
+                                const std::vector<PatternExecutionPlan>& execution_plans,
+                                const std::vector<TemplateSupportedOps>& supported_ops);
+
+// Legacy version for backward compatibility.
+void write_hardware_config_json(const std::string& path, const std::vector<HardwarePattern>& patterns, const std::vector<HardwareTemplate>& templates, const OperationCostModel& cost_model);
+
+} // namespace mlir::neura
+
+#endif // NEURA_DIALECT_TRANSFORMS_GRAPHMINING_HARDWARETEMPLATE_H
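The intended call order of the helpers declared above, mirroring what `HardwareMergePass::runOnOperation` does later in this diff (a sketch; the step numbers refer to the algorithm in the pass description):

```cpp
#include "NeuraDialect/Transforms/GraphMining/HardwareTemplate.h"

using namespace mlir;
using namespace mlir::neura;

void run_hardware_merge(ModuleOp module, const std::string& out_path) {
  std::vector<HardwarePattern> patterns;
  std::vector<HardwareTemplate> templates;
  OperationCostModel cost_model;

  extract_patterns(module, patterns, cost_model);              // 1. pattern DAGs
  create_hardware_templates(patterns, templates, cost_model);  // 2-4. merge
  generate_optimized_connections(patterns, templates);         // 5. connections

  std::vector<PatternExecutionPlan> plans;
  generate_execution_plans(patterns, templates, plans);        // 6. stages

  std::set<std::string> all_ops;
  extract_all_standalone_ops(module, all_ops);
  for (const auto& p : patterns)
    for (const auto& op : p.ops) all_ops.insert(op);

  std::vector<TemplateSupportedOps> supported;
  collect_supported_operations(patterns, templates, all_ops, supported);

  write_hardware_config_json(out_path, patterns, templates, cost_model,
                             plans, supported);                // 7. JSON output
}
```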
diff --git a/lib/NeuraDialect/Transforms/CMakeLists.txt b/lib/NeuraDialect/Transforms/CMakeLists.txt
index e043bcce..85200b48 100644
--- a/lib/NeuraDialect/Transforms/CMakeLists.txt
+++ b/lib/NeuraDialect/Transforms/CMakeLists.txt
@@ -17,6 +17,8 @@ add_mlir_library(
   IterMergePatternPass.cpp
   TransformToSteerControlPass.cpp
   RemovePredicatedTypePass.cpp
+  HardwareMergePass.cpp
+  GraphMining/HardwareTemplate.cpp
   WrapLoopInKernelPass.cpp
 
 DEPENDS
diff --git a/lib/NeuraDialect/Transforms/GraphMining/HardwareTemplate.cpp b/lib/NeuraDialect/Transforms/GraphMining/HardwareTemplate.cpp
new file mode 100644
index 00000000..61efd91d
--- /dev/null
+++ b/lib/NeuraDialect/Transforms/GraphMining/HardwareTemplate.cpp
@@ -0,0 +1,817 @@
+//===- HardwareTemplate.cpp - Hardware Template Data Structures and Helpers -===//
+//
+// This file contains data structures and helper functions for hardware template
+// merging, including pattern extraction, template creation, and cost calculation.
+//
+// This version uses a Functional Unit (FU) based design where each FU executes
+// exactly one operation type, and templates can have multiple FUs of the same type.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NeuraDialect/Transforms/GraphMining/HardwareTemplate.h"
+#include "NeuraDialect/NeuraOps.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/ADT/DenseMap.h"
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <map>
+#include <set>
+#include <vector>
+
+using namespace mlir;
+
+namespace mlir::neura {
+
+// Initializes the operation cost model with default costs for each operation type.
+// TODO: The cost model should be imported from the input file.
+OperationCostModel::OperationCostModel() {
+  costs["neura.div"] = 100; costs["neura.fdiv"] = 100; costs["neura.rem"] = 80;
+  costs["neura.load"] = 50; costs["neura.store"] = 50;
+  costs["neura.load_indexed"] = 55; costs["neura.store_indexed"] = 55;
+  costs["neura.mul"] = 30; costs["neura.fmul"] = 35;
+  costs["neura.gep"] = 20;
+  costs["neura.add"] = 10; costs["neura.sub"] = 10;
+  costs["neura.fadd"] = 12; costs["neura.fsub"] = 12;
+  costs["neura.icmp"] = 15; costs["neura.fcmp"] = 15;
+  costs["neura.and"] = 5; costs["neura.or"] = 5; costs["neura.not"] = 5;
+  costs["neura.sel"] = 6; costs["neura.phi"] = 3;
+  costs["neura.grant_predicate"] = 3; costs["neura.grant_once"] = 3;
+  costs["neura.cast"] = 2; costs["neura.sext"] = 2; costs["neura.zext"] = 2;
+  costs["neura.data_mov"] = 1; costs["neura.constant"] = 1;
+}
+
+// Returns the cost for a given operation, or 5.0 as default if not found.
+double OperationCostModel::get(const std::string& op) const {
+  auto it = costs.find(op);
+  return it != costs.end() ? it->second : 5.0;
+}
+
+// Returns the cost for a single FU (same as get).
+double OperationCostModel::fu_cost(const std::string& op) const {
+  return get(op);
+}
+
+// Returns the total cost for a pattern by summing costs of all its operations.
+double OperationCostModel::pattern_cost(const std::vector<std::string>& ops) const {
+  double sum = 0;
+  for (const auto& op : ops) sum += get(op);
+  return sum;
+}
+
+// Constructs a HardwarePattern with the given id, name, and frequency.
+HardwarePattern::HardwarePattern(int64_t i, const std::string& n, int64_t f)
+    : id(i), name(n), freq(f), cost(0) {}
+
+// Constructs a FunctionalUnit with the given id and operation type.
+FunctionalUnit::FunctionalUnit(int i, const std::string& op) : id(i), op_type(op) {}
+
+// Constructs a HardwareTemplate with the given id and one instance.
+HardwareTemplate::HardwareTemplate(int i) : id(i), instances(1) {}
+
+// Adds a new FU with the given operation type, returns its ID.
+int HardwareTemplate::add_fu(const std::string& op_type) {
+  int new_id = fus.size();
+  fus.emplace_back(new_id, op_type);
+  return new_id;
+}
+
+// Finds an existing FU that can handle the operation and is not already used.
+int HardwareTemplate::find_available_fu(const std::string& op_type, const std::set<int>& used_fus) const {
+  for (const auto& fu : fus) {
+    if (fu.op_type == op_type && used_fus.find(fu.id) == used_fus.end()) {
+      return fu.id;
+    }
+  }
+  return -1; // No available FU found.
+}
+
+// Checks if two operations are compatible (could share hardware in extended designs).
+bool HardwareTemplate::compatible(const std::string& a, const std::string& b) {
+  if (a == b) return true;
+
+  auto in_group = [](const std::string& op, const std::vector<std::string>& group) -> bool {
+    for (const auto& keyword : group) {
+      if (op.find(keyword) != std::string::npos) return true;
+    }
+    return false;
+  };
+
+  static const std::vector<std::vector<std::string>> compatible_groups = {
+      {"add", "sub"},
+      {"grant_once", "grant_predicate", "grant_always"},
+      {"and", "or", "not", "xor"},
+      {"load", "store"}
+  };
+
+  for (const auto& group : compatible_groups) {
+    if (in_group(a, group) && in_group(b, group)) {
+      return true;
+    }
+  }
+
+  return false;
+}
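A quick worked example against the defaults above (a sketch; `cost_example` is just an illustrative wrapper):

```cpp
#include "NeuraDialect/Transforms/GraphMining/HardwareTemplate.h"

using namespace mlir::neura;

void cost_example() {
  OperationCostModel cm;
  // 20 (gep) + 50 (load) + 10 (add) + 50 (store) = 130.
  double c = cm.pattern_cost({"neura.gep", "neura.load", "neura.add", "neura.store"});
  // Ops missing from the table fall back to the 5.0 default.
  double d = cm.get("neura.phi_start");
  // "add" and "sub" are in the same compatibility group, so this is true.
  bool ok = HardwareTemplate::compatible("neura.add", "neura.sub");
  (void)c; (void)d; (void)ok;
}
```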
+// DFS helper for finding the best mapping with maximum FU reuse.
+void HardwareTemplate::dfs_find_mapping(const HardwarePattern& pat, size_t op_idx,
+                                        std::vector<int>& cur_mapping, std::set<int>& used_fus,
+                                        std::vector<int>& best_mapping, int& best_reuse_count) const {
+  if (op_idx >= pat.ops.size()) {
+    // Count reused FUs (existing FUs that were mapped).
+    int reuse_count = 0;
+    for (int fu_id : cur_mapping) {
+      if (fu_id >= 0 && fu_id < (int)fus.size()) {
+        reuse_count++;
+      }
+    }
+    if (reuse_count > best_reuse_count) {
+      best_reuse_count = reuse_count;
+      best_mapping = cur_mapping;
+    }
+    return;
+  }
+
+  const std::string& op = pat.ops[op_idx];
+
+  // Try to reuse an existing FU that matches this operation. Dependency
+  // constraints are not checked here: predecessors that are already mapped
+  // get their connections established later.
+  for (const auto& fu : fus) {
+    if (fu.op_type == op && used_fus.find(fu.id) == used_fus.end()) {
+      cur_mapping.push_back(fu.id);
+      used_fus.insert(fu.id);
+      dfs_find_mapping(pat, op_idx + 1, cur_mapping, used_fus, best_mapping, best_reuse_count);
+      used_fus.erase(fu.id);
+      cur_mapping.pop_back();
+    }
+  }
+
+  // Also try creating a "virtual" new FU, encoded as -1 - op_idx to keep it
+  // distinguishable from real FU IDs. A negative value indicates that a new
+  // FU is needed for this operation.
+  cur_mapping.push_back(-1 - (int)op_idx);
+  dfs_find_mapping(pat, op_idx + 1, cur_mapping, used_fus, best_mapping, best_reuse_count);
+  cur_mapping.pop_back();
+}
+
+// Finds a mapping for a pattern into the existing template.
+std::vector<int> HardwareTemplate::find_mapping(const HardwarePattern& pat) const {
+  std::vector<int> best_mapping;
+  int best_reuse = -1;
+  std::vector<int> cur_mapping;
+  std::set<int> used_fus;
+
+  dfs_find_mapping(pat, 0, cur_mapping, used_fus, best_mapping, best_reuse);
+
+  return best_mapping;
+}
+
+// Tries to accommodate a pattern into the existing template.
+bool HardwareTemplate::try_accommodate(const HardwarePattern& pat, const OperationCostModel& cm,
+                                       std::vector<int>& out_mapping, double& out_cost_increase) {
+  auto mapping = find_mapping(pat);
+  if (mapping.empty() && !pat.ops.empty()) {
+    return false;
+  }
+
+  // Convert negative indices (new FUs) to actual new FU IDs.
+  std::vector<int> final_mapping;
+  int next_fu_id = fus.size();
+  for (size_t i = 0; i < mapping.size(); ++i) {
+    if (mapping[i] < 0) {
+      final_mapping.push_back(next_fu_id++);
+    } else {
+      final_mapping.push_back(mapping[i]);
+    }
+  }
+
+  // The cost increase is the cost of the new FUs that must be added.
+  double old_cost = compute_cost(cm);
+  double new_cost = old_cost;
+  for (size_t i = 0; i < pat.ops.size(); ++i) {
+    if (mapping[i] < 0) {
+      new_cost += cm.fu_cost(pat.ops[i]);
+    }
+  }
+
+  out_mapping = final_mapping;
+  out_cost_increase = new_cost - old_cost;
+  return true;
+}
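Decoding the provisional mapping produced by `dfs_find_mapping` (a small standalone sketch): a non-negative entry names an existing FU to reuse, while a negative entry encodes "new FU needed for op i" as `-1 - i`, which is exactly what `try_accommodate` rewrites into fresh FU IDs.

```cpp
#include <cstdio>
#include <vector>

void print_mapping(const std::vector<int>& mapping) {
  for (size_t i = 0; i < mapping.size(); ++i) {
    if (mapping[i] >= 0)
      std::printf("op %zu -> reuse FU %d\n", i, mapping[i]);
    else
      std::printf("op %zu -> new FU (encoded op index %d)\n", i, -1 - mapping[i]);
  }
}
```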
+// Applies a mapping to the template, adding new FUs as needed.
+void HardwareTemplate::apply_mapping(const HardwarePattern& pat, const std::vector<int>& m) {
+  patterns.push_back(pat.id);
+  mapping[pat.id] = m;
+
+  // Add any new FUs that don't exist yet.
+  for (size_t i = 0; i < m.size() && i < pat.ops.size(); ++i) {
+    int fu_id = m[i];
+    // If this FU ID doesn't exist, add FUs until it does.
+    while (fu_id >= (int)fus.size()) {
+      size_t before = fus.size();
+      // Find the operation that should go in this new FU.
+      for (size_t j = 0; j < m.size(); ++j) {
+        if (m[j] == (int)fus.size()) {
+          add_fu(pat.ops[j]);
+          break;
+        }
+      }
+      // Safety: if no FU was added this round, the mapping is malformed;
+      // break to avoid an infinite loop.
+      if (fus.size() == before) break;
+    }
+  }
+}
+
+// Computes the total cost of the template based on all FUs.
+double HardwareTemplate::compute_cost(const OperationCostModel& cm) const {
+  double sum = 0;
+  for (const auto& fu : fus) {
+    sum += cm.fu_cost(fu.op_type);
+  }
+  return sum;
+}
+
+// Extracts operations from a fused op and linearizes them via topological sort.
+void extract_pattern_ops(neura::FusedOp fop, HardwarePattern& pat) {
+  Region& body = fop.getBody();
+  if (body.empty()) return;
+  Block& blk = body.front();
+
+  llvm::DenseMap<Operation*, int> op_id;
+  std::vector<std::string> op_names;
+  std::vector<std::vector<int>> preds;
+
+  int id = 0;
+  for (Operation& op : blk.getOperations()) {
+    std::string name = op.getName().getStringRef().str();
+    if (name == "neura.yield") continue;
+    op_id[&op] = id++;
+    op_names.push_back(name);
+    preds.push_back({});
+  }
+
+  int idx = 0;
+  for (Operation& op : blk.getOperations()) {
+    std::string name = op.getName().getStringRef().str();
+    if (name == "neura.yield") continue;
+    for (Value v : op.getOperands()) {
+      if (Operation* def = v.getDefiningOp()) {
+        if (op_id.count(def)) {
+          preds[idx].push_back(op_id[def]);
+        }
+      }
+    }
+    idx++;
+  }
+
+  int n = op_names.size();
+  std::vector<int> level(n, 0);
+  std::vector<int> in_deg(n, 0);
+  for (int i = 0; i < n; ++i) in_deg[i] = preds[i].size();
+
+  std::vector<int> q;
+  for (int i = 0; i < n; ++i) if (in_deg[i] == 0) q.push_back(i);
+
+  for (size_t h = 0; h < q.size(); ++h) {
+    int cur = q[h];
+    for (int i = 0; i < n; ++i) {
+      for (int p : preds[i]) {
+        if (p == cur) {
+          level[i] = std::max(level[i], level[cur] + 1);
+          if (--in_deg[i] == 0) q.push_back(i);
+        }
+      }
+    }
+  }
+
+  std::vector<int> order;
+  for (int i = 0; i < n; ++i) order.push_back(i);
+  std::sort(order.begin(), order.end(), [&](int a, int b) {
+    return level[a] < level[b];
+  });
+
+  // Build mapping from old index to new index after reordering.
+  std::vector<int> old_to_new(n);
+  for (int new_idx = 0; new_idx < n; ++new_idx) {
+    old_to_new[order[new_idx]] = new_idx;
+  }
+
+  for (int i : order) {
+    pat.ops.push_back(op_names[i]);
+    pat.op_levels.push_back(level[i]);
+
+    std::vector<int> remapped_preds;
+    for (int p : preds[i]) {
+      remapped_preds.push_back(old_to_new[p]);
+    }
+    pat.op_preds.push_back(remapped_preds);
+  }
+}
+
+// Extracts all patterns from module.
+void extract_patterns(ModuleOp module, std::vector<HardwarePattern>& patterns, OperationCostModel& cost_model) {
+  module.walk([&](neura::FusedOp fop) {
+    int64_t pid = fop.getPatternId();
+    if (std::find_if(patterns.begin(), patterns.end(),
+                     [pid](const HardwarePattern& p) { return p.id == pid; }) != patterns.end()) return;
+
+    HardwarePattern pat(pid, fop.getPatternName().str(), fop.getFrequency());
+    extract_pattern_ops(fop, pat);
+    pat.cost = cost_model.pattern_cost(pat.ops);
+    patterns.push_back(pat);
+  });
+}
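A distilled version of the level computation above, run on a toy DAG. For pattern 1 in the test below (`icmp -> grant_predicate, grant_predicate`, where both grant_predicates depend only on the icmp), the levels come out as {0, 1, 1}, which is why the two grant_predicates share one execution stage.

```cpp
#include <algorithm>
#include <vector>

// Longest-path level of each node given its predecessor lists; nodes with
// equal levels can execute in the same stage.
std::vector<int> levels(const std::vector<std::vector<int>>& preds) {
  int n = (int)preds.size();
  std::vector<int> level(n, 0), in_deg(n, 0), q;
  for (int i = 0; i < n; ++i) in_deg[i] = preds[i].size();
  for (int i = 0; i < n; ++i) if (in_deg[i] == 0) q.push_back(i);
  for (size_t h = 0; h < q.size(); ++h)
    for (int i = 0; i < n; ++i)
      for (int p : preds[i])
        if (p == q[h]) {
          level[i] = std::max(level[i], level[q[h]] + 1);
          if (--in_deg[i] == 0) q.push_back(i);
        }
  return level;
}
// levels({{}, {0}, {0}}) == {0, 1, 1}: icmp, then two parallel grant_predicates.
```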
+// Extracts all standalone operations from module.
+void extract_all_standalone_ops(ModuleOp module, std::set<std::string>& all_ops) {
+  module.walk([&](Operation* op) {
+    if (isa<neura::FusedOp>(op)) return;
+
+    Operation* parent = op->getParentOp();
+    while (parent) {
+      if (isa<neura::FusedOp>(parent)) return;
+      parent = parent->getParentOp();
+    }
+
+    std::string op_name = op->getName().getStringRef().str();
+    if (op_name.find("neura.") == 0) {
+      if (op_name == "neura.yield" || op_name == "neura.fused_op") return;
+      all_ops.insert(op_name);
+    }
+  });
+}
+
+// Creates hardware templates from patterns.
+void create_hardware_templates(const std::vector<HardwarePattern>& patterns, std::vector<HardwareTemplate>& templates, OperationCostModel& cost_model) {
+  auto count_distinct_ops = [](const HardwarePattern& p) -> int {
+    std::set<std::string> distinct_ops;
+    for (const auto& op : p.ops) {
+      distinct_ops.insert(op);
+    }
+    return distinct_ops.size();
+  };
+
+  std::vector<int> order;
+  for (size_t i = 0; i < patterns.size(); ++i) order.push_back(i);
+  std::sort(order.begin(), order.end(), [&patterns, &count_distinct_ops](int a, int b) {
+    int dist_a = count_distinct_ops(patterns[a]);
+    int dist_b = count_distinct_ops(patterns[b]);
+    if (dist_a != dist_b) {
+      return dist_a > dist_b;
+    }
+    return patterns[a].cost > patterns[b].cost;
+  });
+
+  for (int idx : order) {
+    const HardwarePattern& pat = patterns[idx];
+
+    int best_t = -1;
+    std::vector<int> best_m;
+    double best_inc = 1e18;
+
+    for (size_t t = 0; t < templates.size(); ++t) {
+      HardwareTemplate temp_tmpl = templates[t];
+      std::vector<int> m;
+      double inc;
+      if (temp_tmpl.try_accommodate(pat, cost_model, m, inc)) {
+        if (inc < best_inc) {
+          best_inc = inc;
+          best_t = t;
+          best_m = m;
+        }
+      }
+    }
+
+    double new_tmpl_cost = pat.cost;
+
+    if (best_t >= 0 && best_inc <= new_tmpl_cost * 0.5) {
+      templates[best_t].apply_mapping(pat, best_m);
+    } else {
+      // Create a new template with FUs for this pattern.
+      HardwareTemplate t(templates.size());
+      std::vector<int> m;
+      for (size_t i = 0; i < pat.ops.size(); ++i) {
+        int fu_id = t.add_fu(pat.ops[i]);
+        m.push_back(fu_id);
+      }
+      t.apply_mapping(pat, m);
+      templates.push_back(t);
+    }
+  }
+
+  for (auto& t : templates) {
+    int64_t total_freq = 0;
+    for (int64_t pid : t.patterns) {
+      for (const auto& p : patterns) {
+        if (p.id == pid) { total_freq += p.freq; break; }
+      }
+    }
+    t.instances = std::max(1, (int)std::ceil(total_freq / 10.0));
+  }
+}
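Instance sizing in the last loop above: a template's instance count is `ceil(total pattern frequency / 10)` with a floor of one. A tiny sketch with the same arithmetic (the frequencies 12 and 3 are made up for illustration):

```cpp
#include <algorithm>
#include <cmath>

int instances_for(long long total_freq) {
  return std::max(1, (int)std::ceil(total_freq / 10.0));
}
// instances_for(12 + 3) == 2; instances_for(0) == 1.
```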
+// Generates FU connections for all templates based on pattern mappings.
+// For each pattern, creates connections based on the dependency graph (op_preds).
+void generate_connections(const std::vector<HardwarePattern>& patterns, std::vector<HardwareTemplate>& templates) {
+  std::map<int64_t, const HardwarePattern*> pattern_map;
+  for (const auto& p : patterns) {
+    pattern_map[p.id] = &p;
+  }
+
+  for (auto& tmpl : templates) {
+    tmpl.connections.clear();
+
+    // For each pattern mapped to this template, generate connections based on dependencies.
+    for (const auto& [pid, fu_mapping] : tmpl.mapping) {
+      auto pat_it = pattern_map.find(pid);
+      if (pat_it == pattern_map.end()) continue;
+      const HardwarePattern* pat = pat_it->second;
+
+      if (fu_mapping.size() < 2) continue;
+
+      // Use op_preds to determine actual data dependencies.
+      if (!pat->op_preds.empty()) {
+        for (size_t op_idx = 0; op_idx < pat->op_preds.size() && op_idx < fu_mapping.size(); ++op_idx) {
+          int to_fu = fu_mapping[op_idx];
+
+          for (int pred_op_idx : pat->op_preds[op_idx]) {
+            if (pred_op_idx >= 0 && pred_op_idx < (int)fu_mapping.size()) {
+              int from_fu = fu_mapping[pred_op_idx];
+              if (from_fu >= 0 && to_fu >= 0 && from_fu != to_fu) {
+                tmpl.connections.insert({from_fu, to_fu});
+              }
+            }
+          }
+        }
+      } else {
+        // Fallback: create a linear chain if no dependency info is available.
+        for (size_t i = 0; i < fu_mapping.size() - 1; ++i) {
+          int from = fu_mapping[i];
+          int to = fu_mapping[i + 1];
+          if (from >= 0 && to >= 0 && from != to) {
+            tmpl.connections.insert({from, to});
+          }
+        }
+      }
+    }
+  }
+}
+
+// Checks if FU 'from' can reach FU 'to' through existing connections.
+static bool can_reach_via_connections(const std::set<std::pair<int, int>>& connections, int from, int to, int num_fus) {
+  if (from == to) return true;
+
+  std::vector<bool> visited(num_fus, false);
+  std::vector<int> queue;
+  queue.push_back(from);
+  visited[from] = true;
+
+  for (size_t h = 0; h < queue.size(); ++h) {
+    int cur = queue[h];
+    for (const auto& conn : connections) {
+      if (conn.first == cur && !visited[conn.second]) {
+        if (conn.second == to) return true;
+        visited[conn.second] = true;
+        queue.push_back(conn.second);
+      }
+    }
+  }
+  return false;
+}
+
+// Generates optimized FU connections for all templates based on pattern dependencies.
+// Removes redundant connections using transitive reachability.
+void generate_optimized_connections(const std::vector<HardwarePattern>& patterns, std::vector<HardwareTemplate>& templates) {
+  std::map<int64_t, const HardwarePattern*> pattern_map;
+  for (const auto& p : patterns) {
+    pattern_map[p.id] = &p;
+  }
+
+  for (auto& tmpl : templates) {
+    std::set<std::pair<int, int>> required_connections;
+
+    for (const auto& [pid, fu_mapping] : tmpl.mapping) {
+      auto pat_it = pattern_map.find(pid);
+      if (pat_it == pattern_map.end()) continue;
+      const HardwarePattern* pat = pat_it->second;
+
+      if (fu_mapping.size() < 2) continue;
+
+      if (!pat->op_preds.empty()) {
+        for (size_t op_idx = 0; op_idx < pat->op_preds.size() && op_idx < fu_mapping.size(); ++op_idx) {
+          int to_fu = fu_mapping[op_idx];
+
+          for (int pred_op_idx : pat->op_preds[op_idx]) {
+            if (pred_op_idx >= 0 && pred_op_idx < (int)fu_mapping.size()) {
+              int from_fu = fu_mapping[pred_op_idx];
+              if (from_fu >= 0 && to_fu >= 0 && from_fu != to_fu) {
+                required_connections.insert({from_fu, to_fu});
+              }
+            }
+          }
+        }
+      } else {
+        for (size_t i = 0; i < fu_mapping.size() - 1; ++i) {
+          int from = fu_mapping[i];
+          int to = fu_mapping[i + 1];
+          if (from >= 0 && to >= 0 && from != to) {
+            required_connections.insert({from, to});
+          }
+        }
+      }
+    }
+
+    // Sort connections by "distance" (prefer shorter connections first for transitive reduction).
+    std::vector<std::pair<int, int>> sorted_connections(required_connections.begin(), required_connections.end());
+    std::sort(sorted_connections.begin(), sorted_connections.end(),
+              [](const auto& a, const auto& b) {
+                return std::abs(a.second - a.first) < std::abs(b.second - b.first);
+              });
+
+    // Build optimized connections - add a connection only if it's not already reachable.
+    tmpl.connections.clear();
+    int num_fus = tmpl.fus.size();
+
+    for (const auto& conn : sorted_connections) {
+      // Check if we can already reach conn.second from conn.first via existing connections.
+      if (!can_reach_via_connections(tmpl.connections, conn.first, conn.second, num_fus)) {
+        tmpl.connections.insert(conn);
+      }
+    }
+  }
+}
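A worked example of the reduction above: suppose the mapped patterns require {0→1, 1→2, 0→2}. Sorted by |to − from|, 0→1 and 1→2 are inserted first; 0→2 is then skipped because FU 2 is already reachable from FU 0. A sketch reusing the file-static BFS helper above:

```cpp
#include <algorithm>
#include <cstdlib>
#include <set>
#include <utility>
#include <vector>

std::set<std::pair<int, int>> reduce(std::vector<std::pair<int, int>> conns, int num_fus) {
  std::sort(conns.begin(), conns.end(), [](const auto& a, const auto& b) {
    return std::abs(a.second - a.first) < std::abs(b.second - b.first);
  });
  std::set<std::pair<int, int>> kept;
  for (const auto& c : conns)
    if (!can_reach_via_connections(kept, c.first, c.second, num_fus))
      kept.insert(c);
  return kept;  // {{0,1}, {1,2}}: the redundant 0->2 edge is dropped.
}
```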
+// Generates execution plans for all patterns on their assigned templates.
+void generate_execution_plans(const std::vector<HardwarePattern>& patterns, const std::vector<HardwareTemplate>& templates, std::vector<PatternExecutionPlan>& plans) {
+  std::map<int64_t, const HardwareTemplate*> pattern_to_template;
+
+  for (const auto& t : templates) {
+    for (int64_t pid : t.patterns) {
+      pattern_to_template[pid] = &t;
+    }
+  }
+
+  for (const auto& pat : patterns) {
+    PatternExecutionPlan plan;
+    plan.pattern_id = pat.id;
+    plan.pattern_name = pat.name;
+
+    auto it = pattern_to_template.find(pat.id);
+    if (it == pattern_to_template.end()) continue;
+
+    const HardwareTemplate* tmpl = it->second;
+    auto mapping_it = tmpl->mapping.find(pat.id);
+    if (mapping_it == tmpl->mapping.end()) continue;
+
+    const std::vector<int>& fu_mapping = mapping_it->second;
+
+    // Group operations by their topological level for parallel execution.
+    std::map<int, std::vector<std::pair<int, std::pair<int, std::string>>>> level_to_ops;
+
+    for (size_t i = 0; i < pat.ops.size() && i < fu_mapping.size(); ++i) {
+      int level = (i < pat.op_levels.size()) ? pat.op_levels[i] : (int)i;
+      int fu = fu_mapping[i];
+      level_to_ops[level].push_back({(int)i, {fu, pat.ops[i]}});
+    }
+
+    for (const auto& [level, ops_at_level] : level_to_ops) {
+      ExecutionStage stage;
+      for (const auto& [op_idx, fu_and_op] : ops_at_level) {
+        stage.fus.push_back(fu_and_op.first);
+        stage.ops.push_back(fu_and_op.second);
+      }
+      plan.stages.push_back(stage);
+    }
+
+    plans.push_back(plan);
+  }
+}
+
+// Collects supported operations for each template.
+void collect_supported_operations(const std::vector<HardwarePattern>& patterns, const std::vector<HardwareTemplate>& templates, const std::set<std::string>& all_dfg_ops, std::vector<TemplateSupportedOps>& supported_ops) {
+  for (const auto& tmpl : templates) {
+    TemplateSupportedOps ops;
+    ops.template_id = tmpl.id;
+
+    // Collect all operation types present in this template's FUs.
+    std::set<std::string> template_ops;
+    for (const auto& fu : tmpl.fus) {
+      template_ops.insert(fu.op_type);
+    }
+
+    // For each DFG op, check if this template can support it.
+    for (const std::string& dfg_op : all_dfg_ops) {
+      bool can_support = false;
+
+      // Direct support: the template has an FU for this op.
+      if (template_ops.count(dfg_op)) {
+        can_support = true;
+      } else {
+        // Check if any existing FU type is compatible.
+        for (const auto& existing_op : template_ops) {
+          if (HardwareTemplate::compatible(existing_op, dfg_op)) {
+            can_support = true;
+            break;
+          }
+        }
+      }
+
+      if (can_support) {
+        ops.single_ops.insert(dfg_op);
+      }
+    }
+
+    ops.composite_ops = tmpl.patterns;
+
+    supported_ops.push_back(ops);
+  }
+}
+
+// Calculates total cost of templates.
+double calculate_total_cost(const std::vector<HardwareTemplate>& templates, const OperationCostModel& cost_model) {
+  double total_cost = 0;
+  for (const auto& t : templates) {
+    total_cost += t.compute_cost(cost_model) * t.instances;
+  }
+  return total_cost;
+}
+
+// Escapes string for JSON output.
+std::string escape_json_string(const std::string& s) {
+  std::string r;
+  for (char c : s) {
+    if (c == '"') r += "\\\"";
+    else if (c == '\\') r += "\\\\";
+    else r += c;
+  }
+  return r;
+}
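Cost roll-up example: each template contributes (sum of its FU costs) × instance count. For a template shaped like the first one in the test below, the default cost model gives (5 + 20 + 50 + 3) × 2 = 156 — a sketch, with the FU set and instance count taken from that test:

```cpp
#include "NeuraDialect/Transforms/GraphMining/HardwareTemplate.h"

using namespace mlir::neura;

double example_total_cost() {
  OperationCostModel cm;
  HardwareTemplate t(0);
  t.add_fu("neura.phi_start");        // 5.0 (not in the table, default)
  t.add_fu("neura.gep");              // 20.0
  t.add_fu("neura.load");             // 50.0
  t.add_fu("neura.grant_predicate");  // 3.0
  t.instances = 2;
  return calculate_total_cost({t}, cm);  // (5 + 20 + 50 + 3) * 2 = 156
}
```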
+// Writes hardware configuration to JSON file (extended version with execution plans and supported ops).
+void write_hardware_config_json(const std::string& path,
+                                const std::vector<HardwarePattern>& patterns,
+                                const std::vector<HardwareTemplate>& templates,
+                                const OperationCostModel& cost_model,
+                                const std::vector<PatternExecutionPlan>& execution_plans,
+                                const std::vector<TemplateSupportedOps>& supported_ops) {
+  std::error_code EC;
+  llvm::raw_fd_ostream os(path, EC, llvm::sys::fs::OF_Text);
+  if (EC) return;
+
+  // Build pattern name lookup.
+  std::map<int64_t, std::string> pattern_names;
+  for (const auto& p : patterns) pattern_names[p.id] = p.name;
+
+  os << "{\n  \"hardware_configuration\": {\n";
+  os << "    \"summary\": {\n";
+  os << "      \"total_templates\": " << templates.size() << "\n";
+  os << "    },\n";
+
+  os << "    \"hardware_templates\": [\n";
+  for (size_t t = 0; t < templates.size(); ++t) {
+    const auto& tmpl = templates[t];
+    if (t) os << ",\n";
+
+    os << "      {\n";
+    os << "        \"template_id\": " << tmpl.id << ",\n";
+    os << "        \"instance_count\": " << tmpl.instances << ",\n";
+
+    const TemplateSupportedOps* tmpl_supported_ops = nullptr;
+    for (const auto& sop : supported_ops) {
+      if (sop.template_id == tmpl.id) {
+        tmpl_supported_ops = &sop;
+        break;
+      }
+    }
+
+    if (tmpl_supported_ops) {
+      os << "        \"supported_single_ops\": [";
+      bool first = true;
+      for (const auto& op : tmpl_supported_ops->single_ops) {
+        if (!first) os << ", ";
+        first = false;
+        os << "\"" << op << "\"";
+      }
+      os << "],\n";
+
+      os << "        \"supported_composite_ops\": [\n";
+      for (size_t i = 0; i < tmpl_supported_ops->composite_ops.size(); ++i) {
+        if (i) os << ",\n";
+        int64_t pid = tmpl_supported_ops->composite_ops[i];
+        auto name_it = pattern_names.find(pid);
+        std::string pname = (name_it != pattern_names.end()) ? name_it->second : "";
+        os << "          {\"pattern_id\": " << pid << ", \"name\": \"" << escape_json_string(pname) << "\"}";
+      }
+      os << "\n        ],\n";
+    }
+
+    // Output FUs (functional units) instead of slots.
+    os << "        \"functional_units\": [\n";
+    for (size_t f = 0; f < tmpl.fus.size(); ++f) {
+      const auto& fu = tmpl.fus[f];
+      if (f) os << ",\n";
+      os << "          {\"fu_id\": " << fu.id << ", \"op_type\": \"" << fu.op_type << "\"}";
+    }
+    os << "\n        ],\n";
+
+    // Output FU connections.
+    os << "        \"fu_connections\": [\n";
+    bool first_conn = true;
+    for (const auto& conn : tmpl.connections) {
+      if (!first_conn) os << ",\n";
+      first_conn = false;
+      os << "          {\"from_fu\": " << conn.first << ", \"to_fu\": " << conn.second << "}";
+    }
+    os << "\n        ],\n";
+
+    os << "        \"pattern_execution_plans\": [\n";
+    bool first_plan = true;
+    for (const auto& plan : execution_plans) {
+      auto mapping_it = tmpl.mapping.find(plan.pattern_id);
+      if (mapping_it == tmpl.mapping.end()) continue;
+
+      if (!first_plan) os << ",\n";
+      first_plan = false;
+
+      os << "          {\n";
+      os << "            \"pattern_id\": " << plan.pattern_id << ",\n";
+      os << "            \"pattern_name\": \"" << escape_json_string(plan.pattern_name) << "\",\n";
+      os << "            \"fu_mapping\": [";
+      const auto& m = mapping_it->second;
+      for (size_t i = 0; i < m.size(); ++i) {
+        if (i) os << ", ";
+        os << m[i];
+      }
+      os << "],\n";
+      os << "            \"execution_stages\": [\n";
+      for (size_t stage_idx = 0; stage_idx < plan.stages.size(); ++stage_idx) {
+        const auto& stage = plan.stages[stage_idx];
+        if (stage_idx) os << ",\n";
+        os << "              {\n";
+        os << "                \"stage\": " << stage_idx << ",\n";
+        os << "                \"parallel_fus\": [";
+        for (size_t i = 0; i < stage.fus.size(); ++i) {
+          if (i) os << ", ";
+          os << stage.fus[i];
+        }
+        os << "],\n";
+        os << "                \"parallel_ops\": [";
+        for (size_t i = 0; i < stage.ops.size(); ++i) {
+          if (i) os << ", ";
+          os << "\"" << stage.ops[i] << "\"";
+        }
+        os << "]\n";
+        os << "              }";
+      }
+      os << "\n            ]\n";
+      os << "          }";
+    }
+    os << "\n        ]\n";
+    os << "      }";
+  }
+  os << "\n    ]\n";
+
+  os << "  }\n}\n";
+}
}"; + } + os << "\n ]\n"; + os << " }"; + } + os << "\n ]\n"; + + os << " }\n}\n"; +} + +// Legacy version for backward compatibility. +void write_hardware_config_json(const std::string& path, const std::vector& patterns, const std::vector& templates, const OperationCostModel& cost_model) { + std::vector plans; + generate_execution_plans(patterns, templates, plans); + + std::set all_dfg_ops; + for (const auto& p : patterns) { + for (const auto& op : p.ops) { + all_dfg_ops.insert(op); + } + } + + std::vector supported_ops; + collect_supported_operations(patterns, templates, all_dfg_ops, supported_ops); + + write_hardware_config_json(path, patterns, templates, cost_model, plans, supported_ops); +} + +} // namespace mlir::neura + diff --git a/lib/NeuraDialect/Transforms/HardwareMergePass.cpp b/lib/NeuraDialect/Transforms/HardwareMergePass.cpp new file mode 100644 index 00000000..56e610bb --- /dev/null +++ b/lib/NeuraDialect/Transforms/HardwareMergePass.cpp @@ -0,0 +1,79 @@ +//===- HardwareMergePass.cpp - Hardware Template Merging Pass -------------===// +// +// This pass maximizes pattern coverage with minimum hardware cost by merging +// patterns into shared hardware templates. It uses a Functional Unit (FU) +// based design where each FU executes exactly one operation type. The pass +// generates optimized FU connections with transitive reduction and creates +// execution plans for parallel execution stages. +// +//===----------------------------------------------------------------------===// + +#include "NeuraDialect/Transforms/GraphMining/HardwareTemplate.h" +#include "mlir/Pass/Pass.h" +#include +#include + +using namespace mlir; +using namespace mlir::neura; + +#define GEN_PASS_DEF_HARDWAREMERGE +#include "NeuraDialect/NeuraPasses.h.inc" + +namespace { + +struct HardwareMergePass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(HardwareMergePass) + + HardwareMergePass() = default; + HardwareMergePass(const HardwareMergePass &pass) + : PassWrapper>(pass) {} + + StringRef getArgument() const override { return "hardware-merge"; } + StringRef getDescription() const override { + return "Merges hardware templates with bypass support."; + } + + Option outputFile{*this, "output", + llvm::cl::desc("Output JSON file path"), llvm::cl::init("hardware_config.json")}; + + // Runs the hardware merge pass on the module. + void runOnOperation() override { + std::vector patterns; + std::vector templates; + OperationCostModel cost_model; + + extract_patterns(getOperation(), patterns, cost_model); + + std::set all_standalone_ops; + extract_all_standalone_ops(getOperation(), all_standalone_ops); + + create_hardware_templates(patterns, templates, cost_model); + + generate_optimized_connections(patterns, templates); + + std::vector execution_plans; + generate_execution_plans(patterns, templates, execution_plans); + + std::set all_dfg_ops = all_standalone_ops; + for (const auto& p : patterns) { + for (const auto& op : p.ops) { + all_dfg_ops.insert(op); + } + } + + std::vector supported_ops; + collect_supported_operations(patterns, templates, all_dfg_ops, supported_ops); + + write_hardware_config_json(outputFile.getValue(), patterns, templates, cost_model, execution_plans, supported_ops); + } +}; + +} // namespace + +namespace mlir::neura { +// Creates an instance of the hardware merge pass. 
+std::unique_ptr<Pass> createHardwareMergePass() {
+  return std::make_unique<HardwareMergePass>();
+}
+} // namespace mlir::neura
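Registering and running the pass from C++ — a sketch using the standard MLIR PassManager API (the test below drives it through `mlir-neura-opt` instead; with no `output` option set, the JSON lands in the default `hardware_config.json`):

```cpp
#include "NeuraDialect/NeuraPasses.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"

mlir::LogicalResult run_pass(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  pm.addPass(mlir::neura::createHardwareMergePass());
  return pm.run(module);
}
```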
["neura.phi_start"] +// CHECK-HARDWARE-MERGE: }, +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "stage": 1, +// CHECK-HARDWARE-MERGE: "parallel_fus": [1], +// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.gep"] +// CHECK-HARDWARE-MERGE: }, +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "stage": 2, +// CHECK-HARDWARE-MERGE: "parallel_fus": [2], +// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.load"] +// CHECK-HARDWARE-MERGE: } +// CHECK-HARDWARE-MERGE: ] +// CHECK-HARDWARE-MERGE: }, +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "pattern_id": 0, +// CHECK-HARDWARE-MERGE: "pattern_name": "gep->load", +// CHECK-HARDWARE-MERGE: "fu_mapping": [1, 2], +// CHECK-HARDWARE-MERGE: "execution_stages": [ +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "stage": 0, +// CHECK-HARDWARE-MERGE: "parallel_fus": [1], +// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.gep"] +// CHECK-HARDWARE-MERGE: }, +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "stage": 1, +// CHECK-HARDWARE-MERGE: "parallel_fus": [2], +// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.load"] +// CHECK-HARDWARE-MERGE: } +// CHECK-HARDWARE-MERGE: ] +// CHECK-HARDWARE-MERGE: }, +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "pattern_id": 11, +// CHECK-HARDWARE-MERGE: "pattern_name": "phi_start->grant_predicate", +// CHECK-HARDWARE-MERGE: "fu_mapping": [0, 3], +// CHECK-HARDWARE-MERGE: "execution_stages": [ +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "stage": 0, +// CHECK-HARDWARE-MERGE: "parallel_fus": [0], +// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.phi_start"] +// CHECK-HARDWARE-MERGE: }, +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "stage": 1, +// CHECK-HARDWARE-MERGE: "parallel_fus": [3], +// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.grant_predicate"] +// CHECK-HARDWARE-MERGE: } +// CHECK-HARDWARE-MERGE: ] +// CHECK-HARDWARE-MERGE: } +// CHECK-HARDWARE-MERGE: ] +// CHECK-HARDWARE-MERGE: }, +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "template_id": 1, +// CHECK-HARDWARE-MERGE: "instance_count": 3, +// CHECK-HARDWARE-MERGE: "supported_single_ops": ["neura.grant_once", "neura.grant_predicate", "neura.icmp"], +// CHECK-HARDWARE-MERGE: "supported_composite_ops": [ +// CHECK-HARDWARE-MERGE: {"pattern_id": 1, "name": "fused_op:icmp->grant_predicate->grant_predicate"}, +// CHECK-HARDWARE-MERGE: {"pattern_id": 3, "name": "icmp->grant_predicate"}, +// CHECK-HARDWARE-MERGE: {"pattern_id": 2, "name": "grant_predicate->grant_predicate"} +// CHECK-HARDWARE-MERGE: ], +// CHECK-HARDWARE-MERGE: "functional_units": [ +// CHECK-HARDWARE-MERGE: {"fu_id": 0, "op_type": "neura.icmp"}, +// CHECK-HARDWARE-MERGE: {"fu_id": 1, "op_type": "neura.grant_predicate"}, +// CHECK-HARDWARE-MERGE: {"fu_id": 2, "op_type": "neura.grant_predicate"} +// CHECK-HARDWARE-MERGE: ], +// CHECK-HARDWARE-MERGE: "fu_connections": [ +// CHECK-HARDWARE-MERGE: {"from_fu": 0, "to_fu": 1}, +// CHECK-HARDWARE-MERGE: {"from_fu": 1, "to_fu": 2} +// CHECK-HARDWARE-MERGE: ], +// CHECK-HARDWARE-MERGE: "pattern_execution_plans": [ +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "pattern_id": 1, +// CHECK-HARDWARE-MERGE: "pattern_name": "fused_op:icmp->grant_predicate->grant_predicate", +// CHECK-HARDWARE-MERGE: "fu_mapping": [0, 1, 2], +// CHECK-HARDWARE-MERGE: "execution_stages": [ +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "stage": 0, +// CHECK-HARDWARE-MERGE: "parallel_fus": [0], +// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.icmp"] +// CHECK-HARDWARE-MERGE: }, +// CHECK-HARDWARE-MERGE: { +// 
CHECK-HARDWARE-MERGE: "stage": 1, +// CHECK-HARDWARE-MERGE: "parallel_fus": [1, 2], +// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.grant_predicate", "neura.grant_predicate"] +// CHECK-HARDWARE-MERGE: } +// CHECK-HARDWARE-MERGE: ] +// CHECK-HARDWARE-MERGE: }, +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "pattern_id": 3, +// CHECK-HARDWARE-MERGE: "pattern_name": "icmp->grant_predicate", +// CHECK-HARDWARE-MERGE: "fu_mapping": [0, 1], +// CHECK-HARDWARE-MERGE: "execution_stages": [ +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "stage": 0, +// CHECK-HARDWARE-MERGE: "parallel_fus": [0], +// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.icmp"] +// CHECK-HARDWARE-MERGE: }, +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "stage": 1, +// CHECK-HARDWARE-MERGE: "parallel_fus": [1], +// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.grant_predicate"] +// CHECK-HARDWARE-MERGE: } +// CHECK-HARDWARE-MERGE: ] +// CHECK-HARDWARE-MERGE: }, +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "pattern_id": 2, +// CHECK-HARDWARE-MERGE: "pattern_name": "grant_predicate->grant_predicate", +// CHECK-HARDWARE-MERGE: "fu_mapping": [1, 2], +// CHECK-HARDWARE-MERGE: "execution_stages": [ +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "stage": 0, +// CHECK-HARDWARE-MERGE: "parallel_fus": [1], +// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.grant_predicate"] +// CHECK-HARDWARE-MERGE: }, +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "stage": 1, +// CHECK-HARDWARE-MERGE: "parallel_fus": [2], +// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.grant_predicate"] +// CHECK-HARDWARE-MERGE: } +// CHECK-HARDWARE-MERGE: ] +// CHECK-HARDWARE-MERGE: } +// CHECK-HARDWARE-MERGE: ] +// CHECK-HARDWARE-MERGE: }, +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "template_id": 2, +// CHECK-HARDWARE-MERGE: "instance_count": 2, +// CHECK-HARDWARE-MERGE: "supported_single_ops": ["neura.grant_once", "neura.grant_predicate", "neura.phi_start"], +// CHECK-HARDWARE-MERGE: "supported_composite_ops": [ +// CHECK-HARDWARE-MERGE: {"pattern_id": 4, "name": "grant_once->fused_op:phi_start->phi_start"}, +// CHECK-HARDWARE-MERGE: {"pattern_id": 9, "name": "grant_once->phi_start"}, +// CHECK-HARDWARE-MERGE: {"pattern_id": 8, "name": "phi_start->phi_start"} +// CHECK-HARDWARE-MERGE: ], +// CHECK-HARDWARE-MERGE: "functional_units": [ +// CHECK-HARDWARE-MERGE: {"fu_id": 0, "op_type": "neura.grant_once"}, +// CHECK-HARDWARE-MERGE: {"fu_id": 1, "op_type": "neura.phi_start"}, +// CHECK-HARDWARE-MERGE: {"fu_id": 2, "op_type": "neura.phi_start"} +// CHECK-HARDWARE-MERGE: ], +// CHECK-HARDWARE-MERGE: "fu_connections": [ +// CHECK-HARDWARE-MERGE: {"from_fu": 0, "to_fu": 1}, +// CHECK-HARDWARE-MERGE: {"from_fu": 1, "to_fu": 2} +// CHECK-HARDWARE-MERGE: ], +// CHECK-HARDWARE-MERGE: "pattern_execution_plans": [ +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "pattern_id": 9, +// CHECK-HARDWARE-MERGE: "pattern_name": "grant_once->phi_start", +// CHECK-HARDWARE-MERGE: "fu_mapping": [0, 1], +// CHECK-HARDWARE-MERGE: "execution_stages": [ +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "stage": 0, +// CHECK-HARDWARE-MERGE: "parallel_fus": [0], +// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.grant_once"] +// CHECK-HARDWARE-MERGE: }, +// CHECK-HARDWARE-MERGE: { +// CHECK-HARDWARE-MERGE: "stage": 1, +// CHECK-HARDWARE-MERGE: "parallel_fus": [1], +// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.phi_start"] +// CHECK-HARDWARE-MERGE: } +// CHECK-HARDWARE-MERGE: ] +// CHECK-HARDWARE-MERGE: }, +// 
+// CHECK-HARDWARE-MERGE: {
+// CHECK-HARDWARE-MERGE: "pattern_id": 4,
+// CHECK-HARDWARE-MERGE: "pattern_name": "grant_once->fused_op:phi_start->phi_start",
+// CHECK-HARDWARE-MERGE: "fu_mapping": [0, 1, 2],
+// CHECK-HARDWARE-MERGE: "execution_stages": [
+// CHECK-HARDWARE-MERGE: {
+// CHECK-HARDWARE-MERGE: "stage": 0,
+// CHECK-HARDWARE-MERGE: "parallel_fus": [0],
+// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.grant_once"]
+// CHECK-HARDWARE-MERGE: },
+// CHECK-HARDWARE-MERGE: {
+// CHECK-HARDWARE-MERGE: "stage": 1,
+// CHECK-HARDWARE-MERGE: "parallel_fus": [1],
+// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.phi_start"]
+// CHECK-HARDWARE-MERGE: },
+// CHECK-HARDWARE-MERGE: {
+// CHECK-HARDWARE-MERGE: "stage": 2,
+// CHECK-HARDWARE-MERGE: "parallel_fus": [2],
+// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.phi_start"]
+// CHECK-HARDWARE-MERGE: }
+// CHECK-HARDWARE-MERGE: ]
+// CHECK-HARDWARE-MERGE: },
+// CHECK-HARDWARE-MERGE: {
+// CHECK-HARDWARE-MERGE: "pattern_id": 8,
+// CHECK-HARDWARE-MERGE: "pattern_name": "phi_start->phi_start",
+// CHECK-HARDWARE-MERGE: "fu_mapping": [1, 2],
+// CHECK-HARDWARE-MERGE: "execution_stages": [
+// CHECK-HARDWARE-MERGE: {
+// CHECK-HARDWARE-MERGE: "stage": 0,
+// CHECK-HARDWARE-MERGE: "parallel_fus": [1],
+// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.phi_start"]
+// CHECK-HARDWARE-MERGE: },
+// CHECK-HARDWARE-MERGE: {
+// CHECK-HARDWARE-MERGE: "stage": 1,
+// CHECK-HARDWARE-MERGE: "parallel_fus": [2],
+// CHECK-HARDWARE-MERGE: "parallel_ops": ["neura.phi_start"]
+// CHECK-HARDWARE-MERGE: }
+// CHECK-HARDWARE-MERGE: ]
+// CHECK-HARDWARE-MERGE: }
+// CHECK-HARDWARE-MERGE: ]
+// CHECK-HARDWARE-MERGE: }
+// CHECK-HARDWARE-MERGE: ]
+// CHECK-HARDWARE-MERGE: }
+// CHECK-HARDWARE-MERGE: }