Skip to content

Commit cd2ae13

Browse files
authored
Merge pull request #244 from HobbitQia/hardware_merge
[feat] FU-level fusion
2 parents 21bd419 + c586140 commit cd2ae13

7 files changed

Lines changed: 1399 additions & 1 deletion

File tree

include/NeuraDialect/NeuraPasses.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ std::unique_ptr<mlir::Pass> createCanonicalizeCastPass();
4747
std::unique_ptr<mlir::Pass> createIterMergePatternPass();
4848
std::unique_ptr<mlir::Pass> createInitPatternPass();
4949

50+
// Hardware optimization passes
51+
std::unique_ptr<mlir::Pass> createHardwareMergePass();
52+
5053
#define GEN_PASS_REGISTRATION
5154
#include "NeuraDialect/NeuraPasses.h.inc"
5255

include/NeuraDialect/NeuraPasses.td

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,4 +174,24 @@ def WrapLoopInKernelPass : Pass<"wrap-loop-in-kernel", "func::FuncOp">{
174174
}];
175175
let constructor = "neura::createWrapLoopInKernelPass()";
176176
}
177+
178+
def HardwareMerge : Pass<"hardware-merge", "ModuleOp"> {
179+
let summary = "Merge and optimize hardware units for pattern execution";
180+
let description = [{
181+
This pass analyzes patterns (fused_op regions) and designs an optimal
182+
hardware configuration that supports all patterns while minimizing
183+
hardware cost. It uses a Functional Unit (FU) based design where each
184+
FU executes exactly one operation type.
185+
186+
Algorithm:
187+
1. Extract pattern DAGs with topological structure from fused_op regions
188+
2. Sort patterns by complexity (distinct operation count and cost)
189+
3. For each pattern, try to accommodate it into existing templates by reusing FUs with matching operation types
190+
4. If accommodation cost is too high, create a new template with dedicated FUs for the pattern
191+
5. Generate FU connections based on pattern dependencies with transitive reduction optimization
192+
6. Generate execution plans with parallel execution stages
193+
7. Output the final hardware configuration as a JSON file
194+
}];
195+
let constructor = "neura::createHardwareMergePass()";
196+
}
177197
#endif // NEURA_PASSES_TD
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
//===- HardwareTemplate.h - Hardware Template Data Structures and Helpers -===//
2+
//
3+
// This file contains declarations for hardware template data structures and
4+
// helper functions for hardware template merging.
5+
//
6+
// The hardware template system maximizes pattern coverage while minimizing
7+
// hardware cost through resource sharing. Key concepts:
8+
//
9+
// - FunctionalUnit (FU): A single hardware unit that executes one operation type
10+
// - HardwareTemplate: A collection of FUs with connections supporting multiple patterns
11+
// - HardwarePattern: A sequence of operations mapped to template FUs
12+
//
13+
// For detailed documentation with examples and diagrams, see:
14+
// docs/HardwareTemplateGuide.md
15+
//
16+
//===----------------------------------------------------------------------===//
17+
18+
#ifndef NEURA_DIALECT_TRANSFORMS_GRAPHMINING_HARDWARETEMPLATE_H
19+
#define NEURA_DIALECT_TRANSFORMS_GRAPHMINING_HARDWARETEMPLATE_H
20+
21+
#include "mlir/IR/Operation.h"
22+
#include "mlir/IR/BuiltinOps.h"
23+
#include <vector>
24+
#include <string>
25+
#include <set>
26+
#include <map>
27+
#include <cstdint>
28+
#include <utility>
29+
30+
namespace mlir {
31+
namespace neura {
32+
class FusedOp;
33+
}
34+
}
35+
36+
namespace mlir::neura {
37+
38+
// Forward declarations
39+
struct HardwarePattern {
40+
int64_t id;
41+
std::string name;
42+
int64_t freq;
43+
std::vector<std::string> ops;
44+
std::vector<int> op_levels; // Topological level for each op (ops at same level can run in parallel)
45+
std::vector<std::vector<int>> op_preds; // Predecessors for each op (dependency graph)
46+
double cost;
47+
48+
HardwarePattern(int64_t i, const std::string& n, int64_t f);
49+
};
50+
51+
//===----------------------------------------------------------------------===//
52+
// FunctionalUnit (FU) - A single hardware execution unit
53+
//===----------------------------------------------------------------------===//
54+
//
55+
// A FunctionalUnit represents a single hardware unit that can execute exactly
56+
// one type of operation (e.g., adder, multiplier, load unit).
57+
//
58+
// Key Properties:
59+
// ---------------
60+
// 1. SINGLE OPERATION TYPE: Each FU executes exactly one operation type.
61+
// For example, an "adder" FU only executes neura.add operations.
62+
//
63+
// 2. MULTIPLE INSTANCES: A template can have multiple FUs of the same type.
64+
// For example, two adders to support patterns needing parallel additions.
65+
//
66+
// 3. DIRECT CONNECTIONS: FUs are connected directly to each other, forming
67+
// a dataflow graph within the template.
68+
//
69+
// Example:
70+
// --------
71+
// Consider a template supporting pattern: gep -> load -> add -> store
72+
//
73+
// Template structure:
74+
// ┌─────┐ ┌──────┐ ┌─────┐ ┌───────┐
75+
// │ gep │ --> │ load │ --> │ add │ --> │ store │
76+
// │FU 0 │ │ FU 1 │ │FU 2 │ │ FU 3 │
77+
// └─────┘ └──────┘ └─────┘ └───────┘
78+
//
79+
// For patterns with parallel operations (e.g., add + mul -> store):
80+
// ┌─────┐
81+
// │ add │ ──┐
82+
// │FU 0 │ │ ┌───────┐
83+
// └─────┘ ├───> │ store │
84+
// ┌─────┐ │ │ FU 2 │
85+
// │ mul │ ──┘ └───────┘
86+
// │FU 1 │
87+
// └─────┘
88+
//
89+
//===----------------------------------------------------------------------===//
90+
struct FunctionalUnit {
91+
int id; // Unique FU ID within the template
92+
std::string op_type; // Operation type this FU executes (e.g., "neura.add")
93+
94+
FunctionalUnit(int i, const std::string& op);
95+
};
96+
97+
// Execution stage for a pattern - contains FU indices that can execute in parallel.
98+
struct ExecutionStage {
99+
std::vector<int> fus; // FUs that execute in this stage (parallel)
100+
std::vector<std::string> ops; // Corresponding operations
101+
};
102+
103+
// Execution plan for a pattern on a hardware template.
104+
struct PatternExecutionPlan {
105+
int64_t pattern_id;
106+
std::string pattern_name;
107+
std::vector<ExecutionStage> stages; // Ordered stages of execution
108+
};
109+
110+
// Operations supported by a hardware template.
111+
struct TemplateSupportedOps {
112+
int template_id;
113+
std::set<std::string> single_ops; // Individual ops this template can support
114+
std::vector<int64_t> composite_ops; // Pattern IDs (composite operations)
115+
};
116+
117+
class OperationCostModel {
118+
public:
119+
OperationCostModel();
120+
double get(const std::string& op) const;
121+
double fu_cost(const std::string& op) const;
122+
double pattern_cost(const std::vector<std::string>& ops) const;
123+
private:
124+
std::map<std::string, double> costs;
125+
};
126+
127+
//===----------------------------------------------------------------------===//
128+
// HardwareTemplate - A collection of FUs forming a reusable hardware block
129+
//===----------------------------------------------------------------------===//
130+
//
131+
// A HardwareTemplate contains multiple FunctionalUnits connected together.
132+
// Multiple patterns can be mapped to the same template by reusing FUs.
133+
//
134+
// Key differences from the old slot-based design:
135+
// - Each FU has exactly one operation type (no multiplexing within FU)
136+
// - Template can have multiple FUs of the same type
137+
// - Connections are between specific FU IDs, not abstract slot positions
138+
//
139+
//===----------------------------------------------------------------------===//
140+
struct HardwareTemplate {
141+
int id;
142+
std::vector<FunctionalUnit> fus; // All FUs in this template
143+
std::vector<int64_t> patterns; // Pattern IDs mapped to this template
144+
std::map<int64_t, std::vector<int>> mapping; // pattern_id -> FU id sequence
145+
std::set<std::pair<int, int>> connections; // FU connections: (from_fu_id, to_fu_id)
146+
int instances;
147+
148+
HardwareTemplate(int i);
149+
150+
// Adds a new FU with the given operation type, returns its ID.
151+
int add_fu(const std::string& op_type);
152+
153+
// Finds an existing FU that can handle the operation, or -1 if none available.
154+
int find_available_fu(const std::string& op_type, const std::set<int>& used_fus) const;
155+
156+
// Finds a mapping for a pattern into the existing template.
157+
// Returns FU IDs for each operation, or empty if no valid mapping exists.
158+
std::vector<int> find_mapping(const HardwarePattern& pat) const;
159+
160+
// Tries to accommodate a pattern, potentially adding new FUs.
161+
// Returns true if successful, with the mapping and cost increase.
162+
bool try_accommodate(const HardwarePattern& pat, const OperationCostModel& cm,
163+
std::vector<int>& out_mapping, double& out_cost_increase);
164+
165+
// Applies a mapping to the template.
166+
void apply_mapping(const HardwarePattern& pat, const std::vector<int>& m);
167+
168+
// Computes the total cost of the template.
169+
double compute_cost(const OperationCostModel& cm) const;
170+
171+
// Checks if two operations are compatible (can potentially share resources in future).
172+
static bool compatible(const std::string& a, const std::string& b);
173+
174+
private:
175+
// DFS helper for finding mappings.
176+
void dfs_find_mapping(const HardwarePattern& pat, size_t op_idx,
177+
std::vector<int>& cur_mapping, std::set<int>& used_fus,
178+
std::vector<int>& best_mapping, int& best_reuse_count) const;
179+
};
180+
181+
// Extracts all patterns from module.
182+
void extract_patterns(ModuleOp module, std::vector<HardwarePattern>& patterns, OperationCostModel& cost_model);
183+
184+
// Extracts all standalone operations from module (ops not inside FusedOp).
185+
void extract_all_standalone_ops(ModuleOp module, std::set<std::string>& all_ops);
186+
187+
// Creates hardware templates from patterns.
188+
void create_hardware_templates(const std::vector<HardwarePattern>& patterns, std::vector<HardwareTemplate>& templates, OperationCostModel& cost_model);
189+
190+
// Generates FU connections for all templates based on pattern dependencies.
191+
void generate_connections(const std::vector<HardwarePattern>& patterns, std::vector<HardwareTemplate>& templates);
192+
193+
// Generates optimized FU connections (removes redundant connections using transitive reachability).
194+
void generate_optimized_connections(const std::vector<HardwarePattern>& patterns, std::vector<HardwareTemplate>& templates);
195+
196+
// Generates execution plans for all patterns on their assigned templates.
197+
void generate_execution_plans(const std::vector<HardwarePattern>& patterns,
198+
const std::vector<HardwareTemplate>& templates,
199+
std::vector<PatternExecutionPlan>& plans);
200+
201+
// Collects supported operations (single + composite) for each template.
202+
void collect_supported_operations(const std::vector<HardwarePattern>& patterns,
203+
const std::vector<HardwareTemplate>& templates,
204+
const std::set<std::string>& all_dfg_ops,
205+
std::vector<TemplateSupportedOps>& supported_ops);
206+
207+
// Calculates total cost of templates.
208+
double calculate_total_cost(const std::vector<HardwareTemplate>& templates, const OperationCostModel& cost_model);
209+
210+
// Writes hardware configuration to JSON file (extended version with execution plans and supported ops).
211+
void write_hardware_config_json(const std::string& path,
212+
const std::vector<HardwarePattern>& patterns,
213+
const std::vector<HardwareTemplate>& templates,
214+
const OperationCostModel& cost_model,
215+
const std::vector<PatternExecutionPlan>& execution_plans,
216+
const std::vector<TemplateSupportedOps>& supported_ops);
217+
218+
// Legacy version for backward compatibility.
219+
void write_hardware_config_json(const std::string& path, const std::vector<HardwarePattern>& patterns, const std::vector<HardwareTemplate>& templates, const OperationCostModel& cost_model);
220+
221+
} // namespace mlir::neura
222+
223+
#endif // NEURA_DIALECT_TRANSFORMS_GRAPHMINING_HARDWARETEMPLATE_H
224+

lib/NeuraDialect/Transforms/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ add_mlir_library(
1717
IterMergePatternPass.cpp
1818
TransformToSteerControlPass.cpp
1919
RemovePredicatedTypePass.cpp
20+
HardwareMergePass.cpp
21+
GraphMining/HardwareTemplate.cpp
2022
WrapLoopInKernelPass.cpp
2123

2224
DEPENDS

0 commit comments

Comments
 (0)