diff --git a/include/Conversion/ConversionPasses.h b/include/Conversion/ConversionPasses.h index f17b9fcb..550e6092 100644 --- a/include/Conversion/ConversionPasses.h +++ b/include/Conversion/ConversionPasses.h @@ -21,7 +21,7 @@ std::unique_ptr createLowerBuiltinToNeuraPass(); std::unique_ptr createLowerAffineToNeuraPass(); // TaskFlow Conversion Passes. -std::unique_ptr createConvertLinalgToTaskflowPass(); +std::unique_ptr createConvertAffineToTaskflowPass(); #define GEN_PASS_REGISTRATION #include "Conversion/ConversionPasses.h.inc" diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td index e1e477ca..a341d9fe 100644 --- a/include/Conversion/ConversionPasses.td +++ b/include/Conversion/ConversionPasses.td @@ -47,16 +47,29 @@ def LowerAffineToNeura : Pass<"lower-affine-to-neura", "func::FuncOp">{ //=========================================================// // TaskFlow Conversion Passes. //=========================================================// -def ConvertLinalgToTaskflow : Pass<"convert-linalg-to-taskflow", "ModuleOp">{ - let summary = "Convert Linalg operations to Taskflow dialect"; + +def ConvertAffineToTaskflow : Pass<"convert-affine-to-taskflow", "ModuleOp">{ + let summary = "Convert top-level affine.for operations to Taskflow dialect"; let description = [{ - Extracts compute-intensive linalg operations and wraps them into a Taskflow graph for spatial architecture execution. + This pass converts top-level affine.for loops in a function into + taskflow.task operations within a taskflow.graph. Each top-level loop + becomes a separate task, and data dependencies between tasks are made + explicit through taskflow.channel operations. + + The pass: + 1. Identifies all top-level affine.for operations + 2. Analyzes data dependencies (RAW, WAR, WAW) between loops + 3. Creates a taskflow.graph containing the loops + 4. Converts each loop to a taskflow.task + 5. 
Inserts taskflow.channel operations for data dependencies }]; - let constructor = "mlir::createConvertLinalgToTaskflowPass()"; + + let constructor = "mlir::createConvertAffineToTaskflowPass()"; let dependentDialects = [ "mlir::taskflow::TaskflowDialect", - "mlir::linalg::LinalgDialect", + "mlir::affine::AffineDialect", "mlir::func::FuncDialect", + "mlir::memref::MemRefDialect", "mlir::arith::ArithDialect" ]; } diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index 72e97d72..36dc4c63 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -15,7 +15,7 @@ namespace neura { void registerNeuraConversionPassPipeline(); -// Passes defined in GraphPasses.td +// Passes defined in NeuraPasses.td #define GEN_PASS_DECL #include "NeuraDialect/NeuraPasses.h.inc" std::unique_ptr createInsertDataMovPass(); diff --git a/include/TaskflowDialect/CMakeLists.txt b/include/TaskflowDialect/CMakeLists.txt index c2588b05..26d2d8cd 100644 --- a/include/TaskflowDialect/CMakeLists.txt +++ b/include/TaskflowDialect/CMakeLists.txt @@ -1 +1,5 @@ add_mlir_dialect(Taskflow taskflow) + +set(LLVM_TARGET_DEFINITIONS TaskflowPasses.td) +mlir_tablegen(TaskflowPasses.h.inc --gen-pass-decls) +add_public_tablegen_target(MLIRTaskflowTransformsIncGen) \ No newline at end of file diff --git a/include/TaskflowDialect/Taskflow.td b/include/TaskflowDialect/Taskflow.td index 7b6cc8fd..753e7a51 100644 --- a/include/TaskflowDialect/Taskflow.td +++ b/include/TaskflowDialect/Taskflow.td @@ -3,5 +3,7 @@ include "TaskflowDialect.td" include "TaskflowOps.td" +include "TaskflowPasses.td" +include "TaskflowTypes.td" #endif // TASKFLOW_TD \ No newline at end of file diff --git a/include/TaskflowDialect/TaskflowOps.h b/include/TaskflowDialect/TaskflowOps.h index 9dc984a8..dc8b87f4 100644 --- a/include/TaskflowDialect/TaskflowOps.h +++ b/include/TaskflowDialect/TaskflowOps.h @@ -8,6 +8,7 @@ #include "mlir/IR/DialectImplementation.h" #include 
"mlir/IR/OpDefinition.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" // First includes the interface declarations. diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td index 28641951..66603f7d 100644 --- a/include/TaskflowDialect/TaskflowOps.td +++ b/include/TaskflowDialect/TaskflowOps.td @@ -7,6 +7,9 @@ include "mlir/IR/OpBase.td" include "mlir/IR/RegionKindInterface.td" include "mlir/Interfaces/ControlFlowInterfaces.td" include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/IR/CommonTypeConstraints.td" +include "mlir/IR/CommonAttrConstraints.td" //---------------------------------------------------------------------- // Base Class for all Taskflow operations. @@ -14,165 +17,238 @@ include "mlir/Interfaces/SideEffectInterfaces.td" class TaskflowOpBase traits = []> : Op; //---------------------------------------------------------------------- -// Graph Level Operations. +// Task Level Operations. //---------------------------------------------------------------------- -// Defines the top-level graph operation representing the workload. -def TaskflowGraphOp : TaskflowOpBase<"graph", [ +// Defines a uniform computation task operation within a Taskflow graph. +def TaskflowTaskOp : TaskflowOpBase<"task", [ IsolatedFromAbove, AutomaticAllocationScope, - SingleBlockImplicitTerminator<"TaskflowReturnOp"> + AttrSizedOperandSegments, + AttrSizedResultSegments, + SingleBlockImplicitTerminator<"TaskflowYieldOp"> ]>{ - let summary = "Top-level graph operation representing workload on a scale-out/scale-up spatial architecture."; + let summary = "Computation task operation within a Taskflow graph."; let description = [{ - Defines a region where all operations are flat tasks connected by edges. - This is the boundary between Host (CPU) and Device (spatial architecture). 
- - The graph contains: - - A flat list of `taskflow.task` operations (nodes) - - `taskflow.drive` operations (control edges) - - `taskflow.connect` operations (data dependency edges) - - A single `taskflow.return` operation to terminate the graph. - - Example: + Represents a computational task that takes data inputs and produces + data outputs. Tasks are isolated from their surrounding scope and can only + communicate through explicit data dependencies. + Tasks have two types of inputs/outputs: + 1. Memory dependencies: memrefs that are read or written by the task + 2. Value dependencies: SSA values from producer tasks + + Example: + // Memory input: %mem, Value input: %val + %out_mem, %out_val = taskflow.task "Task_0" + memory_inputs(%mem : memref<4xi32>) + value_inputs(%val : i32) { + ^bb0(%a0: memref<4xi32>, %a1: i32): + affine.for %i = 0 to 4 { + %v = affine.load %a0[%i] : memref<4xi32> + %sum = arith.addi %v, %a1 : i32 + affine.store %sum, %a0[%i] : memref<4xi32> + } + taskflow.yield memory_outputs(%a0 : memref<4xi32>) value_outputs(%a1 : i32) + } : (memref<4xi32>, i32) -> (memref<4xi32>, i32) }]; - let arguments = (ins Variadic:$inputs); - let results = (outs Variadic:$results); + let arguments = (ins + Variadic:$memory_inputs, + Variadic:$value_inputs, + StrAttr:$task_name + ); + + let results = (outs + Variadic:$memory_outputs, + Variadic:$value_outputs + ); + let regions = (region SizedRegion<1>:$body); - let assemblyFormat = [{ - `(` $inputs `)` attr-dict-with-keyword $body `:` functional-type($inputs, $results) - }]; + // let hasCustomAssemblyFormat = 1; + + // let assemblyFormat = [{ + // (`memory_inputs` `(` $memory_inputs^ `:` type($memory_inputs) `)`)? + // (`value_inputs` `(` $value_inputs^ `:` type($value_inputs) `)`)? + // attr-dict-with-keyword + // $body + // `->` `(` type($memory_outputs) `,` type($value_outputs) `)` + // }]; + } -// Defines the return operation to terminate a Taskflow graph.
-def TaskflowReturnOp : TaskflowOpBase<"return", [Terminator]> { - let summary = "Return operation for Taskflow graph."; +// Defines the yield operation to terminate a Taskflow task. +def TaskflowYieldOp : TaskflowOpBase<"yield", [Terminator, Pure, ReturnLike, AttrSizedOperandSegments, ParentOneOf<["TaskflowTaskOp"]>]>{ + let summary = "Yield operation for Taskflow task"; let description = [{ - This operation terminates a Taskflow graph. - This acts as a interaction op between the spatial architecture and the host processor. - - Example" - taskflow.graph { + Yields values from a task body. The number and types of operands + must match the result types of the parent taskflow.task operation. + + Example: + taskflow.task "Task_0" (%arg0, %arg1) { ... - taskflow.return - } + taskflow.yield %a0 : memref<4xi32> + } : (memref<4xi32>, i32) -> memref<4xi32> }]; - let arguments = (ins Variadic:$results); + let arguments = (ins + Variadic:$memory_results, + Variadic:$value_results); - let assemblyFormat = [{ - ($results^ `:` type($results))? attr-dict - }]; + // let assemblyFormat = [{ + // (`memory_outputs` `(` $memory_results^ `:` type($memory_results) `)`)? + // (`value_outputs` `(` $value_results^ `:` type($value_results) `)`)? + // attr-dict + // }]; + + // let hasCustomAssemblyFormat = 1; let builders = [ - // Default builder for empty return. + // Default builder for empty yield. OpBuilder<(ins), [{ - build($_builder, $_state, ValueRange{}); + build($_builder, $_state, ValueRange{}, ValueRange{}); }]> ]; } -//---------------------------------------------------------------------- -// Task Level Operations. -//---------------------------------------------------------------------- - -// Defines a uniform computation and control task operation within a Taskflow graph. 
-def TaskflowTaskOp : TaskflowOpBase<"task", [ - AttrSizedOperandSegments, - AttrSizedResultSegments, - SingleBlockImplicitTerminator<"TaskflowYieldOp">, - NoMemoryEffect, -]>{ - let summary = "Uniform computation and control task operation within a Taskflow graph"; - +// Defines the data dependency edge operation that carries data dependencies between tasks in a Taskflow graph. +def TaskflowChannelOp : TaskflowOpBase<"channel", [Pure, SameOperandsAndResultType]>{ + let summary = "Data dependency edge that carries data dependencies between tasks in a Taskflow graph"; let description = [{ - - }]; + Represents a data dependency edge between tasks in the taskflow graph. + A channel connects a producer task's output to a consumer task's input. - let arguments = (ins - // Optional control inputs. - Variadic:$control_ins, - // Optional data inputs. - Variadic:$data_ins, - StrAttr:$task_name, - - // Task metadata. - OptionalAttr:$indexing_maps, - OptionalAttr:$iterator_types - ); + Channels enforce explicit data dependencies and can be used for: + - Producer-consumer relationships + - Read-after-write (RAW) dependencies + - Write-after-read (WAR) dependencies + - Write-after-write (WAW) dependencies - let results = (outs - // Optional control outputs. - Variadic:$control_outs, - // Optional data outputs. - Variadic:$data_outs - ); + Example: + %0 = taskflow.task "producer_task" (...) { ... } : (...) -> memref<4xi32> + %1 = taskflow.channel %0 : memref<4xi32> + %2 = taskflow.task "consumer_task" (%1, ...) { ... } : (memref<4xi32>, ...) -> ... + }]; - let regions = (region SizedRegion<1>:$body); + let arguments = (ins AnyType:$source); + let results = (outs AnyType:$target); - // let assemblyFormat = [{ - // $task_name - // (`control_ins` `(` $control_ins^ `:` type($control_ins) `)`)? - // (`data_ins` `(` $data_ins^ `:` type($data_ins) `)`)? 
- // $body attr-dict - // `->` type(results) - // }]; + let assemblyFormat = [{ + $source attr-dict `:` type($source) `->` type($target) + }]; } -// Defines the yield operation to terminate a Taskflow task. -def TaskflowYieldOp : TaskflowOpBase<"yield", [Terminator, Pure, ReturnLike, ParentOneOf<["TaskflowTaskOp"]>]>{ - let summary = "Yield operation for Taskflow task"; +//---------------------------------------------------------------------- +// Intra-Task Operations. +//---------------------------------------------------------------------- +// Counter operation representing loop iteration control within a Taskflow task. +def TaskflowCounterOp : TaskflowOpBase<"counter", [Pure]>{ + let summary = "Loop counter operation with hardware counter semantics"; + let description = [{ - + Represents a loop counter that generates iteration indices. + The hardware counter produces a predicated index value. + + Counter behavior: + - Top-level counter: increments unconditionally each cycle. + - Nested counter: increments only when the parent counter is valid. + + Example: + // Top-level counter + %i = taskflow.counter { + lower_bound = 0 : index, + upper_bound = 16 : index, + step = 1 : index, + counter_name = "i" + } : index + // Nested counter + %j = taskflow.counter parent(%i) { + lower_bound = 0 : index, + upper_bound = 8 : index, + step = 1 : index, + counter_name = "j" + } : index }]; - let arguments = (ins Variadic:$results); + let arguments = (ins + Optional:$parent_index, + IndexAttr:$lower_bound, + IndexAttr:$upper_bound, + IndexAttr:$step + ); + + let results = (outs AnyType:$counter_index); let assemblyFormat = [{ - ($results^ `:` type($results))? attr-dict + (`parent` `(` $parent_index^ `:` type($parent_index) `)`)? + attr-dict-with-keyword + `:` type($counter_index) }]; - - let builders = [ - // Default builder for empty yield. 
- OpBuilder<(ins), [{ - build($_builder, $_state, ValueRange{}); - }]> - ]; } -// Defines the control edge operation that carries control packets between tasks in a Taskflow graph. -def TaskflowDriveOp : TaskflowOpBase<"drive", [Pure]>{ - let summary = "Control edge that carries control packets between tasks in a Taskflow graph"; +def TaskflowHyperblockOp : TaskflowOpBase<"hyperblock",[ + AutomaticAllocationScope, + SingleBlockImplicitTerminator<"TaskflowHyperblockYieldOp"> +]>{ + let summary = "Hyperblock operation containing loop body computation"; + let description = [{ - + Represents the loop body computation as a hyperblock controlled by taskflow.counter operation. + The hyperblock takes the counter indices as input to trigger its execution. + + If the hyperblock has a return value, it must return the final value produced by the hyperblock (i.e., from the last iteration). + + Example: + %result = taskflow.hyperblock indices(%i : index) { + ^bb0(%idx: index): + // Loop body computation using %idx + ... + taskflow.hyperblock.yield %output : i32 + } -> i32 }]; - let arguments = (ins TaskflowPacketType:$source); + let arguments = (ins + Variadic:$indices + ); + + let results = (outs + Variadic:$outputs + ); - let results = (outs TaskflowPacketType:$target); + let regions = (region SizedRegion<1>:$body); let assemblyFormat = [{ - $source attr-dict `:` type($source) `->` type($target) + (`indices` `(` $indices^ `:` type($indices) `)`)? + attr-dict-with-keyword + $body + `->` `(` type($outputs) `)` }]; } -def TaskflowChannelOp : TaskflowOpBase<"channel", [Pure]>{ - let summary = "Data dependency edge that carries data dependencies between tasks in a Taskflow graph"; +def TaskflowHyperblockYieldOp : TaskflowOpBase<"hyperblock.yield", [ + Terminator, + Pure, + ReturnLike, + ParentOneOf<["TaskflowHyperblockOp"]> +]>{ + let summary = "Yield operation for Taskflow hyperblock"; + let description = [{ - + Terminates the hyperblock body. 
}]; - let arguments = (ins AnyType:$source); - - let results = (outs AnyType:$target); + let arguments = (ins Variadic:$outputs); let assemblyFormat = [{ - $source attr-dict `:` type($source) `->` type($target) + (`outputs` `(` $outputs^ `:` type($outputs) `)`)? + attr-dict }]; + + let builders = [ + OpBuilder<(ins), [{build($_builder, $_state, ValueRange{});}]> + ]; } #endif // TASKFLOW_OPS_TD \ No newline at end of file diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h new file mode 100644 index 00000000..f6219511 --- /dev/null +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -0,0 +1,25 @@ +// TaskflowPasses.h - Header file for Taskflow passes + +#ifndef TASKFLOW_PASSES_H +#define TASKFLOW_PASSES_H + +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassRegistry.h" + +#include +namespace mlir { +namespace taskflow { +// Passes defined in TaskflowPasses.td +#define GEN_PASS_DECL +#include "TaskflowDialect/TaskflowPasses.h.inc" +std::unique_ptr createConstructHyperblockFromTaskPass(); + +#define GEN_PASS_REGISTRATION +#include "TaskflowDialect/TaskflowPasses.h.inc" +} // namespace taskflow +} // namespace mlir + +#endif // TASKFLOW_PASSES_H \ No newline at end of file diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td new file mode 100644 index 00000000..1bcf3b22 --- /dev/null +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -0,0 +1,18 @@ +// TaskflowPasses.td - Passes for the Taskflow dialect + +#ifndef TASKFLOW_PASSES_TD +#define TASKFLOW_PASSES_TD + +include "mlir/Pass/PassBase.td" + +//=========================================================// +// Passes for the Taskflow dialect +//=========================================================// +def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::FuncOp"> { + let 
summary = "Constructs hyperblocks and counter chain from Taskflow tasks"; + let description = [{ + This pass constructs hyperblocks and counter chain from Taskflow tasks. + }]; + let constructor = "taskflow::createConstructHyperblockFromTaskPass()"; +} +#endif // TASKFLOW_PASSES_TD \ No newline at end of file diff --git a/include/TaskflowDialect/TaskflowTypes.td b/include/TaskflowDialect/TaskflowTypes.td index 120a5265..dacbf512 100644 --- a/include/TaskflowDialect/TaskflowTypes.td +++ b/include/TaskflowDialect/TaskflowTypes.td @@ -11,29 +11,4 @@ class TaskflowTypeBase traits = [] : TypeDef{ let mnemonic = typeMnemonic; } - -//---------------------------------------------------------------------- -// PacketType - Control conifguration packet type. -//---------------------------------------------------------------------- -def TaskflowPacketType : TaskflowTypeBase<"Packet", "packet">{ - let summary = "Control packet carrying conifguration metadata for affine controller"; - - let description = [{ - - }]; - - // Payload type carried by the packet. 
- let parameters = (ins "::mlir::Type":$payloadType); - - let assemblyFormat = [{ - `<` $payloadType `>` - }]; - - let builders = [ - TypeBuilderWithInferredContext<(ins "Type":$payloadType), - [{ - return $_get(payloadType.getContext(), payloadType); - }]> - ]; -} #endif //TASKFLOW_TYPES_TD \ No newline at end of file diff --git a/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp b/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp new file mode 100644 index 00000000..f628364f --- /dev/null +++ b/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp @@ -0,0 +1,364 @@ +#include "Conversion/ConversionPasses.h" +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowTypes.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { +//------------------------------------------------------------------------------ +// Helper Functions. +//------------------------------------------------------------------------------ + +// Collects all top-level affine.for operations in a function. 
+static SmallVector +collectTopLevelLooops(func::FuncOp func_op) { + SmallVector top_level_loops; + for (Block &block : func_op.getBlocks()) { + for (Operation &op : block) { + if (auto for_op = dyn_cast(op)) { + top_level_loops.push_back(for_op); + } + } + } + + return top_level_loops; +} + +// Collects memrefs that are loaded (read) within a given operation scope. +static void collectReadMemrefs(Operation *op, SetVector &read_memrefs) { + op->walk([&](Operation *nested_op) { + if (auto load_op = dyn_cast(nested_op)) { + read_memrefs.insert(load_op.getMemRef()); + } else if (auto load_op = dyn_cast(nested_op)) { + read_memrefs.insert(load_op.getMemRef()); + } + }); +} + +// Collects memrefs that are stored (written) within a given operation scope. +static void collectWrittenMemrefs(Operation *op, + SetVector &written_memrefs) { + op->walk([&](Operation *nested_op) { + if (auto store_op = dyn_cast(nested_op)) { + written_memrefs.insert(store_op.getMemRef()); + } else if (auto store_op = dyn_cast(nested_op)) { + written_memrefs.insert(store_op.getMemRef()); + } + }); +} + +// Collects external values used within a given scope of operations. +static void collectExternalValues(Operation *root_op, + const DenseSet &scope_ops, + SetVector &external_values) { + for (Value operand : root_op->getOperands()) { + // Skips memref types (handled separately as memory dependencies). + if (isa(operand.getType())) { + continue; + } + + // Checks if it's a block argument. + if (auto block_arg = dyn_cast(operand)) { + // Only adds if the block argument is not from within the scope. + Operation *parent_op = block_arg.getOwner()->getParentOp(); + if (!scope_ops.contains(parent_op)) { + external_values.insert(operand); + } + continue; + } + + // Checks if the operand is defined outside the scope. + Operation *def_op = operand.getDefiningOp(); + if (def_op && !scope_ops.contains(def_op)) { + external_values.insert(operand); + } + } + + // Recursively processes nested operations. 
+ for (Region ®ion : root_op->getRegions()) { + for (Block &block : region.getBlocks()) { + for (Operation &op : block.getOperations()) { + collectExternalValues(&op, scope_ops, external_values); + } + } + } +} + +//------------------------------------------------------------------------------ +// Task Conversion +//------------------------------------------------------------------------------ + +// Converts a top-level affine.for to a taskflow.task operation. +static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, + affine::AffineForOp for_op, + DenseMap &value_mapping, + int task_id) { + Location loc = for_op.getLoc(); + std::string task_name = "Task_" + std::to_string(task_id); + + // Collects all operations in the loop scope. + DenseSet scope_ops; + scope_ops.insert(for_op.getOperation()); + for_op.walk([&](Operation *op) { scope_ops.insert(op); }); + + //------------------------------------------------------------------- + // Step 1: Collects read and written memrefs. + //------------------------------------------------------------------- + SetVector read_memrefs; + SetVector written_memrefs; + collectReadMemrefs(for_op.getOperation(), read_memrefs); + collectWrittenMemrefs(for_op.getOperation(), written_memrefs); + + llvm::errs() << "Read memrefs for loop:\n" << for_op << "\n"; + for (Value memref : read_memrefs) { + llvm::errs() << memref << "\n"; + } + + llvm::errs() << "Written memrefs for loop:\n" << for_op << "\n"; + for (Value memref : written_memrefs) { + llvm::errs() << memref << "\n"; + } + + //------------------------------------------------------------------- + // Step 2: Determines memory inputs and outputs. + //------------------------------------------------------------------- + // Memory inputs: ALL memrefs that are accessed (read OR written). + // This ensures WAR and WAW dependencies are respected. 
+ SetVector accessed_memrefs; + accessed_memrefs.insert(read_memrefs.begin(), read_memrefs.end()); + accessed_memrefs.insert(written_memrefs.begin(), written_memrefs.end()); + + // Memory outputs: ONLY memrefs that are written. + // This ensures RAW and WAW dependencies are respected. + SetVector output_memrefs; + output_memrefs.insert(written_memrefs.begin(), written_memrefs.end()); + + //------------------------------------------------------------------- + // Step 3: Collects external SSA values (non-memref). + //------------------------------------------------------------------- + SetVector external_values; + collectExternalValues(for_op.getOperation(), scope_ops, external_values); + + llvm::errs() << "External values for loop:\n" << for_op << "\n"; + for (Value val : external_values) { + llvm::errs() << val << "\n"; + } + + //------------------------------------------------------------------- + // Step 4: Resolves inputs through value mapping. + //------------------------------------------------------------------- + SmallVector memory_inputs; + SmallVector value_inputs; + IRMapping mapping; + + // Resolves memory inputs. + for (Value memref : accessed_memrefs) { + Value resolved_memref = value_mapping.lookup(memref); + if (!resolved_memref) { + resolved_memref = memref; + } + memory_inputs.push_back(resolved_memref); + mapping.map(memref, resolved_memref); + } + + // Resolves external SSA value inputs. + for (Value external_val : external_values) { + Value resolved_val = value_mapping.lookup(external_val); + if (!resolved_val) { + resolved_val = external_val; + } + value_inputs.push_back(resolved_val); + mapping.map(external_val, resolved_val); + } + + //------------------------------------------------------------------- + // Step 5: Prepares output types. 
+ //------------------------------------------------------------------- + SmallVector memory_output_types; + for (Value memref : output_memrefs) { + memory_output_types.push_back(memref.getType()); + } + + SmallVector value_output_types; + for (Type result_type : for_op.getResultTypes()) { + value_output_types.push_back(result_type); + } + + //------------------------------------------------------------------- + // Step 6: Creates the taskflow.task operation. + //------------------------------------------------------------------- + TaskflowTaskOp task_op = builder.create( + loc, + /*memory_outputs=*/memory_output_types, + /*value_outputs=*/value_output_types, + /*memory_inputs=*/memory_inputs, + /*value_inputs=*/value_inputs, + /*task_name=*/builder.getStringAttr(task_name)); + + //------------------------------------------------------------------- + // Step 7: Builds the task body. + //------------------------------------------------------------------- + Block *task_body = new Block(); + task_op.getBody().push_back(task_body); + + // Adds block arguments (memory inputs first, then value inputs). + DenseMap input_to_block_arg; + // Memory input arguments. + for (Value memref : accessed_memrefs) { + BlockArgument arg = task_body->addArgument(memref.getType(), loc); + mapping.map(memref, arg); + input_to_block_arg[memref] = arg; + } + + // Value input arguments. + for (Value val : external_values) { + BlockArgument arg = task_body->addArgument(val.getType(), loc); + mapping.map(val, arg); + input_to_block_arg[val] = arg; + } + + // Clones loop into the task body. + OpBuilder task_builder(task_body, task_body->begin()); + Operation *cloned_loop = task_builder.clone(*for_op.getOperation(), mapping); + + //--------------------------------------------------------------- + // Step 8: Creates the yield operation. 
+ //--------------------------------------------------------------- + task_builder.setInsertionPointToEnd(task_body); + SmallVector memory_yield_operands; + SmallVector value_yield_operands; + + // Memory yield outputs: yield the written memrefs. + for (Value memref : output_memrefs) { + if (input_to_block_arg.count(memref)) { + memory_yield_operands.push_back(input_to_block_arg[memref]); + } else { + assert(false && "Written memref not in inputs!"); + } + } + + // Value yield outputs: yield the loop results. + for (Value result : cloned_loop->getResults()) { + value_yield_operands.push_back(result); + } + task_builder.create(loc, memory_yield_operands, + value_yield_operands); + + //------------------------------------------------------------------- + // Step 9 : Updates value mapping with task outputs for subsequent tasks + // conversion. + //------------------------------------------------------------------- + // Memory outputs. + for (auto [memref, task_output] : + llvm::zip(output_memrefs, task_op.getMemoryOutputs())) { + value_mapping[memref] = task_output; + } + + return task_op; +} + +//------------------------------------------------------------------------------ +// Main Conversion Process. +//------------------------------------------------------------------------------ +// Converts a single function to TaskFlow operations. +static LogicalResult convertFuncToTaskflow(func::FuncOp func_op) { + // Collects top-level loops for conversion. + SmallVector top_level_loops = + collectTopLevelLooops(func_op); + + if (top_level_loops.empty()) { + // No loops to convert. 
+ llvm::errs() << "No top-level affine.for loops found in function '" + << func_op.getName() << "'.\n"; + return success(); + } + + llvm::errs() << "\n===Converting function: " << func_op.getName() << "===\n"; + llvm::errs() << "Found " << top_level_loops.size() + << " top-level affine.for loops to convert:\n"; + for (affine::AffineForOp for_op : top_level_loops) { + llvm::errs() << for_op.getLoc() << "\n"; + } + + OpBuilder builder(func_op.getContext()); + DenseMap value_mapping; + + // Converts each top-level loop to taskflow.task operation. + for (auto [idx, loop] : llvm::enumerate(top_level_loops)) { + builder.setInsertionPoint(loop); + TaskflowTaskOp task_op = + convertLoopToTask(builder, loop, value_mapping, idx); + + // Replaces uses of loop results with task value outputs. + for (auto [loop_result, task_value_output] : + llvm::zip(loop.getResults(), task_op.getValueOutputs())) { + loop_result.replaceAllUsesWith(task_value_output); + } + } + + // Erases the original loops after conversion. 
+ for (affine::AffineForOp for_op : top_level_loops) { + for_op.erase(); + } + + return success(); +} + +class ConvertAffineToTaskflowPass + : public PassWrapper> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertAffineToTaskflowPass) + + StringRef getArgument() const final { return "convert-affine-to-taskflow"; } + + StringRef getDescription() const final { + return "Convert Affine operations to Taskflow operations"; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + ModuleOp module = getOperation(); + + WalkResult result = module.walk([](func::FuncOp func_op) { + if (failed(convertFuncToTaskflow(func_op))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + + if (result.wasInterrupted()) { + signalPassFailure(); + } + } +}; +} // namespace + +std::unique_ptr mlir::createConvertAffineToTaskflowPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/lib/Conversion/LinalgToTaskflow/CMakeLists.txt b/lib/Conversion/AffineToTaskflow/CMakeLists.txt similarity index 74% rename from lib/Conversion/LinalgToTaskflow/CMakeLists.txt rename to lib/Conversion/AffineToTaskflow/CMakeLists.txt index c8e425a6..bb4f3f52 100644 --- a/lib/Conversion/LinalgToTaskflow/CMakeLists.txt +++ b/lib/Conversion/AffineToTaskflow/CMakeLists.txt @@ -1,7 +1,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) -add_mlir_conversion_library(MLIRLinalgToTaskflowPass - LinalgToTaskflowPass.cpp +add_mlir_conversion_library(MLIRAffineToTaskflowPass + AffineToTaskflowPass.cpp DEPENDS MLIRConversionIncGen diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt index 4f4e247f..cf66d518 100644 --- a/lib/Conversion/CMakeLists.txt +++ b/lib/Conversion/CMakeLists.txt @@ -5,7 +5,7 @@ add_subdirectory(AffineToNeura) add_subdirectory(LlvmToNeura) add_subdirectory(MemRefToNeura) add_subdirectory(BuiltinToNeura) 
-add_subdirectory(LinalgToTaskflow) +add_subdirectory(AffineToTaskflow) add_library(MLIRConversion INTERFACE) @@ -22,6 +22,6 @@ target_link_libraries(MLIRConversion INTERFACE MLIRNeuraLlvmToNeuraPass MLIRNeuraMemRefToNeuraPass MLIRNeuraBuiltinToNeuraPass - MLIRLinalgToTaskflowPass + MLIRAffineToTaskflowPass ${dialect_libs} ) \ No newline at end of file diff --git a/lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp b/lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp deleted file mode 100644 index a4489f44..00000000 --- a/lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp +++ /dev/null @@ -1,487 +0,0 @@ -#include "Conversion/ConversionPasses.h" -#include "TaskflowDialect/TaskflowDialect.h" -#include "TaskflowDialect/TaskflowOps.h" -#include "TaskflowDialect/TaskflowTypes.h" - -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/Block.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/IR/Operation.h" -#include "mlir/IR/Value.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/LLVM.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Support/raw_ostream.h" - -using namespace mlir; -using namespace mlir::taskflow; - -namespace { -//------------------------------------------------------------------------------ -// Helper Functions. -//------------------------------------------------------------------------------ -// Gets a descriptive task name based on the operation type. 
-static std::string generateTaskBaseName(Operation *op) { - if (isa(op)) { - return "conv2d"; - } - if (isa(op)) { - return "matmul"; - } - if (isa(op)) { - return "batch_matmul"; - } - if (isa(op)) { - return "pooling"; - } - if (auto generic_op = dyn_cast(op)) { - return "generic"; - } - return "task"; -} - -// Maintains conversion context during the conversion process. -struct ConversionContext { - // Maps original SSA values to task output values. - DenseMap value_mapping; - - // Maps graph input values to graph block arguments. - DenseMap graph_input_mapping; - - // Counter for generating unique task names. - int task_counter = 0; - - // Generates a unique task name. - std::string getTaskBaseName(StringRef base_name) { - return (base_name + "_" + Twine(task_counter++)).str(); - } -}; - -// Operation classification. -static bool isComputeIntensiveOp(Operation *op) { - // Returns true if the operation is one of the compute-intensive Linalg ops. - return isa(op); -} - -// Collects external values for a single operation. -static void collectExternalValuesForOp( - Operation *op, const DenseSet &graph_op_set, - func::FuncOp func_op, SetVector &external_values) { - for (Value operand : op->getOperands()) { - // Skips nested region block arguments. - if (auto block_arg = dyn_cast(operand)) { - if (block_arg.getOwner()->getParentOp() != func_op.getOperation()) { - continue; - } - external_values.insert(operand); - continue; - } - - // Skips values defined inside graph ops or nested regions. - Operation *def_op = operand.getDefiningOp(); - if (def_op) { - if (!graph_op_set.contains(def_op) && - def_op->getBlock()->getParentOp() == func_op.getOperation()) { - external_values.insert(operand); - } - } - } - - // Recurses into nested regions. 
- for (Region ®ion : op->getRegions()) { - for (Block &block : region) { - for (Operation &nested_op : block) { - collectExternalValuesForOp(&nested_op, graph_op_set, func_op, - external_values); - } - } - } -} - -// Collects external values used by each graph operation. -static DenseMap> -collectExternalValuesPerOp(ArrayRef graph_ops, - func::FuncOp func_op) { - DenseSet graph_op_set(graph_ops.begin(), graph_ops.end()); - DenseMap> op_external_values; - - for (Operation *op : graph_ops) { - SetVector external_values; - collectExternalValuesForOp(op, graph_op_set, func_op, external_values); - op_external_values[op] = - SmallVector(external_values.begin(), external_values.end()); - } - - return op_external_values; -} - -//------------------------------------------------------------------------------ -// Step 1: Scope Identification - Collects operations for the taskflow.graph -// op. -//------------------------------------------------------------------------------ -// Collects all operations that should be included in the taskflow graph. -// Returns operations in topological order. -static SmallVector collectTaskflowGraphOps(func::FuncOp func_op) { - SmallVector graph_ops; - - func_op.walk([&](Operation *op) { - if (isComputeIntensiveOp(op)) { - graph_ops.push_back(op); - } - }); - return graph_ops; -} - -// Identifies external inputs to the taskflow graph (values defined outside the -// graph ops). -static SmallVector identifyGraphInputs(ArrayRef graph_ops, - func::FuncOp func_op) { - llvm::SetVector input_set; - llvm::DenseSet graph_op_set(graph_ops.begin(), graph_ops.end()); - - for (Operation *op : graph_ops) { - collectExternalValuesForOp(op, graph_op_set, func_op, input_set); - } - - return SmallVector(input_set.begin(), input_set.end()); -} - -// Identifies outputs from the graph (values used outside the graph ops). 
-static SmallVector identifyGraphOutputs(ArrayRef graph_ops, - func::FuncOp func_op) { - SmallVector outputs; - DenseSet graph_op_set(graph_ops.begin(), graph_ops.end()); - - for (Operation *op : graph_ops) { - for (Value result : op->getResults()) { - bool used_outside = false; - for (Operation *user : result.getUsers()) { - if (!graph_op_set.contains(user)) { - used_outside = true; - break; - } - } - if (used_outside) { - outputs.push_back(result); - } - } - } - return outputs; -} - -//------------------------------------------------------------------------------ -// Step 2: Task Contruction - Creates the taskflow.task ops. -//------------------------------------------------------------------------------ -// Reolves the input value for a task operand. -// Returns the corresponding buffer value from the context, or wraps the -// original value. -static Value resolveTaskInput(OpBuilder &builder, Location loc, - Value original_value, ConversionContext &ctx) { - // Checks if this value is produced by a task. - if (ctx.value_mapping.count(original_value)) { - return ctx.value_mapping[original_value]; - } - - // Checks if this value is a graph input. - if (ctx.graph_input_mapping.count(original_value)) { - return ctx.graph_input_mapping[original_value]; - } - - // Should not reach here for well-formed graphs. - assert(false && "Unable to resolve task input value"); - return Value(); -} - -// Creates a taskflow.task op from a given operation. -// For pure data dependent workloads (e.g., AI workloads), taskes have: -// - data_ins: input buffers -// - data_outs: output buffers -// - no control dependencies -static TaskflowTaskOp createTaskFromOp(OpBuilder &builder, Operation *op, - ConversionContext &ctx, - ArrayRef external_values) { - Location loc = op->getLoc(); - std::string task_name = ctx.getTaskBaseName(generateTaskBaseName(op)); - - // Resolves all external values to graph local values. 
- SmallVector data_ins; - IRMapping mapping; - - for (Value external_val : external_values) { - Value resolved_input = resolveTaskInput(builder, loc, external_val, ctx); - assert(resolved_input && "Failed to resolve task input"); - data_ins.push_back(resolved_input); - mapping.map(external_val, resolved_input); - } - - for (Value operand : op->getOperands()) { - if (llvm::is_contained(external_values, operand)) { - // Already mapped. - continue; - } - Value resolved_input = resolveTaskInput(builder, loc, operand, ctx); - assert(resolved_input && "Failed to resolve task input"); - data_ins.push_back(resolved_input); - mapping.map(operand, resolved_input); - } - - // Data outputs uses original result types. - SmallVector data_out_types; - for (Type result_type : op->getResultTypes()) { - data_out_types.push_back(result_type); - } - - // Creates the taskflow.task op. - auto task_op = builder.create( - loc, - /*control_outs=*/TypeRange{}, - /*data_outs=*/data_out_types, - /*control_ins=*/ValueRange{}, - /*data_ins=*/data_ins, builder.getStringAttr(task_name), - /*indexing_maps=*/nullptr, - /*iterator_types=*/nullptr); - - // Builds task body. - Block *task_body = new Block(); - task_op.getBody().push_back(task_body); - - // Block arguments have same types as data_ins (original tensor types). - for (Value input : data_ins) { - task_body->addArgument(input.getType(), loc); - } - - // Maps external values to task block arguments. - for (size_t i = 0; i < external_values.size(); i++) { - mapping.map(external_values[i], task_body->getArgument(i)); - } - - // Switches to the task body to clone the original operation. - OpBuilder task_builder(task_body, task_body->begin()); - Operation *cloned_op = task_builder.clone(*op, mapping); - // Yields the results. - task_builder.create(loc, cloned_op->getResults()); - - // Registers task outputs in context (same types as original results). 
- for (auto [orig_result, task_output] : - llvm::zip(op->getResults(), task_op.getDataOuts())) { - ctx.value_mapping[orig_result] = task_output; - } - - return task_op; -} - -//------------------------------------------------------------------------------ -// Step 3: Channel Insertion - Inserts taskflow.channel ops between tasks. -//------------------------------------------------------------------------------ -static void insertChannels(OpBuilder &builder, ArrayRef tasks) { - DenseSet task_set(tasks.begin(), tasks.end()); - - for (TaskflowTaskOp producer_task : tasks) { - Location loc = producer_task.getLoc(); - - // For each data output of this producer task. - for (Value data_out : producer_task.getDataOuts()) { - // Collects all consumer tasks that use this output. - SmallVector> consumer_tasks; - - for (OpOperand &use : data_out.getUses()) { - Operation *user = use.getOwner(); - if (auto consumer_task = dyn_cast(user)) { - if (task_set.contains(consumer_task)) { - consumer_tasks.push_back({consumer_task, &use}); - } - } - } - - // Creates a dedicated channel for each consumer task. - builder.setInsertionPointAfter(producer_task); - - for (auto [consumer_task, use] : consumer_tasks) { - // Creates a new channel for this specific producer->consumer edge. - auto channel_op = builder.create( - loc, data_out.getType(), data_out); - - // Replaces only this specific use with the channel output. - use->set(channel_op.getTarget()); - } - } - } -} - -//------------------------------------------------------------------------------ -// Step 4: Graph Construction - Creates the taskflow.graph op. 
-//------------------------------------------------------------------------------ -static LogicalResult buildTaskflowGraph( - OpBuilder &builder, func::FuncOp func_op, ArrayRef graph_ops, - ArrayRef graph_inputs, MutableArrayRef graph_outputs, - const DenseMap> &op_external_values) { - Location loc = func_op.getLoc(); - - // Graph result types = original output types (no conversion). - SmallVector result_types; - for (Value output : graph_outputs) { - result_types.push_back(output.getType()); - } - - // Creates graph op. - auto graph_op = - builder.create(loc, result_types, graph_inputs); - - // Builds graph body. - Block *graph_body = new Block(); - graph_op.getBody().push_back(graph_body); - - // Block arguments have same types as graph inputs. - ConversionContext ctx; - for (Value input : graph_inputs) { - BlockArgument arg = graph_body->addArgument(input.getType(), loc); - ctx.graph_input_mapping[input] = arg; - } - - // Converts each operation to a task. - builder.setInsertionPointToStart(graph_body); - SmallVector tasks; - for (Operation *op : graph_ops) { - const SmallVector &external_values = op_external_values.lookup(op); - auto task_op = createTaskFromOp(builder, op, ctx, external_values); - if (!task_op) { - return failure(); - } - tasks.push_back(task_op); - } - - // Inserts channels between tasks. - insertChannels(builder, tasks); - - // Creates graph return. - SmallVector return_values; - for (Value output : graph_outputs) { - Value resolved = ctx.value_mapping[output]; - return_values.push_back(resolved); - } - builder.create(loc, return_values); - - // Replaces original outputs with graph results. - for (auto [orig_output, graph_result] : - llvm::zip(graph_outputs, graph_op.getResults())) { - orig_output.replaceAllUsesExcept(graph_result, graph_op.getOperation()); - } - - // Erases original operations. 
- for (Operation *op : llvm::reverse(graph_ops)) { - op->erase(); - } - - return success(); -} - -//------------------------------------------------------------------------------ -// Main Conversion Process. -//------------------------------------------------------------------------------ -// Converts a single function to TaskFlow operations. -static LogicalResult convertFuncToTaskflow(func::FuncOp func_op) { - // Step 1: Collects operations for the taskflow.graph op. - SmallVector graph_ops = collectTaskflowGraphOps(func_op); - if (graph_ops.empty()) { - // No operations to convert. - return success(); - } - - llvm::errs() << "Converting function: " << func_op.getName() << "\n"; - llvm::errs() << "Collected taskflow graph operations:\n"; - for (Operation *op : graph_ops) { - llvm::errs() << " " << *op << "\n"; - } - - SmallVector graph_inputs = identifyGraphInputs(graph_ops, func_op); - SmallVector graph_outputs = identifyGraphOutputs(graph_ops, func_op); - - llvm::errs() << "Identified graph inputs:\n"; - for (Value input : graph_inputs) { - llvm::errs() << " " << input << "\n"; - } - llvm::errs() << "Identified graph outputs:\n"; - for (Value output : graph_outputs) { - llvm::errs() << " " << output << "\n"; - } - - // Finds insertion point: after the last operation that defines a graph input. - Operation *insertion_point = nullptr; - for (Value input : graph_inputs) { - if (auto *def_op = input.getDefiningOp()) { - if (!insertion_point || insertion_point->isBeforeInBlock(def_op)) { - insertion_point = def_op; - } - } - } - - // Set the insertion point for the builder. - OpBuilder builder(func_op.getContext()); - if (insertion_point) { - builder.setInsertionPointAfter(insertion_point); - } else { - // If no inputs are defined by an operation (i.e., they are all function - // arguments), insert the graph at the beginning of the function body. - builder.setInsertionPointToStart(&func_op.front()); - } - - // Collects external values for each graph operation. 
- DenseMap> op_external_values = - collectExternalValuesPerOp(graph_ops, func_op); - - // Step 2 & 3 & 4: Creates the taskflow.graph op. - auto result = buildTaskflowGraph(builder, func_op, graph_ops, graph_inputs, - graph_outputs, op_external_values); - llvm::errs() << "Converted function to TaskFlow graph.\n"; - llvm::errs() << "Resulting function:\n"; - func_op.print(llvm::errs()); - llvm::errs() << "\n"; - - return result; -} - -class ConvertLinalgToTaskflowPass - : public PassWrapper> { -public: - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertLinalgToTaskflowPass) - - StringRef getArgument() const final { return "convert-linalg-to-taskflow"; } - - StringRef getDescription() const final { - return "Convert Linalg operations to Taskflow operations"; - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - } - - void runOnOperation() override { - ModuleOp module = getOperation(); - - WalkResult result = module.walk([](func::FuncOp func_op) { - if (failed(convertFuncToTaskflow(func_op))) { - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - - if (result.wasInterrupted()) { - signalPassFailure(); - } - } -}; -} // namespace - -std::unique_ptr mlir::createConvertLinalgToTaskflowPass() { - return std::make_unique(); -} \ No newline at end of file diff --git a/lib/TaskflowDialect/CMakeLists.txt b/lib/TaskflowDialect/CMakeLists.txt index 5762784e..d8e5d7ff 100644 --- a/lib/TaskflowDialect/CMakeLists.txt +++ b/lib/TaskflowDialect/CMakeLists.txt @@ -1,11 +1,16 @@ add_mlir_dialect_library(MLIRTaskflow Taskflow.cpp + TaskflowPasses.cpp + TaskflowOps.cpp DEPENDS MLIRConversionIncGen + MLIRTaskflowTransformsIncGen LINK_LIBS PUBLIC MLIRIR MLIRSupport MLIRInferTypeOpInterface - ) \ No newline at end of file +) + +add_subdirectory(Transforms) \ No newline at end of file diff --git a/lib/TaskflowDialect/Taskflow.cpp b/lib/TaskflowDialect/Taskflow.cpp index 61ff3195..bff4ec54 100644 --- 
a/lib/TaskflowDialect/Taskflow.cpp +++ b/lib/TaskflowDialect/Taskflow.cpp @@ -40,4 +40,16 @@ void TaskflowDialect::printAttribute(mlir::Attribute attr, mlir::DialectAsmPrinter &printer) const { // Currently no custom attributes to print. llvm_unreachable("Unknown Taskflow attribute"); +} + +mlir::Type TaskflowDialect::parseType(mlir::DialectAsmParser &parser) const { + // Currently no custom types to parse. + parser.emitError(parser.getNameLoc()) << "unknown Taskflow type"; + return mlir::Type(); +} + +void TaskflowDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter &printer) const { + // Currently no custom types to print. + llvm_unreachable("Unknown Taskflow type"); } \ No newline at end of file diff --git a/lib/TaskflowDialect/TaskflowOps.cpp b/lib/TaskflowDialect/TaskflowOps.cpp new file mode 100644 index 00000000..e69de29b diff --git a/lib/TaskflowDialect/TaskflowPasses.cpp b/lib/TaskflowDialect/TaskflowPasses.cpp new file mode 100644 index 00000000..1a10c2ef --- /dev/null +++ b/lib/TaskflowDialect/TaskflowPasses.cpp @@ -0,0 +1,7 @@ +#include "TaskflowDialect/TaskflowPasses.h" +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" + +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassRegistry.h" +#include "mlir/Transforms/Passes.h" \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt new file mode 100644 index 00000000..270ce96a --- /dev/null +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -0,0 +1,17 @@ +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) + +add_mlir_library(MLIRTaskflowTransforms + ConstructHyperblockFromTaskPass.cpp + + DEPENDS + MLIRTaskflowTransformsIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRPass + MLIRSupport + MLIRTransforms + MLIRTaskflow + ${dialect_libs} + LLVMSupport +) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp 
b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp new file mode 100644 index 00000000..7ba05060 --- /dev/null +++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp @@ -0,0 +1,493 @@ +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowPasses.h" + +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/LogicalResult.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { +//--------------------------------------------------------------------------- +// Loop Info Structure +//---------------------------------------------------------------------------- +struct LoopInfo { + affine::AffineForOp for_op; + int lower_bound; + int upper_bound; + int step; + + // For nested loops + LoopInfo *parent_loop_info = nullptr; + SmallVector child_loops; + + // Generated counter index + Value counter_index; +}; + +//--------------------------------------------------------------------------- +// Hyperblock Info Structure +//---------------------------------------------------------------------------- +// Represents a code block that should become a hyperblock. +struct HyperblockInfo { + // The operations that belong to this hyperblock. + SmallVector operations; + + // The counter indices that trigger this hyperblock (empty for top-level + // operations before any loops). + SmallVector trigger_indices; + + // Whther this hyperblock is nested within loops. + bool is_loop_body = false; + + // The corresponding loop. 
+ affine::AffineForOp loop_op = nullptr; +}; + +//---------------------------------------------------------------------------- +// Helper Functions +//---------------------------------------------------------------------------- +// Extracts loop parameters from affine.for operation. +static std::optional extractLoopBound(affine::AffineForOp for_op) { + LoopInfo loop_info; + loop_info.for_op = for_op; + + // Gets lower bound. + if (for_op.hasConstantLowerBound()) { + loop_info.lower_bound = for_op.getConstantLowerBound(); + } else { + return std::nullopt; + } + + // Gets upper bound. + if (for_op.hasConstantUpperBound()) { + loop_info.upper_bound = for_op.getConstantUpperBound(); + } else { + return std::nullopt; + } + + // Gets step. + loop_info.step = for_op.getStepAsInt(); + + return loop_info; +} + +// Collects all affine.for loops and builds loop hierarchy. +static SmallVector collectLoopInfo(TaskflowTaskOp task_op) { + SmallVector loops_info; + DenseMap op_to_loopinfo; + + // Step 1: Collects all loops with its parameter. + task_op.walk([&](affine::AffineForOp for_op) { + auto info = extractLoopBound(for_op); + if (!info) { + assert(false && "Non-constant loop bounds are not supported."); + } + + loops_info.push_back(*info); + op_to_loopinfo[for_op.getOperation()] = &loops_info.back(); + }); + + // Step 2: Builds parent-child relationships among loops. 
+ for (auto &loop_info : loops_info) { + Operation *parent_op = loop_info.for_op->getParentOp(); + if (auto parent_for = dyn_cast(parent_op)) { + if (op_to_loopinfo.count(parent_for.getOperation())) { + LoopInfo *parent_loop_info = op_to_loopinfo[parent_for.getOperation()]; + loop_info.parent_loop_info = parent_loop_info; + parent_loop_info->child_loops.push_back(&loop_info); + } + } + } + + return loops_info; +} + +//---------------------------------------------------------------------------- +// Counter Chain Creation +//---------------------------------------------------------------------------- +// Recursively creates counter chain for each top-level loop. +static void createCounterChainRecursivly(OpBuilder &builder, Location loc, + LoopInfo *loop_info, + Value parent_counter) { + // Creates counter for this loop. + Value counter_index; + if (parent_counter) { + // Nested counter. + auto counter_op = builder.create( + loc, builder.getIndexType(), parent_counter, + builder.getIndexAttr(loop_info->lower_bound), + builder.getIndexAttr(loop_info->upper_bound), + builder.getIndexAttr(loop_info->step)); + counter_index = counter_op.getCounterIndex(); + } else { + // Top-level counter. + auto counter_op = builder.create( + loc, builder.getIndexType(), /*parent_index=*/nullptr, + builder.getIndexAttr(loop_info->lower_bound), + builder.getIndexAttr(loop_info->upper_bound), + builder.getIndexAttr(loop_info->step)); + counter_index = counter_op.getCounterIndex(); + } + + loop_info->counter_index = counter_index; + + // Recursively creates counters for child loops. + for (LoopInfo *child : loop_info->child_loops) { + createCounterChainRecursivly(builder, loc, child, counter_index); + } +} + +// Creates counter chain for all top-level loops. 
+static void createCounterChain(OpBuilder &builder, Location loc, + SmallVector &top_level_loops_info) { + for (LoopInfo *loop_info : top_level_loops_info) { + createCounterChainRecursivly(builder, loc, loop_info, nullptr); + } +} + +// Gets top-level loops' info (loops without parents). +static SmallVector +getTopLevelLoopsInfo(SmallVector &loops_info) { + SmallVector top_level_loops_info; + for (auto &loop_info : loops_info) { + if (!loop_info.parent_loop_info) { + top_level_loops_info.push_back(&loop_info); + } + } + return top_level_loops_info; +} + +//---------------------------------------------------------------------------- +// Hyperblock Creation +//---------------------------------------------------------------------------- +// Recursively extracts hyperblocks from a region. +static void extractHyperblocksInfoFromRegion( + Region &region, + const DenseMap &loop_info_map, + SmallVector parent_indices, + SmallVector &hyperblocks_info) { + Block &block = region.front(); + SmallVector current_block_ops; + + for (Operation &op : block.getOperations()) { + if (auto for_op = dyn_cast(&op)) { + // Before processing the loop, emits any accumulated operations as a + // hyperblock. + if (!current_block_ops.empty()) { + HyperblockInfo info; + info.operations = current_block_ops; + info.trigger_indices = parent_indices; + info.is_loop_body = !parent_indices.empty(); + hyperblocks_info.push_back(info); + current_block_ops.clear(); + } + + // Gets the loop info. + LoopInfo *loop_info = loop_info_map.lookup(for_op); + assert(loop_info && "Loop not found in loop_info_map"); + + // Builds trigger indices for this loop (parent indices + this loop's + // index). + SmallVector loop_indices = parent_indices; + loop_indices.push_back(loop_info->counter_index); + + // Recursively extracts hyperblocks from the loop body.
+ extractHyperblocksInfoFromRegion(for_op.getRegion(), loop_info_map, + loop_indices, hyperblocks_info); + } else if (isa(&op) || + (isa(&op) && op.getOperands().empty())) { + // Skips TaskflowYieldOp and TaskflowCounterOp. + continue; + } else { + // Regular operation, accumulates it. + current_block_ops.push_back(&op); + } + } + + // Emits any remaining operations as a hyperblock. + if (!current_block_ops.empty()) { + HyperblockInfo info; + info.operations = current_block_ops; + info.trigger_indices = parent_indices; + info.is_loop_body = !parent_indices.empty(); + hyperblocks_info.push_back(info); + current_block_ops.clear(); + } +} + +// Extracts all hyperblocks from a task. +static SmallVector extractHyperblocksInfo( + TaskflowTaskOp task_op, + const DenseMap &loop_info_map) { + SmallVector hyperblocks_info; + // No parent indices for top-level hyperblocks (Not nested in a loop). + SmallVector empty_indices; + + extractHyperblocksInfoFromRegion(task_op.getBody(), loop_info_map, + empty_indices, hyperblocks_info); + + return hyperblocks_info; +} + +// Collects all indices that are actually used by operations in the hyperblock. +static SmallVector collectUsedIndices( + const SmallVector &operations, + const SmallVector &candidate_indices, + const DenseMap &loop_info_map) { + // Builds reverse mapping: counter -> induction variable. + DenseMap counter_to_indvar; + for (auto [loop_op, loop_info] : loop_info_map) { + counter_to_indvar[loop_info->counter_index] = loop_op.getInductionVar(); + } + + // Collects all values used by operations. + SetVector used_indvars_set; + for (Operation *op : operations) { + for (Value operand : op->getOperands()) { + used_indvars_set.insert(operand); + } + } + + // Returns in the same order as candidate_indices to maintain parent->child + // order. 
+ SmallVector used_counters; + for (Value counter : candidate_indices) { + if (counter_to_indvar.count(counter)) { + Value indvar = counter_to_indvar[counter]; + if (used_indvars_set.contains(indvar)) { + used_counters.push_back(counter); + } + } + } + + return used_counters; +} + +// Determines output types for the hyperblock based on operations. +static SmallVector +determineHyperblockOutputTypes(const SmallVector &operations) { + SmallVector output_types = {}; + + // Checks if there's an affine.yield operation. + for (Operation *op : operations) { + if (auto affine_yield = dyn_cast(op)) { + // Uses the operand types of affine.yield as output types. + for (Value operand : affine_yield.getOperands()) { + output_types.push_back(operand.getType()); + } + return output_types; + } + } + + // No affine.yield found, no output types needed. + return output_types; +} + +// Creates a taskflow.hyperblock operation from HyperblockInfo. +static TaskflowHyperblockOp createHyperblock( + OpBuilder &builder, Location loc, const HyperblockInfo &info, + Block *task_body, + const DenseMap &loop_info_map) { + // Collects only the indices that are actually used in the hyperblock. + SmallVector used_indices = + collectUsedIndices(info.operations, info.trigger_indices, loop_info_map); + + // Determines output types for the hyperblock based on operations. + SmallVector output_types = + determineHyperblockOutputTypes(info.operations); + + // Creates the hyperblock operation. + TaskflowHyperblockOp hyperblock_op = + builder.create(loc, output_types, used_indices); + Block *hyperblock_body = new Block(); + hyperblock_op.getBody().push_back(hyperblock_body); + + // Adds block arguments for the used indices. + for (Value idx : used_indices) { + hyperblock_body->addArgument(idx.getType(), loc); + } + + // Clone operations into the hyperblock body. 
+ OpBuilder hyperblock_builder(hyperblock_body, hyperblock_body->begin()); + IRMapping mapping; + + // Maps used indices to block arguments + for (auto [idx, arg] : + llvm::zip(used_indices, hyperblock_body->getArguments())) { + mapping.map(idx, arg); + } + + // Creates a mapping from loop counters to loop induction variables. + DenseMap counter_to_indvar; + for (auto [loop_op, loop_info] : loop_info_map) { + counter_to_indvar[loop_info->counter_index] = loop_op.getInductionVar(); + } + + // Maps loop induction variables to hyperblock block arguments. + for (auto [idx, arg] : + llvm::zip(used_indices, hyperblock_body->getArguments())) { + if (counter_to_indvar.count(idx)) { + Value indvar = counter_to_indvar[idx]; + mapping.map(indvar, arg); + } + } + + // Clones all operations and handle terminators. + bool has_terminator = false; + for (Operation *op : info.operations) { + // Handles affine.yield specially - convert to hyperblock.yield. + if (auto affine_yield = dyn_cast(op)) { + // Maps the yield operands through the IRMapping. + SmallVector yield_operands; + for (Value operand : affine_yield.getOperands()) { + Value mapped_operand = mapping.lookupOrDefault(operand); + yield_operands.push_back(mapped_operand); + } + + // Creates hyperblock.yield with the mapped operands. + hyperblock_builder.create(loc, yield_operands); + has_terminator = true; + continue; + } + + // Clones regular operations. + hyperblock_builder.clone(*op, mapping); + } + + // Adds terminator if the last operation wasn't already a yield. 
+ if (!has_terminator) { + hyperblock_builder.setInsertionPointToEnd(hyperblock_body); + hyperblock_builder.create(loc); + } + + MLIRContext *context = hyperblock_op.getContext(); + RewritePatternSet patterns(context); + + populateAffineToStdConversionPatterns(patterns); + ConversionTarget target(*context); + target.addLegalDialect(); + target.addIllegalOp(); + if (failed( + applyPartialConversion(hyperblock_op, target, std::move(patterns)))) { + assert(false && "Affine to Standard conversion failed."); + } + + return hyperblock_op; +} + +//---------------------------------------------------------------------------- +// Task Transformation +//---------------------------------------------------------------------------- +// The main transformation function for TaskflowTaskOp. +static LogicalResult transformTask(TaskflowTaskOp task_op) { + Location loc = task_op.getLoc(); + + // Step 1: Collects loop information. + DenseMap loop_info_map; + SmallVector loops_info = collectLoopInfo(task_op); + for (auto &loop_info : loops_info) { + loop_info_map[loop_info.for_op] = &loop_info; + } + + // Gets the body block of the task. + Block *task_body = &task_op.getBody().front(); + + // Finds the first loop in the task body. + affine::AffineForOp first_loop_op = nullptr; + for (Operation &op : task_body->getOperations()) { + if (auto for_op = dyn_cast(&op)) { + first_loop_op = for_op; + break; + } + } + + assert(first_loop_op && "No loops found in the task body."); + + // Step 2: Creates counter chain before the first loop. + OpBuilder builder(first_loop_op); + SmallVector top_level_loops_info = + getTopLevelLoopsInfo(loops_info); + createCounterChain(builder, loc, top_level_loops_info); + + // Step 3: Extracts hyperblocks from task. + SmallVector hyperblocks_info = + extractHyperblocksInfo(task_op, loop_info_map); + + // Step 4: Creates taskflow.hyperblock operations for each hyperblock. + builder.setInsertionPoint(first_loop_op); + + // Collects all operations to erase. 
+  SmallVector<Operation *> ops_to_erase;
+  for (Operation &op : llvm::make_early_inc_range(task_body->getOperations())) {
+    if (!isa<CounterOp, TaskflowYieldOp>(&op)) {
+      ops_to_erase.push_back(&op);
+    }
+  }
+
+  // Creates hyperblock ops.
+  for (const auto &info : hyperblocks_info) {
+    createHyperblock(builder, loc, info, task_body, loop_info_map);
+  }
+
+  // Erases original operations.
+  for (Operation *op : ops_to_erase) {
+    op->erase();
+  }
+
+  return success();
+}
+
+struct ConstructHyperblockFromTaskPass
+    : public PassWrapper<ConstructHyperblockFromTaskPass,
+                         OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConstructHyperblockFromTaskPass)
+
+  StringRef getArgument() const final {
+    return "construct-hyperblock-from-task";
+  }
+
+  StringRef getDescription() const final {
+    return "Constructs hyperblocks and counter chains from Taskflow tasks.";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<taskflow::TaskflowDialect, affine::AffineDialect,
+                    memref::MemRefDialect, arith::ArithDialect>();
+  }
+
+  void runOnOperation() override {
+    func::FuncOp func_op = getOperation();
+    // Collects all tasks.
+    SmallVector<TaskflowTaskOp> tasks;
+    func_op.walk([&](TaskflowTaskOp task_op) { tasks.push_back(task_op); });
+
+    // Transforms each task.
+ for (TaskflowTaskOp task_op : tasks) { + if (failed(transformTask(task_op))) { + signalPassFailure(); + return; + } + } + } +}; +} // namespace + +std::unique_ptr mlir::taskflow::createConstructHyperblockFromTaskPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir new file mode 100644 index 00000000..ac2881c1 --- /dev/null +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -0,0 +1,141 @@ +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: | FileCheck %s --check-prefixes=TASKFLOW + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: | FileCheck %s --check-prefixes=HYPERBLOCK + +module attributes {} { + func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { + affine.for %arg10 = 0 to 4 { + affine.for %arg11 = 0 to 8 { + affine.for %arg12 = 0 to 6 { + %1 = affine.load %arg0[%arg10, %arg11, %arg12] : memref + affine.store %1, %arg5[%arg12] : memref + } + affine.for %arg12 = 0 to 5 { + %1 = affine.load %arg1[%arg10, %arg11, %arg12] : memref + %2 = affine.load %arg2[%arg10, %arg11, %arg12] : memref + %3 = arith.addi %1, %2 : i32 + affine.store %3, %arg6[%arg12] : memref + } + affine.for %arg12 = 0 to 6 { + %1 = affine.load %arg5[%arg12] : memref + %2 = affine.load %arg6[%arg12] : memref + %3 = arith.addi %1, %2 : i32 + %4 = affine.load %arg9[0] : memref + %5 = arith.addi %4, %3 : i32 + affine.store %5, %arg9[0] : memref + } + } + affine.for %arg11 = 0 to 7 { + %1 = affine.load %arg3[%arg10, %arg11] : memref + affine.store %1, %arg7[%arg11] : memref + } + affine.for %arg11 = 0 to 9 { + %1 = affine.load %arg4[%arg10, 
%arg11] : memref + %2 = affine.load %arg7[%arg11] : memref + %3 = arith.addi %1, %2 : i32 + affine.store %3, %arg8[%arg11] : memref + } + } + %0 = affine.load %arg9[0] : memref + return %0 : i32 + } +} + +// TASKFLOW: module { +// TASKFLOW-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// TASKFLOW-NEXT: %memory_outputs:5 = "taskflow.task"(%arg0, %arg1, %arg2, %arg5, %arg6, %arg9, %arg3, %arg4, %arg7, %arg8) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref, %arg16: memref, %arg17: memref, %arg18: memref, %arg19: memref): +// TASKFLOW-NEXT: affine.for %arg20 = 0 to 4 { +// TASKFLOW-NEXT: affine.for %arg21 = 0 to 8 { +// TASKFLOW-NEXT: affine.for %arg22 = 0 to 6 { +// TASKFLOW-NEXT: %1 = affine.load %arg10[%arg20, %arg21, %arg22] : memref +// TASKFLOW-NEXT: affine.store %1, %arg13[%arg22] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: affine.for %arg22 = 0 to 5 { +// TASKFLOW-NEXT: %1 = affine.load %arg11[%arg20, %arg21, %arg22] : memref +// TASKFLOW-NEXT: %2 = affine.load %arg12[%arg20, %arg21, %arg22] : memref +// TASKFLOW-NEXT: %3 = arith.addi %1, %2 : i32 +// TASKFLOW-NEXT: affine.store %3, %arg14[%arg22] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: affine.for %arg22 = 0 to 6 { +// TASKFLOW-NEXT: %1 = affine.load %arg13[%arg22] : memref +// TASKFLOW-NEXT: %2 = affine.load %arg14[%arg22] : memref +// TASKFLOW-NEXT: %3 = arith.addi %1, %2 : i32 +// TASKFLOW-NEXT: %4 = affine.load %arg15[0] : memref +// TASKFLOW-NEXT: %5 = arith.addi %4, %3 : i32 +// TASKFLOW-NEXT: affine.store %5, %arg15[0] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: affine.for 
%arg21 = 0 to 7 { +// TASKFLOW-NEXT: %1 = affine.load %arg16[%arg20, %arg21] : memref +// TASKFLOW-NEXT: affine.store %1, %arg18[%arg21] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: affine.for %arg21 = 0 to 9 { +// TASKFLOW-NEXT: %1 = affine.load %arg17[%arg20, %arg21] : memref +// TASKFLOW-NEXT: %2 = affine.load %arg18[%arg21] : memref +// TASKFLOW-NEXT: %3 = arith.addi %1, %2 : i32 +// TASKFLOW-NEXT: affine.store %3, %arg19[%arg21] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: "taskflow.yield"(%arg13, %arg14, %arg15, %arg18, %arg19) <{operandSegmentSizes = array}> : (memref, memref, memref, memref, memref) -> () +// TASKFLOW-NEXT: }) : (memref, memref, memref, memref, memref, memref, memref, memref, memref, memref) -> (memref, memref, memref, memref, memref) +// TASKFLOW-NEXT: %0 = affine.load %arg9[0] : memref +// TASKFLOW-NEXT: return %0 : i32 +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT:} + +// HYPERBLOCK: module { +// HYPERBLOCK-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// HYPERBLOCK-NEXT: %memory_outputs:5 = "taskflow.task"(%arg0, %arg1, %arg2, %arg5, %arg6, %arg9, %arg3, %arg4, %arg7, %arg8) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref, %arg16: memref, %arg17: memref, %arg18: memref, %arg19: memref): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: %3 = taskflow.counter parent(%2 : index) 
attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index +// HYPERBLOCK-NEXT: %4 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index +// HYPERBLOCK-NEXT: %5 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index +// HYPERBLOCK-NEXT: %6 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 7 : index} : index +// HYPERBLOCK-NEXT: %7 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 9 : index} : index +// HYPERBLOCK-NEXT: taskflow.hyperblock indices(%1, %2, %3 : index, index, index) { +// HYPERBLOCK-NEXT: ^bb0(%arg20: index, %arg21: index, %arg22: index): +// HYPERBLOCK-NEXT: %8 = memref.load %arg10[%arg20, %arg21, %arg22] : memref +// HYPERBLOCK-NEXT: memref.store %8, %arg13[%arg22] : memref +// HYPERBLOCK-NEXT: } -> () +// HYPERBLOCK-NEXT: taskflow.hyperblock indices(%1, %2, %4 : index, index, index) { +// HYPERBLOCK-NEXT: ^bb0(%arg20: index, %arg21: index, %arg22: index): +// HYPERBLOCK-NEXT: %8 = memref.load %arg11[%arg20, %arg21, %arg22] : memref +// HYPERBLOCK-NEXT: %9 = memref.load %arg12[%arg20, %arg21, %arg22] : memref +// HYPERBLOCK-NEXT: %10 = arith.addi %8, %9 : i32 +// HYPERBLOCK-NEXT: memref.store %10, %arg14[%arg22] : memref +// HYPERBLOCK-NEXT: } -> () +// HYPERBLOCK-NEXT: taskflow.hyperblock indices(%5 : index) { +// HYPERBLOCK-NEXT: ^bb0(%arg20: index): +// HYPERBLOCK-NEXT: %8 = memref.load %arg13[%arg20] : memref +// HYPERBLOCK-NEXT: %9 = memref.load %arg14[%arg20] : memref +// HYPERBLOCK-NEXT: %10 = arith.addi %8, %9 : i32 +// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %11 = memref.load %arg15[%c0] : memref +// HYPERBLOCK-NEXT: %12 = arith.addi %11, %10 : i32 +// HYPERBLOCK-NEXT: %c0_0 = arith.constant 0 : index +// HYPERBLOCK-NEXT: 
memref.store %12, %arg15[%c0_0] : memref +// HYPERBLOCK-NEXT: } -> () +// HYPERBLOCK-NEXT: taskflow.hyperblock indices(%1, %6 : index, index) { +// HYPERBLOCK-NEXT: ^bb0(%arg20: index, %arg21: index): +// HYPERBLOCK-NEXT: %8 = memref.load %arg16[%arg20, %arg21] : memref +// HYPERBLOCK-NEXT: memref.store %8, %arg18[%arg21] : memref +// HYPERBLOCK-NEXT: } -> () +// HYPERBLOCK-NEXT: taskflow.hyperblock indices(%1, %7 : index, index) { +// HYPERBLOCK-NEXT: ^bb0(%arg20: index, %arg21: index): +// HYPERBLOCK-NEXT: %8 = memref.load %arg17[%arg20, %arg21] : memref +// HYPERBLOCK-NEXT: %9 = memref.load %arg18[%arg21] : memref +// HYPERBLOCK-NEXT: %10 = arith.addi %8, %9 : i32 +// HYPERBLOCK-NEXT: memref.store %10, %arg19[%arg21] : memref +// HYPERBLOCK-NEXT: } -> () +// HYPERBLOCK-NEXT: "taskflow.yield"(%arg13, %arg14, %arg15, %arg18, %arg19) <{operandSegmentSizes = array}> : (memref, memref, memref, memref, memref) -> () +// HYPERBLOCK-NEXT: }) : (memref, memref, memref, memref, memref, memref, memref, memref, memref, memref) -> (memref, memref, memref, memref, memref) +// HYPERBLOCK-NEXT: %0 = affine.load %arg9[0] : memref +// HYPERBLOCK-NEXT: return %0 : i32 +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT:} \ No newline at end of file diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir new file mode 100644 index 00000000..ab4360ed --- /dev/null +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -0,0 +1,94 @@ +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: | FileCheck %s --check-prefixes=TASKFLOW + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: | FileCheck %s --check-prefixes=HYPERBLOCK + +module { + // Example: Parallel nested loops scenario + // Task 0: Single-level loop (vector scaling) + // Task 1: Two-level nested loop (matrix multiplication) + func.func @parallel_nested_example(%A: 
memref<16xf32>, + %B: memref<8x8xf32>, + %C: memref<8x8xf32>, + %D: memref<8x8xf32>, + %scalar: f32) { + // Task 0: Single-level loop - Vector scaling + // Computes: A[i] = A[i] * scalar + affine.for %i = 0 to 16 { + %v = affine.load %A[%i] : memref<16xf32> + %scaled = arith.mulf %v, %scalar : f32 + affine.store %scaled, %A[%i] : memref<16xf32> + } + + // Task 1: Two-level nested loop - Matrix multiplication + // Computes: D[i][j] = B[i][j] * C[i][j] (element-wise) + affine.for %i = 0 to 8 { + affine.for %j = 0 to 8 { + %b_val = affine.load %B[%i, %j] : memref<8x8xf32> + %c_val = affine.load %C[%i, %j] : memref<8x8xf32> + %product = arith.mulf %b_val, %c_val : f32 + affine.store %product, %D[%i, %j] : memref<8x8xf32> + } + } + return + } +} + +// TASKFLOW: module { +// TASKFLOW-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { +// TASKFLOW-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// TASKFLOW-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: f32): +// TASKFLOW-NEXT: affine.for %arg7 = 0 to 16 { +// TASKFLOW-NEXT: %0 = affine.load %arg5[%arg7] : memref<16xf32> +// TASKFLOW-NEXT: %1 = arith.mulf %0, %arg6 : f32 +// TASKFLOW-NEXT: affine.store %1, %arg5[%arg7] : memref<16xf32> +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: "taskflow.yield"(%arg5) <{operandSegmentSizes = array}> : (memref<16xf32>) -> () +// TASKFLOW-NEXT: }) : (memref<16xf32>, f32) -> memref<16xf32> +// TASKFLOW-NEXT: %memory_outputs_0 = "taskflow.task"(%arg1, %arg2, %arg3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ +// TASKFLOW-NEXT: ^bb0(%arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): +// TASKFLOW-NEXT: affine.for %arg8 = 0 to 8 { +// TASKFLOW-NEXT: affine.for %arg9 = 0 to 8 { +// TASKFLOW-NEXT: %0 = affine.load %arg5[%arg8, %arg9] : memref<8x8xf32> +// 
TASKFLOW-NEXT: %1 = affine.load %arg6[%arg8, %arg9] : memref<8x8xf32> +// TASKFLOW-NEXT: %2 = arith.mulf %0, %1 : f32 +// TASKFLOW-NEXT: affine.store %2, %arg7[%arg8, %arg9] : memref<8x8xf32> +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: "taskflow.yield"(%arg7) <{operandSegmentSizes = array}> : (memref<8x8xf32>) -> () +// TASKFLOW-NEXT: }) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> memref<8x8xf32> +// TASKFLOW-NEXT: return +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } + +// HYPERBLOCK: module { +// HYPERBLOCK-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { +// HYPERBLOCK-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: f32): +// HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index +// HYPERBLOCK-NEXT: taskflow.hyperblock indices(%0 : index) { +// HYPERBLOCK-NEXT: ^bb0(%arg7: index): +// HYPERBLOCK-NEXT: %1 = memref.load %arg5[%arg7] : memref<16xf32> +// HYPERBLOCK-NEXT: %2 = arith.mulf %1, %arg6 : f32 +// HYPERBLOCK-NEXT: memref.store %2, %arg5[%arg7] : memref<16xf32> +// HYPERBLOCK-NEXT: } -> () +// HYPERBLOCK-NEXT: "taskflow.yield"(%arg5) <{operandSegmentSizes = array}> : (memref<16xf32>) -> () +// HYPERBLOCK-NEXT: }) : (memref<16xf32>, f32) -> memref<16xf32> +// HYPERBLOCK-NEXT: %memory_outputs_0 = "taskflow.task"(%arg1, %arg2, %arg3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): +// HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: %1 = taskflow.counter parent(%0 : index) attributes 
{lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: taskflow.hyperblock indices(%0, %1 : index, index) { +// HYPERBLOCK-NEXT: ^bb0(%arg8: index, %arg9: index): +// HYPERBLOCK-NEXT: %2 = memref.load %arg5[%arg8, %arg9] : memref<8x8xf32> +// HYPERBLOCK-NEXT: %3 = memref.load %arg6[%arg8, %arg9] : memref<8x8xf32> +// HYPERBLOCK-NEXT: %4 = arith.mulf %2, %3 : f32 +// HYPERBLOCK-NEXT: memref.store %4, %arg7[%arg8, %arg9] : memref<8x8xf32> +// HYPERBLOCK-NEXT: } -> () +// HYPERBLOCK-NEXT: "taskflow.yield"(%arg7) <{operandSegmentSizes = array}> : (memref<8x8xf32>) -> () +// HYPERBLOCK-NEXT: }) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> memref<8x8xf32> +// HYPERBLOCK-NEXT: return +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: } \ No newline at end of file diff --git a/test/multi-cgra/taskflow/resenet/resnet.mlir b/test/multi-cgra/taskflow/resenet/resnet.mlir deleted file mode 100644 index f537fe8f..00000000 --- a/test/multi-cgra/taskflow/resenet/resnet.mlir +++ /dev/null @@ -1,74 +0,0 @@ -// RUN: cd %S && python resnet.py - -// RUN: mlir-neura-opt %S/Output/simple_resnet.mlir \ -// RUN: --convert-linalg-to-taskflow -o %t-resnet-taskflow.mlir - -// RUN: FileCheck %s --input-file=%t-resnet-taskflow.mlir - -// CHECK: %2 = taskflow.graph(%arg0, %cst_1, %cst_0, %1, %0, %cst) { -// CHECK-NEXT: ^bb0(%arg1: tensor<1x64x8x8xf32>, %arg2: f32, %arg3: tensor<64x64x3x3xf32>, %arg4: tensor<1x64x8x8xf32>, %arg5: tensor<1x64x8x8xf32>, %arg6: tensor<64x64x3x3xf32>): -// CHECK-NEXT: %data_outs = "taskflow.task"(%arg1, %arg2) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "task_0"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32): -// CHECK-NEXT: %padded = tensor.pad %arg7 low[0, 0, 1, 1] high[0, 0, 1, 1] { -// CHECK-NEXT: ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index): -// CHECK-NEXT: tensor.yield %arg8 : f32 -// CHECK-NEXT: } : tensor<1x64x8x8xf32> to tensor<1x64x10x10xf32> 
-// CHECK-NEXT: taskflow.yield %padded : tensor<1x64x10x10xf32> -// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, f32) -> tensor<1x64x10x10xf32> -// CHECK-NEXT: %3 = taskflow.channel %data_outs : tensor<1x64x10x10xf32> -> tensor<1x64x10x10xf32> -// CHECK-NEXT: %data_outs_2 = "taskflow.task"(%arg3, %arg4, %3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "conv2d_1"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<64x64x3x3xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: tensor<1x64x10x10xf32>): -// CHECK-NEXT: %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%data_outs, %arg7 : tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>, tensor<1x64x10x10xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %4 = taskflow.channel %data_outs_2 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %data_outs_3 = "taskflow.task"(%arg5, %arg2, %4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "generic_2"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32, %arg9: tensor<1x64x8x8xf32>): -// CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%data_outs_2 : tensor<1x64x8x8xf32>) outs(%arg7 : tensor<1x64x8x8xf32>) { -// CHECK-NEXT: ^bb0(%in: f32, %out: f32): -// CHECK-NEXT: %10 = arith.cmpf ugt, %in, %arg8 : f32 -// CHECK-NEXT: %11 = arith.select %10, %in, %arg8 : f32 -// CHECK-NEXT: linalg.yield %11 : f32 -// CHECK-NEXT: } -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, f32, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %5 = taskflow.channel %data_outs_3 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %data_outs_4 = 
"taskflow.task"(%arg2, %5) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "task_3"}> ({ -// CHECK-NEXT: ^bb0(%arg7: f32, %arg8: tensor<1x64x8x8xf32>): -// CHECK-NEXT: %padded = tensor.pad %data_outs_3 low[0, 0, 1, 1] high[0, 0, 1, 1] { -// CHECK-NEXT: ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index): -// CHECK-NEXT: tensor.yield %arg7 : f32 -// CHECK-NEXT: } : tensor<1x64x8x8xf32> to tensor<1x64x10x10xf32> -// CHECK-NEXT: taskflow.yield %padded : tensor<1x64x10x10xf32> -// CHECK-NEXT: }) : (f32, tensor<1x64x8x8xf32>) -> tensor<1x64x10x10xf32> -// CHECK-NEXT: %6 = taskflow.channel %data_outs_4 : tensor<1x64x10x10xf32> -> tensor<1x64x10x10xf32> -// CHECK-NEXT: %data_outs_5 = "taskflow.task"(%arg6, %arg4, %6) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "conv2d_4"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<64x64x3x3xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: tensor<1x64x10x10xf32>): -// CHECK-NEXT: %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%data_outs_4, %arg7 : tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>, tensor<1x64x10x10xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %7 = taskflow.channel %data_outs_5 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %data_outs_6 = "taskflow.task"(%arg1, %arg5, %7) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "generic_5"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: tensor<1x64x8x8xf32>): -// CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%data_outs_5, %arg7 : tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) { -// 
CHECK-NEXT: ^bb0(%in: f32, %in_8: f32, %out: f32): -// CHECK-NEXT: %10 = arith.addf %in, %in_8 : f32 -// CHECK-NEXT: linalg.yield %10 : f32 -// CHECK-NEXT: } -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %8 = taskflow.channel %data_outs_6 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %data_outs_7 = "taskflow.task"(%arg5, %arg2, %8) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "generic_6"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32, %arg9: tensor<1x64x8x8xf32>): -// CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%data_outs_6 : tensor<1x64x8x8xf32>) outs(%arg7 : tensor<1x64x8x8xf32>) { -// CHECK-NEXT: ^bb0(%in: f32, %out: f32): -// CHECK-NEXT: %10 = arith.cmpf ugt, %in, %arg8 : f32 -// CHECK-NEXT: %11 = arith.select %10, %in, %arg8 : f32 -// CHECK-NEXT: linalg.yield %11 : f32 -// CHECK-NEXT: } -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, f32, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.return %data_outs_7 : tensor<1x64x8x8xf32> -// CHECK-NEXT: } : (tensor<1x64x8x8xf32>, f32, tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, tensor<64x64x3x3xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: return %2 : tensor<1x64x8x8xf32> -// CHECK-NEXT: } \ No newline at end of file diff --git a/test/multi-cgra/taskflow/resenet/resnet.py b/test/multi-cgra/taskflow/resenet/resnet.py deleted file mode 100644 index 90523903..00000000 --- a/test/multi-cgra/taskflow/resenet/resnet.py +++ /dev/null @@ -1,53 +0,0 @@ -import torch -import torch.nn as nn -from torch._inductor.decomposition import decompositions as inductor_decomp -import os - - -class 
SimpleResNetBlock(nn.Module): - """ - Minimal ResNet Block: Conv -> ReLU -> Conv -> Add (residual) - """ - - def __init__(self, channels=64): - super().__init__() - self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False) - self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False) - - def forward(self, x): - residual = x - out = self.conv1(x) - out = torch.relu(out) - out = self.conv2(out) - out = out + residual # Residual connection - out = torch.relu(out) - return out - - -def generate_mlir(): - """Generate MLIR with Linalg ops""" - model = SimpleResNetBlock(channels=64) - model.eval() - - # Small input for quick testing: [batch, channels, height, width] - x = torch.randn(1, 64, 8, 8) - - # Export to MLIR via torch-mlir - try: - from torch_mlir import compile - - mlir_module = compile( - model, x, output_type="linalg-on-tensors", use_tracing=True - ) - output_dir = os.path.dirname(os.path.abspath(__file__)) - output_dir = os.path.join(output_dir, "Output") - os.makedirs(output_dir, exist_ok=True) - filename = os.path.join(output_dir, "simple_resnet.mlir") - with open(filename, "w") as f: - f.write(str(mlir_module)) - except ImportError: - print("Error: torch-mlir is not installed.\n") - - -if __name__ == "__main__": - generate_mlir() diff --git a/tools/mlir-neura-opt/CMakeLists.txt b/tools/mlir-neura-opt/CMakeLists.txt index 70c06a51..e1e49db2 100644 --- a/tools/mlir-neura-opt/CMakeLists.txt +++ b/tools/mlir-neura-opt/CMakeLists.txt @@ -5,8 +5,10 @@ set(LIBS ${dialect_libs} ${conversion_libs} MLIRNeuraTransforms + MLIRTaskflowTransforms MLIRConversion MLIRNeura + MLIRTaskflow MLIRTransforms MLIROptLib MLIRPass diff --git a/tools/mlir-neura-opt/mlir-neura-opt.cpp b/tools/mlir-neura-opt/mlir-neura-opt.cpp index cd824879..a4ac0e2e 100644 --- a/tools/mlir-neura-opt/mlir-neura-opt.cpp +++ b/tools/mlir-neura-opt/mlir-neura-opt.cpp @@ -17,6 +17,8 @@ #include "NeuraDialect/Architecture/ArchitectureSpec.h" #include 
"NeuraDialect/NeuraDialect.h" #include "NeuraDialect/NeuraPasses.h" +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowPasses.h" // Global variable to store architecture spec file path static std::string architecture_spec_file; @@ -71,10 +73,12 @@ int main(int argc, char **argv) { registry.insert(); registry.insert(); registry.insert(); + registry.insert(); mlir::neura::registerPasses(); mlir::registerPasses(); mlir::registerViewOpGraphPass(); + mlir::taskflow::registerPasses(); // Register all standard conversion passes mlir::registerConversionPasses();