Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 30 additions & 3 deletions include/TaskflowDialect/TaskflowOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,10 @@ def TaskflowReturnOp : TaskflowOpBase<"return", [Terminator]> {

// Defines a uniform computation and control task operation within a Taskflow graph.
def TaskflowTaskOp : TaskflowOpBase<"task", [
IsolatedFromAbove,
AttrSizedOperandSegments,
AttrSizedResultSegments,
SingleBlockImplicitTerminator<"TaskflowYieldOp">,
NoMemoryEffect,
SingleBlockImplicitTerminator<"TaskflowYieldOp">
]>{
let summary = "Uniform computation and control task operation within a Taskflow graph";

Expand Down Expand Up @@ -156,10 +156,11 @@ def TaskflowDriveOp : TaskflowOpBase<"drive", [Pure]>{
let results = (outs TaskflowPacketType:$target);

let assemblyFormat = [{
$source attr-dict `:` type($source) `->` type($target)
$source attr-dict `:` qualified(type($source)) `->` qualified(type($target))
}];
}

// Defines the data dependency edge operation that carries data dependencies between tasks in a Taskflow graph.
def TaskflowChannelOp : TaskflowOpBase<"channel", [Pure]>{
let summary = "Data dependency edge that carries data dependencies between tasks in a Taskflow graph";
let description = [{
Expand All @@ -175,4 +176,30 @@ def TaskflowChannelOp : TaskflowOpBase<"channel", [Pure]>{
}];
}

// Defines the emit operation to emit data and control packets from a task before termination (used in streaming or hierarchical control cases).
// NOTE: This op is deliberately NOT marked `Pure`. It has no results and is not
// a terminator, so a side-effect-free emit would be considered trivially dead
// and erased by DCE / canonicalization, silently dropping the emitted packets.
def TaskflowEmitOp : TaskflowOpBase<"emit", []>{
  let summary = "Emit operation for taskflow.task to emit data and control packets before termination";
  let description = [{
    Emits control and data packets from a task before its termination.
    This is useful in streaming or hierarchical control scenarios where tasks need to send out packets without terminating the entire task.

    Example:
    taskflow.task(...) {
    ...
    taskflow.emit %control_packet, %data_packet : !taskflow.packet<...>, i32
    taskflow.yield
    }
  }];
  // The emitted values; may be empty (see the default builder below).
  let arguments = (ins Variadic<AnyType>:$results);
  // The `: types` suffix is only printed/parsed when at least one value is emitted.
  let assemblyFormat = [{
    ($results^ `:` type($results))? attr-dict
  }];
  let builders = [
    // Default builder for empty emit.
    OpBuilder<(ins), [{
      build($_builder, $_state, ValueRange{});
    }]>
  ];
}

#endif // TASKFLOW_OPS_TD
39 changes: 16 additions & 23 deletions lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,7 @@ static void collectExternalValuesForOp(
// Skips values defined inside graph ops or nested regions.
Operation *def_op = operand.getDefiningOp();
if (def_op) {
if (!graph_op_set.contains(def_op) &&
def_op->getBlock()->getParentOp() == func_op.getOperation()) {
if (def_op->getBlock()->getParentOp() == func_op.getOperation()) {
external_values.insert(operand);
}
}
Expand Down Expand Up @@ -147,9 +146,22 @@ static SmallVector<Value> identifyGraphInputs(ArrayRef<Operation *> graph_ops,
func::FuncOp func_op) {
llvm::SetVector<Value> input_set;
llvm::DenseSet<Operation *> graph_op_set(graph_ops.begin(), graph_ops.end());

DenseMap<Operation *, SmallVector<Value>> external_values_per_op =
collectExternalValuesPerOp(graph_ops, func_op);
for (Operation *op : graph_ops) {
collectExternalValuesForOp(op, graph_op_set, func_op, input_set);
for (Value external_val : external_values_per_op[op]) {
if (external_val.getDefiningOp()) {
if (!graph_op_set.contains(external_val.getDefiningOp())) {
input_set.insert(external_val);
}
} else {
if (isa<BlockArgument>(external_val) &&
external_val.getParentBlock()->getParentOp() ==
func_op.getOperation()) {
input_set.insert(external_val);
}
}
}
}

return SmallVector<Value>(input_set.begin(), input_set.end());
Expand Down Expand Up @@ -396,24 +408,9 @@ static LogicalResult convertFuncToTaskflow(func::FuncOp func_op) {
return success();
}

llvm::errs() << "Converting function: " << func_op.getName() << "\n";
llvm::errs() << "Collected taskflow graph operations:\n";
for (Operation *op : graph_ops) {
llvm::errs() << " " << *op << "\n";
}

SmallVector<Value> graph_inputs = identifyGraphInputs(graph_ops, func_op);
SmallVector<Value> graph_outputs = identifyGraphOutputs(graph_ops, func_op);

llvm::errs() << "Identified graph inputs:\n";
for (Value input : graph_inputs) {
llvm::errs() << " " << input << "\n";
}
llvm::errs() << "Identified graph outputs:\n";
for (Value output : graph_outputs) {
llvm::errs() << " " << output << "\n";
}

// Finds insertion point: after the last operation that defines a graph input.
Operation *insertion_point = nullptr;
for (Value input : graph_inputs) {
Expand Down Expand Up @@ -441,10 +438,6 @@ static LogicalResult convertFuncToTaskflow(func::FuncOp func_op) {
// Step 2 & 3 & 4: Creates the taskflow.graph op.
auto result = buildTaskflowGraph(builder, func_op, graph_ops, graph_inputs,
graph_outputs, op_external_values);
llvm::errs() << "Converted function to TaskFlow graph.\n";
llvm::errs() << "Resulting function:\n";
func_op.print(llvm::errs());
llvm::errs() << "\n";

return result;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// RUN: mlir-neura-opt %s | FileCheck %s
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How is the test/multi-cgra/taskflow/irregular-loop/irregular-loop.cpp compiled using lit?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just use mlir-neura-opt to parse the input IR and make sure the syntax is correct.


// Hand-written taskflow IR for the irregular-loop example. The test only runs
// this input through the tool to make sure the taskflow ops below parse.
module {
func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
%c2_i32 = arith.constant 2 : i32
%c8_i32 = arith.constant 8 : i32
%c0_i32 = arith.constant 0 : i32
%alloca = memref.alloca() : memref<i32>
%alloca_0 = memref.alloca() : memref<4x8xi32>
// The graph receives the three constants and the two allocas as block
// arguments %arg0..%arg4 and produces no results.
taskflow.graph(%c0_i32, %alloca_0, %alloca, %c2_i32, %c8_i32) {
^bb0(%arg0: i32, %arg1: memref<4x8xi32>, %arg2: memref<i32>, %arg3: i32, %arg4: i32):
// Task_0: starting from its i32 input, adds the loop indices 0..4 through
// iter_args and yields the accumulated value.
%data_outs = "taskflow.task"(%arg0) <{operandSegmentSizes = array<i32: 0, 1>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_0"}> ({
^bb0(%arg5: i32):
%7 = affine.for %arg6 = 0 to 5 iter_args(%arg7 = %arg5) -> (i32) {
%8 = arith.index_cast %arg6 : index to i32
%9 = arith.addi %arg7, %8 : i32
affine.yield %9 : i32
}
taskflow.yield %7 : i32
}) : (i32) -> i32
%1 = taskflow.channel %data_outs : i32 -> i32
// Controller_1: on each of the 4 iterations, emits the induction variable and
// (index * %arg5) from inside the loop; the terminating yield has no operands.
%control_outs, %data_outs_1 = "taskflow.task"(%arg4) <{operandSegmentSizes = array<i32: 0, 1>, resultSegmentSizes = array<i32: 1, 1>, task_name = "Controller_1"}> ({
^bb0(%arg5: i32):
affine.for %arg6 = 0 to 4 {
%7 = arith.index_cast %arg6 : index to i32
%8 = arith.muli %7, %arg5 : i32
taskflow.emit %arg6, %8 : index, i32
}
taskflow.yield
}) : (i32) -> (!taskflow.packet<index>, i32)
// Both Controller_1 results are fanned out twice: channels for the i32 data
// value, drives for the !taskflow.packet<index> value.
%2 = taskflow.channel %data_outs_1 : i32 -> i32
%3 = taskflow.channel %data_outs_1 : i32 -> i32
%4 = taskflow.drive %control_outs : !taskflow.packet<index> -> !taskflow.packet<index>
%5 = taskflow.drive %control_outs : !taskflow.packet<index> -> !taskflow.packet<index>
// Task_2: for columns 0..7, stores (%arg6 + column) into %arg7[%arg5, column]
// and yields the memref.
%data_outs_2 = "taskflow.task"(%5, %3, %arg1) <{operandSegmentSizes = array<i32: 1, 2>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_2"}> ({
^bb0(%arg5: index, %arg6: i32, %arg7: memref<4x8xi32>):
affine.for %arg8 = 0 to 8 {
%7 = arith.index_cast %arg8 : index to i32
%8 = arith.addi %arg6, %7 : i32
memref.store %8, %arg7[%arg5, %arg8] : memref<4x8xi32>
}
taskflow.yield %arg7 : memref<4x8xi32>
}) : (!taskflow.packet<index>, i32, memref<4x8xi32>) -> memref<4x8xi32>
%6 = taskflow.channel %data_outs_2 : memref<4x8xi32> -> memref<4x8xi32>
// Task_3: loads %arg7[%arg5, column] and adds %arg8. Only when %arg5 == 3 and
// column == 7 does it store that sum, and then the sum times %arg10, into %arg9.
// The task's single result is left unnamed here.
"taskflow.task"(%4, %2, %6, %1, %arg2, %arg3) <{operandSegmentSizes = array<i32: 1, 5>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_3"}> ({
^bb0(%arg5: index, %arg6: i32, %arg7: memref<4x8xi32>, %arg8: i32, %arg9: memref<i32>, %arg10: i32):
affine.for %arg11 = 0 to 8 {
%7 = memref.load %arg7[%arg5, %arg11] : memref<4x8xi32>
%8 = arith.addi %7, %arg8 : i32
%c3 = arith.constant 3 : index
%9 = arith.cmpi eq, %arg5, %c3 : index
%c7 = arith.constant 7 : index
%10 = arith.cmpi eq, %arg11, %c7 : index
%11 = arith.andi %9, %10 : i1
scf.if %11 {
memref.store %8, %arg9[] : memref<i32>
%12 = arith.muli %8, %arg10 : i32
memref.store %12, %arg9[] : memref<i32>
}
}
taskflow.yield %arg9 : memref<i32>
}) : (!taskflow.packet<index>, i32, memref<4x8xi32>, i32, memref<i32>, i32) -> memref<i32>
} : (i32, memref<4x8xi32>, memref<i32>, i32, i32) -> ()
// The function result is read back from the scalar alloca after the graph.
%0 = affine.load %alloca[] : memref<i32>
return %0 : i32
}
}

// CHECK-LABEL: func.func @_Z21irregularLoopExample1v
// CHECK: taskflow.graph
// CHECK: taskflow.task
// CHECK: taskflow.channel
// CHECK: taskflow.yield
35 changes: 35 additions & 0 deletions test/multi-cgra/taskflow/irregular-loop/irregular-loop.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
using namespace std;

#define M 4
#define N 8
#define K 5

// Example 1: for every row, fill A and clear temp, compute a small scalar
// reduction, then derive B from A, temp and that reduction. The second inner
// loop reads what the first one wrote (read-after-write on temp).
// Returns the bottom-right element of B.
int irregularLoopExample1() {
  int A[M][N];
  int B[M][N];
  int temp[N];

  for (int row = 0; row < M; ++row) {
    // Independent loop 1: initialise row `row` of A and reset temp.
    for (int col = 0; col < N; ++col) {
      temp[col] = 0;
      A[row][col] = row * N + col;
    }

    // Code between the two inner loops: sum of 0 .. K-1.
    int sum = 0;
    for (int step = 0; step < K; ++step) {
      sum = sum + step;
    }

    // Independent loop 2: consumes A and temp produced by loop 1
    // (RAW dependency on temp), then doubles the value.
    for (int col = 0; col < N; ++col) {
      const int combined = A[row][col] + temp[col] + sum;
      B[row][col] = combined * 2;
    }
  }

  return B[M - 1][N - 1];
}
84 changes: 84 additions & 0 deletions test/multi-cgra/taskflow/multi-nested/multi-nested-taskflow.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// RUN: mlir-neura-opt %s | FileCheck %s

// Hand-written taskflow IR for the multi-nested loop example: five tasks
// inside one taskflow.graph, connected through taskflow.channel ops.
module {
func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref<?x8x6xi32>, %arg1: memref<?x8x5xi32>, %arg2: memref<?x8x5xi32>, %arg3: memref<?x7xi32>, %arg4: memref<?x9xi32>, %arg5: memref<?xi32>, %arg6: memref<?xi32>, %arg7: memref<?xi32>, %arg8: memref<?xi32>, %arg9: memref<?xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
// All ten function arguments are passed into the graph (in a different order)
// and reappear as block arguments %arg10..%arg19.
taskflow.graph(%arg0, %arg5, %arg1, %arg2, %arg6, %arg9, %arg3, %arg7, %arg4, %arg8) {
^bb0(%arg10: memref<?x8x6xi32>, %arg11: memref<?xi32>, %arg12: memref<?x8x5xi32>, %arg13: memref<?x8x5xi32>, %arg14: memref<?xi32>, %arg15: memref<?xi32>, %arg16: memref<?x7xi32>, %arg17: memref<?xi32>, %arg18: memref<?x9xi32>, %arg19: memref<?xi32>):
// Task_0: 4x8x6 nest copying %arg20[i, j, k] into %arg21[k].
%data_outs = "taskflow.task"(%arg10, %arg11) <{operandSegmentSizes = array<i32: 0, 2>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_0"}> ({
^bb0(%arg20: memref<?x8x6xi32>, %arg21: memref<?xi32>):
affine.for %arg22 = 0 to 4 {
affine.for %arg23 = 0 to 8 {
affine.for %arg24 = 0 to 6 {
%4 = affine.load %arg20[%arg22, %arg23, %arg24] : memref<?x8x6xi32>
affine.store %4, %arg21[%arg24] : memref<?xi32>
}
}
}
taskflow.yield %arg21 : memref<?xi32>
}) : (memref<?x8x6xi32>, memref<?xi32>) -> memref<?xi32>
%1 = taskflow.channel %data_outs : memref<?xi32> -> memref<?xi32>
// Task_1: 4x8x5 nest storing %arg20[i, j, k] + %arg21[i, j, k] into %arg22[k].
%data_outs_0 = "taskflow.task"(%arg12, %arg13, %arg14) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_1"}> ({
^bb0(%arg20: memref<?x8x5xi32>, %arg21: memref<?x8x5xi32>, %arg22: memref<?xi32>):
affine.for %arg23 = 0 to 4 {
affine.for %arg24 = 0 to 8 {
affine.for %arg25 = 0 to 5 {
%4 = affine.load %arg20[%arg23, %arg24, %arg25] : memref<?x8x5xi32>
%5 = affine.load %arg21[%arg23, %arg24, %arg25] : memref<?x8x5xi32>
%6 = arith.addi %4, %5 : i32
affine.store %6, %arg22[%arg25] : memref<?xi32>
}
}
}
taskflow.yield %arg22 : memref<?xi32>
}) : (memref<?x8x5xi32>, memref<?x8x5xi32>, memref<?xi32>) -> memref<?xi32>
%2 = taskflow.channel %data_outs_0 : memref<?xi32> -> memref<?xi32>
// Task_2: consumes the channelled outputs of Task_0 and Task_1; a 4x8x6 nest
// accumulating %arg20[k] + %arg21[k] into %arg22[0].
%data_outs_1 = "taskflow.task"(%1, %2, %arg15) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_2"}> ({
^bb0(%arg20: memref<?xi32>, %arg21: memref<?xi32>, %arg22: memref<?xi32>):
affine.for %arg23 = 0 to 4 {
affine.for %arg24 = 0 to 8 {
affine.for %arg25 = 0 to 6 {
%4 = affine.load %arg20[%arg25] : memref<?xi32>
%5 = affine.load %arg21[%arg25] : memref<?xi32>
%6 = arith.addi %4, %5 : i32
%7 = affine.load %arg22[0] : memref<?xi32>
%8 = arith.addi %7, %6 : i32
affine.store %8, %arg22[0] : memref<?xi32>
}
}
}
taskflow.yield %arg22 : memref<?xi32>
}) : (memref<?xi32>, memref<?xi32>, memref<?xi32>) -> memref<?xi32>
// Task_3: 4x7 nest copying %arg20[i, j] into %arg21[j].
%data_outs_2 = "taskflow.task"(%arg16, %arg17) <{operandSegmentSizes = array<i32: 0, 2>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_3"}> ({
^bb0(%arg20: memref<?x7xi32>, %arg21: memref<?xi32>):
affine.for %arg22 = 0 to 4 {
affine.for %arg23 = 0 to 7 {
%4 = affine.load %arg20[%arg22, %arg23] : memref<?x7xi32>
affine.store %4, %arg21[%arg23] : memref<?xi32>
}
}
taskflow.yield %arg21 : memref<?xi32>
}) : (memref<?x7xi32>, memref<?xi32>) -> memref<?xi32>
%3 = taskflow.channel %data_outs_2 : memref<?xi32> -> memref<?xi32>
// Task_4: consumes the channelled output of Task_3; a 4x9 nest storing
// %arg20[i, j] + %arg21[j] into %arg22[j].
%data_outs_3 = "taskflow.task"(%arg18, %3, %arg19) <{operandSegmentSizes = array<i32: 0, 3>, resultSegmentSizes = array<i32: 0, 1>, task_name = "Task_4"}> ({
^bb0(%arg20: memref<?x9xi32>, %arg21: memref<?xi32>, %arg22: memref<?xi32>):
affine.for %arg23 = 0 to 4 {
affine.for %arg24 = 0 to 9 {
%4 = affine.load %arg20[%arg23, %arg24] : memref<?x9xi32>
%5 = affine.load %arg21[%arg24] : memref<?xi32>
%6 = arith.addi %4, %5 : i32
affine.store %6, %arg22[%arg24] : memref<?xi32>
}
}
taskflow.yield %arg22 : memref<?xi32>
}) : (memref<?x9xi32>, memref<?xi32>, memref<?xi32>) -> memref<?xi32>
} : (memref<?x8x6xi32>, memref<?xi32>, memref<?x8x5xi32>, memref<?x8x5xi32>, memref<?xi32>, memref<?xi32>, memref<?x7xi32>, memref<?xi32>, memref<?x9xi32>, memref<?xi32>) -> ()
// The function result is element 0 of the last function argument, read after
// the graph.
%0 = affine.load %arg9[0] : memref<?xi32>
return %0 : i32
}
}

// CHECK-LABEL: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_
// CHECK: taskflow.graph
// CHECK: taskflow.task
// CHECK: taskflow.channel
// CHECK: taskflow.yield
25 changes: 25 additions & 0 deletions test/multi-cgra/taskflow/multi-nested/multi-nested.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Pure nested loop structure - no inter-loop computations.
//
// For every (i, j): loop C copies d1[i][j][0..5] into m1, loop D stores
// d2[i][j][k] + d3[i][j][k] into m2[0..4], and loop E accumulates
// m1[k] + m2[k] into *result. For every i: loop F copies d4[i][0..6] into m3
// and loop G stores d5[i][j] + m3[j] into m4[0..8].
//
// Loops E and G iterate further than the declared extents of m2 (5) and
// m3 (7). Elements past those extents do not exist, so they are treated as
// contributing 0 instead of being read out of bounds.
//
// *result is accumulated into (not reset); its final value is returned.
int pureNestedLoopExample(int d1[4][8][6], int d2[4][8][5], int d3[4][8][5],
                          int d4[4][7], int d5[4][9], int m1[6], int m2[5],
                          int m3[7], int m4[9], int *result) {
  for (int i = 0; i < 4; i++) {     // Loop A
    for (int j = 0; j < 8; j++) {   // Loop B
      for (int k = 0; k < 6; k++) { // Loop C
        m1[k] = d1[i][j][k];
      }
      for (int k = 0; k < 5; k++) { // Loop D
        m2[k] = d2[i][j][k] + d3[i][j][k];
      }
      for (int k = 0; k < 6; k++) { // Loop E
        // m2 has only 5 elements; k == 5 must not index it.
        *result += m1[k] + (k < 5 ? m2[k] : 0);
      }
    }
    for (int j = 0; j < 7; j++) { // Loop F
      m3[j] = d4[i][j];
    }
    for (int j = 0; j < 9; j++) { // Loop G
      // m3 has only 7 elements; j == 7 and j == 8 must not index it.
      m4[j] = d5[i][j] + (j < 7 ? m3[j] : 0);
    }
  }
  return *result;
}
Loading