diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td index 28641951..91f599a7 100644 --- a/include/TaskflowDialect/TaskflowOps.td +++ b/include/TaskflowDialect/TaskflowOps.td @@ -82,10 +82,10 @@ def TaskflowReturnOp : TaskflowOpBase<"return", [Terminator]> { // Defines a uniform computation and control task operation within a Taskflow graph. def TaskflowTaskOp : TaskflowOpBase<"task", [ + IsolatedFromAbove, AttrSizedOperandSegments, AttrSizedResultSegments, - SingleBlockImplicitTerminator<"TaskflowYieldOp">, - NoMemoryEffect, + SingleBlockImplicitTerminator<"TaskflowYieldOp"> ]>{ let summary = "Uniform computation and control task operation within a Taskflow graph"; @@ -156,10 +156,11 @@ def TaskflowDriveOp : TaskflowOpBase<"drive", [Pure]>{ let results = (outs TaskflowPacketType:$target); let assemblyFormat = [{ - $source attr-dict `:` type($source) `->` type($target) + $source attr-dict `:` qualified(type($source)) `->` qualified(type($target)) }]; } +// Defines the data dependency edge operation that carries data dependencies between tasks in a Taskflow graph. def TaskflowChannelOp : TaskflowOpBase<"channel", [Pure]>{ let summary = "Data dependency edge that carries data dependencies between tasks in a Taskflow graph"; let description = [{ @@ -175,4 +176,30 @@ def TaskflowChannelOp : TaskflowOpBase<"channel", [Pure]>{ }]; } +// Defines the emit operation to emit data and control packets from a task before termination (used in streaming or hierarchical control cases). Deliberately not marked Pure: the op has no results, so a side-effect-free emit would be trivially dead and could be erased by DCE/canonicalization. +def TaskflowEmitOp : TaskflowOpBase<"emit", []>{ + let summary = "Emit operation for taskflow.task to emit data and control packets before termination"; + let description = [{ + Emits control and data packets from a task before its termination. + This is useful in streaming or hierarchical control scenarios where tasks need to send out packets without terminating the entire task. + + Example: + taskflow.task(...) { + ... 
+ taskflow.emit %control_packet, %data_packet : !taskflow.packet<...>, i32 + taskflow.yield + } + }]; + let arguments = (ins Variadic:$results); + let assemblyFormat = [{ + ($results^ `:` type($results))? attr-dict + }]; + let builders = [ + // Default builder for empty emit. + OpBuilder<(ins), [{ + build($_builder, $_state, ValueRange{}); + }]> + ]; +} + #endif // TASKFLOW_OPS_TD \ No newline at end of file diff --git a/lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp b/lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp index a4489f44..7b4e851f 100644 --- a/lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp +++ b/lib/Conversion/LinalgToTaskflow/LinalgToTaskflowPass.cpp @@ -89,8 +89,7 @@ static void collectExternalValuesForOp( // Skips values defined inside graph ops or nested regions. Operation *def_op = operand.getDefiningOp(); if (def_op) { - if (!graph_op_set.contains(def_op) && - def_op->getBlock()->getParentOp() == func_op.getOperation()) { + if (def_op->getBlock()->getParentOp() == func_op.getOperation()) { external_values.insert(operand); } } @@ -147,9 +146,22 @@ static SmallVector identifyGraphInputs(ArrayRef graph_ops, func::FuncOp func_op) { llvm::SetVector input_set; llvm::DenseSet graph_op_set(graph_ops.begin(), graph_ops.end()); - + DenseMap> external_values_per_op = + collectExternalValuesPerOp(graph_ops, func_op); for (Operation *op : graph_ops) { - collectExternalValuesForOp(op, graph_op_set, func_op, input_set); + for (Value external_val : external_values_per_op[op]) { + if (external_val.getDefiningOp()) { + if (!graph_op_set.contains(external_val.getDefiningOp())) { + input_set.insert(external_val); + } + } else { + if (isa(external_val) && + external_val.getParentBlock()->getParentOp() == + func_op.getOperation()) { + input_set.insert(external_val); + } + } + } } return SmallVector(input_set.begin(), input_set.end()); @@ -396,24 +408,9 @@ static LogicalResult convertFuncToTaskflow(func::FuncOp func_op) { return success(); } - 
llvm::errs() << "Converting function: " << func_op.getName() << "\n"; - llvm::errs() << "Collected taskflow graph operations:\n"; - for (Operation *op : graph_ops) { - llvm::errs() << " " << *op << "\n"; - } - SmallVector graph_inputs = identifyGraphInputs(graph_ops, func_op); SmallVector graph_outputs = identifyGraphOutputs(graph_ops, func_op); - llvm::errs() << "Identified graph inputs:\n"; - for (Value input : graph_inputs) { - llvm::errs() << " " << input << "\n"; - } - llvm::errs() << "Identified graph outputs:\n"; - for (Value output : graph_outputs) { - llvm::errs() << " " << output << "\n"; - } - // Finds insertion point: after the last operation that defines a graph input. Operation *insertion_point = nullptr; for (Value input : graph_inputs) { @@ -441,10 +438,6 @@ static LogicalResult convertFuncToTaskflow(func::FuncOp func_op) { // Step 2 & 3 & 4: Creates the taskflow.graph op. auto result = buildTaskflowGraph(builder, func_op, graph_ops, graph_inputs, graph_outputs, op_external_values); - llvm::errs() << "Converted function to TaskFlow graph.\n"; - llvm::errs() << "Resulting function:\n"; - func_op.print(llvm::errs()); - llvm::errs() << "\n"; return result; } diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop-taskflow.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop-taskflow.mlir new file mode 100644 index 00000000..615c7c50 --- /dev/null +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop-taskflow.mlir @@ -0,0 +1,73 @@ +// RUN: mlir-neura-opt %s | FileCheck %s + +module { + func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c2_i32 = arith.constant 2 : i32 + %c8_i32 = arith.constant 8 : i32 + %c0_i32 = arith.constant 0 : i32 + %alloca = memref.alloca() : memref + %alloca_0 = memref.alloca() : memref<4x8xi32> + taskflow.graph(%c0_i32, %alloca_0, %alloca, %c2_i32, %c8_i32) { + ^bb0(%arg0: i32, %arg1: memref<4x8xi32>, %arg2: memref, %arg3: i32, %arg4: i32): + %data_outs = 
"taskflow.task"(%arg0) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ + ^bb0(%arg5: i32): + %7 = affine.for %arg6 = 0 to 5 iter_args(%arg7 = %arg5) -> (i32) { + %8 = arith.index_cast %arg6 : index to i32 + %9 = arith.addi %arg7, %8 : i32 + affine.yield %9 : i32 + } + taskflow.yield %7 : i32 + }) : (i32) -> i32 + %1 = taskflow.channel %data_outs : i32 -> i32 + %control_outs, %data_outs_1 = "taskflow.task"(%arg4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Controller_1"}> ({ + ^bb0(%arg5: i32): + affine.for %arg6 = 0 to 4 { + %7 = arith.index_cast %arg6 : index to i32 + %8 = arith.muli %7, %arg5 : i32 + taskflow.emit %arg6, %8 : index, i32 + } + taskflow.yield + }) : (i32) -> (!taskflow.packet, i32) + %2 = taskflow.channel %data_outs_1 : i32 -> i32 + %3 = taskflow.channel %data_outs_1 : i32 -> i32 + %4 = taskflow.drive %control_outs : !taskflow.packet -> !taskflow.packet + %5 = taskflow.drive %control_outs : !taskflow.packet -> !taskflow.packet + %data_outs_2 = "taskflow.task"(%5, %3, %arg1) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_2"}> ({ + ^bb0(%arg5: index, %arg6: i32, %arg7: memref<4x8xi32>): + affine.for %arg8 = 0 to 8 { + %7 = arith.index_cast %arg8 : index to i32 + %8 = arith.addi %arg6, %7 : i32 + memref.store %8, %arg7[%arg5, %arg8] : memref<4x8xi32> + } + taskflow.yield %arg7 : memref<4x8xi32> + }) : (!taskflow.packet, i32, memref<4x8xi32>) -> memref<4x8xi32> + %6 = taskflow.channel %data_outs_2 : memref<4x8xi32> -> memref<4x8xi32> + "taskflow.task"(%4, %2, %6, %1, %arg2, %arg3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_3"}> ({ + ^bb0(%arg5: index, %arg6: i32, %arg7: memref<4x8xi32>, %arg8: i32, %arg9: memref, %arg10: i32): + affine.for %arg11 = 0 to 8 { + %7 = memref.load %arg7[%arg5, %arg11] : memref<4x8xi32> + %8 = arith.addi %7, %arg8 : i32 + %c3 = arith.constant 3 : index + %9 = arith.cmpi eq, %arg5, %c3 : index + %c7 
= arith.constant 7 : index + %10 = arith.cmpi eq, %arg11, %c7 : index + %11 = arith.andi %9, %10 : i1 + scf.if %11 { + memref.store %8, %arg9[] : memref + %12 = arith.muli %8, %arg10 : i32 + memref.store %12, %arg9[] : memref + } + } + taskflow.yield %arg9 : memref + }) : (!taskflow.packet, i32, memref<4x8xi32>, i32, memref, i32) -> memref + } : (i32, memref<4x8xi32>, memref, i32, i32) -> () + %0 = affine.load %alloca[] : memref + return %0 : i32 + } +} + +// CHECK-LABEL: func.func @_Z21irregularLoopExample1v +// CHECK: taskflow.graph +// CHECK: taskflow.task +// CHECK: taskflow.channel +// CHECK: taskflow.yield \ No newline at end of file diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.cpp b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.cpp new file mode 100644 index 00000000..580e1971 --- /dev/null +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.cpp @@ -0,0 +1,35 @@ +using namespace std; + +#define M 4 +#define N 8 +#define K 5 + +// Example 1: Matrix processing + vectorization + RAW dependency +int irregularLoopExample1() { + // vector> A(M, vector(N, 0)); + int A[M][N]; + int B[M][N]; + int temp[N]; + + for (int i = 0; i < M; i++) { + // First independent loop: matrix initialization (Independent Loop 1) + for (int j = 0; j < N; j++) { + A[i][j] = i * N + j; + temp[j] = 0; // Initialize temp + } + + // Non-nested code segment + int sum = 0; + for (int k = 0; k < K; k++) { + sum += k; + } + + // Second independent loop: using the results of the first loop (Independent + // Loop 2 - RAW Dependency) RAW: depends on the writes to temp[j] above + for (int j = 0; j < N; j++) { + B[i][j] = A[i][j] + temp[j] + sum; // Read temp[j] (RAW dependency) + B[i][j] *= 2; + } + } + return B[M - 1][N - 1]; +} \ No newline at end of file diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested-taskflow.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested-taskflow.mlir new file mode 100644 index 00000000..e1f27dba --- /dev/null 
+++ b/test/multi-cgra/taskflow/multi-nested/multi-nested-taskflow.mlir @@ -0,0 +1,84 @@ +// RUN: mlir-neura-opt %s | FileCheck %s + +module { + func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { + taskflow.graph(%arg0, %arg5, %arg1, %arg2, %arg6, %arg9, %arg3, %arg7, %arg4, %arg8) { + ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref, %arg16: memref, %arg17: memref, %arg18: memref, %arg19: memref): + %data_outs = "taskflow.task"(%arg10, %arg11) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ + ^bb0(%arg20: memref, %arg21: memref): + affine.for %arg22 = 0 to 4 { + affine.for %arg23 = 0 to 8 { + affine.for %arg24 = 0 to 6 { + %4 = affine.load %arg20[%arg22, %arg23, %arg24] : memref + affine.store %4, %arg21[%arg24] : memref + } + } + } + taskflow.yield %arg21 : memref + }) : (memref, memref) -> memref + %1 = taskflow.channel %data_outs : memref -> memref + %data_outs_0 = "taskflow.task"(%arg12, %arg13, %arg14) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ + ^bb0(%arg20: memref, %arg21: memref, %arg22: memref): + affine.for %arg23 = 0 to 4 { + affine.for %arg24 = 0 to 8 { + affine.for %arg25 = 0 to 5 { + %4 = affine.load %arg20[%arg23, %arg24, %arg25] : memref + %5 = affine.load %arg21[%arg23, %arg24, %arg25] : memref + %6 = arith.addi %4, %5 : i32 + affine.store %6, %arg22[%arg25] : memref + } + } + } + taskflow.yield %arg22 : memref + }) : (memref, memref, memref) -> memref + %2 = taskflow.channel %data_outs_0 : memref -> memref + %data_outs_1 = "taskflow.task"(%1, %2, %arg15) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_2"}> ({ + ^bb0(%arg20: memref, %arg21: memref, %arg22: memref): + 
affine.for %arg23 = 0 to 4 { + affine.for %arg24 = 0 to 8 { + affine.for %arg25 = 0 to 6 { + %4 = affine.load %arg20[%arg25] : memref + %5 = affine.load %arg21[%arg25] : memref + %6 = arith.addi %4, %5 : i32 + %7 = affine.load %arg22[0] : memref + %8 = arith.addi %7, %6 : i32 + affine.store %8, %arg22[0] : memref + } + } + } + taskflow.yield %arg22 : memref + }) : (memref, memref, memref) -> memref + %data_outs_2 = "taskflow.task"(%arg16, %arg17) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_3"}> ({ + ^bb0(%arg20: memref, %arg21: memref): + affine.for %arg22 = 0 to 4 { + affine.for %arg23 = 0 to 7 { + %4 = affine.load %arg20[%arg22, %arg23] : memref + affine.store %4, %arg21[%arg23] : memref + } + } + taskflow.yield %arg21 : memref + }) : (memref, memref) -> memref + %3 = taskflow.channel %data_outs_2 : memref -> memref + %data_outs_3 = "taskflow.task"(%arg18, %3, %arg19) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_4"}> ({ + ^bb0(%arg20: memref, %arg21: memref, %arg22: memref): + affine.for %arg23 = 0 to 4 { + affine.for %arg24 = 0 to 9 { + %4 = affine.load %arg20[%arg23, %arg24] : memref + %5 = affine.load %arg21[%arg24] : memref + %6 = arith.addi %4, %5 : i32 + affine.store %6, %arg22[%arg24] : memref + } + } + taskflow.yield %arg22 : memref + }) : (memref, memref, memref) -> memref + } : (memref, memref, memref, memref, memref, memref, memref, memref, memref, memref) -> () + %0 = affine.load %arg9[0] : memref + return %0 : i32 + } +} + +// CHECK-LABEL: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_ +// CHECK: taskflow.graph +// CHECK: taskflow.task +// CHECK: taskflow.channel +// CHECK: taskflow.yield \ No newline at end of file diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.cpp b/test/multi-cgra/taskflow/multi-nested/multi-nested.cpp new file mode 100644 index 00000000..e228ba86 --- /dev/null +++ 
b/test/multi-cgra/taskflow/multi-nested/multi-nested.cpp @@ -0,0 +1,25 @@ +// Pure nested loop structure - no inter-loop computations +int pureNestedLoopExample(int d1[4][8][6], int d2[4][8][5], int d3[4][8][5], + int d4[4][7], int d5[4][9], int m1[6], int m2[5], + int m3[7], int m4[9], int *result) { + for (int i = 0; i < 4; i++) { // Loop A + for (int j = 0; j < 8; j++) { // Loop B + for (int k = 0; k < 6; k++) { // Loop C + m1[k] = d1[i][j][k]; + } + for (int k = 0; k < 5; k++) { // Loop D + m2[k] = d2[i][j][k] + d3[i][j][k]; + } + for (int k = 0; k < 6; k++) { // Loop E + *result += m1[k] + m2[k]; + } + } + for (int j = 0; j < 7; j++) { // Loop F + m3[j] = d4[i][j]; + } + for (int j = 0; j < 9; j++) { // Loop G + m4[j] = d5[i][j] + m3[j]; + } + } + return *result; +} \ No newline at end of file diff --git a/test/multi-cgra/taskflow/resenet/resnet.mlir b/test/multi-cgra/taskflow/resenet/resnet.mlir index f537fe8f..8042874c 100644 --- a/test/multi-cgra/taskflow/resenet/resnet.mlir +++ b/test/multi-cgra/taskflow/resenet/resnet.mlir @@ -5,70 +5,71 @@ // RUN: FileCheck %s --input-file=%t-resnet-taskflow.mlir -// CHECK: %2 = taskflow.graph(%arg0, %cst_1, %cst_0, %1, %0, %cst) { -// CHECK-NEXT: ^bb0(%arg1: tensor<1x64x8x8xf32>, %arg2: f32, %arg3: tensor<64x64x3x3xf32>, %arg4: tensor<1x64x8x8xf32>, %arg5: tensor<1x64x8x8xf32>, %arg6: tensor<64x64x3x3xf32>): -// CHECK-NEXT: %data_outs = "taskflow.task"(%arg1, %arg2) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "task_0"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32): -// CHECK-NEXT: %padded = tensor.pad %arg7 low[0, 0, 1, 1] high[0, 0, 1, 1] { -// CHECK-NEXT: ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index): -// CHECK-NEXT: tensor.yield %arg8 : f32 -// CHECK-NEXT: } : tensor<1x64x8x8xf32> to tensor<1x64x10x10xf32> -// CHECK-NEXT: taskflow.yield %padded : tensor<1x64x10x10xf32> -// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, f32) -> tensor<1x64x10x10xf32> -// 
CHECK-NEXT: %3 = taskflow.channel %data_outs : tensor<1x64x10x10xf32> -> tensor<1x64x10x10xf32> -// CHECK-NEXT: %data_outs_2 = "taskflow.task"(%arg3, %arg4, %3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "conv2d_1"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<64x64x3x3xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: tensor<1x64x10x10xf32>): -// CHECK-NEXT: %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%data_outs, %arg7 : tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>, tensor<1x64x10x10xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %4 = taskflow.channel %data_outs_2 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %data_outs_3 = "taskflow.task"(%arg5, %arg2, %4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "generic_2"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32, %arg9: tensor<1x64x8x8xf32>): -// CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%data_outs_2 : tensor<1x64x8x8xf32>) outs(%arg7 : tensor<1x64x8x8xf32>) { -// CHECK-NEXT: ^bb0(%in: f32, %out: f32): -// CHECK-NEXT: %10 = arith.cmpf ugt, %in, %arg8 : f32 -// CHECK-NEXT: %11 = arith.select %10, %in, %arg8 : f32 -// CHECK-NEXT: linalg.yield %11 : f32 -// CHECK-NEXT: } -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, f32, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %5 = taskflow.channel %data_outs_3 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %data_outs_4 = "taskflow.task"(%arg2, %5) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "task_3"}> ({ -// CHECK-NEXT: ^bb0(%arg7: 
f32, %arg8: tensor<1x64x8x8xf32>): -// CHECK-NEXT: %padded = tensor.pad %data_outs_3 low[0, 0, 1, 1] high[0, 0, 1, 1] { -// CHECK-NEXT: ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index): -// CHECK-NEXT: tensor.yield %arg7 : f32 -// CHECK-NEXT: } : tensor<1x64x8x8xf32> to tensor<1x64x10x10xf32> -// CHECK-NEXT: taskflow.yield %padded : tensor<1x64x10x10xf32> -// CHECK-NEXT: }) : (f32, tensor<1x64x8x8xf32>) -> tensor<1x64x10x10xf32> -// CHECK-NEXT: %6 = taskflow.channel %data_outs_4 : tensor<1x64x10x10xf32> -> tensor<1x64x10x10xf32> -// CHECK-NEXT: %data_outs_5 = "taskflow.task"(%arg6, %arg4, %6) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "conv2d_4"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<64x64x3x3xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: tensor<1x64x10x10xf32>): -// CHECK-NEXT: %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%data_outs_4, %arg7 : tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>, tensor<1x64x10x10xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %7 = taskflow.channel %data_outs_5 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %data_outs_6 = "taskflow.task"(%arg1, %arg5, %7) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "generic_5"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: tensor<1x64x8x8xf32>): -// CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%data_outs_5, %arg7 : tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) { -// CHECK-NEXT: ^bb0(%in: f32, %in_8: f32, %out: f32): -// CHECK-NEXT: %10 = arith.addf %in, %in_8 : f32 -// CHECK-NEXT: linalg.yield %10 : f32 -// 
CHECK-NEXT: } -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %8 = taskflow.channel %data_outs_6 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> -// CHECK-NEXT: %data_outs_7 = "taskflow.task"(%arg5, %arg2, %8) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "generic_6"}> ({ -// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32, %arg9: tensor<1x64x8x8xf32>): -// CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%data_outs_6 : tensor<1x64x8x8xf32>) outs(%arg7 : tensor<1x64x8x8xf32>) { -// CHECK-NEXT: ^bb0(%in: f32, %out: f32): -// CHECK-NEXT: %10 = arith.cmpf ugt, %in, %arg8 : f32 -// CHECK-NEXT: %11 = arith.select %10, %in, %arg8 : f32 -// CHECK-NEXT: linalg.yield %11 : f32 -// CHECK-NEXT: } -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> -// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, f32, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: taskflow.return %data_outs_7 : tensor<1x64x8x8xf32> -// CHECK-NEXT: } : (tensor<1x64x8x8xf32>, f32, tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, tensor<64x64x3x3xf32>) -> tensor<1x64x8x8xf32> -// CHECK-NEXT: return %2 : tensor<1x64x8x8xf32> +// CHECK: %2 = taskflow.graph(%arg0, %cst_1, %cst_0, %1, %0, %cst) { +// CHECK-NEXT: ^bb0(%arg1: tensor<1x64x8x8xf32>, %arg2: f32, %arg3: tensor<64x64x3x3xf32>, %arg4: tensor<1x64x8x8xf32>, %arg5: tensor<1x64x8x8xf32>, %arg6: tensor<64x64x3x3xf32>): +// CHECK-NEXT: %data_outs = "taskflow.task"(%arg1, %arg2) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "task_0"}> ({ +// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32): +// CHECK-NEXT: %padded = tensor.pad %arg7 low[0, 0, 1, 1] high[0, 0, 1, 1] { +// CHECK-NEXT: ^bb0(%arg9: 
index, %arg10: index, %arg11: index, %arg12: index): +// CHECK-NEXT: tensor.yield %arg8 : f32 +// CHECK-NEXT: } : tensor<1x64x8x8xf32> to tensor<1x64x10x10xf32> +// CHECK-NEXT: taskflow.yield %padded : tensor<1x64x10x10xf32> +// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, f32) -> tensor<1x64x10x10xf32> +// CHECK-NEXT: %3 = taskflow.channel %data_outs : tensor<1x64x10x10xf32> -> tensor<1x64x10x10xf32> +// CHECK-NEXT: %data_outs_2 = "taskflow.task"(%3, %arg3, %arg4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "conv2d_1"}> ({ +// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x10x10xf32>, %arg8: tensor<64x64x3x3xf32>, %arg9: tensor<1x64x8x8xf32>): +// CHECK-NEXT: %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg7, %arg8 : tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>) outs(%arg9 : tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> +// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> +// CHECK-NEXT: }) : (tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> +// CHECK-NEXT: %4 = taskflow.channel %data_outs_2 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> +// CHECK-NEXT: %data_outs_3 = "taskflow.task"(%4, %arg5, %arg2) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "generic_2"}> ({ +// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: f32): +// CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg7 : tensor<1x64x8x8xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) { +// CHECK-NEXT: ^bb0(%in: f32, %out: f32): +// CHECK-NEXT: %10 = arith.cmpf ugt, %in, %arg9 : f32 +// CHECK-NEXT: %11 = arith.select %10, %in, %arg9 : f32 +// CHECK-NEXT: linalg.yield %11 : f32 +// CHECK-NEXT: } -> tensor<1x64x8x8xf32> +// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> +// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, f32) -> 
tensor<1x64x8x8xf32> +// CHECK-NEXT: %5 = taskflow.channel %data_outs_3 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> +// CHECK-NEXT: %data_outs_4 = "taskflow.task"(%5, %arg2) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "task_3"}> ({ +// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: f32): +// CHECK-NEXT: %padded = tensor.pad %arg7 low[0, 0, 1, 1] high[0, 0, 1, 1] { +// CHECK-NEXT: ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index): +// CHECK-NEXT: tensor.yield %arg8 : f32 +// CHECK-NEXT: } : tensor<1x64x8x8xf32> to tensor<1x64x10x10xf32> +// CHECK-NEXT: taskflow.yield %padded : tensor<1x64x10x10xf32> +// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, f32) -> tensor<1x64x10x10xf32> +// CHECK-NEXT: %6 = taskflow.channel %data_outs_4 : tensor<1x64x10x10xf32> -> tensor<1x64x10x10xf32> +// CHECK-NEXT: %data_outs_5 = "taskflow.task"(%6, %arg6, %arg4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "conv2d_4"}> ({ +// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x10x10xf32>, %arg8: tensor<64x64x3x3xf32>, %arg9: tensor<1x64x8x8xf32>): +// CHECK-NEXT: %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg7, %arg8 : tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>) outs(%arg9 : tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> +// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> +// CHECK-NEXT: }) : (tensor<1x64x10x10xf32>, tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> +// CHECK-NEXT: %7 = taskflow.channel %data_outs_5 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> +// CHECK-NEXT: %data_outs_6 = "taskflow.task"(%7, %arg1, %arg5) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "generic_5"}> ({ +// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: tensor<1x64x8x8xf32>): +// CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map, #map1], iterator_types = ["parallel", "parallel", 
"parallel", "parallel"]} ins(%arg7, %arg8 : tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>) outs(%arg9 : tensor<1x64x8x8xf32>) { +// CHECK-NEXT: ^bb0(%in: f32, %in_8: f32, %out: f32): +// CHECK-NEXT: %10 = arith.addf %in, %in_8 : f32 +// CHECK-NEXT: linalg.yield %10 : f32 +// CHECK-NEXT: } -> tensor<1x64x8x8xf32> +// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> +// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> +// CHECK-NEXT: %8 = taskflow.channel %data_outs_6 : tensor<1x64x8x8xf32> -> tensor<1x64x8x8xf32> +// CHECK-NEXT: %data_outs_7 = "taskflow.task"(%8, %arg5, %arg2) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "generic_6"}> ({ +// CHECK-NEXT: ^bb0(%arg7: tensor<1x64x8x8xf32>, %arg8: tensor<1x64x8x8xf32>, %arg9: f32): +// CHECK-NEXT: %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg7 : tensor<1x64x8x8xf32>) outs(%arg8 : tensor<1x64x8x8xf32>) { +// CHECK-NEXT: ^bb0(%in: f32, %out: f32): +// CHECK-NEXT: %10 = arith.cmpf ugt, %in, %arg9 : f32 +// CHECK-NEXT: %11 = arith.select %10, %in, %arg9 : f32 +// CHECK-NEXT: linalg.yield %11 : f32 +// CHECK-NEXT: } -> tensor<1x64x8x8xf32> +// CHECK-NEXT: taskflow.yield %9 : tensor<1x64x8x8xf32> +// CHECK-NEXT: }) : (tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, f32) -> tensor<1x64x8x8xf32> +// CHECK-NEXT: taskflow.return %data_outs_7 : tensor<1x64x8x8xf32> +// CHECK-NEXT: } : (tensor<1x64x8x8xf32>, f32, tensor<64x64x3x3xf32>, tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>, tensor<64x64x3x3xf32>) -> tensor<1x64x8x8xf32> +// CHECK-NEXT: return %2 : tensor<1x64x8x8xf32> +// CHECK-NEXT: } // CHECK-NEXT: } \ No newline at end of file diff --git a/tools/mlir-neura-opt/CMakeLists.txt b/tools/mlir-neura-opt/CMakeLists.txt index 70c06a51..84980cd8 100644 --- a/tools/mlir-neura-opt/CMakeLists.txt +++ b/tools/mlir-neura-opt/CMakeLists.txt @@ -7,6 +7,7 @@ set(LIBS 
MLIRNeuraTransforms MLIRConversion MLIRNeura + MLIRTaskflow MLIRTransforms MLIROptLib MLIRPass diff --git a/tools/mlir-neura-opt/mlir-neura-opt.cpp b/tools/mlir-neura-opt/mlir-neura-opt.cpp index cd824879..55672b7c 100644 --- a/tools/mlir-neura-opt/mlir-neura-opt.cpp +++ b/tools/mlir-neura-opt/mlir-neura-opt.cpp @@ -17,6 +17,7 @@ #include "NeuraDialect/Architecture/ArchitectureSpec.h" #include "NeuraDialect/NeuraDialect.h" #include "NeuraDialect/NeuraPasses.h" +#include "TaskflowDialect/TaskflowDialect.h" // Global variable to store architecture spec file path static std::string architecture_spec_file; @@ -71,6 +72,7 @@ int main(int argc, char **argv) { registry.insert(); registry.insert(); registry.insert(); + registry.insert(); mlir::neura::registerPasses(); mlir::registerPasses();