diff --git a/include/TaskflowDialect/TaskflowAttributes.h b/include/TaskflowDialect/TaskflowAttributes.h
new file mode 100644
index 00000000..668d649d
--- /dev/null
+++ b/include/TaskflowDialect/TaskflowAttributes.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "llvm/ADT/StringRef.h"
+
+namespace mlir {
+namespace taskflow {
+namespace attr {
+// Attribute keys on taskflow.task operations produced by the
+// TaskDivisibilityAnalysisPass.
+constexpr llvm::StringLiteral kDivisibilityInfo = "divisibility_info";
+constexpr llvm::StringLiteral kDivisibility = "divisibility";
+constexpr llvm::StringLiteral kParallelDims = "parallel_dims";
+constexpr llvm::StringLiteral kParallelSpace = "parallel_space";
+
+namespace val {
+constexpr llvm::StringLiteral kDivisible = "divisible";
+constexpr llvm::StringLiteral kAtomic = "atomic";
+} // namespace val
+} // namespace attr
+} // namespace taskflow
+} // namespace mlir
\ No newline at end of file
diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
index 92393d7c..21d7f322 100644
--- a/include/TaskflowDialect/TaskflowPasses.h
+++ b/include/TaskflowDialect/TaskflowPasses.h
@@ -23,6 +23,7 @@ std::unique_ptr<Pass> createConstructHyperblockFromTaskPass();
 std::unique_ptr<Pass> createClassifyCountersPass();
 std::unique_ptr<Pass> createMapTaskOnCgraPass();
 std::unique_ptr<Pass> createFuseTaskPass();
+std::unique_ptr<Pass> createTaskDivisibilityAnalysisPass();
 
 //=========================================================//
 // Optimization Passes
diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td
index 5cf07cd7..5494d329 100644
--- a/include/TaskflowDialect/TaskflowPasses.td
+++ b/include/TaskflowDialect/TaskflowPasses.td
@@ -95,6 +95,33 @@ def FuseTask : Pass<"fuse-task", "func::FuncOp"> {
                            "mlir::taskflow::TaskflowDialect"];
 }
 
+def TaskDivisibilityAnalysis : Pass<"task-divisibility-analysis", "func::FuncOp"> {
+  let summary = "Analyzes task divisibility based on loop 
parallelism";
+  let description = [{
+    Analyzes each taskflow.task operation to determine whether its loop nest
+    contains parallel loops that can be tiled for data-level parallelism.
+
+    Task divisibility categories:
+    - divisible: The task has at least one parallel loop (no loop-carried
+      dependencies) whose trip count > 1. Such tasks can be tiled into
+      sibling sub-tasks for runtime configuration duplication (DLP).
+    - atomic: The task has no exploitable parallel loops. It must execute
+      as a single indivisible unit.
+
+    The pass attaches a single `divisibility_info` dictionary attribute to
+    each taskflow.task, with three entries:
+    - divisibility   : StringAttr ("divisible" or "atomic")
+    - parallel_dims  : DenseI32ArrayAttr (loop depth indices of parallel loops)
+    - parallel_space : DenseI32ArrayAttr (trip counts of the parallel dims)
+
+    Parallel loop detection uses MLIR's affine dependence analysis
+    (isLoopParallel). Loops carrying reductions (iter_args) are not treated
+    as parallel.
+  }];
+  let constructor = "taskflow::createTaskDivisibilityAnalysisPass()";
+  let dependentDialects = ["mlir::affine::AffineDialect",
+                           "mlir::func::FuncDialect"];
+}
+
 def MemoryAccessStreamingFusion : Pass<"memory-access-streaming-fusion", "func::FuncOp"> {
   let summary =
diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt
index 60078298..02b69ee5 100644
--- a/lib/TaskflowDialect/Transforms/CMakeLists.txt
+++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt
@@ -5,6 +5,7 @@ add_mlir_library(MLIRTaskflowTransforms
   ClassifyCountersPass.cpp
   MapTaskOnCgraPass.cpp
   FuseTaskPass.cpp
+  TaskDivisibilityAnalysisPass.cpp
 
   DEPENDS
   MLIRTaskflowTransformsIncGen
diff --git a/lib/TaskflowDialect/Transforms/TaskDivisibilityAnalysisPass.cpp b/lib/TaskflowDialect/Transforms/TaskDivisibilityAnalysisPass.cpp
new file mode 100644
index 00000000..f24b4004
--- /dev/null
+++ b/lib/TaskflowDialect/Transforms/TaskDivisibilityAnalysisPass.cpp
@@ -0,0 +1,220 @@
+//===- TaskDivisibilityAnalysisPass.cpp - Analyze 
task divisibility ----===//
+//
+// This pass analyzes each taskflow.task operation to determine whether its
+// loop nest contains parallel loops that can be tiled for data-level
+// parallelism (DLP).
+//
+// Task divisibility categories:
+//   - divisible: Has at least one parallel loop (no loop-carried deps) with
+//     trip_count > 1. Can be tiled into sibling sub-tasks for runtime
+//     configuration duplication.
+//   - atomic: No exploitable parallel loops. Must execute as a single
+//     indivisible unit.
+//
+// The pass attaches an attribute to each taskflow.task:
+//   divisibility_info = {
+//     divisibility   : StringAttr ("divisible" or "atomic")
+//     parallel_dims  : DenseI32ArrayAttr (loop depth indices of parallel loops)
+//     parallel_space : DenseI32ArrayAttr (trip counts of those parallel loops)
+//   }
+//
+//===----------------------------------------------------------------------===//
+
+#include "TaskflowDialect/TaskflowAttributes.h"
+#include "TaskflowDialect/TaskflowDialect.h"
+#include "TaskflowDialect/TaskflowOps.h"
+#include "TaskflowDialect/TaskflowPasses.h"
+
+#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
+#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Loop Nest Traversal Helpers
+//===----------------------------------------------------------------------===//
+
+// Collects the full loop nest starting from `outermost`, walking into
+// perfectly and imperfectly nested loops (only follows the first nested
+// affine.for at each level to form the "spine" of the nest).
+static SmallVector<affine::AffineForOp, 4>
+collectLoopNest(affine::AffineForOp outermost) {
+  SmallVector<affine::AffineForOp, 4> nest;
+  affine::AffineForOp current = outermost;
+
+  while (current) {
+    nest.push_back(current);
+
+    // Looks for a single nested affine.for in the body.
+    affine::AffineForOp nested = nullptr;
+    for (Operation &op : current.getBody()->getOperations()) {
+      if (auto for_op = dyn_cast<affine::AffineForOp>(&op)) {
+        if (nested) {
+          // Multiple nested loops — stop descending (not a simple chain).
+          nested = nullptr;
+          break;
+        }
+        nested = for_op;
+      }
+    }
+    current = nested;
+  }
+
+  return nest;
+}
+
+//===----------------------------------------------------------------------===//
+// Per-Task Parallelism Analysis
+//===----------------------------------------------------------------------===//
+
+struct TaskParallelismInfo {
+  StringRef divisibility;              // "divisible" or "atomic"
+  SmallVector<int32_t> parallel_dims;  // Loop depth indices of parallel loops.
+  SmallVector<int32_t> parallel_space; // Trip counts of parallel dims.
+};
+
+// Analyzes a single taskflow.task and determines its category.
+static TaskParallelismInfo analyzeTask(TaskflowTaskOp task_op) {
+  TaskParallelismInfo info;
+  info.divisibility = attr::val::kAtomic; // Default: no parallelism found.
+
+  // Finds the outermost affine.for in the task body.
+  affine::AffineForOp outermost_loop = nullptr;
+  task_op.getBody().walk([&](affine::AffineForOp for_op) {
+    // We want the outermost loop. Walk visits ops in pre-order,
+    // so the first affine.for encountered at the top level is outermost.
+    if (!outermost_loop) {
+      // Checks that this loop is at the top level of the task body
+      // (its parent is the task's block, not another loop).
+      if (for_op->getParentOp() == task_op.getOperation()) {
+        outermost_loop = for_op;
+      }
+    }
+  });
+
+  if (!outermost_loop) {
+    llvm::errs() << "[TaskDivisibilityAnalysis] Task " << task_op.getTaskName()
+                 << ": no affine.for found, classified as atomic\n";
+    return info;
+  }
+
+  // Collects the loop nest spine.
+  SmallVector<affine::AffineForOp, 4> loop_nest = collectLoopNest(outermost_loop);
+
+  llvm::errs() << "[TaskDivisibilityAnalysis] Task " << task_op.getTaskName()
+               << ": loop nest depth = " << loop_nest.size() << "\n";
+
+  // Analyzes each loop level for parallelism.
+  for (size_t depth = 0; depth < loop_nest.size(); ++depth) {
+    affine::AffineForOp loop = loop_nest[depth];
+
+    // Checks if the loop is parallel (not including reduction-parallel).
+    bool is_parallel = affine::isLoopParallel(loop);
+
+    // Gets the trip count.
+    std::optional<uint64_t> trip_count = affine::getConstantTripCount(loop);
+    int tc = trip_count.has_value() ? static_cast<int>(*trip_count) : -1;
+
+    llvm::errs() << "[TaskDivisibilityAnalysis] depth " << depth
+                 << ": parallel=" << is_parallel << ", trip_count=" << tc
+                 << "\n";
+
+    if (is_parallel && tc > 1) {
+      info.parallel_dims.push_back(static_cast<int32_t>(depth));
+      info.parallel_space.push_back(tc);
+    }
+  }
+
+  // Classifies based on whether any parallel dims were found.
+  if (!info.parallel_dims.empty()) {
+    info.divisibility = attr::val::kDivisible;
+  }
+
+  llvm::errs() << "[TaskDivisibilityAnalysis] Task " << task_op.getTaskName()
+               << " -> " << info.divisibility;
+  if (!info.parallel_dims.empty()) {
+    llvm::errs() << ", parallel_dims=[";
+    for (size_t i = 0; i < info.parallel_dims.size(); ++i) {
+      if (i > 0)
+        llvm::errs() << ",";
+      llvm::errs() << info.parallel_dims[i];
+    }
+    llvm::errs() << "], parallel_space=[";
+    for (size_t i = 0; i < info.parallel_space.size(); ++i) {
+      if (i > 0)
+        llvm::errs() << ",";
+      llvm::errs() << info.parallel_space[i];
+    }
+    llvm::errs() << "]";
+  }
+  llvm::errs() << "\n";
+
+  return info;
+}
+
+//===----------------------------------------------------------------------===//
+// Task Divisibility Analysis Pass
+//===----------------------------------------------------------------------===//
+
+struct TaskDivisibilityAnalysisPass
+    : public PassWrapper<TaskDivisibilityAnalysisPass,
+                         OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TaskDivisibilityAnalysisPass)
+
+  StringRef 
getArgument() const final { return "task-divisibility-analysis"; }
+
+  StringRef getDescription() const final {
+    return "Analyzes task divisibility based on loop parallelism";
+  }
+
+  void runOnOperation() override {
+    func::FuncOp func = getOperation();
+
+    llvm::errs() << "[TaskDivisibilityAnalysis] Running on function: "
+                 << func.getName() << "\n";
+
+    func.walk([&](TaskflowTaskOp task_op) {
+      // Analyzes the task.
+      TaskParallelismInfo info = analyzeTask(task_op);
+      // Attaches the divisibility_info attribute to each task.
+      MLIRContext *ctx = task_op.getContext();
+      OpBuilder builder(task_op);
+
+      SmallVector<NamedAttribute> div_attrs;
+      div_attrs.push_back(
+          NamedAttribute(StringAttr::get(ctx, attr::kDivisibility),
+                         StringAttr::get(ctx, info.divisibility)));
+      div_attrs.push_back(
+          NamedAttribute(StringAttr::get(ctx, attr::kParallelDims),
+                         DenseI32ArrayAttr::get(ctx, info.parallel_dims)));
+      div_attrs.push_back(
+          NamedAttribute(StringAttr::get(ctx, attr::kParallelSpace),
+                         DenseI32ArrayAttr::get(ctx, info.parallel_space)));
+
+      task_op->setAttr(attr::kDivisibilityInfo,
+                       builder.getDictionaryAttr(div_attrs));
+    });
+  }
+};
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Pass Registration
+//===----------------------------------------------------------------------===//
+
+std::unique_ptr<Pass> mlir::taskflow::createTaskDivisibilityAnalysisPass() {
+  return std::make_unique<TaskDivisibilityAnalysisPass>();
+}
diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir
index cc8e85d8..ac231e87 100644
--- a/test/multi-cgra/kernel_mapping/fir/fir.mlir
+++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir
@@ -2,6 +2,11 @@
 // RUN:   -o %t.taskflow.mlir
 // RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW
 
+// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
+// RUN:   --task-divisibility-analysis \
+// RUN:   -o %t.div.mlir
+// RUN: FileCheck %s --input-file=%t.div.mlir --check-prefixes=DIV
+
// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ // RUN: -o %t.hyperblock.mlir @@ -107,6 +112,24 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } +// DIV: module { +// DIV-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// DIV-NEXT: %c0_i32 = arith.constant 0 : i32 +// DIV-NEXT: %dependency_read_out:2, %value_outputs = taskflow.task @Task_0 dependency_read_in(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] {divisibility_info = {divisibility = "atomic", parallel_dims = array, parallel_space = array}} : (memref, memref, i32) -> (memref, memref, i32) { +// DIV-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// DIV-NEXT: %0 = affine.for %arg6 = 0 to 32 iter_args(%arg7 = %arg5) -> (i32) { +// DIV-NEXT: %1 = affine.load %arg3[%arg6] : memref +// DIV-NEXT: %2 = affine.load %arg4[%arg6] : memref +// DIV-NEXT: %3 = arith.muli %1, %2 : i32 +// DIV-NEXT: %4 = arith.addi %arg7, %3 : i32 +// DIV-NEXT: affine.yield %4 : i32 +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield reads(%arg3, %arg4 : memref, memref) values(%0 : i32) +// DIV-NEXT: } +// DIV-NEXT: return %value_outputs : i32 +// DIV-NEXT: } +// DIV-NEXT: } + // HYPERBLOCK: module { // HYPERBLOCK-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 diff --git a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir index 968031f0..aa7362eb 100644 --- a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir +++ b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir @@ -15,6 +15,15 @@ // RUN: -o %t.stream.mlir // RUN: FileCheck %s --input-file=%t.stream.mlir --check-prefixes=STREAM +// RUN: mlir-neura-opt %t.affine.mlir \ 
+// RUN: --affine-loop-tree-serialization \ +// RUN: --affine-loop-perfection \ +// RUN: --convert-affine-to-taskflow \ +// RUN: --memory-access-streaming-fusion \ +// RUN: --task-divisibility-analysis \ +// RUN: -o %t.div.mlir +// RUN: FileCheck %s --input-file=%t.div.mlir --check-prefixes=DIV + // RUN: mlir-neura-opt %t.stream.mlir \ // RUN: --affine-loop-tree-serialization \ // RUN: --affine-loop-perfection \ @@ -704,6 +713,187 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: } +// DIV: module attributes {torch.debug_module_name = "SimpleResNetBlock"} { +// DIV-NEXT: memref.global "private" constant @__constant_64xf32 : memref<64xf32> = dense<0.000000e+00> {alignment = 64 : i64} +// DIV-NEXT: memref.global "private" constant @__constant_64x3x3x64xf32_0 : memref<64x3x3x64xf32> = dense<-0.0151730878> {alignment = 64 : i64} +// DIV-NEXT: memref.global "private" constant @__constant_64x3x3x64xf32 : memref<64x3x3x64xf32> = dense<0.0197670367> {alignment = 64 : i64} +// DIV-NEXT: func.func @forward(%arg0: memref<1x64x8x8xf32>) -> memref<1x64x8x8xf32> { +// DIV-NEXT: %cst = arith.constant 0.0197670367 : f32 +// DIV-NEXT: %cst_0 = arith.constant -0.0151730878 : f32 +// DIV-NEXT: %cst_1 = arith.constant 3.40282347E+38 : f32 +// DIV-NEXT: %cst_2 = arith.constant 0.000000e+00 : f32 +// DIV-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// DIV-NEXT: %dependency_read_out, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0 : memref<1x64x8x8xf32>) dependency_write_in(%alloc : memref<1x8x8x64xf32>) [original_read_memrefs(%arg0 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc : memref<1x8x8x64xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): 
+// DIV-NEXT: affine.for %arg3 = 0 to 1 { +// DIV-NEXT: affine.for %arg4 = 0 to 8 { +// DIV-NEXT: affine.for %arg5 = 0 to 8 { +// DIV-NEXT: affine.for %arg6 = 0 to 64 { +// DIV-NEXT: %0 = affine.load %arg1[%arg3, %arg6, %arg4, %arg5] : memref<1x64x8x8xf32> +// DIV-NEXT: affine.store %0, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<1x8x8x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x8x8x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> +// DIV-NEXT: %dependency_write_out_4 = taskflow.task @Task_1 dependency_write_in(%alloc_3 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_3 : memref<1x10x10x64xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: f32): +// DIV-NEXT: affine.for %arg3 = 0 to 1 { +// DIV-NEXT: affine.for %arg4 = 0 to 10 { +// DIV-NEXT: affine.for %arg5 = 0 to 10 { +// DIV-NEXT: affine.for %arg6 = 0 to 64 { +// DIV-NEXT: affine.store %arg2, %arg1[%arg3, %arg4, %arg5, %arg6] : memref<1x10x10x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield writes(%arg1 : memref<1x10x10x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %alloc_5 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// DIV-NEXT: %dependency_write_out_6 = taskflow.task @Task_2 dependency_write_in(%alloc_5 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: f32): +// DIV-NEXT: affine.for %arg3 = 0 to 1 { 
+// DIV-NEXT: affine.for %arg4 = 0 to 8 { +// DIV-NEXT: affine.for %arg5 = 0 to 8 { +// DIV-NEXT: affine.for %arg6 = 0 to 64 { +// DIV-NEXT: affine.store %arg2, %arg1[%arg3, %arg4, %arg5, %arg6] : memref<1x8x8x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %dependency_read_out_7:2, %dependency_write_out_8 = taskflow.task @Task_3 dependency_read_in(%dependency_write_out_4, %dependency_write_out_6 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) dependency_write_in(%dependency_write_out_6 : memref<1x8x8x64xf32>) value_inputs(%cst_0 : f32) [original_read_memrefs(%alloc_3, %alloc_5 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): +// DIV-NEXT: affine.for %arg5 = 0 to 1 { +// DIV-NEXT: affine.for %arg6 = 0 to 8 { +// DIV-NEXT: affine.for %arg7 = 0 to 8 { +// DIV-NEXT: affine.for %arg8 = 0 to 64 { +// DIV-NEXT: affine.for %arg9 = 0 to 3 { +// DIV-NEXT: affine.for %arg10 = 0 to 3 { +// DIV-NEXT: affine.for %arg11 = 0 to 64 { +// DIV-NEXT: %0 = affine.load %arg1[%arg5, %arg6 + %arg9, %arg7 + %arg10, %arg11] : memref<1x10x10x64xf32> +// DIV-NEXT: %1 = affine.load %arg3[%arg5, %arg6, %arg7, %arg8] : memref<1x8x8x64xf32> +// DIV-NEXT: %2 = arith.mulf %0, %arg4 : f32 +// DIV-NEXT: %3 = arith.addf %1, %2 : f32 +// DIV-NEXT: affine.store %3, %arg3[%arg5, %arg6, %arg7, %arg8] : memref<1x8x8x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield 
reads(%arg1, %arg3 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) writes(%arg3 : memref<1x8x8x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %alloc_9 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// DIV-NEXT: %dependency_read_out_10, %dependency_write_out_11 = taskflow.task @Task_4_Task_5_fused dependency_read_in(%dependency_write_out_8 : memref<1x8x8x64xf32>) dependency_write_in(%alloc_9 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_5 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_9 : memref<1x64x8x8xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: f32, %arg4: f32): +// DIV-NEXT: affine.for %arg5 = 0 to 1 { +// DIV-NEXT: affine.for %arg6 = 0 to 64 { +// DIV-NEXT: affine.for %arg7 = 0 to 8 { +// DIV-NEXT: affine.for %arg8 = 0 to 8 { +// DIV-NEXT: %0 = affine.load %arg1[%arg5, %arg7, %arg8, %arg6] : memref<1x8x8x64xf32> +// DIV-NEXT: %1 = arith.minimumf %0, %arg3 : f32 +// DIV-NEXT: %2 = arith.maximumf %1, %arg4 : f32 +// DIV-NEXT: affine.store %2, %arg2[%arg5, %arg6, %arg7, %arg8] : memref<1x64x8x8xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield reads(%arg1 : memref<1x8x8x64xf32>) writes(%arg2 : memref<1x64x8x8xf32>) +// DIV-NEXT: } +// DIV-NEXT: %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// DIV-NEXT: %dependency_read_out_13, %dependency_write_out_14 = taskflow.task @Task_6 dependency_read_in(%dependency_write_out_11 : memref<1x64x8x8xf32>) dependency_write_in(%alloc_12 : memref<1x8x8x64xf32>) [original_read_memrefs(%alloc_9 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_12 : memref<1x8x8x64xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, 
parallel_space = array}} : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): +// DIV-NEXT: affine.for %arg3 = 0 to 1 { +// DIV-NEXT: affine.for %arg4 = 0 to 8 { +// DIV-NEXT: affine.for %arg5 = 0 to 8 { +// DIV-NEXT: affine.for %arg6 = 0 to 64 { +// DIV-NEXT: %0 = affine.load %arg1[%arg3, %arg6, %arg4, %arg5] : memref<1x64x8x8xf32> +// DIV-NEXT: affine.store %0, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<1x8x8x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x8x8x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %alloc_15 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> +// DIV-NEXT: %dependency_write_out_16 = taskflow.task @Task_7 dependency_write_in(%alloc_15 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_15 : memref<1x10x10x64xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: f32): +// DIV-NEXT: affine.for %arg3 = 0 to 1 { +// DIV-NEXT: affine.for %arg4 = 0 to 10 { +// DIV-NEXT: affine.for %arg5 = 0 to 10 { +// DIV-NEXT: affine.for %arg6 = 0 to 64 { +// DIV-NEXT: affine.store %arg2, %arg1[%arg3, %arg4, %arg5, %arg6] : memref<1x10x10x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield writes(%arg1 : memref<1x10x10x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %alloc_17 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// DIV-NEXT: %dependency_write_out_18 = taskflow.task @Task_8 dependency_write_in(%alloc_17 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_17 : memref<1x8x8x64xf32>)] {divisibility_info = {divisibility = "divisible", 
parallel_dims = array, parallel_space = array}} : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: f32): +// DIV-NEXT: affine.for %arg3 = 0 to 1 { +// DIV-NEXT: affine.for %arg4 = 0 to 8 { +// DIV-NEXT: affine.for %arg5 = 0 to 8 { +// DIV-NEXT: affine.for %arg6 = 0 to 64 { +// DIV-NEXT: affine.store %arg2, %arg1[%arg3, %arg4, %arg5, %arg6] : memref<1x8x8x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %dependency_read_out_19:2, %dependency_write_out_20 = taskflow.task @Task_9 dependency_read_in(%dependency_write_out_16, %dependency_write_out_18 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) dependency_write_in(%dependency_write_out_18 : memref<1x8x8x64xf32>) value_inputs(%cst : f32) [original_read_memrefs(%alloc_15, %alloc_17 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_17 : memref<1x8x8x64xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): +// DIV-NEXT: affine.for %arg5 = 0 to 1 { +// DIV-NEXT: affine.for %arg6 = 0 to 8 { +// DIV-NEXT: affine.for %arg7 = 0 to 8 { +// DIV-NEXT: affine.for %arg8 = 0 to 64 { +// DIV-NEXT: affine.for %arg9 = 0 to 3 { +// DIV-NEXT: affine.for %arg10 = 0 to 3 { +// DIV-NEXT: affine.for %arg11 = 0 to 64 { +// DIV-NEXT: %0 = affine.load %arg1[%arg5, %arg6 + %arg9, %arg7 + %arg10, %arg11] : memref<1x10x10x64xf32> +// DIV-NEXT: %1 = affine.load %arg3[%arg5, %arg6, %arg7, %arg8] : memref<1x8x8x64xf32> +// DIV-NEXT: %2 = arith.mulf %0, %arg4 : f32 +// DIV-NEXT: %3 = arith.addf %1, %2 : f32 +// DIV-NEXT: 
affine.store %3, %arg3[%arg5, %arg6, %arg7, %arg8] : memref<1x8x8x64xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield reads(%arg1, %arg3 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) writes(%arg3 : memref<1x8x8x64xf32>) +// DIV-NEXT: } +// DIV-NEXT: %alloc_21 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// DIV-NEXT: %dependency_read_out_22:2, %dependency_write_out_23 = taskflow.task @Task_10_Task_11_Task_12_fused_fused dependency_read_in(%dependency_write_out_20, %dependency_read_out : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) dependency_write_in(%alloc_21 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_17, %arg0 : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>), original_write_memrefs(%alloc_21 : memref<1x64x8x8xf32>)] {divisibility_info = {divisibility = "divisible", parallel_dims = array, parallel_space = array}} : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { +// DIV-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: memref<1x64x8x8xf32>, %arg4: f32, %arg5: f32): +// DIV-NEXT: affine.for %arg6 = 0 to 1 { +// DIV-NEXT: affine.for %arg7 = 0 to 64 { +// DIV-NEXT: affine.for %arg8 = 0 to 8 { +// DIV-NEXT: affine.for %arg9 = 0 to 8 { +// DIV-NEXT: %0 = affine.load %arg1[%arg6, %arg8, %arg9, %arg7] : memref<1x8x8x64xf32> +// DIV-NEXT: %1 = affine.load %arg2[%arg6, %arg7, %arg8, %arg9] : memref<1x64x8x8xf32> +// DIV-NEXT: %2 = arith.addf %0, %1 : f32 +// DIV-NEXT: %3 = arith.minimumf %2, %arg4 : f32 +// DIV-NEXT: %4 = arith.maximumf %3, %arg5 : f32 +// DIV-NEXT: affine.store %4, %arg3[%arg6, %arg7, %arg8, %arg9] : memref<1x64x8x8xf32> +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: } +// DIV-NEXT: taskflow.yield reads(%arg1, %arg2 : memref<1x8x8x64xf32>, 
memref<1x64x8x8xf32>) writes(%arg3 : memref<1x64x8x8xf32>) +// DIV-NEXT: } +// DIV-NEXT: return %dependency_write_out_23 : memref<1x64x8x8xf32> +// DIV-NEXT: } +// DIV-NEXT: } + // RESOPT: module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // RESOPT-NEXT: memref.global "private" constant @__constant_64xf32 : memref<64xf32> = dense<0.000000e+00> {alignment = 64 : i64}