diff --git a/include/Conversion/ConversionPasses.h b/include/Conversion/ConversionPasses.h
index 14e27a03..c376ac70 100644
--- a/include/Conversion/ConversionPasses.h
+++ b/include/Conversion/ConversionPasses.h
@@ -23,6 +23,10 @@ std::unique_ptr<mlir::Pass> createLowerAffineToNeuraPass();
 // TaskFlow Conversion Passes.
 std::unique_ptr<mlir::Pass> createConvertAffineToTaskflowPass();
 std::unique_ptr<mlir::Pass> createConvertTaskflowToNeuraPass();
+
+// Memref SubView and Copy Conversion Passes.
+std::unique_ptr<mlir::Pass> createFoldSubViewPass();
+std::unique_ptr<mlir::Pass> createConvertCopyToAffineLoopsPass();
 
 #define GEN_PASS_REGISTRATION
 #include "Conversion/ConversionPasses.h.inc"
diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td
index ec0b8a56..8d902055 100644
--- a/include/Conversion/ConversionPasses.td
+++ b/include/Conversion/ConversionPasses.td
@@ -93,4 +93,35 @@ def ConvertTaskflowToNeura : Pass<"convert-taskflow-to-neura", "ModuleOp">{
   ];
 }
 
+def FoldSubView : Pass<"fold-subview", "func::FuncOp"> {
+  let summary = "Folds memref.subview into affine load/store operations";
+  let description = [{
+    Eliminates memref.subview operations by folding them into their users
+    (affine.load and affine.store). Adjusts indices to account for subview
+    offsets and strides, enabling direct access to the source memref.
+  }];
+  let constructor = "mlir::createFoldSubViewPass()";
+  let dependentDialects = [
+    "mlir::affine::AffineDialect",
+    "mlir::memref::MemRefDialect",
+    "mlir::arith::ArithDialect",
+    "mlir::func::FuncDialect"
+  ];
+}
+
+def ConvertCopyToAffineLoops : Pass<"convert-copy-to-affine-loops", "func::FuncOp"> {
+  let summary = "Converts memref.copy to explicit affine loop nests";
+  let description = [{
+    Converts memref.copy operations into explicit affine.for loop nests
+    with affine.load and affine.store operations. This makes data movement
+    explicit for CGRA compilation with global addressing.
+  }];
+  let constructor = "mlir::createConvertCopyToAffineLoopsPass()";
+  let dependentDialects = [
+    "mlir::affine::AffineDialect",
+    "mlir::memref::MemRefDialect",
+    "mlir::func::FuncDialect"
+  ];
+}
+
 #endif // CONVERSION_PASSES_TD
\ No newline at end of file
diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
index 71a3b510..bccdb84d 100644
--- a/include/TaskflowDialect/TaskflowPasses.h
+++ b/include/TaskflowDialect/TaskflowPasses.h
@@ -12,6 +12,10 @@
 #include <memory>
 namespace mlir {
 namespace taskflow {
+
+void registerTaskflowConversionPassPipeline();
+void registerTosaToAffineConversionPassPipeline();
+
 // Passes defined in TaskflowPasses.td
 #define GEN_PASS_DECL
 #include "TaskflowDialect/TaskflowPasses.h.inc"
diff --git a/lib/Conversion/AffineToTaskflow/CMakeLists.txt b/lib/Conversion/AffineToTaskflow/CMakeLists.txt
index bb4f3f52..a5e53989 100644
--- a/lib/Conversion/AffineToTaskflow/CMakeLists.txt
+++ b/lib/Conversion/AffineToTaskflow/CMakeLists.txt
@@ -2,6 +2,8 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
 add_mlir_conversion_library(MLIRAffineToTaskflowPass
   AffineToTaskflowPass.cpp
+  FoldSubViewPass.cpp
+  ConvertCopyToAffineLoopsPass.cpp
 
   DEPENDS
   MLIRConversionIncGen
diff --git a/lib/Conversion/AffineToTaskflow/ConvertCopyToAffineLoopsPass.cpp b/lib/Conversion/AffineToTaskflow/ConvertCopyToAffineLoopsPass.cpp
new file mode 100644
index 00000000..f2b30c3d
--- /dev/null
+++ b/lib/Conversion/AffineToTaskflow/ConvertCopyToAffineLoopsPass.cpp
@@ -0,0 +1,99 @@
+//===- ConvertCopyToAffineLoopsPass.cpp - Converts memref.copy to loops ---===//
+//
+// This pass converts memref.copy operations into explicit affine loop nests.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Conversion/ConversionPasses.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+
+namespace {
+// Converts memref.copy to nested affine loops with affine.load/store.
+struct CopyOpLoweringPattern : public OpRewritePattern<memref::CopyOp> {
+  using OpRewritePattern<memref::CopyOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(memref::CopyOp copy,
+                                PatternRewriter &rewriter) const override {
+    // Checks if the target has any users besides this copy.
+    // If the target (e.g., a subview) is only used by this copy and nothing
+    // else, this copy is dead code and should be removed without conversion.
+    Value target = copy.getTarget();
+    bool has_other_users = false;
+    for (auto *user : target.getUsers()) {
+      if (user != copy.getOperation()) {
+        has_other_users = true;
+        break;
+      }
+    }
+
+    if (!has_other_users) {
+      // Target has no users besides this copy, so just erase the copy.
+      rewriter.eraseOp(copy);
+      return success();
+    }
+
+    // Target has other users, convert copy to affine loops.
+    rewriter.setInsertionPoint(copy);
+    auto loc = copy.getLoc();
+    MemRefType memref_type = dyn_cast<MemRefType>(copy.getSource().getType());
+
+    // Creates explicit memory copy using an affine loop nest.
+    SmallVector<Value> ivs;
+    for (auto dim_size : memref_type.getShape()) {
+      auto loop = rewriter.create<affine::AffineForOp>(loc, 0, dim_size);
+      rewriter.setInsertionPointToStart(loop.getBody());
+      ivs.push_back(loop.getInductionVar());
+    }
+
+    // Creates affine load from source and store to target.
+    Value value =
+        rewriter.create<affine::AffineLoadOp>(loc, copy.getSource(), ivs);
+    rewriter.create<affine::AffineStoreOp>(loc, value, copy.getTarget(), ivs);
+
+    rewriter.eraseOp(copy);
+    return success();
+  }
+};
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Pass implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct ConvertCopyToAffineLoopsPass
+    : public PassWrapper<ConvertCopyToAffineLoopsPass,
+                         OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertCopyToAffineLoopsPass)
+
+  StringRef getArgument() const final { return "convert-copy-to-affine-loops"; }
+
+  StringRef getDescription() const final {
+    return "Convert memref.copy to explicit affine loop nests";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<affine::AffineDialect, memref::MemRefDialect,
+                    func::FuncDialect>();
+  }
+
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    patterns.add<CopyOpLoweringPattern>(&getContext());
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+      signalPassFailure();
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::createConvertCopyToAffineLoopsPass() {
+  return std::make_unique<ConvertCopyToAffineLoopsPass>();
+}
diff --git a/lib/Conversion/AffineToTaskflow/FoldSubViewPass.cpp b/lib/Conversion/AffineToTaskflow/FoldSubViewPass.cpp
new file mode 100644
index 00000000..08c78a0e
--- /dev/null
+++ b/lib/Conversion/AffineToTaskflow/FoldSubViewPass.cpp
@@ -0,0 +1,191 @@
+//===- FoldSubViewPass.cpp - Fold memref.subview into load/store ---------===//
+//
+// This pass folds memref.subview operations into their affine.load and
+// affine.store users by adjusting the access indices. Designed for CGRA
+// systems with global addressing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Conversion/ConversionPasses.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/SmallBitVector.h"
+
+using namespace mlir;
+
+//===----------------------------------------------------------------------===//
+// Utility functions.
+//===----------------------------------------------------------------------===//
+
+// Resolves the source indices for a load/store operation that accesses a
+// subview. Computes the adjusted indices to access the source memref directly.
+//
+// For example:
+//   %subview = memref.subview %source[%offset0, %offset1][...][%stride0,
+//   %stride1] %val = affine.load %subview[%i, %j]
+// becomes:
+//   %val = affine.load %source[%i * %stride0 + %offset0, %j * %stride1 +
+//   %offset1]
+static LogicalResult
+resolveSourceIndices(Location loc, PatternRewriter &rewriter,
+                     memref::SubViewOp sub_view_op, ValueRange indices,
+                     SmallVectorImpl<Value> &source_indices) {
+  SmallVector<OpFoldResult> mixed_offsets = sub_view_op.getMixedOffsets();
+  SmallVector<OpFoldResult> mixed_sizes = sub_view_op.getMixedSizes();
+  SmallVector<OpFoldResult> mixed_strides = sub_view_op.getMixedStrides();
+
+  SmallVector<Value> use_indices;
+  // Handles rank-reducing subviews: for every unit-dim size, adds a zero index.
+  unsigned result_dim = 0;
+  llvm::SmallBitVector unused_dims = sub_view_op.getDroppedDims();
+  for (auto dim :
+       llvm::seq<unsigned>(0, sub_view_op.getSourceType().getRank())) {
+    if (unused_dims.test(dim)) {
+      use_indices.push_back(rewriter.create<arith::ConstantIndexOp>(loc, 0));
+    } else {
+      use_indices.push_back(indices[result_dim++]);
+    }
+  }
+
+  if (use_indices.size() != mixed_offsets.size()) {
+    return failure();
+  }
+
+  source_indices.resize(use_indices.size());
+  for (auto index : llvm::seq<unsigned>(0, mixed_offsets.size())) {
+    SmallVector<Value> dynamic_operands;
+    AffineExpr expr = rewriter.getAffineDimExpr(0);
+    unsigned num_symbols = 0;
+    dynamic_operands.push_back(use_indices[index]);
+
+    // Multiplies by stride: index * stride.
+    if (auto attr = mixed_strides[index].dyn_cast<Attribute>()) {
+      int64_t stride_val = dyn_cast<IntegerAttr>(attr).getInt();
+      if (stride_val != 1) {
+        expr = expr * stride_val;
+      }
+    } else {
+      dynamic_operands.push_back(dyn_cast<Value>(mixed_strides[index]));
+      expr = expr * rewriter.getAffineSymbolExpr(num_symbols++);
+    }
+
+    // Adds offset: index * stride + offset.
+    if (auto attr = mixed_offsets[index].dyn_cast<Attribute>()) {
+      int64_t offset_val = dyn_cast<IntegerAttr>(attr).getInt();
+      if (offset_val != 0) {
+        expr = expr + offset_val;
+      }
+    } else {
+      dynamic_operands.push_back(dyn_cast<Value>(mixed_offsets[index]));
+      expr = expr + rewriter.getAffineSymbolExpr(num_symbols++);
+    }
+
+    source_indices[index] = rewriter.create<affine::AffineApplyOp>(
+        loc, AffineMap::get(1, num_symbols, expr), dynamic_operands);
+  }
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// Patterns
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Folds affine.load from a subview into a load from the source memref.
+class LoadOpOfSubViewFolder final
+    : public OpRewritePattern<affine::AffineLoadOp> {
+public:
+  using OpRewritePattern<affine::AffineLoadOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(affine::AffineLoadOp loadOp,
+                                PatternRewriter &rewriter) const override {
+    auto subViewOp = loadOp.getMemRef().getDefiningOp<memref::SubViewOp>();
+    if (!subViewOp)
+      return failure();
+
+    SmallVector<Value> sourceIndices;
+    if (failed(resolveSourceIndices(loadOp.getLoc(), rewriter, subViewOp,
+                                    loadOp.getIndices(), sourceIndices)))
+      return failure();
+
+    rewriter.replaceOpWithNewOp<affine::AffineLoadOp>(
+        loadOp, subViewOp.getSource(), sourceIndices);
+    return success();
+  }
+};
+
+/// Folds affine.store to a subview into a store to the source memref.
+class StoreOpOfSubViewFolder final
+    : public OpRewritePattern<affine::AffineStoreOp> {
+public:
+  using OpRewritePattern<affine::AffineStoreOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(affine::AffineStoreOp storeOp,
+                                PatternRewriter &rewriter) const override {
+    auto subViewOp = storeOp.getMemRef().getDefiningOp<memref::SubViewOp>();
+    if (!subViewOp)
+      return failure();
+
+    SmallVector<Value> sourceIndices;
+    if (failed(resolveSourceIndices(storeOp.getLoc(), rewriter, subViewOp,
+                                    storeOp.getIndices(), sourceIndices)))
+      return failure();
+
+    rewriter.replaceOpWithNewOp<affine::AffineStoreOp>(
+        storeOp, storeOp.getValue(), subViewOp.getSource(), sourceIndices);
+    return success();
+  }
+};
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Pass implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct FoldSubViewPass
+    : public PassWrapper<FoldSubViewPass, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(FoldSubViewPass)
+
+  StringRef getArgument() const final { return "fold-subview"; }
+
+  StringRef getDescription() const final {
+    return "Fold memref.subview into affine load/store operations";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<affine::AffineDialect, memref::MemRefDialect,
+                    arith::ArithDialect, func::FuncDialect>();
+  }
+
+  void runOnOperation() override {
+    // Step 1: Folds subviews into their load/store users.
+    RewritePatternSet patterns(&getContext());
+    patterns.add<LoadOpOfSubViewFolder, StoreOpOfSubViewFolder>(&getContext());
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+      signalPassFailure();
+    }
+
+    // Step 2: Cleans up dead subview operations that have no remaining users.
+    SmallVector<memref::SubViewOp> dead_sub_views;
+    getOperation().walk([&](memref::SubViewOp sub_view_op) {
+      if (sub_view_op->use_empty()) {
+        dead_sub_views.push_back(sub_view_op);
+      }
+    });
+
+    for (auto sub_view_op : dead_sub_views) {
+      sub_view_op.erase();
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::createFoldSubViewPass() {
+  return std::make_unique<FoldSubViewPass>();
+}
diff --git a/lib/TaskflowDialect/TaskflowPasses.cpp b/lib/TaskflowDialect/TaskflowPasses.cpp
index 1a10c2ef..6a11db81 100644
--- a/lib/TaskflowDialect/TaskflowPasses.cpp
+++ b/lib/TaskflowDialect/TaskflowPasses.cpp
@@ -1,7 +1,77 @@
 #include "TaskflowDialect/TaskflowPasses.h"
+#include "Conversion/ConversionPasses.h"
 #include "TaskflowDialect/TaskflowDialect.h"
 #include "TaskflowDialect/TaskflowOps.h"
-
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Conversion/TosaToArith/TosaToArith.h"
+#include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Dialect/Tosa/Transforms/Passes.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassRegistry.h"
-#include "mlir/Transforms/Passes.h"
\ No newline at end of file
+#include "mlir/Transforms/Passes.h"
+
+using namespace mlir;
+using namespace mlir::bufferization;
+
+// This pass pipeline can convert affine dialect into taskflow dialect with
+// neura.kernel op.
+void mlir::taskflow::registerTaskflowConversionPassPipeline() {
+  PassPipelineRegistration<>(
+      "taskflow-conversion",
+      "Converts affine dialects to taskflow dialect with neura.kernel ops.",
+      [](OpPassManager &pm) {
+        pm.addPass(mlir::createConvertAffineToTaskflowPass());
+        pm.addPass(mlir::taskflow::createConstructHyperblockFromTaskPass());
+        pm.addPass(mlir::taskflow::createClassifyCountersPass());
+        pm.addPass(mlir::createConvertTaskflowToNeuraPass());
+      });
+}
+
+// This pass pipeline converts TOSA dialect to Affine dialect with cleanup.
+void mlir::taskflow::registerTosaToAffineConversionPassPipeline() {
+  PassPipelineRegistration<>(
+      "tosa-to-affine-conversion",
+      "Complete pipeline: TOSA to Linalg to Affine with subview/copy cleanup",
+      [](OpPassManager &pm) {
+        // Step 1-3: TOSA to Linalg (function-level passes).
+        pm.nest<func::FuncOp>().addPass(
+            mlir::tosa::createTosaInferShapesPass());
+        pm.nest<func::FuncOp>().addPass(
+            mlir::tosa::createTosaMakeBroadcastablePass());
+        pm.nest<func::FuncOp>().addPass(mlir::tosa::createTosaToLinalgNamed());
+        pm.nest<func::FuncOp>().addPass(mlir::tosa::createTosaToLinalg());
+        pm.nest<func::FuncOp>().addPass(mlir::tosa::createTosaToArith());
+        pm.nest<func::FuncOp>().addPass(mlir::tosa::createTosaToTensor());
+
+        // Step 4: Linalg Generalization (function-level).
+        pm.nest<func::FuncOp>().addPass(createLinalgGeneralizeNamedOpsPass());
+
+        // Step 5: Canonicalization (module-level).
+        pm.addPass(createCanonicalizerPass());
+
+        // Step 6: Bufferization with proper options (module-level).
+        OneShotBufferizationOptions bufferizationOptions;
+        bufferizationOptions.bufferizeFunctionBoundaries = true;
+        bufferizationOptions.setFunctionBoundaryTypeConversion(
+            LayoutMapOption::IdentityLayoutMap);
+        pm.addPass(createOneShotBufferizePass(bufferizationOptions));
+
+        // Step 7: Linalg to Affine Loops (function-level).
+        pm.nest<func::FuncOp>().addPass(createConvertLinalgToAffineLoopsPass());
+
+        // Step 8: Cleanup subview and copy operations (function-level).
+        pm.nest<func::FuncOp>().addPass(createFoldSubViewPass());
+        pm.nest<func::FuncOp>().addPass(createConvertCopyToAffineLoopsPass());
+
+        // Step 9: Final Affine cleanup (function-level).
+        pm.nest<func::FuncOp>().addPass(
+            mlir::affine::createAffineLoopNormalizePass());
+        pm.nest<func::FuncOp>().addPass(
+            mlir::affine::createSimplifyAffineStructuresPass());
+        pm.addPass(createCanonicalizerPass());
+      });
+}
\ No newline at end of file
diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir
index a5790c85..f70d99ca 100644
--- a/test/multi-cgra/kernel_mapping/fir/fir.mlir
+++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir
@@ -7,10 +7,7 @@
 // RUN:   -o %t.hyperblock.mlir
 // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK
 
-// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
-// RUN:   --construct-hyperblock-from-task \
-// RUN:   --classify-counters \
-// RUN:   --convert-taskflow-to-neura \
+// RUN: neura-compiler %s --taskflow-conversion \
 // RUN:   -o %t.kernel.mlir
 // RUN: FileCheck %s --input-file=%t.kernel.mlir --check-prefixes=KERNEL
 
diff --git a/test/multi-cgra/kernel_mapping/relu/relu.mlir b/test/multi-cgra/kernel_mapping/relu/relu.mlir
index 1671e85e..309c8512 100644
--- a/test/multi-cgra/kernel_mapping/relu/relu.mlir
+++ b/test/multi-cgra/kernel_mapping/relu/relu.mlir
@@ -7,10 +7,7 @@
 // RUN:   -o %t.hyperblock.mlir
 // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK
 
-// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
-// RUN:   --construct-hyperblock-from-task \
-// RUN:   --classify-counters \
-// RUN:   --convert-taskflow-to-neura \
+// RUN: neura-compiler %s --taskflow-conversion \
 // RUN:   -o %t.kernel.mlir
 // RUN: FileCheck %s --input-file=%t.kernel.mlir --check-prefixes=KERNEL
 
diff --git a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir
new file mode 100644
index 00000000..abc993f5
--- /dev/null
+++ 
b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir @@ -0,0 +1,488 @@ +// RUN: neura-compiler %s --tosa-to-affine-conversion \ +// RUN: -o %t.affine.mlir +// RUN: FileCheck %s --input-file=%t.affine.mlir --check-prefixes=AFFINE + +// RUN: neura-compiler %s --tosa-to-affine-conversion \ +// RUN: --taskflow-conversion \ +// RUN: -o %t.kernel.mlir +// RUN: FileCheck %s --input-file=%t.kernel.mlir --check-prefixes=KERNEL + +module attributes {torch.debug_module_name = "SimpleResNetBlock"} { + func.func @forward(%arg0: tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> { + %0 = "tosa.const"() <{value = dense<"0x7BEEA13C"> : tensor<64x64x3x3xf32>}> : () -> tensor<64x64x3x3xf32> + %1 = "tosa.const"() <{value = dense<"0x8B9878BC"> : tensor<64x64x3x3xf32>}> : () -> tensor<64x64x3x3xf32> + %2 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<64xf32>}> : () -> tensor<64xf32> + %3 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> + %4 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> + %5 = tosa.transpose %arg0, %3 : (tensor<1x64x8x8xf32>, tensor<4xi32>) -> tensor<1x8x8x64xf32> + %6 = tosa.transpose %1, %3 : (tensor<64x64x3x3xf32>, tensor<4xi32>) -> tensor<64x3x3x64xf32> + %7 = tosa.conv2d %5, %6, %2 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<1x8x8x64xf32>, tensor<64x3x3x64xf32>, tensor<64xf32>) -> tensor<1x8x8x64xf32> + %8 = tosa.transpose %7, %4 : (tensor<1x8x8x64xf32>, tensor<4xi32>) -> tensor<1x64x8x8xf32> + %9 = tosa.clamp %8 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> + %10 = tosa.transpose %9, %3 : (tensor<1x64x8x8xf32>, tensor<4xi32>) -> tensor<1x8x8x64xf32> + %11 = tosa.transpose %0, %3 : (tensor<64x64x3x3xf32>, tensor<4xi32>) -> tensor<64x3x3x64xf32> + %12 = tosa.conv2d %10, %11, %2 {acc_type = f32, dilation = array, pad = array, stride = array} 
: (tensor<1x8x8x64xf32>, tensor<64x3x3x64xf32>, tensor<64xf32>) -> tensor<1x8x8x64xf32> + %13 = tosa.transpose %12, %4 : (tensor<1x8x8x64xf32>, tensor<4xi32>) -> tensor<1x64x8x8xf32> + %14 = tosa.add %13, %arg0 : (tensor<1x64x8x8xf32>, tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> + %15 = tosa.clamp %14 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x64x8x8xf32>) -> tensor<1x64x8x8xf32> + return %15 : tensor<1x64x8x8xf32> + } +} + +// AFFINE: module attributes {torch.debug_module_name = "SimpleResNetBlock"} { +// AFFINE-NEXT: memref.global "private" constant @__constant_64xf32 : memref<64xf32> = dense<0.000000e+00> {alignment = 64 : i64} +// AFFINE-NEXT: memref.global "private" constant @__constant_64x3x3x64xf32_0 : memref<64x3x3x64xf32> = dense<-0.0151730878> {alignment = 64 : i64} +// AFFINE-NEXT: memref.global "private" constant @__constant_64x3x3x64xf32 : memref<64x3x3x64xf32> = dense<0.0197670367> {alignment = 64 : i64} +// AFFINE-NEXT: func.func @forward(%arg0: memref<1x64x8x8xf32>) -> memref<1x64x8x8xf32> { +// AFFINE-NEXT: %cst = arith.constant 0.0197670367 : f32 +// AFFINE-NEXT: %cst_0 = arith.constant -0.0151730878 : f32 +// AFFINE-NEXT: %cst_1 = arith.constant 3.40282347E+38 : f32 +// AFFINE-NEXT: %cst_2 = arith.constant 0.000000e+00 : f32 +// AFFINE-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: %0 = affine.load %arg0[%arg1, %arg4, %arg2, %arg3] : memref<1x64x8x8xf32> +// AFFINE-NEXT: affine.store %0, %alloc[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 
to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 10 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 10 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: affine.store %cst_2, %alloc_3[%arg1, %arg2, %arg3, %arg4] : memref<1x10x10x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_4 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: affine.store %cst_2, %alloc_4[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: affine.for %arg5 = 0 to 3 { +// AFFINE-NEXT: affine.for %arg6 = 0 to 3 { +// AFFINE-NEXT: affine.for %arg7 = 0 to 64 { +// AFFINE-NEXT: %0 = affine.load %alloc_3[%arg1, %arg2 + %arg5, %arg3 + %arg6, %arg7] : memref<1x10x10x64xf32> +// AFFINE-NEXT: %1 = affine.load %alloc_4[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: %2 = arith.mulf %0, %cst_0 : f32 +// AFFINE-NEXT: %3 = arith.addf %1, %2 : f32 +// AFFINE-NEXT: affine.store %3, %alloc_4[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_5 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 64 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 8 { +// AFFINE-NEXT: %0 = affine.load %alloc_4[%arg1, %arg3, %arg4, %arg2] : memref<1x8x8x64xf32> +// AFFINE-NEXT: 
affine.store %0, %alloc_5[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_6 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 64 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 8 { +// AFFINE-NEXT: %0 = affine.load %alloc_5[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: %1 = arith.minimumf %0, %cst_1 : f32 +// AFFINE-NEXT: %2 = arith.maximumf %1, %cst_2 : f32 +// AFFINE-NEXT: affine.store %2, %alloc_6[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_7 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: %0 = affine.load %alloc_6[%arg1, %arg4, %arg2, %arg3] : memref<1x64x8x8xf32> +// AFFINE-NEXT: affine.store %0, %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 10 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 10 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: affine.store %cst_2, %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<1x10x10x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_9 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 
8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: affine.store %cst_2, %alloc_9[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 64 { +// AFFINE-NEXT: affine.for %arg5 = 0 to 3 { +// AFFINE-NEXT: affine.for %arg6 = 0 to 3 { +// AFFINE-NEXT: affine.for %arg7 = 0 to 64 { +// AFFINE-NEXT: %0 = affine.load %alloc_8[%arg1, %arg2 + %arg5, %arg3 + %arg6, %arg7] : memref<1x10x10x64xf32> +// AFFINE-NEXT: %1 = affine.load %alloc_9[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: %2 = arith.mulf %0, %cst : f32 +// AFFINE-NEXT: %3 = arith.addf %1, %2 : f32 +// AFFINE-NEXT: affine.store %3, %alloc_9[%arg1, %arg2, %arg3, %arg4] : memref<1x8x8x64xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_10 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 64 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 8 { +// AFFINE-NEXT: %0 = affine.load %alloc_9[%arg1, %arg3, %arg4, %arg2] : memref<1x8x8x64xf32> +// AFFINE-NEXT: affine.store %0, %alloc_10[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_11 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 64 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 8 { +// AFFINE-NEXT: %0 = affine.load %alloc_10[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: %1 = 
affine.load %arg0[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: %2 = arith.addf %0, %1 : f32 +// AFFINE-NEXT: affine.store %2, %alloc_11[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// AFFINE-NEXT: affine.for %arg1 = 0 to 1 { +// AFFINE-NEXT: affine.for %arg2 = 0 to 64 { +// AFFINE-NEXT: affine.for %arg3 = 0 to 8 { +// AFFINE-NEXT: affine.for %arg4 = 0 to 8 { +// AFFINE-NEXT: %0 = affine.load %alloc_11[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: %1 = arith.minimumf %0, %cst_1 : f32 +// AFFINE-NEXT: %2 = arith.maximumf %1, %cst_2 : f32 +// AFFINE-NEXT: affine.store %2, %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<1x64x8x8xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: } +// AFFINE-NEXT: return %alloc_12 : memref<1x64x8x8xf32> +// AFFINE-NEXT: } +// AFFINE-NEXT: } + + +// KERNEL: module attributes {torch.debug_module_name = "SimpleResNetBlock"} { +// KERNEL-NEXT: memref.global "private" constant @__constant_64xf32 : memref<64xf32> = dense<0.000000e+00> {alignment = 64 : i64} +// KERNEL-NEXT: memref.global "private" constant @__constant_64x3x3x64xf32_0 : memref<64x3x3x64xf32> = dense<-0.0151730878> {alignment = 64 : i64} +// KERNEL-NEXT: memref.global "private" constant @__constant_64x3x3x64xf32 : memref<64x3x3x64xf32> = dense<0.0197670367> {alignment = 64 : i64} +// KERNEL-NEXT: func.func @forward(%arg0: memref<1x64x8x8xf32>) -> memref<1x64x8x8xf32> { +// KERNEL-NEXT: %cst = arith.constant 0.0197670367 : f32 +// KERNEL-NEXT: %cst_0 = arith.constant -0.0151730878 : f32 +// KERNEL-NEXT: %cst_1 = arith.constant 3.40282347E+38 : f32 +// KERNEL-NEXT: %cst_2 = arith.constant 0.000000e+00 : f32 +// KERNEL-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// KERNEL-NEXT: %write_outputs = 
taskflow.task @Task_0 read_memrefs(%arg0 : memref<1x64x8x8xf32>) write_memrefs(%alloc : memref<1x8x8x64xf32>) [original_read_memrefs(%arg0 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg2 : memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: memref<1x64x8x8xf32>, %arg4: memref<1x8x8x64xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %8 = memref.load %arg3[%4, %7, %5, %6] : 
memref<1x64x8x8xf32> +// KERNEL-NEXT: memref.store %8, %arg4[%4, %5, %6, %7] : memref<1x8x8x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x8x8x64xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> +// KERNEL-NEXT: %write_outputs_4 = taskflow.task @Task_1 write_memrefs(%alloc_3 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_3 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg2, %arg1 : f32, memref<1x10x10x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: f32, %arg4: memref<1x10x10x64xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, 
upper_bound = 10 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: memref.store %arg3, %arg4[%4, %5, %6, %7] : memref<1x10x10x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x10x10x64xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_5 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// KERNEL-NEXT: %write_outputs_6 = taskflow.task @Task_2 write_memrefs(%alloc_5 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg2, %arg1 : f32, memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: f32, %arg4: memref<1x8x8x64xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, 
upper_bound = 8 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: memref.store %arg3, %arg4[%4, %5, %6, %7] : memref<1x8x8x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %write_outputs_7 = taskflow.task @Task_3 read_memrefs(%write_outputs_4, %write_outputs_6 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_6 : memref<1x8x8x64xf32>) value_inputs(%cst_0 : f32) [original_read_memrefs(%alloc_3, %alloc_5 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %4 = taskflow.counter parent(%3 : index) attributes 
{counter_id = 4 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %5 = taskflow.counter parent(%4 : index) attributes {counter_id = 5 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %6 = taskflow.counter parent(%5 : index) attributes {counter_id = 6 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg3, %arg4 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, f32) { +// KERNEL-NEXT: ^bb0(%arg5: memref<1x10x10x64xf32>, %arg6: memref<1x8x8x64xf32>, %arg7: f32): +// KERNEL-NEXT: %7 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %8 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %9 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %10 = neura.counter {counter_id = 3 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %11 = neura.counter {counter_id = 4 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %12 = neura.counter {counter_id = 5 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %13 = neura.counter {counter_id = 6 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %14 = arith.addi %8, %11 : index +// KERNEL-NEXT: %15 = arith.addi %9, %12 : index +// KERNEL-NEXT: %16 = memref.load %arg5[%7, %14, %15, %13] : 
memref<1x10x10x64xf32> +// KERNEL-NEXT: %17 = memref.load %arg6[%7, %8, %9, %10] : memref<1x8x8x64xf32> +// KERNEL-NEXT: %18 = arith.mulf %16, %arg7 : f32 +// KERNEL-NEXT: %19 = arith.addf %17, %18 : f32 +// KERNEL-NEXT: memref.store %19, %arg6[%7, %8, %9, %10] : memref<1x8x8x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg3 : memref<1x8x8x64xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %write_outputs_9 = taskflow.task @Task_4 read_memrefs(%write_outputs_7 : memref<1x8x8x64xf32>) write_memrefs(%alloc_8 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_5 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_8 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg2 : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: memref<1x8x8x64xf32>, %arg4: memref<1x64x8x8xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, 
upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %8 = memref.load %arg3[%4, %6, %7, %5] : memref<1x8x8x64xf32> +// KERNEL-NEXT: memref.store %8, %arg4[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_10 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %write_outputs_11 = taskflow.task @Task_5 read_memrefs(%write_outputs_9 : memref<1x64x8x8xf32>) write_memrefs(%alloc_10 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_8 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_10 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: f32, %arg4: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter 
parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg3, %arg4, %arg2 : memref<1x64x8x8xf32>, f32, f32, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg5: memref<1x64x8x8xf32>, %arg6: f32, %arg7: f32, %arg8: memref<1x64x8x8xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %8 = memref.load %arg5[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: %9 = arith.minimumf %8, %arg6 : f32 +// KERNEL-NEXT: %10 = arith.maximumf %9, %arg7 : f32 +// KERNEL-NEXT: memref.store %10, %arg8[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// KERNEL-NEXT: %write_outputs_13 = taskflow.task @Task_6 read_memrefs(%write_outputs_11 : memref<1x64x8x8xf32>) write_memrefs(%alloc_12 : memref<1x8x8x64xf32>) [original_read_memrefs(%alloc_10 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_12 : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): +// KERNEL-NEXT: %0 = taskflow.counter attributes 
{counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg2 : memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: memref<1x64x8x8xf32>, %arg4: memref<1x8x8x64xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %8 = memref.load %arg3[%4, %7, %5, %6] : memref<1x64x8x8xf32> +// KERNEL-NEXT: memref.store %8, %arg4[%4, %5, %6, %7] : memref<1x8x8x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x8x8x64xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_14 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> +// KERNEL-NEXT: %write_outputs_15 = taskflow.task @Task_7 write_memrefs(%alloc_14 : 
memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_14 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg2, %arg1 : f32, memref<1x10x10x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: f32, %arg4: memref<1x10x10x64xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: memref.store %arg3, %arg4[%4, %5, %6, %7] : memref<1x10x10x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x10x10x64xf32>) +// KERNEL-NEXT: } 
+// KERNEL-NEXT: %alloc_16 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// KERNEL-NEXT: %write_outputs_17 = taskflow.task @Task_8 write_memrefs(%alloc_16 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_16 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg2, %arg1 : f32, memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: f32, %arg4: memref<1x8x8x64xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: memref.store %arg3, %arg4[%4, %5, %6, %7] : 
memref<1x8x8x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %write_outputs_18 = taskflow.task @Task_9 read_memrefs(%write_outputs_15, %write_outputs_17 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_17 : memref<1x8x8x64xf32>) value_inputs(%cst : f32) [original_read_memrefs(%alloc_14, %alloc_16 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_16 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %4 = taskflow.counter parent(%3 : index) attributes {counter_id = 4 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %5 = taskflow.counter parent(%4 : index) attributes {counter_id = 5 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %6 = taskflow.counter parent(%5 : index) attributes {counter_id = 6 : i32, 
counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg3, %arg4 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, f32) { +// KERNEL-NEXT: ^bb0(%arg5: memref<1x10x10x64xf32>, %arg6: memref<1x8x8x64xf32>, %arg7: f32): +// KERNEL-NEXT: %7 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %8 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %9 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %10 = neura.counter {counter_id = 3 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %11 = neura.counter {counter_id = 4 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %12 = neura.counter {counter_id = 5 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 3 : index} : index +// KERNEL-NEXT: %13 = neura.counter {counter_id = 6 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %14 = arith.addi %8, %11 : index +// KERNEL-NEXT: %15 = arith.addi %9, %12 : index +// KERNEL-NEXT: %16 = memref.load %arg5[%7, %14, %15, %13] : memref<1x10x10x64xf32> +// KERNEL-NEXT: %17 = memref.load %arg6[%7, %8, %9, %10] : memref<1x8x8x64xf32> +// KERNEL-NEXT: %18 = arith.mulf %16, %arg7 : f32 +// KERNEL-NEXT: %19 = arith.addf %17, %18 : f32 +// KERNEL-NEXT: memref.store %19, %arg6[%7, %8, %9, %10] : memref<1x8x8x64xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg3 : memref<1x8x8x64xf32>) +// 
KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_19 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %write_outputs_20 = taskflow.task @Task_10 read_memrefs(%write_outputs_18 : memref<1x8x8x64xf32>) write_memrefs(%alloc_19 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_16 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_19 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg2 : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg3: memref<1x8x8x64xf32>, %arg4: memref<1x64x8x8xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : 
i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %8 = memref.load %arg3[%4, %6, %7, %5] : memref<1x8x8x64xf32> +// KERNEL-NEXT: memref.store %8, %arg4[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_21 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %write_outputs_22 = taskflow.task @Task_11 read_memrefs(%write_outputs_20, %arg0 : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) write_memrefs(%alloc_21 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_19, %arg0 : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>), original_write_memrefs(%alloc_21 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: memref<1x64x8x8xf32>): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg2, %arg3 : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg4: memref<1x64x8x8xf32>, %arg5: memref<1x64x8x8xf32>, %arg6: 
memref<1x64x8x8xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %8 = memref.load %arg4[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: %9 = memref.load %arg5[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: %10 = arith.addf %8, %9 : f32 +// KERNEL-NEXT: memref.store %10, %arg6[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg3 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: %alloc_23 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %write_outputs_24 = taskflow.task @Task_12 read_memrefs(%write_outputs_22 : memref<1x64x8x8xf32>) write_memrefs(%alloc_23 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_21 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_23 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: f32, %arg4: f32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 
: index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg1, %arg3, %arg4, %arg2 : memref<1x64x8x8xf32>, f32, f32, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: ^bb0(%arg5: memref<1x64x8x8xf32>, %arg6: f32, %arg7: f32, %arg8: memref<1x64x8x8xf32>): +// KERNEL-NEXT: %4 = neura.counter {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index +// KERNEL-NEXT: %5 = neura.counter {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index +// KERNEL-NEXT: %6 = neura.counter {counter_id = 2 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %7 = neura.counter {counter_id = 3 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// KERNEL-NEXT: %8 = memref.load %arg5[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: %9 = arith.minimumf %8, %arg6 : f32 +// KERNEL-NEXT: %10 = arith.maximumf %9, %arg7 : f32 +// KERNEL-NEXT: memref.store %10, %arg8[%4, %5, %6, %7] : memref<1x64x8x8xf32> +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: } +// KERNEL-NEXT: return %write_outputs_24 : memref<1x64x8x8xf32> +// KERNEL-NEXT: } +// KERNEL-NEXT: } + diff --git a/tools/neura-compiler/CMakeLists.txt b/tools/neura-compiler/CMakeLists.txt index 69e78747..fc4e13da 100644 --- a/tools/neura-compiler/CMakeLists.txt +++ b/tools/neura-compiler/CMakeLists.txt 
@@ -5,6 +5,7 @@ set(LIBS ${dialect_libs} ${conversion_libs} MLIRNeuraTransforms + MLIRTaskflowTransforms MLIRConversion MLIRNeura MLIRTransforms @@ -13,6 +14,8 @@ set(LIBS MLIRIR MLIRParser MLIRSupport + MLIRFuncAllExtensions + MLIRTensorAllExtensions ) target_link_libraries(neura-compiler PRIVATE ${LIBS}) \ No newline at end of file diff --git a/tools/neura-compiler/neura-compiler.cpp b/tools/neura-compiler/neura-compiler.cpp index e728b8b0..5e65f061 100644 --- a/tools/neura-compiler/neura-compiler.cpp +++ b/tools/neura-compiler/neura-compiler.cpp @@ -1,12 +1,7 @@ // neura-compiler.cpp - -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/DLTI/DLTI.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/InitAllDialects.h" +#include "mlir/InitAllExtensions.h" #include "mlir/InitAllPasses.h" -#include "mlir/Support/FileUtilities.h" -#include "mlir/Support/LogicalResult.h" #include "mlir/Tools/mlir-opt/MlirOptMain.h" #include "Conversion/ConversionPasses.h" @@ -14,6 +9,7 @@ #include "NeuraDialect/NeuraDialect.h" #include "NeuraDialect/NeuraPasses.h" #include "NeuraDialect/Util/ArchParser.h" +#include "TaskflowDialect/TaskflowPasses.h" #include "mlir/Support/LogicalResult.h" using mlir::neura::Architecture; @@ -65,14 +61,14 @@ int main(int argc, char **argv) { // Registers MLIR dialects. mlir::DialectRegistry registry; registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); + registry.insert(); + + mlir::registerAllDialects(registry); + mlir::registerAllExtensions(registry); mlir::neura::registerNeuraConversionPassPipeline(); + mlir::taskflow::registerTosaToAffineConversionPassPipeline(); + mlir::taskflow::registerTaskflowConversionPassPipeline(); // Print architecture spec file info if (!architecture_spec_file.empty()) {