diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 98b116d3..c23378af 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -101,5 +101,5 @@ jobs: working-directory: ${{github.workspace}} run: | cd ${{github.workspace}}/test - ${{github.workspace}}/llvm-project/build/bin/llvm-lit * -v + ${{github.workspace}}/llvm-project/build/bin/llvm-lit . -v diff --git a/include/Conversion/ConversionPasses.h b/include/Conversion/ConversionPasses.h index 2477bb3d..2d871868 100644 --- a/include/Conversion/ConversionPasses.h +++ b/include/Conversion/ConversionPasses.h @@ -19,6 +19,7 @@ namespace mlir { // Conversion passes. std::unique_ptr createLowerArithToNeuraPass(); std::unique_ptr createLowerLlvmToNeuraPass(); +std::unique_ptr createLowerMemRefToNeuraPass(); #define GEN_PASS_REGISTRATION #include "Conversion/ConversionPasses.h.inc" diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td index 7fca77bb..8f2db985 100644 --- a/include/Conversion/ConversionPasses.td +++ b/include/Conversion/ConversionPasses.td @@ -20,4 +20,10 @@ def LowerLlvmToNeura : Pass<"lower-llvm-to-neura", "ModuleOp">{ let constructor = "mlir::createLowerLlvmToNeuraPass()"; } +def LowerMemRefToNeura : Pass<"lower-memref-to-neura", "ModuleOp">{ + let summary = "Lower MemRef to Neura dialect"; + let description = [{Lower MemRef operations to Neura dialect operations.}]; + let constructor = "mlir::createLowerMemRefToNeuraPass()"; +} + #endif // CONVERSION_PASSES_TD \ No newline at end of file diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td index 223eee9f..4021bbe2 100644 --- a/include/NeuraDialect/NeuraOps.td +++ b/include/NeuraDialect/NeuraOps.td @@ -24,6 +24,14 @@ def Neura_AddOp : Op { let traits = [SameOperandsAndResultElementType]; } +def Neura_SubOp : Op { + let summary = "Integer subtraction operation"; + let arguments = (ins AnyType:$lhs, AnyType:$rhs, Optional:$predicate); + let results 
= (outs AnyType:$result); + // let assemblyFormat = "$lhs `,` $rhs `,` $predicate attr-dict `:` type($result)"; + let traits = [SameOperandsAndResultElementType]; +} + // Defines a floating-point addition operation. def Neura_FAddOp : Op { let summary = "Floating addition operation"; @@ -38,7 +46,7 @@ def Neura_FAddOp : Op { def Neura_FSubOp: Op { let summary = "Floating substraction operation"; let opName = "fsub"; - let arguments = (ins AnyFloat:$lhs, AnyFloat:$rhs); + let arguments = (ins AnyFloat:$lhs, AnyFloat:$rhs, Optional:$predicate); let results = (outs AnyFloat:$result); // let assemblyFormat = "$lhs `,` $rhs attr-dict `:` type($result)"; let traits = [SameOperandsAndResultElementType]; @@ -54,6 +62,13 @@ def Neura_FMulOp : Op { // let traits = [SameOperandsAndResultElementType]; } +def Neura_FDivOp : Op { + let summary = "Floating division operation"; + let arguments = (ins AnyFloat:$lhs, AnyFloat:$rhs, Optional:$predicate); + let results = (outs AnyFloat:$result); + // let assemblyFormat = "$lhs `,` $rhs `,` $predicate attr-dict `:` type($result)"; +} + // Defines a bitwise OR operation. def Neura_OrOp : Op { let summary = "Bitwise OR operation"; @@ -144,6 +159,14 @@ def Neura_ReturnOp : Op { // let assemblyFormat = "($values^)? `,` $predicate attr-dict"; } +// Defines a cast operation for type conversion. +def Neura_CastOp : Op{ + let summary = "Generic type conversion operation"; + let arguments = (ins AnyType:$input, StrAttr:$cast_type, Optional:$predicate); + let results = (outs AnyType:$result); + // let assemblyFormat = "$input type($input) `->` type($output) `,` $predicate attr-dict"; +} + // ---------------------------------------------------- // Defines vector operations. 
diff --git a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp index c5af8d47..72c83c6b 100644 --- a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp +++ b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp @@ -1,12 +1,13 @@ +#include "Conversion/ConversionPasses.h" #include "NeuraDialect/NeuraDialect.h" #include "NeuraDialect/NeuraOps.h" -#include "mlir/Dialect/Arith/IR/Arith.h" #include "NeuraDialect/NeuraPasses.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "Conversion/ConversionPasses.h" namespace mlir { namespace neura { @@ -26,7 +27,39 @@ using namespace mlir::neura; #define GEN_PASS_DEF_LOWERARITHTONEURA #include "NeuraDialect/NeuraPasses.h.inc" -namespace{ +namespace { + +struct ArithConstantToNeuraConstant + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::ConstantOp op, + PatternRewriter &rewriter) const override { + // Converts arith constant to Neura constant + Type result_type = op.getType(); + Attribute value = op.getValue(); + // Optional predicate parameter can be null + rewriter.replaceOpWithNewOp(op, result_type, value, + nullptr); + return success(); + } +}; + +struct ArithAddIToNeuraAdd : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::AddIOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + + // Optional predicate: default to null + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, + nullptr); + return success(); + } +}; struct ArithFAddToNeuraFAdd : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -35,16 +68,199 @@ struct ArithFAddToNeuraFAdd : public 
OpRewritePattern { PatternRewriter &rewriter) const override { Value lhs = op.getLhs(); Value rhs = op.getRhs(); - Type resultType = op.getType(); + Type result_type = op.getType(); + + // Optional predicate: default to null + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, + nullptr); + return success(); + } +}; + +struct ArithSubIToNeuraSub : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::SubIOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + + // Optional predicate: default to null + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, + nullptr); + return success(); + } +}; + +struct ArithSubFToNeuraFSub : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::SubFOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + + // Optional predicate: default to null + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, + nullptr); + return success(); + } +}; + +struct ArithMulFToNeuraFMul : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::MulFOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + + // Optional predicate: default to null + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, + nullptr); + return success(); + } +}; + +struct ArithFDivToNeuraFDiv : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::DivFOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + + // Optional predicate: default to null + rewriter.replaceOpWithNewOp(op, result_type, lhs, 
rhs, + nullptr); + return success(); + } +}; +struct ArithCmpiToNeuraICmp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::CmpIOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + arith::CmpIPredicate arith_cmp_type = op.getPredicate(); + StringRef cmp_type; + switch (arith_cmp_type) { + case arith::CmpIPredicate::eq: + cmp_type = "eq"; // == + break; + case arith::CmpIPredicate::ne: + cmp_type = "ne"; // != + break; + case arith::CmpIPredicate::slt: + cmp_type = "slt"; // < + break; + case arith::CmpIPredicate::sle: + cmp_type = "sle"; // <= + break; + case arith::CmpIPredicate::sgt: + cmp_type = "sgt"; // > + break; + case arith::CmpIPredicate::sge: + cmp_type = "sge"; // >= + break; + case arith::CmpIPredicate::ult: + cmp_type = "ult"; // unsigned < + break; + case arith::CmpIPredicate::ule: + cmp_type = "ule"; // unsigned <= + break; + case arith::CmpIPredicate::ugt: + cmp_type = "ugt"; // unsigned > + break; + case arith::CmpIPredicate::uge: + cmp_type = "uge"; // unsigned >= + break; + default: + return rewriter.notifyMatchFailure(op, "Unsupported arith CmpIOp type"); + } + + // Convert arith CmpIOp to Neura ICmpOp + // Optional predicate: default to null + rewriter.replaceOpWithNewOp( + op, result_type, lhs, rhs, nullptr, rewriter.getStringAttr(cmp_type)); + return success(); + } +}; + +struct ArithSelectToNeuraSel : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::SelectOp op, + PatternRewriter &rewriter) const override { + Value condition = op.getCondition(); + Value true_value = op.getTrueValue(); + Value false_value = op.getFalseValue(); + Type result_type = op.getType(); + + // Convert arith SelectOp to Neura SelOp + rewriter.replaceOpWithNewOp(op, result_type, true_value, + false_value, condition); + return success(); + } +}; + +struct 
ArithExtUIToNeuraCast : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::ExtUIOp op, + PatternRewriter &rewriter) const override { + Value input = op.getIn(); + Type result_type = op.getType(); + + // Convert arith ExtUIOp to Neura cast operation + // Optional predicate: default to null + rewriter.replaceOpWithNewOp( + op, result_type, input, rewriter.getStringAttr("extui"), nullptr); + return success(); + } +}; + +struct ArithExtfToNeuraCast : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::ExtFOp op, + PatternRewriter &rewriter) const override { + Value input = op.getIn(); + Type result_type = op.getType(); + + // Convert arith ExtFOp to Neura cast operation + // Optional predicate: default to null + rewriter.replaceOpWithNewOp( + op, result_type, input, rewriter.getStringAttr("extf"), nullptr); + return success(); + } +}; + +struct ArithIndexCastToNeuraCast + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::IndexCastOp op, + PatternRewriter &rewriter) const override { + Value input = op.getIn(); + Type result_type = op.getType(); - // Optional predicate: default to 'none' - rewriter.replaceOpWithNewOp(op, resultType, lhs, rhs, Value()); + // Convert arith IndexCastOp to Neura cast operation + // Optional predicate: default to null + rewriter.replaceOpWithNewOp( + op, result_type, input, rewriter.getStringAttr("indexCast"), nullptr); return success(); } }; struct LowerArithToNeuraPass - : public PassWrapper> { + : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerArithToNeuraPass) @@ -60,7 +276,11 @@ struct LowerArithToNeuraPass void runOnOperation() override { RewritePatternSet patterns(&getContext()); mlir::neura::arith2neura::populateWithGenerated(patterns); - patterns.add(&getContext()); + patterns + .add(&getContext()); if 
(failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { signalPassFailure(); } diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt index 1dbce29f..af5bb68a 100644 --- a/lib/Conversion/CMakeLists.txt +++ b/lib/Conversion/CMakeLists.txt @@ -2,6 +2,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_subdirectory(ArithToNeura) add_subdirectory(LlvmToNeura) +add_subdirectory(MemRefToNeura) # add_mlir_library( # MLIRNeuraConversion diff --git a/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp b/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp index 71ed33b5..6bc815b3 100644 --- a/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp +++ b/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp @@ -9,6 +9,7 @@ #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "Conversion/ConversionPasses.h" +#include "llvm/Support/raw_ostream.h" namespace mlir { namespace neura { @@ -62,6 +63,26 @@ struct LlvmFAddToNeuraFAdd : public OpRewritePattern { } }; +struct LlvmFSubToNeuraFSub : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(mlir::LLVM::FSubOp op, + PatternRewriter &rewriter) const override { + Value lhs = op->getOperand(0); + Value rhs = op.getOperand(1); + Type result_type = op->getResult(0).getType(); + + // Only matches scalar float. 
+ if (!mlir::isa(result_type)){ + return failure(); + } + + // Optional predicate: default to 'none' + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, Value()); + return success(); + } +}; + struct LlvmOrToNeuraOr : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -316,6 +337,7 @@ struct LowerLlvmToNeuraPass patterns.add(&getContext()); patterns.add(&getContext()); patterns.add(&getContext()); + patterns.add(&getContext()); FrozenRewritePatternSet frozen(std::move(patterns)); diff --git a/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td b/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td index e01ff728..3aef67d8 100644 --- a/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td +++ b/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td @@ -4,8 +4,9 @@ include "mlir/Dialect/LLVMIR/LLVMOps.td" include "NeuraDialect/NeuraOps.td" // Floating point binary operations. -def : Pat< - (LLVM_FSubOp $lhs, $rhs, $_fastmath), - (Neura_FSubOp $lhs, $rhs) ->; +// Deprecated Pattern: Because we need the predicate bit to be set to null initially +// def : Pat< +// (LLVM_FSubOp $lhs, $rhs, $_fastmath), +// (Neura_FSubOp $lhs, $rhs) +// >; diff --git a/lib/Conversion/MemRefToNeura/CMakeLists.txt b/lib/Conversion/MemRefToNeura/CMakeLists.txt new file mode 100644 index 00000000..335d2c39 --- /dev/null +++ b/lib/Conversion/MemRefToNeura/CMakeLists.txt @@ -0,0 +1,18 @@ +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +add_mlir_conversion_library(MLIRNeuraMemRefToNeuraPass + MemRefToNeuraPass.cpp + + DEPENDS + MLIRConversionIncGen + + LINK_LIBS PUBLIC + MLIRArithDialect + MLIRFuncDialect + MLIRLLVMDialect + MLIRIR + MLIRPass + MLIRTransforms + MLIRNeura + MLIRSupport +) diff --git a/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp b/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp new file mode 100644 index 00000000..3d3b543c --- /dev/null +++ b/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp @@ -0,0 +1,44 @@ +#include "Common/AcceleratorAttrs.h" +#include 
"NeuraDialect/NeuraDialect.h" +#include "NeuraDialect/NeuraOps.h" +#include "mlir/Dialect/LLVMIR/LLVMAttrs.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "Conversion/ConversionPasses.h" + +using namespace mlir; +using namespace mlir::neura; + +#define GEN_PASS_DEF_LOWERLLVMTONEURA +#include "NeuraDialect/NeuraPasses.h.inc" + + +namespace { + +struct LowerMemRefToNeuraPass + : public PassWrapper> { + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerMemRefToNeuraPass) + + StringRef getArgument() const override { return "lower-memref-to-neura"; } + StringRef getDescription() const override { + return "Lower MemRef operations to Neura dialect operations"; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + } +}; +} // namespace + +std::unique_ptr mlir::createLowerMemRefToNeuraPass() { + return std::make_unique(); +} diff --git a/test/README.md b/test/README.md new file mode 100644 index 00000000..e9599fb0 --- /dev/null +++ b/test/README.md @@ -0,0 +1,57 @@ +# Tests for Neura + +The structure of the files in this folder is as follows: +``` +. 
+├── affine2neura │ └── bert +├── arith2neura │ ├── add.mlir │ └── Output +├── c2llvm2mlir │ ├── kernel.cpp │ ├── Output │ └── test.mlir +├── lit.cfg +├── lit.cfg.in +├── neura │ ├── arith_add.mlir │ ├── ctrl │ ├── fadd_fadd.mlir │ ├── for_loop │ ├── interpreter │ ├── llvm_add.mlir │ ├── llvm_sub.mlir │ └── Output +├── Output │ └── test.mlir.script +├── README.md +├── samples │ ├── bert │ └── lenet └── test.mlir +``` + +All of the above content can be divided into three categories: + +## 1 Conversion Test +We need to convert other dialects to our `neura` dialect for compilation optimization. In order to verify the correctness of conversions from other dialects to `neura` dialect, we need to provide the appropriate test for a conversion pass from a dialect to `neura` dialect. + +For now, we have: +`affine2neura`: tests provided for `--lower-affine-to-neura` [To be provided] +`arith2neura`: tests provided for `--lower-arith-to-neura` +`c2llvm2mlir`: tests provided for `--lower-llvm-to-neura` + +## 2 Neura Compiler Test +Tests for individual passes/pass pipelines at the `neura` dialect level. + +## 3 Samples +A collection of real-world applications for generating small unit tests. + +For now, [BERT](https://github.com/codertimo/BERT-pytorch) and [LENET](https://github.com/kuangliu/pytorch-cifar/blob/master/models/lenet.py) are included. + +We generate the `linalg` dialect of these models via [Torch MLIR](https://github.com/llvm/torch-mlir), which is then lowered to `affine` dialect for further lowering. + +Due to the data dependencies between loops in models, we are now unable to automatically extract each of these SINGLE loops from the model IR for individual tests. + +But we can manually collect some small unit tests from these sample IRs. 
For example, you can write `c++` code of a loop from BERT by mimicking its corresponding `affine.for` operations, then use [Polygeist](https://github.com/llvm/Polygeist) to convert this `c++` code into `affine` mlir for further lowering. And that's how we generated tests in `affine2neura/bert`. \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node0/bert_node0.cpp b/test/affine2neura/bert/bert_node0/bert_node0.cpp new file mode 100644 index 00000000..a5d2e86b --- /dev/null +++ b/test/affine2neura/bert/bert_node0/bert_node0.cpp @@ -0,0 +1,11 @@ +void bert_node0( + const int input[1][128], + bool output[1][128]) { + for (int arg3 = 0; arg3 < 1; arg3++) { + for (int arg4 = 0; arg4 < 128; arg4++) { + int value = input[0][arg4]; + bool result = (value > 0); + output[arg3][arg4] = result; + } + } +} \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node0/bert_node0.mlir b/test/affine2neura/bert/bert_node0/bert_node0.mlir new file mode 100644 index 00000000..4c1eef85 --- /dev/null +++ b/test/affine2neura/bert/bert_node0/bert_node0.mlir @@ -0,0 +1,39 @@ +// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir +// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s + +module attributes {} { + func.func @_Z10bert_node0PA128_KiPA128_b(%arg0: memref, %arg1: memref) attributes {} { + %c0_i32 = arith.constant 0 : i32 + affine.for %arg2 = 0 to 128 { + %0 = affine.load %arg0[0, %arg2] : memref + %1 = arith.cmpi sgt, %0, %c0_i32 : i32 + %2 = arith.extui %1 : i1 to i8 + affine.store %2, %arg1[0, %arg2] : memref + } + return + } +} + +// CHECK: func.func @_Z10bert_node0PA128_KiPA128_b(%arg0: memref, %arg1: memref) attributes {accelerator = "neura"} { +// CHECK-NEXT: %0 = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %1 = "neura.constant"() <{value = 128 : index}> : () -> index +// CHECK-NEXT: %2 = "neura.constant"() <{value = 0 : i32}> : () -> 
i32 +// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index +// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%4 : i64) +// CHECK-NEXT: ^bb1(%5: i64): // 2 preds: ^bb0, ^bb2 +// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index +// CHECK-NEXT: %7 = "neura.icmp"(%6, %1) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb3 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: %8 = memref.load %arg0[%3, %6] : memref +// CHECK-NEXT: %9 = "neura.icmp"(%8, %2) <{cmpType = "sgt"}> : (i32, i32) -> i1 +// CHECK-NEXT: %10 = "neura.cast"(%9) <{cast_type = "extui"}> : (i1) -> i8 +// CHECK-NEXT: memref.store %10, %arg1[%3, %6] : memref +// CHECK-NEXT: %11 = "neura.add"(%6, %0) : (index, index) -> index +// CHECK-NEXT: %12 = builtin.unrealized_conversion_cast %11 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%12 : i64) +// CHECK-NEXT: ^bb3: // pred: ^bb1 +// CHECK-NEXT: return +// CHECK-NEXT: } +// CHECK-NEXT: } diff --git a/test/affine2neura/bert/bert_node1/bert_node1.cpp b/test/affine2neura/bert/bert_node1/bert_node1.cpp new file mode 100644 index 00000000..7aa5ca29 --- /dev/null +++ b/test/affine2neura/bert/bert_node1/bert_node1.cpp @@ -0,0 +1,19 @@ +void bert_node1( + bool input[1][1][1][1][1][128], + bool output[1][1][128][1][1][128]) { + + for (int arg3 = 0; arg3 < 1; arg3++) { + for (int arg4 = 0; arg4 < 1; arg4++) { + for (int arg5 = 0; arg5 < 128; arg5++) { + for (int arg6 = 0; arg6 < 1; arg6++) { + for (int arg7 = 0; arg7 < 1; arg7++) { + for (int arg8 = 0; arg8 < 128; arg8++) { + bool value = input[arg3][arg4][0][arg6][arg7][arg8]; + output[arg3][arg4][arg5][arg6][arg7][arg8] = value; + } + } + } + } + } + } +} \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node1/bert_node1.mlir b/test/affine2neura/bert/bert_node1/bert_node1.mlir new file mode 100644 index 00000000..0280d7c3 --- /dev/null +++ 
b/test/affine2neura/bert/bert_node1/bert_node1.mlir @@ -0,0 +1,44 @@ +// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir +// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s +module attributes {} { + func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref, %arg1: memref) attributes {} { + affine.for %arg2 = 0 to 128 { + affine.for %arg3 = 0 to 128 { + %0 = affine.load %arg0[0, 0, 0, 0, 0, %arg3] : memref + affine.store %0, %arg1[0, 0, %arg2, 0, 0, %arg3] : memref + } + } + return + } +} + +// CHECK: func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref, %arg1: memref) attributes {accelerator = "neura"} { +// CHECK-NEXT: %0 = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %1 = "neura.constant"() <{value = 128 : index}> : () -> index +// CHECK-NEXT: %2 = "neura.constant"() <{value = 0 : index}> : () -> index +// CHECK-NEXT: %3 = builtin.unrealized_conversion_cast %2 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%3 : i64) +// CHECK-NEXT: ^bb1(%4: i64): // 2 preds: ^bb0, ^bb5 +// CHECK-NEXT: %5 = builtin.unrealized_conversion_cast %4 : i64 to index +// CHECK-NEXT: %6 = "neura.icmp"(%5, %1) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %6, ^bb2, ^bb6 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: %7 = builtin.unrealized_conversion_cast %2 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%7 : i64) +// CHECK-NEXT: ^bb3(%8: i64): // 2 preds: ^bb2, ^bb4 +// CHECK-NEXT: %9 = builtin.unrealized_conversion_cast %8 : i64 to index +// CHECK-NEXT: %10 = "neura.icmp"(%9, %1) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %10, ^bb4, ^bb5 +// CHECK-NEXT: ^bb4: // pred: ^bb3 +// CHECK-NEXT: %11 = memref.load %arg0[%2, %2, %2, %2, %2, %9] : memref +// CHECK-NEXT: memref.store %11, %arg1[%2, %2, %5, %2, %2, %9] : memref +// CHECK-NEXT: %12 = "neura.add"(%9, %0) : (index, index) -> index +// 
CHECK-NEXT: %13 = builtin.unrealized_conversion_cast %12 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%13 : i64) +// CHECK-NEXT: ^bb5: // pred: ^bb3 +// CHECK-NEXT: %14 = "neura.add"(%5, %0) : (index, index) -> index +// CHECK-NEXT: %15 = builtin.unrealized_conversion_cast %14 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%15 : i64) +// CHECK-NEXT: ^bb6: // pred: ^bb1 +// CHECK-NEXT: return +// CHECK-NEXT: } \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node2/bert_node2.cpp b/test/affine2neura/bert/bert_node2/bert_node2.cpp new file mode 100644 index 00000000..9dda6885 --- /dev/null +++ b/test/affine2neura/bert/bert_node2/bert_node2.cpp @@ -0,0 +1,25 @@ +void bert_node2( + const int input_indices[1][128], + const float embedding_table[30522][768], + float output[1][128][768]) { + const int c30522 = 30522; + const int c0_i64 = 0; + + for (int arg3 = 0; arg3 < 1; arg3++) { + for (int arg4 = 0; arg4 < 128; arg4++) { + for (int arg5 = 0; arg5 < 768; arg5++) { + int index_i64 = input_indices[arg3][arg4]; + int index = static_cast(index_i64); + // Bound checking instead of assertions + if (index >= c30522) { + index = c30522 - 1; // Clamp to maximum valid index + } + if (index < c0_i64) { + index = c0_i64; // Clamp to minimum valid index + } + float extracted_value = embedding_table[index][arg5]; + output[arg3][arg4][arg5] = extracted_value; + } + } + } +} \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node2/bert_node2.mlir b/test/affine2neura/bert/bert_node2/bert_node2.mlir new file mode 100644 index 00000000..6b70666a --- /dev/null +++ b/test/affine2neura/bert/bert_node2/bert_node2.mlir @@ -0,0 +1,78 @@ +// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir +// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s +module attributes {} { + func.func @_Z10bert_node2PA128_KiPA768_KfPA128_A768_f(%arg0: memref, %arg1: memref, %arg2: memref) attributes {} { 
+ %false = arith.constant false + %c30521_i32 = arith.constant 30521 : i32 + %c0_i32 = arith.constant 0 : i32 + %c30522_i32 = arith.constant 30522 : i32 + affine.for %arg3 = 0 to 128 { + affine.for %arg4 = 0 to 768 { + %0 = affine.load %arg0[0, %arg3] : memref + %1 = arith.cmpi sge, %0, %c30522_i32 : i32 + %2 = arith.select %1, %c30521_i32, %0 : i32 + %3 = scf.if %1 -> (i1) { + scf.yield %false : i1 + } else { + %7 = arith.cmpi slt, %0, %c0_i32 : i32 + scf.yield %7 : i1 + } + %4 = arith.select %3, %c0_i32, %2 : i32 + %5 = arith.index_cast %4 : i32 to index + %6 = memref.load %arg1[%5, %arg4] : memref + affine.store %6, %arg2[0, %arg3, %arg4] : memref + } + } + return + } +} + +// CHECK: func.func @_Z10bert_node2PA128_KiPA768_KfPA128_A768_f(%arg0: memref, %arg1: memref, %arg2: memref) attributes {accelerator = "neura"} { +// CHECK-NEXT: %0 = "neura.constant"() <{value = 768 : index}> : () -> index +// CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index +// CHECK-NEXT: %3 = "neura.constant"() <{value = false}> : () -> i1 +// CHECK-NEXT: %4 = "neura.constant"() <{value = 30521 : i32}> : () -> i32 +// CHECK-NEXT: %5 = "neura.constant"() <{value = 0 : i32}> : () -> i32 +// CHECK-NEXT: %6 = "neura.constant"() <{value = 30522 : i32}> : () -> i32 +// CHECK-NEXT: %7 = "neura.constant"() <{value = 0 : index}> : () -> index +// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %7 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%8 : i64) +// CHECK-NEXT: ^bb1(%9: i64): // 2 preds: ^bb0, ^bb9 +// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index +// CHECK-NEXT: %11 = "neura.icmp"(%10, %2) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %11, ^bb2, ^bb10 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: %12 = builtin.unrealized_conversion_cast %7 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%12 : i64) +// CHECK-NEXT: ^bb3(%13: i64): // 2 
preds: ^bb2, ^bb8 +// CHECK-NEXT: %14 = builtin.unrealized_conversion_cast %13 : i64 to index +// CHECK-NEXT: %15 = "neura.icmp"(%14, %0) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %15, ^bb4, ^bb9 +// CHECK-NEXT: ^bb4: // pred: ^bb3 +// CHECK-NEXT: %16 = memref.load %arg0[%7, %10] : memref +// CHECK-NEXT: %17 = "neura.icmp"(%16, %6) <{cmpType = "sge"}> : (i32, i32) -> i1 +// CHECK-NEXT: %18 = "neura.sel"(%4, %16, %17) : (i32, i32, i1) -> i32 +// CHECK-NEXT: llvm.cond_br %17, ^bb5, ^bb6 +// CHECK-NEXT: ^bb5: // pred: ^bb4 +// CHECK-NEXT: llvm.br ^bb7(%3 : i1) +// CHECK-NEXT: ^bb6: // pred: ^bb4 +// CHECK-NEXT: %19 = "neura.icmp"(%16, %5) <{cmpType = "slt"}> : (i32, i32) -> i1 +// CHECK-NEXT: llvm.br ^bb7(%19 : i1) +// CHECK-NEXT: ^bb7(%20: i1): // 2 preds: ^bb5, ^bb6 +// CHECK-NEXT: llvm.br ^bb8 +// CHECK-NEXT: ^bb8: // pred: ^bb7 +// CHECK-NEXT: %21 = "neura.sel"(%5, %18, %20) : (i32, i32, i1) -> i32 +// CHECK-NEXT: %22 = "neura.cast"(%21) <{cast_type = "indexCast"}> : (i32) -> index +// CHECK-NEXT: %23 = memref.load %arg1[%22, %14] : memref +// CHECK-NEXT: memref.store %23, %arg2[%7, %10, %14] : memref +// CHECK-NEXT: %24 = "neura.add"(%14, %1) : (index, index) -> index +// CHECK-NEXT: %25 = builtin.unrealized_conversion_cast %24 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%25 : i64) +// CHECK-NEXT: ^bb9: // pred: ^bb3 +// CHECK-NEXT: %26 = "neura.add"(%10, %1) : (index, index) -> index +// CHECK-NEXT: %27 = builtin.unrealized_conversion_cast %26 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%27 : i64) +// CHECK-NEXT: ^bb10: // pred: ^bb1 +// CHECK-NEXT: return +// CHECK-NEXT: } \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node28/bert_node28.cpp b/test/affine2neura/bert/bert_node28/bert_node28.cpp new file mode 100644 index 00000000..4853daef --- /dev/null +++ b/test/affine2neura/bert/bert_node28/bert_node28.cpp @@ -0,0 +1,19 @@ +void bert_node28(const float input_A[1][128][768], + const float 
input_B[1][768][768], + float output[1][128][768]) { + + for (int arg3 = 0; arg3 < 1; arg3++) { + for (int arg4 = 0; arg4 < 128; arg4++) { + for (int arg5 = 0; arg5 < 768; arg5++) { + for (int arg6 = 0; arg6 < 768; arg6++) { + float val_A = input_A[arg3][arg4][arg6]; + float val_B = input_B[arg3][arg6][arg5]; + float val_C = output[arg3][arg4][arg5]; + float mul_result = val_A * val_B; + float add_result = val_C + mul_result; + output[arg3][arg4][arg5] = add_result; + } + } + } + } +} \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node28/bert_node28.mlir b/test/affine2neura/bert/bert_node28/bert_node28.mlir new file mode 100644 index 00000000..01f54a51 --- /dev/null +++ b/test/affine2neura/bert/bert_node28/bert_node28.mlir @@ -0,0 +1,64 @@ +// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir +// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s +module attributes {} { + func.func @_Z11bert_node28PA128_A768_KfPA768_S0_PA128_A768_f(%arg0: memref, %arg1: memref, %arg2: memref) attributes {} { + affine.for %arg3 = 0 to 128 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %0 = affine.load %arg0[0, %arg3, %arg5] : memref + %1 = affine.load %arg1[0, %arg5, %arg4] : memref + %2 = affine.load %arg2[0, %arg3, %arg4] : memref + %3 = arith.mulf %0, %1 : f32 + %4 = arith.addf %2, %3 : f32 + affine.store %4, %arg2[0, %arg3, %arg4] : memref + } + } + } + return + } +} +// CHECK: func.func @_Z11bert_node28PA128_A768_KfPA768_S0_PA128_A768_f(%arg0: memref, %arg1: memref, %arg2: memref) attributes {accelerator = "neura"} { +// CHECK-NEXT: %0 = "neura.constant"() <{value = 768 : index}> : () -> index +// CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index +// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index +// CHECK-NEXT: %4 = 
builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%4 : i64) +// CHECK-NEXT: ^bb1(%5: i64): // 2 preds: ^bb0, ^bb8 +// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index +// CHECK-NEXT: %7 = "neura.icmp"(%6, %2) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb9 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%8 : i64) +// CHECK-NEXT: ^bb3(%9: i64): // 2 preds: ^bb2, ^bb7 +// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index +// CHECK-NEXT: %11 = "neura.icmp"(%10, %0) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %11, ^bb4, ^bb8 +// CHECK-NEXT: ^bb4: // pred: ^bb3 +// CHECK-NEXT: %12 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb5(%12 : i64) +// CHECK-NEXT: ^bb5(%13: i64): // 2 preds: ^bb4, ^bb6 +// CHECK-NEXT: %14 = builtin.unrealized_conversion_cast %13 : i64 to index +// CHECK-NEXT: %15 = "neura.icmp"(%14, %0) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %15, ^bb6, ^bb7 +// CHECK-NEXT: ^bb6: // pred: ^bb5 +// CHECK-NEXT: %16 = memref.load %arg0[%3, %6, %14] : memref +// CHECK-NEXT: %17 = memref.load %arg1[%3, %14, %10] : memref +// CHECK-NEXT: %18 = memref.load %arg2[%3, %6, %10] : memref +// CHECK-NEXT: %19 = "neura.fmul"(%16, %17) : (f32, f32) -> f32 +// CHECK-NEXT: %20 = "neura.fadd"(%18, %19) : (f32, f32) -> f32 +// CHECK-NEXT: memref.store %20, %arg2[%3, %6, %10] : memref +// CHECK-NEXT: %21 = "neura.add"(%14, %1) : (index, index) -> index +// CHECK-NEXT: %22 = builtin.unrealized_conversion_cast %21 : index to i64 +// CHECK-NEXT: llvm.br ^bb5(%22 : i64) +// CHECK-NEXT: ^bb7: // pred: ^bb5 +// CHECK-NEXT: %23 = "neura.add"(%10, %1) : (index, index) -> index +// CHECK-NEXT: %24 = builtin.unrealized_conversion_cast %23 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%24 : i64) +// 
CHECK-NEXT: ^bb8: // pred: ^bb3 +// CHECK-NEXT: %25 = "neura.add"(%6, %1) : (index, index) -> index +// CHECK-NEXT: %26 = builtin.unrealized_conversion_cast %25 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%26 : i64) +// CHECK-NEXT: ^bb9: // pred: ^bb1 +// CHECK-NEXT: return \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node3/bert_node3.cpp b/test/affine2neura/bert/bert_node3/bert_node3.cpp new file mode 100644 index 00000000..ddafc0a6 --- /dev/null +++ b/test/affine2neura/bert/bert_node3/bert_node3.cpp @@ -0,0 +1,14 @@ +void bert_node3(const float input1[1][128][768], + const float input2[1][128][768], float output[1][128][768]) { + + for (int arg3 = 0; arg3 < 1; arg3++) { + for (int arg4 = 0; arg4 < 128; arg4++) { + for (int arg5 = 0; arg5 < 768; arg5++) { + float val1 = input1[0][arg4][arg5]; + float val2 = input2[0][arg4][arg5]; + float sum = val1 + val2; + output[arg3][arg4][arg5] = sum; + } + } + } +} \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node3/bert_node3.mlir b/test/affine2neura/bert/bert_node3/bert_node3.mlir new file mode 100644 index 00000000..1c400deb --- /dev/null +++ b/test/affine2neura/bert/bert_node3/bert_node3.mlir @@ -0,0 +1,48 @@ +// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir +// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s +module attributes {} { + func.func @_Z10bert_node3PA128_A768_KfS2_PA128_A768_f(%arg0: memref, %arg1: memref, %arg2: memref) attributes {} { + affine.for %arg3 = 0 to 128 { + affine.for %arg4 = 0 to 768 { + %0 = affine.load %arg0[0, %arg3, %arg4] : memref + %1 = affine.load %arg1[0, %arg3, %arg4] : memref + %2 = arith.addf %0, %1 : f32 + affine.store %2, %arg2[0, %arg3, %arg4] : memref + } + } + return + } +} + +// CHECK: func.func @_Z10bert_node3PA128_A768_KfS2_PA128_A768_f(%arg0: memref, %arg1: memref, %arg2: memref) attributes {accelerator = "neura"} { +// CHECK-NEXT: %0 = 
"neura.constant"() <{value = 768 : index}> : () -> index +// CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index +// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index +// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%4 : i64) +// CHECK-NEXT: ^bb1(%5: i64): // 2 preds: ^bb0, ^bb5 +// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index +// CHECK-NEXT: %7 = "neura.icmp"(%6, %2) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb6 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%8 : i64) +// CHECK-NEXT: ^bb3(%9: i64): // 2 preds: ^bb2, ^bb4 +// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index +// CHECK-NEXT: %11 = "neura.icmp"(%10, %0) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %11, ^bb4, ^bb5 +// CHECK-NEXT: ^bb4: // pred: ^bb3 +// CHECK-NEXT: %12 = memref.load %arg0[%3, %6, %10] : memref +// CHECK-NEXT: %13 = memref.load %arg1[%3, %6, %10] : memref +// CHECK-NEXT: %14 = "neura.fadd"(%12, %13) : (f32, f32) -> f32 +// CHECK-NEXT: memref.store %14, %arg2[%3, %6, %10] : memref +// CHECK-NEXT: %15 = "neura.add"(%10, %1) : (index, index) -> index +// CHECK-NEXT: %16 = builtin.unrealized_conversion_cast %15 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%16 : i64) +// CHECK-NEXT: ^bb5: // pred: ^bb3 +// CHECK-NEXT: %17 = "neura.add"(%6, %1) : (index, index) -> index +// CHECK-NEXT: %18 = builtin.unrealized_conversion_cast %17 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%18 : i64) +// CHECK-NEXT: ^bb6: // pred: ^bb1 +// CHECK-NEXT: return diff --git a/test/affine2neura/bert/bert_node8/bert_node8.cpp b/test/affine2neura/bert/bert_node8/bert_node8.cpp new file mode 100644 index 00000000..e2054b7d --- /dev/null +++ 
b/test/affine2neura/bert/bert_node8/bert_node8.cpp @@ -0,0 +1,14 @@ +void bert_node8( + const float input[1][128][1], + float output[1][128][1]) { + const float divisor = 768.0f; + for (int arg3 = 0; arg3 < 1; arg3++) { + for (int arg4 = 0; arg4 < 128; arg4++) { + for (int arg5 = 0; arg5 < 1; arg5++) { + float value = input[0][arg4][0]; + float result = value / divisor; + output[arg3][arg4][arg5] = result; + } + } + } +} \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node8/bert_node8.mlir b/test/affine2neura/bert/bert_node8/bert_node8.mlir new file mode 100644 index 00000000..dbb59d40 --- /dev/null +++ b/test/affine2neura/bert/bert_node8/bert_node8.mlir @@ -0,0 +1,34 @@ +// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir +// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s +module attributes {} { + func.func @_Z10bert_node8PA128_A1_KfPA128_A1_f(%arg0: memref, %arg1: memref) attributes {} { + %cst = arith.constant 7.680000e+02 : f32 + affine.for %arg2 = 0 to 128 { + %0 = affine.load %arg0[0, %arg2, 0] : memref + %1 = arith.divf %0, %cst : f32 + affine.store %1, %arg1[0, %arg2, 0] : memref + } + return + } +} + +// CHECK: func.func @_Z10bert_node8PA128_A1_KfPA128_A1_f(%arg0: memref, %arg1: memref) attributes {accelerator = "neura"} { +// CHECK-NEXT: %0 = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %1 = "neura.constant"() <{value = 128 : index}> : () -> index +// CHECK-NEXT: %2 = "neura.constant"() <{value = 7.680000e+02 : f32}> : () -> f32 +// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index +// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%4 : i64) +// CHECK-NEXT: ^bb1(%5: i64): // 2 preds: ^bb0, ^bb2 +// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index +// CHECK-NEXT: %7 = "neura.icmp"(%6, %1) <{cmpType = "slt"}> : (index, index) -> i1 
+// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb3 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: %8 = memref.load %arg0[%3, %6, %3] : memref +// CHECK-NEXT: %9 = "neura.fdiv"(%8, %2) : (f32, f32) -> f32 +// CHECK-NEXT: memref.store %9, %arg1[%3, %6, %3] : memref +// CHECK-NEXT: %10 = "neura.add"(%6, %0) : (index, index) -> index +// CHECK-NEXT: %11 = builtin.unrealized_conversion_cast %10 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%11 : i64) +// CHECK-NEXT: ^bb3: // pred: ^bb1 +// CHECK-NEXT: return diff --git a/test/affine2neura/bert/bert_node9/bert_node9.cpp b/test/affine2neura/bert/bert_node9/bert_node9.cpp new file mode 100644 index 00000000..63f63756 --- /dev/null +++ b/test/affine2neura/bert/bert_node9/bert_node9.cpp @@ -0,0 +1,13 @@ +void bert_node9( + const float input[1][128][768], + double output[1][128][768]) { + for (int arg3 = 0; arg3 < 1; arg3++) { + for (int arg4 = 0; arg4 < 128; arg4++) { + for (int arg5 = 0; arg5 < 768; arg5++) { + float value = input[0][arg4][arg5]; + double extended_value = static_cast(value); + output[arg3][arg4][arg5] = extended_value; + } + } + } +} \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node9/bert_node9.mlir b/test/affine2neura/bert/bert_node9/bert_node9.mlir new file mode 100644 index 00000000..3641e16b --- /dev/null +++ b/test/affine2neura/bert/bert_node9/bert_node9.mlir @@ -0,0 +1,47 @@ +// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir +// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s +module attributes {} { + func.func @_Z10bert_node9PA128_A768_KfPA128_A768_d(%arg0: memref, %arg1: memref) attributes {} { + affine.for %arg2 = 0 to 128 { + affine.for %arg3 = 0 to 768 { + %0 = affine.load %arg0[0, %arg2, %arg3] : memref + %1 = arith.extf %0 : f32 to f64 + affine.store %1, %arg1[0, %arg2, %arg3] : memref + } + } + return + } +} + + +// CHECK: func.func @_Z10bert_node9PA128_A768_KfPA128_A768_d(%arg0: memref, 
%arg1: memref) attributes {accelerator = "neura"} { +// CHECK-NEXT: %0 = "neura.constant"() <{value = 768 : index}> : () -> index +// CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index +// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index +// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%4 : i64) +// CHECK-NEXT: ^bb1(%5: i64): // 2 preds: ^bb0, ^bb5 +// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index +// CHECK-NEXT: %7 = "neura.icmp"(%6, %2) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb6 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%8 : i64) +// CHECK-NEXT: ^bb3(%9: i64): // 2 preds: ^bb2, ^bb4 +// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index +// CHECK-NEXT: %11 = "neura.icmp"(%10, %0) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %11, ^bb4, ^bb5 +// CHECK-NEXT: ^bb4: // pred: ^bb3 +// CHECK-NEXT: %12 = memref.load %arg0[%3, %6, %10] : memref +// CHECK-NEXT: %13 = "neura.cast"(%12) <{cast_type = "extf"}> : (f32) -> f64 +// CHECK-NEXT: memref.store %13, %arg1[%3, %6, %10] : memref +// CHECK-NEXT: %14 = "neura.add"(%10, %1) : (index, index) -> index +// CHECK-NEXT: %15 = builtin.unrealized_conversion_cast %14 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%15 : i64) +// CHECK-NEXT: ^bb5: // pred: ^bb3 +// CHECK-NEXT: %16 = "neura.add"(%6, %1) : (index, index) -> index +// CHECK-NEXT: %17 = builtin.unrealized_conversion_cast %16 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%17 : i64) +// CHECK-NEXT: ^bb6: // pred: ^bb1 +// CHECK-NEXT: return diff --git a/test/lit.cfg.in b/test/lit.cfg.in index aa45ea35..2530f5c3 100644 --- a/test/lit.cfg.in +++ b/test/lit.cfg.in @@ -6,6 +6,7 @@ config.test_format 
= lit.formats.ShTest(True) config.suffixes = ['.mlir'] config.test_source_root = os.path.dirname(__file__) config.test_exec_root = os.path.dirname(__file__) +config.excludes = ['samples'] # Tool substitutions from CMake config.substitutions.append(('mlir-neura-opt', '@MLIR_NEURA_OPT@')) diff --git a/test/neura/llvm_sub.mlir b/test/neura/llvm_sub.mlir index 1cf1fbf4..8b1f8b27 100644 --- a/test/neura/llvm_sub.mlir +++ b/test/neura/llvm_sub.mlir @@ -5,6 +5,6 @@ func.func @test(%a: f32) -> f32 { %res = llvm.fsub %a, %b : f32 // CHECK: [[LHS:%.*]] = "neura.data_mov"(%{{.*}}) : (f32) -> f32 // CHECK: [[RHS:%.*]] = "neura.data_mov"(%{{.*}}) : (f32) -> f32 - // CHECK: [[RES:%.*]] = "neura.fsub"([[LHS]], [[RHS]]) + // CHECK: [[RES:%.*]] = "neura.fsub"([[LHS]], [[RHS]]) : (f32, f32) -> f32 return %res : f32 } \ No newline at end of file diff --git a/test/samples/bert/bert_affine.mlir b/test/samples/bert/bert_affine.mlir new file mode 100644 index 00000000..e47b9f88 --- /dev/null +++ b/test/samples/bert/bert_affine.mlir @@ -0,0 +1,2266 @@ +module { + func.func @main(%arg0: tensor<1x512x768xf32>, %arg1: tensor<1x128xi64>, %arg2: tensor<1x128xi64>) -> tensor<1x128x768xf32> { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<-1.000000e+09> : tensor + %cst_0 = arith.constant dense_resource : tensor<768xf32> + %cst_1 = arith.constant dense_resource : tensor<768x3072xf32> + %cst_2 = arith.constant dense_resource : tensor<3072xf32> + %cst_3 = arith.constant dense_resource : tensor<3072x768xf32> + %cst_4 = arith.constant dense_resource : tensor<768xf32> + %cst_5 = arith.constant dense_resource : tensor<768xf32> + %cst_6 = arith.constant dense_resource : tensor<768xf32> + %cst_7 = arith.constant dense_resource : tensor<768x768xf32> + %cst_8 = arith.constant dense_resource : tensor<768xf32> + %cst_9 = arith.constant dense_resource : tensor<768x768xf32> + %cst_10 = arith.constant dense_resource : tensor<768xf32> + %cst_11 = arith.constant dense_resource : 
tensor<768x768xf32> + %cst_12 = arith.constant dense_resource : tensor<768xf32> + %cst_13 = arith.constant dense_resource : tensor<768x768xf32> + %cst_14 = arith.constant dense_resource : tensor<768xf32> + %cst_15 = arith.constant dense_resource : tensor<768xf32> + %cst_16 = arith.constant dense_resource : tensor<768xf32> + %cst_17 = arith.constant dense_resource : tensor<768x3072xf32> + %cst_18 = arith.constant dense_resource : tensor<3072xf32> + %cst_19 = arith.constant dense_resource : tensor<3072x768xf32> + %cst_20 = arith.constant dense_resource : tensor<768xf32> + %cst_21 = arith.constant dense_resource : tensor<768xf32> + %cst_22 = arith.constant dense_resource : tensor<768xf32> + %cst_23 = arith.constant dense_resource : tensor<768x768xf32> + %cst_24 = arith.constant dense_resource : tensor<768xf32> + %cst_25 = arith.constant dense_resource : tensor<768x768xf32> + %cst_26 = arith.constant dense_resource : tensor<768xf32> + %cst_27 = arith.constant dense_resource : tensor<768x768xf32> + %cst_28 = arith.constant dense_resource : tensor<768xf32> + %cst_29 = arith.constant dense_resource : tensor<768x768xf32> + %cst_30 = arith.constant dense_resource : tensor<768xf32> + %cst_31 = arith.constant dense_resource : tensor<768xf32> + %cst_32 = arith.constant dense_resource : tensor<3x768xf32> + %cst_33 = arith.constant 1.000000e+00 : f32 + %cst_34 = arith.constant 3.000000e+00 : f32 + %cst_35 = arith.constant 5.000000e-01 : f32 + %cst_36 = arith.constant 8.000000e+00 : f32 + %cst_37 = arith.constant 7.680000e+02 : f64 + %cst_38 = arith.constant 7.680000e+02 : f32 + %cst_39 = arith.constant 9.9999999999999995E-7 : f64 + %cst_40 = arith.constant 4.471500e-02 : f64 + %cst_41 = arith.constant 0.79788456080286541 : f64 + %cst_42 = arith.constant 7.670000e+02 : f64 + %cst_43 = arith.constant 0xFF800000 : f32 + %cst_44 = arith.constant 0.000000e+00 : f64 + %cst_45 = arith.constant 0.000000e+00 : f32 + %c30522 = arith.constant 30522 : index + %c3 = arith.constant 3 : index 
+ %c0_i64 = arith.constant 0 : i64 + %cst_46 = arith.constant dense_resource : tensor<30522x768xf32> + %0 = bufferization.to_memref %arg2 : memref<1x128xi64> + %1 = bufferization.to_memref %arg1 : memref<1x128xi64> + %2 = bufferization.to_memref %arg1 : memref<1x128xi64> + %3 = bufferization.to_memref %cst_31 : memref<768xf32> + %4 = bufferization.to_memref %cst_30 : memref<768xf32> + %5 = bufferization.to_memref %cst_29 : memref<768x768xf32> + %6 = bufferization.to_memref %cst_28 : memref<768xf32> + %7 = bufferization.to_memref %cst_27 : memref<768x768xf32> + %8 = bufferization.to_memref %cst_26 : memref<768xf32> + %9 = bufferization.to_memref %cst_25 : memref<768x768xf32> + %10 = bufferization.to_memref %cst_24 : memref<768xf32> + %11 = bufferization.to_memref %cst_23 : memref<768x768xf32> + %12 = bufferization.to_memref %cst_22 : memref<768xf32> + %13 = bufferization.to_memref %cst_21 : memref<768xf32> + %14 = bufferization.to_memref %cst_20 : memref<768xf32> + %15 = bufferization.to_memref %cst_19 : memref<3072x768xf32> + %16 = bufferization.to_memref %cst_18 : memref<3072xf32> + %17 = bufferization.to_memref %cst_17 : memref<768x3072xf32> + %18 = bufferization.to_memref %cst_16 : memref<768xf32> + %19 = bufferization.to_memref %cst_15 : memref<768xf32> + %20 = bufferization.to_memref %cst_14 : memref<768xf32> + %21 = bufferization.to_memref %cst_13 : memref<768x768xf32> + %22 = bufferization.to_memref %cst_12 : memref<768xf32> + %23 = bufferization.to_memref %cst_11 : memref<768x768xf32> + %24 = bufferization.to_memref %cst_10 : memref<768xf32> + %25 = bufferization.to_memref %cst_9 : memref<768x768xf32> + %26 = bufferization.to_memref %cst_8 : memref<768xf32> + %27 = bufferization.to_memref %cst_7 : memref<768x768xf32> + %28 = bufferization.to_memref %cst_6 : memref<768xf32> + %29 = bufferization.to_memref %cst_5 : memref<768xf32> + %30 = bufferization.to_memref %cst_4 : memref<768xf32> + %31 = bufferization.to_memref %cst_3 : memref<3072x768xf32> + %32 = 
bufferization.to_memref %cst_2 : memref<3072xf32> + %33 = bufferization.to_memref %cst_1 : memref<768x3072xf32> + %34 = bufferization.to_memref %cst_0 : memref<768xf32> + %35 = bufferization.to_memref %cst : memref + %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x128xi1> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + %88 = affine.load %2[%c0, %arg4] : memref<1x128xi64> + %89 = arith.cmpi sgt, %88, %c0_i64 : i64 + affine.store %89, %alloc[%arg3, %arg4] : memref<1x128xi1> + } + } + %36 = bufferization.to_tensor %alloc : memref<1x128xi1> + %expanded = tensor.expand_shape %36 [[0, 1], [2, 3, 4, 5]] : tensor<1x128xi1> into tensor<1x1x1x1x1x128xi1> + %37 = bufferization.to_memref %expanded : memref<1x1x1x1x1x128xi1> + %alloc_47 = memref.alloc() {alignment = 64 : i64} : memref<1x1x128x1x1x128xi1> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 1 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 1 { + affine.for %arg7 = 0 to 1 { + affine.for %arg8 = 0 to 128 { + %88 = affine.load %37[%arg3, %arg4, %c0, %arg6, %arg7, %arg8] : memref<1x1x1x1x1x128xi1> + affine.store %88, %alloc_47[%arg3, %arg4, %arg5, %arg6, %arg7, %arg8] : memref<1x1x128x1x1x128xi1> + } + } + } + } + } + } + %38 = bufferization.to_tensor %alloc_47 : memref<1x1x128x1x1x128xi1> + %collapsed = tensor.collapse_shape %38 [[0], [1, 2], [3, 4, 5]] : tensor<1x1x128x1x1x128xi1> into tensor<1x128x128xi1> + %expanded_48 = tensor.expand_shape %collapsed [[0], [1, 2], [3]] : tensor<1x128x128xi1> into tensor<1x1x128x128xi1> + %39 = bufferization.to_memref %expanded_48 : memref<1x1x128x128xi1> + %alloc_49 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %1[%arg3, %arg4] : memref<1x128xi64> + %89 = arith.index_cast %88 : i64 to index + %90 = arith.cmpi slt, %89, %c30522 : index + cf.assert %90, "index must be smaller than dim size" + %91 = arith.cmpi 
sge, %88, %c0_i64 : i64 + cf.assert %91, "index must be larger or equal to 0" + %extracted = tensor.extract %cst_46[%89, %arg5] : tensor<30522x768xf32> + affine.store %extracted, %alloc_49[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %extracted_slice = tensor.extract_slice %arg0[0, 0, 0] [1, 128, 768] [1, 1, 1] : tensor<1x512x768xf32> to tensor<1x128x768xf32> + %40 = bufferization.to_memref %extracted_slice : memref<1x128x768xf32> + %alloc_50 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_49[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %40[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_50[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_51 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %0[%arg3, %arg4] : memref<1x128xi64> + %89 = arith.index_cast %88 : i64 to index + %90 = arith.cmpi slt, %89, %c3 : index + cf.assert %90, "index must be smaller than dim size" + %91 = arith.cmpi sge, %88, %c0_i64 : i64 + cf.assert %91, "index must be larger or equal to 0" + %extracted = tensor.extract %cst_32[%89, %arg5] : tensor<3x768xf32> + affine.store %extracted, %alloc_51[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_52 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_50[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_51[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_52[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_53 = memref.alloc() {alignment = 64 : i64} : 
memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + affine.store %cst_45, %alloc_53[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_54 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + memref.copy %alloc_53, %alloc_54 : memref<1x128x1xf32> to memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_52[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_54[%arg3, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_54[%arg3, %arg4, %c0] : memref<1x128x1xf32> + } + } + } + %alloc_55 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_54[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.divf %88, %cst_38 : f32 + affine.store %89, %alloc_55[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_56 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_52[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = arith.extf %88 : f32 to f64 + affine.store %89, %alloc_56[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_57 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + affine.store %cst_44, %alloc_57[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_58 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_58 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_56[%arg3, %arg4, %arg5] : 
memref<1x128x768xf64> + %89 = affine.load %alloc_58[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_58[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_59 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_58[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_37 : f64 + affine.store %89, %alloc_59[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_60 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_56[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_59[%c0, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.subf %88, %89 : f64 + affine.store %90, %alloc_60[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_61 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_60[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_60[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %90 = arith.mulf %88, %89 : f64 + affine.store %90, %alloc_61[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_62 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_62 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_61[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_62[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_62[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_63 = memref.alloc() {alignment = 64 : 
i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_62[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_42 : f64 + affine.store %89, %alloc_63[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_64 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_63[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.truncf %88 : f64 to f32 + affine.store %89, %alloc_64[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_65 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_64[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = math.sqrt %88 : f32 + affine.store %89, %alloc_65[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_66 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_52[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_55[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.subf %88, %89 : f32 + affine.store %90, %alloc_66[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_67 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %3[%arg5] : memref<768xf32> + %89 = affine.load %alloc_66[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_67[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_68 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + 
affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_65[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.truncf %cst_39 : f64 to f32 + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_68[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_69 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_67[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_68[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.divf %88, %89 : f32 + affine.store %90, %alloc_69[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_70 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_69[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %4[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_70[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_71 = memref.alloc() {alignment = 64 : i64} : memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %5[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_71[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_72 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_70[%c0, %arg4, %arg5] : memref<1x128x768xf32> + affine.store %88, %alloc_72[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_73 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_71[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_73[%arg3, %arg4, %arg5] : 
memref<1x768x768xf32> + } + } + } + %alloc_74 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.store %cst_45, %alloc_74[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_75 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_75 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_72[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_73[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_75[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_75[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_76 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_75[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %6[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_76[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %41 = bufferization.to_tensor %alloc_76 : memref<1x128x768xf32> + %expanded_77 = tensor.expand_shape %41 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %42 = bufferization.to_memref %expanded_77 : memref<1x128x12x64xf32> + %alloc_78 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 12 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %42[%arg3, %arg4, %arg5, %arg6] : memref<1x128x12x64xf32> + affine.store %88, %alloc_78[%arg3, %arg5, %arg4, %arg6] : memref<1x12x128x64xf32> + } + } 
+ } + } + %alloc_79 = memref.alloc() {alignment = 64 : i64} : memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %7[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_79[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_80 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_79[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_80[%arg3, %arg4, %arg5] : memref<1x768x768xf32> + } + } + } + %alloc_81 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_81 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_72[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_80[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_81[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_81[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_82 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_81[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %8[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_82[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %43 = bufferization.to_tensor %alloc_82 : memref<1x128x768xf32> + %expanded_83 = tensor.expand_shape %43 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %44 = bufferization.to_memref %expanded_83 : memref<1x128x12x64xf32> + %alloc_84 = memref.alloc() {alignment = 64 : i64} : 
memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %9[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_84[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_85 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_84[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_85[%arg3, %arg4, %arg5] : memref<1x768x768xf32> + } + } + } + %alloc_86 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_86 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_72[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_85[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_86[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_86[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_87 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_86[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %10[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_87[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %45 = bufferization.to_tensor %alloc_87 : memref<1x128x768xf32> + %expanded_88 = tensor.expand_shape %45 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %46 = bufferization.to_memref %expanded_88 : memref<1x128x12x64xf32> + %alloc_89 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 
= 0 to 128 { + affine.for %arg5 = 0 to 12 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %46[%arg3, %arg4, %arg5, %arg6] : memref<1x128x12x64xf32> + affine.store %88, %alloc_89[%arg3, %arg5, %arg4, %arg6] : memref<1x12x128x64xf32> + } + } + } + } + %alloc_90 = memref.alloc() {alignment = 64 : i64} : memref<1x12x64x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 12 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %44[%arg3, %arg4, %arg5, %arg6] : memref<1x128x12x64xf32> + affine.store %88, %alloc_90[%arg3, %arg5, %arg6, %arg4] : memref<1x12x64x128xf32> + } + } + } + } + %alloc_91 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %alloc_78[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + affine.store %88, %alloc_91[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + } + } + } + } + %47 = bufferization.to_tensor %alloc_91 : memref<1x12x128x64xf32> + %alloc_92 = memref.alloc() {alignment = 64 : i64} : memref<1x12x64x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 64 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_90[%c0, %arg4, %arg5, %arg6] : memref<1x12x64x128xf32> + affine.store %88, %alloc_92[%arg3, %arg4, %arg5, %arg6] : memref<1x12x64x128xf32> + } + } + } + } + %48 = bufferization.to_tensor %alloc_92 : memref<1x12x64x128xf32> + %collapsed_93 = tensor.collapse_shape %47 [[0, 1], [2], [3]] : tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %49 = bufferization.to_memref %collapsed_93 : memref<12x128x64xf32> + %collapsed_94 = tensor.collapse_shape %48 [[0, 1], [2], [3]] : tensor<1x12x64x128xf32> into tensor<12x64x128xf32> + %50 = bufferization.to_memref %collapsed_94 : memref<12x64x128xf32> + %alloc_95 = memref.alloc() {alignment = 64 : i64} : memref<12x128x128xf32> + affine.for 
%arg3 = 0 to 12 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 128 { + affine.store %cst_45, %alloc_95[%arg3, %arg4, %arg5] : memref<12x128x128xf32> + } + } + } + %alloc_96 = memref.alloc() {alignment = 64 : i64} : memref<12x128x128xf32> + memref.copy %alloc_95, %alloc_96 : memref<12x128x128xf32> to memref<12x128x128xf32> + affine.for %arg3 = 0 to 12 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %49[%arg3, %arg4, %arg6] : memref<12x128x64xf32> + %89 = affine.load %50[%arg3, %arg6, %arg5] : memref<12x64x128xf32> + %90 = affine.load %alloc_96[%arg3, %arg4, %arg5] : memref<12x128x128xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_96[%arg3, %arg4, %arg5] : memref<12x128x128xf32> + } + } + } + } + %51 = bufferization.to_tensor %alloc_96 : memref<12x128x128xf32> + %expanded_97 = tensor.expand_shape %51 [[0, 1], [2], [3]] : tensor<12x128x128xf32> into tensor<1x12x128x128xf32> + %52 = bufferization.to_memref %expanded_97 : memref<1x12x128x128xf32> + %alloc_98 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %52[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = arith.divf %88, %cst_36 : f32 + affine.store %89, %alloc_98[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_99 = memref.alloc() {alignment = 64 : i64} : memref<1x1x128x128xi1> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 1 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %39[%c0, %c0, %arg5, %arg6] : memref<1x1x128x128xi1> + %89 = arith.extui %88 : i1 to i64 + %90 = arith.cmpi eq, %89, %c0_i64 : i64 + affine.store %90, %alloc_99[%arg3, %arg4, %arg5, %arg6] : memref<1x1x128x128xi1> + } + } + } + } + %alloc_100 = memref.alloc() 
{alignment = 64 : i64} : memref + %53 = affine.load %35[] : memref + %54 = arith.truncf %53 : f64 to f32 + affine.store %54, %alloc_100[] : memref + %alloc_101 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_99[%c0, %c0, %arg5, %arg6] : memref<1x1x128x128xi1> + %89 = affine.load %alloc_100[] : memref + %90 = affine.load %alloc_98[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %91 = arith.select %88, %89, %90 : f32 + affine.store %91, %alloc_101[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_102 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128xi64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.store %c0_i64, %alloc_102[%arg3, %arg4, %arg5] : memref<1x12x128xi64> + } + } + } + %alloc_103 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.store %cst_43, %alloc_103[%arg3, %arg4, %arg5] : memref<1x12x128xf32> + } + } + } + %alloc_104 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128xf32> + memref.copy %alloc_103, %alloc_104 : memref<1x12x128xf32> to memref<1x12x128xf32> + %alloc_105 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128xi64> + memref.copy %alloc_102, %alloc_105 : memref<1x12x128xi64> to memref<1x12x128xi64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_101[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %alloc_104[%arg3, %arg4, %arg5] : memref<1x12x128xf32> + %90 = affine.load %alloc_105[%arg3, %arg4, %arg5] : memref<1x12x128xi64> + %91 = arith.index_cast %arg6 : index to i64 + %92 = arith.maximumf %88, %89 : f32 + %93 = 
arith.cmpf ogt, %88, %89 : f32 + %94 = arith.select %93, %91, %90 : i64 + affine.store %92, %alloc_104[%arg3, %arg4, %arg5] : memref<1x12x128xf32> + affine.store %94, %alloc_105[%arg3, %arg4, %arg5] : memref<1x12x128xi64> + } + } + } + } + %55 = bufferization.to_tensor %alloc_104 : memref<1x12x128xf32> + %expanded_106 = tensor.expand_shape %55 [[0], [1], [2, 3]] : tensor<1x12x128xf32> into tensor<1x12x128x1xf32> + %56 = bufferization.to_memref %expanded_106 : memref<1x12x128x1xf32> + %alloc_107 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_101[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %56[%c0, %arg4, %arg5, %c0] : memref<1x12x128x1xf32> + %90 = arith.subf %88, %89 : f32 + affine.store %90, %alloc_107[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_108 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_107[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = math.exp %88 : f32 + affine.store %89, %alloc_108[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_109 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 1 { + affine.store %cst_45, %alloc_109[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x1xf32> + } + } + } + } + %alloc_110 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x1xf32> + memref.copy %alloc_109, %alloc_110 : memref<1x12x128x1xf32> to memref<1x12x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { 
+ %88 = affine.load %alloc_108[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %alloc_110[%arg3, %arg4, %arg5, %c0] : memref<1x12x128x1xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_110[%arg3, %arg4, %arg5, %c0] : memref<1x12x128x1xf32> + } + } + } + } + %alloc_111 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_108[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %alloc_110[%c0, %arg4, %arg5, %c0] : memref<1x12x128x1xf32> + %90 = arith.divf %88, %89 : f32 + affine.store %90, %alloc_111[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_112 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_111[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + affine.store %88, %alloc_112[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %57 = bufferization.to_tensor %alloc_112 : memref<1x12x128x128xf32> + %alloc_113 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %alloc_89[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + affine.store %88, %alloc_113[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + } + } + } + } + %58 = bufferization.to_tensor %alloc_113 : memref<1x12x128x64xf32> + %collapsed_114 = tensor.collapse_shape %57 [[0, 1], [2], [3]] : tensor<1x12x128x128xf32> into tensor<12x128x128xf32> + %59 = bufferization.to_memref %collapsed_114 : memref<12x128x128xf32> + %collapsed_115 = tensor.collapse_shape %58 [[0, 1], [2], [3]] : 
tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %60 = bufferization.to_memref %collapsed_115 : memref<12x128x64xf32> + %alloc_116 = memref.alloc() {alignment = 64 : i64} : memref<12x128x64xf32> + affine.for %arg3 = 0 to 12 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 64 { + affine.store %cst_45, %alloc_116[%arg3, %arg4, %arg5] : memref<12x128x64xf32> + } + } + } + %alloc_117 = memref.alloc() {alignment = 64 : i64} : memref<12x128x64xf32> + memref.copy %alloc_116, %alloc_117 : memref<12x128x64xf32> to memref<12x128x64xf32> + affine.for %arg3 = 0 to 12 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 64 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %59[%arg3, %arg4, %arg6] : memref<12x128x128xf32> + %89 = affine.load %60[%arg3, %arg6, %arg5] : memref<12x128x64xf32> + %90 = affine.load %alloc_117[%arg3, %arg4, %arg5] : memref<12x128x64xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_117[%arg3, %arg4, %arg5] : memref<12x128x64xf32> + } + } + } + } + %61 = bufferization.to_tensor %alloc_117 : memref<12x128x64xf32> + %expanded_118 = tensor.expand_shape %61 [[0, 1], [2], [3]] : tensor<12x128x64xf32> into tensor<1x12x128x64xf32> + %62 = bufferization.to_memref %expanded_118 : memref<1x12x128x64xf32> + %alloc_119 = memref.alloc() {alignment = 64 : i64} : memref<1x128x12x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %62[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + affine.store %88, %alloc_119[%arg3, %arg5, %arg4, %arg6] : memref<1x128x12x64xf32> + } + } + } + } + %63 = bufferization.to_tensor %alloc_119 : memref<1x128x12x64xf32> + %collapsed_120 = tensor.collapse_shape %63 [[0], [1], [2, 3]] : tensor<1x128x12x64xf32> into tensor<1x128x768xf32> + %64 = bufferization.to_memref %collapsed_120 : memref<1x128x768xf32> + %alloc_121 = memref.alloc() {alignment = 64 : i64} : 
memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %11[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_121[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_122 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %64[%c0, %arg4, %arg5] : memref<1x128x768xf32> + affine.store %88, %alloc_122[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_123 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_121[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_123[%arg3, %arg4, %arg5] : memref<1x768x768xf32> + } + } + } + %alloc_124 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_124 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_122[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_123[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_124[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_124[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_125 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_124[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %12[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_125[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_126 = memref.alloc() 
{alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_52[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_125[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_126[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_127 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + memref.copy %alloc_53, %alloc_127 : memref<1x128x1xf32> to memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_126[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_127[%arg3, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_127[%arg3, %arg4, %c0] : memref<1x128x1xf32> + } + } + } + %alloc_128 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_127[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.divf %88, %cst_38 : f32 + affine.store %89, %alloc_128[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_129 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_126[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = arith.extf %88 : f32 to f64 + affine.store %89, %alloc_129[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_130 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_130 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_129[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + %89 = 
affine.load %alloc_130[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_130[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_131 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_130[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_37 : f64 + affine.store %89, %alloc_131[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_132 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_129[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_131[%c0, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.subf %88, %89 : f64 + affine.store %90, %alloc_132[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_133 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_132[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_132[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %90 = arith.mulf %88, %89 : f64 + affine.store %90, %alloc_133[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_134 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_134 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_133[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_134[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_134[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_135 = memref.alloc() {alignment = 64 : i64} : 
memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_134[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_42 : f64 + affine.store %89, %alloc_135[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_136 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_135[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.truncf %88 : f64 to f32 + affine.store %89, %alloc_136[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_137 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_136[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = math.sqrt %88 : f32 + affine.store %89, %alloc_137[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_138 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_126[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_128[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.subf %88, %89 : f32 + affine.store %90, %alloc_138[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_139 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %13[%arg5] : memref<768xf32> + %89 = affine.load %alloc_138[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_139[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_140 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 
to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_137[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.truncf %cst_39 : f64 to f32 + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_140[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_141 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_139[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_140[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.divf %88, %89 : f32 + affine.store %90, %alloc_141[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_142 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_141[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %14[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_142[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_143 = memref.alloc() {alignment = 64 : i64} : memref<768x3072xf32> + affine.for %arg3 = 0 to 3072 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %15[%arg3, %arg4] : memref<3072x768xf32> + affine.store %88, %alloc_143[%arg4, %arg3] : memref<768x3072xf32> + } + } + %alloc_144 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_142[%c0, %arg4, %arg5] : memref<1x128x768xf32> + affine.store %88, %alloc_144[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_145 = memref.alloc() {alignment = 64 : i64} : memref<1x768x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_143[%arg4, %arg5] : memref<768x3072xf32> + affine.store %88, 
%alloc_145[%arg3, %arg4, %arg5] : memref<1x768x3072xf32> + } + } + } + %alloc_146 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + affine.store %cst_45, %alloc_146[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_147 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + memref.copy %alloc_146, %alloc_147 : memref<1x128x3072xf32> to memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_144[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_145[%arg3, %arg6, %arg5] : memref<1x768x3072xf32> + %90 = affine.load %alloc_147[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_147[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + } + %alloc_148 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_147[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = affine.load %16[%arg5] : memref<3072xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_148[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_149 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_148[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = arith.mulf %88, %cst_35 : f32 + affine.store %89, %alloc_149[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_150 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = 
affine.load %alloc_148[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = math.powf %88, %cst_34 : f32 + affine.store %89, %alloc_150[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_151 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_150[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = arith.truncf %cst_40 : f64 to f32 + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_151[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_152 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_148[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = affine.load %alloc_151[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_152[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_153 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_152[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = arith.truncf %cst_41 : f64 to f32 + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_153[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_154 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_153[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = math.tanh %88 : f32 + affine.store %89, %alloc_154[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_155 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 
3072 { + %88 = affine.load %alloc_154[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = arith.addf %88, %cst_33 : f32 + affine.store %89, %alloc_155[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_156 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_149[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = affine.load %alloc_155[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_156[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_157 = memref.alloc() {alignment = 64 : i64} : memref<3072x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 3072 { + %88 = affine.load %17[%arg3, %arg4] : memref<768x3072xf32> + affine.store %88, %alloc_157[%arg4, %arg3] : memref<3072x768xf32> + } + } + %alloc_158 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_156[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + affine.store %88, %alloc_158[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_159 = memref.alloc() {alignment = 64 : i64} : memref<1x3072x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 3072 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_157[%arg4, %arg5] : memref<3072x768xf32> + affine.store %88, %alloc_159[%arg3, %arg4, %arg5] : memref<1x3072x768xf32> + } + } + } + %alloc_160 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_160 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 3072 { + %88 = affine.load %alloc_158[%arg3, %arg4, %arg6] : memref<1x128x3072xf32> + %89 = affine.load 
%alloc_159[%arg3, %arg6, %arg5] : memref<1x3072x768xf32> + %90 = affine.load %alloc_160[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_160[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_161 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_160[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %18[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_161[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_162 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_126[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_161[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_162[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_163 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + memref.copy %alloc_53, %alloc_163 : memref<1x128x1xf32> to memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_162[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_163[%arg3, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_163[%arg3, %arg4, %c0] : memref<1x128x1xf32> + } + } + } + %alloc_164 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_163[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.divf %88, %cst_38 : f32 + affine.store %89, %alloc_164[%arg3, %arg4, %arg5] : 
memref<1x128x1xf32> + } + } + } + %alloc_165 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_162[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = arith.extf %88 : f32 to f64 + affine.store %89, %alloc_165[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_166 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_166 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_165[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_166[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_166[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_167 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_166[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_37 : f64 + affine.store %89, %alloc_167[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_168 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_165[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_167[%c0, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.subf %88, %89 : f64 + affine.store %90, %alloc_168[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_169 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_168[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_168[%c0, 
%arg4, %arg5] : memref<1x128x768xf64> + %90 = arith.mulf %88, %89 : f64 + affine.store %90, %alloc_169[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_170 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_170 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_169[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_170[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_170[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_171 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_170[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_42 : f64 + affine.store %89, %alloc_171[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_172 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_171[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.truncf %88 : f64 to f32 + affine.store %89, %alloc_172[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_173 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_172[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = math.sqrt %88 : f32 + affine.store %89, %alloc_173[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_174 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_162[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 
= affine.load %alloc_164[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.subf %88, %89 : f32 + affine.store %90, %alloc_174[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_175 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %19[%arg5] : memref<768xf32> + %89 = affine.load %alloc_174[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_175[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_176 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_173[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.truncf %cst_39 : f64 to f32 + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_176[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_177 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_175[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_176[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.divf %88, %89 : f32 + affine.store %90, %alloc_177[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_178 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_177[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %20[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_178[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_179 = memref.alloc() {alignment = 64 : i64} : memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 
= affine.load %21[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_179[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_180 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_178[%c0, %arg4, %arg5] : memref<1x128x768xf32> + affine.store %88, %alloc_180[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_181 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_179[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_181[%arg3, %arg4, %arg5] : memref<1x768x768xf32> + } + } + } + %alloc_182 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_182 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_180[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_181[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_182[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_182[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_183 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_182[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %22[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_183[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %65 = bufferization.to_tensor %alloc_183 : memref<1x128x768xf32> + %expanded_184 = tensor.expand_shape %65 [[0], [1], 
[2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %66 = bufferization.to_memref %expanded_184 : memref<1x128x12x64xf32> + %alloc_185 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 12 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %66[%arg3, %arg4, %arg5, %arg6] : memref<1x128x12x64xf32> + affine.store %88, %alloc_185[%arg3, %arg5, %arg4, %arg6] : memref<1x12x128x64xf32> + } + } + } + } + %alloc_186 = memref.alloc() {alignment = 64 : i64} : memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %23[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_186[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_187 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_186[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_187[%arg3, %arg4, %arg5] : memref<1x768x768xf32> + } + } + } + %alloc_188 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_188 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_180[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_187[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_188[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_188[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_189 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_188[%c0, 
%arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %24[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_189[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %67 = bufferization.to_tensor %alloc_189 : memref<1x128x768xf32> + %expanded_190 = tensor.expand_shape %67 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %68 = bufferization.to_memref %expanded_190 : memref<1x128x12x64xf32> + %alloc_191 = memref.alloc() {alignment = 64 : i64} : memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %25[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_191[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_192 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_191[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_192[%arg3, %arg4, %arg5] : memref<1x768x768xf32> + } + } + } + %alloc_193 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_193 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_180[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_192[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_193[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_193[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_194 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_193[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = 
affine.load %26[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_194[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %69 = bufferization.to_tensor %alloc_194 : memref<1x128x768xf32> + %expanded_195 = tensor.expand_shape %69 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %70 = bufferization.to_memref %expanded_195 : memref<1x128x12x64xf32> + %alloc_196 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 12 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %70[%arg3, %arg4, %arg5, %arg6] : memref<1x128x12x64xf32> + affine.store %88, %alloc_196[%arg3, %arg5, %arg4, %arg6] : memref<1x12x128x64xf32> + } + } + } + } + %alloc_197 = memref.alloc() {alignment = 64 : i64} : memref<1x12x64x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 12 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %68[%arg3, %arg4, %arg5, %arg6] : memref<1x128x12x64xf32> + affine.store %88, %alloc_197[%arg3, %arg5, %arg6, %arg4] : memref<1x12x64x128xf32> + } + } + } + } + %alloc_198 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %alloc_185[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + affine.store %88, %alloc_198[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + } + } + } + } + %71 = bufferization.to_tensor %alloc_198 : memref<1x12x128x64xf32> + %alloc_199 = memref.alloc() {alignment = 64 : i64} : memref<1x12x64x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 64 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_197[%c0, %arg4, %arg5, %arg6] : memref<1x12x64x128xf32> + affine.store %88, %alloc_199[%arg3, %arg4, %arg5, %arg6] : 
memref<1x12x64x128xf32> + } + } + } + } + %72 = bufferization.to_tensor %alloc_199 : memref<1x12x64x128xf32> + %collapsed_200 = tensor.collapse_shape %71 [[0, 1], [2], [3]] : tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %73 = bufferization.to_memref %collapsed_200 : memref<12x128x64xf32> + %collapsed_201 = tensor.collapse_shape %72 [[0, 1], [2], [3]] : tensor<1x12x64x128xf32> into tensor<12x64x128xf32> + %74 = bufferization.to_memref %collapsed_201 : memref<12x64x128xf32> + %alloc_202 = memref.alloc() {alignment = 64 : i64} : memref<12x128x128xf32> + memref.copy %alloc_95, %alloc_202 : memref<12x128x128xf32> to memref<12x128x128xf32> + affine.for %arg3 = 0 to 12 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %73[%arg3, %arg4, %arg6] : memref<12x128x64xf32> + %89 = affine.load %74[%arg3, %arg6, %arg5] : memref<12x64x128xf32> + %90 = affine.load %alloc_202[%arg3, %arg4, %arg5] : memref<12x128x128xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_202[%arg3, %arg4, %arg5] : memref<12x128x128xf32> + } + } + } + } + %75 = bufferization.to_tensor %alloc_202 : memref<12x128x128xf32> + %expanded_203 = tensor.expand_shape %75 [[0, 1], [2], [3]] : tensor<12x128x128xf32> into tensor<1x12x128x128xf32> + %76 = bufferization.to_memref %expanded_203 : memref<1x12x128x128xf32> + %alloc_204 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %76[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = arith.divf %88, %cst_36 : f32 + affine.store %89, %alloc_204[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_205 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 
128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_99[%c0, %c0, %arg5, %arg6] : memref<1x1x128x128xi1> + %89 = affine.load %alloc_100[] : memref + %90 = affine.load %alloc_204[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %91 = arith.select %88, %89, %90 : f32 + affine.store %91, %alloc_205[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_206 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128xf32> + memref.copy %alloc_103, %alloc_206 : memref<1x12x128xf32> to memref<1x12x128xf32> + %alloc_207 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128xi64> + memref.copy %alloc_102, %alloc_207 : memref<1x12x128xi64> to memref<1x12x128xi64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_205[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %alloc_206[%arg3, %arg4, %arg5] : memref<1x12x128xf32> + %90 = affine.load %alloc_207[%arg3, %arg4, %arg5] : memref<1x12x128xi64> + %91 = arith.index_cast %arg6 : index to i64 + %92 = arith.maximumf %88, %89 : f32 + %93 = arith.cmpf ogt, %88, %89 : f32 + %94 = arith.select %93, %91, %90 : i64 + affine.store %92, %alloc_206[%arg3, %arg4, %arg5] : memref<1x12x128xf32> + affine.store %94, %alloc_207[%arg3, %arg4, %arg5] : memref<1x12x128xi64> + } + } + } + } + %77 = bufferization.to_tensor %alloc_206 : memref<1x12x128xf32> + %expanded_208 = tensor.expand_shape %77 [[0], [1], [2, 3]] : tensor<1x12x128xf32> into tensor<1x12x128x1xf32> + %78 = bufferization.to_memref %expanded_208 : memref<1x12x128x1xf32> + %alloc_209 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_205[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %78[%c0, %arg4, %arg5, %c0] : 
memref<1x12x128x1xf32> + %90 = arith.subf %88, %89 : f32 + affine.store %90, %alloc_209[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_210 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_209[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = math.exp %88 : f32 + affine.store %89, %alloc_210[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_211 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x1xf32> + memref.copy %alloc_109, %alloc_211 : memref<1x12x128x1xf32> to memref<1x12x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_210[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %alloc_211[%arg3, %arg4, %arg5, %c0] : memref<1x12x128x1xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_211[%arg3, %arg4, %arg5, %c0] : memref<1x12x128x1xf32> + } + } + } + } + %alloc_212 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_210[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %alloc_211[%c0, %arg4, %arg5, %c0] : memref<1x12x128x1xf32> + %90 = arith.divf %88, %89 : f32 + affine.store %90, %alloc_212[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_213 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_212[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + affine.store %88, %alloc_213[%arg3, 
%arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %79 = bufferization.to_tensor %alloc_213 : memref<1x12x128x128xf32> + %alloc_214 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %alloc_196[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + affine.store %88, %alloc_214[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + } + } + } + } + %80 = bufferization.to_tensor %alloc_214 : memref<1x12x128x64xf32> + %collapsed_215 = tensor.collapse_shape %79 [[0, 1], [2], [3]] : tensor<1x12x128x128xf32> into tensor<12x128x128xf32> + %81 = bufferization.to_memref %collapsed_215 : memref<12x128x128xf32> + %collapsed_216 = tensor.collapse_shape %80 [[0, 1], [2], [3]] : tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %82 = bufferization.to_memref %collapsed_216 : memref<12x128x64xf32> + %alloc_217 = memref.alloc() {alignment = 64 : i64} : memref<12x128x64xf32> + memref.copy %alloc_116, %alloc_217 : memref<12x128x64xf32> to memref<12x128x64xf32> + affine.for %arg3 = 0 to 12 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 64 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %81[%arg3, %arg4, %arg6] : memref<12x128x128xf32> + %89 = affine.load %82[%arg3, %arg6, %arg5] : memref<12x128x64xf32> + %90 = affine.load %alloc_217[%arg3, %arg4, %arg5] : memref<12x128x64xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_217[%arg3, %arg4, %arg5] : memref<12x128x64xf32> + } + } + } + } + %83 = bufferization.to_tensor %alloc_217 : memref<12x128x64xf32> + %expanded_218 = tensor.expand_shape %83 [[0, 1], [2], [3]] : tensor<12x128x64xf32> into tensor<1x12x128x64xf32> + %84 = bufferization.to_memref %expanded_218 : memref<1x12x128x64xf32> + %alloc_219 = memref.alloc() {alignment = 64 : i64} : memref<1x128x12x64xf32> + affine.for %arg3 = 0 to 1 { + 
affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %84[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + affine.store %88, %alloc_219[%arg3, %arg5, %arg4, %arg6] : memref<1x128x12x64xf32> + } + } + } + } + %85 = bufferization.to_tensor %alloc_219 : memref<1x128x12x64xf32> + %collapsed_220 = tensor.collapse_shape %85 [[0], [1], [2, 3]] : tensor<1x128x12x64xf32> into tensor<1x128x768xf32> + %86 = bufferization.to_memref %collapsed_220 : memref<1x128x768xf32> + %alloc_221 = memref.alloc() {alignment = 64 : i64} : memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %27[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_221[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_222 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %86[%c0, %arg4, %arg5] : memref<1x128x768xf32> + affine.store %88, %alloc_222[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_223 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_221[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_223[%arg3, %arg4, %arg5] : memref<1x768x768xf32> + } + } + } + %alloc_224 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_224 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_222[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_223[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_224[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : 
f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_224[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_225 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_224[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %28[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_225[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_226 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_162[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_225[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_226[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_227 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + memref.copy %alloc_53, %alloc_227 : memref<1x128x1xf32> to memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_226[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_227[%arg3, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_227[%arg3, %arg4, %c0] : memref<1x128x1xf32> + } + } + } + %alloc_228 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_227[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.divf %88, %cst_38 : f32 + affine.store %89, %alloc_228[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_229 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 
128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_226[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = arith.extf %88 : f32 to f64 + affine.store %89, %alloc_229[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_230 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_230 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_229[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_230[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_230[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_231 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_230[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_37 : f64 + affine.store %89, %alloc_231[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_232 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_229[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_231[%c0, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.subf %88, %89 : f64 + affine.store %90, %alloc_232[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_233 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_232[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_232[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %90 = arith.mulf %88, %89 : f64 + affine.store %90, %alloc_233[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + 
%alloc_234 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_234 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_233[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_234[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_234[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_235 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_234[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_42 : f64 + affine.store %89, %alloc_235[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_236 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_235[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.truncf %88 : f64 to f32 + affine.store %89, %alloc_236[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_237 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_236[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = math.sqrt %88 : f32 + affine.store %89, %alloc_237[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_238 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_226[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_228[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.subf %88, %89 : f32 + affine.store %90, %alloc_238[%arg3, %arg4, %arg5] : 
memref<1x128x768xf32> + } + } + } + %alloc_239 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %29[%arg5] : memref<768xf32> + %89 = affine.load %alloc_238[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_239[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_240 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_237[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.truncf %cst_39 : f64 to f32 + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_240[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_241 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_239[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_240[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.divf %88, %89 : f32 + affine.store %90, %alloc_241[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_242 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_241[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %30[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_242[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_243 = memref.alloc() {alignment = 64 : i64} : memref<768x3072xf32> + affine.for %arg3 = 0 to 3072 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %31[%arg3, %arg4] : memref<3072x768xf32> + affine.store %88, %alloc_243[%arg4, %arg3] : memref<768x3072xf32> + } + } + %alloc_244 = 
memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_242[%c0, %arg4, %arg5] : memref<1x128x768xf32> + affine.store %88, %alloc_244[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_245 = memref.alloc() {alignment = 64 : i64} : memref<1x768x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_243[%arg4, %arg5] : memref<768x3072xf32> + affine.store %88, %alloc_245[%arg3, %arg4, %arg5] : memref<1x768x3072xf32> + } + } + } + %alloc_246 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + memref.copy %alloc_146, %alloc_246 : memref<1x128x3072xf32> to memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_244[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_245[%arg3, %arg6, %arg5] : memref<1x768x3072xf32> + %90 = affine.load %alloc_246[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_246[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + } + %alloc_247 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_246[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = affine.load %32[%arg5] : memref<3072xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_247[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_248 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_247[%c0, %arg4, %arg5] : 
memref<1x128x3072xf32> + %89 = arith.mulf %88, %cst_35 : f32 + affine.store %89, %alloc_248[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_249 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_247[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = math.powf %88, %cst_34 : f32 + affine.store %89, %alloc_249[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_250 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_249[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = arith.truncf %cst_40 : f64 to f32 + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_250[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_251 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_247[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = affine.load %alloc_250[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_251[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_252 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_251[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = arith.truncf %cst_41 : f64 to f32 + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_252[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_253 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load 
%alloc_252[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = math.tanh %88 : f32 + affine.store %89, %alloc_253[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_254 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_253[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = arith.addf %88, %cst_33 : f32 + affine.store %89, %alloc_254[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_255 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_248[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = affine.load %alloc_254[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_255[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_256 = memref.alloc() {alignment = 64 : i64} : memref<3072x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 3072 { + %88 = affine.load %33[%arg3, %arg4] : memref<768x3072xf32> + affine.store %88, %alloc_256[%arg4, %arg3] : memref<3072x768xf32> + } + } + %alloc_257 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_255[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + affine.store %88, %alloc_257[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_258 = memref.alloc() {alignment = 64 : i64} : memref<1x3072x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 3072 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_256[%arg4, %arg5] : memref<3072x768xf32> + affine.store %88, %alloc_258[%arg3, %arg4, %arg5] : memref<1x3072x768xf32> + } + } + } + %alloc_259 = memref.alloc() {alignment = 64 : i64} : 
memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_259 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 3072 { + %88 = affine.load %alloc_257[%arg3, %arg4, %arg6] : memref<1x128x3072xf32> + %89 = affine.load %alloc_258[%arg3, %arg6, %arg5] : memref<1x3072x768xf32> + %90 = affine.load %alloc_259[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_259[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_260 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_259[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %34[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_260[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_261 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_226[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_260[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_261[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %87 = bufferization.to_tensor %alloc_261 : memref<1x128x768xf32> + return %87 : tensor<1x128x768xf32> + } +} + diff --git a/test/samples/bert/bert_linalg.mlir b/test/samples/bert/bert_linalg.mlir new file mode 100644 index 00000000..2a663edc --- /dev/null +++ b/test/samples/bert/bert_linalg.mlir @@ -0,0 +1,900 @@ +#map = affine_map<(d0, d1) -> (0, d1)> +#map1 = affine_map<(d0, d1) -> (d0, d1)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, 0, d3, d4, d5)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5) 
-> (d0, d1, d2, d3, d4, d5)> +#map4 = affine_map<(d0, d1, d2) -> (d0, d1)> +#map5 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map6 = affine_map<(d0, d1, d2) -> (0, d1, d2)> +#map7 = affine_map<(d0, d1, d2) -> (d0, d1, 0)> +#map8 = affine_map<(d0, d1, d2) -> (0, d1, 0)> +#map9 = affine_map<(d0, d1, d2) -> (d2)> +#map10 = affine_map<(d0, d1, d2) -> (d1, d2)> +#map11 = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)> +#map12 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map13 = affine_map<(d0, d1, d2, d3) -> (0, 0, d2, d3)> +#map14 = affine_map<() -> ()> +#map15 = affine_map<(d0, d1, d2, d3) -> ()> +#map16 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> +#map17 = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, 0)> +#map18 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, 0)> +module { + func.func @main(%arg0: tensor<1x512x768xf32>, %arg1: tensor<1x128xi64>, %arg2: tensor<1x128xi64>) -> tensor<1x128x768xf32> { + %cst = arith.constant dense_resource : tensor<30522x768xf32> + %c0_i64 = arith.constant 0 : i64 + %c3 = arith.constant 3 : index + %c30522 = arith.constant 30522 : index + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 0.000000e+00 : f64 + %cst_2 = arith.constant 0xFF800000 : f32 + %cst_3 = arith.constant 7.670000e+02 : f64 + %cst_4 = arith.constant 0.79788456080286541 : f64 + %cst_5 = arith.constant 4.471500e-02 : f64 + %cst_6 = arith.constant 9.9999999999999995E-7 : f64 + %cst_7 = arith.constant 7.680000e+02 : f32 + %cst_8 = arith.constant 7.680000e+02 : f64 + %cst_9 = arith.constant 8.000000e+00 : f32 + %cst_10 = arith.constant 5.000000e-01 : f32 + %cst_11 = arith.constant 3.000000e+00 : f32 + %cst_12 = arith.constant 1.000000e+00 : f32 + %cst_13 = arith.constant dense_resource : tensor<3x768xf32> + %cst_14 = arith.constant dense_resource : tensor<768xf32> + %cst_15 = arith.constant dense_resource : tensor<768xf32> + %cst_16 = arith.constant dense_resource : tensor<768x768xf32> + %cst_17 = arith.constant dense_resource : tensor<768xf32> + 
%cst_18 = arith.constant dense_resource : tensor<768x768xf32> + %cst_19 = arith.constant dense_resource : tensor<768xf32> + %cst_20 = arith.constant dense_resource : tensor<768x768xf32> + %cst_21 = arith.constant dense_resource : tensor<768xf32> + %cst_22 = arith.constant dense_resource : tensor<768x768xf32> + %cst_23 = arith.constant dense_resource : tensor<768xf32> + %cst_24 = arith.constant dense_resource : tensor<768xf32> + %cst_25 = arith.constant dense_resource : tensor<768xf32> + %cst_26 = arith.constant dense_resource : tensor<3072x768xf32> + %cst_27 = arith.constant dense_resource : tensor<3072xf32> + %cst_28 = arith.constant dense_resource : tensor<768x3072xf32> + %cst_29 = arith.constant dense_resource : tensor<768xf32> + %cst_30 = arith.constant dense_resource : tensor<768xf32> + %cst_31 = arith.constant dense_resource : tensor<768xf32> + %cst_32 = arith.constant dense_resource : tensor<768x768xf32> + %cst_33 = arith.constant dense_resource : tensor<768xf32> + %cst_34 = arith.constant dense_resource : tensor<768x768xf32> + %cst_35 = arith.constant dense_resource : tensor<768xf32> + %cst_36 = arith.constant dense_resource : tensor<768x768xf32> + %cst_37 = arith.constant dense_resource : tensor<768xf32> + %cst_38 = arith.constant dense_resource : tensor<768x768xf32> + %cst_39 = arith.constant dense_resource : tensor<768xf32> + %cst_40 = arith.constant dense_resource : tensor<768xf32> + %cst_41 = arith.constant dense_resource : tensor<768xf32> + %cst_42 = arith.constant dense_resource : tensor<3072x768xf32> + %cst_43 = arith.constant dense_resource : tensor<3072xf32> + %cst_44 = arith.constant dense_resource : tensor<768x3072xf32> + %cst_45 = arith.constant dense_resource : tensor<768xf32> + %cst_46 = arith.constant dense<-1.000000e+09> : tensor + %0 = tensor.empty() : tensor<1x128xi1> + %1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1 : tensor<1x128xi64>) outs(%0 : tensor<1x128xi1>) { + ^bb0(%in: 
i64, %out: i1): + %195 = arith.cmpi sgt, %in, %c0_i64 : i64 + linalg.yield %195 : i1 + } -> tensor<1x128xi1> + %expanded = tensor.expand_shape %1 [[0, 1], [2, 3, 4, 5]] : tensor<1x128xi1> into tensor<1x1x1x1x1x128xi1> + %2 = tensor.empty() : tensor<1x1x128x1x1x128xi1> + %3 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<1x1x1x1x1x128xi1>) outs(%2 : tensor<1x1x128x1x1x128xi1>) { + ^bb0(%in: i1, %out: i1): + linalg.yield %in : i1 + } -> tensor<1x1x128x1x1x128xi1> + %collapsed = tensor.collapse_shape %3 [[0], [1, 2], [3, 4, 5]] : tensor<1x1x128x1x1x128xi1> into tensor<1x128x128xi1> + %expanded_47 = tensor.expand_shape %collapsed [[0], [1, 2], [3]] : tensor<1x128x128xi1> into tensor<1x1x128x128xi1> + %4 = tensor.empty() : tensor<1x128x768xf32> + %5 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg1 : tensor<1x128xi64>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: i64, %out: f32): + %195 = arith.index_cast %in : i64 to index + %196 = linalg.index 2 : index + %197 = arith.cmpi slt, %195, %c30522 : index + cf.assert %197, "index must be smaller than dim size" + %198 = arith.cmpi sge, %in, %c0_i64 : i64 + cf.assert %198, "index must be larger or equal to 0" + %extracted = tensor.extract %cst[%195, %196] : tensor<30522x768xf32> + linalg.yield %extracted : f32 + } -> tensor<1x128x768xf32> + %extracted_slice = tensor.extract_slice %arg0[0, 0, 0] [1, 128, 768] [1, 1, 1] : tensor<1x512x768xf32> to tensor<1x128x768xf32> + %6 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %extracted_slice : tensor<1x128x768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %7 = linalg.generic 
{indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg2 : tensor<1x128xi64>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: i64, %out: f32): + %195 = arith.index_cast %in : i64 to index + %196 = linalg.index 2 : index + %197 = arith.cmpi slt, %195, %c3 : index + cf.assert %197, "index must be smaller than dim size" + %198 = arith.cmpi sge, %in, %c0_i64 : i64 + cf.assert %198, "index must be larger or equal to 0" + %extracted = tensor.extract %cst_13[%195, %196] : tensor<3x768xf32> + linalg.yield %extracted : f32 + } -> tensor<1x128x768xf32> + %8 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%6, %7 : tensor<1x128x768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %9 = tensor.empty() : tensor<1x128x1xf32> + %10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<1x128x1xf32>) -> tensor<1x128x1xf32> + %11 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%8 : tensor<1x128x768xf32>) outs(%10 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %out : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %12 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%11 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.divf %in, %cst_7 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %13 = tensor.empty() : tensor<1x128x768xf64> + %14 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8 : tensor<1x128x768xf32>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f32, %out: f64): + %195 = arith.extf %in : f32 to f64 + linalg.yield %195 : f64 + } -> 
tensor<1x128x768xf64> + %15 = tensor.empty() : tensor<1x128x1xf64> + %16 = linalg.fill ins(%cst_1 : f64) outs(%15 : tensor<1x128x1xf64>) -> tensor<1x128x1xf64> + %17 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%14 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %18 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_8 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %19 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%14, %18 : tensor<1x128x768xf64>, tensor<1x128x1xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.subf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %20 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%19, %19 : tensor<1x128x768xf64>, tensor<1x128x768xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.mulf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %21 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%20 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %22 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%21 : tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_3 : f64 + 
linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %23 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%22 : tensor<1x128x1xf64>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f64, %out: f32): + %195 = arith.truncf %in : f64 to f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %24 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%23 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.sqrt %in : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %25 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8, %12 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.subf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %26 = linalg.generic {indexing_maps = [#map9, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_14, %25 : tensor<768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.mulf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %27 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%24 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_6 : f64 to f32 + %196 = arith.addf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x1xf32> + %28 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%26, %27 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.divf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> 
tensor<1x128x768xf32> + %29 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%28, %cst_15 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %30 = tensor.empty() : tensor<768x768xf32> + %transposed = linalg.transpose ins(%cst_16 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %31 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%29 : tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x128x768xf32> + %32 = tensor.empty() : tensor<1x768x768xf32> + %33 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %34 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %35 = linalg.batch_matmul ins(%31, %33 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %36 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%35, %cst_17 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %expanded_48 = tensor.expand_shape %36 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %37 = tensor.empty() : tensor<1x12x128x64xf32> + %transposed_49 = linalg.transpose ins(%expanded_48 : tensor<1x128x12x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) permutation = 
[0, 2, 1, 3] + %transposed_50 = linalg.transpose ins(%cst_18 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %38 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_50 : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %39 = linalg.batch_matmul ins(%31, %38 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %40 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%39, %cst_19 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %expanded_51 = tensor.expand_shape %40 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %transposed_52 = linalg.transpose ins(%cst_20 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %41 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_52 : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %42 = linalg.batch_matmul ins(%31, %41 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %43 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%42, %cst_21 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %expanded_53 = tensor.expand_shape %43 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into 
tensor<1x128x12x64xf32> + %transposed_54 = linalg.transpose ins(%expanded_53 : tensor<1x128x12x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) permutation = [0, 2, 1, 3] + %44 = tensor.empty() : tensor<1x12x64x128xf32> + %transposed_55 = linalg.transpose ins(%expanded_51 : tensor<1x128x12x64xf32>) outs(%44 : tensor<1x12x64x128xf32>) permutation = [0, 2, 3, 1] + %45 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed_49 : tensor<1x12x128x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x128x64xf32> + %46 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed_55 : tensor<1x12x64x128xf32>) outs(%44 : tensor<1x12x64x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x64x128xf32> + %collapsed_56 = tensor.collapse_shape %45 [[0, 1], [2], [3]] : tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %collapsed_57 = tensor.collapse_shape %46 [[0, 1], [2], [3]] : tensor<1x12x64x128xf32> into tensor<12x64x128xf32> + %47 = tensor.empty() : tensor<12x128x128xf32> + %48 = linalg.fill ins(%cst_0 : f32) outs(%47 : tensor<12x128x128xf32>) -> tensor<12x128x128xf32> + %49 = linalg.batch_matmul ins(%collapsed_56, %collapsed_57 : tensor<12x128x64xf32>, tensor<12x64x128xf32>) outs(%48 : tensor<12x128x128xf32>) -> tensor<12x128x128xf32> + %expanded_58 = tensor.expand_shape %49 [[0, 1], [2], [3]] : tensor<12x128x128xf32> into tensor<1x12x128x128xf32> + %50 = tensor.empty() : tensor<1x12x128x128xf32> + %51 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_58 : tensor<1x12x128x128xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.divf %in, %cst_9 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + 
%52 = tensor.empty() : tensor<1x1x128x128xi1> + %53 = linalg.generic {indexing_maps = [#map13, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_47 : tensor<1x1x128x128xi1>) outs(%52 : tensor<1x1x128x128xi1>) { + ^bb0(%in: i1, %out: i1): + %195 = arith.extui %in : i1 to i64 + %196 = arith.cmpi eq, %195, %c0_i64 : i64 + linalg.yield %196 : i1 + } -> tensor<1x1x128x128xi1> + %54 = tensor.empty() : tensor + %55 = linalg.generic {indexing_maps = [#map14, #map14], iterator_types = []} ins(%cst_46 : tensor) outs(%54 : tensor) { + ^bb0(%in: f64, %out: f32): + %195 = arith.truncf %in : f64 to f32 + linalg.yield %195 : f32 + } -> tensor + %56 = linalg.generic {indexing_maps = [#map13, #map15, #map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %55, %51 : tensor<1x1x128x128xi1>, tensor, tensor<1x12x128x128xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: i1, %in_89: f32, %in_90: f32, %out: f32): + %195 = arith.select %in, %in_89, %in_90 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %57 = tensor.empty() : tensor<1x12x128xi64> + %58 = linalg.fill ins(%c0_i64 : i64) outs(%57 : tensor<1x12x128xi64>) -> tensor<1x12x128xi64> + %59 = tensor.empty() : tensor<1x12x128xf32> + %60 = linalg.fill ins(%cst_2 : f32) outs(%59 : tensor<1x12x128xf32>) -> tensor<1x12x128xf32> + %61:2 = linalg.generic {indexing_maps = [#map12, #map16, #map16], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%56 : tensor<1x12x128x128xf32>) outs(%60, %58 : tensor<1x12x128xf32>, tensor<1x12x128xi64>) { + ^bb0(%in: f32, %out: f32, %out_89: i64): + %195 = linalg.index 3 : index + %196 = arith.index_cast %195 : index to i64 + %197 = arith.maximumf %in, %out : f32 + %198 = arith.cmpf ogt, %in, %out : f32 + %199 = arith.select %198, %196, %out_89 : i64 + linalg.yield %197, %199 : f32, i64 + } -> (tensor<1x12x128xf32>, tensor<1x12x128xi64>) + %expanded_59 = tensor.expand_shape %61#0 [[0], 
[1], [2, 3]] : tensor<1x12x128xf32> into tensor<1x12x128x1xf32> + %62 = linalg.generic {indexing_maps = [#map11, #map17, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%56, %expanded_59 : tensor<1x12x128x128xf32>, tensor<1x12x128x1xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.subf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %63 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%62 : tensor<1x12x128x128xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.exp %in : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %64 = tensor.empty() : tensor<1x12x128x1xf32> + %65 = linalg.fill ins(%cst_0 : f32) outs(%64 : tensor<1x12x128x1xf32>) -> tensor<1x12x128x1xf32> + %66 = linalg.generic {indexing_maps = [#map12, #map18], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%63 : tensor<1x12x128x128xf32>) outs(%65 : tensor<1x12x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %out : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x1xf32> + %67 = linalg.generic {indexing_maps = [#map11, #map17, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%63, %66 : tensor<1x12x128x128xf32>, tensor<1x12x128x1xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.divf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %68 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%67 : tensor<1x12x128x128xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x128x128xf32> + %69 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", 
"parallel", "parallel"]} ins(%transposed_54 : tensor<1x12x128x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x128x64xf32> + %collapsed_60 = tensor.collapse_shape %68 [[0, 1], [2], [3]] : tensor<1x12x128x128xf32> into tensor<12x128x128xf32> + %collapsed_61 = tensor.collapse_shape %69 [[0, 1], [2], [3]] : tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %70 = tensor.empty() : tensor<12x128x64xf32> + %71 = linalg.fill ins(%cst_0 : f32) outs(%70 : tensor<12x128x64xf32>) -> tensor<12x128x64xf32> + %72 = linalg.batch_matmul ins(%collapsed_60, %collapsed_61 : tensor<12x128x128xf32>, tensor<12x128x64xf32>) outs(%71 : tensor<12x128x64xf32>) -> tensor<12x128x64xf32> + %expanded_62 = tensor.expand_shape %72 [[0, 1], [2], [3]]: tensor<12x128x64xf32> into tensor<1x12x128x64xf32> + %73 = tensor.empty() : tensor<1x128x12x64xf32> + %transposed_63 = linalg.transpose ins(%expanded_62 : tensor<1x12x128x64xf32>) outs(%73 : tensor<1x128x12x64xf32>) permutation = [0, 2, 1, 3] + %collapsed_64 = tensor.collapse_shape %transposed_63 [[0], [1], [2, 3]] : tensor<1x128x12x64xf32> into tensor<1x128x768xf32> + %transposed_65 = linalg.transpose ins(%cst_22 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %74 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_64 : tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x128x768xf32> + %75 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_65 : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %76 = linalg.batch_matmul ins(%74, %75 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %77 = 
linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%76, %cst_23 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %78 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8, %77 : tensor<1x128x768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %79 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%78 : tensor<1x128x768xf32>) outs(%10 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %out : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %80 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%79 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.divf %in, %cst_7 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %81 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%78 : tensor<1x128x768xf32>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f32, %out: f64): + %195 = arith.extf %in : f32 to f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %82 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%81 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %83 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%82 : 
tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_8 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %84 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%81, %83 : tensor<1x128x768xf64>, tensor<1x128x1xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.subf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %85 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%84, %84 : tensor<1x128x768xf64>, tensor<1x128x768xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.mulf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %86 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%85 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %87 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%86 : tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_3 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %88 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%87 : tensor<1x128x1xf64>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f64, %out: f32): + %195 = arith.truncf %in : f64 to f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %89 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%88 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.sqrt %in : f32 + 
linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %90 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%78, %80 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.subf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %91 = linalg.generic {indexing_maps = [#map9, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_24, %90 : tensor<768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.mulf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %92 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%89 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_6 : f64 to f32 + %196 = arith.addf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x1xf32> + %93 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%91, %92 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.divf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %94 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%93, %cst_25 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %95 = tensor.empty() : tensor<768x3072xf32> + %transposed_66 = linalg.transpose ins(%cst_26 : tensor<3072x768xf32>) outs(%95 : tensor<768x3072xf32>) permutation = [1, 0] + %96 = linalg.generic {indexing_maps = [#map6, 
#map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%94 : tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x128x768xf32> + %97 = tensor.empty() : tensor<1x768x3072xf32> + %98 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_66 : tensor<768x3072xf32>) outs(%97 : tensor<1x768x3072xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x3072xf32> + %99 = tensor.empty() : tensor<1x128x3072xf32> + %100 = linalg.fill ins(%cst_0 : f32) outs(%99 : tensor<1x128x3072xf32>) -> tensor<1x128x3072xf32> + %101 = linalg.batch_matmul ins(%96, %98 : tensor<1x128x768xf32>, tensor<1x768x3072xf32>) outs(%100 : tensor<1x128x3072xf32>) -> tensor<1x128x3072xf32> + %102 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%101, %cst_27 : tensor<1x128x3072xf32>, tensor<3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %103 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%102 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.mulf %in, %cst_10 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %104 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%102 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.powf %in, %cst_11 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %105 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%104 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + 
^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_5 : f64 to f32 + %196 = arith.mulf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x3072xf32> + %106 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%102, %105 : tensor<1x128x3072xf32>, tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %107 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%106 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_4 : f64 to f32 + %196 = arith.mulf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x3072xf32> + %108 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%107 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.tanh %in : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %109 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%108 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %cst_12 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %110 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%103, %109 : tensor<1x128x3072xf32>, tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.mulf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %111 = tensor.empty() : tensor<3072x768xf32> + %transposed_67 = linalg.transpose ins(%cst_28 : tensor<768x3072xf32>) outs(%111 : tensor<3072x768xf32>) permutation = [1, 
0] + %112 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%110 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x128x3072xf32> + %113 = tensor.empty() : tensor<1x3072x768xf32> + %114 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_67 : tensor<3072x768xf32>) outs(%113 : tensor<1x3072x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x3072x768xf32> + %115 = linalg.batch_matmul ins(%112, %114 : tensor<1x128x3072xf32>, tensor<1x3072x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %116 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%115, %cst_29 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %117 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%78, %116 : tensor<1x128x768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %118 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%117 : tensor<1x128x768xf32>) outs(%10 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %out : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %119 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%118 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.divf %in, %cst_7 : f32 + 
linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %120 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%117 : tensor<1x128x768xf32>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f32, %out: f64): + %195 = arith.extf %in : f32 to f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %121 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%120 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %122 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%121 : tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_8 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %123 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%120, %122 : tensor<1x128x768xf64>, tensor<1x128x1xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.subf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %124 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%123, %123 : tensor<1x128x768xf64>, tensor<1x128x768xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.mulf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %125 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%124 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %126 = linalg.generic {indexing_maps = 
[#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%125 : tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_3 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %127 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%126 : tensor<1x128x1xf64>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f64, %out: f32): + %195 = arith.truncf %in : f64 to f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %128 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%127 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.sqrt %in : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %129 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%117, %119 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.subf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %130 = linalg.generic {indexing_maps = [#map9, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_30, %129 : tensor<768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.mulf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %131 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%128 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_6 : f64 to f32 + %196 = arith.addf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x1xf32> + %132 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} 
ins(%130, %131 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.divf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %133 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%132, %cst_31 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %transposed_68 = linalg.transpose ins(%cst_32 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %134 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%133 : tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x128x768xf32> + %135 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_68 : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %136 = linalg.batch_matmul ins(%134, %135 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %137 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%136, %cst_33 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %expanded_69 = tensor.expand_shape %137 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %transposed_70 = linalg.transpose ins(%expanded_69 : tensor<1x128x12x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) permutation = [0, 2, 1, 3] 
+ %transposed_71 = linalg.transpose ins(%cst_34 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %138 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_71 : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %139 = linalg.batch_matmul ins(%134, %138 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %140 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%139, %cst_35 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %expanded_72 = tensor.expand_shape %140 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %transposed_73 = linalg.transpose ins(%cst_36 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %141 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_73 : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %142 = linalg.batch_matmul ins(%134, %141 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %143 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%142, %cst_37 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %expanded_74 = tensor.expand_shape %143 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into 
tensor<1x128x12x64xf32> + %transposed_75 = linalg.transpose ins(%expanded_74 : tensor<1x128x12x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) permutation = [0, 2, 1, 3] + %transposed_76 = linalg.transpose ins(%expanded_72 : tensor<1x128x12x64xf32>) outs(%44 : tensor<1x12x64x128xf32>) permutation = [0, 2, 3, 1] + %144 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed_70 : tensor<1x12x128x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x128x64xf32> + %145 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed_76 : tensor<1x12x64x128xf32>) outs(%44 : tensor<1x12x64x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x64x128xf32> + %collapsed_77 = tensor.collapse_shape %144 [[0, 1], [2], [3]] : tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %collapsed_78 = tensor.collapse_shape %145 [[0, 1], [2], [3]] : tensor<1x12x64x128xf32> into tensor<12x64x128xf32> + %146 = linalg.batch_matmul ins(%collapsed_77, %collapsed_78 : tensor<12x128x64xf32>, tensor<12x64x128xf32>) outs(%48 : tensor<12x128x128xf32>) -> tensor<12x128x128xf32> + %expanded_79 = tensor.expand_shape %146 [[0, 1], [2], [3]] : tensor<12x128x128xf32> into tensor<1x12x128x128xf32> + %147 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_79 : tensor<1x12x128x128xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.divf %in, %cst_9 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %148 = linalg.generic {indexing_maps = [#map13, #map15, #map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %55, %147 : tensor<1x1x128x128xi1>, tensor, tensor<1x12x128x128xf32>) outs(%50 : 
tensor<1x12x128x128xf32>) { + ^bb0(%in: i1, %in_89: f32, %in_90: f32, %out: f32): + %195 = arith.select %in, %in_89, %in_90 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %149:2 = linalg.generic {indexing_maps = [#map12, #map16, #map16], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%148 : tensor<1x12x128x128xf32>) outs(%60, %58 : tensor<1x12x128xf32>, tensor<1x12x128xi64>) { + ^bb0(%in: f32, %out: f32, %out_89: i64): + %195 = linalg.index 3 : index + %196 = arith.index_cast %195 : index to i64 + %197 = arith.maximumf %in, %out : f32 + %198 = arith.cmpf ogt, %in, %out : f32 + %199 = arith.select %198, %196, %out_89 : i64 + linalg.yield %197, %199 : f32, i64 + } -> (tensor<1x12x128xf32>, tensor<1x12x128xi64>) + %expanded_80 = tensor.expand_shape %149#0 [[0], [1], [2, 3]] : tensor<1x12x128xf32> into tensor<1x12x128x1xf32> + %150 = linalg.generic {indexing_maps = [#map11, #map17, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%148, %expanded_80 : tensor<1x12x128x128xf32>, tensor<1x12x128x1xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.subf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %151 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%150 : tensor<1x12x128x128xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.exp %in : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %152 = linalg.generic {indexing_maps = [#map12, #map18], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%151 : tensor<1x12x128x128xf32>) outs(%65 : tensor<1x12x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %out : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x1xf32> + %153 = linalg.generic {indexing_maps = [#map11, #map17, #map12], iterator_types = ["parallel", 
"parallel", "parallel", "parallel"]} ins(%151, %152 : tensor<1x12x128x128xf32>, tensor<1x12x128x1xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.divf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %154 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%153 : tensor<1x12x128x128xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x128x128xf32> + %155 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed_75 : tensor<1x12x128x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x128x64xf32> + %collapsed_81 = tensor.collapse_shape %154 [[0, 1], [2], [3]] : tensor<1x12x128x128xf32> into tensor<12x128x128xf32> + %collapsed_82 = tensor.collapse_shape %155 [[0, 1], [2], [3]] : tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %156 = linalg.batch_matmul ins(%collapsed_81, %collapsed_82 : tensor<12x128x128xf32>, tensor<12x128x64xf32>) outs(%71 : tensor<12x128x64xf32>) -> tensor<12x128x64xf32> + %expanded_83 = tensor.expand_shape %156 [[0, 1], [2], [3]] : tensor<12x128x64xf32> into tensor<1x12x128x64xf32> + %transposed_84 = linalg.transpose ins(%expanded_83 : tensor<1x12x128x64xf32>) outs(%73 : tensor<1x128x12x64xf32>) permutation = [0, 2, 1, 3] + %collapsed_85 = tensor.collapse_shape %transposed_84 [[0], [1], [2, 3]] : tensor<1x128x12x64xf32> into tensor<1x128x768xf32> + %transposed_86 = linalg.transpose ins(%cst_38 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %157 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_85 : tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %out: f32): + 
linalg.yield %in : f32 + } -> tensor<1x128x768xf32> + %158 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_86 : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %159 = linalg.batch_matmul ins(%157, %158 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %160 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%159, %cst_39 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %161 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%117, %160 : tensor<1x128x768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %162 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%161 : tensor<1x128x768xf32>) outs(%10 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %out : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %163 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%162 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.divf %in, %cst_7 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %164 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%161 : tensor<1x128x768xf32>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f32, %out: f64): + %195 = 
arith.extf %in : f32 to f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %165 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%164 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %166 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%165 : tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_8 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %167 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%164, %166 : tensor<1x128x768xf64>, tensor<1x128x1xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.subf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %168 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%167, %167 : tensor<1x128x768xf64>, tensor<1x128x768xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.mulf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %169 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%168 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %170 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%169 : tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_3 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %171 = 
linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%170 : tensor<1x128x1xf64>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f64, %out: f32): + %195 = arith.truncf %in : f64 to f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %172 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%171 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.sqrt %in : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %173 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%161, %163 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.subf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %174 = linalg.generic {indexing_maps = [#map9, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_40, %173 : tensor<768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.mulf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %175 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%172 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_6 : f64 to f32 + %196 = arith.addf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x1xf32> + %176 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%174, %175 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.divf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %177 = linalg.generic 
{indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%176, %cst_41 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %transposed_87 = linalg.transpose ins(%cst_42 : tensor<3072x768xf32>) outs(%95 : tensor<768x3072xf32>) permutation = [1, 0] + %178 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%177 : tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x128x768xf32> + %179 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_87 : tensor<768x3072xf32>) outs(%97 : tensor<1x768x3072xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x3072xf32> + %180 = linalg.batch_matmul ins(%178, %179 : tensor<1x128x768xf32>, tensor<1x768x3072xf32>) outs(%100 : tensor<1x128x3072xf32>) -> tensor<1x128x3072xf32> + %181 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%180, %cst_43 : tensor<1x128x3072xf32>, tensor<3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %182 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%181 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.mulf %in, %cst_10 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %183 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%181 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + 
^bb0(%in: f32, %out: f32): + %195 = math.powf %in, %cst_11 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %184 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%183 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_5 : f64 to f32 + %196 = arith.mulf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x3072xf32> + %185 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%181, %184 : tensor<1x128x3072xf32>, tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %186 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%185 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_4 : f64 to f32 + %196 = arith.mulf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x3072xf32> + %187 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%186 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.tanh %in : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %188 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%187 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %cst_12 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %189 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%182, %188 : tensor<1x128x3072xf32>, tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { 
+ ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.mulf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %transposed_88 = linalg.transpose ins(%cst_44 : tensor<768x3072xf32>) outs(%111 : tensor<3072x768xf32>) permutation = [1, 0] + %190 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%189 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x128x3072xf32> + %191 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_88 : tensor<3072x768xf32>) outs(%113 : tensor<1x3072x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x3072x768xf32> + %192 = linalg.batch_matmul ins(%190, %191 : tensor<1x128x3072xf32>, tensor<1x3072x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %193 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%192, %cst_45 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %194 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%161, %193 : tensor<1x128x768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + return %194 : tensor<1x128x768xf32> + } +} \ No newline at end of file diff --git a/test/samples/lenet/lenet_affine.mlir b/test/samples/lenet/lenet_affine.mlir new file mode 100644 index 00000000..de0bb4e4 --- /dev/null +++ b/test/samples/lenet/lenet_affine.mlir @@ -0,0 +1,250 @@ +#map = affine_map<(d0, d1) -> (d0 * 2 + d1)> +module 
{ + func.func @main(%arg0: tensor<1x3x32x32xf32>) -> tensor<1x10xf32> { + %c0 = arith.constant 0 : index + %cst = arith.constant dense_resource : tensor<10xf32> + %cst_0 = arith.constant dense_resource : tensor<10x84xf32> + %cst_1 = arith.constant dense_resource : tensor<84xf32> + %cst_2 = arith.constant dense_resource : tensor<84x120xf32> + %cst_3 = arith.constant dense_resource : tensor<120xf32> + %cst_4 = arith.constant dense_resource : tensor<120x400xf32> + %cst_5 = arith.constant dense_resource : tensor<16x6x5x5xf32> + %cst_6 = arith.constant 0.000000e+00 : f32 + %cst_7 = arith.constant dense_resource : tensor<6x3x5x5xf32> + %0 = bufferization.to_memref %arg0 : memref<1x3x32x32xf32> + %1 = bufferization.to_memref %cst_7 : memref<6x3x5x5xf32> + %2 = bufferization.to_memref %cst_5 : memref<16x6x5x5xf32> + %3 = bufferization.to_memref %cst_4 : memref<120x400xf32> + %4 = bufferization.to_memref %cst_3 : memref<120xf32> + %5 = bufferization.to_memref %cst_2 : memref<84x120xf32> + %6 = bufferization.to_memref %cst_1 : memref<84xf32> + %7 = bufferization.to_memref %cst_0 : memref<10x84xf32> + %8 = bufferization.to_memref %cst : memref<10xf32> + %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x6x14x14xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 6 { + affine.for %arg3 = 0 to 14 { + affine.for %arg4 = 0 to 14 { + affine.store %cst_6, %alloc[%arg1, %arg2, %arg3, %arg4] : memref<1x6x14x14xf32> + } + } + } + } + %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<1x6x14x14xf32> + memref.copy %alloc, %alloc_8 : memref<1x6x14x14xf32> to memref<1x6x14x14xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 6 { + affine.for %arg3 = 0 to 14 { + affine.for %arg4 = 0 to 14 { + affine.for %arg5 = 0 to 3 { + affine.for %arg6 = 0 to 5 { + affine.for %arg7 = 0 to 5 { + %12 = affine.apply #map(%arg3, %arg6) + %13 = affine.apply #map(%arg4, %arg7) + %14 = affine.load %0[%arg1, %arg5, %12, %13] : memref<1x3x32x32xf32> + %15 = affine.load %1[%arg2, 
%arg5, %arg6, %arg7] : memref<6x3x5x5xf32> + %16 = affine.load %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<1x6x14x14xf32> + %17 = arith.mulf %14, %15 : f32 + %18 = arith.addf %16, %17 : f32 + affine.store %18, %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<1x6x14x14xf32> + } + } + } + } + } + } + } + %alloc_9 = memref.alloc() {alignment = 64 : i64} : memref<1x6x14x14xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 6 { + affine.for %arg3 = 0 to 14 { + affine.for %arg4 = 0 to 14 { + %12 = affine.load %alloc_8[%c0, %arg2, %arg3, %arg4] : memref<1x6x14x14xf32> + %13 = arith.cmpf ugt, %12, %cst_6 : f32 + %14 = arith.select %13, %12, %cst_6 : f32 + affine.store %14, %alloc_9[%arg1, %arg2, %arg3, %arg4] : memref<1x6x14x14xf32> + } + } + } + } + %alloc_10 = memref.alloc() {alignment = 64 : i64} : memref<1x16x5x5xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 16 { + affine.for %arg3 = 0 to 5 { + affine.for %arg4 = 0 to 5 { + affine.store %cst_6, %alloc_10[%arg1, %arg2, %arg3, %arg4] : memref<1x16x5x5xf32> + } + } + } + } + %alloc_11 = memref.alloc() {alignment = 64 : i64} : memref<1x16x5x5xf32> + memref.copy %alloc_10, %alloc_11 : memref<1x16x5x5xf32> to memref<1x16x5x5xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 16 { + affine.for %arg3 = 0 to 5 { + affine.for %arg4 = 0 to 5 { + affine.for %arg5 = 0 to 6 { + affine.for %arg6 = 0 to 5 { + affine.for %arg7 = 0 to 5 { + %12 = affine.apply #map(%arg3, %arg6) + %13 = affine.apply #map(%arg4, %arg7) + %14 = affine.load %alloc_9[%arg1, %arg5, %12, %13] : memref<1x6x14x14xf32> + %15 = affine.load %2[%arg2, %arg5, %arg6, %arg7] : memref<16x6x5x5xf32> + %16 = affine.load %alloc_11[%arg1, %arg2, %arg3, %arg4] : memref<1x16x5x5xf32> + %17 = arith.mulf %14, %15 : f32 + %18 = arith.addf %16, %17 : f32 + affine.store %18, %alloc_11[%arg1, %arg2, %arg3, %arg4] : memref<1x16x5x5xf32> + } + } + } + } + } + } + } + %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<1x16x5x5xf32> + 
affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 16 { + affine.for %arg3 = 0 to 5 { + affine.for %arg4 = 0 to 5 { + %12 = affine.load %alloc_11[%c0, %arg2, %arg3, %arg4] : memref<1x16x5x5xf32> + %13 = arith.cmpf ugt, %12, %cst_6 : f32 + %14 = arith.select %13, %12, %cst_6 : f32 + affine.store %14, %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<1x16x5x5xf32> + } + } + } + } + %9 = bufferization.to_tensor %alloc_12 : memref<1x16x5x5xf32> + %collapsed = tensor.collapse_shape %9 [[0], [1, 2, 3]] : tensor<1x16x5x5xf32> into tensor<1x400xf32> + %10 = bufferization.to_memref %collapsed : memref<1x400xf32> + %alloc_13 = memref.alloc() {alignment = 64 : i64} : memref<400x120xf32> + affine.for %arg1 = 0 to 120 { + affine.for %arg2 = 0 to 400 { + %12 = affine.load %3[%arg1, %arg2] : memref<120x400xf32> + affine.store %12, %alloc_13[%arg2, %arg1] : memref<400x120xf32> + } + } + %alloc_14 = memref.alloc() {alignment = 64 : i64} : memref<1x120xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 120 { + affine.store %cst_6, %alloc_14[%arg1, %arg2] : memref<1x120xf32> + } + } + %alloc_15 = memref.alloc() {alignment = 64 : i64} : memref<1x120xf32> + memref.copy %alloc_14, %alloc_15 : memref<1x120xf32> to memref<1x120xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 120 { + affine.for %arg3 = 0 to 400 { + %12 = affine.load %10[%arg1, %arg3] : memref<1x400xf32> + %13 = affine.load %alloc_13[%arg3, %arg2] : memref<400x120xf32> + %14 = affine.load %alloc_15[%arg1, %arg2] : memref<1x120xf32> + %15 = arith.mulf %12, %13 : f32 + %16 = arith.addf %14, %15 : f32 + affine.store %16, %alloc_15[%arg1, %arg2] : memref<1x120xf32> + } + } + } + %alloc_16 = memref.alloc() {alignment = 64 : i64} : memref<1x120xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 120 { + %12 = affine.load %alloc_15[%c0, %arg2] : memref<1x120xf32> + %13 = affine.load %4[%arg2] : memref<120xf32> + %14 = arith.addf %12, %13 : f32 + affine.store %14, %alloc_16[%arg1, %arg2] : 
memref<1x120xf32> + } + } + %alloc_17 = memref.alloc() {alignment = 64 : i64} : memref<1x120xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 120 { + %12 = affine.load %alloc_16[%c0, %arg2] : memref<1x120xf32> + %13 = arith.cmpf ugt, %12, %cst_6 : f32 + %14 = arith.select %13, %12, %cst_6 : f32 + affine.store %14, %alloc_17[%arg1, %arg2] : memref<1x120xf32> + } + } + %alloc_18 = memref.alloc() {alignment = 64 : i64} : memref<120x84xf32> + affine.for %arg1 = 0 to 84 { + affine.for %arg2 = 0 to 120 { + %12 = affine.load %5[%arg1, %arg2] : memref<84x120xf32> + affine.store %12, %alloc_18[%arg2, %arg1] : memref<120x84xf32> + } + } + %alloc_19 = memref.alloc() {alignment = 64 : i64} : memref<1x84xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 84 { + affine.store %cst_6, %alloc_19[%arg1, %arg2] : memref<1x84xf32> + } + } + %alloc_20 = memref.alloc() {alignment = 64 : i64} : memref<1x84xf32> + memref.copy %alloc_19, %alloc_20 : memref<1x84xf32> to memref<1x84xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 84 { + affine.for %arg3 = 0 to 120 { + %12 = affine.load %alloc_17[%arg1, %arg3] : memref<1x120xf32> + %13 = affine.load %alloc_18[%arg3, %arg2] : memref<120x84xf32> + %14 = affine.load %alloc_20[%arg1, %arg2] : memref<1x84xf32> + %15 = arith.mulf %12, %13 : f32 + %16 = arith.addf %14, %15 : f32 + affine.store %16, %alloc_20[%arg1, %arg2] : memref<1x84xf32> + } + } + } + %alloc_21 = memref.alloc() {alignment = 64 : i64} : memref<1x84xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 84 { + %12 = affine.load %alloc_20[%c0, %arg2] : memref<1x84xf32> + %13 = affine.load %6[%arg2] : memref<84xf32> + %14 = arith.addf %12, %13 : f32 + affine.store %14, %alloc_21[%arg1, %arg2] : memref<1x84xf32> + } + } + %alloc_22 = memref.alloc() {alignment = 64 : i64} : memref<1x84xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 84 { + %12 = affine.load %alloc_21[%c0, %arg2] : memref<1x84xf32> + %13 = arith.cmpf ugt, %12, %cst_6 
: f32 + %14 = arith.select %13, %12, %cst_6 : f32 + affine.store %14, %alloc_22[%arg1, %arg2] : memref<1x84xf32> + } + } + %alloc_23 = memref.alloc() {alignment = 64 : i64} : memref<84x10xf32> + affine.for %arg1 = 0 to 10 { + affine.for %arg2 = 0 to 84 { + %12 = affine.load %7[%arg1, %arg2] : memref<10x84xf32> + affine.store %12, %alloc_23[%arg2, %arg1] : memref<84x10xf32> + } + } + %alloc_24 = memref.alloc() {alignment = 64 : i64} : memref<1x10xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 10 { + affine.store %cst_6, %alloc_24[%arg1, %arg2] : memref<1x10xf32> + } + } + %alloc_25 = memref.alloc() {alignment = 64 : i64} : memref<1x10xf32> + memref.copy %alloc_24, %alloc_25 : memref<1x10xf32> to memref<1x10xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 10 { + affine.for %arg3 = 0 to 84 { + %12 = affine.load %alloc_22[%arg1, %arg3] : memref<1x84xf32> + %13 = affine.load %alloc_23[%arg3, %arg2] : memref<84x10xf32> + %14 = affine.load %alloc_25[%arg1, %arg2] : memref<1x10xf32> + %15 = arith.mulf %12, %13 : f32 + %16 = arith.addf %14, %15 : f32 + affine.store %16, %alloc_25[%arg1, %arg2] : memref<1x10xf32> + } + } + } + %alloc_26 = memref.alloc() {alignment = 64 : i64} : memref<1x10xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 10 { + %12 = affine.load %alloc_25[%c0, %arg2] : memref<1x10xf32> + %13 = affine.load %8[%arg2] : memref<10xf32> + %14 = arith.addf %12, %13 : f32 + affine.store %14, %alloc_26[%arg1, %arg2] : memref<1x10xf32> + } + } + %11 = bufferization.to_tensor %alloc_26 : memref<1x10xf32> + return %11 : tensor<1x10xf32> + } +} + diff --git a/test/samples/lenet/lenet_linalg.mlir b/test/samples/lenet/lenet_linalg.mlir new file mode 100644 index 00000000..d66d2400 --- /dev/null +++ b/test/samples/lenet/lenet_linalg.mlir @@ -0,0 +1,80 @@ +#map = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map2 = affine_map<(d0, d1) -> (0, d1)> +#map3 = affine_map<(d0, d1) 
-> (d1)> +#map4 = affine_map<(d0, d1) -> (d0, d1)> +module { + func.func @main(%arg0: tensor<1x3x32x32xf32>) -> tensor<1x10xf32> { + %cst = arith.constant dense_resource : tensor<6x3x5x5xf32> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant dense_resource : tensor<16x6x5x5xf32> + %cst_2 = arith.constant dense_resource : tensor<120x400xf32> + %cst_3 = arith.constant dense_resource : tensor<120xf32> + %cst_4 = arith.constant dense_resource : tensor<84x120xf32> + %cst_5 = arith.constant dense_resource : tensor<84xf32> + %cst_6 = arith.constant dense_resource : tensor<10x84xf32> + %cst_7 = arith.constant dense_resource : tensor<10xf32> + %0 = tensor.empty() : tensor<1x6x14x14xf32> + %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<1x6x14x14xf32>) -> tensor<1x6x14x14xf32> + %2 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %cst : tensor<1x3x32x32xf32>, tensor<6x3x5x5xf32>) outs(%1 : tensor<1x6x14x14xf32>) -> tensor<1x6x14x14xf32> + %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<1x6x14x14xf32>) outs(%0 : tensor<1x6x14x14xf32>) { + ^bb0(%in: f32, %out: f32): + %25 = arith.cmpf ugt, %in, %cst_0 : f32 + %26 = arith.select %25, %in, %cst_0 : f32 + linalg.yield %26 : f32 + } -> tensor<1x6x14x14xf32> + %4 = tensor.empty() : tensor<1x16x5x5xf32> + %5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<1x16x5x5xf32>) -> tensor<1x16x5x5xf32> + %6 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%3, %cst_1 : tensor<1x6x14x14xf32>, tensor<16x6x5x5xf32>) outs(%5 : tensor<1x16x5x5xf32>) -> tensor<1x16x5x5xf32> + %7 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<1x16x5x5xf32>) outs(%4 : tensor<1x16x5x5xf32>) { + ^bb0(%in: f32, %out: f32): + %25 = arith.cmpf ugt, %in, 
%cst_0 : f32 + %26 = arith.select %25, %in, %cst_0 : f32 + linalg.yield %26 : f32 + } -> tensor<1x16x5x5xf32> + %collapsed = tensor.collapse_shape %7 [[0], [1, 2, 3]] : tensor<1x16x5x5xf32> into tensor<1x400xf32> + %8 = tensor.empty() : tensor<400x120xf32> + %transposed = linalg.transpose ins(%cst_2 : tensor<120x400xf32>) outs(%8 : tensor<400x120xf32>) permutation = [1, 0] + %9 = tensor.empty() : tensor<1x120xf32> + %10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<1x120xf32>) -> tensor<1x120xf32> + %11 = linalg.matmul ins(%collapsed, %transposed : tensor<1x400xf32>, tensor<400x120xf32>) outs(%10 : tensor<1x120xf32>) -> tensor<1x120xf32> + %12 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%11, %cst_3 : tensor<1x120xf32>, tensor<120xf32>) outs(%9 : tensor<1x120xf32>) { + ^bb0(%in: f32, %in_10: f32, %out: f32): + %25 = arith.addf %in, %in_10 : f32 + linalg.yield %25 : f32 + } -> tensor<1x120xf32> + %13 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<1x120xf32>) outs(%9 : tensor<1x120xf32>) { + ^bb0(%in: f32, %out: f32): + %25 = arith.cmpf ugt, %in, %cst_0 : f32 + %26 = arith.select %25, %in, %cst_0 : f32 + linalg.yield %26 : f32 + } -> tensor<1x120xf32> + %14 = tensor.empty() : tensor<120x84xf32> + %transposed_8 = linalg.transpose ins(%cst_4 : tensor<84x120xf32>) outs(%14 : tensor<120x84xf32>) permutation = [1, 0] + %15 = tensor.empty() : tensor<1x84xf32> + %16 = linalg.fill ins(%cst_0 : f32) outs(%15 : tensor<1x84xf32>) -> tensor<1x84xf32> + %17 = linalg.matmul ins(%13, %transposed_8 : tensor<1x120xf32>, tensor<120x84xf32>) outs(%16 : tensor<1x84xf32>) -> tensor<1x84xf32> + %18 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%17, %cst_5 : tensor<1x84xf32>, tensor<84xf32>) outs(%15 : tensor<1x84xf32>) { + ^bb0(%in: f32, %in_10: f32, %out: f32): + %25 = arith.addf %in, %in_10 : f32 + 
linalg.yield %25 : f32 + } -> tensor<1x84xf32> + %19 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel"]} ins(%18 : tensor<1x84xf32>) outs(%15 : tensor<1x84xf32>) { + ^bb0(%in: f32, %out: f32): + %25 = arith.cmpf ugt, %in, %cst_0 : f32 + %26 = arith.select %25, %in, %cst_0 : f32 + linalg.yield %26 : f32 + } -> tensor<1x84xf32> + %20 = tensor.empty() : tensor<84x10xf32> + %transposed_9 = linalg.transpose ins(%cst_6 : tensor<10x84xf32>) outs(%20 : tensor<84x10xf32>) permutation = [1, 0] + %21 = tensor.empty() : tensor<1x10xf32> + %22 = linalg.fill ins(%cst_0 : f32) outs(%21 : tensor<1x10xf32>) -> tensor<1x10xf32> + %23 = linalg.matmul ins(%19, %transposed_9 : tensor<1x84xf32>, tensor<84x10xf32>) outs(%22 : tensor<1x10xf32>) -> tensor<1x10xf32> + %24 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%23, %cst_7 : tensor<1x10xf32>, tensor<10xf32>) outs(%21 : tensor<1x10xf32>) { + ^bb0(%in: f32, %in_10: f32, %out: f32): + %25 = arith.addf %in, %in_10 : f32 + linalg.yield %25 : f32 + } -> tensor<1x10xf32> + return %24 : tensor<1x10xf32> + } +} \ No newline at end of file diff --git a/tools/mlir-neura-opt/mlir-neura-opt.cpp b/tools/mlir-neura-opt/mlir-neura-opt.cpp index d21664fb..b1791e47 100644 --- a/tools/mlir-neura-opt/mlir-neura-opt.cpp +++ b/tools/mlir-neura-opt/mlir-neura-opt.cpp @@ -20,6 +20,7 @@ int main(int argc, char **argv) { registry.insert(); registry.insert(); registry.insert(); + registry.insert(); mlir::neura::registerPasses(); mlir::registerPasses();