diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 98b116d3..c23378af 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -101,5 +101,5 @@ jobs: working-directory: ${{github.workspace}} run: | cd ${{github.workspace}}/test - ${{github.workspace}}/llvm-project/build/bin/llvm-lit * -v + ${{github.workspace}}/llvm-project/build/bin/llvm-lit . -v diff --git a/include/Conversion/ConversionPasses.h b/include/Conversion/ConversionPasses.h index 2477bb3d..2d871868 100644 --- a/include/Conversion/ConversionPasses.h +++ b/include/Conversion/ConversionPasses.h @@ -19,6 +19,7 @@ namespace mlir { // Conversion passes. std::unique_ptr createLowerArithToNeuraPass(); std::unique_ptr createLowerLlvmToNeuraPass(); +std::unique_ptr createLowerMemRefToNeuraPass(); #define GEN_PASS_REGISTRATION #include "Conversion/ConversionPasses.h.inc" diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td index 7fca77bb..8f2db985 100644 --- a/include/Conversion/ConversionPasses.td +++ b/include/Conversion/ConversionPasses.td @@ -20,4 +20,10 @@ def LowerLlvmToNeura : Pass<"lower-llvm-to-neura", "ModuleOp">{ let constructor = "mlir::createLowerLlvmToNeuraPass()"; } +def LowerMemRefToNeura : Pass<"lower-memref-to-neura", "ModuleOp">{ + let summary = "Lower MemRef to Neura dialect"; + let description = [{Lower MemRef operations to Neura dialect operations.}]; + let constructor = "mlir::createLowerMemRefToNeuraPass()"; +} + #endif // CONVERSION_PASSES_TD \ No newline at end of file diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td index 223eee9f..4021bbe2 100644 --- a/include/NeuraDialect/NeuraOps.td +++ b/include/NeuraDialect/NeuraOps.td @@ -24,6 +24,14 @@ def Neura_AddOp : Op { let traits = [SameOperandsAndResultElementType]; } +def Neura_SubOp : Op { + let summary = "Integer subtraction operation"; + let arguments = (ins AnyType:$lhs, AnyType:$rhs, Optional:$predicate); + let results 
= (outs AnyType:$result); + // let assemblyFormat = "$lhs `,` $rhs `,` $predicate attr-dict `:` type($result)"; + let traits = [SameOperandsAndResultElementType]; +} + // Defines a floating-point addition operation. def Neura_FAddOp : Op { let summary = "Floating addition operation"; @@ -38,7 +46,7 @@ def Neura_FAddOp : Op { def Neura_FSubOp: Op { let summary = "Floating substraction operation"; let opName = "fsub"; - let arguments = (ins AnyFloat:$lhs, AnyFloat:$rhs); + let arguments = (ins AnyFloat:$lhs, AnyFloat:$rhs, Optional:$predicate); let results = (outs AnyFloat:$result); // let assemblyFormat = "$lhs `,` $rhs attr-dict `:` type($result)"; let traits = [SameOperandsAndResultElementType]; @@ -54,6 +62,13 @@ def Neura_FMulOp : Op { // let traits = [SameOperandsAndResultElementType]; } +def Neura_FDivOp : Op { + let summary = "Floating division operation"; + let arguments = (ins AnyFloat:$lhs, AnyFloat:$rhs, Optional:$predicate); + let results = (outs AnyFloat:$result); + // let assemblyFormat = "$lhs `,` $rhs `,` $predicate attr-dict `:` type($result)"; +} + // Defines a bitwise OR operation. def Neura_OrOp : Op { let summary = "Bitwise OR operation"; @@ -144,6 +159,14 @@ def Neura_ReturnOp : Op { // let assemblyFormat = "($values^)? `,` $predicate attr-dict"; } +// Defines a cast operation for type conversion. +def Neura_CastOp : Op{ + let summary = "Generic type conversion operation"; + let arguments = (ins AnyType:$input, StrAttr:$cast_type, Optional:$predicate); + let results = (outs AnyType:$result); + // let assemblyFormat = "$input type($input) `->` type($output) `,` $predicate attr-dict"; +} + // ---------------------------------------------------- // Defines vector operations. 
diff --git a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp index c5af8d47..72c83c6b 100644 --- a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp +++ b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp @@ -1,12 +1,13 @@ +#include "Conversion/ConversionPasses.h" #include "NeuraDialect/NeuraDialect.h" #include "NeuraDialect/NeuraOps.h" -#include "mlir/Dialect/Arith/IR/Arith.h" #include "NeuraDialect/NeuraPasses.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "Conversion/ConversionPasses.h" namespace mlir { namespace neura { @@ -26,7 +27,39 @@ using namespace mlir::neura; #define GEN_PASS_DEF_LOWERARITHTONEURA #include "NeuraDialect/NeuraPasses.h.inc" -namespace{ +namespace { + +struct ArithConstantToNeuraConstant + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::ConstantOp op, + PatternRewriter &rewriter) const override { + // Converts arith constant to Neura constant + Type result_type = op.getType(); + Attribute value = op.getValue(); + // Optional predicate parameter can be null + rewriter.replaceOpWithNewOp(op, result_type, value, + nullptr); + return success(); + } +}; + +struct ArithAddIToNeuraAdd : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::AddIOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + + // Optional predicate: default to null + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, + nullptr); + return success(); + } +}; struct ArithFAddToNeuraFAdd : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -35,16 +68,199 @@ struct ArithFAddToNeuraFAdd : public 
OpRewritePattern { PatternRewriter &rewriter) const override { Value lhs = op.getLhs(); Value rhs = op.getRhs(); - Type resultType = op.getType(); + Type result_type = op.getType(); + + // Optional predicate: default to null + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, + nullptr); + return success(); + } +}; + +struct ArithSubIToNeuraSub : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::SubIOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + + // Optional predicate: default to null + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, + nullptr); + return success(); + } +}; + +struct ArithSubFToNeuraFSub : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::SubFOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + + // Optional predicate: default to null + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, + nullptr); + return success(); + } +}; + +struct ArithMulFToNeuraFMul : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::MulFOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + + // Optional predicate: default to null + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, + nullptr); + return success(); + } +}; + +struct ArithFDivToNeuraFDiv : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::DivFOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + + // Optional predicate: default to null + rewriter.replaceOpWithNewOp(op, result_type, lhs, 
rhs, + nullptr); + return success(); + } +}; +struct ArithCmpiToNeuraICmp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::CmpIOp op, + PatternRewriter &rewriter) const override { + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + Type result_type = op.getType(); + arith::CmpIPredicate arith_cmp_type = op.getPredicate(); + StringRef cmp_type; + switch (arith_cmp_type) { + case arith::CmpIPredicate::eq: + cmp_type = "eq"; // == + break; + case arith::CmpIPredicate::ne: + cmp_type = "ne"; // != + break; + case arith::CmpIPredicate::slt: + cmp_type = "slt"; // < + break; + case arith::CmpIPredicate::sle: + cmp_type = "sle"; // <= + break; + case arith::CmpIPredicate::sgt: + cmp_type = "sgt"; // > + break; + case arith::CmpIPredicate::sge: + cmp_type = "sge"; // >= + break; + case arith::CmpIPredicate::ult: + cmp_type = "ult"; // unsigned < + break; + case arith::CmpIPredicate::ule: + cmp_type = "ule"; // unsigned <= + break; + case arith::CmpIPredicate::ugt: + cmp_type = "ugt"; // unsigned > + break; + case arith::CmpIPredicate::uge: + cmp_type = "uge"; // unsigned >= + break; + default: + return rewriter.notifyMatchFailure(op, "Unsupported arith CmpIOp type"); + } + + // Convert arith CmpIOp to Neura ICmpOp + // Optional predicate: default to null + rewriter.replaceOpWithNewOp( + op, result_type, lhs, rhs, nullptr, rewriter.getStringAttr(cmp_type)); + return success(); + } +}; + +struct ArithSelectToNeuraSel : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::SelectOp op, + PatternRewriter &rewriter) const override { + Value condition = op.getCondition(); + Value true_value = op.getTrueValue(); + Value false_value = op.getFalseValue(); + Type result_type = op.getType(); + + // Convert arith SelectOp to Neura SelOp + rewriter.replaceOpWithNewOp(op, result_type, true_value, + false_value, condition); + return success(); + } +}; + +struct 
ArithExtUIToNeuraCast : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::ExtUIOp op, + PatternRewriter &rewriter) const override { + Value input = op.getIn(); + Type result_type = op.getType(); + + // Convert arith ExtUIOp to Neura cast operation + // Optional predicate: default to null + rewriter.replaceOpWithNewOp( + op, result_type, input, rewriter.getStringAttr("extui"), nullptr); + return success(); + } +}; + +struct ArithExtfToNeuraCast : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::ExtFOp op, + PatternRewriter &rewriter) const override { + Value input = op.getIn(); + Type result_type = op.getType(); + + // Convert arith ExtFOp to Neura cast operation + // Optional predicate: default to null + rewriter.replaceOpWithNewOp( + op, result_type, input, rewriter.getStringAttr("extf"), nullptr); + return success(); + } +}; + +struct ArithIndexCastToNeuraCast + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::IndexCastOp op, + PatternRewriter &rewriter) const override { + Value input = op.getIn(); + Type result_type = op.getType(); - // Optional predicate: default to 'none' - rewriter.replaceOpWithNewOp(op, resultType, lhs, rhs, Value()); + // Convert arith IndexCastOp to Neura cast operation + // Optional predicate: default to null + rewriter.replaceOpWithNewOp( + op, result_type, input, rewriter.getStringAttr("indexCast"), nullptr); return success(); } }; struct LowerArithToNeuraPass - : public PassWrapper> { + : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerArithToNeuraPass) @@ -60,7 +276,11 @@ struct LowerArithToNeuraPass void runOnOperation() override { RewritePatternSet patterns(&getContext()); mlir::neura::arith2neura::populateWithGenerated(patterns); - patterns.add(&getContext()); + patterns + .add(&getContext()); if 
(failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { signalPassFailure(); } diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt index 1dbce29f..af5bb68a 100644 --- a/lib/Conversion/CMakeLists.txt +++ b/lib/Conversion/CMakeLists.txt @@ -2,6 +2,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_subdirectory(ArithToNeura) add_subdirectory(LlvmToNeura) +add_subdirectory(MemRefToNeura) # add_mlir_library( # MLIRNeuraConversion diff --git a/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp b/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp index 71ed33b5..6bc815b3 100644 --- a/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp +++ b/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp @@ -9,6 +9,7 @@ #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "Conversion/ConversionPasses.h" +#include "llvm/Support/raw_ostream.h" namespace mlir { namespace neura { @@ -62,6 +63,26 @@ struct LlvmFAddToNeuraFAdd : public OpRewritePattern { } }; +struct LlvmFSubToNeuraFSub : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(mlir::LLVM::FSubOp op, + PatternRewriter &rewriter) const override { + Value lhs = op->getOperand(0); + Value rhs = op.getOperand(1); + Type result_type = op->getResult(0).getType(); + + // Only matches scalar float. 
+ if (!mlir::isa(result_type)){ + return failure(); + } + + // Optional predicate: default to 'none' + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, Value()); + return success(); + } +}; + struct LlvmOrToNeuraOr : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -316,6 +337,7 @@ struct LowerLlvmToNeuraPass patterns.add(&getContext()); patterns.add(&getContext()); patterns.add(&getContext()); + patterns.add(&getContext()); FrozenRewritePatternSet frozen(std::move(patterns)); diff --git a/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td b/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td index e01ff728..3aef67d8 100644 --- a/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td +++ b/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td @@ -4,8 +4,9 @@ include "mlir/Dialect/LLVMIR/LLVMOps.td" include "NeuraDialect/NeuraOps.td" // Floating point binary operations. -def : Pat< - (LLVM_FSubOp $lhs, $rhs, $_fastmath), - (Neura_FSubOp $lhs, $rhs) ->; +// Deprecated Pattern: Because we need the predicate bit to be set to null initially +// def : Pat< +// (LLVM_FSubOp $lhs, $rhs, $_fastmath), +// (Neura_FSubOp $lhs, $rhs) +// >; diff --git a/lib/Conversion/MemRefToNeura/CMakeLists.txt b/lib/Conversion/MemRefToNeura/CMakeLists.txt new file mode 100644 index 00000000..335d2c39 --- /dev/null +++ b/lib/Conversion/MemRefToNeura/CMakeLists.txt @@ -0,0 +1,18 @@ +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +add_mlir_conversion_library(MLIRNeuraMemRefToNeuraPass + MemRefToNeuraPass.cpp + + DEPENDS + MLIRConversionIncGen + + LINK_LIBS PUBLIC + MLIRArithDialect + MLIRFuncDialect + MLIRLLVMDialect + MLIRIR + MLIRPass + MLIRTransforms + MLIRNeura + MLIRSupport +) diff --git a/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp b/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp new file mode 100644 index 00000000..3d3b543c --- /dev/null +++ b/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp @@ -0,0 +1,44 @@ +#include "Common/AcceleratorAttrs.h" +#include 
"NeuraDialect/NeuraDialect.h" +#include "NeuraDialect/NeuraOps.h" +#include "mlir/Dialect/LLVMIR/LLVMAttrs.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "Conversion/ConversionPasses.h" + +using namespace mlir; +using namespace mlir::neura; + +#define GEN_PASS_DEF_LOWERLLVMTONEURA +#include "NeuraDialect/NeuraPasses.h.inc" + + +namespace { + +struct LowerMemRefToNeuraPass + : public PassWrapper> { + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerMemRefToNeuraPass) + + StringRef getArgument() const override { return "lower-memref-to-neura"; } + StringRef getDescription() const override { + return "Lower MemRef operations to Neura dialect operations"; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + } +}; +} // namespace + +std::unique_ptr mlir::createLowerMemRefToNeuraPass() { + return std::make_unique(); +} diff --git a/test/README.md b/test/README.md new file mode 100644 index 00000000..e9599fb0 --- /dev/null +++ b/test/README.md @@ -0,0 +1,57 @@ +# Tests for Neura + +The structure of the files in this folder is as follows: +``` +. 
+├── affine2neura │ └── bert +├── arith2neura │ ├── add.mlir │ └── Output +├── c2llvm2mlir │ ├── kernel.cpp │ ├── Output │ └── test.mlir +├── lit.cfg +├── lit.cfg.in +├── neura │ ├── arith_add.mlir │ ├── ctrl │ ├── fadd_fadd.mlir │ ├── for_loop │ ├── interpreter │ ├── llvm_add.mlir │ ├── llvm_sub.mlir │ └── Output +├── Output │ └── test.mlir.script +├── README.md +├── samples │ ├── bert │ └── lenet └── test.mlir +``` + +All of the above content can be divided into three categories: + +## 1 Conversion Test +We need to convert other dialects to our `neura` dialect for compilation optimization. In order to verify the correctness of conversions from other dialects to `neura` dialect, we need to provide the appropriate test for a conversion pass from a dialect to `neura` dialect. + +For now, we have: +`affine2neura`: tests provided for `--lower-affine-to-neura` [To be provided] +`arith2neura`: tests provided for `--lower-arith-to-neura` +`c2llvm2mlir`: tests provided for `--lower-llvm-to-neura` + +## 2 Neura Compiler Test +Tests for individual passes/pass pipelines at the `neura` dialect level. + +## 3 Samples +A collection of real-world applications for generating small unit tests. + +For now, [BERT](https://github.com/codertimo/BERT-pytorch) and [LENET](https://github.com/kuangliu/pytorch-cifar/blob/master/models/lenet.py) are included. + +We generate the `linalg` dialect of these models via [Torch MLIR](https://github.com/llvm/torch-mlir), which is then lowered to `affine` dialect for further lowering. + +Due to the data dependencies between loops in models, we are now unable to automatically extract each of these SINGLE loops from the model IR for individual tests. + +But we can manually collect some small unit tests from these sample IRs. 
For example, you can write `c++` code of a loop from BERT by mimicking its corresponding `affine.for` operations, then use [Polygeist](https://github.com/llvm/Polygeist) to convert this `c++` code into `affine` mlir for further lowering. And that's how we generated tests in `affine2neura/bert`. \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node0/bert_node0.cpp b/test/affine2neura/bert/bert_node0/bert_node0.cpp new file mode 100644 index 00000000..a5d2e86b --- /dev/null +++ b/test/affine2neura/bert/bert_node0/bert_node0.cpp @@ -0,0 +1,11 @@ +void bert_node0( + const int input[1][128], + bool output[1][128]) { + for (int arg3 = 0; arg3 < 1; arg3++) { + for (int arg4 = 0; arg4 < 128; arg4++) { + int value = input[0][arg4]; + bool result = (value > 0); + output[arg3][arg4] = result; + } + } +} \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node0/bert_node0.mlir b/test/affine2neura/bert/bert_node0/bert_node0.mlir new file mode 100644 index 00000000..4c1eef85 --- /dev/null +++ b/test/affine2neura/bert/bert_node0/bert_node0.mlir @@ -0,0 +1,39 @@ +// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir +// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s + +module attributes {} { + func.func @_Z10bert_node0PA128_KiPA128_b(%arg0: memref, %arg1: memref) attributes {} { + %c0_i32 = arith.constant 0 : i32 + affine.for %arg2 = 0 to 128 { + %0 = affine.load %arg0[0, %arg2] : memref + %1 = arith.cmpi sgt, %0, %c0_i32 : i32 + %2 = arith.extui %1 : i1 to i8 + affine.store %2, %arg1[0, %arg2] : memref + } + return + } +} + +// CHECK: func.func @_Z10bert_node0PA128_KiPA128_b(%arg0: memref, %arg1: memref) attributes {accelerator = "neura"} { +// CHECK-NEXT: %0 = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %1 = "neura.constant"() <{value = 128 : index}> : () -> index +// CHECK-NEXT: %2 = "neura.constant"() <{value = 0 : i32}> : () -> 
i32 +// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index +// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%4 : i64) +// CHECK-NEXT: ^bb1(%5: i64): // 2 preds: ^bb0, ^bb2 +// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index +// CHECK-NEXT: %7 = "neura.icmp"(%6, %1) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb3 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: %8 = memref.load %arg0[%3, %6] : memref +// CHECK-NEXT: %9 = "neura.icmp"(%8, %2) <{cmpType = "sgt"}> : (i32, i32) -> i1 +// CHECK-NEXT: %10 = "neura.cast"(%9) <{cast_type = "extui"}> : (i1) -> i8 +// CHECK-NEXT: memref.store %10, %arg1[%3, %6] : memref +// CHECK-NEXT: %11 = "neura.add"(%6, %0) : (index, index) -> index +// CHECK-NEXT: %12 = builtin.unrealized_conversion_cast %11 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%12 : i64) +// CHECK-NEXT: ^bb3: // pred: ^bb1 +// CHECK-NEXT: return +// CHECK-NEXT: } +// CHECK-NEXT: } diff --git a/test/affine2neura/bert/bert_node1/bert_node1.cpp b/test/affine2neura/bert/bert_node1/bert_node1.cpp new file mode 100644 index 00000000..7aa5ca29 --- /dev/null +++ b/test/affine2neura/bert/bert_node1/bert_node1.cpp @@ -0,0 +1,19 @@ +void bert_node1( + bool input[1][1][1][1][1][128], + bool output[1][1][128][1][1][128]) { + + for (int arg3 = 0; arg3 < 1; arg3++) { + for (int arg4 = 0; arg4 < 1; arg4++) { + for (int arg5 = 0; arg5 < 128; arg5++) { + for (int arg6 = 0; arg6 < 1; arg6++) { + for (int arg7 = 0; arg7 < 1; arg7++) { + for (int arg8 = 0; arg8 < 128; arg8++) { + bool value = input[arg3][arg4][0][arg6][arg7][arg8]; + output[arg3][arg4][arg5][arg6][arg7][arg8] = value; + } + } + } + } + } + } +} \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node1/bert_node1.mlir b/test/affine2neura/bert/bert_node1/bert_node1.mlir new file mode 100644 index 00000000..0280d7c3 --- /dev/null +++ 
b/test/affine2neura/bert/bert_node1/bert_node1.mlir @@ -0,0 +1,44 @@ +// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir +// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s +module attributes {} { + func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref, %arg1: memref) attributes {} { + affine.for %arg2 = 0 to 128 { + affine.for %arg3 = 0 to 128 { + %0 = affine.load %arg0[0, 0, 0, 0, 0, %arg3] : memref + affine.store %0, %arg1[0, 0, %arg2, 0, 0, %arg3] : memref + } + } + return + } +} + +// CHECK: func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref, %arg1: memref) attributes {accelerator = "neura"} { +// CHECK-NEXT: %0 = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %1 = "neura.constant"() <{value = 128 : index}> : () -> index +// CHECK-NEXT: %2 = "neura.constant"() <{value = 0 : index}> : () -> index +// CHECK-NEXT: %3 = builtin.unrealized_conversion_cast %2 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%3 : i64) +// CHECK-NEXT: ^bb1(%4: i64): // 2 preds: ^bb0, ^bb5 +// CHECK-NEXT: %5 = builtin.unrealized_conversion_cast %4 : i64 to index +// CHECK-NEXT: %6 = "neura.icmp"(%5, %1) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %6, ^bb2, ^bb6 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: %7 = builtin.unrealized_conversion_cast %2 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%7 : i64) +// CHECK-NEXT: ^bb3(%8: i64): // 2 preds: ^bb2, ^bb4 +// CHECK-NEXT: %9 = builtin.unrealized_conversion_cast %8 : i64 to index +// CHECK-NEXT: %10 = "neura.icmp"(%9, %1) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %10, ^bb4, ^bb5 +// CHECK-NEXT: ^bb4: // pred: ^bb3 +// CHECK-NEXT: %11 = memref.load %arg0[%2, %2, %2, %2, %2, %9] : memref +// CHECK-NEXT: memref.store %11, %arg1[%2, %2, %5, %2, %2, %9] : memref +// CHECK-NEXT: %12 = "neura.add"(%9, %0) : (index, index) -> index +// 
CHECK-NEXT: %13 = builtin.unrealized_conversion_cast %12 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%13 : i64) +// CHECK-NEXT: ^bb5: // pred: ^bb3 +// CHECK-NEXT: %14 = "neura.add"(%5, %0) : (index, index) -> index +// CHECK-NEXT: %15 = builtin.unrealized_conversion_cast %14 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%15 : i64) +// CHECK-NEXT: ^bb6: // pred: ^bb1 +// CHECK-NEXT: return +// CHECK-NEXT: } \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node2/bert_node2.cpp b/test/affine2neura/bert/bert_node2/bert_node2.cpp new file mode 100644 index 00000000..9dda6885 --- /dev/null +++ b/test/affine2neura/bert/bert_node2/bert_node2.cpp @@ -0,0 +1,25 @@ +void bert_node2( + const int input_indices[1][128], + const float embedding_table[30522][768], + float output[1][128][768]) { + const int c30522 = 30522; + const int c0_i64 = 0; + + for (int arg3 = 0; arg3 < 1; arg3++) { + for (int arg4 = 0; arg4 < 128; arg4++) { + for (int arg5 = 0; arg5 < 768; arg5++) { + int index_i64 = input_indices[arg3][arg4]; + int index = static_cast(index_i64); + // Bound checking instead of assertions + if (index >= c30522) { + index = c30522 - 1; // Clamp to maximum valid index + } + if (index < c0_i64) { + index = c0_i64; // Clamp to minimum valid index + } + float extracted_value = embedding_table[index][arg5]; + output[arg3][arg4][arg5] = extracted_value; + } + } + } +} \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node2/bert_node2.mlir b/test/affine2neura/bert/bert_node2/bert_node2.mlir new file mode 100644 index 00000000..6b70666a --- /dev/null +++ b/test/affine2neura/bert/bert_node2/bert_node2.mlir @@ -0,0 +1,78 @@ +// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir +// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s +module attributes {} { + func.func @_Z10bert_node2PA128_KiPA768_KfPA128_A768_f(%arg0: memref, %arg1: memref, %arg2: memref) attributes {} { 
+ %false = arith.constant false + %c30521_i32 = arith.constant 30521 : i32 + %c0_i32 = arith.constant 0 : i32 + %c30522_i32 = arith.constant 30522 : i32 + affine.for %arg3 = 0 to 128 { + affine.for %arg4 = 0 to 768 { + %0 = affine.load %arg0[0, %arg3] : memref + %1 = arith.cmpi sge, %0, %c30522_i32 : i32 + %2 = arith.select %1, %c30521_i32, %0 : i32 + %3 = scf.if %1 -> (i1) { + scf.yield %false : i1 + } else { + %7 = arith.cmpi slt, %0, %c0_i32 : i32 + scf.yield %7 : i1 + } + %4 = arith.select %3, %c0_i32, %2 : i32 + %5 = arith.index_cast %4 : i32 to index + %6 = memref.load %arg1[%5, %arg4] : memref + affine.store %6, %arg2[0, %arg3, %arg4] : memref + } + } + return + } +} + +// CHECK: func.func @_Z10bert_node2PA128_KiPA768_KfPA128_A768_f(%arg0: memref, %arg1: memref, %arg2: memref) attributes {accelerator = "neura"} { +// CHECK-NEXT: %0 = "neura.constant"() <{value = 768 : index}> : () -> index +// CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index +// CHECK-NEXT: %3 = "neura.constant"() <{value = false}> : () -> i1 +// CHECK-NEXT: %4 = "neura.constant"() <{value = 30521 : i32}> : () -> i32 +// CHECK-NEXT: %5 = "neura.constant"() <{value = 0 : i32}> : () -> i32 +// CHECK-NEXT: %6 = "neura.constant"() <{value = 30522 : i32}> : () -> i32 +// CHECK-NEXT: %7 = "neura.constant"() <{value = 0 : index}> : () -> index +// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %7 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%8 : i64) +// CHECK-NEXT: ^bb1(%9: i64): // 2 preds: ^bb0, ^bb9 +// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index +// CHECK-NEXT: %11 = "neura.icmp"(%10, %2) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %11, ^bb2, ^bb10 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: %12 = builtin.unrealized_conversion_cast %7 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%12 : i64) +// CHECK-NEXT: ^bb3(%13: i64): // 2 
preds: ^bb2, ^bb8 +// CHECK-NEXT: %14 = builtin.unrealized_conversion_cast %13 : i64 to index +// CHECK-NEXT: %15 = "neura.icmp"(%14, %0) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %15, ^bb4, ^bb9 +// CHECK-NEXT: ^bb4: // pred: ^bb3 +// CHECK-NEXT: %16 = memref.load %arg0[%7, %10] : memref +// CHECK-NEXT: %17 = "neura.icmp"(%16, %6) <{cmpType = "sge"}> : (i32, i32) -> i1 +// CHECK-NEXT: %18 = "neura.sel"(%4, %16, %17) : (i32, i32, i1) -> i32 +// CHECK-NEXT: llvm.cond_br %17, ^bb5, ^bb6 +// CHECK-NEXT: ^bb5: // pred: ^bb4 +// CHECK-NEXT: llvm.br ^bb7(%3 : i1) +// CHECK-NEXT: ^bb6: // pred: ^bb4 +// CHECK-NEXT: %19 = "neura.icmp"(%16, %5) <{cmpType = "slt"}> : (i32, i32) -> i1 +// CHECK-NEXT: llvm.br ^bb7(%19 : i1) +// CHECK-NEXT: ^bb7(%20: i1): // 2 preds: ^bb5, ^bb6 +// CHECK-NEXT: llvm.br ^bb8 +// CHECK-NEXT: ^bb8: // pred: ^bb7 +// CHECK-NEXT: %21 = "neura.sel"(%5, %18, %20) : (i32, i32, i1) -> i32 +// CHECK-NEXT: %22 = "neura.cast"(%21) <{cast_type = "indexCast"}> : (i32) -> index +// CHECK-NEXT: %23 = memref.load %arg1[%22, %14] : memref +// CHECK-NEXT: memref.store %23, %arg2[%7, %10, %14] : memref +// CHECK-NEXT: %24 = "neura.add"(%14, %1) : (index, index) -> index +// CHECK-NEXT: %25 = builtin.unrealized_conversion_cast %24 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%25 : i64) +// CHECK-NEXT: ^bb9: // pred: ^bb3 +// CHECK-NEXT: %26 = "neura.add"(%10, %1) : (index, index) -> index +// CHECK-NEXT: %27 = builtin.unrealized_conversion_cast %26 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%27 : i64) +// CHECK-NEXT: ^bb10: // pred: ^bb1 +// CHECK-NEXT: return +// CHECK-NEXT: } \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node28/bert_node28.cpp b/test/affine2neura/bert/bert_node28/bert_node28.cpp new file mode 100644 index 00000000..4853daef --- /dev/null +++ b/test/affine2neura/bert/bert_node28/bert_node28.cpp @@ -0,0 +1,19 @@ +void bert_node28(const float input_A[1][128][768], + const float 
input_B[1][768][768], + float output[1][128][768]) { + + for (int arg3 = 0; arg3 < 1; arg3++) { + for (int arg4 = 0; arg4 < 128; arg4++) { + for (int arg5 = 0; arg5 < 768; arg5++) { + for (int arg6 = 0; arg6 < 768; arg6++) { + float val_A = input_A[arg3][arg4][arg6]; + float val_B = input_B[arg3][arg6][arg5]; + float val_C = output[arg3][arg4][arg5]; + float mul_result = val_A * val_B; + float add_result = val_C + mul_result; + output[arg3][arg4][arg5] = add_result; + } + } + } + } +} \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node28/bert_node28.mlir b/test/affine2neura/bert/bert_node28/bert_node28.mlir new file mode 100644 index 00000000..01f54a51 --- /dev/null +++ b/test/affine2neura/bert/bert_node28/bert_node28.mlir @@ -0,0 +1,64 @@ +// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir +// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s +module attributes {} { + func.func @_Z11bert_node28PA128_A768_KfPA768_S0_PA128_A768_f(%arg0: memref, %arg1: memref, %arg2: memref) attributes {} { + affine.for %arg3 = 0 to 128 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %0 = affine.load %arg0[0, %arg3, %arg5] : memref + %1 = affine.load %arg1[0, %arg5, %arg4] : memref + %2 = affine.load %arg2[0, %arg3, %arg4] : memref + %3 = arith.mulf %0, %1 : f32 + %4 = arith.addf %2, %3 : f32 + affine.store %4, %arg2[0, %arg3, %arg4] : memref + } + } + } + return + } +} +// CHECK: func.func @_Z11bert_node28PA128_A768_KfPA768_S0_PA128_A768_f(%arg0: memref, %arg1: memref, %arg2: memref) attributes {accelerator = "neura"} { +// CHECK-NEXT: %0 = "neura.constant"() <{value = 768 : index}> : () -> index +// CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index +// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index +// CHECK-NEXT: %4 = 
builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%4 : i64) +// CHECK-NEXT: ^bb1(%5: i64): // 2 preds: ^bb0, ^bb8 +// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index +// CHECK-NEXT: %7 = "neura.icmp"(%6, %2) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb9 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%8 : i64) +// CHECK-NEXT: ^bb3(%9: i64): // 2 preds: ^bb2, ^bb7 +// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index +// CHECK-NEXT: %11 = "neura.icmp"(%10, %0) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %11, ^bb4, ^bb8 +// CHECK-NEXT: ^bb4: // pred: ^bb3 +// CHECK-NEXT: %12 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb5(%12 : i64) +// CHECK-NEXT: ^bb5(%13: i64): // 2 preds: ^bb4, ^bb6 +// CHECK-NEXT: %14 = builtin.unrealized_conversion_cast %13 : i64 to index +// CHECK-NEXT: %15 = "neura.icmp"(%14, %0) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %15, ^bb6, ^bb7 +// CHECK-NEXT: ^bb6: // pred: ^bb5 +// CHECK-NEXT: %16 = memref.load %arg0[%3, %6, %14] : memref +// CHECK-NEXT: %17 = memref.load %arg1[%3, %14, %10] : memref +// CHECK-NEXT: %18 = memref.load %arg2[%3, %6, %10] : memref +// CHECK-NEXT: %19 = "neura.fmul"(%16, %17) : (f32, f32) -> f32 +// CHECK-NEXT: %20 = "neura.fadd"(%18, %19) : (f32, f32) -> f32 +// CHECK-NEXT: memref.store %20, %arg2[%3, %6, %10] : memref +// CHECK-NEXT: %21 = "neura.add"(%14, %1) : (index, index) -> index +// CHECK-NEXT: %22 = builtin.unrealized_conversion_cast %21 : index to i64 +// CHECK-NEXT: llvm.br ^bb5(%22 : i64) +// CHECK-NEXT: ^bb7: // pred: ^bb5 +// CHECK-NEXT: %23 = "neura.add"(%10, %1) : (index, index) -> index +// CHECK-NEXT: %24 = builtin.unrealized_conversion_cast %23 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%24 : i64) +// 
CHECK-NEXT: ^bb8: // pred: ^bb3 +// CHECK-NEXT: %25 = "neura.add"(%6, %1) : (index, index) -> index +// CHECK-NEXT: %26 = builtin.unrealized_conversion_cast %25 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%26 : i64) +// CHECK-NEXT: ^bb9: // pred: ^bb1 +// CHECK-NEXT: return \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node3/bert_node3.cpp b/test/affine2neura/bert/bert_node3/bert_node3.cpp new file mode 100644 index 00000000..ddafc0a6 --- /dev/null +++ b/test/affine2neura/bert/bert_node3/bert_node3.cpp @@ -0,0 +1,14 @@ +void bert_node3(const float input1[1][128][768], + const float input2[1][128][768], float output[1][128][768]) { + + for (int arg3 = 0; arg3 < 1; arg3++) { + for (int arg4 = 0; arg4 < 128; arg4++) { + for (int arg5 = 0; arg5 < 768; arg5++) { + float val1 = input1[0][arg4][arg5]; + float val2 = input2[0][arg4][arg5]; + float sum = val1 + val2; + output[arg3][arg4][arg5] = sum; + } + } + } +} \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node3/bert_node3.mlir b/test/affine2neura/bert/bert_node3/bert_node3.mlir new file mode 100644 index 00000000..1c400deb --- /dev/null +++ b/test/affine2neura/bert/bert_node3/bert_node3.mlir @@ -0,0 +1,48 @@ +// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir +// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s +module attributes {} { + func.func @_Z10bert_node3PA128_A768_KfS2_PA128_A768_f(%arg0: memref, %arg1: memref, %arg2: memref) attributes {} { + affine.for %arg3 = 0 to 128 { + affine.for %arg4 = 0 to 768 { + %0 = affine.load %arg0[0, %arg3, %arg4] : memref + %1 = affine.load %arg1[0, %arg3, %arg4] : memref + %2 = arith.addf %0, %1 : f32 + affine.store %2, %arg2[0, %arg3, %arg4] : memref + } + } + return + } +} + +// CHECK: func.func @_Z10bert_node3PA128_A768_KfS2_PA128_A768_f(%arg0: memref, %arg1: memref, %arg2: memref) attributes {accelerator = "neura"} { +// CHECK-NEXT: %0 = 
"neura.constant"() <{value = 768 : index}> : () -> index +// CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index +// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index +// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%4 : i64) +// CHECK-NEXT: ^bb1(%5: i64): // 2 preds: ^bb0, ^bb5 +// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index +// CHECK-NEXT: %7 = "neura.icmp"(%6, %2) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb6 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%8 : i64) +// CHECK-NEXT: ^bb3(%9: i64): // 2 preds: ^bb2, ^bb4 +// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index +// CHECK-NEXT: %11 = "neura.icmp"(%10, %0) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %11, ^bb4, ^bb5 +// CHECK-NEXT: ^bb4: // pred: ^bb3 +// CHECK-NEXT: %12 = memref.load %arg0[%3, %6, %10] : memref +// CHECK-NEXT: %13 = memref.load %arg1[%3, %6, %10] : memref +// CHECK-NEXT: %14 = "neura.fadd"(%12, %13) : (f32, f32) -> f32 +// CHECK-NEXT: memref.store %14, %arg2[%3, %6, %10] : memref +// CHECK-NEXT: %15 = "neura.add"(%10, %1) : (index, index) -> index +// CHECK-NEXT: %16 = builtin.unrealized_conversion_cast %15 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%16 : i64) +// CHECK-NEXT: ^bb5: // pred: ^bb3 +// CHECK-NEXT: %17 = "neura.add"(%6, %1) : (index, index) -> index +// CHECK-NEXT: %18 = builtin.unrealized_conversion_cast %17 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%18 : i64) +// CHECK-NEXT: ^bb6: // pred: ^bb1 +// CHECK-NEXT: return diff --git a/test/affine2neura/bert/bert_node8/bert_node8.cpp b/test/affine2neura/bert/bert_node8/bert_node8.cpp new file mode 100644 index 00000000..e2054b7d --- /dev/null +++ 
b/test/affine2neura/bert/bert_node8/bert_node8.cpp @@ -0,0 +1,14 @@ +void bert_node8( + const float input[1][128][1], + float output[1][128][1]) { + const float divisor = 768.0f; + for (int arg3 = 0; arg3 < 1; arg3++) { + for (int arg4 = 0; arg4 < 128; arg4++) { + for (int arg5 = 0; arg5 < 1; arg5++) { + float value = input[0][arg4][0]; + float result = value / divisor; + output[arg3][arg4][arg5] = result; + } + } + } +} \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node8/bert_node8.mlir b/test/affine2neura/bert/bert_node8/bert_node8.mlir new file mode 100644 index 00000000..dbb59d40 --- /dev/null +++ b/test/affine2neura/bert/bert_node8/bert_node8.mlir @@ -0,0 +1,34 @@ +// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir +// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s +module attributes {} { + func.func @_Z10bert_node8PA128_A1_KfPA128_A1_f(%arg0: memref, %arg1: memref) attributes {} { + %cst = arith.constant 7.680000e+02 : f32 + affine.for %arg2 = 0 to 128 { + %0 = affine.load %arg0[0, %arg2, 0] : memref + %1 = arith.divf %0, %cst : f32 + affine.store %1, %arg1[0, %arg2, 0] : memref + } + return + } +} + +// CHECK: func.func @_Z10bert_node8PA128_A1_KfPA128_A1_f(%arg0: memref, %arg1: memref) attributes {accelerator = "neura"} { +// CHECK-NEXT: %0 = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %1 = "neura.constant"() <{value = 128 : index}> : () -> index +// CHECK-NEXT: %2 = "neura.constant"() <{value = 7.680000e+02 : f32}> : () -> f32 +// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index +// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%4 : i64) +// CHECK-NEXT: ^bb1(%5: i64): // 2 preds: ^bb0, ^bb2 +// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index +// CHECK-NEXT: %7 = "neura.icmp"(%6, %1) <{cmpType = "slt"}> : (index, index) -> i1 
+// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb3 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: %8 = memref.load %arg0[%3, %6, %3] : memref +// CHECK-NEXT: %9 = "neura.fdiv"(%8, %2) : (f32, f32) -> f32 +// CHECK-NEXT: memref.store %9, %arg1[%3, %6, %3] : memref +// CHECK-NEXT: %10 = "neura.add"(%6, %0) : (index, index) -> index +// CHECK-NEXT: %11 = builtin.unrealized_conversion_cast %10 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%11 : i64) +// CHECK-NEXT: ^bb3: // pred: ^bb1 +// CHECK-NEXT: return diff --git a/test/affine2neura/bert/bert_node9/bert_node9.cpp b/test/affine2neura/bert/bert_node9/bert_node9.cpp new file mode 100644 index 00000000..63f63756 --- /dev/null +++ b/test/affine2neura/bert/bert_node9/bert_node9.cpp @@ -0,0 +1,13 @@ +void bert_node9( + const float input[1][128][768], + double output[1][128][768]) { + for (int arg3 = 0; arg3 < 1; arg3++) { + for (int arg4 = 0; arg4 < 128; arg4++) { + for (int arg5 = 0; arg5 < 768; arg5++) { + float value = input[0][arg4][arg5]; + double extended_value = static_cast(value); + output[arg3][arg4][arg5] = extended_value; + } + } + } +} \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node9/bert_node9.mlir b/test/affine2neura/bert/bert_node9/bert_node9.mlir new file mode 100644 index 00000000..3641e16b --- /dev/null +++ b/test/affine2neura/bert/bert_node9/bert_node9.mlir @@ -0,0 +1,47 @@ +// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir +// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s +module attributes {} { + func.func @_Z10bert_node9PA128_A768_KfPA128_A768_d(%arg0: memref, %arg1: memref) attributes {} { + affine.for %arg2 = 0 to 128 { + affine.for %arg3 = 0 to 768 { + %0 = affine.load %arg0[0, %arg2, %arg3] : memref + %1 = arith.extf %0 : f32 to f64 + affine.store %1, %arg1[0, %arg2, %arg3] : memref + } + } + return + } +} + + +// CHECK: func.func @_Z10bert_node9PA128_A768_KfPA128_A768_d(%arg0: memref, 
%arg1: memref) attributes {accelerator = "neura"} { +// CHECK-NEXT: %0 = "neura.constant"() <{value = 768 : index}> : () -> index +// CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index +// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index +// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%4 : i64) +// CHECK-NEXT: ^bb1(%5: i64): // 2 preds: ^bb0, ^bb5 +// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index +// CHECK-NEXT: %7 = "neura.icmp"(%6, %2) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb6 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %3 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%8 : i64) +// CHECK-NEXT: ^bb3(%9: i64): // 2 preds: ^bb2, ^bb4 +// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index +// CHECK-NEXT: %11 = "neura.icmp"(%10, %0) <{cmpType = "slt"}> : (index, index) -> i1 +// CHECK-NEXT: llvm.cond_br %11, ^bb4, ^bb5 +// CHECK-NEXT: ^bb4: // pred: ^bb3 +// CHECK-NEXT: %12 = memref.load %arg0[%3, %6, %10] : memref +// CHECK-NEXT: %13 = "neura.cast"(%12) <{cast_type = "extf"}> : (f32) -> f64 +// CHECK-NEXT: memref.store %13, %arg1[%3, %6, %10] : memref +// CHECK-NEXT: %14 = "neura.add"(%10, %1) : (index, index) -> index +// CHECK-NEXT: %15 = builtin.unrealized_conversion_cast %14 : index to i64 +// CHECK-NEXT: llvm.br ^bb3(%15 : i64) +// CHECK-NEXT: ^bb5: // pred: ^bb3 +// CHECK-NEXT: %16 = "neura.add"(%6, %1) : (index, index) -> index +// CHECK-NEXT: %17 = builtin.unrealized_conversion_cast %16 : index to i64 +// CHECK-NEXT: llvm.br ^bb1(%17 : i64) +// CHECK-NEXT: ^bb6: // pred: ^bb1 +// CHECK-NEXT: return diff --git a/test/lit.cfg.in b/test/lit.cfg.in index aa45ea35..2530f5c3 100644 --- a/test/lit.cfg.in +++ b/test/lit.cfg.in @@ -6,6 +6,7 @@ config.test_format 
= lit.formats.ShTest(True) config.suffixes = ['.mlir'] config.test_source_root = os.path.dirname(__file__) config.test_exec_root = os.path.dirname(__file__) +config.excludes = ['samples'] # Tool substitutions from CMake config.substitutions.append(('mlir-neura-opt', '@MLIR_NEURA_OPT@')) diff --git a/test/neura/llvm_sub.mlir b/test/neura/llvm_sub.mlir index 1cf1fbf4..8b1f8b27 100644 --- a/test/neura/llvm_sub.mlir +++ b/test/neura/llvm_sub.mlir @@ -5,6 +5,6 @@ func.func @test(%a: f32) -> f32 { %res = llvm.fsub %a, %b : f32 // CHECK: [[LHS:%.*]] = "neura.data_mov"(%{{.*}}) : (f32) -> f32 // CHECK: [[RHS:%.*]] = "neura.data_mov"(%{{.*}}) : (f32) -> f32 - // CHECK: [[RES:%.*]] = "neura.fsub"([[LHS]], [[RHS]]) + // CHECK: [[RES:%.*]] = "neura.fsub"([[LHS]], [[RHS]]) : (f32, f32) -> f32 return %res : f32 } \ No newline at end of file diff --git a/test/samples/bert/bert_affine.mlir b/test/samples/bert/bert_affine.mlir new file mode 100644 index 00000000..e47b9f88 --- /dev/null +++ b/test/samples/bert/bert_affine.mlir @@ -0,0 +1,2266 @@ +module { + func.func @main(%arg0: tensor<1x512x768xf32>, %arg1: tensor<1x128xi64>, %arg2: tensor<1x128xi64>) -> tensor<1x128x768xf32> { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<-1.000000e+09> : tensor + %cst_0 = arith.constant dense_resource : tensor<768xf32> + %cst_1 = arith.constant dense_resource : tensor<768x3072xf32> + %cst_2 = arith.constant dense_resource : tensor<3072xf32> + %cst_3 = arith.constant dense_resource : tensor<3072x768xf32> + %cst_4 = arith.constant dense_resource : tensor<768xf32> + %cst_5 = arith.constant dense_resource : tensor<768xf32> + %cst_6 = arith.constant dense_resource : tensor<768xf32> + %cst_7 = arith.constant dense_resource : tensor<768x768xf32> + %cst_8 = arith.constant dense_resource : tensor<768xf32> + %cst_9 = arith.constant dense_resource : tensor<768x768xf32> + %cst_10 = arith.constant dense_resource : tensor<768xf32> + %cst_11 = arith.constant dense_resource : 
tensor<768x768xf32> + %cst_12 = arith.constant dense_resource : tensor<768xf32> + %cst_13 = arith.constant dense_resource : tensor<768x768xf32> + %cst_14 = arith.constant dense_resource : tensor<768xf32> + %cst_15 = arith.constant dense_resource : tensor<768xf32> + %cst_16 = arith.constant dense_resource : tensor<768xf32> + %cst_17 = arith.constant dense_resource : tensor<768x3072xf32> + %cst_18 = arith.constant dense_resource : tensor<3072xf32> + %cst_19 = arith.constant dense_resource : tensor<3072x768xf32> + %cst_20 = arith.constant dense_resource : tensor<768xf32> + %cst_21 = arith.constant dense_resource : tensor<768xf32> + %cst_22 = arith.constant dense_resource : tensor<768xf32> + %cst_23 = arith.constant dense_resource : tensor<768x768xf32> + %cst_24 = arith.constant dense_resource : tensor<768xf32> + %cst_25 = arith.constant dense_resource : tensor<768x768xf32> + %cst_26 = arith.constant dense_resource : tensor<768xf32> + %cst_27 = arith.constant dense_resource : tensor<768x768xf32> + %cst_28 = arith.constant dense_resource : tensor<768xf32> + %cst_29 = arith.constant dense_resource : tensor<768x768xf32> + %cst_30 = arith.constant dense_resource : tensor<768xf32> + %cst_31 = arith.constant dense_resource : tensor<768xf32> + %cst_32 = arith.constant dense_resource : tensor<3x768xf32> + %cst_33 = arith.constant 1.000000e+00 : f32 + %cst_34 = arith.constant 3.000000e+00 : f32 + %cst_35 = arith.constant 5.000000e-01 : f32 + %cst_36 = arith.constant 8.000000e+00 : f32 + %cst_37 = arith.constant 7.680000e+02 : f64 + %cst_38 = arith.constant 7.680000e+02 : f32 + %cst_39 = arith.constant 9.9999999999999995E-7 : f64 + %cst_40 = arith.constant 4.471500e-02 : f64 + %cst_41 = arith.constant 0.79788456080286541 : f64 + %cst_42 = arith.constant 7.670000e+02 : f64 + %cst_43 = arith.constant 0xFF800000 : f32 + %cst_44 = arith.constant 0.000000e+00 : f64 + %cst_45 = arith.constant 0.000000e+00 : f32 + %c30522 = arith.constant 30522 : index + %c3 = arith.constant 3 : index 
+ %c0_i64 = arith.constant 0 : i64 + %cst_46 = arith.constant dense_resource : tensor<30522x768xf32> + %0 = bufferization.to_memref %arg2 : memref<1x128xi64> + %1 = bufferization.to_memref %arg1 : memref<1x128xi64> + %2 = bufferization.to_memref %arg1 : memref<1x128xi64> + %3 = bufferization.to_memref %cst_31 : memref<768xf32> + %4 = bufferization.to_memref %cst_30 : memref<768xf32> + %5 = bufferization.to_memref %cst_29 : memref<768x768xf32> + %6 = bufferization.to_memref %cst_28 : memref<768xf32> + %7 = bufferization.to_memref %cst_27 : memref<768x768xf32> + %8 = bufferization.to_memref %cst_26 : memref<768xf32> + %9 = bufferization.to_memref %cst_25 : memref<768x768xf32> + %10 = bufferization.to_memref %cst_24 : memref<768xf32> + %11 = bufferization.to_memref %cst_23 : memref<768x768xf32> + %12 = bufferization.to_memref %cst_22 : memref<768xf32> + %13 = bufferization.to_memref %cst_21 : memref<768xf32> + %14 = bufferization.to_memref %cst_20 : memref<768xf32> + %15 = bufferization.to_memref %cst_19 : memref<3072x768xf32> + %16 = bufferization.to_memref %cst_18 : memref<3072xf32> + %17 = bufferization.to_memref %cst_17 : memref<768x3072xf32> + %18 = bufferization.to_memref %cst_16 : memref<768xf32> + %19 = bufferization.to_memref %cst_15 : memref<768xf32> + %20 = bufferization.to_memref %cst_14 : memref<768xf32> + %21 = bufferization.to_memref %cst_13 : memref<768x768xf32> + %22 = bufferization.to_memref %cst_12 : memref<768xf32> + %23 = bufferization.to_memref %cst_11 : memref<768x768xf32> + %24 = bufferization.to_memref %cst_10 : memref<768xf32> + %25 = bufferization.to_memref %cst_9 : memref<768x768xf32> + %26 = bufferization.to_memref %cst_8 : memref<768xf32> + %27 = bufferization.to_memref %cst_7 : memref<768x768xf32> + %28 = bufferization.to_memref %cst_6 : memref<768xf32> + %29 = bufferization.to_memref %cst_5 : memref<768xf32> + %30 = bufferization.to_memref %cst_4 : memref<768xf32> + %31 = bufferization.to_memref %cst_3 : memref<3072x768xf32> + %32 = 
bufferization.to_memref %cst_2 : memref<3072xf32> + %33 = bufferization.to_memref %cst_1 : memref<768x3072xf32> + %34 = bufferization.to_memref %cst_0 : memref<768xf32> + %35 = bufferization.to_memref %cst : memref + %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x128xi1> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + %88 = affine.load %2[%c0, %arg4] : memref<1x128xi64> + %89 = arith.cmpi sgt, %88, %c0_i64 : i64 + affine.store %89, %alloc[%arg3, %arg4] : memref<1x128xi1> + } + } + %36 = bufferization.to_tensor %alloc : memref<1x128xi1> + %expanded = tensor.expand_shape %36 [[0, 1], [2, 3, 4, 5]] : tensor<1x128xi1> into tensor<1x1x1x1x1x128xi1> + %37 = bufferization.to_memref %expanded : memref<1x1x1x1x1x128xi1> + %alloc_47 = memref.alloc() {alignment = 64 : i64} : memref<1x1x128x1x1x128xi1> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 1 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 1 { + affine.for %arg7 = 0 to 1 { + affine.for %arg8 = 0 to 128 { + %88 = affine.load %37[%arg3, %arg4, %c0, %arg6, %arg7, %arg8] : memref<1x1x1x1x1x128xi1> + affine.store %88, %alloc_47[%arg3, %arg4, %arg5, %arg6, %arg7, %arg8] : memref<1x1x128x1x1x128xi1> + } + } + } + } + } + } + %38 = bufferization.to_tensor %alloc_47 : memref<1x1x128x1x1x128xi1> + %collapsed = tensor.collapse_shape %38 [[0], [1, 2], [3, 4, 5]] : tensor<1x1x128x1x1x128xi1> into tensor<1x128x128xi1> + %expanded_48 = tensor.expand_shape %collapsed [[0], [1, 2], [3]] : tensor<1x128x128xi1> into tensor<1x1x128x128xi1> + %39 = bufferization.to_memref %expanded_48 : memref<1x1x128x128xi1> + %alloc_49 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %1[%arg3, %arg4] : memref<1x128xi64> + %89 = arith.index_cast %88 : i64 to index + %90 = arith.cmpi slt, %89, %c30522 : index + cf.assert %90, "index must be smaller than dim size" + %91 = arith.cmpi 
sge, %88, %c0_i64 : i64 + cf.assert %91, "index must be larger or equal to 0" + %extracted = tensor.extract %cst_46[%89, %arg5] : tensor<30522x768xf32> + affine.store %extracted, %alloc_49[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %extracted_slice = tensor.extract_slice %arg0[0, 0, 0] [1, 128, 768] [1, 1, 1] : tensor<1x512x768xf32> to tensor<1x128x768xf32> + %40 = bufferization.to_memref %extracted_slice : memref<1x128x768xf32> + %alloc_50 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_49[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %40[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_50[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_51 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %0[%arg3, %arg4] : memref<1x128xi64> + %89 = arith.index_cast %88 : i64 to index + %90 = arith.cmpi slt, %89, %c3 : index + cf.assert %90, "index must be smaller than dim size" + %91 = arith.cmpi sge, %88, %c0_i64 : i64 + cf.assert %91, "index must be larger or equal to 0" + %extracted = tensor.extract %cst_32[%89, %arg5] : tensor<3x768xf32> + affine.store %extracted, %alloc_51[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_52 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_50[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_51[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_52[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_53 = memref.alloc() {alignment = 64 : i64} : 
memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + affine.store %cst_45, %alloc_53[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_54 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + memref.copy %alloc_53, %alloc_54 : memref<1x128x1xf32> to memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_52[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_54[%arg3, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_54[%arg3, %arg4, %c0] : memref<1x128x1xf32> + } + } + } + %alloc_55 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_54[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.divf %88, %cst_38 : f32 + affine.store %89, %alloc_55[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_56 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_52[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = arith.extf %88 : f32 to f64 + affine.store %89, %alloc_56[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_57 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + affine.store %cst_44, %alloc_57[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_58 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_58 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_56[%arg3, %arg4, %arg5] : 
memref<1x128x768xf64> + %89 = affine.load %alloc_58[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_58[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_59 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_58[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_37 : f64 + affine.store %89, %alloc_59[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_60 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_56[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_59[%c0, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.subf %88, %89 : f64 + affine.store %90, %alloc_60[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_61 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_60[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_60[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %90 = arith.mulf %88, %89 : f64 + affine.store %90, %alloc_61[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_62 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_62 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_61[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_62[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_62[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_63 = memref.alloc() {alignment = 64 : 
i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_62[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_42 : f64 + affine.store %89, %alloc_63[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_64 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_63[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.truncf %88 : f64 to f32 + affine.store %89, %alloc_64[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_65 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_64[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = math.sqrt %88 : f32 + affine.store %89, %alloc_65[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_66 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_52[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_55[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.subf %88, %89 : f32 + affine.store %90, %alloc_66[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_67 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %3[%arg5] : memref<768xf32> + %89 = affine.load %alloc_66[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_67[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_68 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + 
affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_65[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.truncf %cst_39 : f64 to f32 + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_68[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_69 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_67[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_68[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.divf %88, %89 : f32 + affine.store %90, %alloc_69[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_70 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_69[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %4[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_70[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_71 = memref.alloc() {alignment = 64 : i64} : memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %5[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_71[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_72 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_70[%c0, %arg4, %arg5] : memref<1x128x768xf32> + affine.store %88, %alloc_72[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_73 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_71[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_73[%arg3, %arg4, %arg5] : 
memref<1x768x768xf32> + } + } + } + %alloc_74 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.store %cst_45, %alloc_74[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_75 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_75 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_72[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_73[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_75[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_75[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_76 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_75[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %6[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_76[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %41 = bufferization.to_tensor %alloc_76 : memref<1x128x768xf32> + %expanded_77 = tensor.expand_shape %41 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %42 = bufferization.to_memref %expanded_77 : memref<1x128x12x64xf32> + %alloc_78 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 12 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %42[%arg3, %arg4, %arg5, %arg6] : memref<1x128x12x64xf32> + affine.store %88, %alloc_78[%arg3, %arg5, %arg4, %arg6] : memref<1x12x128x64xf32> + } + } 
+ } + } + %alloc_79 = memref.alloc() {alignment = 64 : i64} : memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %7[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_79[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_80 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_79[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_80[%arg3, %arg4, %arg5] : memref<1x768x768xf32> + } + } + } + %alloc_81 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_81 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_72[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_80[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_81[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_81[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_82 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_81[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %8[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_82[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %43 = bufferization.to_tensor %alloc_82 : memref<1x128x768xf32> + %expanded_83 = tensor.expand_shape %43 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %44 = bufferization.to_memref %expanded_83 : memref<1x128x12x64xf32> + %alloc_84 = memref.alloc() {alignment = 64 : i64} : 
memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %9[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_84[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_85 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_84[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_85[%arg3, %arg4, %arg5] : memref<1x768x768xf32> + } + } + } + %alloc_86 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_86 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_72[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_85[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_86[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_86[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_87 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_86[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %10[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_87[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %45 = bufferization.to_tensor %alloc_87 : memref<1x128x768xf32> + %expanded_88 = tensor.expand_shape %45 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %46 = bufferization.to_memref %expanded_88 : memref<1x128x12x64xf32> + %alloc_89 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 
= 0 to 128 { + affine.for %arg5 = 0 to 12 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %46[%arg3, %arg4, %arg5, %arg6] : memref<1x128x12x64xf32> + affine.store %88, %alloc_89[%arg3, %arg5, %arg4, %arg6] : memref<1x12x128x64xf32> + } + } + } + } + %alloc_90 = memref.alloc() {alignment = 64 : i64} : memref<1x12x64x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 12 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %44[%arg3, %arg4, %arg5, %arg6] : memref<1x128x12x64xf32> + affine.store %88, %alloc_90[%arg3, %arg5, %arg6, %arg4] : memref<1x12x64x128xf32> + } + } + } + } + %alloc_91 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %alloc_78[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + affine.store %88, %alloc_91[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + } + } + } + } + %47 = bufferization.to_tensor %alloc_91 : memref<1x12x128x64xf32> + %alloc_92 = memref.alloc() {alignment = 64 : i64} : memref<1x12x64x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 64 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_90[%c0, %arg4, %arg5, %arg6] : memref<1x12x64x128xf32> + affine.store %88, %alloc_92[%arg3, %arg4, %arg5, %arg6] : memref<1x12x64x128xf32> + } + } + } + } + %48 = bufferization.to_tensor %alloc_92 : memref<1x12x64x128xf32> + %collapsed_93 = tensor.collapse_shape %47 [[0, 1], [2], [3]] : tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %49 = bufferization.to_memref %collapsed_93 : memref<12x128x64xf32> + %collapsed_94 = tensor.collapse_shape %48 [[0, 1], [2], [3]] : tensor<1x12x64x128xf32> into tensor<12x64x128xf32> + %50 = bufferization.to_memref %collapsed_94 : memref<12x64x128xf32> + %alloc_95 = memref.alloc() {alignment = 64 : i64} : memref<12x128x128xf32> + affine.for 
%arg3 = 0 to 12 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 128 { + affine.store %cst_45, %alloc_95[%arg3, %arg4, %arg5] : memref<12x128x128xf32> + } + } + } + %alloc_96 = memref.alloc() {alignment = 64 : i64} : memref<12x128x128xf32> + memref.copy %alloc_95, %alloc_96 : memref<12x128x128xf32> to memref<12x128x128xf32> + affine.for %arg3 = 0 to 12 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %49[%arg3, %arg4, %arg6] : memref<12x128x64xf32> + %89 = affine.load %50[%arg3, %arg6, %arg5] : memref<12x64x128xf32> + %90 = affine.load %alloc_96[%arg3, %arg4, %arg5] : memref<12x128x128xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_96[%arg3, %arg4, %arg5] : memref<12x128x128xf32> + } + } + } + } + %51 = bufferization.to_tensor %alloc_96 : memref<12x128x128xf32> + %expanded_97 = tensor.expand_shape %51 [[0, 1], [2], [3]] : tensor<12x128x128xf32> into tensor<1x12x128x128xf32> + %52 = bufferization.to_memref %expanded_97 : memref<1x12x128x128xf32> + %alloc_98 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %52[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = arith.divf %88, %cst_36 : f32 + affine.store %89, %alloc_98[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_99 = memref.alloc() {alignment = 64 : i64} : memref<1x1x128x128xi1> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 1 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %39[%c0, %c0, %arg5, %arg6] : memref<1x1x128x128xi1> + %89 = arith.extui %88 : i1 to i64 + %90 = arith.cmpi eq, %89, %c0_i64 : i64 + affine.store %90, %alloc_99[%arg3, %arg4, %arg5, %arg6] : memref<1x1x128x128xi1> + } + } + } + } + %alloc_100 = memref.alloc() 
{alignment = 64 : i64} : memref + %53 = affine.load %35[] : memref + %54 = arith.truncf %53 : f64 to f32 + affine.store %54, %alloc_100[] : memref + %alloc_101 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_99[%c0, %c0, %arg5, %arg6] : memref<1x1x128x128xi1> + %89 = affine.load %alloc_100[] : memref + %90 = affine.load %alloc_98[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %91 = arith.select %88, %89, %90 : f32 + affine.store %91, %alloc_101[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_102 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128xi64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.store %c0_i64, %alloc_102[%arg3, %arg4, %arg5] : memref<1x12x128xi64> + } + } + } + %alloc_103 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.store %cst_43, %alloc_103[%arg3, %arg4, %arg5] : memref<1x12x128xf32> + } + } + } + %alloc_104 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128xf32> + memref.copy %alloc_103, %alloc_104 : memref<1x12x128xf32> to memref<1x12x128xf32> + %alloc_105 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128xi64> + memref.copy %alloc_102, %alloc_105 : memref<1x12x128xi64> to memref<1x12x128xi64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_101[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %alloc_104[%arg3, %arg4, %arg5] : memref<1x12x128xf32> + %90 = affine.load %alloc_105[%arg3, %arg4, %arg5] : memref<1x12x128xi64> + %91 = arith.index_cast %arg6 : index to i64 + %92 = arith.maximumf %88, %89 : f32 + %93 = 
arith.cmpf ogt, %88, %89 : f32 + %94 = arith.select %93, %91, %90 : i64 + affine.store %92, %alloc_104[%arg3, %arg4, %arg5] : memref<1x12x128xf32> + affine.store %94, %alloc_105[%arg3, %arg4, %arg5] : memref<1x12x128xi64> + } + } + } + } + %55 = bufferization.to_tensor %alloc_104 : memref<1x12x128xf32> + %expanded_106 = tensor.expand_shape %55 [[0], [1], [2, 3]] : tensor<1x12x128xf32> into tensor<1x12x128x1xf32> + %56 = bufferization.to_memref %expanded_106 : memref<1x12x128x1xf32> + %alloc_107 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_101[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %56[%c0, %arg4, %arg5, %c0] : memref<1x12x128x1xf32> + %90 = arith.subf %88, %89 : f32 + affine.store %90, %alloc_107[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_108 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_107[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = math.exp %88 : f32 + affine.store %89, %alloc_108[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_109 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 1 { + affine.store %cst_45, %alloc_109[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x1xf32> + } + } + } + } + %alloc_110 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x1xf32> + memref.copy %alloc_109, %alloc_110 : memref<1x12x128x1xf32> to memref<1x12x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { 
+ %88 = affine.load %alloc_108[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %alloc_110[%arg3, %arg4, %arg5, %c0] : memref<1x12x128x1xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_110[%arg3, %arg4, %arg5, %c0] : memref<1x12x128x1xf32> + } + } + } + } + %alloc_111 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_108[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %alloc_110[%c0, %arg4, %arg5, %c0] : memref<1x12x128x1xf32> + %90 = arith.divf %88, %89 : f32 + affine.store %90, %alloc_111[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_112 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_111[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + affine.store %88, %alloc_112[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %57 = bufferization.to_tensor %alloc_112 : memref<1x12x128x128xf32> + %alloc_113 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %alloc_89[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + affine.store %88, %alloc_113[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + } + } + } + } + %58 = bufferization.to_tensor %alloc_113 : memref<1x12x128x64xf32> + %collapsed_114 = tensor.collapse_shape %57 [[0, 1], [2], [3]] : tensor<1x12x128x128xf32> into tensor<12x128x128xf32> + %59 = bufferization.to_memref %collapsed_114 : memref<12x128x128xf32> + %collapsed_115 = tensor.collapse_shape %58 [[0, 1], [2], [3]] : 
tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %60 = bufferization.to_memref %collapsed_115 : memref<12x128x64xf32> + %alloc_116 = memref.alloc() {alignment = 64 : i64} : memref<12x128x64xf32> + affine.for %arg3 = 0 to 12 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 64 { + affine.store %cst_45, %alloc_116[%arg3, %arg4, %arg5] : memref<12x128x64xf32> + } + } + } + %alloc_117 = memref.alloc() {alignment = 64 : i64} : memref<12x128x64xf32> + memref.copy %alloc_116, %alloc_117 : memref<12x128x64xf32> to memref<12x128x64xf32> + affine.for %arg3 = 0 to 12 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 64 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %59[%arg3, %arg4, %arg6] : memref<12x128x128xf32> + %89 = affine.load %60[%arg3, %arg6, %arg5] : memref<12x128x64xf32> + %90 = affine.load %alloc_117[%arg3, %arg4, %arg5] : memref<12x128x64xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_117[%arg3, %arg4, %arg5] : memref<12x128x64xf32> + } + } + } + } + %61 = bufferization.to_tensor %alloc_117 : memref<12x128x64xf32> + %expanded_118 = tensor.expand_shape %61 [[0, 1], [2], [3]] : tensor<12x128x64xf32> into tensor<1x12x128x64xf32> + %62 = bufferization.to_memref %expanded_118 : memref<1x12x128x64xf32> + %alloc_119 = memref.alloc() {alignment = 64 : i64} : memref<1x128x12x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %62[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + affine.store %88, %alloc_119[%arg3, %arg5, %arg4, %arg6] : memref<1x128x12x64xf32> + } + } + } + } + %63 = bufferization.to_tensor %alloc_119 : memref<1x128x12x64xf32> + %collapsed_120 = tensor.collapse_shape %63 [[0], [1], [2, 3]] : tensor<1x128x12x64xf32> into tensor<1x128x768xf32> + %64 = bufferization.to_memref %collapsed_120 : memref<1x128x768xf32> + %alloc_121 = memref.alloc() {alignment = 64 : i64} : 
memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %11[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_121[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_122 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %64[%c0, %arg4, %arg5] : memref<1x128x768xf32> + affine.store %88, %alloc_122[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_123 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_121[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_123[%arg3, %arg4, %arg5] : memref<1x768x768xf32> + } + } + } + %alloc_124 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_124 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_122[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_123[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_124[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_124[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_125 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_124[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %12[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_125[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_126 = memref.alloc() 
{alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_52[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_125[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_126[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_127 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + memref.copy %alloc_53, %alloc_127 : memref<1x128x1xf32> to memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_126[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_127[%arg3, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_127[%arg3, %arg4, %c0] : memref<1x128x1xf32> + } + } + } + %alloc_128 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_127[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.divf %88, %cst_38 : f32 + affine.store %89, %alloc_128[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_129 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_126[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = arith.extf %88 : f32 to f64 + affine.store %89, %alloc_129[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_130 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_130 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_129[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + %89 = 
affine.load %alloc_130[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_130[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_131 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_130[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_37 : f64 + affine.store %89, %alloc_131[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_132 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_129[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_131[%c0, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.subf %88, %89 : f64 + affine.store %90, %alloc_132[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_133 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_132[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_132[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %90 = arith.mulf %88, %89 : f64 + affine.store %90, %alloc_133[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_134 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_134 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_133[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_134[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_134[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_135 = memref.alloc() {alignment = 64 : i64} : 
memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_134[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_42 : f64 + affine.store %89, %alloc_135[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_136 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_135[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.truncf %88 : f64 to f32 + affine.store %89, %alloc_136[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_137 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_136[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = math.sqrt %88 : f32 + affine.store %89, %alloc_137[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_138 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_126[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_128[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.subf %88, %89 : f32 + affine.store %90, %alloc_138[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_139 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %13[%arg5] : memref<768xf32> + %89 = affine.load %alloc_138[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_139[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_140 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 
to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_137[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.truncf %cst_39 : f64 to f32 + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_140[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_141 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_139[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_140[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.divf %88, %89 : f32 + affine.store %90, %alloc_141[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_142 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_141[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %14[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_142[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_143 = memref.alloc() {alignment = 64 : i64} : memref<768x3072xf32> + affine.for %arg3 = 0 to 3072 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %15[%arg3, %arg4] : memref<3072x768xf32> + affine.store %88, %alloc_143[%arg4, %arg3] : memref<768x3072xf32> + } + } + %alloc_144 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_142[%c0, %arg4, %arg5] : memref<1x128x768xf32> + affine.store %88, %alloc_144[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_145 = memref.alloc() {alignment = 64 : i64} : memref<1x768x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_143[%arg4, %arg5] : memref<768x3072xf32> + affine.store %88, 
%alloc_145[%arg3, %arg4, %arg5] : memref<1x768x3072xf32> + } + } + } + %alloc_146 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + affine.store %cst_45, %alloc_146[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_147 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + memref.copy %alloc_146, %alloc_147 : memref<1x128x3072xf32> to memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_144[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_145[%arg3, %arg6, %arg5] : memref<1x768x3072xf32> + %90 = affine.load %alloc_147[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_147[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + } + %alloc_148 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_147[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = affine.load %16[%arg5] : memref<3072xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_148[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_149 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_148[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = arith.mulf %88, %cst_35 : f32 + affine.store %89, %alloc_149[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_150 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = 
affine.load %alloc_148[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = math.powf %88, %cst_34 : f32 + affine.store %89, %alloc_150[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_151 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_150[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = arith.truncf %cst_40 : f64 to f32 + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_151[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_152 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_148[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = affine.load %alloc_151[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_152[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_153 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_152[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = arith.truncf %cst_41 : f64 to f32 + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_153[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_154 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_153[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = math.tanh %88 : f32 + affine.store %89, %alloc_154[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_155 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 
3072 { + %88 = affine.load %alloc_154[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = arith.addf %88, %cst_33 : f32 + affine.store %89, %alloc_155[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_156 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_149[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = affine.load %alloc_155[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_156[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_157 = memref.alloc() {alignment = 64 : i64} : memref<3072x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 3072 { + %88 = affine.load %17[%arg3, %arg4] : memref<768x3072xf32> + affine.store %88, %alloc_157[%arg4, %arg3] : memref<3072x768xf32> + } + } + %alloc_158 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_156[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + affine.store %88, %alloc_158[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_159 = memref.alloc() {alignment = 64 : i64} : memref<1x3072x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 3072 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_157[%arg4, %arg5] : memref<3072x768xf32> + affine.store %88, %alloc_159[%arg3, %arg4, %arg5] : memref<1x3072x768xf32> + } + } + } + %alloc_160 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_160 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 3072 { + %88 = affine.load %alloc_158[%arg3, %arg4, %arg6] : memref<1x128x3072xf32> + %89 = affine.load 
%alloc_159[%arg3, %arg6, %arg5] : memref<1x3072x768xf32> + %90 = affine.load %alloc_160[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_160[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_161 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_160[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %18[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_161[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_162 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_126[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_161[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_162[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_163 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + memref.copy %alloc_53, %alloc_163 : memref<1x128x1xf32> to memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_162[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_163[%arg3, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_163[%arg3, %arg4, %c0] : memref<1x128x1xf32> + } + } + } + %alloc_164 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_163[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.divf %88, %cst_38 : f32 + affine.store %89, %alloc_164[%arg3, %arg4, %arg5] : 
memref<1x128x1xf32> + } + } + } + %alloc_165 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_162[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = arith.extf %88 : f32 to f64 + affine.store %89, %alloc_165[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_166 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_166 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_165[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_166[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_166[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_167 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_166[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_37 : f64 + affine.store %89, %alloc_167[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_168 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_165[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_167[%c0, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.subf %88, %89 : f64 + affine.store %90, %alloc_168[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_169 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_168[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_168[%c0, 
%arg4, %arg5] : memref<1x128x768xf64> + %90 = arith.mulf %88, %89 : f64 + affine.store %90, %alloc_169[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_170 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_170 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_169[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_170[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_170[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_171 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_170[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_42 : f64 + affine.store %89, %alloc_171[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_172 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_171[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.truncf %88 : f64 to f32 + affine.store %89, %alloc_172[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_173 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_172[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = math.sqrt %88 : f32 + affine.store %89, %alloc_173[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_174 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_162[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 
= affine.load %alloc_164[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.subf %88, %89 : f32 + affine.store %90, %alloc_174[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_175 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %19[%arg5] : memref<768xf32> + %89 = affine.load %alloc_174[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_175[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_176 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_173[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.truncf %cst_39 : f64 to f32 + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_176[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_177 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_175[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_176[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.divf %88, %89 : f32 + affine.store %90, %alloc_177[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_178 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_177[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %20[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_178[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_179 = memref.alloc() {alignment = 64 : i64} : memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 
= affine.load %21[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_179[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_180 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_178[%c0, %arg4, %arg5] : memref<1x128x768xf32> + affine.store %88, %alloc_180[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_181 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_179[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_181[%arg3, %arg4, %arg5] : memref<1x768x768xf32> + } + } + } + %alloc_182 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_182 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_180[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_181[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_182[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_182[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_183 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_182[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %22[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_183[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %65 = bufferization.to_tensor %alloc_183 : memref<1x128x768xf32> + %expanded_184 = tensor.expand_shape %65 [[0], [1], 
[2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %66 = bufferization.to_memref %expanded_184 : memref<1x128x12x64xf32> + %alloc_185 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 12 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %66[%arg3, %arg4, %arg5, %arg6] : memref<1x128x12x64xf32> + affine.store %88, %alloc_185[%arg3, %arg5, %arg4, %arg6] : memref<1x12x128x64xf32> + } + } + } + } + %alloc_186 = memref.alloc() {alignment = 64 : i64} : memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %23[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_186[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_187 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_186[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_187[%arg3, %arg4, %arg5] : memref<1x768x768xf32> + } + } + } + %alloc_188 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_188 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_180[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_187[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_188[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_188[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_189 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_188[%c0, 
%arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %24[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_189[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %67 = bufferization.to_tensor %alloc_189 : memref<1x128x768xf32> + %expanded_190 = tensor.expand_shape %67 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %68 = bufferization.to_memref %expanded_190 : memref<1x128x12x64xf32> + %alloc_191 = memref.alloc() {alignment = 64 : i64} : memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %25[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_191[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_192 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_191[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_192[%arg3, %arg4, %arg5] : memref<1x768x768xf32> + } + } + } + %alloc_193 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_193 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_180[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_192[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_193[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_193[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_194 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_193[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = 
affine.load %26[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_194[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %69 = bufferization.to_tensor %alloc_194 : memref<1x128x768xf32> + %expanded_195 = tensor.expand_shape %69 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %70 = bufferization.to_memref %expanded_195 : memref<1x128x12x64xf32> + %alloc_196 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 12 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %70[%arg3, %arg4, %arg5, %arg6] : memref<1x128x12x64xf32> + affine.store %88, %alloc_196[%arg3, %arg5, %arg4, %arg6] : memref<1x12x128x64xf32> + } + } + } + } + %alloc_197 = memref.alloc() {alignment = 64 : i64} : memref<1x12x64x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 12 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %68[%arg3, %arg4, %arg5, %arg6] : memref<1x128x12x64xf32> + affine.store %88, %alloc_197[%arg3, %arg5, %arg6, %arg4] : memref<1x12x64x128xf32> + } + } + } + } + %alloc_198 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %alloc_185[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + affine.store %88, %alloc_198[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + } + } + } + } + %71 = bufferization.to_tensor %alloc_198 : memref<1x12x128x64xf32> + %alloc_199 = memref.alloc() {alignment = 64 : i64} : memref<1x12x64x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 64 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_197[%c0, %arg4, %arg5, %arg6] : memref<1x12x64x128xf32> + affine.store %88, %alloc_199[%arg3, %arg4, %arg5, %arg6] : 
memref<1x12x64x128xf32> + } + } + } + } + %72 = bufferization.to_tensor %alloc_199 : memref<1x12x64x128xf32> + %collapsed_200 = tensor.collapse_shape %71 [[0, 1], [2], [3]] : tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %73 = bufferization.to_memref %collapsed_200 : memref<12x128x64xf32> + %collapsed_201 = tensor.collapse_shape %72 [[0, 1], [2], [3]] : tensor<1x12x64x128xf32> into tensor<12x64x128xf32> + %74 = bufferization.to_memref %collapsed_201 : memref<12x64x128xf32> + %alloc_202 = memref.alloc() {alignment = 64 : i64} : memref<12x128x128xf32> + memref.copy %alloc_95, %alloc_202 : memref<12x128x128xf32> to memref<12x128x128xf32> + affine.for %arg3 = 0 to 12 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %73[%arg3, %arg4, %arg6] : memref<12x128x64xf32> + %89 = affine.load %74[%arg3, %arg6, %arg5] : memref<12x64x128xf32> + %90 = affine.load %alloc_202[%arg3, %arg4, %arg5] : memref<12x128x128xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_202[%arg3, %arg4, %arg5] : memref<12x128x128xf32> + } + } + } + } + %75 = bufferization.to_tensor %alloc_202 : memref<12x128x128xf32> + %expanded_203 = tensor.expand_shape %75 [[0, 1], [2], [3]] : tensor<12x128x128xf32> into tensor<1x12x128x128xf32> + %76 = bufferization.to_memref %expanded_203 : memref<1x12x128x128xf32> + %alloc_204 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %76[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = arith.divf %88, %cst_36 : f32 + affine.store %89, %alloc_204[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_205 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 
128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_99[%c0, %c0, %arg5, %arg6] : memref<1x1x128x128xi1> + %89 = affine.load %alloc_100[] : memref + %90 = affine.load %alloc_204[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %91 = arith.select %88, %89, %90 : f32 + affine.store %91, %alloc_205[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_206 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128xf32> + memref.copy %alloc_103, %alloc_206 : memref<1x12x128xf32> to memref<1x12x128xf32> + %alloc_207 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128xi64> + memref.copy %alloc_102, %alloc_207 : memref<1x12x128xi64> to memref<1x12x128xi64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_205[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %alloc_206[%arg3, %arg4, %arg5] : memref<1x12x128xf32> + %90 = affine.load %alloc_207[%arg3, %arg4, %arg5] : memref<1x12x128xi64> + %91 = arith.index_cast %arg6 : index to i64 + %92 = arith.maximumf %88, %89 : f32 + %93 = arith.cmpf ogt, %88, %89 : f32 + %94 = arith.select %93, %91, %90 : i64 + affine.store %92, %alloc_206[%arg3, %arg4, %arg5] : memref<1x12x128xf32> + affine.store %94, %alloc_207[%arg3, %arg4, %arg5] : memref<1x12x128xi64> + } + } + } + } + %77 = bufferization.to_tensor %alloc_206 : memref<1x12x128xf32> + %expanded_208 = tensor.expand_shape %77 [[0], [1], [2, 3]] : tensor<1x12x128xf32> into tensor<1x12x128x1xf32> + %78 = bufferization.to_memref %expanded_208 : memref<1x12x128x1xf32> + %alloc_209 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_205[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %78[%c0, %arg4, %arg5, %c0] : 
memref<1x12x128x1xf32> + %90 = arith.subf %88, %89 : f32 + affine.store %90, %alloc_209[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_210 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_209[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = math.exp %88 : f32 + affine.store %89, %alloc_210[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_211 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x1xf32> + memref.copy %alloc_109, %alloc_211 : memref<1x12x128x1xf32> to memref<1x12x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_210[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %alloc_211[%arg3, %arg4, %arg5, %c0] : memref<1x12x128x1xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_211[%arg3, %arg4, %arg5, %c0] : memref<1x12x128x1xf32> + } + } + } + } + %alloc_212 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_210[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + %89 = affine.load %alloc_211[%c0, %arg4, %arg5, %c0] : memref<1x12x128x1xf32> + %90 = arith.divf %88, %89 : f32 + affine.store %90, %alloc_212[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %alloc_213 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x128xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %alloc_212[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + affine.store %88, %alloc_213[%arg3, 
%arg4, %arg5, %arg6] : memref<1x12x128x128xf32> + } + } + } + } + %79 = bufferization.to_tensor %alloc_213 : memref<1x12x128x128xf32> + %alloc_214 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %alloc_196[%c0, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + affine.store %88, %alloc_214[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + } + } + } + } + %80 = bufferization.to_tensor %alloc_214 : memref<1x12x128x64xf32> + %collapsed_215 = tensor.collapse_shape %79 [[0, 1], [2], [3]] : tensor<1x12x128x128xf32> into tensor<12x128x128xf32> + %81 = bufferization.to_memref %collapsed_215 : memref<12x128x128xf32> + %collapsed_216 = tensor.collapse_shape %80 [[0, 1], [2], [3]] : tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %82 = bufferization.to_memref %collapsed_216 : memref<12x128x64xf32> + %alloc_217 = memref.alloc() {alignment = 64 : i64} : memref<12x128x64xf32> + memref.copy %alloc_116, %alloc_217 : memref<12x128x64xf32> to memref<12x128x64xf32> + affine.for %arg3 = 0 to 12 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 64 { + affine.for %arg6 = 0 to 128 { + %88 = affine.load %81[%arg3, %arg4, %arg6] : memref<12x128x128xf32> + %89 = affine.load %82[%arg3, %arg6, %arg5] : memref<12x128x64xf32> + %90 = affine.load %alloc_217[%arg3, %arg4, %arg5] : memref<12x128x64xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_217[%arg3, %arg4, %arg5] : memref<12x128x64xf32> + } + } + } + } + %83 = bufferization.to_tensor %alloc_217 : memref<12x128x64xf32> + %expanded_218 = tensor.expand_shape %83 [[0, 1], [2], [3]] : tensor<12x128x64xf32> into tensor<1x12x128x64xf32> + %84 = bufferization.to_memref %expanded_218 : memref<1x12x128x64xf32> + %alloc_219 = memref.alloc() {alignment = 64 : i64} : memref<1x128x12x64xf32> + affine.for %arg3 = 0 to 1 { + 
affine.for %arg4 = 0 to 12 { + affine.for %arg5 = 0 to 128 { + affine.for %arg6 = 0 to 64 { + %88 = affine.load %84[%arg3, %arg4, %arg5, %arg6] : memref<1x12x128x64xf32> + affine.store %88, %alloc_219[%arg3, %arg5, %arg4, %arg6] : memref<1x128x12x64xf32> + } + } + } + } + %85 = bufferization.to_tensor %alloc_219 : memref<1x128x12x64xf32> + %collapsed_220 = tensor.collapse_shape %85 [[0], [1], [2, 3]] : tensor<1x128x12x64xf32> into tensor<1x128x768xf32> + %86 = bufferization.to_memref %collapsed_220 : memref<1x128x768xf32> + %alloc_221 = memref.alloc() {alignment = 64 : i64} : memref<768x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %27[%arg3, %arg4] : memref<768x768xf32> + affine.store %88, %alloc_221[%arg4, %arg3] : memref<768x768xf32> + } + } + %alloc_222 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %86[%c0, %arg4, %arg5] : memref<1x128x768xf32> + affine.store %88, %alloc_222[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_223 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_221[%arg4, %arg5] : memref<768x768xf32> + affine.store %88, %alloc_223[%arg3, %arg4, %arg5] : memref<1x768x768xf32> + } + } + } + %alloc_224 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_224 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_222[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_223[%arg3, %arg6, %arg5] : memref<1x768x768xf32> + %90 = affine.load %alloc_224[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : 
f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_224[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_225 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_224[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %28[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_225[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_226 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_162[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_225[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_226[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_227 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + memref.copy %alloc_53, %alloc_227 : memref<1x128x1xf32> to memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_226[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_227[%arg3, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_227[%arg3, %arg4, %c0] : memref<1x128x1xf32> + } + } + } + %alloc_228 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_227[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.divf %88, %cst_38 : f32 + affine.store %89, %alloc_228[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_229 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 
128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_226[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = arith.extf %88 : f32 to f64 + affine.store %89, %alloc_229[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_230 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_230 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_229[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_230[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_230[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_231 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_230[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_37 : f64 + affine.store %89, %alloc_231[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_232 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_229[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_231[%c0, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.subf %88, %89 : f64 + affine.store %90, %alloc_232[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + %alloc_233 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_232[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_232[%c0, %arg4, %arg5] : memref<1x128x768xf64> + %90 = arith.mulf %88, %89 : f64 + affine.store %90, %alloc_233[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + } + } + } + 
%alloc_234 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + memref.copy %alloc_57, %alloc_234 : memref<1x128x1xf64> to memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_233[%arg3, %arg4, %arg5] : memref<1x128x768xf64> + %89 = affine.load %alloc_234[%arg3, %arg4, %c0] : memref<1x128x1xf64> + %90 = arith.addf %88, %89 : f64 + affine.store %90, %alloc_234[%arg3, %arg4, %c0] : memref<1x128x1xf64> + } + } + } + %alloc_235 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_234[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.divf %88, %cst_42 : f64 + affine.store %89, %alloc_235[%arg3, %arg4, %arg5] : memref<1x128x1xf64> + } + } + } + %alloc_236 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_235[%c0, %arg4, %c0] : memref<1x128x1xf64> + %89 = arith.truncf %88 : f64 to f32 + affine.store %89, %alloc_236[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_237 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_236[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = math.sqrt %88 : f32 + affine.store %89, %alloc_237[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_238 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_226[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_228[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.subf %88, %89 : f32 + affine.store %90, %alloc_238[%arg3, %arg4, %arg5] : 
memref<1x128x768xf32> + } + } + } + %alloc_239 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %29[%arg5] : memref<768xf32> + %89 = affine.load %alloc_238[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_239[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_240 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 1 { + %88 = affine.load %alloc_237[%c0, %arg4, %c0] : memref<1x128x1xf32> + %89 = arith.truncf %cst_39 : f64 to f32 + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_240[%arg3, %arg4, %arg5] : memref<1x128x1xf32> + } + } + } + %alloc_241 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_239[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_240[%c0, %arg4, %c0] : memref<1x128x1xf32> + %90 = arith.divf %88, %89 : f32 + affine.store %90, %alloc_241[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_242 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_241[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %30[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_242[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_243 = memref.alloc() {alignment = 64 : i64} : memref<768x3072xf32> + affine.for %arg3 = 0 to 3072 { + affine.for %arg4 = 0 to 768 { + %88 = affine.load %31[%arg3, %arg4] : memref<3072x768xf32> + affine.store %88, %alloc_243[%arg4, %arg3] : memref<768x3072xf32> + } + } + %alloc_244 = 
memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_242[%c0, %arg4, %arg5] : memref<1x128x768xf32> + affine.store %88, %alloc_244[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_245 = memref.alloc() {alignment = 64 : i64} : memref<1x768x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 768 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_243[%arg4, %arg5] : memref<768x3072xf32> + affine.store %88, %alloc_245[%arg3, %arg4, %arg5] : memref<1x768x3072xf32> + } + } + } + %alloc_246 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + memref.copy %alloc_146, %alloc_246 : memref<1x128x3072xf32> to memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + affine.for %arg6 = 0 to 768 { + %88 = affine.load %alloc_244[%arg3, %arg4, %arg6] : memref<1x128x768xf32> + %89 = affine.load %alloc_245[%arg3, %arg6, %arg5] : memref<1x768x3072xf32> + %90 = affine.load %alloc_246[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_246[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + } + %alloc_247 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_246[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = affine.load %32[%arg5] : memref<3072xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_247[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_248 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_247[%c0, %arg4, %arg5] : 
memref<1x128x3072xf32> + %89 = arith.mulf %88, %cst_35 : f32 + affine.store %89, %alloc_248[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_249 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_247[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = math.powf %88, %cst_34 : f32 + affine.store %89, %alloc_249[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_250 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_249[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = arith.truncf %cst_40 : f64 to f32 + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_250[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_251 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_247[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = affine.load %alloc_250[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_251[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_252 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_251[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = arith.truncf %cst_41 : f64 to f32 + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_252[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_253 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load 
%alloc_252[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = math.tanh %88 : f32 + affine.store %89, %alloc_253[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_254 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_253[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = arith.addf %88, %cst_33 : f32 + affine.store %89, %alloc_254[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_255 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_248[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %89 = affine.load %alloc_254[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + %90 = arith.mulf %88, %89 : f32 + affine.store %90, %alloc_255[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_256 = memref.alloc() {alignment = 64 : i64} : memref<3072x768xf32> + affine.for %arg3 = 0 to 768 { + affine.for %arg4 = 0 to 3072 { + %88 = affine.load %33[%arg3, %arg4] : memref<768x3072xf32> + affine.store %88, %alloc_256[%arg4, %arg3] : memref<3072x768xf32> + } + } + %alloc_257 = memref.alloc() {alignment = 64 : i64} : memref<1x128x3072xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 3072 { + %88 = affine.load %alloc_255[%c0, %arg4, %arg5] : memref<1x128x3072xf32> + affine.store %88, %alloc_257[%arg3, %arg4, %arg5] : memref<1x128x3072xf32> + } + } + } + %alloc_258 = memref.alloc() {alignment = 64 : i64} : memref<1x3072x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 3072 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_256[%arg4, %arg5] : memref<3072x768xf32> + affine.store %88, %alloc_258[%arg3, %arg4, %arg5] : memref<1x3072x768xf32> + } + } + } + %alloc_259 = memref.alloc() {alignment = 64 : i64} : 
memref<1x128x768xf32> + memref.copy %alloc_74, %alloc_259 : memref<1x128x768xf32> to memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + affine.for %arg6 = 0 to 3072 { + %88 = affine.load %alloc_257[%arg3, %arg4, %arg6] : memref<1x128x3072xf32> + %89 = affine.load %alloc_258[%arg3, %arg6, %arg5] : memref<1x3072x768xf32> + %90 = affine.load %alloc_259[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + %91 = arith.mulf %88, %89 : f32 + %92 = arith.addf %90, %91 : f32 + affine.store %92, %alloc_259[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + } + %alloc_260 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_259[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %34[%arg5] : memref<768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_260[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %alloc_261 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 128 { + affine.for %arg5 = 0 to 768 { + %88 = affine.load %alloc_226[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %89 = affine.load %alloc_260[%c0, %arg4, %arg5] : memref<1x128x768xf32> + %90 = arith.addf %88, %89 : f32 + affine.store %90, %alloc_261[%arg3, %arg4, %arg5] : memref<1x128x768xf32> + } + } + } + %87 = bufferization.to_tensor %alloc_261 : memref<1x128x768xf32> + return %87 : tensor<1x128x768xf32> + } +} + diff --git a/test/samples/bert/bert_linalg.mlir b/test/samples/bert/bert_linalg.mlir new file mode 100644 index 00000000..2a663edc --- /dev/null +++ b/test/samples/bert/bert_linalg.mlir @@ -0,0 +1,900 @@ +#map = affine_map<(d0, d1) -> (0, d1)> +#map1 = affine_map<(d0, d1) -> (d0, d1)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, 0, d3, d4, d5)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5) 
-> (d0, d1, d2, d3, d4, d5)> +#map4 = affine_map<(d0, d1, d2) -> (d0, d1)> +#map5 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map6 = affine_map<(d0, d1, d2) -> (0, d1, d2)> +#map7 = affine_map<(d0, d1, d2) -> (d0, d1, 0)> +#map8 = affine_map<(d0, d1, d2) -> (0, d1, 0)> +#map9 = affine_map<(d0, d1, d2) -> (d2)> +#map10 = affine_map<(d0, d1, d2) -> (d1, d2)> +#map11 = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)> +#map12 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map13 = affine_map<(d0, d1, d2, d3) -> (0, 0, d2, d3)> +#map14 = affine_map<() -> ()> +#map15 = affine_map<(d0, d1, d2, d3) -> ()> +#map16 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> +#map17 = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, 0)> +#map18 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, 0)> +module { + func.func @main(%arg0: tensor<1x512x768xf32>, %arg1: tensor<1x128xi64>, %arg2: tensor<1x128xi64>) -> tensor<1x128x768xf32> { + %cst = arith.constant dense_resource : tensor<30522x768xf32> + %c0_i64 = arith.constant 0 : i64 + %c3 = arith.constant 3 : index + %c30522 = arith.constant 30522 : index + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 0.000000e+00 : f64 + %cst_2 = arith.constant 0xFF800000 : f32 + %cst_3 = arith.constant 7.670000e+02 : f64 + %cst_4 = arith.constant 0.79788456080286541 : f64 + %cst_5 = arith.constant 4.471500e-02 : f64 + %cst_6 = arith.constant 9.9999999999999995E-7 : f64 + %cst_7 = arith.constant 7.680000e+02 : f32 + %cst_8 = arith.constant 7.680000e+02 : f64 + %cst_9 = arith.constant 8.000000e+00 : f32 + %cst_10 = arith.constant 5.000000e-01 : f32 + %cst_11 = arith.constant 3.000000e+00 : f32 + %cst_12 = arith.constant 1.000000e+00 : f32 + %cst_13 = arith.constant dense_resource : tensor<3x768xf32> + %cst_14 = arith.constant dense_resource : tensor<768xf32> + %cst_15 = arith.constant dense_resource : tensor<768xf32> + %cst_16 = arith.constant dense_resource : tensor<768x768xf32> + %cst_17 = arith.constant dense_resource : tensor<768xf32> + 
%cst_18 = arith.constant dense_resource : tensor<768x768xf32> + %cst_19 = arith.constant dense_resource : tensor<768xf32> + %cst_20 = arith.constant dense_resource : tensor<768x768xf32> + %cst_21 = arith.constant dense_resource : tensor<768xf32> + %cst_22 = arith.constant dense_resource : tensor<768x768xf32> + %cst_23 = arith.constant dense_resource : tensor<768xf32> + %cst_24 = arith.constant dense_resource : tensor<768xf32> + %cst_25 = arith.constant dense_resource : tensor<768xf32> + %cst_26 = arith.constant dense_resource : tensor<3072x768xf32> + %cst_27 = arith.constant dense_resource : tensor<3072xf32> + %cst_28 = arith.constant dense_resource : tensor<768x3072xf32> + %cst_29 = arith.constant dense_resource : tensor<768xf32> + %cst_30 = arith.constant dense_resource : tensor<768xf32> + %cst_31 = arith.constant dense_resource : tensor<768xf32> + %cst_32 = arith.constant dense_resource : tensor<768x768xf32> + %cst_33 = arith.constant dense_resource : tensor<768xf32> + %cst_34 = arith.constant dense_resource : tensor<768x768xf32> + %cst_35 = arith.constant dense_resource : tensor<768xf32> + %cst_36 = arith.constant dense_resource : tensor<768x768xf32> + %cst_37 = arith.constant dense_resource : tensor<768xf32> + %cst_38 = arith.constant dense_resource : tensor<768x768xf32> + %cst_39 = arith.constant dense_resource : tensor<768xf32> + %cst_40 = arith.constant dense_resource : tensor<768xf32> + %cst_41 = arith.constant dense_resource : tensor<768xf32> + %cst_42 = arith.constant dense_resource : tensor<3072x768xf32> + %cst_43 = arith.constant dense_resource : tensor<3072xf32> + %cst_44 = arith.constant dense_resource : tensor<768x3072xf32> + %cst_45 = arith.constant dense_resource : tensor<768xf32> + %cst_46 = arith.constant dense<-1.000000e+09> : tensor + %0 = tensor.empty() : tensor<1x128xi1> + %1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1 : tensor<1x128xi64>) outs(%0 : tensor<1x128xi1>) { + ^bb0(%in: 
i64, %out: i1): + %195 = arith.cmpi sgt, %in, %c0_i64 : i64 + linalg.yield %195 : i1 + } -> tensor<1x128xi1> + %expanded = tensor.expand_shape %1 [[0, 1], [2, 3, 4, 5]] : tensor<1x128xi1> into tensor<1x1x1x1x1x128xi1> + %2 = tensor.empty() : tensor<1x1x128x1x1x128xi1> + %3 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<1x1x1x1x1x128xi1>) outs(%2 : tensor<1x1x128x1x1x128xi1>) { + ^bb0(%in: i1, %out: i1): + linalg.yield %in : i1 + } -> tensor<1x1x128x1x1x128xi1> + %collapsed = tensor.collapse_shape %3 [[0], [1, 2], [3, 4, 5]] : tensor<1x1x128x1x1x128xi1> into tensor<1x128x128xi1> + %expanded_47 = tensor.expand_shape %collapsed [[0], [1, 2], [3]] : tensor<1x128x128xi1> into tensor<1x1x128x128xi1> + %4 = tensor.empty() : tensor<1x128x768xf32> + %5 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg1 : tensor<1x128xi64>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: i64, %out: f32): + %195 = arith.index_cast %in : i64 to index + %196 = linalg.index 2 : index + %197 = arith.cmpi slt, %195, %c30522 : index + cf.assert %197, "index must be smaller than dim size" + %198 = arith.cmpi sge, %in, %c0_i64 : i64 + cf.assert %198, "index must be larger or equal to 0" + %extracted = tensor.extract %cst[%195, %196] : tensor<30522x768xf32> + linalg.yield %extracted : f32 + } -> tensor<1x128x768xf32> + %extracted_slice = tensor.extract_slice %arg0[0, 0, 0] [1, 128, 768] [1, 1, 1] : tensor<1x512x768xf32> to tensor<1x128x768xf32> + %6 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %extracted_slice : tensor<1x128x768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %7 = linalg.generic 
{indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg2 : tensor<1x128xi64>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: i64, %out: f32): + %195 = arith.index_cast %in : i64 to index + %196 = linalg.index 2 : index + %197 = arith.cmpi slt, %195, %c3 : index + cf.assert %197, "index must be smaller than dim size" + %198 = arith.cmpi sge, %in, %c0_i64 : i64 + cf.assert %198, "index must be larger or equal to 0" + %extracted = tensor.extract %cst_13[%195, %196] : tensor<3x768xf32> + linalg.yield %extracted : f32 + } -> tensor<1x128x768xf32> + %8 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%6, %7 : tensor<1x128x768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %9 = tensor.empty() : tensor<1x128x1xf32> + %10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<1x128x1xf32>) -> tensor<1x128x1xf32> + %11 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%8 : tensor<1x128x768xf32>) outs(%10 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %out : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %12 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%11 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.divf %in, %cst_7 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %13 = tensor.empty() : tensor<1x128x768xf64> + %14 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8 : tensor<1x128x768xf32>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f32, %out: f64): + %195 = arith.extf %in : f32 to f64 + linalg.yield %195 : f64 + } -> 
tensor<1x128x768xf64> + %15 = tensor.empty() : tensor<1x128x1xf64> + %16 = linalg.fill ins(%cst_1 : f64) outs(%15 : tensor<1x128x1xf64>) -> tensor<1x128x1xf64> + %17 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%14 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %18 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_8 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %19 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%14, %18 : tensor<1x128x768xf64>, tensor<1x128x1xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.subf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %20 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%19, %19 : tensor<1x128x768xf64>, tensor<1x128x768xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.mulf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %21 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%20 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %22 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%21 : tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_3 : f64 + 
linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %23 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%22 : tensor<1x128x1xf64>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f64, %out: f32): + %195 = arith.truncf %in : f64 to f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %24 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%23 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.sqrt %in : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %25 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8, %12 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.subf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %26 = linalg.generic {indexing_maps = [#map9, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_14, %25 : tensor<768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.mulf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %27 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%24 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_6 : f64 to f32 + %196 = arith.addf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x1xf32> + %28 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%26, %27 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.divf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> 
tensor<1x128x768xf32> + %29 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%28, %cst_15 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %30 = tensor.empty() : tensor<768x768xf32> + %transposed = linalg.transpose ins(%cst_16 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %31 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%29 : tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x128x768xf32> + %32 = tensor.empty() : tensor<1x768x768xf32> + %33 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %34 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %35 = linalg.batch_matmul ins(%31, %33 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %36 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%35, %cst_17 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %expanded_48 = tensor.expand_shape %36 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %37 = tensor.empty() : tensor<1x12x128x64xf32> + %transposed_49 = linalg.transpose ins(%expanded_48 : tensor<1x128x12x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) permutation = 
[0, 2, 1, 3] + %transposed_50 = linalg.transpose ins(%cst_18 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %38 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_50 : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %39 = linalg.batch_matmul ins(%31, %38 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %40 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%39, %cst_19 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %expanded_51 = tensor.expand_shape %40 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %transposed_52 = linalg.transpose ins(%cst_20 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %41 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_52 : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %42 = linalg.batch_matmul ins(%31, %41 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %43 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%42, %cst_21 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %expanded_53 = tensor.expand_shape %43 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into 
tensor<1x128x12x64xf32> + %transposed_54 = linalg.transpose ins(%expanded_53 : tensor<1x128x12x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) permutation = [0, 2, 1, 3] + %44 = tensor.empty() : tensor<1x12x64x128xf32> + %transposed_55 = linalg.transpose ins(%expanded_51 : tensor<1x128x12x64xf32>) outs(%44 : tensor<1x12x64x128xf32>) permutation = [0, 2, 3, 1] + %45 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed_49 : tensor<1x12x128x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x128x64xf32> + %46 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed_55 : tensor<1x12x64x128xf32>) outs(%44 : tensor<1x12x64x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x64x128xf32> + %collapsed_56 = tensor.collapse_shape %45 [[0, 1], [2], [3]] : tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %collapsed_57 = tensor.collapse_shape %46 [[0, 1], [2], [3]] : tensor<1x12x64x128xf32> into tensor<12x64x128xf32> + %47 = tensor.empty() : tensor<12x128x128xf32> + %48 = linalg.fill ins(%cst_0 : f32) outs(%47 : tensor<12x128x128xf32>) -> tensor<12x128x128xf32> + %49 = linalg.batch_matmul ins(%collapsed_56, %collapsed_57 : tensor<12x128x64xf32>, tensor<12x64x128xf32>) outs(%48 : tensor<12x128x128xf32>) -> tensor<12x128x128xf32> + %expanded_58 = tensor.expand_shape %49 [[0, 1], [2], [3]] : tensor<12x128x128xf32> into tensor<1x12x128x128xf32> + %50 = tensor.empty() : tensor<1x12x128x128xf32> + %51 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_58 : tensor<1x12x128x128xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.divf %in, %cst_9 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + 
%52 = tensor.empty() : tensor<1x1x128x128xi1> + %53 = linalg.generic {indexing_maps = [#map13, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_47 : tensor<1x1x128x128xi1>) outs(%52 : tensor<1x1x128x128xi1>) { + ^bb0(%in: i1, %out: i1): + %195 = arith.extui %in : i1 to i64 + %196 = arith.cmpi eq, %195, %c0_i64 : i64 + linalg.yield %196 : i1 + } -> tensor<1x1x128x128xi1> + %54 = tensor.empty() : tensor + %55 = linalg.generic {indexing_maps = [#map14, #map14], iterator_types = []} ins(%cst_46 : tensor) outs(%54 : tensor) { + ^bb0(%in: f64, %out: f32): + %195 = arith.truncf %in : f64 to f32 + linalg.yield %195 : f32 + } -> tensor + %56 = linalg.generic {indexing_maps = [#map13, #map15, #map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %55, %51 : tensor<1x1x128x128xi1>, tensor, tensor<1x12x128x128xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: i1, %in_89: f32, %in_90: f32, %out: f32): + %195 = arith.select %in, %in_89, %in_90 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %57 = tensor.empty() : tensor<1x12x128xi64> + %58 = linalg.fill ins(%c0_i64 : i64) outs(%57 : tensor<1x12x128xi64>) -> tensor<1x12x128xi64> + %59 = tensor.empty() : tensor<1x12x128xf32> + %60 = linalg.fill ins(%cst_2 : f32) outs(%59 : tensor<1x12x128xf32>) -> tensor<1x12x128xf32> + %61:2 = linalg.generic {indexing_maps = [#map12, #map16, #map16], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%56 : tensor<1x12x128x128xf32>) outs(%60, %58 : tensor<1x12x128xf32>, tensor<1x12x128xi64>) { + ^bb0(%in: f32, %out: f32, %out_89: i64): + %195 = linalg.index 3 : index + %196 = arith.index_cast %195 : index to i64 + %197 = arith.maximumf %in, %out : f32 + %198 = arith.cmpf ogt, %in, %out : f32 + %199 = arith.select %198, %196, %out_89 : i64 + linalg.yield %197, %199 : f32, i64 + } -> (tensor<1x12x128xf32>, tensor<1x12x128xi64>) + %expanded_59 = tensor.expand_shape %61#0 [[0], 
[1], [2, 3]] : tensor<1x12x128xf32> into tensor<1x12x128x1xf32> + %62 = linalg.generic {indexing_maps = [#map11, #map17, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%56, %expanded_59 : tensor<1x12x128x128xf32>, tensor<1x12x128x1xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.subf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %63 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%62 : tensor<1x12x128x128xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.exp %in : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %64 = tensor.empty() : tensor<1x12x128x1xf32> + %65 = linalg.fill ins(%cst_0 : f32) outs(%64 : tensor<1x12x128x1xf32>) -> tensor<1x12x128x1xf32> + %66 = linalg.generic {indexing_maps = [#map12, #map18], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%63 : tensor<1x12x128x128xf32>) outs(%65 : tensor<1x12x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %out : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x1xf32> + %67 = linalg.generic {indexing_maps = [#map11, #map17, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%63, %66 : tensor<1x12x128x128xf32>, tensor<1x12x128x1xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.divf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %68 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%67 : tensor<1x12x128x128xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x128x128xf32> + %69 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", 
"parallel", "parallel"]} ins(%transposed_54 : tensor<1x12x128x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x128x64xf32> + %collapsed_60 = tensor.collapse_shape %68 [[0, 1], [2], [3]] : tensor<1x12x128x128xf32> into tensor<12x128x128xf32> + %collapsed_61 = tensor.collapse_shape %69 [[0, 1], [2], [3]] : tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %70 = tensor.empty() : tensor<12x128x64xf32> + %71 = linalg.fill ins(%cst_0 : f32) outs(%70 : tensor<12x128x64xf32>) -> tensor<12x128x64xf32> + %72 = linalg.batch_matmul ins(%collapsed_60, %collapsed_61 : tensor<12x128x128xf32>, tensor<12x128x64xf32>) outs(%71 : tensor<12x128x64xf32>) -> tensor<12x128x64xf32> + %expanded_62 = tensor.expand_shape %72 [[0, 1], [2], [3]]: tensor<12x128x64xf32> into tensor<1x12x128x64xf32> + %73 = tensor.empty() : tensor<1x128x12x64xf32> + %transposed_63 = linalg.transpose ins(%expanded_62 : tensor<1x12x128x64xf32>) outs(%73 : tensor<1x128x12x64xf32>) permutation = [0, 2, 1, 3] + %collapsed_64 = tensor.collapse_shape %transposed_63 [[0], [1], [2, 3]] : tensor<1x128x12x64xf32> into tensor<1x128x768xf32> + %transposed_65 = linalg.transpose ins(%cst_22 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %74 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_64 : tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x128x768xf32> + %75 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_65 : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %76 = linalg.batch_matmul ins(%74, %75 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %77 = 
linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%76, %cst_23 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %78 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8, %77 : tensor<1x128x768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %79 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%78 : tensor<1x128x768xf32>) outs(%10 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %out : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %80 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%79 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.divf %in, %cst_7 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %81 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%78 : tensor<1x128x768xf32>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f32, %out: f64): + %195 = arith.extf %in : f32 to f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %82 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%81 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %83 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%82 : 
tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_8 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %84 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%81, %83 : tensor<1x128x768xf64>, tensor<1x128x1xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.subf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %85 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%84, %84 : tensor<1x128x768xf64>, tensor<1x128x768xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.mulf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %86 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%85 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %87 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%86 : tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_3 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %88 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%87 : tensor<1x128x1xf64>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f64, %out: f32): + %195 = arith.truncf %in : f64 to f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %89 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%88 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.sqrt %in : f32 + 
linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %90 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%78, %80 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.subf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %91 = linalg.generic {indexing_maps = [#map9, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_24, %90 : tensor<768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.mulf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %92 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%89 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_6 : f64 to f32 + %196 = arith.addf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x1xf32> + %93 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%91, %92 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.divf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %94 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%93, %cst_25 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %95 = tensor.empty() : tensor<768x3072xf32> + %transposed_66 = linalg.transpose ins(%cst_26 : tensor<3072x768xf32>) outs(%95 : tensor<768x3072xf32>) permutation = [1, 0] + %96 = linalg.generic {indexing_maps = [#map6, 
#map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%94 : tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x128x768xf32> + %97 = tensor.empty() : tensor<1x768x3072xf32> + %98 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_66 : tensor<768x3072xf32>) outs(%97 : tensor<1x768x3072xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x3072xf32> + %99 = tensor.empty() : tensor<1x128x3072xf32> + %100 = linalg.fill ins(%cst_0 : f32) outs(%99 : tensor<1x128x3072xf32>) -> tensor<1x128x3072xf32> + %101 = linalg.batch_matmul ins(%96, %98 : tensor<1x128x768xf32>, tensor<1x768x3072xf32>) outs(%100 : tensor<1x128x3072xf32>) -> tensor<1x128x3072xf32> + %102 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%101, %cst_27 : tensor<1x128x3072xf32>, tensor<3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %103 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%102 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.mulf %in, %cst_10 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %104 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%102 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.powf %in, %cst_11 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %105 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%104 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + 
^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_5 : f64 to f32 + %196 = arith.mulf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x3072xf32> + %106 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%102, %105 : tensor<1x128x3072xf32>, tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %107 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%106 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_4 : f64 to f32 + %196 = arith.mulf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x3072xf32> + %108 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%107 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.tanh %in : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %109 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%108 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %cst_12 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %110 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%103, %109 : tensor<1x128x3072xf32>, tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.mulf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %111 = tensor.empty() : tensor<3072x768xf32> + %transposed_67 = linalg.transpose ins(%cst_28 : tensor<768x3072xf32>) outs(%111 : tensor<3072x768xf32>) permutation = [1, 
0] + %112 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%110 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x128x3072xf32> + %113 = tensor.empty() : tensor<1x3072x768xf32> + %114 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_67 : tensor<3072x768xf32>) outs(%113 : tensor<1x3072x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x3072x768xf32> + %115 = linalg.batch_matmul ins(%112, %114 : tensor<1x128x3072xf32>, tensor<1x3072x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %116 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%115, %cst_29 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %117 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%78, %116 : tensor<1x128x768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %118 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%117 : tensor<1x128x768xf32>) outs(%10 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %out : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %119 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%118 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.divf %in, %cst_7 : f32 + 
linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %120 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%117 : tensor<1x128x768xf32>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f32, %out: f64): + %195 = arith.extf %in : f32 to f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %121 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%120 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %122 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%121 : tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_8 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %123 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%120, %122 : tensor<1x128x768xf64>, tensor<1x128x1xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.subf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %124 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%123, %123 : tensor<1x128x768xf64>, tensor<1x128x768xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.mulf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %125 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%124 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %126 = linalg.generic {indexing_maps = 
[#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%125 : tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_3 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %127 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%126 : tensor<1x128x1xf64>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f64, %out: f32): + %195 = arith.truncf %in : f64 to f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %128 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%127 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.sqrt %in : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %129 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%117, %119 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.subf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %130 = linalg.generic {indexing_maps = [#map9, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_30, %129 : tensor<768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.mulf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %131 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%128 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_6 : f64 to f32 + %196 = arith.addf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x1xf32> + %132 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} 
ins(%130, %131 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.divf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %133 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%132, %cst_31 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %transposed_68 = linalg.transpose ins(%cst_32 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %134 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%133 : tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x128x768xf32> + %135 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_68 : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %136 = linalg.batch_matmul ins(%134, %135 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %137 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%136, %cst_33 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %expanded_69 = tensor.expand_shape %137 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %transposed_70 = linalg.transpose ins(%expanded_69 : tensor<1x128x12x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) permutation = [0, 2, 1, 3] 
+ %transposed_71 = linalg.transpose ins(%cst_34 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %138 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_71 : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %139 = linalg.batch_matmul ins(%134, %138 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %140 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%139, %cst_35 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %expanded_72 = tensor.expand_shape %140 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32> + %transposed_73 = linalg.transpose ins(%cst_36 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %141 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_73 : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %142 = linalg.batch_matmul ins(%134, %141 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %143 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%142, %cst_37 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %expanded_74 = tensor.expand_shape %143 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into 
tensor<1x128x12x64xf32> + %transposed_75 = linalg.transpose ins(%expanded_74 : tensor<1x128x12x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) permutation = [0, 2, 1, 3] + %transposed_76 = linalg.transpose ins(%expanded_72 : tensor<1x128x12x64xf32>) outs(%44 : tensor<1x12x64x128xf32>) permutation = [0, 2, 3, 1] + %144 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed_70 : tensor<1x12x128x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x128x64xf32> + %145 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed_76 : tensor<1x12x64x128xf32>) outs(%44 : tensor<1x12x64x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x64x128xf32> + %collapsed_77 = tensor.collapse_shape %144 [[0, 1], [2], [3]] : tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %collapsed_78 = tensor.collapse_shape %145 [[0, 1], [2], [3]] : tensor<1x12x64x128xf32> into tensor<12x64x128xf32> + %146 = linalg.batch_matmul ins(%collapsed_77, %collapsed_78 : tensor<12x128x64xf32>, tensor<12x64x128xf32>) outs(%48 : tensor<12x128x128xf32>) -> tensor<12x128x128xf32> + %expanded_79 = tensor.expand_shape %146 [[0, 1], [2], [3]] : tensor<12x128x128xf32> into tensor<1x12x128x128xf32> + %147 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_79 : tensor<1x12x128x128xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.divf %in, %cst_9 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %148 = linalg.generic {indexing_maps = [#map13, #map15, #map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %55, %147 : tensor<1x1x128x128xi1>, tensor, tensor<1x12x128x128xf32>) outs(%50 : 
tensor<1x12x128x128xf32>) { + ^bb0(%in: i1, %in_89: f32, %in_90: f32, %out: f32): + %195 = arith.select %in, %in_89, %in_90 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %149:2 = linalg.generic {indexing_maps = [#map12, #map16, #map16], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%148 : tensor<1x12x128x128xf32>) outs(%60, %58 : tensor<1x12x128xf32>, tensor<1x12x128xi64>) { + ^bb0(%in: f32, %out: f32, %out_89: i64): + %195 = linalg.index 3 : index + %196 = arith.index_cast %195 : index to i64 + %197 = arith.maximumf %in, %out : f32 + %198 = arith.cmpf ogt, %in, %out : f32 + %199 = arith.select %198, %196, %out_89 : i64 + linalg.yield %197, %199 : f32, i64 + } -> (tensor<1x12x128xf32>, tensor<1x12x128xi64>) + %expanded_80 = tensor.expand_shape %149#0 [[0], [1], [2, 3]] : tensor<1x12x128xf32> into tensor<1x12x128x1xf32> + %150 = linalg.generic {indexing_maps = [#map11, #map17, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%148, %expanded_80 : tensor<1x12x128x128xf32>, tensor<1x12x128x1xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.subf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %151 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%150 : tensor<1x12x128x128xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.exp %in : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %152 = linalg.generic {indexing_maps = [#map12, #map18], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%151 : tensor<1x12x128x128xf32>) outs(%65 : tensor<1x12x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %out : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x1xf32> + %153 = linalg.generic {indexing_maps = [#map11, #map17, #map12], iterator_types = ["parallel", 
"parallel", "parallel", "parallel"]} ins(%151, %152 : tensor<1x12x128x128xf32>, tensor<1x12x128x1xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.divf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x12x128x128xf32> + %154 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%153 : tensor<1x12x128x128xf32>) outs(%50 : tensor<1x12x128x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x128x128xf32> + %155 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed_75 : tensor<1x12x128x64xf32>) outs(%37 : tensor<1x12x128x64xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x12x128x64xf32> + %collapsed_81 = tensor.collapse_shape %154 [[0, 1], [2], [3]] : tensor<1x12x128x128xf32> into tensor<12x128x128xf32> + %collapsed_82 = tensor.collapse_shape %155 [[0, 1], [2], [3]] : tensor<1x12x128x64xf32> into tensor<12x128x64xf32> + %156 = linalg.batch_matmul ins(%collapsed_81, %collapsed_82 : tensor<12x128x128xf32>, tensor<12x128x64xf32>) outs(%71 : tensor<12x128x64xf32>) -> tensor<12x128x64xf32> + %expanded_83 = tensor.expand_shape %156 [[0, 1], [2], [3]] : tensor<12x128x64xf32> into tensor<1x12x128x64xf32> + %transposed_84 = linalg.transpose ins(%expanded_83 : tensor<1x12x128x64xf32>) outs(%73 : tensor<1x128x12x64xf32>) permutation = [0, 2, 1, 3] + %collapsed_85 = tensor.collapse_shape %transposed_84 [[0], [1], [2, 3]] : tensor<1x128x12x64xf32> into tensor<1x128x768xf32> + %transposed_86 = linalg.transpose ins(%cst_38 : tensor<768x768xf32>) outs(%30 : tensor<768x768xf32>) permutation = [1, 0] + %157 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_85 : tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %out: f32): + 
linalg.yield %in : f32 + } -> tensor<1x128x768xf32> + %158 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_86 : tensor<768x768xf32>) outs(%32 : tensor<1x768x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x768xf32> + %159 = linalg.batch_matmul ins(%157, %158 : tensor<1x128x768xf32>, tensor<1x768x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %160 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%159, %cst_39 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %161 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%117, %160 : tensor<1x128x768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %162 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%161 : tensor<1x128x768xf32>) outs(%10 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %out : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %163 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%162 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.divf %in, %cst_7 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %164 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%161 : tensor<1x128x768xf32>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f32, %out: f64): + %195 = 
arith.extf %in : f32 to f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %165 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%164 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %166 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%165 : tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_8 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %167 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%164, %166 : tensor<1x128x768xf64>, tensor<1x128x1xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.subf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %168 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%167, %167 : tensor<1x128x768xf64>, tensor<1x128x768xf64>) outs(%13 : tensor<1x128x768xf64>) { + ^bb0(%in: f64, %in_89: f64, %out: f64): + %195 = arith.mulf %in, %in_89 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x768xf64> + %169 = linalg.generic {indexing_maps = [#map5, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%168 : tensor<1x128x768xf64>) outs(%16 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.addf %in, %out : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %170 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%169 : tensor<1x128x1xf64>) outs(%15 : tensor<1x128x1xf64>) { + ^bb0(%in: f64, %out: f64): + %195 = arith.divf %in, %cst_3 : f64 + linalg.yield %195 : f64 + } -> tensor<1x128x1xf64> + %171 = 
linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%170 : tensor<1x128x1xf64>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f64, %out: f32): + %195 = arith.truncf %in : f64 to f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %172 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%171 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.sqrt %in : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x1xf32> + %173 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%161, %163 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.subf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %174 = linalg.generic {indexing_maps = [#map9, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_40, %173 : tensor<768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.mulf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %175 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%172 : tensor<1x128x1xf32>) outs(%9 : tensor<1x128x1xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_6 : f64 to f32 + %196 = arith.addf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x1xf32> + %176 = linalg.generic {indexing_maps = [#map6, #map8, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%174, %175 : tensor<1x128x768xf32>, tensor<1x128x1xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.divf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %177 = linalg.generic 
{indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%176, %cst_41 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %transposed_87 = linalg.transpose ins(%cst_42 : tensor<3072x768xf32>) outs(%95 : tensor<768x3072xf32>) permutation = [1, 0] + %178 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%177 : tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x128x768xf32> + %179 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_87 : tensor<768x3072xf32>) outs(%97 : tensor<1x768x3072xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x768x3072xf32> + %180 = linalg.batch_matmul ins(%178, %179 : tensor<1x128x768xf32>, tensor<1x768x3072xf32>) outs(%100 : tensor<1x128x3072xf32>) -> tensor<1x128x3072xf32> + %181 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%180, %cst_43 : tensor<1x128x3072xf32>, tensor<3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %182 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%181 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.mulf %in, %cst_10 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %183 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%181 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + 
^bb0(%in: f32, %out: f32): + %195 = math.powf %in, %cst_11 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %184 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%183 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_5 : f64 to f32 + %196 = arith.mulf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x3072xf32> + %185 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%181, %184 : tensor<1x128x3072xf32>, tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %186 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%185 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.truncf %cst_4 : f64 to f32 + %196 = arith.mulf %in, %195 : f32 + linalg.yield %196 : f32 + } -> tensor<1x128x3072xf32> + %187 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%186 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = math.tanh %in : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %188 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%187 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + %195 = arith.addf %in, %cst_12 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %189 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%182, %188 : tensor<1x128x3072xf32>, tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { 
+ ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.mulf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x3072xf32> + %transposed_88 = linalg.transpose ins(%cst_44 : tensor<768x3072xf32>) outs(%111 : tensor<3072x768xf32>) permutation = [1, 0] + %190 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%189 : tensor<1x128x3072xf32>) outs(%99 : tensor<1x128x3072xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x128x3072xf32> + %191 = linalg.generic {indexing_maps = [#map10, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%transposed_88 : tensor<3072x768xf32>) outs(%113 : tensor<1x3072x768xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x3072x768xf32> + %192 = linalg.batch_matmul ins(%190, %191 : tensor<1x128x3072xf32>, tensor<1x3072x768xf32>) outs(%34 : tensor<1x128x768xf32>) -> tensor<1x128x768xf32> + %193 = linalg.generic {indexing_maps = [#map6, #map9, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%192, %cst_45 : tensor<1x128x768xf32>, tensor<768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + %194 = linalg.generic {indexing_maps = [#map6, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%161, %193 : tensor<1x128x768xf32>, tensor<1x128x768xf32>) outs(%4 : tensor<1x128x768xf32>) { + ^bb0(%in: f32, %in_89: f32, %out: f32): + %195 = arith.addf %in, %in_89 : f32 + linalg.yield %195 : f32 + } -> tensor<1x128x768xf32> + return %194 : tensor<1x128x768xf32> + } +} \ No newline at end of file diff --git a/test/samples/lenet/lenet_affine.mlir b/test/samples/lenet/lenet_affine.mlir new file mode 100644 index 00000000..de0bb4e4 --- /dev/null +++ b/test/samples/lenet/lenet_affine.mlir @@ -0,0 +1,250 @@ +#map = affine_map<(d0, d1) -> (d0 * 2 + d1)> +module 
{ + func.func @main(%arg0: tensor<1x3x32x32xf32>) -> tensor<1x10xf32> { + %c0 = arith.constant 0 : index + %cst = arith.constant dense_resource : tensor<10xf32> + %cst_0 = arith.constant dense_resource : tensor<10x84xf32> + %cst_1 = arith.constant dense_resource : tensor<84xf32> + %cst_2 = arith.constant dense_resource : tensor<84x120xf32> + %cst_3 = arith.constant dense_resource : tensor<120xf32> + %cst_4 = arith.constant dense_resource : tensor<120x400xf32> + %cst_5 = arith.constant dense_resource : tensor<16x6x5x5xf32> + %cst_6 = arith.constant 0.000000e+00 : f32 + %cst_7 = arith.constant dense_resource : tensor<6x3x5x5xf32> + %0 = bufferization.to_memref %arg0 : memref<1x3x32x32xf32> + %1 = bufferization.to_memref %cst_7 : memref<6x3x5x5xf32> + %2 = bufferization.to_memref %cst_5 : memref<16x6x5x5xf32> + %3 = bufferization.to_memref %cst_4 : memref<120x400xf32> + %4 = bufferization.to_memref %cst_3 : memref<120xf32> + %5 = bufferization.to_memref %cst_2 : memref<84x120xf32> + %6 = bufferization.to_memref %cst_1 : memref<84xf32> + %7 = bufferization.to_memref %cst_0 : memref<10x84xf32> + %8 = bufferization.to_memref %cst : memref<10xf32> + %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x6x14x14xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 6 { + affine.for %arg3 = 0 to 14 { + affine.for %arg4 = 0 to 14 { + affine.store %cst_6, %alloc[%arg1, %arg2, %arg3, %arg4] : memref<1x6x14x14xf32> + } + } + } + } + %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<1x6x14x14xf32> + memref.copy %alloc, %alloc_8 : memref<1x6x14x14xf32> to memref<1x6x14x14xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 6 { + affine.for %arg3 = 0 to 14 { + affine.for %arg4 = 0 to 14 { + affine.for %arg5 = 0 to 3 { + affine.for %arg6 = 0 to 5 { + affine.for %arg7 = 0 to 5 { + %12 = affine.apply #map(%arg3, %arg6) + %13 = affine.apply #map(%arg4, %arg7) + %14 = affine.load %0[%arg1, %arg5, %12, %13] : memref<1x3x32x32xf32> + %15 = affine.load %1[%arg2, 
%arg5, %arg6, %arg7] : memref<6x3x5x5xf32> + %16 = affine.load %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<1x6x14x14xf32> + %17 = arith.mulf %14, %15 : f32 + %18 = arith.addf %16, %17 : f32 + affine.store %18, %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<1x6x14x14xf32> + } + } + } + } + } + } + } + %alloc_9 = memref.alloc() {alignment = 64 : i64} : memref<1x6x14x14xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 6 { + affine.for %arg3 = 0 to 14 { + affine.for %arg4 = 0 to 14 { + %12 = affine.load %alloc_8[%c0, %arg2, %arg3, %arg4] : memref<1x6x14x14xf32> + %13 = arith.cmpf ugt, %12, %cst_6 : f32 + %14 = arith.select %13, %12, %cst_6 : f32 + affine.store %14, %alloc_9[%arg1, %arg2, %arg3, %arg4] : memref<1x6x14x14xf32> + } + } + } + } + %alloc_10 = memref.alloc() {alignment = 64 : i64} : memref<1x16x5x5xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 16 { + affine.for %arg3 = 0 to 5 { + affine.for %arg4 = 0 to 5 { + affine.store %cst_6, %alloc_10[%arg1, %arg2, %arg3, %arg4] : memref<1x16x5x5xf32> + } + } + } + } + %alloc_11 = memref.alloc() {alignment = 64 : i64} : memref<1x16x5x5xf32> + memref.copy %alloc_10, %alloc_11 : memref<1x16x5x5xf32> to memref<1x16x5x5xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 16 { + affine.for %arg3 = 0 to 5 { + affine.for %arg4 = 0 to 5 { + affine.for %arg5 = 0 to 6 { + affine.for %arg6 = 0 to 5 { + affine.for %arg7 = 0 to 5 { + %12 = affine.apply #map(%arg3, %arg6) + %13 = affine.apply #map(%arg4, %arg7) + %14 = affine.load %alloc_9[%arg1, %arg5, %12, %13] : memref<1x6x14x14xf32> + %15 = affine.load %2[%arg2, %arg5, %arg6, %arg7] : memref<16x6x5x5xf32> + %16 = affine.load %alloc_11[%arg1, %arg2, %arg3, %arg4] : memref<1x16x5x5xf32> + %17 = arith.mulf %14, %15 : f32 + %18 = arith.addf %16, %17 : f32 + affine.store %18, %alloc_11[%arg1, %arg2, %arg3, %arg4] : memref<1x16x5x5xf32> + } + } + } + } + } + } + } + %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<1x16x5x5xf32> + 
affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 16 { + affine.for %arg3 = 0 to 5 { + affine.for %arg4 = 0 to 5 { + %12 = affine.load %alloc_11[%c0, %arg2, %arg3, %arg4] : memref<1x16x5x5xf32> + %13 = arith.cmpf ugt, %12, %cst_6 : f32 + %14 = arith.select %13, %12, %cst_6 : f32 + affine.store %14, %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<1x16x5x5xf32> + } + } + } + } + %9 = bufferization.to_tensor %alloc_12 : memref<1x16x5x5xf32> + %collapsed = tensor.collapse_shape %9 [[0], [1, 2, 3]] : tensor<1x16x5x5xf32> into tensor<1x400xf32> + %10 = bufferization.to_memref %collapsed : memref<1x400xf32> + %alloc_13 = memref.alloc() {alignment = 64 : i64} : memref<400x120xf32> + affine.for %arg1 = 0 to 120 { + affine.for %arg2 = 0 to 400 { + %12 = affine.load %3[%arg1, %arg2] : memref<120x400xf32> + affine.store %12, %alloc_13[%arg2, %arg1] : memref<400x120xf32> + } + } + %alloc_14 = memref.alloc() {alignment = 64 : i64} : memref<1x120xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 120 { + affine.store %cst_6, %alloc_14[%arg1, %arg2] : memref<1x120xf32> + } + } + %alloc_15 = memref.alloc() {alignment = 64 : i64} : memref<1x120xf32> + memref.copy %alloc_14, %alloc_15 : memref<1x120xf32> to memref<1x120xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 120 { + affine.for %arg3 = 0 to 400 { + %12 = affine.load %10[%arg1, %arg3] : memref<1x400xf32> + %13 = affine.load %alloc_13[%arg3, %arg2] : memref<400x120xf32> + %14 = affine.load %alloc_15[%arg1, %arg2] : memref<1x120xf32> + %15 = arith.mulf %12, %13 : f32 + %16 = arith.addf %14, %15 : f32 + affine.store %16, %alloc_15[%arg1, %arg2] : memref<1x120xf32> + } + } + } + %alloc_16 = memref.alloc() {alignment = 64 : i64} : memref<1x120xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 120 { + %12 = affine.load %alloc_15[%c0, %arg2] : memref<1x120xf32> + %13 = affine.load %4[%arg2] : memref<120xf32> + %14 = arith.addf %12, %13 : f32 + affine.store %14, %alloc_16[%arg1, %arg2] : 
memref<1x120xf32> + } + } + %alloc_17 = memref.alloc() {alignment = 64 : i64} : memref<1x120xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 120 { + %12 = affine.load %alloc_16[%c0, %arg2] : memref<1x120xf32> + %13 = arith.cmpf ugt, %12, %cst_6 : f32 + %14 = arith.select %13, %12, %cst_6 : f32 + affine.store %14, %alloc_17[%arg1, %arg2] : memref<1x120xf32> + } + } + %alloc_18 = memref.alloc() {alignment = 64 : i64} : memref<120x84xf32> + affine.for %arg1 = 0 to 84 { + affine.for %arg2 = 0 to 120 { + %12 = affine.load %5[%arg1, %arg2] : memref<84x120xf32> + affine.store %12, %alloc_18[%arg2, %arg1] : memref<120x84xf32> + } + } + %alloc_19 = memref.alloc() {alignment = 64 : i64} : memref<1x84xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 84 { + affine.store %cst_6, %alloc_19[%arg1, %arg2] : memref<1x84xf32> + } + } + %alloc_20 = memref.alloc() {alignment = 64 : i64} : memref<1x84xf32> + memref.copy %alloc_19, %alloc_20 : memref<1x84xf32> to memref<1x84xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 84 { + affine.for %arg3 = 0 to 120 { + %12 = affine.load %alloc_17[%arg1, %arg3] : memref<1x120xf32> + %13 = affine.load %alloc_18[%arg3, %arg2] : memref<120x84xf32> + %14 = affine.load %alloc_20[%arg1, %arg2] : memref<1x84xf32> + %15 = arith.mulf %12, %13 : f32 + %16 = arith.addf %14, %15 : f32 + affine.store %16, %alloc_20[%arg1, %arg2] : memref<1x84xf32> + } + } + } + %alloc_21 = memref.alloc() {alignment = 64 : i64} : memref<1x84xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 84 { + %12 = affine.load %alloc_20[%c0, %arg2] : memref<1x84xf32> + %13 = affine.load %6[%arg2] : memref<84xf32> + %14 = arith.addf %12, %13 : f32 + affine.store %14, %alloc_21[%arg1, %arg2] : memref<1x84xf32> + } + } + %alloc_22 = memref.alloc() {alignment = 64 : i64} : memref<1x84xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 84 { + %12 = affine.load %alloc_21[%c0, %arg2] : memref<1x84xf32> + %13 = arith.cmpf ugt, %12, %cst_6 
: f32 + %14 = arith.select %13, %12, %cst_6 : f32 + affine.store %14, %alloc_22[%arg1, %arg2] : memref<1x84xf32> + } + } + %alloc_23 = memref.alloc() {alignment = 64 : i64} : memref<84x10xf32> + affine.for %arg1 = 0 to 10 { + affine.for %arg2 = 0 to 84 { + %12 = affine.load %7[%arg1, %arg2] : memref<10x84xf32> + affine.store %12, %alloc_23[%arg2, %arg1] : memref<84x10xf32> + } + } + %alloc_24 = memref.alloc() {alignment = 64 : i64} : memref<1x10xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 10 { + affine.store %cst_6, %alloc_24[%arg1, %arg2] : memref<1x10xf32> + } + } + %alloc_25 = memref.alloc() {alignment = 64 : i64} : memref<1x10xf32> + memref.copy %alloc_24, %alloc_25 : memref<1x10xf32> to memref<1x10xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 10 { + affine.for %arg3 = 0 to 84 { + %12 = affine.load %alloc_22[%arg1, %arg3] : memref<1x84xf32> + %13 = affine.load %alloc_23[%arg3, %arg2] : memref<84x10xf32> + %14 = affine.load %alloc_25[%arg1, %arg2] : memref<1x10xf32> + %15 = arith.mulf %12, %13 : f32 + %16 = arith.addf %14, %15 : f32 + affine.store %16, %alloc_25[%arg1, %arg2] : memref<1x10xf32> + } + } + } + %alloc_26 = memref.alloc() {alignment = 64 : i64} : memref<1x10xf32> + affine.for %arg1 = 0 to 1 { + affine.for %arg2 = 0 to 10 { + %12 = affine.load %alloc_25[%c0, %arg2] : memref<1x10xf32> + %13 = affine.load %8[%arg2] : memref<10xf32> + %14 = arith.addf %12, %13 : f32 + affine.store %14, %alloc_26[%arg1, %arg2] : memref<1x10xf32> + } + } + %11 = bufferization.to_tensor %alloc_26 : memref<1x10xf32> + return %11 : tensor<1x10xf32> + } +} + diff --git a/test/samples/lenet/lenet_linalg.mlir b/test/samples/lenet/lenet_linalg.mlir new file mode 100644 index 00000000..d66d2400 --- /dev/null +++ b/test/samples/lenet/lenet_linalg.mlir @@ -0,0 +1,80 @@ +#map = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map2 = affine_map<(d0, d1) -> (0, d1)> +#map3 = affine_map<(d0, d1) 
-> (d1)> +#map4 = affine_map<(d0, d1) -> (d0, d1)> +module { + func.func @main(%arg0: tensor<1x3x32x32xf32>) -> tensor<1x10xf32> { + %cst = arith.constant dense_resource : tensor<6x3x5x5xf32> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant dense_resource : tensor<16x6x5x5xf32> + %cst_2 = arith.constant dense_resource : tensor<120x400xf32> + %cst_3 = arith.constant dense_resource : tensor<120xf32> + %cst_4 = arith.constant dense_resource : tensor<84x120xf32> + %cst_5 = arith.constant dense_resource : tensor<84xf32> + %cst_6 = arith.constant dense_resource : tensor<10x84xf32> + %cst_7 = arith.constant dense_resource : tensor<10xf32> + %0 = tensor.empty() : tensor<1x6x14x14xf32> + %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<1x6x14x14xf32>) -> tensor<1x6x14x14xf32> + %2 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %cst : tensor<1x3x32x32xf32>, tensor<6x3x5x5xf32>) outs(%1 : tensor<1x6x14x14xf32>) -> tensor<1x6x14x14xf32> + %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<1x6x14x14xf32>) outs(%0 : tensor<1x6x14x14xf32>) { + ^bb0(%in: f32, %out: f32): + %25 = arith.cmpf ugt, %in, %cst_0 : f32 + %26 = arith.select %25, %in, %cst_0 : f32 + linalg.yield %26 : f32 + } -> tensor<1x6x14x14xf32> + %4 = tensor.empty() : tensor<1x16x5x5xf32> + %5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<1x16x5x5xf32>) -> tensor<1x16x5x5xf32> + %6 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%3, %cst_1 : tensor<1x6x14x14xf32>, tensor<16x6x5x5xf32>) outs(%5 : tensor<1x16x5x5xf32>) -> tensor<1x16x5x5xf32> + %7 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<1x16x5x5xf32>) outs(%4 : tensor<1x16x5x5xf32>) { + ^bb0(%in: f32, %out: f32): + %25 = arith.cmpf ugt, %in, 
%cst_0 : f32 + %26 = arith.select %25, %in, %cst_0 : f32 + linalg.yield %26 : f32 + } -> tensor<1x16x5x5xf32> + %collapsed = tensor.collapse_shape %7 [[0], [1, 2, 3]] : tensor<1x16x5x5xf32> into tensor<1x400xf32> + %8 = tensor.empty() : tensor<400x120xf32> + %transposed = linalg.transpose ins(%cst_2 : tensor<120x400xf32>) outs(%8 : tensor<400x120xf32>) permutation = [1, 0] + %9 = tensor.empty() : tensor<1x120xf32> + %10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<1x120xf32>) -> tensor<1x120xf32> + %11 = linalg.matmul ins(%collapsed, %transposed : tensor<1x400xf32>, tensor<400x120xf32>) outs(%10 : tensor<1x120xf32>) -> tensor<1x120xf32> + %12 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%11, %cst_3 : tensor<1x120xf32>, tensor<120xf32>) outs(%9 : tensor<1x120xf32>) { + ^bb0(%in: f32, %in_10: f32, %out: f32): + %25 = arith.addf %in, %in_10 : f32 + linalg.yield %25 : f32 + } -> tensor<1x120xf32> + %13 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<1x120xf32>) outs(%9 : tensor<1x120xf32>) { + ^bb0(%in: f32, %out: f32): + %25 = arith.cmpf ugt, %in, %cst_0 : f32 + %26 = arith.select %25, %in, %cst_0 : f32 + linalg.yield %26 : f32 + } -> tensor<1x120xf32> + %14 = tensor.empty() : tensor<120x84xf32> + %transposed_8 = linalg.transpose ins(%cst_4 : tensor<84x120xf32>) outs(%14 : tensor<120x84xf32>) permutation = [1, 0] + %15 = tensor.empty() : tensor<1x84xf32> + %16 = linalg.fill ins(%cst_0 : f32) outs(%15 : tensor<1x84xf32>) -> tensor<1x84xf32> + %17 = linalg.matmul ins(%13, %transposed_8 : tensor<1x120xf32>, tensor<120x84xf32>) outs(%16 : tensor<1x84xf32>) -> tensor<1x84xf32> + %18 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%17, %cst_5 : tensor<1x84xf32>, tensor<84xf32>) outs(%15 : tensor<1x84xf32>) { + ^bb0(%in: f32, %in_10: f32, %out: f32): + %25 = arith.addf %in, %in_10 : f32 + 
linalg.yield %25 : f32 + } -> tensor<1x84xf32> + %19 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel"]} ins(%18 : tensor<1x84xf32>) outs(%15 : tensor<1x84xf32>) { + ^bb0(%in: f32, %out: f32): + %25 = arith.cmpf ugt, %in, %cst_0 : f32 + %26 = arith.select %25, %in, %cst_0 : f32 + linalg.yield %26 : f32 + } -> tensor<1x84xf32> + %20 = tensor.empty() : tensor<84x10xf32> + %transposed_9 = linalg.transpose ins(%cst_6 : tensor<10x84xf32>) outs(%20 : tensor<84x10xf32>) permutation = [1, 0] + %21 = tensor.empty() : tensor<1x10xf32> + %22 = linalg.fill ins(%cst_0 : f32) outs(%21 : tensor<1x10xf32>) -> tensor<1x10xf32> + %23 = linalg.matmul ins(%19, %transposed_9 : tensor<1x84xf32>, tensor<84x10xf32>) outs(%22 : tensor<1x10xf32>) -> tensor<1x10xf32> + %24 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%23, %cst_7 : tensor<1x10xf32>, tensor<10xf32>) outs(%21 : tensor<1x10xf32>) { + ^bb0(%in: f32, %in_10: f32, %out: f32): + %25 = arith.addf %in, %in_10 : f32 + linalg.yield %25 : f32 + } -> tensor<1x10xf32> + return %24 : tensor<1x10xf32> + } +} \ No newline at end of file diff --git a/tools/mlir-neura-opt/mlir-neura-opt.cpp b/tools/mlir-neura-opt/mlir-neura-opt.cpp index d21664fb..b1791e47 100644 --- a/tools/mlir-neura-opt/mlir-neura-opt.cpp +++ b/tools/mlir-neura-opt/mlir-neura-opt.cpp @@ -20,6 +20,7 @@ int main(int argc, char **argv) { registry.insert(); registry.insert(); registry.insert(); + registry.insert(); mlir::neura::registerPasses(); mlir::registerPasses();