diff --git a/include/Conversion/ConversionPasses.h b/include/Conversion/ConversionPasses.h
index 2d871868..78f6c3ce 100644
--- a/include/Conversion/ConversionPasses.h
+++ b/include/Conversion/ConversionPasses.h
@@ -20,6 +20,7 @@ namespace mlir {
 std::unique_ptr<mlir::Pass> createLowerArithToNeuraPass();
 std::unique_ptr<mlir::Pass> createLowerLlvmToNeuraPass();
 std::unique_ptr<mlir::Pass> createLowerMemRefToNeuraPass();
+std::unique_ptr<mlir::Pass> createLowerBuiltinToNeuraPass();
 
 #define GEN_PASS_REGISTRATION
 #include "Conversion/ConversionPasses.h.inc"
diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td
index 8f2db985..2e79dd96 100644
--- a/include/Conversion/ConversionPasses.td
+++ b/include/Conversion/ConversionPasses.td
@@ -26,4 +26,10 @@ def LowerMemRefToNeura : Pass<"lower-memref-to-neura", "ModuleOp">{
   let constructor = "mlir::createLowerMemRefToNeuraPass()";
 }
 
+def LowerBuiltinToNeura : Pass<"lower-builtin-to-neura", "ModuleOp">{
+  let summary = "Lower Builtin to Neura dialect";
+  let description = [{Lower Builtin operations to Neura dialect operations.}];
+  let constructor = "mlir::createLowerBuiltinToNeuraPass()";
+}
+
 #endif // CONVERSION_PASSES_TD
\ No newline at end of file
diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td
index e988774a..2c2a8758 100644
--- a/include/NeuraDialect/NeuraOps.td
+++ b/include/NeuraDialect/NeuraOps.td
@@ -116,6 +116,34 @@ def Neura_StoreOp : Op<NeuraDialect, "store"> {
   // let assemblyFormat = "$value `,` $addr `,` $predicate attr-dict";
 }
 
+// Defines a load operation with integrated address calculation.
+def Neura_LoadIndexedOp: Op<NeuraDialect, "load_indexed", [AttrSizedOperandSegments]>{
+  let summary = "Load with integrated address calculation for multi-dimensional arrays";
+  let description = [{
+    Calculates the address using the base address and indices.
+    Loads the value at the calculated address.
+    Example:
+      %value = neura.load_indexed %base[%i, %j : index, index] memref<?x128xf32> : f32
+  }];
+  let arguments = (ins Arg<AnyMemRef, "the memref to load from">:$base, Variadic<AnyType>:$indices, Optional<AnyType>:$predicate);
+  let results = (outs AnyType:$result);
+  let assemblyFormat = "$base `[` $indices `:` type($indices) `]` type($base) ($predicate^ `:` type($predicate))? attr-dict `:` type($result)";
+}
+
+// Defines a store operation with integrated address calculation.
+def Neura_StoreIndexedOp: Op<NeuraDialect, "store_indexed", [AttrSizedOperandSegments]> {
+  let summary = "Store with integrated address calculation for multi-dimensional arrays";
+  let description = [{
+    Calculates the address using the base address and indices.
+    Stores the value at the calculated address.
+    Example:
+      neura.store_indexed %value to %base[%i, %j : index, index] memref<?x128xf32> : f32
+  }];
+  let arguments = (ins AnyType:$value, Arg<AnyMemRef, "the memref to store into">:$base, Variadic<AnyType>:$indices, Optional<AnyType>:$predicate);
+  let results = (outs);
+  let assemblyFormat = "$value `to` $base `[` $indices `:` type($indices) `]` type($base) ($predicate^ `:` type($predicate))? attr-dict `:` type($value)";
+}
+
 // Defines a pointer computation operation.
 def Neura_GEP : Op<NeuraDialect, "gep"> {
   let summary = "Pointer computation using offset indices";
@@ -131,14 +159,14 @@ def Neura_CondBr : Op<NeuraDialect, "cond_br", [Terminator, AttrSizedOperandSegm
                    Variadic<AnyType>:$trueArgs,
                    Variadic<AnyType>:$falseArgs);
   let successors = (successor AnySuccessor:$trueDest, AnySuccessor:$falseDest);
-  let assemblyFormat = "$condition `:` type($condition) ($predicate^ `:` type($predicate))? `then` ($trueArgs^)? `:` type($trueArgs) `to` $trueDest `else` ($falseArgs^)? `:` type($falseArgs) `to` $falseDest attr-dict";
+  let assemblyFormat = "$condition `:` type($condition) ($predicate^ `:` type($predicate))? `then` ($trueArgs^ `:` type($trueArgs))? `to` $trueDest `else` ($falseArgs^ `:` type($falseArgs))? `to` $falseDest attr-dict";
 }
 
 // Defines an unconditional branch operation.
 def Neura_Br : Op<NeuraDialect, "br", [Terminator]> {
   let arguments = (ins Variadic<AnyType>:$args);
   let successors = (successor AnySuccessor:$dest);
-  let assemblyFormat = "($args^)? `:` type($args) `to` $dest attr-dict";
+  let assemblyFormat = "($args^ `:` type($args))? `to` $dest attr-dict";
 }
 
 def Neura_SelOp : Op<NeuraDialect, "sel"> {
diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td
index 8808a452..426fe6d0 100644
--- a/include/NeuraDialect/NeuraPasses.td
+++ b/include/NeuraDialect/NeuraPasses.td
@@ -57,5 +57,4 @@ def MapToAccelerator : Pass<"map-to-accelerator", "ModuleOp"> {
   }];
   let constructor = "neura::createMapToAcceleratorPass()";
 }
-
 #endif // NEURA_PASSES_TD
\ No newline at end of file
diff --git a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp
index 72c83c6b..e1960b66 100644
--- a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp
+++ b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp
@@ -1,3 +1,4 @@
+#include "Common/AcceleratorAttrs.h"
 #include "Conversion/ConversionPasses.h"
 #include "NeuraDialect/NeuraDialect.h"
 #include "NeuraDialect/NeuraOps.h"
@@ -8,6 +9,7 @@
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/StringRef.h"
 
 namespace mlir {
 namespace neura {
@@ -24,9 +26,6 @@ using namespace mlir;
 using namespace mlir::func;
 using namespace mlir::neura;
 
-#define GEN_PASS_DEF_LOWERARITHTONEURA
-#include "NeuraDialect/NeuraPasses.h.inc"
-
 namespace {
 
 struct ArithConstantToNeuraConstant
@@ -35,10 +34,10 @@ struct ArithConstantToNeuraConstant
 
   LogicalResult matchAndRewrite(arith::ConstantOp op,
                                 PatternRewriter &rewriter) const override {
-    // Converts arith constant to Neura constant
+    // Converts arith constant to Neura constant.
     Type result_type = op.getType();
     Attribute value = op.getValue();
-    // Optional predicate parameter can be null
+    // Optional predicate parameter can be null.
     rewriter.replaceOpWithNewOp<neura::ConstantOp>(op, result_type, value,
                                                    nullptr);
     return success();
@@ -54,7 +53,7 @@ struct ArithAddIToNeuraAdd : public OpRewritePattern<mlir::arith::AddIOp> {
     Value rhs = op.getRhs();
     Type result_type = op.getType();
 
-    // Optional predicate: default to null
+    // Optional predicate: default to null.
     rewriter.replaceOpWithNewOp<neura::AddOp>(op, result_type, lhs, rhs,
                                               nullptr);
     return success();
@@ -70,7 +69,7 @@ struct ArithFAddToNeuraFAdd : public OpRewritePattern<mlir::arith::AddFOp> {
     Value rhs = op.getRhs();
     Type result_type = op.getType();
 
-    // Optional predicate: default to null
+    // Optional predicate: default to null.
     rewriter.replaceOpWithNewOp<neura::FAddOp>(op, result_type, lhs, rhs,
                                                nullptr);
     return success();
@@ -86,7 +85,7 @@ struct ArithSubIToNeuraSub : public OpRewritePattern<mlir::arith::SubIOp> {
     Value rhs = op.getRhs();
     Type result_type = op.getType();
 
-    // Optional predicate: default to null
+    // Optional predicate: default to null.
     rewriter.replaceOpWithNewOp<neura::SubOp>(op, result_type, lhs, rhs,
                                               nullptr);
     return success();
@@ -102,7 +101,7 @@ struct ArithSubFToNeuraFSub : public OpRewritePattern<mlir::arith::SubFOp> {
     Value rhs = op.getRhs();
     Type result_type = op.getType();
 
-    // Optional predicate: default to null
+    // Optional predicate: default to null.
     rewriter.replaceOpWithNewOp<neura::FSubOp>(op, result_type, lhs, rhs,
                                                nullptr);
     return success();
@@ -118,7 +117,7 @@ struct ArithMulFToNeuraFMul : public OpRewritePattern<mlir::arith::MulFOp> {
     Value rhs = op.getRhs();
     Type result_type = op.getType();
 
-    // Optional predicate: default to null
+    // Optional predicate: default to null.
     rewriter.replaceOpWithNewOp<neura::FMulOp>(op, result_type, lhs, rhs,
                                                nullptr);
     return success();
@@ -134,7 +133,7 @@ struct ArithFDivToNeuraFDiv : public OpRewritePattern<mlir::arith::DivFOp> {
     Value rhs = op.getRhs();
     Type result_type = op.getType();
 
-    // Optional predicate: default to null
+    // Optional predicate: default to null.
     rewriter.replaceOpWithNewOp<neura::FDivOp>(op, result_type, lhs, rhs,
                                                nullptr);
     return success();
@@ -185,8 +184,8 @@ struct ArithCmpiToNeuraICmp : public OpRewritePattern<mlir::arith::CmpIOp> {
       return rewriter.notifyMatchFailure(op, "Unsupported arith CmpIOp type");
     }
 
-    // Convert arith CmpIOp to Neura ICmpOp
-    // Optional predicate: default to null
+    // Converts arith CmpIOp to Neura ICmpOp.
+    // Optional predicate: default to null.
     rewriter.replaceOpWithNewOp<neura::ICmpOp>(
         op, result_type, lhs, rhs, nullptr, rewriter.getStringAttr(cmp_type));
     return success();
@@ -203,7 +202,7 @@ struct ArithSelectToNeuraSel : public OpRewritePattern<mlir::arith::SelectOp> {
     Value false_value = op.getFalseValue();
     Type result_type = op.getType();
 
-    // Convert arith SelectOp to Neura SelOp
+    // Converts arith SelectOp to Neura SelOp.
     rewriter.replaceOpWithNewOp<neura::SelOp>(op, result_type, true_value,
                                               false_value, condition);
     return success();
@@ -218,8 +217,8 @@ struct ArithExtUIToNeuraCast : public OpRewritePattern<mlir::arith::ExtUIOp> {
     Value input = op.getIn();
     Type result_type = op.getType();
 
-    // Convert arith ExtUIOp to Neura cast operation
-    // Optional predicate: default to null
+    // Converts arith ExtUIOp to Neura cast operation.
+    // Optional predicate: default to null.
     rewriter.replaceOpWithNewOp<neura::CastOp>(
         op, result_type, input, rewriter.getStringAttr("extui"), nullptr);
     return success();
@@ -234,8 +233,8 @@ struct ArithExtfToNeuraCast : public OpRewritePattern<mlir::arith::ExtFOp> {
     Value input = op.getIn();
     Type result_type = op.getType();
 
-    // Convert arith ExtFOp to Neura cast operation
-    // Optional predicate: default to null
+    // Converts arith ExtFOp to Neura cast operation.
+    // Optional predicate: default to null.
     rewriter.replaceOpWithNewOp<neura::CastOp>(
         op, result_type, input, rewriter.getStringAttr("extf"), nullptr);
     return success();
@@ -250,11 +249,23 @@ struct ArithIndexCastToNeuraCast
                                 PatternRewriter &rewriter) const override {
     Value input = op.getIn();
     Type result_type = op.getType();
+    Type in_type = input.getType();
+    StringRef cast_string;
+
+    // The isa<IntegerType> check is generic and handles any integer bit width.
+    // (e.g., i32, i64).
+    if (in_type.isIndex() && isa<IntegerType>(result_type)) {
+      cast_string = "index_to_int";
+    } else if (isa<IntegerType>(in_type) && result_type.isIndex()) {
+      cast_string = "int_to_index";
+    } else {
+      return rewriter.notifyMatchFailure(op, "index_cast");
+    }
 
-    // Convert arith IndexCastOp to Neura cast operation
-    // Optional predicate: default to null
+    // Converts arith IndexCastOp to Neura cast operation.
+    // Optional predicate: default to null.
     rewriter.replaceOpWithNewOp<neura::CastOp>(
-        op, result_type, input, rewriter.getStringAttr("indexCast"), nullptr);
+        op, result_type, input, rewriter.getStringAttr(cast_string), nullptr);
     return success();
   }
 };
@@ -274,16 +285,28 @@ struct LowerArithToNeuraPass
   }
 
   void runOnOperation() override {
-    RewritePatternSet patterns(&getContext());
-    mlir::neura::arith2neura::populateWithGenerated(patterns);
-    patterns
-        .add<ArithFAddToNeuraFAdd, ArithConstantToNeuraConstant,
-             ArithAddIToNeuraAdd, ArithCmpiToNeuraICmp, ArithSelectToNeuraSel,
-             ArithExtUIToNeuraCast, ArithIndexCastToNeuraCast,
-             ArithFDivToNeuraFDiv, ArithExtfToNeuraCast, ArithMulFToNeuraFMul, ArithSubIToNeuraSub, ArithSubFToNeuraFSub>(&getContext());
-    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
-      signalPassFailure();
-    }
+    ModuleOp module_op = getOperation();
+    MLIRContext *context = &getContext();
+    module_op.walk([&](func::FuncOp func_op) {
+      if (func_op->hasAttr(mlir::accel::kAcceleratorAttr)) {
+        auto target =
+            func_op->getAttrOfType<StringAttr>(mlir::accel::kAcceleratorAttr);
+        if (target && target.getValue() == mlir::accel::kNeuraTarget) {
+          RewritePatternSet patterns(&getContext());
+          mlir::neura::arith2neura::populateWithGenerated(patterns);
+          patterns.add<ArithFAddToNeuraFAdd, ArithConstantToNeuraConstant,
+                       ArithAddIToNeuraAdd, ArithCmpiToNeuraICmp,
+                       ArithSelectToNeuraSel, ArithExtUIToNeuraCast,
+                       ArithIndexCastToNeuraCast, ArithFDivToNeuraFDiv,
+                       ArithExtfToNeuraCast, ArithMulFToNeuraFMul,
+                       ArithSubIToNeuraSub, ArithSubFToNeuraFSub>(context);
+          if (failed(
+                  applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+            signalPassFailure();
+          }
+        }
+      }
+    });
   }
 };
 } // namespace
diff --git a/lib/Conversion/BuiltinToNeura/BuiltinToNeuraPass.cpp b/lib/Conversion/BuiltinToNeura/BuiltinToNeuraPass.cpp
new file mode 100644
index 00000000..260d8d90
--- /dev/null
+++ b/lib/Conversion/BuiltinToNeura/BuiltinToNeuraPass.cpp
@@ -0,0 +1,88 @@
+#include "Common/AcceleratorAttrs.h"
+#include "Conversion/ConversionPasses.h"
+#include "NeuraDialect/NeuraDialect.h"
+#include "NeuraDialect/NeuraOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace mlir;
+using namespace mlir::neura;
+
+namespace {
+
+struct BuiltinUnrealizedConversionCastToNeuraCast
+    : public OpRewritePattern<mlir::UnrealizedConversionCastOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  // Rewrites a 1:1 index<->integer unrealized cast into a neura.cast.
+  LogicalResult matchAndRewrite(mlir::UnrealizedConversionCastOp op,
+                                PatternRewriter &rewriter) const override {
+    // Only handles simple 1:1 casts.
+    // TODO: Handle more complex casts if needed.
+    if (op.getInputs().size() != 1 || op.getResults().size() != 1) {
+      return failure();
+    }
+    Value source = op.getInputs()[0];
+    Type source_type = source.getType();
+    Type target_type = op.getResults()[0].getType();
+
+    const bool index_to_int =
+        source_type.isIndex() && isa<IntegerType>(target_type);
+    const bool int_to_index =
+        isa<IntegerType>(source_type) && target_type.isIndex();
+    if (!index_to_int && !int_to_index) {
+      return rewriter.notifyMatchFailure(op, "unsupported cast");
+    }
+    StringRef cast_kind = index_to_int ? "index_to_int" : "int_to_index";
+    // Optional predicate: default to null.
+    rewriter.replaceOpWithNewOp<neura::CastOp>(
+        op, target_type, source, rewriter.getStringAttr(cast_kind), nullptr);
+    return success();
+  }
+};
+
+struct LowerBuiltinToNeuraPass
+    : public PassWrapper<LowerBuiltinToNeuraPass, OperationPass<ModuleOp>> {
+
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerBuiltinToNeuraPass)
+
+  StringRef getArgument() const override { return "lower-builtin-to-neura"; }
+  StringRef getDescription() const override {
+    return "Lower Builtin operations to Neura dialect operations";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<mlir::neura::NeuraDialect>();
+  }
+
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    RewritePatternSet patterns(context);
+    patterns.add<BuiltinUnrealizedConversionCastToNeuraCast>(context);
+    // Freezes the set so every Neura function reuses it; moving `patterns`
+    // inside the walk would hand later functions a moved-from set.
+    FrozenRewritePatternSet frozen_patterns(std::move(patterns));
+    getOperation().walk([&](func::FuncOp func_op) {
+      // Only functions assigned to the Neura accelerator are lowered.
+      auto target =
+          func_op->getAttrOfType<StringAttr>(mlir::accel::kAcceleratorAttr);
+      if (target && target.getValue() == mlir::accel::kNeuraTarget &&
+          failed(applyPatternsGreedily(func_op, frozen_patterns))) {
+        signalPassFailure();
+      }
+    });
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::createLowerBuiltinToNeuraPass() {
+  return std::make_unique<LowerBuiltinToNeuraPass>();
+}
diff --git a/lib/Conversion/BuiltinToNeura/CMakeLists.txt b/lib/Conversion/BuiltinToNeura/CMakeLists.txt
new file mode 100644
index 00000000..094aa44d
--- /dev/null
+++ b/lib/Conversion/BuiltinToNeura/CMakeLists.txt
@@ -0,0 +1,18 @@
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+add_mlir_conversion_library(MLIRNeuraBuiltinToNeuraPass
+  BuiltinToNeuraPass.cpp
+
+  DEPENDS
+  MLIRConversionIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRArithDialect
+  MLIRFuncDialect
+  MLIRLLVMDialect
+  MLIRIR
+  MLIRPass
+  MLIRTransforms
+  MLIRNeura
+  MLIRSupport
+)
diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt
index af5bb68a..ee851744 100644
--- a/lib/Conversion/CMakeLists.txt
+++ b/lib/Conversion/CMakeLists.txt
@@ -3,6 +3,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 add_subdirectory(ArithToNeura)
 add_subdirectory(LlvmToNeura)
 add_subdirectory(MemRefToNeura)
+add_subdirectory(BuiltinToNeura)
 
 # add_mlir_library(
 #     MLIRNeuraConversion
@@ -32,5 +33,7 @@ target_link_libraries(MLIRConversion INTERFACE
   MLIRNeura
   MLIRNeuraArithToNeuraPass
   MLIRNeuraLlvmToNeuraPass
+  MLIRNeuraMemRefToNeuraPass
+  MLIRNeuraBuiltinToNeuraPass
   ${dialect_libs}
 )
\ No newline at end of file
diff --git a/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp b/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp
index 6bc815b3..758c3fca 100644
--- a/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp
+++ b/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp
@@ -25,10 +25,6 @@ namespace llvm2neura {
 using namespace mlir;
 using namespace mlir::neura;
 
-#define GEN_PASS_DEF_LOWERLLVMTONEURA
-#include "NeuraDialect/NeuraPasses.h.inc"
-
-
 namespace {
 // Lowers integer add from mlir.llvm.add to nuera.add. We provide the lowering
 // here instead of tablegen due to that mlir.llvm.add uses an EnumProperty
diff --git a/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td b/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td
index 3aef67d8..1b99a47c 100644
--- a/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td
+++ b/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td
@@ -2,11 +2,3 @@ include "mlir/IR/OpBase.td"
 include "mlir/IR/PatternBase.td"
 include "mlir/Dialect/LLVMIR/LLVMOps.td"
 include "NeuraDialect/NeuraOps.td"
-
-// Floating point binary operations.
-// Deprecated Pattern: Because we need the predicate bit to be set to null initially
-// def : Pat<
-//   (LLVM_FSubOp $lhs, $rhs, $_fastmath),
-//   (Neura_FSubOp $lhs, $rhs)
-// >;
-
diff --git a/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp b/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp
index 3d3b543c..312797e4 100644
--- a/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp
+++ b/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp
@@ -1,23 +1,54 @@
 #include "Common/AcceleratorAttrs.h"
+#include "Conversion/ConversionPasses.h"
 #include "NeuraDialect/NeuraDialect.h"
 #include "NeuraDialect/NeuraOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "Conversion/ConversionPasses.h"
+#include "llvm/Support/raw_ostream.h"
 
 using namespace mlir;
 using namespace mlir::neura;
 
-#define GEN_PASS_DEF_LOWERLLVMTONEURA
-#include "NeuraDialect/NeuraPasses.h.inc"
+namespace {
 
+struct MemRefLoadLowering : public OpRewritePattern<memref::LoadOp> {
+  using OpRewritePattern<memref::LoadOp>::OpRewritePattern;
 
-namespace {
+  LogicalResult matchAndRewrite(memref::LoadOp load_op,
+                                PatternRewriter &rewriter) const override {
+    // Creates a Neura LoadIndexedOp from the MemRef LoadOp.
+    Type result_type = load_op.getType();
+    Value memref = load_op.getMemRef();
+    ValueRange indices = load_op.getIndices();
+    // Optional predicate: default to null.
+    rewriter.replaceOpWithNewOp<neura::LoadIndexedOp>(load_op, result_type,
+                                                      memref, indices, nullptr);
+    return success();
+  }
+};
+
+struct MemRefStoreLowering : public OpRewritePattern<memref::StoreOp> {
+  using OpRewritePattern<memref::StoreOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(memref::StoreOp store_op,
+                                PatternRewriter &rewriter) const override {
+    // Gathers the operands of the MemRef StoreOp being replaced.
+    Value stored_value = store_op.getValueToStore();
+    Value base = store_op.getMemRef();
+    ValueRange index_operands = store_op.getIndices();
+    // Emits the Neura StoreIndexedOp; the predicate operand stays null.
+    rewriter.replaceOpWithNewOp<neura::StoreIndexedOp>(
+        store_op, stored_value, base, index_operands, /*predicate=*/nullptr);
+    return success();
+  }
+};
 
 struct LowerMemRefToNeuraPass
     : public PassWrapper<LowerMemRefToNeuraPass, OperationPass<ModuleOp>> {
@@ -34,7 +65,22 @@ struct LowerMemRefToNeuraPass
   }
 
   void runOnOperation() override {
+    ModuleOp module_op = getOperation();
+    MLIRContext *context = &getContext();
     RewritePatternSet patterns(&getContext());
+    patterns.add<MemRefLoadLowering>(context);
+    patterns.add<MemRefStoreLowering>(context);
+    module_op.walk([&](func::FuncOp func_op) {
+      if (func_op->hasAttr(mlir::accel::kAcceleratorAttr)) {
+        auto target =
+            func_op->getAttrOfType<StringAttr>(mlir::accel::kAcceleratorAttr);
+        if (target && target.getValue() == mlir::accel::kNeuraTarget) {
+          if (failed(applyPatternsGreedily(func_op, std::move(patterns)))) {
+            return signalPassFailure();
+          }
+        }
+      }
+    });
   }
 };
 } // namespace
diff --git a/lib/NeuraDialect/NeuraPasses.cpp b/lib/NeuraDialect/NeuraPasses.cpp
index 6a88abee..a346ddb0 100644
--- a/lib/NeuraDialect/NeuraPasses.cpp
+++ b/lib/NeuraDialect/NeuraPasses.cpp
@@ -2,19 +2,20 @@
 #include "mlir/Pass/PassRegistry.h"
 #include "mlir/Transforms/Passes.h"
 
+#include "Conversion/ConversionPasses.h"
 #include "NeuraDialect/NeuraDialect.h"
 #include "NeuraDialect/NeuraOps.h"
 #include "NeuraDialect/NeuraPasses.h"
 #include "NeuraDialect/NeuraTypes.h"
-#include "Conversion/ConversionPasses.h"
 
 // This pass pipeline can convert all the other dialects into the Neura dialect
 void mlir::neura::registerNeuraConversionPassPipeline() {
-  PassPipelineRegistration<>("neura-conversion",
-                             "Convert all dialects to Neura dialect",
-                             [](OpPassManager &pm) {
-                                // Convert all the other dialects into the Neura dialect
-                                pm.addPass(mlir::createLowerArithToNeuraPass());
-                                pm.addPass(mlir::createLowerLlvmToNeuraPass());
-                             });
+  PassPipelineRegistration<>(
+      "neura-conversion", "Convert all dialects to Neura dialect",
+      [](OpPassManager &pm) {
+        pm.addPass(mlir::neura::createAssignAcceleratorPass());
+        // Lowers the arith and llvm dialects into the Neura dialect.
+        pm.addPass(mlir::createLowerArithToNeuraPass());
+        pm.addPass(mlir::createLowerLlvmToNeuraPass());
+      });
 }
\ No newline at end of file
diff --git a/test/affine2neura/bert/bert_node0/bert_node0.mlir b/test/affine2neura/bert/bert_node0/bert_node0.mlir
index 4c1eef85..ba82071e 100644
--- a/test/affine2neura/bert/bert_node0/bert_node0.mlir
+++ b/test/affine2neura/bert/bert_node0/bert_node0.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
-// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s
+// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura | FileCheck %s
 
 module attributes {} {
   func.func @_Z10bert_node0PA128_KiPA128_b(%arg0: memref<?x128xi32>, %arg1: memref<?x128xi8>) attributes {} {
@@ -15,25 +15,23 @@ module attributes {} {
 }
 
 // CHECK: func.func @_Z10bert_node0PA128_KiPA128_b(%arg0: memref<?x128xi32>, %arg1: memref<?x128xi8>) attributes {accelerator = "neura"} {
-// CHECK-NEXT:  %0 = "neura.constant"() <{value = 1 : index}> : () -> index
-// CHECK-NEXT:  %1 = "neura.constant"() <{value = 128 : index}> : () -> index
-// CHECK-NEXT:  %2 = "neura.constant"() <{value = 0 : i32}> : () -> i32
-// CHECK-NEXT:  %3 = "neura.constant"() <{value = 0 : index}> : () -> index
-// CHECK-NEXT:  %4 = builtin.unrealized_conversion_cast %3 : index to i64
-// CHECK-NEXT:  llvm.br ^bb1(%4 : i64)
+// CHECK-NEXT: %0 = "neura.constant"() <{value = 1 : index}> : () -> index
+// CHECK-NEXT: %1 = "neura.constant"() <{value = 128 : index}> : () -> index
+// CHECK-NEXT: %2 = "neura.constant"() <{value = 0 : i32}> : () -> i32
+// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index
+// CHECK-NEXT: %4 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %4 : i64 to ^bb1
 // CHECK-NEXT: ^bb1(%5: i64):  // 2 preds: ^bb0, ^bb2
-// CHECK-NEXT:  %6 = builtin.unrealized_conversion_cast %5 : i64 to index
-// CHECK-NEXT:  %7 = "neura.icmp"(%6, %1) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT:  llvm.cond_br %7, ^bb2, ^bb3
+// CHECK-NEXT: %6 = "neura.cast"(%5) <{cast_type = "int_to_index"}> : (i64) -> index
+// CHECK-NEXT: %7 = "neura.icmp"(%6, %1) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT: neura.cond_br %7 : i1 then to ^bb2 else to ^bb3
 // CHECK-NEXT: ^bb2:  // pred: ^bb1
-// CHECK-NEXT:  %8 = memref.load %arg0[%3, %6] : memref<?x128xi32>
-// CHECK-NEXT:  %9 = "neura.icmp"(%8, %2) <{cmpType = "sgt"}> : (i32, i32) -> i1
-// CHECK-NEXT:  %10 = "neura.cast"(%9) <{cast_type = "extui"}> : (i1) -> i8
-// CHECK-NEXT:  memref.store %10, %arg1[%3, %6] : memref<?x128xi8>
-// CHECK-NEXT:  %11 = "neura.add"(%6, %0) : (index, index) -> index
-// CHECK-NEXT:  %12 = builtin.unrealized_conversion_cast %11 : index to i64
-// CHECK-NEXT:  llvm.br ^bb1(%12 : i64)
+// CHECK-NEXT: %8 = neura.load_indexed %arg0[%3, %6 : index, index] memref<?x128xi32> : i32
+// CHECK-NEXT: %9 = "neura.icmp"(%8, %2) <{cmpType = "sgt"}> : (i32, i32) -> i1
+// CHECK-NEXT: %10 = "neura.cast"(%9) <{cast_type = "extui"}> : (i1) -> i8
+// CHECK-NEXT: neura.store_indexed %10 to %arg1[%3, %6 : index, index] memref<?x128xi8> : i8
+// CHECK-NEXT: %11 = "neura.add"(%6, %0) : (index, index) -> index
+// CHECK-NEXT: %12 = "neura.cast"(%11) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %12 : i64 to ^bb1
 // CHECK-NEXT: ^bb3:  // pred: ^bb1
-// CHECK-NEXT:  return
-// CHECK-NEXT:  }
-// CHECK-NEXT:  }
+// CHECK-NEXT: "neura.return"() : () -> ()
diff --git a/test/affine2neura/bert/bert_node1/bert_node1.mlir b/test/affine2neura/bert/bert_node1/bert_node1.mlir
index 0280d7c3..f79959a2 100644
--- a/test/affine2neura/bert/bert_node1/bert_node1.mlir
+++ b/test/affine2neura/bert/bert_node1/bert_node1.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
-// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s
+// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura | FileCheck %s
 module attributes {} {
   func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {} {
     affine.for %arg2 = 0 to 128 {
@@ -13,32 +13,32 @@ module attributes {} {
 }
 
 // CHECK: func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura"} {
-// CHECK-NEXT:    %0 = "neura.constant"() <{value = 1 : index}> : () -> index
-// CHECK-NEXT:    %1 = "neura.constant"() <{value = 128 : index}> : () -> index
-// CHECK-NEXT:    %2 = "neura.constant"() <{value = 0 : index}> : () -> index
-// CHECK-NEXT:    %3 = builtin.unrealized_conversion_cast %2 : index to i64
-// CHECK-NEXT:    llvm.br ^bb1(%3 : i64)
-// CHECK-NEXT:  ^bb1(%4: i64):  // 2 preds: ^bb0, ^bb5
-// CHECK-NEXT:    %5 = builtin.unrealized_conversion_cast %4 : i64 to index
-// CHECK-NEXT:    %6 = "neura.icmp"(%5, %1) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT:    llvm.cond_br %6, ^bb2, ^bb6
-// CHECK-NEXT:  ^bb2:  // pred: ^bb1
-// CHECK-NEXT:    %7 = builtin.unrealized_conversion_cast %2 : index to i64
-// CHECK-NEXT:    llvm.br ^bb3(%7 : i64)
-// CHECK-NEXT:  ^bb3(%8: i64):  // 2 preds: ^bb2, ^bb4
-// CHECK-NEXT:    %9 = builtin.unrealized_conversion_cast %8 : i64 to index
-// CHECK-NEXT:    %10 = "neura.icmp"(%9, %1) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT:    llvm.cond_br %10, ^bb4, ^bb5
-// CHECK-NEXT:  ^bb4:  // pred: ^bb3
-// CHECK-NEXT:    %11 = memref.load %arg0[%2, %2, %2, %2, %2, %9] : memref<?x1x1x1x1x128xi8>
-// CHECK-NEXT:    memref.store %11, %arg1[%2, %2, %5, %2, %2, %9] : memref<?x1x128x1x1x128xi8>
-// CHECK-NEXT:    %12 = "neura.add"(%9, %0) : (index, index) -> index
-// CHECK-NEXT:    %13 = builtin.unrealized_conversion_cast %12 : index to i64
-// CHECK-NEXT:    llvm.br ^bb3(%13 : i64)
-// CHECK-NEXT:  ^bb5:  // pred: ^bb3
-// CHECK-NEXT:    %14 = "neura.add"(%5, %0) : (index, index) -> index
-// CHECK-NEXT:    %15 = builtin.unrealized_conversion_cast %14 : index to i64
-// CHECK-NEXT:    llvm.br ^bb1(%15 : i64)
-// CHECK-NEXT:  ^bb6:  // pred: ^bb1
-// CHECK-NEXT:    return
-// CHECK-NEXT:  }
\ No newline at end of file
+// CHECK-NEXT: %0 = "neura.constant"() <{value = 1 : index}> : () -> index
+// CHECK-NEXT: %1 = "neura.constant"() <{value = 128 : index}> : () -> index
+// CHECK-NEXT: %2 = "neura.constant"() <{value = 0 : index}> : () -> index
+// CHECK-NEXT: %3 = "neura.cast"(%2) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %3 : i64 to ^bb1
+// CHECK-NEXT: ^bb1(%4: i64):  // 2 preds: ^bb0, ^bb5
+// CHECK-NEXT: %5 = "neura.cast"(%4) <{cast_type = "int_to_index"}> : (i64) -> index
+// CHECK-NEXT: %6 = "neura.icmp"(%5, %1) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT: neura.cond_br %6 : i1 then to ^bb2 else to ^bb6
+// CHECK-NEXT: ^bb2:  // pred: ^bb1
+// CHECK-NEXT: %7 = "neura.cast"(%2) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %7 : i64 to ^bb3
+// CHECK-NEXT: ^bb3(%8: i64):  // 2 preds: ^bb2, ^bb4
+// CHECK-NEXT: %9 = "neura.cast"(%8) <{cast_type = "int_to_index"}> : (i64) -> index
+// CHECK-NEXT: %10 = "neura.icmp"(%9, %1) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT: neura.cond_br %10 : i1 then to ^bb4 else to ^bb5
+// CHECK-NEXT: ^bb4:  // pred: ^bb3
+// CHECK-NEXT: %11 = neura.load_indexed %arg0[%2, %2, %2, %2, %2, %9 : index, index, index, index, index, index] memref<?x1x1x1x1x128xi8> : i8
+// CHECK-NEXT: neura.store_indexed %11 to %arg1[%2, %2, %5, %2, %2, %9 : index, index, index, index, index, index] memref<?x1x128x1x1x128xi8> : i8
+// CHECK-NEXT: %12 = "neura.add"(%9, %0) : (index, index) -> index
+// CHECK-NEXT: %13 = "neura.cast"(%12) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %13 : i64 to ^bb3
+// CHECK-NEXT: ^bb5:  // pred: ^bb3
+// CHECK-NEXT: %14 = "neura.add"(%5, %0) : (index, index) -> index
+// CHECK-NEXT: %15 = "neura.cast"(%14) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %15 : i64 to ^bb1
+// CHECK-NEXT: ^bb6:  // pred: ^bb1
+// CHECK-NEXT: "neura.return"() : () -> ()
+// CHECK-NEXT: }
diff --git a/test/affine2neura/bert/bert_node2/bert_node2.mlir b/test/affine2neura/bert/bert_node2/bert_node2.mlir
index 6b70666a..0bc0a274 100644
--- a/test/affine2neura/bert/bert_node2/bert_node2.mlir
+++ b/test/affine2neura/bert/bert_node2/bert_node2.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
-// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s
+// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura | FileCheck %s
 module attributes {} {
   func.func @_Z10bert_node2PA128_KiPA768_KfPA128_A768_f(%arg0: memref<?x128xi32>, %arg1: memref<?x768xf32>, %arg2: memref<?x128x768xf32>) attributes {} {
     %false = arith.constant false
@@ -28,51 +28,51 @@ module attributes {} {
 }
 
 // CHECK: func.func @_Z10bert_node2PA128_KiPA768_KfPA128_A768_f(%arg0: memref<?x128xi32>, %arg1: memref<?x768xf32>, %arg2: memref<?x128x768xf32>) attributes {accelerator = "neura"} {
-// CHECK-NEXT: %0 = "neura.constant"() <{value = 768 : index}> : () -> index
-// CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index
-// CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index
-// CHECK-NEXT: %3 = "neura.constant"() <{value = false}> : () -> i1
-// CHECK-NEXT: %4 = "neura.constant"() <{value = 30521 : i32}> : () -> i32
-// CHECK-NEXT: %5 = "neura.constant"() <{value = 0 : i32}> : () -> i32
-// CHECK-NEXT: %6 = "neura.constant"() <{value = 30522 : i32}> : () -> i32
-// CHECK-NEXT: %7 = "neura.constant"() <{value = 0 : index}> : () -> index
-// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %7 : index to i64
-// CHECK-NEXT: llvm.br ^bb1(%8 : i64)
+// CHECK-NEXT:  %0 = "neura.constant"() <{value = 768 : index}> : () -> index
+// CHECK-NEXT:  %1 = "neura.constant"() <{value = 1 : index}> : () -> index
+// CHECK-NEXT:  %2 = "neura.constant"() <{value = 128 : index}> : () -> index
+// CHECK-NEXT:  %3 = "neura.constant"() <{value = false}> : () -> i1
+// CHECK-NEXT:  %4 = "neura.constant"() <{value = 30521 : i32}> : () -> i32
+// CHECK-NEXT:  %5 = "neura.constant"() <{value = 0 : i32}> : () -> i32
+// CHECK-NEXT:  %6 = "neura.constant"() <{value = 30522 : i32}> : () -> i32
+// CHECK-NEXT:  %7 = "neura.constant"() <{value = 0 : index}> : () -> index
+// CHECK-NEXT:  %8 = "neura.cast"(%7) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:  neura.br %8 : i64 to ^bb1
 // CHECK-NEXT: ^bb1(%9: i64):  // 2 preds: ^bb0, ^bb9
-// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index
-// CHECK-NEXT: %11 = "neura.icmp"(%10, %2) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: llvm.cond_br %11, ^bb2, ^bb10
+// CHECK-NEXT:  %10 = "neura.cast"(%9) <{cast_type = "int_to_index"}> : (i64) -> index
+// CHECK-NEXT:  %11 = "neura.icmp"(%10, %2) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT:  neura.cond_br %11 : i1 then to ^bb2 else to ^bb10
 // CHECK-NEXT: ^bb2:  // pred: ^bb1
-// CHECK-NEXT: %12 = builtin.unrealized_conversion_cast %7 : index to i64
-// CHECK-NEXT: llvm.br ^bb3(%12 : i64)
+// CHECK-NEXT:  %12 = "neura.cast"(%7) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:  neura.br %12 : i64 to ^bb3
 // CHECK-NEXT: ^bb3(%13: i64):  // 2 preds: ^bb2, ^bb8
-// CHECK-NEXT: %14 = builtin.unrealized_conversion_cast %13 : i64 to index
-// CHECK-NEXT: %15 = "neura.icmp"(%14, %0) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: llvm.cond_br %15, ^bb4, ^bb9
+// CHECK-NEXT:  %14 = "neura.cast"(%13) <{cast_type = "int_to_index"}> : (i64) -> index
+// CHECK-NEXT:  %15 = "neura.icmp"(%14, %0) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT:  neura.cond_br %15 : i1 then to ^bb4 else to ^bb9
 // CHECK-NEXT: ^bb4:  // pred: ^bb3
-// CHECK-NEXT: %16 = memref.load %arg0[%7, %10] : memref<?x128xi32>
-// CHECK-NEXT: %17 = "neura.icmp"(%16, %6) <{cmpType = "sge"}> : (i32, i32) -> i1
-// CHECK-NEXT: %18 = "neura.sel"(%4, %16, %17) : (i32, i32, i1) -> i32
-// CHECK-NEXT: llvm.cond_br %17, ^bb5, ^bb6
+// CHECK-NEXT:  %16 = neura.load_indexed %arg0[%7, %10 : index, index] memref<?x128xi32> : i32
+// CHECK-NEXT:  %17 = "neura.icmp"(%16, %6) <{cmpType = "sge"}> : (i32, i32) -> i1
+// CHECK-NEXT:  %18 = "neura.sel"(%4, %16, %17) : (i32, i32, i1) -> i32
+// CHECK-NEXT:  neura.cond_br %17 : i1 then to ^bb5 else to ^bb6
 // CHECK-NEXT: ^bb5:  // pred: ^bb4
-// CHECK-NEXT: llvm.br ^bb7(%3 : i1)
+// CHECK-NEXT:  neura.br %3 : i1 to ^bb7
 // CHECK-NEXT: ^bb6:  // pred: ^bb4
-// CHECK-NEXT: %19 = "neura.icmp"(%16, %5) <{cmpType = "slt"}> : (i32, i32) -> i1
-// CHECK-NEXT: llvm.br ^bb7(%19 : i1)
+// CHECK-NEXT:  %19 = "neura.icmp"(%16, %5) <{cmpType = "slt"}> : (i32, i32) -> i1
+// CHECK-NEXT:  neura.br %19 : i1 to ^bb7
 // CHECK-NEXT: ^bb7(%20: i1):  // 2 preds: ^bb5, ^bb6
-// CHECK-NEXT: llvm.br ^bb8
+// CHECK-NEXT:  neura.br to ^bb8
 // CHECK-NEXT: ^bb8:  // pred: ^bb7
-// CHECK-NEXT: %21 = "neura.sel"(%5, %18, %20) : (i32, i32, i1) -> i32
-// CHECK-NEXT: %22 = "neura.cast"(%21) <{cast_type = "indexCast"}> : (i32) -> index
-// CHECK-NEXT: %23 = memref.load %arg1[%22, %14] : memref<?x768xf32>
-// CHECK-NEXT: memref.store %23, %arg2[%7, %10, %14] : memref<?x128x768xf32>
-// CHECK-NEXT: %24 = "neura.add"(%14, %1) : (index, index) -> index
-// CHECK-NEXT: %25 = builtin.unrealized_conversion_cast %24 : index to i64
-// CHECK-NEXT: llvm.br ^bb3(%25 : i64)
+// CHECK-NEXT:  %21 = "neura.sel"(%5, %18, %20) : (i32, i32, i1) -> i32
+// CHECK-NEXT:  %22 = "neura.cast"(%21) <{cast_type = "int_to_index"}> : (i32) -> index
+// CHECK-NEXT:  %23 = neura.load_indexed %arg1[%22, %14 : index, index] memref<?x768xf32> : f32
+// CHECK-NEXT:  neura.store_indexed %23 to %arg2[%7, %10, %14 : index, index, index] memref<?x128x768xf32> : f32
+// CHECK-NEXT:  %24 = "neura.add"(%14, %1) : (index, index) -> index
+// CHECK-NEXT:  %25 = "neura.cast"(%24) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:  neura.br %25 : i64 to ^bb3
 // CHECK-NEXT: ^bb9:  // pred: ^bb3
-// CHECK-NEXT: %26 = "neura.add"(%10, %1) : (index, index) -> index
-// CHECK-NEXT: %27 = builtin.unrealized_conversion_cast %26 : index to i64
-// CHECK-NEXT: llvm.br ^bb1(%27 : i64)
+// CHECK-NEXT:  %26 = "neura.add"(%10, %1) : (index, index) -> index
+// CHECK-NEXT:  %27 = "neura.cast"(%26) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:  neura.br %27 : i64 to ^bb1
 // CHECK-NEXT: ^bb10:  // pred: ^bb1
-// CHECK-NEXT: return
-// CHECK-NEXT: }
\ No newline at end of file
+// CHECK-NEXT:  "neura.return"() : () -> ()
+// CHECK-NEXT: }
diff --git a/test/affine2neura/bert/bert_node28/bert_node28.mlir b/test/affine2neura/bert/bert_node28/bert_node28.mlir
index 01f54a51..e93de764 100644
--- a/test/affine2neura/bert/bert_node28/bert_node28.mlir
+++ b/test/affine2neura/bert/bert_node28/bert_node28.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
-// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s
+// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura | FileCheck %s
 module attributes {} {
   func.func @_Z11bert_node28PA128_A768_KfPA768_S0_PA128_A768_f(%arg0: memref<?x128x768xf32>, %arg1: memref<?x768x768xf32>, %arg2: memref<?x128x768xf32>) attributes {} {
     affine.for %arg3 = 0 to 128 {
@@ -22,43 +22,43 @@ module attributes {} {
 // CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index
 // CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index
 // CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index
-// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64
-// CHECK-NEXT: llvm.br ^bb1(%4 : i64)
+// CHECK-NEXT: %4 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %4 : i64 to ^bb1
 // CHECK-NEXT: ^bb1(%5: i64):  // 2 preds: ^bb0, ^bb8
-// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index
+// CHECK-NEXT: %6 = "neura.cast"(%5) <{cast_type = "int_to_index"}> : (i64) -> index
 // CHECK-NEXT: %7 = "neura.icmp"(%6, %2) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb9
+// CHECK-NEXT: neura.cond_br %7 : i1 then to ^bb2 else to ^bb9
 // CHECK-NEXT: ^bb2:  // pred: ^bb1
-// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %3 : index to i64
-// CHECK-NEXT: llvm.br ^bb3(%8 : i64)
+// CHECK-NEXT: %8 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %8 : i64 to ^bb3
 // CHECK-NEXT: ^bb3(%9: i64):  // 2 preds: ^bb2, ^bb7
-// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index
+// CHECK-NEXT: %10 = "neura.cast"(%9) <{cast_type = "int_to_index"}> : (i64) -> index
 // CHECK-NEXT: %11 = "neura.icmp"(%10, %0) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: llvm.cond_br %11, ^bb4, ^bb8
+// CHECK-NEXT: neura.cond_br %11 : i1 then to ^bb4 else to ^bb8
 // CHECK-NEXT: ^bb4:  // pred: ^bb3
-// CHECK-NEXT: %12 = builtin.unrealized_conversion_cast %3 : index to i64
-// CHECK-NEXT: llvm.br ^bb5(%12 : i64)
+// CHECK-NEXT: %12 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %12 : i64 to ^bb5
 // CHECK-NEXT: ^bb5(%13: i64):  // 2 preds: ^bb4, ^bb6
-// CHECK-NEXT: %14 = builtin.unrealized_conversion_cast %13 : i64 to index
+// CHECK-NEXT: %14 = "neura.cast"(%13) <{cast_type = "int_to_index"}> : (i64) -> index
 // CHECK-NEXT: %15 = "neura.icmp"(%14, %0) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: llvm.cond_br %15, ^bb6, ^bb7
+// CHECK-NEXT: neura.cond_br %15 : i1 then to ^bb6 else to ^bb7
 // CHECK-NEXT: ^bb6:  // pred: ^bb5
-// CHECK-NEXT: %16 = memref.load %arg0[%3, %6, %14] : memref<?x128x768xf32>
-// CHECK-NEXT: %17 = memref.load %arg1[%3, %14, %10] : memref<?x768x768xf32>
-// CHECK-NEXT: %18 = memref.load %arg2[%3, %6, %10] : memref<?x128x768xf32>
+// CHECK-NEXT: %16 = neura.load_indexed %arg0[%3, %6, %14 : index, index, index] memref<?x128x768xf32> : f32
+// CHECK-NEXT: %17 = neura.load_indexed %arg1[%3, %14, %10 : index, index, index] memref<?x768x768xf32> : f32
+// CHECK-NEXT: %18 = neura.load_indexed %arg2[%3, %6, %10 : index, index, index] memref<?x128x768xf32> : f32
 // CHECK-NEXT: %19 = "neura.fmul"(%16, %17) : (f32, f32) -> f32
 // CHECK-NEXT: %20 = "neura.fadd"(%18, %19) : (f32, f32) -> f32
-// CHECK-NEXT: memref.store %20, %arg2[%3, %6, %10] : memref<?x128x768xf32>
+// CHECK-NEXT: neura.store_indexed %20 to %arg2[%3, %6, %10 : index, index, index] memref<?x128x768xf32> : f32
 // CHECK-NEXT: %21 = "neura.add"(%14, %1) : (index, index) -> index
-// CHECK-NEXT: %22 = builtin.unrealized_conversion_cast %21 : index to i64
-// CHECK-NEXT: llvm.br ^bb5(%22 : i64)
+// CHECK-NEXT: %22 = "neura.cast"(%21) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %22 : i64 to ^bb5
 // CHECK-NEXT: ^bb7:  // pred: ^bb5
 // CHECK-NEXT: %23 = "neura.add"(%10, %1) : (index, index) -> index
-// CHECK-NEXT: %24 = builtin.unrealized_conversion_cast %23 : index to i64
-// CHECK-NEXT: llvm.br ^bb3(%24 : i64)
+// CHECK-NEXT: %24 = "neura.cast"(%23) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %24 : i64 to ^bb3
 // CHECK-NEXT: ^bb8:  // pred: ^bb3
 // CHECK-NEXT: %25 = "neura.add"(%6, %1) : (index, index) -> index
-// CHECK-NEXT: %26 = builtin.unrealized_conversion_cast %25 : index to i64
-// CHECK-NEXT: llvm.br ^bb1(%26 : i64)
+// CHECK-NEXT: %26 = "neura.cast"(%25) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %26 : i64 to ^bb1
 // CHECK-NEXT: ^bb9:  // pred: ^bb1
-// CHECK-NEXT: return
\ No newline at end of file
+// CHECK-NEXT: "neura.return"() : () -> ()
diff --git a/test/affine2neura/bert/bert_node3/bert_node3.mlir b/test/affine2neura/bert/bert_node3/bert_node3.mlir
index 1c400deb..19d121e4 100644
--- a/test/affine2neura/bert/bert_node3/bert_node3.mlir
+++ b/test/affine2neura/bert/bert_node3/bert_node3.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
-// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s
+// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura | FileCheck %s
 module attributes {} {
   func.func @_Z10bert_node3PA128_A768_KfS2_PA128_A768_f(%arg0: memref<?x128x768xf32>, %arg1: memref<?x128x768xf32>, %arg2: memref<?x128x768xf32>) attributes {} {
     affine.for %arg3 = 0 to 128 {
@@ -19,30 +19,31 @@ module attributes {} {
 // CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index
 // CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index
 // CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index
-// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64
-// CHECK-NEXT: llvm.br ^bb1(%4 : i64)
+// CHECK-NEXT: %4 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %4 : i64 to ^bb1
 // CHECK-NEXT: ^bb1(%5: i64):  // 2 preds: ^bb0, ^bb5
-// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index
+// CHECK-NEXT: %6 = "neura.cast"(%5) <{cast_type = "int_to_index"}> : (i64) -> index
 // CHECK-NEXT: %7 = "neura.icmp"(%6, %2) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb6
+// CHECK-NEXT: neura.cond_br %7 : i1 then to ^bb2 else to ^bb6
 // CHECK-NEXT: ^bb2:  // pred: ^bb1
-// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %3 : index to i64
-// CHECK-NEXT: llvm.br ^bb3(%8 : i64)
+// CHECK-NEXT: %8 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %8 : i64 to ^bb3
 // CHECK-NEXT: ^bb3(%9: i64):  // 2 preds: ^bb2, ^bb4
-// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index
+// CHECK-NEXT: %10 = "neura.cast"(%9) <{cast_type = "int_to_index"}> : (i64) -> index
 // CHECK-NEXT: %11 = "neura.icmp"(%10, %0) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: llvm.cond_br %11, ^bb4, ^bb5
+// CHECK-NEXT: neura.cond_br %11 : i1 then to ^bb4 else to ^bb5
 // CHECK-NEXT: ^bb4:  // pred: ^bb3
-// CHECK-NEXT: %12 = memref.load %arg0[%3, %6, %10] : memref<?x128x768xf32>
-// CHECK-NEXT: %13 = memref.load %arg1[%3, %6, %10] : memref<?x128x768xf32>
+// CHECK-NEXT: %12 = neura.load_indexed %arg0[%3, %6, %10 : index, index, index] memref<?x128x768xf32> : f32
+// CHECK-NEXT: %13 = neura.load_indexed %arg1[%3, %6, %10 : index, index, index] memref<?x128x768xf32> : f32
 // CHECK-NEXT: %14 = "neura.fadd"(%12, %13) : (f32, f32) -> f32
-// CHECK-NEXT: memref.store %14, %arg2[%3, %6, %10] : memref<?x128x768xf32>
+// CHECK-NEXT: neura.store_indexed %14 to %arg2[%3, %6, %10 : index, index, index] memref<?x128x768xf32> : f32
 // CHECK-NEXT: %15 = "neura.add"(%10, %1) : (index, index) -> index
-// CHECK-NEXT: %16 = builtin.unrealized_conversion_cast %15 : index to i64
-// CHECK-NEXT: llvm.br ^bb3(%16 : i64)
+// CHECK-NEXT: %16 = "neura.cast"(%15) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %16 : i64 to ^bb3
 // CHECK-NEXT: ^bb5:  // pred: ^bb3
 // CHECK-NEXT: %17 = "neura.add"(%6, %1) : (index, index) -> index
-// CHECK-NEXT: %18 = builtin.unrealized_conversion_cast %17 : index to i64
-// CHECK-NEXT: llvm.br ^bb1(%18 : i64)
+// CHECK-NEXT: %18 = "neura.cast"(%17) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %18 : i64 to ^bb1
 // CHECK-NEXT: ^bb6:  // pred: ^bb1
-// CHECK-NEXT: return
+// CHECK-NEXT: "neura.return"() : () -> ()
+// CHECK-NEXT: }
\ No newline at end of file
diff --git a/test/affine2neura/bert/bert_node8/bert_node8.mlir b/test/affine2neura/bert/bert_node8/bert_node8.mlir
index dbb59d40..b0cb6345 100644
--- a/test/affine2neura/bert/bert_node8/bert_node8.mlir
+++ b/test/affine2neura/bert/bert_node8/bert_node8.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
-// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s
+// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura | FileCheck %s
 module attributes {} {
   func.func @_Z10bert_node8PA128_A1_KfPA128_A1_f(%arg0: memref<?x128x1xf32>, %arg1: memref<?x128x1xf32>) attributes {} {
     %cst = arith.constant 7.680000e+02 : f32
@@ -17,18 +17,19 @@ module attributes {} {
 // CHECK-NEXT: %1 = "neura.constant"() <{value = 128 : index}> : () -> index
 // CHECK-NEXT: %2 = "neura.constant"() <{value = 7.680000e+02 : f32}> : () -> f32
 // CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index
-// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64
-// CHECK-NEXT: llvm.br ^bb1(%4 : i64)
+// CHECK-NEXT: %4 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %4 : i64 to ^bb1
 // CHECK-NEXT: ^bb1(%5: i64):  // 2 preds: ^bb0, ^bb2
-// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index
+// CHECK-NEXT: %6 = "neura.cast"(%5) <{cast_type = "int_to_index"}> : (i64) -> index
 // CHECK-NEXT: %7 = "neura.icmp"(%6, %1) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb3
+// CHECK-NEXT: neura.cond_br %7 : i1 then to ^bb2 else to ^bb3
 // CHECK-NEXT: ^bb2:  // pred: ^bb1
-// CHECK-NEXT: %8 = memref.load %arg0[%3, %6, %3] : memref<?x128x1xf32>
+// CHECK-NEXT: %8 = neura.load_indexed %arg0[%3, %6, %3 : index, index, index] memref<?x128x1xf32> : f32
 // CHECK-NEXT: %9 = "neura.fdiv"(%8, %2) : (f32, f32) -> f32
-// CHECK-NEXT: memref.store %9, %arg1[%3, %6, %3] : memref<?x128x1xf32>
+// CHECK-NEXT: neura.store_indexed %9 to %arg1[%3, %6, %3 : index, index, index] memref<?x128x1xf32> : f32
 // CHECK-NEXT: %10 = "neura.add"(%6, %0) : (index, index) -> index
-// CHECK-NEXT: %11 = builtin.unrealized_conversion_cast %10 : index to i64
-// CHECK-NEXT: llvm.br ^bb1(%11 : i64)
+// CHECK-NEXT: %11 = "neura.cast"(%10) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %11 : i64 to ^bb1
 // CHECK-NEXT: ^bb3:  // pred: ^bb1
-// CHECK-NEXT: return
+// CHECK-NEXT: "neura.return"() : () -> ()
+// CHECK-NEXT: }
diff --git a/test/affine2neura/bert/bert_node9/bert_node9.mlir b/test/affine2neura/bert/bert_node9/bert_node9.mlir
index 3641e16b..333589ab 100644
--- a/test/affine2neura/bert/bert_node9/bert_node9.mlir
+++ b/test/affine2neura/bert/bert_node9/bert_node9.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
-// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s
+// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura | FileCheck %s
 module attributes {} {
   func.func @_Z10bert_node9PA128_A768_KfPA128_A768_d(%arg0: memref<?x128x768xf32>, %arg1: memref<?x128x768xf64>) attributes {} {
     affine.for %arg2 = 0 to 128 {
@@ -19,29 +19,30 @@ module attributes {} {
 // CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index
 // CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index
 // CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index
-// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64
-// CHECK-NEXT: llvm.br ^bb1(%4 : i64)
+// CHECK-NEXT: %4 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %4 : i64 to ^bb1
 // CHECK-NEXT: ^bb1(%5: i64):  // 2 preds: ^bb0, ^bb5
-// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index
+// CHECK-NEXT: %6 = "neura.cast"(%5) <{cast_type = "int_to_index"}> : (i64) -> index
 // CHECK-NEXT: %7 = "neura.icmp"(%6, %2) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb6
+// CHECK-NEXT: neura.cond_br %7 : i1 then to ^bb2 else to ^bb6
 // CHECK-NEXT: ^bb2:  // pred: ^bb1
-// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %3 : index to i64
-// CHECK-NEXT: llvm.br ^bb3(%8 : i64)
+// CHECK-NEXT: %8 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %8 : i64 to ^bb3
 // CHECK-NEXT: ^bb3(%9: i64):  // 2 preds: ^bb2, ^bb4
-// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index
+// CHECK-NEXT: %10 = "neura.cast"(%9) <{cast_type = "int_to_index"}> : (i64) -> index
 // CHECK-NEXT: %11 = "neura.icmp"(%10, %0) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: llvm.cond_br %11, ^bb4, ^bb5
+// CHECK-NEXT: neura.cond_br %11 : i1 then to ^bb4 else to ^bb5
 // CHECK-NEXT: ^bb4:  // pred: ^bb3
-// CHECK-NEXT: %12 = memref.load %arg0[%3, %6, %10] : memref<?x128x768xf32>
+// CHECK-NEXT: %12 = neura.load_indexed %arg0[%3, %6, %10 : index, index, index] memref<?x128x768xf32> : f32
 // CHECK-NEXT: %13 = "neura.cast"(%12) <{cast_type = "extf"}> : (f32) -> f64
-// CHECK-NEXT: memref.store %13, %arg1[%3, %6, %10] : memref<?x128x768xf64>
+// CHECK-NEXT: neura.store_indexed %13 to %arg1[%3, %6, %10 : index, index, index] memref<?x128x768xf64> : f64
 // CHECK-NEXT: %14 = "neura.add"(%10, %1) : (index, index) -> index
-// CHECK-NEXT: %15 = builtin.unrealized_conversion_cast %14 : index to i64
-// CHECK-NEXT: llvm.br ^bb3(%15 : i64)
+// CHECK-NEXT: %15 = "neura.cast"(%14) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %15 : i64 to ^bb3
 // CHECK-NEXT: ^bb5:  // pred: ^bb3
 // CHECK-NEXT: %16 = "neura.add"(%6, %1) : (index, index) -> index
-// CHECK-NEXT: %17 = builtin.unrealized_conversion_cast %16 : index to i64
-// CHECK-NEXT: llvm.br ^bb1(%17 : i64)
+// CHECK-NEXT: %17 = "neura.cast"(%16) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT: neura.br %17 : i64 to ^bb1
 // CHECK-NEXT: ^bb6:  // pred: ^bb1
-// CHECK-NEXT: return
+// CHECK-NEXT: "neura.return"() : () -> ()
+// CHECK-NEXT: }
diff --git a/test/arith2neura/add.mlir b/test/arith2neura/add.mlir
index fd06d201..71e1a995 100644
--- a/test/arith2neura/add.mlir
+++ b/test/arith2neura/add.mlir
@@ -1,5 +1,5 @@
 // RUN: neura-compiler --neura-conversion %s | FileCheck %s --check-prefix=COMPILER
-// RUN: mlir-neura-opt --lower-arith-to-neura %s | FileCheck %s --check-prefix=OPT
+// RUN: mlir-neura-opt --assign-accelerator --lower-arith-to-neura %s | FileCheck %s --check-prefix=OPT
 
 func.func @test(%a: f32) -> f32 {
   %b = arith.constant 2.0 : f32
diff --git a/test/neura/arith_add.mlir b/test/neura/arith_add.mlir
index 86ecefa7..40ee8fe9 100644
--- a/test/neura/arith_add.mlir
+++ b/test/neura/arith_add.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-neura-opt --lower-arith-to-neura --insert-data-mov %s | FileCheck %s
+// RUN: mlir-neura-opt --assign-accelerator --lower-arith-to-neura --insert-data-mov %s | FileCheck %s
 
 func.func @test(%a: f32) -> f32 {
   %b = arith.constant 2.0 : f32
diff --git a/test/neura/ctrl/branch_without_arg.mlir b/test/neura/ctrl/branch_without_arg.mlir
index 385fe20f..131753d4 100644
--- a/test/neura/ctrl/branch_without_arg.mlir
+++ b/test/neura/ctrl/branch_without_arg.mlir
@@ -39,7 +39,7 @@ func.func @test(%in: i64) -> f32 {
 // CHECK-NEXT:   %3 = "neura.constant"() <{predicate = true, value = 3.000000e+00 : f32}> : () -> !neura.data<f32, i1>
 // CHECK-NEXT:   %4 = "neura.constant"() <{predicate = true, value = 4.000000e+00 : f32}> : () -> !neura.data<f32, i1>
 // CHECK-NEXT:   %5 = "neura.icmp"(%arg0, %0) <{cmpType = "eq"}> : (i64, !neura.data<i64, i1>) -> !neura.data<i1, i1>
-// CHECK-NEXT:   neura.cond_br %5 : !neura.data<i1, i1> then %3, %4 : !neura.data<f32, i1>, !neura.data<f32, i1> to ^bb2 else :  to ^bb1
+// CHECK-NEXT:   neura.cond_br %5 : !neura.data<i1, i1> then %3, %4 : !neura.data<f32, i1>, !neura.data<f32, i1> to ^bb2 else to ^bb1
 // CHECK-NEXT: ^bb1:  // pred: ^bb0
 // CHECK-NEXT:   %6 = "neura.fadd"(%1, %2) : (!neura.data<f32, i1>, !neura.data<f32, i1>) -> !neura.data<f32, i1>
 // CHECK-NEXT:   neura.br %6 : !neura.data<f32, i1> to ^bb3
diff --git a/test/neura/fadd_fadd.mlir b/test/neura/fadd_fadd.mlir
index da1aef44..87ca3f8e 100644
--- a/test/neura/fadd_fadd.mlir
+++ b/test/neura/fadd_fadd.mlir
@@ -1,5 +1,5 @@
 // Applies pattern fusion before mov insertion.
-// RUN: mlir-neura-opt --lower-arith-to-neura --fuse-patterns --insert-data-mov %s | FileCheck %s
+// RUN: mlir-neura-opt --assign-accelerator --lower-arith-to-neura --fuse-patterns --insert-data-mov %s | FileCheck %s
 
 func.func @test(%a: f32, %b: f32) -> f32 {
   %c = arith.constant 2.0 : f32
diff --git a/test/samples/bert/bert_affine.mlir b/test/samples/bert/bert_affine.mlir
index e47b9f88..cd20b0b3 100644
--- a/test/samples/bert/bert_affine.mlir
+++ b/test/samples/bert/bert_affine.mlir
@@ -89,6 +89,7 @@ module {
     %34 = bufferization.to_memref %cst_0 : memref<768xf32>
     %35 = bufferization.to_memref %cst : memref<f64>
     %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x128xi1>
+    // Node0
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         %88 = affine.load %2[%c0, %arg4] : memref<1x128xi64>
@@ -100,6 +101,7 @@ module {
     %expanded = tensor.expand_shape %36 [[0, 1], [2, 3, 4, 5]] : tensor<1x128xi1> into tensor<1x1x1x1x1x128xi1>
     %37 = bufferization.to_memref %expanded : memref<1x1x1x1x1x128xi1>
     %alloc_47 = memref.alloc() {alignment = 64 : i64} : memref<1x1x128x1x1x128xi1>
+    // Node1
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 1 {
         affine.for %arg5 = 0 to 128 {
@@ -119,6 +121,7 @@ module {
     %expanded_48 = tensor.expand_shape %collapsed [[0], [1, 2], [3]] : tensor<1x128x128xi1> into tensor<1x1x128x128xi1>
     %39 = bufferization.to_memref %expanded_48 : memref<1x1x128x128xi1>
     %alloc_49 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32>
+    // Node2
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -136,6 +139,7 @@ module {
     %extracted_slice = tensor.extract_slice %arg0[0, 0, 0] [1, 128, 768] [1, 1, 1] : tensor<1x512x768xf32> to tensor<1x128x768xf32>
     %40 = bufferization.to_memref %extracted_slice : memref<1x128x768xf32>
     %alloc_50 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32>
+    // Node3
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -147,6 +151,7 @@ module {
       }
     }
     %alloc_51 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32>
+    // Node4
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -162,6 +167,7 @@ module {
       }
     }
     %alloc_52 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32>
+    // Node5
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -173,6 +179,7 @@ module {
       }
     }
     %alloc_53 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32>
+    // Node6
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 1 {
@@ -182,6 +189,7 @@ module {
     }
     %alloc_54 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32>
     memref.copy %alloc_53, %alloc_54 : memref<1x128x1xf32> to memref<1x128x1xf32>
+    // Node7
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -193,6 +201,7 @@ module {
       }
     }
     %alloc_55 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32>
+    // Node8
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 1 {
@@ -203,6 +212,7 @@ module {
       }
     }
     %alloc_56 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64>
+    // Node9
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -213,6 +223,7 @@ module {
       }
     }
     %alloc_57 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64>
+    // Node10
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 1 {
@@ -222,6 +233,7 @@ module {
     }
     %alloc_58 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64>
     memref.copy %alloc_57, %alloc_58 : memref<1x128x1xf64> to memref<1x128x1xf64>
+    // Node11
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -233,6 +245,7 @@ module {
       }
     }
     %alloc_59 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64>
+    // Node12
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 1 {
@@ -243,6 +256,7 @@ module {
       }
     }
     %alloc_60 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64>
+    // Node13
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -254,6 +268,7 @@ module {
       }
     }
     %alloc_61 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf64>
+    // Node14
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -266,6 +281,7 @@ module {
     }
     %alloc_62 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64>
     memref.copy %alloc_57, %alloc_62 : memref<1x128x1xf64> to memref<1x128x1xf64>
+    // Node15
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -277,6 +293,7 @@ module {
       }
     }
     %alloc_63 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf64>
+    // Node16
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 1 {
@@ -287,6 +304,7 @@ module {
       }
     }
     %alloc_64 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32>
+    // Node17
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 1 {
@@ -297,6 +315,7 @@ module {
       }
     }
     %alloc_65 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32>
+    // Node18
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 1 {
@@ -307,6 +326,7 @@ module {
       }
     }
     %alloc_66 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32>
+    // Node19
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -318,6 +338,7 @@ module {
       }
     }
     %alloc_67 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32>
+    // Node20
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -329,6 +350,7 @@ module {
       }
     }
     %alloc_68 = memref.alloc() {alignment = 64 : i64} : memref<1x128x1xf32>
+    // Node21
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 1 {
@@ -340,6 +362,7 @@ module {
       }
     }
     %alloc_69 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32>
+    // Node22
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -351,6 +374,7 @@ module {
       }
     }
     %alloc_70 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32>
+    // Node23
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -362,6 +386,7 @@ module {
       }
     }
     %alloc_71 = memref.alloc() {alignment = 64 : i64} : memref<768x768xf32>
+    // Node24
     affine.for %arg3 = 0 to 768 {
       affine.for %arg4 = 0 to 768 {
         %88 = affine.load %5[%arg3, %arg4] : memref<768x768xf32>
@@ -369,6 +394,7 @@ module {
       }
     }
     %alloc_72 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32>
+    // Node25
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -378,6 +404,7 @@ module {
       }
     }
     %alloc_73 = memref.alloc() {alignment = 64 : i64} : memref<1x768x768xf32>
+    // Node26
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 768 {
         affine.for %arg5 = 0 to 768 {
@@ -387,6 +414,7 @@ module {
       }
     }
     %alloc_74 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32>
+    // Node27
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -396,6 +424,7 @@ module {
     }
     %alloc_75 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32>
     memref.copy %alloc_74, %alloc_75 : memref<1x128x768xf32> to memref<1x128x768xf32>
+    // Node28
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -411,6 +440,7 @@ module {
       }
     }
     %alloc_76 = memref.alloc() {alignment = 64 : i64} : memref<1x128x768xf32>
+    // Node29
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 768 {
@@ -425,6 +455,7 @@ module {
     %expanded_77 = tensor.expand_shape %41 [[0], [1], [2, 3]] : tensor<1x128x768xf32> into tensor<1x128x12x64xf32>
     %42 = bufferization.to_memref %expanded_77 : memref<1x128x12x64xf32>
     %alloc_78 = memref.alloc() {alignment = 64 : i64} : memref<1x12x128x64xf32>
+    // Node30
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 128 {
         affine.for %arg5 = 0 to 12 {