Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/NeuraDialect/NeuraPasses.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ std::unique_ptr<mlir::Pass> createMapToAcceleratorPass();
std::unique_ptr<mlir::Pass> createGenerateCodePass();
std::unique_ptr<mlir::Pass> createFuseControlFlowPass();
std::unique_ptr<mlir::Pass> createCanonicalizeLiveInPass();
std::unique_ptr<mlir::Pass> createCanonicalizeCastPass();

#define GEN_PASS_REGISTRATION
#include "NeuraDialect/NeuraPasses.h.inc"
Expand Down
11 changes: 11 additions & 0 deletions include/NeuraDialect/NeuraPasses.td
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,15 @@ def CanonicalizeLiveIn : Pass<"canonicalize-live-in", "ModuleOp"> {
let constructor = "neura::createCanonicalizeLiveInPass()";
}

def CanonicalizeCast : Pass<"canonicalize-cast", "ModuleOp"> {
  let summary = "Canonicalizes cast operations in the Neura dialect";
  let description = [{
    This pass applies canonicalization transformations to neura::cast operations.
    The canonicalization includes:
      1. Rewriting index-typed values (block arguments, op results, and
         integer constant attributes) to i64.
      2. Removing index<->i64 casts that become i64->i64 no-ops after the
         rewrite, and retyping index<->i32 casts to i64<->i32 casts.
  }];
  let constructor = "neura::createCanonicalizeCastPass()";
}

#endif // NEURA_PASSES_TD
1 change: 1 addition & 0 deletions lib/NeuraDialect/Transforms/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ add_mlir_library(
GenerateCodePass.cpp
FuseControlFlowPass.cpp
CanonicalizeLiveInPass.cpp
CanonicalizeCastPass.cpp

DEPENDS
MLIRNeuraTransformsIncGen
Expand Down
144 changes: 144 additions & 0 deletions lib/NeuraDialect/Transforms/CanonicalizeCastPass.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#include "NeuraDialect/NeuraOps.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Block.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/Region.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/LLVM.h"

using namespace mlir;

namespace {

// Canonicalizes casts in `region` by rewriting every index-typed value to
// i64 and then simplifying the neura::cast operations that the rewrite
// makes redundant. Always returns success(); the LogicalResult is kept so
// callers can propagate failures if future rewrites become fallible.
//
// NOTE(review): only the block arguments of `region`'s own blocks are
// retyped; block arguments of blocks nested inside sub-regions are not
// touched — confirm whether nested regions can occur here after lowering.
LogicalResult canonicalizeCast(Region &region) {
  // Retypes index block arguments to i64.
  for (Block &block : region.getBlocks()) {
    for (BlockArgument arg : block.getArguments()) {
      if (arg.getType().isIndex()) {
        arg.setType(IntegerType::get(arg.getContext(), 64));
      }
    }
  }

  region.walk([&](Operation *op) {
    // Rewrites index-typed integer `value` attributes on neura::ConstantOp
    // to i64 so the attribute type matches the retyped result below.
    // (Unlike before, a constant without a `value` attribute no longer
    // skips the generic result-type rewrite.)
    if (isa<neura::ConstantOp>(op)) {
      if (Attribute value_attr = op->getAttr("value")) {
        if (IntegerAttr int_attr = dyn_cast<IntegerAttr>(value_attr)) {
          if (isa<IndexType>(op->getResult(0).getType())) {
            IntegerAttr new_attr = IntegerAttr::get(
                IntegerType::get(op->getContext(), 64), int_attr.getInt());
            op->setAttr("value", new_attr);
          }
        }
      }
    }

    // Replaces all index-typed results with i64. Ops precede their users
    // within a block, so producers are retyped before casts are inspected.
    for (OpResult result : op->getOpResults()) {
      if (isa<IndexType>(result.getType())) {
        result.setType(IntegerType::get(op->getContext(), 64));
      }
    }

    neura::CastOp cast_op = dyn_cast<neura::CastOp>(op);
    if (!cast_op) {
      return;
    }
    StringAttr cast_type_attr = cast_op->getAttrOfType<StringAttr>("cast_type");
    if (!cast_type_attr) {
      return;
    }
    StringRef cast_type = cast_type_attr.getValue();

    IntegerType src_int =
        dyn_cast<IntegerType>(cast_op->getOperand(0).getType());
    IntegerType dst_int =
        dyn_cast<IntegerType>(cast_op->getResult(0).getType());

    // Removes index<->i64 casts: after the rewrite above both sides are
    // i64, so the cast is a no-op. Erasing the op being visited is safe in
    // a post-order walk.
    bool is_index_cast =
        cast_type == "index_to_int" || cast_type == "int_to_index";
    if (is_index_cast && src_int && dst_int && src_int.getWidth() == 64 &&
        dst_int.getWidth() == 64) {
      cast_op->getResult(0).replaceAllUsesWith(cast_op->getOperand(0));
      cast_op->erase();
      return;
    }

    // Retypes index->i32 casts to i64->i32 (index is now i64).
    if (cast_type == "index_to_int" && dst_int && dst_int.getWidth() == 32) {
      cast_op->setAttr("cast_type",
                       StringAttr::get(op->getContext(), "i64_to_i32"));
      return;
    }
    // Retypes i32->index casts to i32->i64.
    if (cast_type == "int_to_index" && src_int && src_int.getWidth() == 32) {
      cast_op->setAttr("cast_type",
                       StringAttr::get(op->getContext(), "i32_to_i64"));
      return;
    }
    // TODO: Handles other cast types if needed.
  });
  return success();
}

// Module pass that runs cast canonicalization on every function body that
// is tagged with the "neura" accelerator attribute.
struct CanonicalizeCastPass
    : public PassWrapper<CanonicalizeCastPass, OperationPass<ModuleOp>> {
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CanonicalizeCastPass)
  StringRef getArgument() const override { return "canonicalize-cast"; }
  StringRef getDescription() const override {
    return "Canonicalizes cast operations in the Neura dialect, specifically "
           "removing unnecessary index to i64 casts and vice versa.";
  }

  void runOnOperation() override {
    ModuleOp module_op = getOperation();

    // True when the op carries accelerator = "neura".
    auto targets_neura = [](Operation *op) {
      auto accel_attr = op->getAttrOfType<StringAttr>("accelerator");
      return accel_attr && accel_attr.getValue() == "neura";
    };

    module_op.walk([&](Operation *op) {
      // Only func.func and llvm.func bodies are canonicalized.
      Region *body = nullptr;
      if (auto func_op = dyn_cast<func::FuncOp>(op)) {
        if (targets_neura(op)) {
          body = &func_op.getBody();
        }
      } else if (auto llvm_func = dyn_cast<LLVM::LLVMFuncOp>(op)) {
        if (targets_neura(op)) {
          body = &llvm_func.getBody();
        }
      }
      // Skips non-function ops, untagged functions, and declarations.
      if (!body || body->empty()) {
        return;
      }
      if (failed(canonicalizeCast(*body))) {
        signalPassFailure();
      }
    });
  }
};
} // namespace

namespace mlir::neura {
/// Creates the --canonicalize-cast pass; declared in NeuraPasses.h and
/// referenced by the generated pass registration.
std::unique_ptr<mlir::Pass> createCanonicalizeCastPass() {
return std::make_unique<CanonicalizeCastPass>();
}
} // namespace mlir::neura
26 changes: 26 additions & 0 deletions test/controflow_fuse/perfect_nested/perfect_nested.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura | FileCheck %s
// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura --canonicalize-cast | FileCheck %s --check-prefix=CAST
// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura --leverage-predicated-value --transform-ctrl-to-data-flow | FileCheck %s -check-prefix=CTRL2DATA

module attributes {} {
Expand Down Expand Up @@ -45,6 +46,31 @@ module attributes {} {
// CHECK-NEXT: "neura.return"() : () -> ()
// CHECK-NEXT: }

// CAST: func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
// CAST-NEXT: %0 = "neura.constant"() <{predicate = true, value = 1 : i64}> : () -> i64
// CAST-NEXT: %1 = "neura.constant"() <{predicate = true, value = 128 : i64}> : () -> i64
// CAST-NEXT: %2 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> i64
// CAST-NEXT: neura.br %2 : i64 to ^bb1
// CAST-NEXT: ^bb1(%3: i64): // 2 preds: ^bb0, ^bb5
// CAST-NEXT: %4 = "neura.icmp"(%3, %1) <{cmpType = "slt"}> : (i64, i64) -> i1
// CAST-NEXT: neura.cond_br %4 : i1 then to ^bb2 else to ^bb6
// CAST-NEXT: ^bb2: // pred: ^bb1
// CAST-NEXT: neura.br %2 : i64 to ^bb3
// CAST-NEXT: ^bb3(%5: i64): // 2 preds: ^bb2, ^bb4
// CAST-NEXT: %6 = "neura.icmp"(%5, %1) <{cmpType = "slt"}> : (i64, i64) -> i1
// CAST-NEXT: neura.cond_br %6 : i1 then to ^bb4 else to ^bb5
// CAST-NEXT: ^bb4: // pred: ^bb3
// CAST-NEXT: %7 = neura.load_indexed %arg0[%2, %2, %2, %2, %2, %5 : i64, i64, i64, i64, i64, i64] memref<?x1x1x1x1x128xi8> : i8
// CAST-NEXT: neura.store_indexed %7 to %arg1[%2, %2, %3, %2, %2, %5 : i64, i64, i64, i64, i64, i64] memref<?x1x128x1x1x128xi8> : i8
// CAST-NEXT: %8 = "neura.add"(%5, %0) : (i64, i64) -> i64
// CAST-NEXT: neura.br %8 : i64 to ^bb3
// CAST-NEXT: ^bb5: // pred: ^bb3
// CAST-NEXT: %9 = "neura.add"(%3, %0) : (i64, i64) -> i64
// CAST-NEXT: neura.br %9 : i64 to ^bb1
// CAST-NEXT: ^bb6: // pred: ^bb1
// CAST-NEXT: "neura.return"() : () -> ()
// CAST-NEXT: }

// CTRL2DATA: func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
// CTRL2DATA-NEXT: %0 = "neura.constant"() <{predicate = true, value = 1 : index}> : () -> !neura.data<index, i1>
// CTRL2DATA-NEXT: %1 = "neura.grant_always"(%0) : (!neura.data<index, i1>) -> !neura.data<index, i1>
Expand Down
27 changes: 27 additions & 0 deletions test/controflow_fuse/perfect_reduction/perfect_reduction.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura | FileCheck %s
// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura --canonicalize-cast | FileCheck %s --check-prefix=CAST
// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura --leverage-predicated-value --transform-ctrl-to-data-flow | FileCheck %s -check-prefix=CTRL2DATA

module attributes {} {
Expand Down Expand Up @@ -50,6 +51,32 @@ module attributes {} {
// CHECK-NEXT: "neura.return"(%6) : (i32) -> ()
// CHECK-NEXT: }

// CAST: func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref<?x128xi32>) -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
// CAST-NEXT: %0 = "neura.constant"() <{predicate = true, value = 1 : i64}> : () -> i64
// CAST-NEXT: %1 = "neura.constant"() <{predicate = true, value = 128 : i64}> : () -> i64
// CAST-NEXT: %2 = "neura.constant"() <{predicate = true, value = 0 : i32}> : () -> i32
// CAST-NEXT: %3 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> i64
// CAST-NEXT: neura.br %3, %2 : i64, i32 to ^bb1
// CAST-NEXT: ^bb1(%4: i64, %5: i32): // 2 preds: ^bb0, ^bb5
// CAST-NEXT: %6 = "neura.icmp"(%4, %1) <{cmpType = "slt"}> : (i64, i64) -> i1
// CAST-NEXT: neura.cond_br %6 : i1 then to ^bb2 else to ^bb6
// CAST-NEXT: ^bb2: // pred: ^bb1
// CAST-NEXT: neura.br %3, %5 : i64, i32 to ^bb3
// CAST-NEXT: ^bb3(%7: i64, %8: i32): // 2 preds: ^bb2, ^bb4
// CAST-NEXT: %9 = "neura.icmp"(%7, %1) <{cmpType = "slt"}> : (i64, i64) -> i1
// CAST-NEXT: neura.cond_br %9 : i1 then to ^bb4 else to ^bb5
// CAST-NEXT: ^bb4: // pred: ^bb3
// CAST-NEXT: %10 = neura.load_indexed %arg0[%4, %7 : i64, i64] memref<?x128xi32> : i32
// CAST-NEXT: %11 = "neura.add"(%8, %10) : (i32, i32) -> i32
// CAST-NEXT: %12 = "neura.add"(%7, %0) : (i64, i64) -> i64
// CAST-NEXT: neura.br %12, %11 : i64, i32 to ^bb3
// CAST-NEXT: ^bb5: // pred: ^bb3
// CAST-NEXT: %13 = "neura.add"(%4, %0) : (i64, i64) -> i64
// CAST-NEXT: neura.br %13, %8 : i64, i32 to ^bb1
// CAST-NEXT: ^bb6: // pred: ^bb1
// CAST-NEXT: "neura.return"(%5) : (i32) -> ()
// CAST-NEXT: }

// CTRL2DATA: func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref<?x128xi32>) -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
// CTRL2DATA-NEXT: %0 = "neura.constant"() <{predicate = true, value = 1 : index}> : () -> !neura.data<index, i1>
// CTRL2DATA-NEXT: %1 = "neura.grant_always"(%0) : (!neura.data<index, i1>) -> !neura.data<index, i1>
Expand Down
18 changes: 18 additions & 0 deletions test/controflow_fuse/simpleloop/simpleloop.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura | FileCheck %s
// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura --canonicalize-cast | FileCheck %s --check-prefix=CAST
// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-memref-to-neura --lower-builtin-to-neura --lower-llvm-to-neura --leverage-predicated-value --transform-ctrl-to-data-flow | FileCheck %s -check-prefix=CTRL2DATA

module attributes {} {
Expand Down Expand Up @@ -35,6 +36,23 @@ module attributes {} {
// CHECK-NEXT: "neura.return"(%6) : (i32) -> ()
// CHECK-NEXT: }

// CAST: func.func @_Z10simpleloopv() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
// CAST-NEXT: %0 = "neura.constant"() <{predicate = true, value = 1 : i64}> : () -> i64
// CAST-NEXT: %1 = "neura.constant"() <{predicate = true, value = 128 : i64}> : () -> i64
// CAST-NEXT: %2 = "neura.constant"() <{predicate = true, value = 0 : i32}> : () -> i32
// CAST-NEXT: %3 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> i64
// CAST-NEXT: neura.br %3, %2 : i64, i32 to ^bb1
// CAST-NEXT: ^bb1(%4: i64, %5: i32): // 2 preds: ^bb0, ^bb2
// CAST-NEXT: %6 = "neura.icmp"(%4, %1) <{cmpType = "slt"}> : (i64, i64) -> i1
// CAST-NEXT: neura.cond_br %6 : i1 then to ^bb2 else to ^bb3
// CAST-NEXT: ^bb2: // pred: ^bb1
// CAST-NEXT: %7 = "neura.cast"(%4) <{cast_type = "i64_to_i32"}> : (i64) -> i32
// CAST-NEXT: %8 = "neura.add"(%5, %7) : (i32, i32) -> i32
// CAST-NEXT: %9 = "neura.add"(%4, %0) : (i64, i64) -> i64
// CAST-NEXT: neura.br %9, %8 : i64, i32 to ^bb1
// CAST-NEXT: ^bb3: // pred: ^bb1
// CAST-NEXT: "neura.return"(%5) : (i32) -> ()
// CAST-NEXT: }

// CTRL2DATA: func.func @_Z10simpleloopv() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
// CTRL2DATA-NEXT: %0 = "neura.constant"() <{predicate = true, value = 1 : index}> : () -> !neura.data<index, i1>
Expand Down