Fix LLVM fdiv and fptosi to Neura conversion and reorganize benchmarks #162
Merged
Commits (27)
3b19db4 Add histogram testbench files (n0thingNoob)
6da8b5c Fix conversion for llvm.fdiv and llvm.fptosi, add e2e/histogram kerne… (n0thingNoob)
40aa68f Delete test/testbench/histogram/histogram.cpp (n0thingNoob)
4b9f1c6 Delete test/testbench/histogram/histogram_kernel_neura.mlir (n0thingNoob)
3cd3901 Update test/e2e/histogram/histogram_kernel.mlir (n0thingNoob)
cdb66da Delete test/testbench/histogram/histogram_kernel.cpp (n0thingNoob)
d694f69 Delete test/testbench/histogram/histogram_kernel.ll (n0thingNoob)
1b854f3 Delete test/testbench/histogram/histogram_kernel.mlir (n0thingNoob)
4fb939a Clean up .gitmodules by removing duplicates (n0thingNoob)
71c28c3 add fir and modify LlvmToNeuraPass.cpp for llvm.fmuladd conversion (n0thingNoob)
180c3ef Add FIR kernel support and llvm.fmuladd conversion (n0thingNoob)
f70a11e Merge remote-tracking branch 'origin/testbench' (n0thingNoob)
411b066 Clean up repository: remove temporary and generated files (n0thingNoob)
9a846b1 Fix Neura_OrOp type definition to support neura.data types (n0thingNoob)
e73bf40 Remove FFT and fusion test files (n0thingNoob)
10dfd4b remove histogram.cpp (n0thingNoob)
e59f4de remove ll file (n0thingNoob)
5a4c2ff rm testbench folder (n0thingNoob)
096359f backup for fir kernel and histogram kernel (n0thingNoob)
db4e012 Use llvm extract to extract kernel from benchmarks (n0thingNoob)
ebcf014 unify the kernel name in the llvm extract command (n0thingNoob)
d4314ae add issue link to mlir file (n0thingNoob)
a83fe7f Fix GitHub CI: Add LLVM tools to PATH for llvm-extract (n0thingNoob)
811a674 Add TODO, remove redundant file (n0thingNoob)
6f683fa rm extra file (n0thingNoob)
bf45e98 upload gitignore and remove the unnecessary mlir.llvm in test (n0thingNoob)
3176f6f rm adding the build/bin to PATH (n0thingNoob)
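The headline fix is the lowering of llvm.fdiv and llvm.fptosi in LlvmToNeuraPass.cpp. A minimal before/after sketch of the intended mapping follows; the op names neura.fdiv and neura.cast come from the FileCheck lines in the histogram test below, while the generic-op syntax, the cast_type attribute, and the value names are illustrative assumptions:

```mlir
// Before (LLVM dialect, as it appears in the histogram kernel below;
// value names are illustrative):
%q = llvm.fdiv %num, %den : f32
%i = llvm.fptosi %q : f32 to i32

// After (hedged sketch of what the pass is expected to emit; only the
// op names are confirmed by the tests, the attribute is an assumption):
%q2 = "neura.fdiv"(%num, %den) : (f32, f32) -> f32
%i2 = "neura.cast"(%q2) {cast_type = "fptosi"} : (f32) -> i32
```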
.gitmodules
@@ -1,3 +1,3 @@
-[submodule "test/CGRA-Bench"]
-	path = test/CGRA-Bench
-	url = https://github.com/tancheng/CGRA-Bench
+[submodule "test/benchmark/CGRA-Bench"]
+	path = test/benchmark/CGRA-Bench
+	url = https://github.com/tancheng/CGRA-Bench.git
Submodule CGRA-Bench updated from 000000 to f75e27
New file (FIR kernel test):
@@ -0,0 +1,89 @@
// Compiles the original C kernel to mlir, then lowers it via Neura.
// TODO: Got error when using -O3 -fno-vectorize -fno-slp-vectorize -mllvm -force-vector-width=1
// Issue: https://github.com/coredac/dataflow/issues/164
// RUN: clang++ -S -emit-llvm -O0 -o %t-kernel-full.ll %S/../../benchmark/CGRA-Bench/kernels/fir/fir.cpp
// RUN: llvm-extract --rfunc=".*kernel.*" %t-kernel-full.ll -o %t-kernel-only.ll
// RUN: mlir-translate --import-llvm %t-kernel-only.ll -o %t-kernel.mlir

// RUN: mlir-neura-opt %t-kernel.mlir \
// RUN: --assign-accelerator \
// RUN: --lower-llvm-to-neura \
// RUN: --canonicalize-live-in \
// RUN: --leverage-predicated-value \
// RUN: --transform-ctrl-to-data-flow \
// RUN: --promote-func-arg-to-const \
// RUN: --insert-data-mov \
// RUN: --map-to-accelerator="mapping-strategy=heuristic" \
// RUN: --architecture-spec=../../arch_spec/architecture.yaml \
// RUN: --generate-code -o %t-mapping.mlir
// RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING
// RUN: FileCheck %s --input-file=tmp-generated-instructions.yaml --check-prefix=YAML
// RUN: FileCheck %s --input-file=tmp-generated-instructions.asm --check-prefix=ASM

#loop_annotation = #llvm.loop_annotation<mustProgress = true>
module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, i128 = dense<128> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, llvm.ident = "clang version 20.1.7 (https://github.com/llvm/llvm-project.git 6146a88f60492b520a36f8f8f3231e15f3cc6082)"} {
  llvm.func @_Z6kernelPfS_S_(%arg0: !llvm.ptr {llvm.noundef}, %arg1: !llvm.ptr {llvm.noundef}, %arg2: !llvm.ptr {llvm.noundef}) attributes {frame_pointer = #llvm.framePointerKind<all>, no_inline, no_unwind, optimize_none, passthrough = ["mustprogress", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
    %0 = llvm.mlir.constant(1 : i32) : i32
    %1 = llvm.mlir.constant(0.000000e+00 : f32) : f32
    %2 = llvm.mlir.constant(0 : i32) : i32
    %3 = llvm.mlir.constant(32 : i32) : i32
    %4 = llvm.mlir.constant(0 : i64) : i64
    %5 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr
    %6 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr
    %7 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr
    %8 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr
    %9 = llvm.alloca %0 x f32 {alignment = 4 : i64} : (i32) -> !llvm.ptr
    llvm.store %arg0, %5 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr
    llvm.store %arg1, %6 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr
    llvm.store %arg2, %7 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr
    llvm.store %1, %9 {alignment = 4 : i64} : f32, !llvm.ptr
    llvm.store %2, %8 {alignment = 4 : i64} : i32, !llvm.ptr
    llvm.br ^bb1
  ^bb1:  // 2 preds: ^bb0, ^bb3
    %10 = llvm.load %8 {alignment = 4 : i64} : !llvm.ptr -> i32
    %11 = llvm.icmp "slt" %10, %3 : i32
    llvm.cond_br %11, ^bb2, ^bb4
  ^bb2:  // pred: ^bb1
    %12 = llvm.load %5 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr
    %13 = llvm.load %8 {alignment = 4 : i64} : !llvm.ptr -> i32
    %14 = llvm.sext %13 : i32 to i64
    %15 = llvm.getelementptr inbounds %12[%14] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %16 = llvm.load %15 {alignment = 4 : i64} : !llvm.ptr -> f32
    %17 = llvm.load %7 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr
    %18 = llvm.load %8 {alignment = 4 : i64} : !llvm.ptr -> i32
    %19 = llvm.sext %18 : i32 to i64
    %20 = llvm.getelementptr inbounds %17[%19] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %21 = llvm.load %20 {alignment = 4 : i64} : !llvm.ptr -> f32
    %22 = llvm.load %9 {alignment = 4 : i64} : !llvm.ptr -> f32
    %23 = llvm.intr.fmuladd(%16, %21, %22) : (f32, f32, f32) -> f32
    llvm.store %23, %9 {alignment = 4 : i64} : f32, !llvm.ptr
    llvm.br ^bb3
  ^bb3:  // pred: ^bb2
    %24 = llvm.load %8 {alignment = 4 : i64} : !llvm.ptr -> i32
    %25 = llvm.add %24, %0 overflow<nsw> : i32
    llvm.store %25, %8 {alignment = 4 : i64} : i32, !llvm.ptr
    llvm.br ^bb1 {loop_annotation = #loop_annotation}
  ^bb4:  // pred: ^bb1
    %26 = llvm.load %9 {alignment = 4 : i64} : !llvm.ptr -> f32
    %27 = llvm.load %6 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr
    %28 = llvm.getelementptr inbounds %27[%4] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    llvm.store %26, %28 {alignment = 4 : i64} : f32, !llvm.ptr
    llvm.return
  }
}
// MAPPING: module
// MAPPING: func @_Z6kernelPfS_S_
// MAPPING: neura.constant
// MAPPING: neura.fmul_fadd
// MAPPING: neura.load
// MAPPING: neura.store

// YAML: instructions:
// YAML: - opcode: "CONSTANT"
// YAML: - opcode: "FMUL_FADD"
// YAML: - opcode: "LOAD"
// YAML: - opcode: "STORE"

// ASM: PE(0,0):
// ASM: CONSTANT
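The llvm.intr.fmuladd conversion exercised by this test follows the same route as the fdiv/fptosi fix; a minimal sketch (neura.fmul_fadd is the op name the MAPPING and YAML checks expect, while the generic-op syntax and value names are illustrative assumptions):

```mlir
// Before (LLVM dialect, matching the %23 line in the kernel above;
// value names are illustrative):
%acc1 = llvm.intr.fmuladd(%x, %c, %acc0) : (f32, f32, f32) -> f32

// After (hedged sketch of the converted fused multiply-add):
%acc2 = "neura.fmul_fadd"(%x, %c, %acc0) : (f32, f32, f32) -> f32
```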
New file (histogram kernel test):
@@ -0,0 +1,85 @@
// Compiles the original C kernel to mlir, then lowers it via Neura.
// TODO: Got error when using -O3 -fno-vectorize -fno-slp-vectorize -mllvm -force-vector-width=1
// Issue: https://github.com/coredac/dataflow/issues/164
// RUN: clang++ -S -emit-llvm -O2 -o %t-kernel-full.ll %S/../../benchmark/CGRA-Bench/kernels/histogram/histogram.cpp
// RUN: llvm-extract --rfunc=".*kernel.*" %t-kernel-full.ll -o %t-kernel-only.ll
// RUN: mlir-translate --import-llvm %t-kernel-only.ll -o %t-kernel.mlir

// RUN: mlir-neura-opt %t-kernel.mlir \
// RUN: --assign-accelerator \
// RUN: --lower-llvm-to-neura \
// RUN: --canonicalize-live-in \
// RUN: --leverage-predicated-value \
// RUN: --transform-ctrl-to-data-flow \
// RUN: --promote-func-arg-to-const \
// RUN: --insert-data-mov \
// RUN: --map-to-accelerator="mapping-strategy=heuristic" \
// RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \
// RUN: --generate-code -o %t-mapping.mlir
// RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING
// RUN: FileCheck %s --input-file=tmp-generated-instructions.yaml --check-prefix=YAML
// RUN: FileCheck %s --input-file=tmp-generated-instructions.asm --check-prefix=ASM

#loop_annotation = #llvm.loop_annotation<mustProgress = true>
#tbaa_root = #llvm.tbaa_root<id = "Simple C++ TBAA">
#tbaa_type_desc = #llvm.tbaa_type_desc<id = "omnipotent char", members = {<#tbaa_root, 0>}>
#tbaa_type_desc1 = #llvm.tbaa_type_desc<id = "float", members = {<#tbaa_type_desc, 0>}>
#tbaa_type_desc2 = #llvm.tbaa_type_desc<id = "int", members = {<#tbaa_type_desc, 0>}>
#tbaa_tag = #llvm.tbaa_tag<base_type = #tbaa_type_desc1, access_type = #tbaa_type_desc1, offset = 0>
#tbaa_tag1 = #llvm.tbaa_tag<base_type = #tbaa_type_desc2, access_type = #tbaa_type_desc2, offset = 0>
module attributes {dlti.dl_spec = #dlti.dl_spec<f80 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f64 = dense<64> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, llvm.ident = "clang version 20.1.7 (https://github.com/llvm/llvm-project.git 6146a88f60492b520a36f8f8f3231e15f3cc6082)"} {
  llvm.func local_unnamed_addr @_Z6kernelPfPi(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}) attributes {memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
    %0 = llvm.mlir.constant(0 : i64) : i64
    %1 = llvm.mlir.constant(-1.000000e+00 : f32) : f32
    %2 = llvm.mlir.constant(5.000000e+00 : f32) : f32
    %3 = llvm.mlir.constant(1.800000e+01 : f32) : f32
    %4 = llvm.mlir.constant(1 : i32) : i32
    %5 = llvm.mlir.constant(1 : i64) : i64
    %6 = llvm.mlir.constant(2 : i64) : i64
    %7 = llvm.mlir.constant(20 : i64) : i64
    llvm.br ^bb1(%0 : i64)
  ^bb1(%8: i64):  // 2 preds: ^bb0, ^bb1
    %9 = llvm.getelementptr inbounds %arg0[%8] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %10 = llvm.load %9 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> f32
    %11 = llvm.fadd %10, %1 : f32
    %12 = llvm.fmul %11, %2 : f32
    %13 = llvm.fdiv %12, %3 : f32
    %14 = llvm.fptosi %13 : f32 to i32
    %15 = llvm.sext %14 : i32 to i64
    %16 = llvm.getelementptr inbounds %arg1[%15] : (!llvm.ptr, i64) -> !llvm.ptr, i32
    %17 = llvm.load %16 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> i32
    %18 = llvm.add %17, %4 overflow<nsw> : i32
    llvm.store %18, %16 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : i32, !llvm.ptr
    %19 = llvm.or disjoint %8, %5 : i64
    %20 = llvm.getelementptr inbounds %arg0[%19] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %21 = llvm.load %20 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> f32
    %22 = llvm.fadd %21, %1 : f32
    %23 = llvm.fmul %22, %2 : f32
    %24 = llvm.fdiv %23, %3 : f32
    %25 = llvm.fptosi %24 : f32 to i32
    %26 = llvm.sext %25 : i32 to i64
    %27 = llvm.getelementptr inbounds %arg1[%26] : (!llvm.ptr, i64) -> !llvm.ptr, i32
    %28 = llvm.load %27 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> i32
    %29 = llvm.add %28, %4 overflow<nsw> : i32
    llvm.store %29, %27 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : i32, !llvm.ptr
    %30 = llvm.add %8, %6 overflow<nsw, nuw> : i64
    %31 = llvm.icmp "eq" %30, %7 : i64
    llvm.cond_br %31, ^bb2, ^bb1(%30 : i64) {loop_annotation = #loop_annotation}
  ^bb2:  // pred: ^bb1
    llvm.return
  }
}
// MAPPING: module
// MAPPING: func @_Z6kernelPfPi
// MAPPING: neura.constant
// MAPPING: neura.fdiv
// MAPPING: neura.cast

// YAML: instructions:
// YAML: - opcode: "CONSTANT"
// YAML: - opcode: "FDIV"
// YAML: - opcode: "CAST"

// ASM: PE(0,0):
// ASM: CONSTANT
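For intuition on the ops these checks pin down: the kernel bins each sample by computing (a[i] - 1.0) * 5.0 / 18.0 and truncating. A sample of 10.0, for example, gives (10.0 - 1.0) * 5.0 / 18.0 = 2.5; llvm.fptosi truncates toward zero, so the count at index 2 is incremented. The neura.fdiv and neura.cast checks above cover exactly the division and float-to-int steps of that path.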
(The diff also includes 8 empty files.)