Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
3b19db4
Add histogram testbench files
n0thingNoob Oct 17, 2025
6da8b5c
Fix conversion for llvm.fdiv and llvm.fptosi, add e2e/histogram kerne…
n0thingNoob Oct 18, 2025
40aa68f
Delete test/testbench/histogram/histogram.cpp
n0thingNoob Oct 18, 2025
4b9f1c6
Delete test/testbench/histogram/histogram_kernel_neura.mlir
n0thingNoob Oct 18, 2025
3cd3901
Update test/e2e/histogram/histogram_kernel.mlir
n0thingNoob Oct 18, 2025
cdb66da
Delete test/testbench/histogram/histogram_kernel.cpp
n0thingNoob Oct 18, 2025
d694f69
Delete test/testbench/histogram/histogram_kernel.ll
n0thingNoob Oct 18, 2025
1b854f3
Delete test/testbench/histogram/histogram_kernel.mlir
n0thingNoob Oct 18, 2025
4fb939a
Clean up .gitmodules by removing duplicates
n0thingNoob Oct 18, 2025
71c28c3
add fir and modify LlvmToNeuraPass.cpp for llvm.fmuladd conversion
n0thingNoob Oct 18, 2025
180c3ef
Add FIR kernel support and llvm.fmuladd conversion
n0thingNoob Oct 18, 2025
f70a11e
Merge remote-tracking branch 'origin/testbench'
n0thingNoob Oct 18, 2025
411b066
Clean up repository: remove temporary and generated files
n0thingNoob Oct 19, 2025
9a846b1
Fix Neura_OrOp type definition to support neura.data types
n0thingNoob Oct 19, 2025
e73bf40
Remove FFT and fusion test files
n0thingNoob Oct 19, 2025
10dfd4b
remove histogram.cpp
n0thingNoob Oct 19, 2025
e59f4de
remove ll file
n0thingNoob Oct 19, 2025
5a4c2ff
rm testbench folder
n0thingNoob Oct 19, 2025
096359f
backup for fir kernel and histogram kernel
n0thingNoob Oct 19, 2025
db4e012
Use llvm extract to extract kernel from benchmarks
n0thingNoob Oct 19, 2025
ebcf014
unify the kernel name in the llvm extract command
n0thingNoob Oct 19, 2025
d4314ae
add issue link to mlir file
n0thingNoob Oct 19, 2025
a83fe7f
Fix GitHub CI: Add LLVM tools to PATH for llvm-extract
n0thingNoob Oct 19, 2025
811a674
Add TODO, remove redundant file
n0thingNoob Oct 19, 2025
6f683fa
rm extra file
n0thingNoob Oct 19, 2025
bf45e98
upload gitignore and remove the unnecessary mlir.llvm in test
n0thingNoob Oct 19, 2025
3176f6f
rm adding the build/bin to PATH
n0thingNoob Oct 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
[submodule "test/CGRA-Bench"]
path = test/CGRA-Bench
path = test/benchmark/CGRA-Bench/CGRA-Bench
url = https://github.com/tancheng/CGRA-Bench
[submodule "test/benchmark"]
path = test/benchmark
url = https://github.com/tancheng/CGRA-Bench.git
[submodule "test/benchmark/CGRA-Bench"]
path = test/benchmark/CGRA-Bench
url = https://github.com/tancheng/CGRA-Bench.git
11 changes: 9 additions & 2 deletions include/NeuraDialect/NeuraOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,16 @@ def Neura_FMulOp : Op<NeuraDialect, "fmul"> {

def Neura_FDivOp : Op<NeuraDialect, "fdiv"> {
let summary = "Floating division operation";
let arguments = (ins AnyType:$lhs, Optional<AnyType>:$rhs);
let description = [{
Performs a floating-point division operation, computing the result of
a / b, where / is the floating-point division operator.

Example:
%result = neura.fdiv %a, %b : f32
}];
let arguments = (ins AnyType:$lhs, AnyType:$rhs);
let results = (outs AnyType:$result);
// let assemblyFormat = "$lhs `,` $rhs `,` $predicate attr-dict `:` type($result)";
let traits = [SameOperandsAndResultElementType];
}

// Defines a bitwise OR operation.
Expand Down
35 changes: 35 additions & 0 deletions lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,39 @@ struct LlvmSRemToNeuraRem : public OpRewritePattern<LLVM::SRemOp> {
}
};

// Rewrites `llvm.fdiv` on scalar floating-point values into `neura.fdiv`.
// Non-scalar (e.g. vector) results are deliberately not matched here.
struct LlvmFDivToNeuraFDiv : public OpRewritePattern<mlir::LLVM::FDivOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(mlir::LLVM::FDivOp op,
                                PatternRewriter &rewriter) const override {
    Type out_type = op->getResult(0).getType();

    // Bail out unless the result is a scalar float; other shapes are left
    // for other patterns (or remain unconverted).
    if (!mlir::isa<FloatType>(out_type)) {
      return failure();
    }

    Value numerator = op->getOperand(0);
    Value denominator = op->getOperand(1);
    rewriter.replaceOpWithNewOp<neura::FDivOp>(op, out_type, numerator,
                                               denominator);
    return success();
  }
};

// Rewrites `llvm.fptosi` into a generic `neura.cast` op whose `cast_type`
// string attribute is "fptosi".
struct LlvmFPToSIToNeuraCast : public OpRewritePattern<mlir::LLVM::FPToSIOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(mlir::LLVM::FPToSIOp op,
                                PatternRewriter &rewriter) const override {
    Value input = op.getArg();
    Type result_type = op.getType();

    // NOTE(review): unlike LlvmFDivToNeuraFDiv above, this pattern has no
    // scalar-type guard, so vector `llvm.fptosi` ops would also be rewritten
    // — confirm neura.cast accepts non-scalar operands, or add a FloatType
    // check on the operand type for consistency.
    // Create a cast operation with "fptosi" as the cast type
    rewriter.replaceOpWithNewOp<neura::CastOp>(op, result_type, input,
                                               rewriter.getStringAttr("fptosi"));
    return success();
  }
};

struct LlvmVFMulToNeuraVFMul : public OpRewritePattern<mlir::LLVM::FMulOp> {
using OpRewritePattern::OpRewritePattern;

Expand Down Expand Up @@ -533,6 +566,8 @@ struct LowerLlvmToNeuraPass
patterns.add<LlvmShlToNeuraShl>(&getContext());
patterns.add<LlvmSDivToNeuraDiv>(&getContext());
patterns.add<LlvmSRemToNeuraRem>(&getContext());
patterns.add<LlvmFDivToNeuraFDiv>(&getContext());
patterns.add<LlvmFPToSIToNeuraCast>(&getContext());

FrozenRewritePatternSet frozen(std::move(patterns));

Expand Down
16 changes: 16 additions & 0 deletions test/e2e/histogram/histogram_kernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#define DATA_LEN 20
#define BUCKET_LEN 5
#define MIN 1.0
#define MAX 19.0

/// Builds a histogram of `input` over BUCKET_LEN equal-width buckets that
/// span the value range [MIN, MAX].
///
/// @param input     DATA_LEN floats to be bucketed.
/// @param histogram BUCKET_LEN counters, incremented in place (callers are
///                  expected to zero-initialize).
void kernel(float input[], int histogram[]) {
  int i;
  float dmin = (float)MIN;
  float delt = (float)(MAX - dmin);

  for (i = 0; i < DATA_LEN; i++) {
    float r = BUCKET_LEN * (input[i] - dmin) / delt;
    int b = (int)(r);
    // BUG FIX: input[i] == MAX gives r == BUCKET_LEN exactly, so the
    // unclamped index wrote one past the end of `histogram` (UB).  Clamp
    // the index into [0, BUCKET_LEN - 1]; out-of-range inputs now land in
    // the nearest edge bucket instead of corrupting memory.
    // NOTE(review): the companion .ll/.mlir fixtures were generated from
    // the unclamped version; regenerate them if they must stay in sync.
    if (b < 0) b = 0;
    if (b >= BUCKET_LEN) b = BUCKET_LEN - 1;
    histogram[b]++;
  }
}
60 changes: 60 additions & 0 deletions test/e2e/histogram/histogram_kernel.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
; ModuleID = 'histogram_kernel.cpp'
source_filename = "histogram_kernel.cpp"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: mustprogress nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable
; Histogram kernel compiled at -O2: the source loop was unrolled by 2, so
; each iteration performs two bucket updates and advances the induction
; variable by 2 up to the trip count of 20.
define dso_local void @_Z6kernelPfPi(ptr nocapture noundef readonly %0, ptr nocapture noundef %1) local_unnamed_addr #0 {
  br label %3

3:                                                ; preds = %3, %2
  %4 = phi i64 [ 0, %2 ], [ %26, %3 ]
  ; First bucket update: b = (int)(5.0 * (input[i] - 1.0) / 18.0),
  ; then histogram[b]++.
  %5 = getelementptr inbounds nuw float, ptr %0, i64 %4
  %6 = load float, ptr %5, align 4, !tbaa !5
  %7 = fadd float %6, -1.000000e+00
  %8 = fmul float %7, 5.000000e+00
  %9 = fdiv float %8, 1.800000e+01
  %10 = fptosi float %9 to i32
  %11 = sext i32 %10 to i64
  %12 = getelementptr inbounds i32, ptr %1, i64 %11
  %13 = load i32, ptr %12, align 4, !tbaa !9
  %14 = add nsw i32 %13, 1
  store i32 %14, ptr %12, align 4, !tbaa !9
  ; Second (unrolled) bucket update for element i | 1 — valid because the
  ; induction variable is always even here.
  %15 = or disjoint i64 %4, 1
  %16 = getelementptr inbounds nuw float, ptr %0, i64 %15
  %17 = load float, ptr %16, align 4, !tbaa !5
  %18 = fadd float %17, -1.000000e+00
  %19 = fmul float %18, 5.000000e+00
  %20 = fdiv float %19, 1.800000e+01
  %21 = fptosi float %20 to i32
  %22 = sext i32 %21 to i64
  %23 = getelementptr inbounds i32, ptr %1, i64 %22
  %24 = load i32, ptr %23, align 4, !tbaa !9
  %25 = add nsw i32 %24, 1
  store i32 %25, ptr %23, align 4, !tbaa !9
  ; Step by 2 (unroll factor) and exit once all 20 elements are processed.
  %26 = add nuw nsw i64 %4, 2
  %27 = icmp eq i64 %26, 20
  br i1 %27, label %28, label %3, !llvm.loop !11

28:                                               ; preds = %3
  ret void
}

attributes #0 = { mustprogress nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }

!llvm.module.flags = !{!0, !1, !2, !3}
!llvm.ident = !{!4}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{!"clang version 20.1.7 (https://github.com/llvm/llvm-project.git 6146a88f60492b520a36f8f8f3231e15f3cc6082)"}
!5 = !{!6, !6, i64 0}
!6 = !{!"float", !7, i64 0}
!7 = !{!"omnipotent char", !8, i64 0}
!8 = !{!"Simple C++ TBAA"}
!9 = !{!10, !10, i64 0}
!10 = !{!"int", !7, i64 0}
!11 = distinct !{!11, !12}
!12 = !{!"llvm.loop.mustprogress"}
81 changes: 81 additions & 0 deletions test/e2e/histogram/histogram_kernel.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// RUN: mlir-neura-opt %s \
// RUN: --assign-accelerator \
// RUN: --lower-llvm-to-neura \
// RUN: --canonicalize-live-in \
// RUN: --leverage-predicated-value \
// RUN: --transform-ctrl-to-data-flow \
// RUN: --promote-func-arg-to-const \
// RUN: --insert-data-mov \
// RUN: --map-to-accelerator="mapping-strategy=heuristic" \
// RUN: --architecture-spec=../../arch_spec/architecture.yaml \
// RUN: --generate-code -o %t-mapping.mlir
// RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING
// RUN: FileCheck %s --input-file=tmp-generated-instructions.yaml --check-prefix=YAML
// RUN: FileCheck %s --input-file=tmp-generated-instructions.asm --check-prefix=ASM

// This test verifies the complete compilation pipeline for histogram kernel
// from LLVM IR to Neura dialect with code generation.

// TBAA and loop metadata carried over from the clang-generated LLVM IR.
#loop_annotation = #llvm.loop_annotation<mustProgress = true>
#tbaa_root = #llvm.tbaa_root<id = "Simple C++ TBAA">
#tbaa_type_desc = #llvm.tbaa_type_desc<id = "omnipotent char", members = {<#tbaa_root, 0>}>
#tbaa_type_desc1 = #llvm.tbaa_type_desc<id = "float", members = {<#tbaa_type_desc, 0>}>
#tbaa_type_desc2 = #llvm.tbaa_type_desc<id = "int", members = {<#tbaa_type_desc, 0>}>
#tbaa_tag = #llvm.tbaa_tag<base_type = #tbaa_type_desc1, access_type = #tbaa_type_desc1, offset = 0>
#tbaa_tag1 = #llvm.tbaa_tag<base_type = #tbaa_type_desc2, access_type = #tbaa_type_desc2, offset = 0>
module attributes {dlti.dl_spec = #dlti.dl_spec<f80 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f64 = dense<64> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, llvm.ident = "clang version 20.1.7 (https://github.com/llvm/llvm-project.git 6146a88f60492b520a36f8f8f3231e15f3cc6082)"} {
  // Histogram kernel imported from LLVM IR; the source loop is unrolled by
  // 2, so each iteration performs two bucket updates and steps by 2.
  llvm.func local_unnamed_addr @_Z6kernelPfPi(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}) attributes {memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
    // Loop-invariant constants: -1.0 folds the `x - dmin` subtraction into
    // an fadd; 5.0 and 18.0 match BUCKET_LEN and (MAX - MIN) in the C
    // source, and 20 is the trip count.
    %0 = llvm.mlir.constant(0 : i64) : i64
    %1 = llvm.mlir.constant(-1.000000e+00 : f32) : f32
    %2 = llvm.mlir.constant(5.000000e+00 : f32) : f32
    %3 = llvm.mlir.constant(1.800000e+01 : f32) : f32
    %4 = llvm.mlir.constant(1 : i32) : i32
    %5 = llvm.mlir.constant(1 : i64) : i64
    %6 = llvm.mlir.constant(2 : i64) : i64
    %7 = llvm.mlir.constant(20 : i64) : i64
    llvm.br ^bb1(%0 : i64)
  ^bb1(%8: i64): // 2 preds: ^bb0, ^bb1
    // First bucket update: b = (int)(5.0 * (input[i] - 1.0) / 18.0),
    // then histogram[b]++.
    %9 = llvm.getelementptr inbounds %arg0[%8] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %10 = llvm.load %9 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> f32
    %11 = llvm.fadd %10, %1 : f32
    %12 = llvm.fmul %11, %2 : f32
    %13 = llvm.fdiv %12, %3 : f32
    %14 = llvm.fptosi %13 : f32 to i32
    %15 = llvm.sext %14 : i32 to i64
    %16 = llvm.getelementptr inbounds %arg1[%15] : (!llvm.ptr, i64) -> !llvm.ptr, i32
    %17 = llvm.load %16 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> i32
    %18 = llvm.add %17, %4 overflow<nsw> : i32
    llvm.store %18, %16 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : i32, !llvm.ptr
    // Second (unrolled) bucket update for element i | 1 — valid because the
    // induction variable is always even here.
    %19 = llvm.or disjoint %8, %5 : i64
    %20 = llvm.getelementptr inbounds %arg0[%19] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %21 = llvm.load %20 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> f32
    %22 = llvm.fadd %21, %1 : f32
    %23 = llvm.fmul %22, %2 : f32
    %24 = llvm.fdiv %23, %3 : f32
    %25 = llvm.fptosi %24 : f32 to i32
    %26 = llvm.sext %25 : i32 to i64
    %27 = llvm.getelementptr inbounds %arg1[%26] : (!llvm.ptr, i64) -> !llvm.ptr, i32
    %28 = llvm.load %27 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> i32
    %29 = llvm.add %28, %4 overflow<nsw> : i32
    llvm.store %29, %27 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : i32, !llvm.ptr
    // Step by 2 (unroll factor) and exit once all 20 elements are done.
    %30 = llvm.add %8, %6 overflow<nsw, nuw> : i64
    %31 = llvm.icmp "eq" %30, %7 : i64
    llvm.cond_br %31, ^bb2, ^bb1(%30 : i64) {loop_annotation = #loop_annotation}
  ^bb2: // pred: ^bb1
    llvm.return
  }
}

// MAPPING: module
// MAPPING: func @_Z6kernelPfPi
// MAPPING: neura.constant
// MAPPING: neura.fdiv
// MAPPING: neura.cast

// YAML: instructions:
// YAML: - opcode: "CONSTANT"
// YAML: - opcode: "FDIV"
// YAML: - opcode: "CAST"

// ASM: PE(0,0):
// ASM: CONSTANT
53 changes: 53 additions & 0 deletions test/testbench/histogram/histogram.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <float.h>

#include <string.h>
#include <unistd.h>

// Histogram configuration: DATA_LEN input samples distributed over
// BUCKET_LEN equal-width buckets covering the value range [MIN, MAX].
#define DATA_LEN 20
#define BUCKET_LEN 5
#define MIN 1.0
#define MAX 19.0

void kernel(float input_data[], int histogram[]);
void output();

// Fixed test input.  NOTE(review): it includes the upper bound MAX (19),
// which the bucket formula in kernel() maps to r == BUCKET_LEN exactly —
// an out-of-range index unless the kernel clamps it.
float input_data[DATA_LEN] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,14,14,14,14,14,19};
int histogram[BUCKET_LEN] = {0};

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Drives the histogram testbench: prints the configuration, runs the
// kernel over the fixed global input, then dumps the bucket counts.
int main( int argc, char** argv) {

  printf("DATA_LEN %d BUCKET_LEN %d\n",DATA_LEN, BUCKET_LEN);
  // Fills the global `histogram` from the global `input_data`.
  kernel(input_data, histogram);
  output();

  return 0;
}

// Computes the histogram of `input` over BUCKET_LEN equal-width buckets
// spanning [MIN, MAX].  `histogram` holds BUCKET_LEN counters and is
// incremented in place (callers zero-initialize it).
void kernel(float input[], int histogram[]) {
  int i;
  float dmin = (float)MIN;
  float delt = (float)(MAX - dmin);

#pragma clang loop vectorize(enable) vectorize_width(4) unroll_count(4)
  for (i = 0; i < DATA_LEN; i++) {
    float r = BUCKET_LEN * (input[i] - dmin) / delt;
    int b = (int)(r);
    // BUG FIX: input[i] == MAX (present in the driver's input_data) gives
    // r == BUCKET_LEN exactly, so the unclamped index wrote one past the
    // end of `histogram` (out-of-bounds write, UB).  Clamp into range so
    // edge values land in the last/first bucket instead.
    if (b < 0) b = 0;
    if (b >= BUCKET_LEN) b = BUCKET_LEN - 1;
    histogram[b]++;
  }
}

// Prints the histogram configuration (bucket count, range minimum and
// width of the covered range) followed by every bucket's count.
void output() {
  printf("len %d\n", BUCKET_LEN);
  printf("min %f\n", MIN);
  printf("del %f\n", MAX-MIN);
  int bucket = 0;
  while (bucket < BUCKET_LEN) {
    printf("%d ", histogram[bucket]);
    ++bucket;
  }
  printf("\n");
}
}
16 changes: 16 additions & 0 deletions test/testbench/histogram/histogram_kernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#define DATA_LEN 20
#define BUCKET_LEN 5
#define MIN 1.0
#define MAX 19.0

/// Builds a histogram of `input` over BUCKET_LEN equal-width buckets that
/// span the value range [MIN, MAX].
///
/// @param input     DATA_LEN floats to be bucketed.
/// @param histogram BUCKET_LEN counters, incremented in place (callers are
///                  expected to zero-initialize).
void kernel(float input[], int histogram[]) {
  int i;
  float dmin = (float)MIN;
  float delt = (float)(MAX - dmin);

  for (i = 0; i < DATA_LEN; i++) {
    float r = BUCKET_LEN * (input[i] - dmin) / delt;
    int b = (int)(r);
    // BUG FIX: input[i] == MAX gives r == BUCKET_LEN exactly, so the
    // unclamped index wrote one past the end of `histogram` (UB).  Clamp
    // the index into [0, BUCKET_LEN - 1]; out-of-range inputs now land in
    // the nearest edge bucket instead of corrupting memory.
    if (b < 0) b = 0;
    if (b >= BUCKET_LEN) b = BUCKET_LEN - 1;
    histogram[b]++;
  }
}
60 changes: 60 additions & 0 deletions test/testbench/histogram/histogram_kernel.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
; ModuleID = 'histogram_kernel.cpp'
source_filename = "histogram_kernel.cpp"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: mustprogress nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable
; Histogram kernel compiled at -O2: the source loop was unrolled by 2, so
; each iteration performs two bucket updates and advances the induction
; variable by 2 up to the trip count of 20.
define dso_local void @_Z6kernelPfPi(ptr nocapture noundef readonly %0, ptr nocapture noundef %1) local_unnamed_addr #0 {
  br label %3

3:                                                ; preds = %3, %2
  %4 = phi i64 [ 0, %2 ], [ %26, %3 ]
  ; First bucket update: b = (int)(5.0 * (input[i] - 1.0) / 18.0),
  ; then histogram[b]++.
  %5 = getelementptr inbounds nuw float, ptr %0, i64 %4
  %6 = load float, ptr %5, align 4, !tbaa !5
  %7 = fadd float %6, -1.000000e+00
  %8 = fmul float %7, 5.000000e+00
  %9 = fdiv float %8, 1.800000e+01
  %10 = fptosi float %9 to i32
  %11 = sext i32 %10 to i64
  %12 = getelementptr inbounds i32, ptr %1, i64 %11
  %13 = load i32, ptr %12, align 4, !tbaa !9
  %14 = add nsw i32 %13, 1
  store i32 %14, ptr %12, align 4, !tbaa !9
  ; Second (unrolled) bucket update for element i | 1 — valid because the
  ; induction variable is always even here.
  %15 = or disjoint i64 %4, 1
  %16 = getelementptr inbounds nuw float, ptr %0, i64 %15
  %17 = load float, ptr %16, align 4, !tbaa !5
  %18 = fadd float %17, -1.000000e+00
  %19 = fmul float %18, 5.000000e+00
  %20 = fdiv float %19, 1.800000e+01
  %21 = fptosi float %20 to i32
  %22 = sext i32 %21 to i64
  %23 = getelementptr inbounds i32, ptr %1, i64 %22
  %24 = load i32, ptr %23, align 4, !tbaa !9
  %25 = add nsw i32 %24, 1
  store i32 %25, ptr %23, align 4, !tbaa !9
  ; Step by 2 (unroll factor) and exit once all 20 elements are processed.
  %26 = add nuw nsw i64 %4, 2
  %27 = icmp eq i64 %26, 20
  br i1 %27, label %28, label %3, !llvm.loop !11

28:                                               ; preds = %3
  ret void
}

attributes #0 = { mustprogress nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }

!llvm.module.flags = !{!0, !1, !2, !3}
!llvm.ident = !{!4}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{!"clang version 20.1.7 (https://github.com/llvm/llvm-project.git 6146a88f60492b520a36f8f8f3231e15f3cc6082)"}
!5 = !{!6, !6, i64 0}
!6 = !{!"float", !7, i64 0}
!7 = !{!"omnipotent char", !8, i64 0}
!8 = !{!"Simple C++ TBAA"}
!9 = !{!10, !10, i64 0}
!10 = !{!"int", !7, i64 0}
!11 = distinct !{!11, !12}
!12 = !{!"llvm.loop.mustprogress"}
Loading
Loading