Skip to content
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
3b19db4
Add histogram testbench files
n0thingNoob Oct 17, 2025
6da8b5c
Fix conversion for llvm.fdiv and llvm.fptosi, add e2e/histogram kerne…
n0thingNoob Oct 18, 2025
40aa68f
Delete test/testbench/histogram/histogram.cpp
n0thingNoob Oct 18, 2025
4b9f1c6
Delete test/testbench/histogram/histogram_kernel_neura.mlir
n0thingNoob Oct 18, 2025
3cd3901
Update test/e2e/histogram/histogram_kernel.mlir
n0thingNoob Oct 18, 2025
cdb66da
Delete test/testbench/histogram/histogram_kernel.cpp
n0thingNoob Oct 18, 2025
d694f69
Delete test/testbench/histogram/histogram_kernel.ll
n0thingNoob Oct 18, 2025
1b854f3
Delete test/testbench/histogram/histogram_kernel.mlir
n0thingNoob Oct 18, 2025
4fb939a
Clean up .gitmodules by removing duplicates
n0thingNoob Oct 18, 2025
71c28c3
add fir and modify LlvmToNeuraPass.cpp for llvm.fmuladd conversion
n0thingNoob Oct 18, 2025
180c3ef
Add FIR kernel support and llvm.fmuladd conversion
n0thingNoob Oct 18, 2025
f70a11e
Merge remote-tracking branch 'origin/testbench'
n0thingNoob Oct 18, 2025
411b066
Clean up repository: remove temporary and generated files
n0thingNoob Oct 19, 2025
9a846b1
Fix Neura_OrOp type definition to support neura.data types
n0thingNoob Oct 19, 2025
e73bf40
Remove FFT and fusion test files
n0thingNoob Oct 19, 2025
10dfd4b
remove histogram.cpp
n0thingNoob Oct 19, 2025
e59f4de
remove ll file
n0thingNoob Oct 19, 2025
5a4c2ff
rm testbench folder
n0thingNoob Oct 19, 2025
096359f
backup for fir kernel and histogram kernel
n0thingNoob Oct 19, 2025
db4e012
Use llvm extract to extract kernel from benchmarks
n0thingNoob Oct 19, 2025
ebcf014
unify the kernel name in the llvm extract command
n0thingNoob Oct 19, 2025
d4314ae
add issue link to mlir file
n0thingNoob Oct 19, 2025
a83fe7f
Fix GitHub CI: Add LLVM tools to PATH for llvm-extract
n0thingNoob Oct 19, 2025
811a674
Add TODO, remove redundant file
n0thingNoob Oct 19, 2025
6f683fa
rm extra file
n0thingNoob Oct 19, 2025
bf45e98
upload gitignore and remove the unnecessary mlir.llvm in test
n0thingNoob Oct 19, 2025
3176f6f
rm adding the build/bin to PATH
n0thingNoob Oct 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ build/
lit.cfg
*.dot

# temporary files
tmp-*
*_final.mlir
*_mapped.mlir
*_processed.mlir
*_log.txt
*.backup

# vscode config
.clangd
.cache
Expand Down
6 changes: 3 additions & 3 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[submodule "test/CGRA-Bench"]
path = test/CGRA-Bench
url = https://github.com/tancheng/CGRA-Bench
[submodule "test/benchmark/CGRA-Bench"]
path = test/benchmark/CGRA-Bench
url = https://github.com/tancheng/CGRA-Bench.git
15 changes: 11 additions & 4 deletions include/NeuraDialect/NeuraOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -92,16 +92,23 @@ def Neura_FMulOp : Op<NeuraDialect, "fmul"> {

def Neura_FDivOp : Op<NeuraDialect, "fdiv"> {
let summary = "Floating division operation";
let arguments = (ins AnyType:$lhs, Optional<AnyType>:$rhs);
let description = [{
Performs a floating-point division operation, computing the result of
a / b, where / is the floating-point division operator.

Example:
%result = neura.fdiv %a, %b : f32
}];
let arguments = (ins AnyType:$lhs, AnyType:$rhs);
let results = (outs AnyType:$result);
// let assemblyFormat = "$lhs `,` $rhs `,` $predicate attr-dict `:` type($result)";
let traits = [SameOperandsAndResultElementType];
}

// Defines a bitwise OR operation.
def Neura_OrOp : Op<NeuraDialect, "or"> {
let summary = "Bitwise OR operation";
let arguments = (ins AnySignlessInteger:$lhs, AnySignlessInteger:$rhs);
let results = (outs AnySignlessInteger:$result);
let arguments = (ins AnyType:$lhs, AnyType:$rhs);
let results = (outs AnyType:$result);
// let assemblyFormat = "$lhs `,` $rhs `,` attr-dict `:` type($result)";
let traits = [SameOperandsAndResultElementType];
}
Expand Down
55 changes: 55 additions & 0 deletions lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,58 @@ struct LlvmSRemToNeuraRem : public OpRewritePattern<LLVM::SRemOp> {
}
};

struct LlvmFDivToNeuraFDiv : public OpRewritePattern<mlir::LLVM::FDivOp> {
  using OpRewritePattern::OpRewritePattern;

  // Rewrites llvm.fdiv into neura.fdiv. Only scalar floating-point
  // divisions are converted; vector forms are deliberately left untouched.
  LogicalResult matchAndRewrite(mlir::LLVM::FDivOp op,
                                PatternRewriter &rewriter) const override {
    Value lhs = op->getOperand(0);
    Value rhs = op->getOperand(1);
    Type result_type = op->getResult(0).getType();

    // Only matches scalar float. notifyMatchFailure (instead of a bare
    // failure()) records why the pattern bailed, visible under -debug.
    if (!mlir::isa<FloatType>(result_type))
      return rewriter.notifyMatchFailure(op, "expected scalar float result");

    rewriter.replaceOpWithNewOp<neura::FDivOp>(op, result_type, lhs, rhs);
    return success();
  }
};

struct LlvmFPToSIToNeuraCast : public OpRewritePattern<mlir::LLVM::FPToSIOp> {
  using OpRewritePattern::OpRewritePattern;

  // Rewrites llvm.fptosi into a generic neura.cast tagged with the
  // "fptosi" cast kind.
  LogicalResult matchAndRewrite(mlir::LLVM::FPToSIOp op,
                                PatternRewriter &rewriter) const override {
    Value input = op.getArg();
    Type result_type = op.getType();

    // Only matches scalar integer results, mirroring the sibling float
    // patterns (LlvmFDivToNeuraFDiv, LlvmFMulAddToNeuraFMulFAdd) which
    // restrict themselves to scalars; vector fptosi is left untouched.
    if (!mlir::isa<IntegerType>(result_type))
      return rewriter.notifyMatchFailure(op, "expected scalar integer result");

    // Creates a cast operation with "fptosi" as the cast type.
    rewriter.replaceOpWithNewOp<neura::CastOp>(op, result_type, input,
                                               rewriter.getStringAttr("fptosi"));
    return success();
  }
};

struct LlvmFMulAddToNeuraFMulFAdd
    : public OpRewritePattern<mlir::LLVM::FMulAddOp> {
  using OpRewritePattern::OpRewritePattern;

  // Lowers the llvm.fmuladd intrinsic (a * b + c) into the fused
  // neura.fmul_fadd operation, keeping all three operands in order.
  LogicalResult matchAndRewrite(mlir::LLVM::FMulAddOp op,
                                PatternRewriter &rewriter) const override {
    Type fused_type = op->getResult(0).getType();

    // Only matches scalar float.
    if (!mlir::isa<FloatType>(fused_type))
      return failure();

    Value mul_lhs = op->getOperand(0);
    Value mul_rhs = op->getOperand(1);
    Value addend = op->getOperand(2);
    rewriter.replaceOpWithNewOp<neura::FMulFAddOp>(op, fused_type, mul_lhs,
                                                   mul_rhs, addend);
    return success();
  }
};

struct LlvmVFMulToNeuraVFMul : public OpRewritePattern<mlir::LLVM::FMulOp> {
using OpRewritePattern::OpRewritePattern;

Expand Down Expand Up @@ -533,6 +585,9 @@ struct LowerLlvmToNeuraPass
patterns.add<LlvmShlToNeuraShl>(&getContext());
patterns.add<LlvmSDivToNeuraDiv>(&getContext());
patterns.add<LlvmSRemToNeuraRem>(&getContext());
patterns.add<LlvmFDivToNeuraFDiv>(&getContext());
patterns.add<LlvmFPToSIToNeuraCast>(&getContext());
patterns.add<LlvmFMulAddToNeuraFMulFAdd>(&getContext());

FrozenRewritePatternSet frozen(std::move(patterns));

Expand Down
105 changes: 105 additions & 0 deletions test/c2llvm2mlir/nested_loop/kernel.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
; ModuleID = 'kernel.cpp'
source_filename = "kernel.cpp"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@input = dso_local global [32 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 16
@output = dso_local global [32 x i32] zeroinitializer, align 16
@coefficients = dso_local global [32 x i32] [i32 25, i32 150, i32 375, i32 -225, i32 50, i32 75, i32 -300, i32 125, i32 25, i32 150, i32 375, i32 -225, i32 50, i32 75, i32 -300, i32 125, i32 25, i32 150, i32 375, i32 -225, i32 50, i32 75, i32 -300, i32 125, i32 25, i32 150, i32 375, i32 -225, i32 50, i32 75, i32 -300, i32 125], align 16
@.str = private unnamed_addr constant [12 x i8] c"output: %d\0A\00", align 1

; Function Attrs: mustprogress noinline norecurse optnone uwtable
define dso_local noundef i32 @main() #0 {
%1 = alloca i32, align 4
store i32 0, ptr %1, align 4
call void @_Z6kernelPiS_S_(ptr noundef @input, ptr noundef @output, ptr noundef @coefficients)
%2 = load i32, ptr @output, align 16
%3 = call i32 (ptr, ...) @printf(ptr noundef @.str, i32 noundef %2)
ret i32 0
}

; Function Attrs: mustprogress noinline nounwind optnone uwtable
define dso_local void @_Z6kernelPiS_S_(ptr noundef %0, ptr noundef %1, ptr noundef %2) #1 {
%4 = alloca ptr, align 8
%5 = alloca ptr, align 8
%6 = alloca ptr, align 8
%7 = alloca i32, align 4
%8 = alloca i32, align 4
store ptr %0, ptr %4, align 8
store ptr %1, ptr %5, align 8
store ptr %2, ptr %6, align 8
store i32 0, ptr %7, align 4
br label %9

9: ; preds = %38, %3
%10 = load i32, ptr %7, align 4
%11 = icmp slt i32 %10, 32
br i1 %11, label %12, label %41

12: ; preds = %9
store i32 0, ptr %8, align 4
br label %13

13: ; preds = %34, %12
%14 = load i32, ptr %8, align 4
%15 = icmp slt i32 %14, 32
br i1 %15, label %16, label %37

16: ; preds = %13
%17 = load ptr, ptr %4, align 8
%18 = load i32, ptr %7, align 4
%19 = sext i32 %18 to i64
%20 = getelementptr inbounds i32, ptr %17, i64 %19
%21 = load i32, ptr %20, align 4
%22 = load ptr, ptr %6, align 8
%23 = load i32, ptr %7, align 4
%24 = sext i32 %23 to i64
%25 = getelementptr inbounds i32, ptr %22, i64 %24
%26 = load i32, ptr %25, align 4
%27 = mul nsw i32 %21, %26
%28 = load ptr, ptr %5, align 8
%29 = load i32, ptr %8, align 4
%30 = sext i32 %29 to i64
%31 = getelementptr inbounds i32, ptr %28, i64 %30
%32 = load i32, ptr %31, align 4
%33 = add nsw i32 %32, %27
store i32 %33, ptr %31, align 4
br label %34

34: ; preds = %16
%35 = load i32, ptr %8, align 4
%36 = add nsw i32 %35, 1
store i32 %36, ptr %8, align 4
br label %13, !llvm.loop !6

37: ; preds = %13
br label %38

38: ; preds = %37
%39 = load i32, ptr %7, align 4
%40 = add nsw i32 %39, 1
store i32 %40, ptr %7, align 4
br label %9, !llvm.loop !8

41: ; preds = %9
ret void
}

declare i32 @printf(ptr noundef, ...) #2

attributes #0 = { mustprogress noinline norecurse optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { mustprogress noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #2 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }

!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{i32 7, !"frame-pointer", i32 2}
!5 = !{!"clang version 20.1.7 (https://github.com/llvm/llvm-project.git 6146a88f60492b520a36f8f8f3231e15f3cc6082)"}
!6 = distinct !{!6, !7}
!7 = !{!"llvm.loop.mustprogress"}
!8 = distinct !{!8, !7}
82 changes: 82 additions & 0 deletions test/c2llvm2mlir/nested_loop/kernel.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#loop_annotation = #llvm.loop_annotation<mustProgress = true>
module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, i32 = dense<32> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i16 = dense<16> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, "dlti.endianness" = "little", "dlti.stack_alignment" = 128 : i64>, llvm.ident = "clang version 20.1.7 (https://github.com/llvm/llvm-project.git 6146a88f60492b520a36f8f8f3231e15f3cc6082)"} {
llvm.mlir.global external @input(dense<1> : tensor<32xi32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x i32>
llvm.mlir.global external @output(dense<0> : tensor<32xi32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x i32>
llvm.mlir.global external @coefficients(dense<[25, 150, 375, -225, 50, 75, -300, 125, 25, 150, 375, -225, 50, 75, -300, 125, 25, 150, 375, -225, 50, 75, -300, 125, 25, 150, 375, -225, 50, 75, -300, 125]> : tensor<32xi32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x i32>
llvm.mlir.global private unnamed_addr constant @".str"("output: %d\0A\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local}
llvm.func @main() -> (i32 {llvm.noundef}) attributes {frame_pointer = #llvm.framePointerKind<all>, no_inline, optimize_none, passthrough = ["mustprogress", "norecurse", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
%0 = llvm.mlir.constant(1 : i32) : i32
%1 = llvm.mlir.constant(0 : i32) : i32
%2 = llvm.mlir.addressof @input : !llvm.ptr
%3 = llvm.mlir.addressof @output : !llvm.ptr
%4 = llvm.mlir.addressof @coefficients : !llvm.ptr
%5 = llvm.mlir.addressof @".str" : !llvm.ptr
%6 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr
llvm.store %1, %6 {alignment = 4 : i64} : i32, !llvm.ptr
llvm.call @_Z6kernelPiS_S_(%2, %3, %4) {no_unwind} : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> ()
%7 = llvm.load %3 {alignment = 16 : i64} : !llvm.ptr -> i32
%8 = llvm.call @printf(%5, %7) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32) -> i32
llvm.return %1 : i32
}
llvm.func @_Z6kernelPiS_S_(%arg0: !llvm.ptr {llvm.noundef}, %arg1: !llvm.ptr {llvm.noundef}, %arg2: !llvm.ptr {llvm.noundef}) attributes {frame_pointer = #llvm.framePointerKind<all>, no_inline, no_unwind, optimize_none, passthrough = ["mustprogress", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
%0 = llvm.mlir.constant(1 : i32) : i32
%1 = llvm.mlir.constant(0 : i32) : i32
%2 = llvm.mlir.constant(32 : i32) : i32
%3 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr
%4 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr
%5 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr
%6 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr
%7 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr
llvm.store %arg0, %3 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr
llvm.store %arg1, %4 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr
llvm.store %arg2, %5 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr
llvm.store %1, %6 {alignment = 4 : i64} : i32, !llvm.ptr
llvm.br ^bb1
^bb1: // 2 preds: ^bb0, ^bb7
%8 = llvm.load %6 {alignment = 4 : i64} : !llvm.ptr -> i32
%9 = llvm.icmp "slt" %8, %2 : i32
llvm.cond_br %9, ^bb2, ^bb8
^bb2: // pred: ^bb1
llvm.store %1, %7 {alignment = 4 : i64} : i32, !llvm.ptr
llvm.br ^bb3
^bb3: // 2 preds: ^bb2, ^bb5
%10 = llvm.load %7 {alignment = 4 : i64} : !llvm.ptr -> i32
%11 = llvm.icmp "slt" %10, %2 : i32
llvm.cond_br %11, ^bb4, ^bb6
^bb4: // pred: ^bb3
%12 = llvm.load %3 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr
%13 = llvm.load %6 {alignment = 4 : i64} : !llvm.ptr -> i32
%14 = llvm.sext %13 : i32 to i64
%15 = llvm.getelementptr inbounds %12[%14] : (!llvm.ptr, i64) -> !llvm.ptr, i32
%16 = llvm.load %15 {alignment = 4 : i64} : !llvm.ptr -> i32
%17 = llvm.load %5 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr
%18 = llvm.load %6 {alignment = 4 : i64} : !llvm.ptr -> i32
%19 = llvm.sext %18 : i32 to i64
%20 = llvm.getelementptr inbounds %17[%19] : (!llvm.ptr, i64) -> !llvm.ptr, i32
%21 = llvm.load %20 {alignment = 4 : i64} : !llvm.ptr -> i32
%22 = llvm.mul %16, %21 overflow<nsw> : i32
%23 = llvm.load %4 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr
%24 = llvm.load %7 {alignment = 4 : i64} : !llvm.ptr -> i32
%25 = llvm.sext %24 : i32 to i64
%26 = llvm.getelementptr inbounds %23[%25] : (!llvm.ptr, i64) -> !llvm.ptr, i32
%27 = llvm.load %26 {alignment = 4 : i64} : !llvm.ptr -> i32
%28 = llvm.add %27, %22 overflow<nsw> : i32
llvm.store %28, %26 {alignment = 4 : i64} : i32, !llvm.ptr
llvm.br ^bb5
^bb5: // pred: ^bb4
%29 = llvm.load %7 {alignment = 4 : i64} : !llvm.ptr -> i32
%30 = llvm.add %29, %0 overflow<nsw> : i32
llvm.store %30, %7 {alignment = 4 : i64} : i32, !llvm.ptr
llvm.br ^bb3 {loop_annotation = #loop_annotation}
^bb6: // pred: ^bb3
llvm.br ^bb7
^bb7: // pred: ^bb6
%31 = llvm.load %6 {alignment = 4 : i64} : !llvm.ptr -> i32
%32 = llvm.add %31, %0 overflow<nsw> : i32
llvm.store %32, %6 {alignment = 4 : i64} : i32, !llvm.ptr
llvm.br ^bb1 {loop_annotation = #loop_annotation}
^bb8: // pred: ^bb1
llvm.return
}
llvm.func @printf(!llvm.ptr {llvm.noundef}, ...) -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"}
}
Loading
Loading