diff --git a/test/testbench/histogram/histogram.cpp b/test/testbench/histogram/histogram.cpp
new file mode 100644
index 00000000..9f130d1a
--- /dev/null
+++ b/test/testbench/histogram/histogram.cpp
@@ -0,0 +1,64 @@
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#define DATA_LEN 20
+#define BUCKET_LEN 5
+#define MIN 1.0
+#define MAX 19.0
+
+void kernel(float input_data[], int histogram[]);
+void output();
+
+float input_data[DATA_LEN] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,14,14,14,14,14,19};
+// kernel() maps an input equal to MAX to index BUCKET_LEN, one past the last
+// printed bucket, so one extra counter is reserved to keep that write in bounds.
+int histogram[BUCKET_LEN + 1] = {0};
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main( int argc, char** argv) {
+
+  printf("DATA_LEN %d BUCKET_LEN %d\n",DATA_LEN, BUCKET_LEN);
+  kernel(input_data, histogram);
+  // Fold the extra counter (inputs equal to MAX) into the last real bucket so
+  // the BUCKET_LEN values printed by output() cover the closed range [MIN, MAX].
+  histogram[BUCKET_LEN - 1] += histogram[BUCKET_LEN];
+  output();
+
+  return 0;
+}
+
+// Increments histogram[b] for each of the DATA_LEN values in input, where b is
+// BUCKET_LEN * (input[i] - MIN) / (MAX - MIN) truncated to int.
+// An input equal to MAX yields b == BUCKET_LEN, so histogram must provide
+// BUCKET_LEN + 1 counters. Inputs outside [MIN, MAX] are not range-checked.
+// The loop body is kept identical to the one in histogram_kernel.cpp.
+void kernel(float input[], int histogram[]) {
+  int i;
+  float dmin = (float)MIN;
+  float delt = (float)(MAX - dmin);
+
+  #pragma clang loop vectorize(enable) vectorize_width(4) unroll_count(4)
+  for (i = 0; i < DATA_LEN; i++) {
+    float r = BUCKET_LEN * (input[i] - dmin) / delt;
+    int b = (int)(r);
+    histogram[b]++;
+  }
+}
+
+// Prints BUCKET_LEN, MIN, MAX - MIN and the first BUCKET_LEN counters.
+void output() {
+  printf("len %d\n", BUCKET_LEN);
+  printf("min %f\n", MIN);
+  printf("del %f\n", MAX-MIN);
+  for (int i = 0; i < BUCKET_LEN; i++)
+    printf("%d ", histogram[i]);
+  printf("\n");
+}
\ No newline at end of file
diff --git a/test/testbench/histogram/histogram_kernel.cpp b/test/testbench/histogram/histogram_kernel.cpp
new file mode 100644
index 00000000..792ba523
--- /dev/null
+++ b/test/testbench/histogram/histogram_kernel.cpp
@@ -0,0 +1,20 @@
+#define DATA_LEN 20
+#define BUCKET_LEN 5
+#define MIN 1.0
+#define MAX 19.0
+
+// Increments histogram[b] for each of the DATA_LEN values in input, where b is
+// BUCKET_LEN * (input[i] - MIN) / (MAX - MIN) truncated to int.
+// An input equal to MAX yields b == BUCKET_LEN, so histogram must provide
+// BUCKET_LEN + 1 counters. Inputs outside [MIN, MAX] are not range-checked.
+void kernel(float input[], int histogram[]) {
+  int i;
+  float dmin = (float)MIN;
+  float delt = (float)(MAX - dmin);
+
+  for (i = 0; i < DATA_LEN; i++) {
+    float r = BUCKET_LEN * (input[i] - dmin) / delt;
+    int b = (int)(r);
+    histogram[b]++;
+  }
+}
diff --git a/test/testbench/histogram/histogram_kernel.ll b/test/testbench/histogram/histogram_kernel.ll
new file mode 100644
index 00000000..eae036e4
--- /dev/null
+++ b/test/testbench/histogram/histogram_kernel.ll
@@ -0,0 +1,60 @@
+; ModuleID = 'histogram_kernel.cpp'
+source_filename = "histogram_kernel.cpp"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable
+define dso_local void @_Z6kernelPfPi(ptr nocapture noundef readonly %0, ptr nocapture noundef %1) local_unnamed_addr #0 {
+  br label %3
+
+3: ; preds = %3, %2
+  %4 = phi i64 [ 0, %2 ], [ %26, %3 ]
+  %5 = getelementptr inbounds nuw float, ptr %0, i64 %4
+  %6 = load float, ptr %5, align 4, !tbaa !5
+  %7 = fadd float %6, -1.000000e+00
+  %8 = fmul float %7, 5.000000e+00
+  %9 = fdiv float %8, 1.800000e+01
+  %10 = fptosi float %9 to i32
+  %11 = sext i32 %10 to i64
+  %12 = getelementptr inbounds i32, ptr %1, i64 %11
+  %13 = load i32, ptr %12, align 4, !tbaa !9
+  %14 = add nsw i32 %13, 1
+  store i32 %14, ptr %12, align 4, !tbaa !9
+  %15 = or disjoint i64 %4, 1
+  %16 = getelementptr inbounds nuw float, ptr %0, i64 %15
+  %17 = load float, ptr %16, align 4, !tbaa !5
+  %18 = fadd float %17, -1.000000e+00
+  %19 = fmul float %18, 5.000000e+00
+  %20 = fdiv float %19, 1.800000e+01
+  %21 = fptosi float %20 to i32
+  %22 = sext i32 %21 to i64
+  %23 = getelementptr inbounds i32, ptr %1, i64 %22
+  %24 = load i32, ptr %23, align 4, !tbaa !9
+  %25 = add nsw i32 %24, 1
+  store i32 %25, ptr %23, align 4, !tbaa !9
+  %26 = add nuw nsw i64 %4, 2
+  %27 = icmp eq i64 %26, 20
+  br i1 %27, label %28, label %3, !llvm.loop !11
+
+28: ; preds = %3
+  ret void
+}
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+!llvm.ident = !{!4}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{!"clang version 20.1.7 (https://github.com/llvm/llvm-project.git 6146a88f60492b520a36f8f8f3231e15f3cc6082)"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"float", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C++ TBAA"}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"int", !7, i64 0}
+!11 = distinct !{!11, !12}
+!12 = !{!"llvm.loop.mustprogress"}
diff --git a/test/testbench/histogram/histogram_kernel.mlir b/test/testbench/histogram/histogram_kernel.mlir
new file mode 100644
index 00000000..cbf0a4a1
--- /dev/null
+++ b/test/testbench/histogram/histogram_kernel.mlir
@@ -0,0 +1,49 @@
+#loop_annotation = #llvm.loop_annotation
+#tbaa_root = #llvm.tbaa_root
+#tbaa_type_desc = #llvm.tbaa_type_desc}>
+#tbaa_type_desc1 = #llvm.tbaa_type_desc}>
+#tbaa_type_desc2 = #llvm.tbaa_type_desc}>
+#tbaa_tag = #llvm.tbaa_tag
+#tbaa_tag1 = #llvm.tbaa_tag
+module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f64 = dense<64> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, llvm.ident = "clang version 20.1.7 (https://github.com/llvm/llvm-project.git 6146a88f60492b520a36f8f8f3231e15f3cc6082)"} {
+  llvm.func local_unnamed_addr @_Z6kernelPfPi(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}) attributes {memory_effects = #llvm.memory_effects, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
+    %0 = llvm.mlir.constant(0 : i64) : i64
+    %1 = llvm.mlir.constant(-1.000000e+00 : f32) : f32
+    %2 = llvm.mlir.constant(5.000000e+00 : f32) : f32
+    %3 = llvm.mlir.constant(1.800000e+01 : f32) : f32
+    %4 = llvm.mlir.constant(1 : i32) : i32
+    %5 = llvm.mlir.constant(1 : i64) : i64
+    %6 = llvm.mlir.constant(2 : i64) : i64
+    %7 = llvm.mlir.constant(20 : i64) : i64
+    llvm.br ^bb1(%0 : i64)
+  ^bb1(%8: i64): // 2 preds: ^bb0, ^bb1
+    %9 = llvm.getelementptr inbounds %arg0[%8] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %10 = llvm.load %9 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> f32
+    %11 = llvm.fadd %10, %1 : f32
+    %12 = llvm.fmul %11, %2 : f32
+    %13 = llvm.fdiv %12, %3 : f32
+    %14 = llvm.fptosi %13 : f32 to i32
+    %15 = llvm.sext %14 : i32 to i64
+    %16 = llvm.getelementptr inbounds %arg1[%15] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    %17 = llvm.load %16 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> i32
+    %18 = llvm.add %17, %4 overflow : i32
+    llvm.store %18, %16 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : i32, !llvm.ptr
+    %19 = llvm.or disjoint %8, %5 : i64
+    %20 = llvm.getelementptr inbounds %arg0[%19] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %21 = llvm.load %20 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> f32
+    %22 = llvm.fadd %21, %1 : f32
+    %23 = llvm.fmul %22, %2 : f32
+    %24 = llvm.fdiv %23, %3 : f32
+    %25 = llvm.fptosi %24 : f32 to i32
+    %26 = llvm.sext %25 : i32 to i64
+    %27 = llvm.getelementptr inbounds %arg1[%26] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    %28 = llvm.load %27 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> i32
+    %29 = llvm.add %28, %4 overflow : i32
+    llvm.store %29, %27 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : i32, !llvm.ptr
+    %30 = llvm.add %8, %6 overflow : i64
+    %31 = llvm.icmp "eq" %30, %7 : i64
+    llvm.cond_br %31, ^bb2, ^bb1(%30 : i64) {loop_annotation = #loop_annotation}
+  ^bb2: // pred: ^bb1
+    llvm.return
+  }
+}
diff --git a/test/testbench/histogram/histogram_kernel_neura.mlir b/test/testbench/histogram/histogram_kernel_neura.mlir
new file mode 100644
index 00000000..b8f1f204
--- /dev/null
+++ b/test/testbench/histogram/histogram_kernel_neura.mlir
@@ -0,0 +1,43 @@
+module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f64 = dense<64> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, llvm.ident = "clang version 20.1.7 (https://github.com/llvm/llvm-project.git 6146a88f60492b520a36f8f8f3231e15f3cc6082)"} {
+  func.func @_Z6kernelPfPi(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}) -> !llvm.void attributes {CConv = #llvm.cconv, accelerator = "neura", linkage = #llvm.linkage, memory_effects = #llvm.memory_effects, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
+    %0 = "neura.constant"() <{value = 0 : i64}> : () -> i64
+    %1 = "neura.constant"() <{value = -1.000000e+00 : f32}> : () -> f32
+    %2 = "neura.constant"() <{value = 5.000000e+00 : f32}> : () -> f32
+    %3 = "neura.constant"() <{value = 1.800000e+01 : f32}> : () -> f32
+    %4 = "neura.constant"() <{value = 1 : i32}> : () -> i32
+    %5 = "neura.constant"() <{value = 1 : i64}> : () -> i64
+    %6 = "neura.constant"() <{value = 2 : i64}> : () -> i64
+    %7 = "neura.constant"() <{value = 20 : i64}> : () -> i64
+    neura.br %0 : i64 to ^bb1
+  ^bb1(%8: i64): // 2 preds: ^bb0, ^bb1
+    %9 = "neura.gep"(%arg0, %8) : (!llvm.ptr, i64) -> !llvm.ptr
+    %10 = "neura.load"(%9) : (!llvm.ptr) -> f32
+    %11 = "neura.fadd"(%10, %1) : (f32, f32) -> f32
+    %12 = "neura.fmul"(%11, %2) : (f32, f32) -> f32
+    %13 = llvm.fdiv %12, %3 : f32
+    %14 = llvm.fptosi %13 : f32 to i32
+    %15 = neura.sext %14 : i32 -> i64
+    %16 = "neura.gep"(%arg1, %15) : (!llvm.ptr, i64) -> !llvm.ptr
+    %17 = "neura.load"(%16) : (!llvm.ptr) -> i32
+    %18 = "neura.add"(%17, %4) : (i32, i32) -> i32
+    "neura.store"(%18, %16) : (i32, !llvm.ptr) -> ()
+    %19 = "neura.or"(%8, %5) : (i64, i64) -> i64
+    %20 = "neura.gep"(%arg0, %19) : (!llvm.ptr, i64) -> !llvm.ptr
+    %21 = "neura.load"(%20) : (!llvm.ptr) -> f32
+    %22 = "neura.fadd"(%21, %1) : (f32, f32) -> f32
+    %23 = "neura.fmul"(%22, %2) : (f32, f32) -> f32
+    %24 = llvm.fdiv %23, %3 : f32
+    %25 = llvm.fptosi %24 : f32 to i32
+    %26 = neura.sext %25 : i32 -> i64
+    %27 = "neura.gep"(%arg1, %26) : (!llvm.ptr, i64) -> !llvm.ptr
+    %28 = "neura.load"(%27) : (!llvm.ptr) -> i32
+    %29 = "neura.add"(%28, %4) : (i32, i32) -> i32
+    "neura.store"(%29, %27) : (i32, !llvm.ptr) -> ()
+    %30 = "neura.add"(%8, %6) : (i64, i64) -> i64
+    %31 = "neura.icmp"(%30, %7) <{cmpType = "eq"}> : (i64, i64) -> i1
+    neura.cond_br %31 : i1 then to ^bb2 else %30 : i64 to ^bb1
+  ^bb2: // pred: ^bb1
+    "neura.return"() : () -> ()
+  }
+}
+