-
Notifications
You must be signed in to change notification settings - Fork 15
[P0] Potential redundant-operations optimization #152
Copy link
Copy link
Closed
Description
I've been testing the c2llvm2mlir test cases recently, and I found that there are some redundant operations after converting to the dataflow-format MLIR that could potentially be optimized away.
Take test/c2llvm2mlir/simple_loop for example:
The original mlir is:
llvm.func local_unnamed_addr @_Z6kernelPfS_S_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}, %arg2: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}) attributes {memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
%0 = llvm.mlir.constant(0 : i64) : i64
%1 = llvm.mlir.constant(1 : i64) : i64
%2 = llvm.mlir.constant(32 : i64) : i64
%3 = llvm.load %arg1 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> f32
llvm.br ^bb1(%0, %3 : i64, f32)
^bb1(%4: i64, %5: f32): // 2 preds: ^bb0, ^bb1
%6 = llvm.getelementptr inbounds %arg0[%4] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%7 = llvm.load %6 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> f32
%8 = llvm.getelementptr inbounds %arg2[%4] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%9 = llvm.load %8 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> f32
%10 = llvm.fmul %7, %9 : f32
%11 = llvm.fadd %5, %10 : f32
llvm.store %11, %arg1 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : f32, !llvm.ptr
%12 = llvm.add %4, %1 overflow<nsw, nuw> : i64
%13 = llvm.icmp "eq" %12, %2 : i64
llvm.cond_br %13, ^bb2, ^bb1(%12, %11 : i64, f32) {loop_annotation = #loop_annotation}
^bb2: // pred: ^bb1
llvm.return
}
After canonicalize-live-in:
func.func @_Z6kernelPfS_S_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}, %arg2: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", linkage = #llvm.linkage<external>, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
%0 = "neura.constant"() <{value = "%arg0"}> : () -> !llvm.ptr
%1 = "neura.constant"() <{value = "%arg1"}> : () -> !llvm.ptr
%2 = "neura.constant"() <{value = "%arg2"}> : () -> !llvm.ptr
%3 = "neura.constant"() <{value = 0 : i64}> : () -> i64
%4 = "neura.constant"() <{value = 1 : i64}> : () -> i64
%5 = "neura.constant"() <{value = 32 : i64}> : () -> i64
%6 = "neura.load"(%1) : (!llvm.ptr) -> f32
neura.br %3, %6, %0, %2, %1, %4, %5 : i64, f32, !llvm.ptr, !llvm.ptr, !llvm.ptr, i64, i64 to ^bb1
^bb1(%7: i64, %8: f32, %9: !llvm.ptr, %10: !llvm.ptr, %11: !llvm.ptr, %12: i64, %13: i64): // 2 preds: ^bb0, ^bb1
%14 = "neura.gep"(%9, %7) : (!llvm.ptr, i64) -> !llvm.ptr
%15 = "neura.load"(%14) : (!llvm.ptr) -> f32
%16 = "neura.gep"(%10, %7) : (!llvm.ptr, i64) -> !llvm.ptr
%17 = "neura.load"(%16) : (!llvm.ptr) -> f32
%18 = "neura.fmul"(%15, %17) : (f32, f32) -> f32
%19 = "neura.fadd"(%8, %18) : (f32, f32) -> f32
"neura.store"(%19, %11) : (f32, !llvm.ptr) -> ()
%20 = "neura.add"(%7, %12) : (i64, i64) -> i64
%21 = "neura.icmp"(%20, %13) <{cmpType = "eq"}> : (i64, i64) -> i1
neura.cond_br %21 : i1 then to ^bb2 else %20, %19, %9, %10, %11, %12, %13 : i64, f32, !llvm.ptr, !llvm.ptr, !llvm.ptr, i64, i64 to ^bb1
^bb2: // pred: ^bb1
"neura.return"() : () -> ()
}
After transform-ctrl-to-data-flow:
func.func @_Z6kernelPfS_S_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}, %arg2: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage<external>, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
%0 = "neura.constant"() <{value = "%arg0"}> : () -> !neura.data<!llvm.ptr, i1>
%1 = "neura.grant_once"(%0) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
%2 = "neura.constant"() <{value = "%arg1"}> : () -> !neura.data<!llvm.ptr, i1>
%3 = "neura.grant_once"(%2) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
%4 = "neura.constant"() <{value = "%arg2"}> : () -> !neura.data<!llvm.ptr, i1>
%5 = "neura.grant_once"(%4) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
%6 = "neura.constant"() <{value = 0 : i64}> : () -> !neura.data<i64, i1>
%7 = "neura.grant_once"(%6) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
%8 = "neura.constant"() <{value = 1 : i64}> : () -> !neura.data<i64, i1>
%9 = "neura.grant_once"(%8) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
%10 = "neura.constant"() <{value = 32 : i64}> : () -> !neura.data<i64, i1>
%11 = "neura.grant_once"(%10) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
%12 = "neura.load"(%2) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<f32, i1>
%13 = "neura.grant_once"(%12) : (!neura.data<f32, i1>) -> !neura.data<f32, i1>
%14 = neura.reserve : !neura.data<i64, i1>
%15 = "neura.phi"(%14, %11) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
%16 = neura.reserve : !neura.data<i64, i1>
%17 = "neura.phi"(%16, %9) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
%18 = neura.reserve : !neura.data<!llvm.ptr, i1>
%19 = "neura.phi"(%18, %3) : (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
%20 = neura.reserve : !neura.data<!llvm.ptr, i1>
%21 = "neura.phi"(%20, %5) : (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
%22 = neura.reserve : !neura.data<!llvm.ptr, i1>
%23 = "neura.phi"(%22, %1) : (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
%24 = neura.reserve : !neura.data<f32, i1>
%25 = "neura.phi"(%24, %13) : (!neura.data<f32, i1>, !neura.data<f32, i1>) -> !neura.data<f32, i1>
%26 = neura.reserve : !neura.data<i64, i1>
%27 = "neura.phi"(%26, %7) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
%28 = "neura.gep"(%23, %27) : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
%29 = "neura.load"(%28) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<f32, i1>
%30 = "neura.gep"(%21, %27) : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
%31 = "neura.load"(%30) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<f32, i1>
%32 = "neura.fmul"(%29, %31) : (!neura.data<f32, i1>, !neura.data<f32, i1>) -> !neura.data<f32, i1>
%33 = "neura.fadd"(%25, %32) : (!neura.data<f32, i1>, !neura.data<f32, i1>) -> !neura.data<f32, i1>
"neura.store"(%33, %19) : (!neura.data<f32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
%34 = "neura.add"(%27, %17) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
%35 = "neura.icmp"(%34, %15) <{cmpType = "eq"}> : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i1, i1>
%36 = "neura.not"(%35) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
%37 = neura.grant_predicate %34, %36 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
neura.ctrl_mov %37 -> %26 : !neura.data<i64, i1> !neura.data<i64, i1>
%38 = neura.grant_predicate %33, %36 : !neura.data<f32, i1>, !neura.data<i1, i1> -> !neura.data<f32, i1>
neura.ctrl_mov %38 -> %24 : !neura.data<f32, i1> !neura.data<f32, i1>
%39 = neura.grant_predicate %23, %36 : !neura.data<!llvm.ptr, i1>, !neura.data<i1, i1> -> !neura.data<!llvm.ptr, i1>
neura.ctrl_mov %39 -> %22 : !neura.data<!llvm.ptr, i1> !neura.data<!llvm.ptr, i1>
%40 = neura.grant_predicate %21, %36 : !neura.data<!llvm.ptr, i1>, !neura.data<i1, i1> -> !neura.data<!llvm.ptr, i1>
neura.ctrl_mov %40 -> %20 : !neura.data<!llvm.ptr, i1> !neura.data<!llvm.ptr, i1>
%41 = neura.grant_predicate %19, %36 : !neura.data<!llvm.ptr, i1>, !neura.data<i1, i1> -> !neura.data<!llvm.ptr, i1>
neura.ctrl_mov %41 -> %18 : !neura.data<!llvm.ptr, i1> !neura.data<!llvm.ptr, i1>
%42 = neura.grant_predicate %17, %36 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
neura.ctrl_mov %42 -> %16 : !neura.data<i64, i1> !neura.data<i64, i1>
%43 = neura.grant_predicate %15, %36 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
neura.ctrl_mov %43 -> %14 : !neura.data<i64, i1> !neura.data<i64, i1>
"neura.return"() : () -> ()
}
After fold-constant:
func.func @_Z6kernelPfS_S_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}, %arg2: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage<external>, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
%0 = "neura.grant_once"() <{constant_value = "%arg0"}> : () -> !neura.data<!llvm.ptr, i1>
%1 = "neura.grant_once"() <{constant_value = "%arg1"}> : () -> !neura.data<!llvm.ptr, i1>
%2 = "neura.constant"() <{value = "%arg1"}> : () -> !neura.data<!llvm.ptr, i1>
%3 = "neura.grant_once"() <{constant_value = "%arg2"}> : () -> !neura.data<!llvm.ptr, i1>
%4 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1>
%5 = "neura.grant_once"() <{constant_value = 1 : i64}> : () -> !neura.data<i64, i1>
%6 = "neura.grant_once"() <{constant_value = 32 : i64}> : () -> !neura.data<i64, i1>
%7 = "neura.load"(%2) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<f32, i1>
%8 = "neura.grant_once"(%7) : (!neura.data<f32, i1>) -> !neura.data<f32, i1>
%9 = neura.reserve : !neura.data<i64, i1>
%10 = "neura.phi"(%9, %6) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
%11 = neura.reserve : !neura.data<i64, i1>
%12 = "neura.phi"(%11, %5) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
%13 = neura.reserve : !neura.data<!llvm.ptr, i1>
%14 = "neura.phi"(%13, %1) : (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
%15 = neura.reserve : !neura.data<!llvm.ptr, i1>
%16 = "neura.phi"(%15, %3) : (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
%17 = neura.reserve : !neura.data<!llvm.ptr, i1>
%18 = "neura.phi"(%17, %0) : (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
%19 = neura.reserve : !neura.data<f32, i1>
%20 = "neura.phi"(%19, %8) : (!neura.data<f32, i1>, !neura.data<f32, i1>) -> !neura.data<f32, i1>
%21 = neura.reserve : !neura.data<i64, i1>
%22 = "neura.phi"(%21, %4) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
%23 = "neura.gep"(%18, %22) : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
%24 = "neura.load"(%23) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<f32, i1>
%25 = "neura.gep"(%16, %22) : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
%26 = "neura.load"(%25) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<f32, i1>
%27 = "neura.fmul"(%24, %26) : (!neura.data<f32, i1>, !neura.data<f32, i1>) -> !neura.data<f32, i1>
%28 = "neura.fadd"(%20, %27) : (!neura.data<f32, i1>, !neura.data<f32, i1>) -> !neura.data<f32, i1>
"neura.store"(%28, %14) : (!neura.data<f32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
%29 = "neura.add"(%22, %12) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
%30 = "neura.icmp"(%29, %10) <{cmpType = "eq"}> : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i1, i1>
%31 = "neura.not"(%30) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
%32 = neura.grant_predicate %29, %31 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
neura.ctrl_mov %32 -> %21 : !neura.data<i64, i1> !neura.data<i64, i1>
%33 = neura.grant_predicate %28, %31 : !neura.data<f32, i1>, !neura.data<i1, i1> -> !neura.data<f32, i1>
neura.ctrl_mov %33 -> %19 : !neura.data<f32, i1> !neura.data<f32, i1>
%34 = neura.grant_predicate %18, %31 : !neura.data<!llvm.ptr, i1>, !neura.data<i1, i1> -> !neura.data<!llvm.ptr, i1>
neura.ctrl_mov %34 -> %17 : !neura.data<!llvm.ptr, i1> !neura.data<!llvm.ptr, i1>
%35 = neura.grant_predicate %16, %31 : !neura.data<!llvm.ptr, i1>, !neura.data<i1, i1> -> !neura.data<!llvm.ptr, i1>
neura.ctrl_mov %35 -> %15 : !neura.data<!llvm.ptr, i1> !neura.data<!llvm.ptr, i1>
%36 = neura.grant_predicate %14, %31 : !neura.data<!llvm.ptr, i1>, !neura.data<i1, i1> -> !neura.data<!llvm.ptr, i1>
neura.ctrl_mov %36 -> %13 : !neura.data<!llvm.ptr, i1> !neura.data<!llvm.ptr, i1>
%37 = neura.grant_predicate %12, %31 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
neura.ctrl_mov %37 -> %11 : !neura.data<i64, i1> !neura.data<i64, i1>
%38 = neura.grant_predicate %10, %31 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
neura.ctrl_mov %38 -> %9 : !neura.data<i64, i1> !neura.data<i64, i1>
"neura.return"() : () -> ()
}
The final dataflow-format MLIR adds several grant_predicate, grant_once, reserve, and phi operators, which noticeably increases the total operation count. For example, the loop-invariant pointer and constant values (%arg0, %arg1, %arg2, the step, and the bound) are each routed through their own reserve/phi/grant_predicate/ctrl_mov cycle even though they never change across iterations. So there is a chance that we could remove some redundant operations after transforming to the dataflow format.
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels