Skip to content

[P0] Potential redundant operations optimization #152

@YanzhouTang

Description

@YanzhouTang

I've been testing the c2llvm2mlir testcase recently, and I found that after converting to dataflow-format MLIR there are some redundant operations that could potentially be optimized away.

Take test/c2llvm2mlir/simple_loop as an example:

The original mlir is:

  llvm.func local_unnamed_addr @_Z6kernelPfS_S_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}, %arg2: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}) attributes {memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
    %0 = llvm.mlir.constant(0 : i64) : i64
    %1 = llvm.mlir.constant(1 : i64) : i64
    %2 = llvm.mlir.constant(32 : i64) : i64
    %3 = llvm.load %arg1 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> f32
    llvm.br ^bb1(%0, %3 : i64, f32)
  ^bb1(%4: i64, %5: f32):  // 2 preds: ^bb0, ^bb1
    %6 = llvm.getelementptr inbounds %arg0[%4] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %7 = llvm.load %6 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> f32
    %8 = llvm.getelementptr inbounds %arg2[%4] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %9 = llvm.load %8 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> f32
    %10 = llvm.fmul %7, %9 : f32
    %11 = llvm.fadd %5, %10 : f32
    llvm.store %11, %arg1 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : f32, !llvm.ptr
    %12 = llvm.add %4, %1 overflow<nsw, nuw> : i64
    %13 = llvm.icmp "eq" %12, %2 : i64
    llvm.cond_br %13, ^bb2, ^bb1(%12, %11 : i64, f32) {loop_annotation = #loop_annotation}
  ^bb2:  // pred: ^bb1
    llvm.return
  }

After canonicalize-live-in:

  func.func @_Z6kernelPfS_S_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}, %arg2: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", linkage = #llvm.linkage<external>, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
    %0 = "neura.constant"() <{value = "%arg0"}> : () -> !llvm.ptr
    %1 = "neura.constant"() <{value = "%arg1"}> : () -> !llvm.ptr
    %2 = "neura.constant"() <{value = "%arg2"}> : () -> !llvm.ptr
    %3 = "neura.constant"() <{value = 0 : i64}> : () -> i64
    %4 = "neura.constant"() <{value = 1 : i64}> : () -> i64
    %5 = "neura.constant"() <{value = 32 : i64}> : () -> i64
    %6 = "neura.load"(%1) : (!llvm.ptr) -> f32
    neura.br %3, %6, %0, %2, %1, %4, %5 : i64, f32, !llvm.ptr, !llvm.ptr, !llvm.ptr, i64, i64 to ^bb1
  ^bb1(%7: i64, %8: f32, %9: !llvm.ptr, %10: !llvm.ptr, %11: !llvm.ptr, %12: i64, %13: i64):  // 2 preds: ^bb0, ^bb1
    %14 = "neura.gep"(%9, %7) : (!llvm.ptr, i64) -> !llvm.ptr
    %15 = "neura.load"(%14) : (!llvm.ptr) -> f32
    %16 = "neura.gep"(%10, %7) : (!llvm.ptr, i64) -> !llvm.ptr
    %17 = "neura.load"(%16) : (!llvm.ptr) -> f32
    %18 = "neura.fmul"(%15, %17) : (f32, f32) -> f32
    %19 = "neura.fadd"(%8, %18) : (f32, f32) -> f32
    "neura.store"(%19, %11) : (f32, !llvm.ptr) -> ()
    %20 = "neura.add"(%7, %12) : (i64, i64) -> i64
    %21 = "neura.icmp"(%20, %13) <{cmpType = "eq"}> : (i64, i64) -> i1
    neura.cond_br %21 : i1 then to ^bb2 else %20, %19, %9, %10, %11, %12, %13 : i64, f32, !llvm.ptr, !llvm.ptr, !llvm.ptr, i64, i64 to ^bb1
  ^bb2:  // pred: ^bb1
    "neura.return"() : () -> ()
  }

After transform-ctrl-to-data-flow:

  func.func @_Z6kernelPfS_S_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}, %arg2: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage<external>, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
    %0 = "neura.constant"() <{value = "%arg0"}> : () -> !neura.data<!llvm.ptr, i1>
    %1 = "neura.grant_once"(%0) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
    %2 = "neura.constant"() <{value = "%arg1"}> : () -> !neura.data<!llvm.ptr, i1>
    %3 = "neura.grant_once"(%2) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
    %4 = "neura.constant"() <{value = "%arg2"}> : () -> !neura.data<!llvm.ptr, i1>
    %5 = "neura.grant_once"(%4) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
    %6 = "neura.constant"() <{value = 0 : i64}> : () -> !neura.data<i64, i1>
    %7 = "neura.grant_once"(%6) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %8 = "neura.constant"() <{value = 1 : i64}> : () -> !neura.data<i64, i1>
    %9 = "neura.grant_once"(%8) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %10 = "neura.constant"() <{value = 32 : i64}> : () -> !neura.data<i64, i1>
    %11 = "neura.grant_once"(%10) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %12 = "neura.load"(%2) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<f32, i1>
    %13 = "neura.grant_once"(%12) : (!neura.data<f32, i1>) -> !neura.data<f32, i1>
    %14 = neura.reserve : !neura.data<i64, i1>
    %15 = "neura.phi"(%14, %11) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    %16 = neura.reserve : !neura.data<i64, i1>
    %17 = "neura.phi"(%16, %9) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    %18 = neura.reserve : !neura.data<!llvm.ptr, i1>
    %19 = "neura.phi"(%18, %3) : (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
    %20 = neura.reserve : !neura.data<!llvm.ptr, i1>
    %21 = "neura.phi"(%20, %5) : (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
    %22 = neura.reserve : !neura.data<!llvm.ptr, i1>
    %23 = "neura.phi"(%22, %1) : (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
    %24 = neura.reserve : !neura.data<f32, i1>
    %25 = "neura.phi"(%24, %13) : (!neura.data<f32, i1>, !neura.data<f32, i1>) -> !neura.data<f32, i1>
    %26 = neura.reserve : !neura.data<i64, i1>
    %27 = "neura.phi"(%26, %7) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    %28 = "neura.gep"(%23, %27) : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
    %29 = "neura.load"(%28) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<f32, i1>
    %30 = "neura.gep"(%21, %27) : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
    %31 = "neura.load"(%30) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<f32, i1>
    %32 = "neura.fmul"(%29, %31) : (!neura.data<f32, i1>, !neura.data<f32, i1>) -> !neura.data<f32, i1>
    %33 = "neura.fadd"(%25, %32) : (!neura.data<f32, i1>, !neura.data<f32, i1>) -> !neura.data<f32, i1>
    "neura.store"(%33, %19) : (!neura.data<f32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
    %34 = "neura.add"(%27, %17) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    %35 = "neura.icmp"(%34, %15) <{cmpType = "eq"}> : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i1, i1>
    %36 = "neura.not"(%35) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
    %37 = neura.grant_predicate %34, %36 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
    neura.ctrl_mov %37 -> %26 : !neura.data<i64, i1> !neura.data<i64, i1>
    %38 = neura.grant_predicate %33, %36 : !neura.data<f32, i1>, !neura.data<i1, i1> -> !neura.data<f32, i1>
    neura.ctrl_mov %38 -> %24 : !neura.data<f32, i1> !neura.data<f32, i1>
    %39 = neura.grant_predicate %23, %36 : !neura.data<!llvm.ptr, i1>, !neura.data<i1, i1> -> !neura.data<!llvm.ptr, i1>
    neura.ctrl_mov %39 -> %22 : !neura.data<!llvm.ptr, i1> !neura.data<!llvm.ptr, i1>
    %40 = neura.grant_predicate %21, %36 : !neura.data<!llvm.ptr, i1>, !neura.data<i1, i1> -> !neura.data<!llvm.ptr, i1>
    neura.ctrl_mov %40 -> %20 : !neura.data<!llvm.ptr, i1> !neura.data<!llvm.ptr, i1>
    %41 = neura.grant_predicate %19, %36 : !neura.data<!llvm.ptr, i1>, !neura.data<i1, i1> -> !neura.data<!llvm.ptr, i1>
    neura.ctrl_mov %41 -> %18 : !neura.data<!llvm.ptr, i1> !neura.data<!llvm.ptr, i1>
    %42 = neura.grant_predicate %17, %36 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
    neura.ctrl_mov %42 -> %16 : !neura.data<i64, i1> !neura.data<i64, i1>
    %43 = neura.grant_predicate %15, %36 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
    neura.ctrl_mov %43 -> %14 : !neura.data<i64, i1> !neura.data<i64, i1>
    "neura.return"() : () -> ()
  }

After fold-constant:

  func.func @_Z6kernelPfS_S_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}, %arg2: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage<external>, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
    %0 = "neura.grant_once"() <{constant_value = "%arg0"}> : () -> !neura.data<!llvm.ptr, i1>
    %1 = "neura.grant_once"() <{constant_value = "%arg1"}> : () -> !neura.data<!llvm.ptr, i1>
    %2 = "neura.constant"() <{value = "%arg1"}> : () -> !neura.data<!llvm.ptr, i1>
    %3 = "neura.grant_once"() <{constant_value = "%arg2"}> : () -> !neura.data<!llvm.ptr, i1>
    %4 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1>
    %5 = "neura.grant_once"() <{constant_value = 1 : i64}> : () -> !neura.data<i64, i1>
    %6 = "neura.grant_once"() <{constant_value = 32 : i64}> : () -> !neura.data<i64, i1>
    %7 = "neura.load"(%2) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<f32, i1>
    %8 = "neura.grant_once"(%7) : (!neura.data<f32, i1>) -> !neura.data<f32, i1>
    %9 = neura.reserve : !neura.data<i64, i1>
    %10 = "neura.phi"(%9, %6) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    %11 = neura.reserve : !neura.data<i64, i1>
    %12 = "neura.phi"(%11, %5) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    %13 = neura.reserve : !neura.data<!llvm.ptr, i1>
    %14 = "neura.phi"(%13, %1) : (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
    %15 = neura.reserve : !neura.data<!llvm.ptr, i1>
    %16 = "neura.phi"(%15, %3) : (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
    %17 = neura.reserve : !neura.data<!llvm.ptr, i1>
    %18 = "neura.phi"(%17, %0) : (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
    %19 = neura.reserve : !neura.data<f32, i1>
    %20 = "neura.phi"(%19, %8) : (!neura.data<f32, i1>, !neura.data<f32, i1>) -> !neura.data<f32, i1>
    %21 = neura.reserve : !neura.data<i64, i1>
    %22 = "neura.phi"(%21, %4) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    %23 = "neura.gep"(%18, %22) : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
    %24 = "neura.load"(%23) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<f32, i1>
    %25 = "neura.gep"(%16, %22) : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
    %26 = "neura.load"(%25) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<f32, i1>
    %27 = "neura.fmul"(%24, %26) : (!neura.data<f32, i1>, !neura.data<f32, i1>) -> !neura.data<f32, i1>
    %28 = "neura.fadd"(%20, %27) : (!neura.data<f32, i1>, !neura.data<f32, i1>) -> !neura.data<f32, i1>
    "neura.store"(%28, %14) : (!neura.data<f32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
    %29 = "neura.add"(%22, %12) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    %30 = "neura.icmp"(%29, %10) <{cmpType = "eq"}> : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i1, i1>
    %31 = "neura.not"(%30) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
    %32 = neura.grant_predicate %29, %31 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
    neura.ctrl_mov %32 -> %21 : !neura.data<i64, i1> !neura.data<i64, i1>
    %33 = neura.grant_predicate %28, %31 : !neura.data<f32, i1>, !neura.data<i1, i1> -> !neura.data<f32, i1>
    neura.ctrl_mov %33 -> %19 : !neura.data<f32, i1> !neura.data<f32, i1>
    %34 = neura.grant_predicate %18, %31 : !neura.data<!llvm.ptr, i1>, !neura.data<i1, i1> -> !neura.data<!llvm.ptr, i1>
    neura.ctrl_mov %34 -> %17 : !neura.data<!llvm.ptr, i1> !neura.data<!llvm.ptr, i1>
    %35 = neura.grant_predicate %16, %31 : !neura.data<!llvm.ptr, i1>, !neura.data<i1, i1> -> !neura.data<!llvm.ptr, i1>
    neura.ctrl_mov %35 -> %15 : !neura.data<!llvm.ptr, i1> !neura.data<!llvm.ptr, i1>
    %36 = neura.grant_predicate %14, %31 : !neura.data<!llvm.ptr, i1>, !neura.data<i1, i1> -> !neura.data<!llvm.ptr, i1>
    neura.ctrl_mov %36 -> %13 : !neura.data<!llvm.ptr, i1> !neura.data<!llvm.ptr, i1>
    %37 = neura.grant_predicate %12, %31 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
    neura.ctrl_mov %37 -> %11 : !neura.data<i64, i1> !neura.data<i64, i1>
    %38 = neura.grant_predicate %10, %31 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
    neura.ctrl_mov %38 -> %9 : !neura.data<i64, i1> !neura.data<i64, i1>
    "neura.return"() : () -> ()
  }

The final dataflow-format MLIR adds several grant_predicate, grant_once, reserve, and phi operators, which inflates the total operator count as a result. So there is a chance that we could remove some redundant operations after transforming to dataflow format.

Metadata

Metadata

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions