Skip to content

Support simple loop control-related operation fuse. #94

Closed
ShangkunLi wants to merge 1 commit into coredac:main from
ShangkunLi:ctrlfuse-simple
Closed

Support simple loop control-related operation fuse. #94
ShangkunLi wants to merge 1 commit into coredac:main from
ShangkunLi:ctrlfuse-simple

Conversation

@ShangkunLi
Copy link
Copy Markdown
Collaborator

@ShangkunLi ShangkunLi commented Jul 26, 2025

In this pr:

  • We support the simple loop control-related operation fuse
  • Propose a loop_controller operation to control loops
  • We can eliminate the long recurrence brought by loop index increment

As shown below, the recurrence II of 3 is caused by the loop reduction dependence and the end value.

[DEBUG] Recurrence cycle (length 3):
  %22 = neura.reserve : !neura.data<i32, i1>
  %24 = "neura.phi"(%22, %23) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
  %25 = "neura.data_mov"(%24) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
  %27 = neura.grant_predicate %25, %26 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
  %36 = "neura.data_mov"(%27) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
  %38 = "neura.add"(%36, %37) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
  neura.ctrl_mov %38 -> %22 : !neura.data<i32, i1> !neura.data<i32, i1>
[DEBUG] Recurrence cycle (length 3):
  %12 = neura.reserve : !neura.data<i64, i1>
  %14 = "neura.phi"(%12, %13) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
  %18 = "neura.data_mov"(%14) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
  %index, %valid = neura.loop_controller(parent_valid = %16, start = %17, end = %18, step = %19) {iterationType = "increment"} : !neura.data<i1, i1>, !neura.data<i64, i1>, !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>, !neura.data<i1, i1>
  %29 = "neura.data_mov"(%valid) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
  %30 = neura.grant_predicate %28, %29 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
  neura.ctrl_mov %30 -> %12 : !neura.data<i64, i1> !neura.data<i64, i1>
[MapToAcceleratorPass] Longest recurrence cycle (length 3):
%22 = neura.reserve : !neura.data<i32, i1>
%24 = "neura.phi"(%22, %23) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
%25 = "neura.data_mov"(%24) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
%27 = neura.grant_predicate %25, %26 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
%36 = "neura.data_mov"(%27) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
%38 = "neura.add"(%36, %37) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
neura.ctrl_mov %38 -> %22 : !neura.data<i32, i1> !neura.data<i32, i1>

The IR before control flow fuse:

module {
  func.func @_Z10simpleloopv() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
    %0 = "neura.constant"() <{predicate = true, value = 1 : i64}> : () -> !neura.data<i64, i1>
    %1 = "neura.grant_always"(%0) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %2 = "neura.grant_once"(%0) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %3 = "neura.constant"() <{predicate = true, value = 128 : i64}> : () -> !neura.data<i64, i1>
    %4 = "neura.grant_always"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %5 = "neura.grant_once"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %6 = "neura.constant"() <{predicate = true, value = 0 : i32}> : () -> !neura.data<i32, i1>
    %7 = "neura.grant_once"(%6) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
    %8 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> !neura.data<i64, i1>
    %9 = "neura.grant_once"(%8) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %10 = neura.reserve : !neura.data<i64, i1>
    %11 = "neura.phi"(%10, %5) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    %12 = neura.reserve : !neura.data<i32, i1>
    %13 = "neura.phi"(%12, %7) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
    %14 = neura.reserve : !neura.data<i64, i1>
    %15 = "neura.phi"(%14, %9) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    %16 = "neura.icmp"(%15, %11) <{cmpType = "slt"}> : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i1, i1>
    %17 = neura.grant_predicate %15, %16 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
    %18 = neura.grant_predicate %13, %16 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
    %19 = neura.grant_predicate %2, %16 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
    %20 = neura.grant_predicate %5, %16 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
    %21 = "neura.not"(%16) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
    %22 = neura.grant_predicate %13, %21 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
    %23 = "neura.cast"(%17) <{cast_type = "i64_to_i32"}> : (!neura.data<i64, i1>) -> !neura.data<i32, i1>
    %24 = "neura.add"(%18, %23) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
    %25 = "neura.add"(%17, %19) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    neura.ctrl_mov %25 -> %14 : !neura.data<i64, i1> !neura.data<i64, i1>
    neura.ctrl_mov %24 -> %12 : !neura.data<i32, i1> !neura.data<i32, i1>
    neura.ctrl_mov %20 -> %10 : !neura.data<i64, i1> !neura.data<i64, i1>
    "neura.return"(%22) : (!neura.data<i32, i1>) -> ()
  }
}

The IR after control flow fuse:

module {
  func.func @_Z10simpleloopv() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
    %0 = "neura.constant"() <{predicate = true, value = 1 : i64}> : () -> !neura.data<i64, i1>
    %1 = "neura.grant_always"(%0) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %2 = "neura.constant"() <{predicate = true, value = 128 : i64}> : () -> !neura.data<i64, i1>
    %3 = "neura.grant_always"(%2) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %4 = "neura.grant_once"(%2) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %5 = "neura.constant"() <{predicate = true, value = 0 : i32}> : () -> !neura.data<i32, i1>
    %6 = "neura.grant_once"(%5) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
    %7 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> !neura.data<i64, i1>
    %8 = neura.reserve : !neura.data<i64, i1>
    %9 = "neura.phi"(%8, %4) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    %10 = "neura.constant"() <{predicate = true, value = true}> : () -> !neura.data<i1, i1>
    %index, %valid = neura.loop_controller(parent_valid = %10, start = %7, end = %9, step = %0) {iterationType = "increment"} : !neura.data<i1, i1>, !neura.data<i64, i1>, !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>, !neura.data<i1, i1>
    %11 = "neura.not"(%valid) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
    %12 = neura.reserve : !neura.data<i32, i1>
    %13 = "neura.phi"(%12, %6) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
    %14 = neura.grant_predicate %13, %valid : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
    %15 = neura.grant_predicate %4, %valid : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
    %16 = neura.grant_predicate %13, %11 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
    %17 = "neura.cast"(%index) <{cast_type = "i64_to_i32"}> : (!neura.data<i64, i1>) -> !neura.data<i32, i1>
    %18 = "neura.add"(%14, %17) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
    neura.ctrl_mov %18 -> %12 : !neura.data<i32, i1> !neura.data<i32, i1>
    neura.ctrl_mov %15 -> %8 : !neura.data<i64, i1> !neura.data<i64, i1>
    "neura.return"(%16) : (!neura.data<i32, i1>) -> ()
  }
}
  • Eliminate the recurrence brought by end value
  • Support nested loops

@ShangkunLi ShangkunLi marked this pull request as ready for review July 26, 2025 18:07
@tancheng
Copy link
Copy Markdown
Contributor

Plz show before and after on the region of interest (i.e., the fused part) of the IRs in the PR's description. Thanks~!

@ShangkunLi
Copy link
Copy Markdown
Collaborator Author

ShangkunLi commented Jul 27, 2025

Plz show before and after on the region of interest (i.e., the fused part) of the IRs in the PR's description. Thanks~!

Updated~

@tancheng
Copy link
Copy Markdown
Contributor

The branch_for.mlir's mapped II can be decreased? If so, can we include this pass into that test?

@ShangkunLi
Copy link
Copy Markdown
Collaborator Author

The branch_for.mlir's mapped II can be decreased? If so, can we include this pass into that test?

This pass cannot be applied to branch_for.mlir yet. A huge difference between branch_for.mlir and simpleloop.mlir is their frontend (i.e., branch_for -> clang++, simpleloop -> polygeist), this results in different loop structures.

For branch_for.mlir, it calculates the index increment (%8 = "neura.add"(%5, %2) : (i64, i64) -> i64) first and then the compare operation (%9 = "neura.icmp"(%8, %0) <{cmpType = "slt"}> : (i64, i64) -> i1).

module {
  func.func @loop_test() -> f32 attributes {accelerator = "neura"} {
    %0 = "neura.constant"() <{predicate = true, value = 10 : i64}> : () -> i64
    %1 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> i64
    %2 = "neura.constant"() <{predicate = true, value = 1 : i64}> : () -> i64
    %3 = "neura.constant"() <{predicate = true, value = 3.000000e+00 : f32}> : () -> f32
    %4 = "neura.constant"() <{predicate = true, value = 0.000000e+00 : f32}> : () -> f32
    neura.br %1, %4 : i64, f32 to ^bb1
  ^bb1(%5: i64, %6: f32):  // 2 preds: ^bb0, ^bb1
    %7 = "neura.fadd"(%6, %3) : (f32, f32) -> f32
    %8 = "neura.add"(%5, %2) : (i64, i64) -> i64
    %9 = "neura.icmp"(%8, %0) <{cmpType = "slt"}> : (i64, i64) -> i1
    neura.cond_br %9 : i1 then %8, %7 : i64, f32 to ^bb1 else %7 : f32 to ^bb2
  ^bb2(%10: f32):  // pred: ^bb1
    "neura.return"(%10) : (f32) -> ()
  }
}

For simpleloop.mlir, it calculates the compare operation (%8 = "neura.icmp"(%7, %1) <{cmpType = "slt"}> : (index, index) -> i1) first and then the index increment operation (%11 = "neura.add"(%7, %0) : (index, index) -> index). Similar transforms for loops can also be found in dataflow/test/bert/*.

module {
  func.func @_Z10simpleloopv() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
    %0 = "neura.constant"() <{predicate = true, value = 1 : index}> : () -> index
    %1 = "neura.constant"() <{predicate = true, value = 128 : index}> : () -> index
    %2 = "neura.constant"() <{predicate = true, value = 0 : i32}> : () -> i32
    %3 = "neura.constant"() <{predicate = true, value = 0 : index}> : () -> index
    %4 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
    neura.br %4, %2 : i64, i32 to ^bb1
  ^bb1(%5: i64, %6: i32):  // 2 preds: ^bb0, ^bb2
    %7 = "neura.cast"(%5) <{cast_type = "int_to_index"}> : (i64) -> index
    %8 = "neura.icmp"(%7, %1) <{cmpType = "slt"}> : (index, index) -> i1
    neura.cond_br %8 : i1 then to ^bb2 else to ^bb3
  ^bb2:  // pred: ^bb1
    %9 = "neura.cast"(%7) <{cast_type = "index_to_int"}> : (index) -> i32
    %10 = "neura.add"(%6, %9) : (i32, i32) -> i32
    %11 = "neura.add"(%7, %0) : (index, index) -> index
    %12 = "neura.cast"(%11) <{cast_type = "index_to_int"}> : (index) -> i64
    neura.br %12, %10 : i64, i32 to ^bb1
  ^bb3:  // pred: ^bb1
    "neura.return"(%6) : (i32) -> ()
  }
}

For this pass, I only support the loop structures generated by polygeist.

@ShangkunLi
Copy link
Copy Markdown
Collaborator Author

The branch_for.mlir's mapped II can be decreased? If so, can we include this pass into that test?

For the simpleloop.mlir, the pre-fuse mapping result is [CompiledII=4, RecMII=4, ResMII=1]. The post-fuse mapping result is [CompiledII=5, RecMII=3, ResMII=1]. The reason why the compiled II is increased is that for pre-fuse ir, there are 17 data_mov ops while for post-fuse ir, there are 21 data_mov ops.

So, we are urgently needing the register support.

@tancheng
Copy link
Copy Markdown
Contributor

tancheng commented Jul 27, 2025

The branch_for.mlir's mapped II can be decreased? If so, can we include this pass into that test?

For the simpleloop.mlir, the pre-fuse mapping result is [CompiledII=4, RecMII=4, ResMII=1]. The post-fuse mapping result is [CompiledII=5, RecMII=3, ResMII=1]. The reason why the compiled II is increased is that for pre-fuse ir, there are 17 data_mov ops while for post-fuse ir, there are 21 data_mov ops.

So, we are urgently needing the register support.

lol, i am on it #95.

However, the issue is that after applying it, the mappedII becomes worse (from 6 to 7), and need to take hours to complete. I tried a few times and found the [x, y] is wrongly assigned (and II become even worse after I correct it), which would affect which tile can be first tried during place.

  • So I think the place is more important than fusion or register...

@ShangkunLi
Copy link
Copy Markdown
Collaborator Author

The branch_for.mlir's mapped II can be decreased? If so, can we include this pass into that test?

For the simpleloop.mlir, the pre-fuse mapping result is [CompiledII=4, RecMII=4, ResMII=1]. The post-fuse mapping result is [CompiledII=5, RecMII=3, ResMII=1]. The reason why the compiled II is increased is that for pre-fuse ir, there are 17 data_mov ops while for post-fuse ir, there are 21 data_mov ops.
So, we are urgently needing the register support.

lol, i am on it #95.

However, the issue is that after applying it, the mappedII becomes worse (from 6 to 7), and need to take hours to complete. I tried a few times and found the [x, y] is wrongly assigned (and II become even worse after I correct it), which would affect which tile can be first tried during place.

  • So I think the place is more important than fusion or register...

Hmm, I may read through your implementation ASAP and try to find out the problem.

@tancheng
Copy link
Copy Markdown
Contributor

The branch_for.mlir's mapped II can be decreased? If so, can we include this pass into that test?

For the simpleloop.mlir, the pre-fuse mapping result is [CompiledII=4, RecMII=4, ResMII=1]. The post-fuse mapping result is [CompiledII=5, RecMII=3, ResMII=1]. The reason why the compiled II is increased is that for pre-fuse ir, there are 17 data_mov ops while for post-fuse ir, there are 21 data_mov ops.
So, we are urgently needing the register support.

lol, i am on it #95.
However, the issue is that after applying it, the mappedII becomes worse (from 6 to 7), and need to take hours to complete. I tried a few times and found the [x, y] is wrongly assigned (and II become even worse after I correct it), which would affect which tile can be first tried during place.

  • So I think the place is more important than fusion or register...

Hmm, I may read through your implementation ASAP and try to find out the problem.

Yes, plz #59

@ShangkunLi
Copy link
Copy Markdown
Collaborator Author

The branch_for.mlir's mapped II can be decreased? If so, can we include this pass into that test?

For the simpleloop.mlir, the pre-fuse mapping result is [CompiledII=4, RecMII=4, ResMII=1]. The post-fuse mapping result is [CompiledII=5, RecMII=3, ResMII=1]. The reason why the compiled II is increased is that for pre-fuse ir, there are 17 data_mov ops while for post-fuse ir, there are 21 data_mov ops.

So, we are urgently needing the register support.

After reading the mapping II, the problem comes from that the operands of the loop_controller are most transformed through the data_mov operation from constant to loop_controller, thus making the loop_controller can only be executed after all these constants are ready. If we can fuse those constant operations with the loop_controller, it will bring an increased II.

module {
  func.func @_Z10simpleloopv() -> i32 attributes {CompiledII = 5 : i32, RecMII = 3 : i32, ResMII = 1 : i32, accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
    %0 = "neura.constant"() <{predicate = true, value = 1 : i64}> {mapping_locs = [{id = 5 : i32, resource = "tile", time_step = 0 : i32, x = 1 : i32, y = 1 : i32}]} : () -> !neura.data<i64, i1>
    %1 = "neura.data_mov"(%0) {mapping_locs = []} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %2 = "neura.grant_always"(%1) {mapping_locs = [{id = 5 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 1 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %3 = "neura.constant"() <{predicate = true, value = 128 : i64}> {mapping_locs = [{id = 6 : i32, resource = "tile", time_step = 0 : i32, x = 1 : i32, y = 2 : i32}]} : () -> !neura.data<i64, i1>
    %4 = "neura.data_mov"(%3) {mapping_locs = []} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %5 = "neura.grant_always"(%4) {mapping_locs = [{id = 6 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 2 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %6 = "neura.data_mov"(%3) {mapping_locs = [{id = 18 : i32, resource = "link", time_step = 0 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %7 = "neura.grant_once"(%6) {mapping_locs = [{id = 10 : i32, resource = "tile", time_step = 1 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %8 = "neura.constant"() <{predicate = true, value = 0 : i32}> {mapping_locs = [{id = 9 : i32, resource = "tile", time_step = 0 : i32, x = 2 : i32, y = 1 : i32}]} : () -> !neura.data<i32, i1>
    %9 = "neura.data_mov"(%8) {mapping_locs = [{id = 28 : i32, resource = "link", time_step = 0 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
    %10 = "neura.grant_once"(%9) {mapping_locs = [{id = 13 : i32, resource = "tile", time_step = 1 : i32, x = 3 : i32, y = 1 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
    %11 = "neura.constant"() <{predicate = true, value = 0 : i64}> {mapping_locs = [{id = 10 : i32, resource = "tile", time_step = 0 : i32, x = 2 : i32, y = 2 : i32}]} : () -> !neura.data<i64, i1>
    %12 = neura.reserve : !neura.data<i64, i1>
    %13 = "neura.data_mov"(%7) {mapping_locs = [{id = 33 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %14 = "neura.phi"(%12, %13) {mapping_locs = [{id = 9 : i32, resource = "tile", time_step = 2 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    %15 = "neura.constant"() <{predicate = true, value = true}> {mapping_locs = [{id = 14 : i32, resource = "tile", time_step = 0 : i32, x = 3 : i32, y = 2 : i32}]} : () -> !neura.data<i1, i1>
    %16 = "neura.data_mov"(%15) {mapping_locs = [{id = 43 : i32, resource = "link", time_step = 0 : i32}, {id = 43 : i32, resource = "link", time_step = 1 : i32}, {id = 43 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
    %17 = "neura.data_mov"(%11) {mapping_locs = []} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %18 = "neura.data_mov"(%14) {mapping_locs = [{id = 30 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %19 = "neura.data_mov"(%0) {mapping_locs = [{id = 16 : i32, resource = "link", time_step = 0 : i32}, {id = 18 : i32, resource = "link", time_step = 1 : i32}, {id = 18 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %index, %valid = neura.loop_controller(parent_valid = %16, start = %17, end = %18, step = %19) {iterationType = "increment", mapping_locs = [{id = 10 : i32, resource = "tile", time_step = 3 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data<i1, i1>, !neura.data<i64, i1>, !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>, !neura.data<i1, i1>
    %20 = "neura.data_mov"(%valid) {mapping_locs = [{id = 31 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
    %21 = "neura.not"(%20) {mapping_locs = [{id = 6 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 2 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
    %22 = neura.reserve : !neura.data<i32, i1>
    %23 = "neura.data_mov"(%10) {mapping_locs = [{id = 42 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
    %24 = "neura.phi"(%22, %23) {mapping_locs = [{id = 14 : i32, resource = "tile", time_step = 2 : i32, x = 3 : i32, y = 2 : i32}]} : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
    %25 = "neura.data_mov"(%24) {mapping_locs = []} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
    %26 = "neura.data_mov"(%valid) {mapping_locs = [{id = 32 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
    %27 = neura.grant_predicate %25, %26 {mapping_locs = [{id = 14 : i32, resource = "tile", time_step = 4 : i32, x = 3 : i32, y = 2 : i32}]} : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
    %28 = "neura.data_mov"(%7) {mapping_locs = [{id = 31 : i32, resource = "link", time_step = 1 : i32}, {id = 19 : i32, resource = "link", time_step = 2 : i32}, {id = 14 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %29 = "neura.data_mov"(%valid) {mapping_locs = [{id = 33 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
    %30 = neura.grant_predicate %28, %29 {mapping_locs = [{id = 9 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 1 : i32}]} : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
    %31 = "neura.data_mov"(%24) {mapping_locs = [{id = 45 : i32, resource = "link", time_step = 2 : i32}, {id = 46 : i32, resource = "link", time_step = 3 : i32}, {id = 35 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
    %32 = "neura.data_mov"(%21) {mapping_locs = [{id = 20 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
    %33 = neura.grant_predicate %31, %32 {mapping_locs = [{id = 7 : i32, resource = "tile", time_step = 5 : i32, x = 1 : i32, y = 3 : i32}]} : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
    %34 = "neura.data_mov"(%index) {mapping_locs = []} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %35 = "neura.cast"(%34) <{cast_type = "i64_to_i32"}> {mapping_locs = [{id = 10 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i32, i1>
    %36 = "neura.data_mov"(%27) {mapping_locs = [{id = 45 : i32, resource = "link", time_step = 4 : i32}, {id = 45 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
    %37 = "neura.data_mov"(%35) {mapping_locs = [{id = 34 : i32, resource = "link", time_step = 4 : i32}, {id = 36 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
    %38 = "neura.add"(%36, %37) {mapping_locs = [{id = 15 : i32, resource = "tile", time_step = 6 : i32, x = 3 : i32, y = 3 : i32}]} : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
    neura.ctrl_mov %38 -> %22 {mapping_locs = [{id = 47 : i32, resource = "link", time_step = 6 : i32}]} : !neura.data<i32, i1> !neura.data<i32, i1>
    neura.ctrl_mov %30 -> %12 {mapping_locs = []} : !neura.data<i64, i1> !neura.data<i64, i1>
    %39 = "neura.data_mov"(%33) {mapping_locs = []} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
    "neura.return"(%39) {mapping_locs = [{id = 7 : i32, resource = "tile", time_step = 6 : i32, x = 1 : i32, y = 3 : i32}]} : (!neura.data<i32, i1>) -> ()
  }
}

@tancheng
Copy link
Copy Markdown
Contributor

@ShangkunLi right, i already have a pending issue for you: #96 :-)

@tancheng tancheng added the new feature New feature or request label Jul 29, 2025
@tancheng
Copy link
Copy Markdown
Contributor

What does parent_valid = %10 mean?

And I don't get why there are still ctrl_mov after fusion, can you help briefly explain following by providing the equivalent C++:

    %c0_i32 = arith.constant 0 : i32
    %0 = affine.for %arg0 = 0 to 128 iter_args(%arg1 = %c0_i32) -> (i32) {
      %1 = arith.index_cast %arg0 : index to i32
      %2 = arith.addi %arg1, %1 : i32
      affine.yield %2 : i32
    }
    return %0 : i32
  }

@ShangkunLi
Copy link
Copy Markdown
Collaborator Author

What does parent_valid = %10 mean?

And I don't get why there are still ctrl_mov after fusion, can you help briefly explain following by providing the equivalent C++:

    %c0_i32 = arith.constant 0 : i32
    %0 = affine.for %arg0 = 0 to 128 iter_args(%arg1 = %c0_i32) -> (i32) {
      %1 = arith.index_cast %arg0 : index to i32
      %2 = arith.addi %arg1, %1 : i32
      affine.yield %2 : i32
    }
    return %0 : i32
  }

parent_valid is a valid bit for the loop_controller. It is provided to support nested loops. For example, for a two-level nested loops, the parent_valid bit is the valid of the outer loop's loop_controller. And for a simple one-level loop, parent_valid is just true.

The source .cpp file is:

int simpleloop() {
  int start = 0;
  int multiplier = 1;
  int result = start;
  for (int i = 0; i < 128; i++) {
    result = result * multiplier + i;
  }

  return result;
}

There are two reasons why we still have ctrl_mov:

  1. This loop has reduction dependency
  2. I didn't remove the ctrl_mov for the upper bound value of the loop in this prototype fusion pass (will be fixed in the next pr).

@tancheng
Copy link
Copy Markdown
Contributor

the parent_valid bit is the valid of the outer loop's loop_controller.

If that's the case, shouldn't there be grant operation to apply on a constant (or later fused grant op)? In your example, I only see a constant directly used as attribute in loop.controller.

@tancheng
Copy link
Copy Markdown
Contributor

And I am thinking the design of parent_valid, is this sth really needed, and is it able to support all cases (including imperfect nested loop, and inner index depend on outer-loop). How did you come up such a design?

// RUN: --insert-data-mov \
// RUN: --map-to-accelerator="mapping-strategy=heuristic" | FileCheck %s -check-prefix=CTRLFUSE-MAPPING

module attributes {} {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Plz add the equivalent C++ code here as comment, thanks~!

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename to simple_loop.mlir.

@ShangkunLi
Copy link
Copy Markdown
Collaborator Author

And I am thinking the design of parent_valid, is this sth really needed, and is it able to support all cases (including imperfect nested loop, and inner index depend on outer-loop). How did you come up such a design?

I think we can embed all the constant loop parameters as attributes in the loop_controller. For a loop with all constant loop parameters, if we don't have such parent_valid signal, we cannot ensure its correctness.

And the newest version of loop_controller is:

def Neura_LoopControllerOp : Op<NeuraDialect, "loop_controller", [AttrSizedOperandSegments]>{
  let summary = "Generates loop indicies and valid predicates.";
  let description = [{
    Manages a single level of loop execution based on cycle counting.
    Each loop_controller outputs a current index value and a valid predicate.
    
    The loop_controller uses dynamic loop bounds (start, end, step),
    allowing for variable-length loops and runtime-determined bounds.
    
    The execution is conditioned on the parent_valid input, creating an
    efficient hierarchical structure for nested loops.
  }];

  let arguments = (ins
    AnyType:$parentValid,  // Valid predicate from the parent loop.
    StrAttr:$iterationType, // Type of the loop iteration (e.g., "increment", "decrement").
    Optional<AnyType>:$start,         // Start index of the loop (optional if startValue attr is presented).
    Optional<AnyType>:$end,           // End index of the loop (optional if endValue attr is presented).
    Optional<AnyType>:$step,           // Step size for the loop (optional if stepValue attr is presented).
    OptionalAttr<AnyAttr>:$startValue, // Optional constant start value attribute.
    OptionalAttr<AnyAttr>:$endValue,   // Optional constant end value attribute.
    OptionalAttr<AnyAttr>:$stepValue   // Optional constant step value attribute.
  );

  let results = (outs
    AnyType:$index,         // Current loop index
    AnyType:$valid          // Valid predicate for the current index
  );

  let assemblyFormat =
    " `(``parent_valid` `=` $parentValid (`,` `start` `=` $start^)? (`,` `end` `=` $end^)? (`,` `step` `=` $step^)?`)` attr-dict `:` type($parentValid) (`,` type($start)^)? (`,` type($end)^)? (`,` type($step)^)? `->` type($index) `,` type($valid)";
}

For constant-based loops, we can embed all the constant values in the attributes. For other loops, we can use operands to denote the loop parameters.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we rename loop_controller to loop_control? All the other operations are verbs

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure~

@tancheng
Copy link
Copy Markdown
Contributor

For a loop with all constant loop parameters, if we don't have such parent_valid signal, we cannot ensure its correctness.

I doubt about this, let's discuss this later. Or plz give a simple example to elaborate.

And I doubt whether we really need following constant attribute, the similar 3 operands are enough?

    OptionalAttr<AnyAttr>:$startValue, // Optional constant start value attribute.
    OptionalAttr<AnyAttr>:$endValue,   // Optional constant end value attribute.
    OptionalAttr<AnyAttr>:$stepValue   // Optional constant step value attribute.

@ShangkunLi
Copy link
Copy Markdown
Collaborator Author

For a loop with all constant loop parameters, if we don't have such parent_valid signal, we cannot ensure its correctness.

I doubt about this, let's discuss this later. Or plz give a simple example to elaborate.

And I doubt whether we really need following constant attribute, the similar 3 operands are enough?

    OptionalAttr<AnyAttr>:$startValue, // Optional constant start value attribute.
    OptionalAttr<AnyAttr>:$endValue,   // Optional constant end value attribute.
    OptionalAttr<AnyAttr>:$stepValue   // Optional constant step value attribute.

Here is a simple loop with constant parameters:

module {
  func.func @_Z10simpleloopv() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
    %0 = "neura.constant"() <{predicate = true, value = 1 : i64}> : () -> !neura.data<i64, i1>
    %1 = "neura.grant_once"(%0) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %2 = "neura.constant"() <{predicate = true, value = 128 : i64}> : () -> !neura.data<i64, i1>
    %3 = "neura.grant_once"(%2) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %4 = "neura.constant"() <{predicate = true, value = 0 : i32}> : () -> !neura.data<i32, i1>
    %5 = "neura.grant_once"(%4) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
    %6 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> !neura.data<i64, i1>
    %7 = "neura.grant_once"(%6) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %8 = neura.reserve : !neura.data<i64, i1>
    %9 = "neura.phi"(%8, %3) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    %10 = neura.reserve : !neura.data<i32, i1>
    %11 = "neura.phi"(%10, %5) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
    %12 = neura.reserve : !neura.data<i64, i1>
    %13 = "neura.phi"(%12, %7) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    %14 = "neura.icmp"(%13, %9) <{cmpType = "slt"}> : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i1, i1>
    %15 = neura.grant_predicate %13, %14 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
    %16 = neura.grant_predicate %11, %14 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
    %17 = neura.grant_predicate %1, %14 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
    %18 = neura.grant_predicate %3, %14 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
    %19 = "neura.not"(%14) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
    %20 = neura.grant_predicate %11, %19 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
    %21 = "neura.cast"(%15) <{cast_type = "i64_to_i32"}> : (!neura.data<i64, i1>) -> !neura.data<i32, i1>
    %22 = "neura.add"(%16, %21) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
    %23 = "neura.add"(%15, %17) : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i64, i1>
    neura.ctrl_mov %23 -> %12 : !neura.data<i64, i1> !neura.data<i64, i1>
    neura.ctrl_mov %22 -> %10 : !neura.data<i32, i1> !neura.data<i32, i1>
    neura.ctrl_mov %18 -> %8 : !neura.data<i64, i1> !neura.data<i64, i1>
    "neura.return"(%20) : (!neura.data<i32, i1>) -> ()
  }

The main purpose of this loop_controller is to eliminate all the recurrence dependencies brought by loop control:
index increment: %12->%13->%15->%23->%12, upper bound: %8->%9->%14->%18->%8.

And to eliminate such recurrence cycle, we have to trace until we reach the value of loop parameters. If we trace to constant (e.g., %6 for lower bound), we can not fuse constant with grant_once later. If we trace to grant_once, then we have to add grant_predicate ops for grant_once and ctrl_mov after each index increment; there will still be recurrence dependency.

And for nested non-constant loop parameters, we can avoid introducing more grant_predicate ops on the loop parameters and use the parent_valid directly.

@tancheng
Copy link
Copy Markdown
Contributor

%12->%13->%15->%23->%12

Is this eliminatable? I thought we can only eliminate int i = 0; i < 128; i++.

I don't get "If we trace to constant (e.g., %6 for lower bound), we can not fuse constant with grant_once later. If we trace to grant_once, then we have to add grant_predicate ops for grant_once and ctrl_mov after each index increment; there will still be recurrence dependency." let's discuss this later or tmr.

@ShangkunLi
Copy link
Copy Markdown
Collaborator Author

ShangkunLi commented Jul 30, 2025

%12->%13->%15->%23->%12

Is this eliminatable? I thought we can only eliminate int i = 0; i < 128; i++.

I don't get "If we trace to constant (e.g., %6 for lower bound), we can not fuse constant with grant_once later. If we trace to grant_once, then we have to add grant_predicate ops for grant_once and ctrl_mov after each index increment; there will still be recurrence dependency." let's discuss this later or tmr.

Hi~ Cheng,

I revised the definition of loop_control, and now the transformed ir looks like:

module {
  func.func @_Z10simpleloopv() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
    %0 = "neura.constant"() <{predicate = true, value = 1 : i64}> : () -> !neura.data<i64, i1>
    %1 = "neura.constant"() <{predicate = true, value = 128 : i64}> : () -> !neura.data<i64, i1>
    %2 = "neura.grant_always"(%1) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %3 = "neura.grant_always"(%0) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %4 = "neura.constant"() <{predicate = true, value = 0 : i32}> : () -> !neura.data<i32, i1>
    %5 = "neura.grant_once"(%4) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
    %6 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> !neura.data<i64, i1>
    %7 = "neura.grant_once"(%6) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
    %8 = "neura.constant"() <{predicate = true, value = true}> : () -> !neura.data<i1, i1>
    %nextindex, %valid = neura.loop_control(parent_valid = %8, start = %7, end = %2, step = %3) {iterationType = "increment"} : !neura.data<i1, i1>, !neura.data<i64, i1>, !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>, !neura.data<i1, i1>
    %9 = "neura.not"(%valid) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
    %10 = neura.reserve : !neura.data<i32, i1>
    %11 = "neura.phi"(%10, %5) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
    %12 = neura.grant_predicate %11, %valid : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
    %13 = neura.grant_predicate %11, %9 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
    %14 = "neura.cast"(%nextindex) <{cast_type = "i64_to_i32"}> : (!neura.data<i64, i1>) -> !neura.data<i32, i1>
    %15 = "neura.add"(%12, %14) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
    neura.ctrl_mov %15 -> %10 : !neura.data<i32, i1> !neura.data<i32, i1>
    "neura.return"(%13) : (!neura.data<i32, i1>) -> ()
  }
}

WDYT?

@tancheng
Copy link
Copy Markdown
Contributor

Hi @ShangkunLi, it looks better~ thanks for the refactor. I am still curious

  • why there is no grant on %8
  • and neura.loop_control(parent_valid = %8, start = %7, end = %2, step = %3) has the variable names exist in the operation, this is great and informative~!

@ShangkunLi
Copy link
Copy Markdown
Collaborator Author

Hi @ShangkunLi, it looks better~ thanks for the refactor. I am still curious

  • why there is no grant on %8
  • and neura.loop_control(parent_valid = %8, start = %7, end = %2, step = %3) has the variable names exist in the operation, this is great and informative~!

Sorry, I forgot to add grant_always on %8. Added in the latest commit in #100.

And please refer to #100 directly. These two PRs will be deprecated.

@tancheng
Copy link
Copy Markdown
Contributor

#100

And though grant_always is added, it would be replaced with grant_once in that PR?

@ShangkunLi
Copy link
Copy Markdown
Collaborator Author

Supported in #100

@ShangkunLi ShangkunLi closed this Jul 31, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

new feature New feature or request

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants