Skip to content

Commit fa5fb36

Browse files
committed
fix: skip direct-dominating live-in opt for loop headers (#270)
Values crossing from outer blocks into inner loop headers were marked as direct-dominating live-ins, which prevented block-argument promotion. This caused inner-rate PHI_STARTs to be missing from the dataflow IR, starving inner-loop operations of valid data each cycle. Added a back-edge check: if the using block has a predecessor that it dominates (i.e., it is a loop header), the value is promoted to a block argument.
1 parent f584835 commit fa5fb36

File tree

8 files changed

+1517
-919
lines changed

8 files changed

+1517
-919
lines changed

lib/NeuraDialect/Transforms/CanonicalizeLiveInPass.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,28 @@ identifyDirectDominatingLiveIns(Region &region, DominanceInfo &dom_info,
436436
continue;
437437
}
438438

439+
// If the using block is a loop header (has a back-edge from itself or
440+
// other blocks), we must NOT treat any live-in as a direct dominating
441+
// live-in. In the dataflow model, loop headers operate at the inner
442+
// loop rate, so live-in values from outer blocks must be promoted to
443+
// block arguments to get proper rate-matched PHI_START operations
444+
// during the ctrl-to-data-flow transformation.
445+
// See: https://github.com/coredac/dataflow/issues/270
446+
bool using_block_is_loop_header = false;
447+
for (Block *pred : block.getPredecessors()) {
448+
if (dom_info.dominates(&block, pred)) {
449+
using_block_is_loop_header = true;
450+
break;
451+
}
452+
}
453+
454+
if (using_block_is_loop_header) {
455+
// Skip the direct-dominating live-in optimization for loop headers.
456+
// The value must be promoted to a block argument so that the
457+
// transform-ctrl-to-data-flow pass creates an inner-rate PHI_START.
458+
continue;
459+
}
460+
439461
// Pattern 1: Single-Source-Single-Sink with one conditional branch.
440462
if (isSingleSourceSingleSinkPattern(defining_block, &block, dom_info,
441463
post_dom_info)) {

test/e2e/bicg/bicg_int_kernel.mlir

Lines changed: 138 additions & 115 deletions
Large diffs are not rendered by default.

test/e2e/bicg/bicg_kernel.mlir

Lines changed: 210 additions & 81 deletions
Large diffs are not rendered by default.

test/e2e/fft/fft_kernel.mlir

Lines changed: 354 additions & 273 deletions
Large diffs are not rendered by default.

test/e2e/gemm/gemm_kernel.mlir

Lines changed: 271 additions & 213 deletions
Large diffs are not rendered by default.

test/e2e/gemv/gemv_kernel.mlir

Lines changed: 194 additions & 151 deletions
Large diffs are not rendered by default.

test/e2e/spmv/spmv_kernel.mlir

Lines changed: 325 additions & 83 deletions
Large diffs are not rendered by default.

test/neura/fusion/test.mlir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@
2929

3030
// CHECK-FUSED: func.func @_Z6kernelPA1024_iPiS1_S1_S1_
3131
// CHECK-FUSED: accelerator = "neura"
32-
// CHECK-FUSED-DAG: %91 = neura.load_indexed %89[%90 : !neura.data<i64, i1>] !neura.data<!llvm.ptr, i1> : !neura.data<i32, i1>
33-
// CHECK-FUSED-DAG: %82 = "neura.mul_add"(%79, %80, %81) : (!neura.data<i32, i1>, !neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
34-
// CHECK-FUSED-DAG: %95 = "neura.mul_add"(%92, %93, %94) : (!neura.data<i32, i1>, !neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
32+
// CHECK-FUSED-DAG: %102 = neura.load_indexed %100[%101 : !neura.data<i64, i1>] !neura.data<!llvm.ptr, i1> : !neura.data<i32, i1>
33+
// CHECK-FUSED-DAG: %93 = "neura.mul_add"(%90, %91, %92) : (!neura.data<i32, i1>, !neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
34+
// CHECK-FUSED-DAG: %106 = "neura.mul_add"(%103, %104, %105) : (!neura.data<i32, i1>, !neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
3535

3636
// CHECK-MAPPING: mapping_info = {compiled_ii = 12 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 9 : i32, res_mii = 5 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}
3737

0 commit comments

Comments
 (0)