diff --git a/lib/NeuraDialect/Transforms/GenerateCodePass.cpp b/lib/NeuraDialect/Transforms/GenerateCodePass.cpp index 7e3b5e42..8cac3fe7 100644 --- a/lib/NeuraDialect/Transforms/GenerateCodePass.cpp +++ b/lib/NeuraDialect/Transforms/GenerateCodePass.cpp @@ -572,11 +572,15 @@ struct GenerateCodePass // Emits router hops for multi-hop paths (from the second hop onwards). CTRL_MOV emits CTRL_MOV hops. template void generateIntermediateHops(const SmallVector &links, const Topology &topo, - int base_mov_id, size_t &hop_counter) { - for (size_t i = 1; i < links.size(); ++i) { + int base_mov_id, size_t &hop_counter, + bool starts_with_register = false) { + // Hops start from links[1]; when the path begins with a register, timestamp hops with the previous link. + size_t begin = 1; + for (size_t i = begin; i < links.size(); ++i) { int prev_link = links[i - 1].link_id; int cur_link = links[i].link_id; - int ts = links[i].ts; + // If path starts with register, align hop ts to the previous link (value arrival). + int ts = starts_with_register ? links[i - 1].ts : links[i].ts; int mid_tile = topo.srcTileOfLink(cur_link); StringRef in = topo.invertDir(topo.dirFromLink(prev_link)); @@ -690,10 +694,13 @@ struct GenerateCodePass StringRef producer_direction = directions.first; StringRef consumer_direction = directions.second; + // Detects if the path starts with a register (same-tile register staging). + bool starts_with_register = !regs.empty(); + // Producer endpoints & intermediate hops. setProducerDestination(producer, producer_direction, regs); size_t hop_counter = 1; - generateIntermediateHops(links, topo, mov_dfg_id, hop_counter); + generateIntermediateHops(links, topo, mov_dfg_id, hop_counter, starts_with_register); // Gather consumers. SmallVector, 2> consumers; @@ -790,6 +797,18 @@ struct GenerateCodePass return it->second; } + // Looks up time_step for a materialized instruction by its id. + std::optional getInstructionTimeById(int id) const { + for (const auto &tile_entry : tile_time_instructions) { + for (const auto &ts_entry : tile_entry.second) { + for (const Instruction &inst : ts_entry.second) { + if (inst.id == id) return inst.time_step; + } + } + } + return std::nullopt; + } + // Gets instruction ID for a materialized operation. int getInstructionId(Operation *op) const { auto it = operation_to_instruction_reference.find(op); @@ -881,6 +900,21 @@ struct GenerateCodePass return info; } + // Detects whether mapping_locs starts with a register/reg resource. + static bool pathStartsWithRegister(Operation *op) { + if (auto arr = op->getAttrOfType("mapping_locs")) { + if (!arr.empty()) { + if (auto dict = dyn_cast(arr[0])) { + if (auto res = dyn_cast_or_null(dict.get("resource"))) { + return res.getValue() == "register" || res.getValue() == "reg"; + } + } + } + } + return false; + } + + struct DfgNodeInfo { std::string opcode; int tile_x = -1; @@ -966,7 +1000,8 @@ struct GenerateCodePass TileLocation producer_loc, const Topology &topology, SmallVector, 8> &out_tiles, - SmallVector &out_time_steps) const { + SmallVector &out_time_steps, + bool starts_with_register = false) const { out_tiles.clear(); out_time_steps.clear(); if (link_steps.empty()) return; @@ -975,7 +1010,8 @@ struct GenerateCodePass ? topology.tileIdAt(producer_loc.col_idx, producer_loc.row_idx) : -1; int consumer_tile_id = topology.dstTileOfLink(link_steps.back().link_id); - for (size_t i = 0; i < link_steps.size(); ++i) { + size_t begin = starts_with_register ? 1 : 0; + for (size_t i = begin; i < link_steps.size(); ++i) { int middle_tile_id = topology.srcTileOfLink(link_steps[i].link_id); if (middle_tile_id == producer_tile_id || middle_tile_id == consumer_tile_id) continue; auto coord = topology.tile_location.lookup(middle_tile_id); @@ -984,6 +1020,7 @@ struct GenerateCodePass continue; // Skips duplicates. } out_tiles.push_back(coord); + // Uses the hop's own outgoing link timestep. out_time_steps.push_back(link_steps[i].ts); } } @@ -1032,13 +1069,17 @@ struct GenerateCodePass SmallVector link_steps = collectLinkSteps(operation); SmallVector, 8> hop_tiles; SmallVector hop_time_steps; + bool starts_with_register = pathStartsWithRegister(operation); // Build hop tiles directly from link steps (mirrors router hop emission). if (link_steps.size() > 1) { - for (size_t i = 1; i < link_steps.size(); ++i) { + size_t begin = starts_with_register ? 1 : 1; // hops are from link[1] onward + for (size_t i = begin; i < link_steps.size(); ++i) { int middle_tile_id = topology.srcTileOfLink(link_steps[i].link_id); auto coord = topology.tile_location.lookup(middle_tile_id); hop_tiles.push_back(coord); - hop_time_steps.push_back(link_steps[i].ts); + // Aligns hop ts with the link's own timestep (or previous if starts with register). + int hop_ts = starts_with_register ? link_steps[i - 1].ts : link_steps[i].ts; + hop_time_steps.push_back(hop_ts); } } @@ -1056,7 +1097,14 @@ struct GenerateCodePass hop_node.opcode = isCtrlMov(operation) ? "CTRL_MOV" : "DATA_MOV"; hop_node.tile_x = hop_tiles[i].first; hop_node.tile_y = hop_tiles[i].second; - hop_node.time_step = (i < hop_time_steps.size()) ? hop_time_steps[i] : -1; + // Prefers the materialized instruction time if available; fallback to link-based, + // but ensure it is not earlier than the computed hop_ts when present. + if (auto ts = getInstructionTimeById(node_id)) { + int link_ts = (i < hop_time_steps.size()) ? hop_time_steps[i] : *ts; + hop_node.time_step = std::max(*ts, link_ts); + } else { + hop_node.time_step = (i < hop_time_steps.size()) ? hop_time_steps[i] : -1; + } nodes[node_id] = hop_node; } diff --git a/test/code_gen/test_code_generate.mlir b/test/code_gen/test_code_generate.mlir index c8fbb422..035af43a 100644 --- a/test/code_gen/test_code_generate.mlir +++ b/test/code_gen/test_code_generate.mlir @@ -144,11 +144,11 @@ func.func @loop_test() -> f32 { // ASM: PE(1,1): // ASM-NEXT: { // ASM-NEXT: GRANT_PREDICATE, [WEST, RED], [EAST, RED] -> [SOUTH, RED] (t=7, inv_iters=1) +// ASM-NEXT: DATA_MOV, [EAST, RED] -> [WEST, RED] (t=7, inv_iters=1) // ASM-NEXT: DATA_MOV, [NORTH, RED] -> [$1] (t=7, inv_iters=1) // ASM-NEXT: } (idx_per_ii=2) // ASM-NEXT: { // ASM-NEXT: GRANT_PREDICATE, [$0], [$1] -> [$0] (t=8, inv_iters=1) -// ASM-NEXT: DATA_MOV, [EAST, RED] -> [WEST, RED] (t=8, inv_iters=1) // ASM-NEXT: } (idx_per_ii=3) // ASM-NEXT: { // ASM-NEXT: PHI_START, [WEST, RED], [$0] -> [WEST, RED], [$0] (t=4, inv_iters=0) @@ -156,10 +156,8 @@ func.func @loop_test() -> f32 { // ASM: PE(2,1): // ASM-NEXT: { // ASM-NEXT: NOT, [NORTH, RED] -> [WEST, RED] (t=6, inv_iters=1) +// ASM-NEXT: DATA_MOV, [NORTH, RED] -> [WEST, RED] (t=6, inv_iters=1) // ASM-NEXT: } (idx_per_ii=1) -// ASM-NEXT: { -// ASM-NEXT: DATA_MOV, [NORTH, RED] -> [WEST, RED] (t=7, inv_iters=1) -// ASM-NEXT: } (idx_per_ii=2) // ASM: PE(0,2): // ASM-NEXT: { // ASM-NEXT: PHI_START, [$0], [SOUTH, RED] -> [SOUTH, RED] (t=5, inv_iters=1) @@ -176,3 +174,55 @@ func.func @loop_test() -> f32 { // ASM-NEXT: { // ASM-NEXT: GRANT_ONCE, [$1] -> [$0] (t=4, inv_iters=0) // ASM-NEXT: } (idx_per_ii=4) +// ASM: PE(1,2): +// ASM-NEXT: { +// ASM-NEXT: CONSTANT, [#1] -> [$0] (t=0, inv_iters=0) +// ASM-NEXT: DATA_MOV, [EAST, RED] -> [SOUTH, RED] (t=5, inv_iters=1) +// ASM-NEXT: } (idx_per_ii=0) +// ASM-NEXT: { +// ASM-NEXT: GRANT_ONCE, [$0] -> [$0] (t=1, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=1) +// ASM-NEXT: { +// ASM-NEXT: PHI_START, [$0], [EAST, RED] -> [$0], [EAST, RED] (t=2, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=2) +// ASM-NEXT: { +// ASM-NEXT: ADD, [EAST, RED], [$0] -> [EAST, RED], [NORTH, RED] (t=3, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=3) +// ASM-NEXT: { +// ASM-NEXT: PHI_START, [WEST, RED], [EAST, RED] -> [EAST, RED] (t=4, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=4) +// ASM: PE(2,2): +// ASM-NEXT: { +// ASM-NEXT: ICMP_SLT, [$0], [WEST, RED] -> [NORTH, RED], [SOUTH, RED], [WEST, RED], [$0], [$1] (t=5, inv_iters=1) +// ASM-NEXT: } (idx_per_ii=0) +// ASM-NEXT: { +// ASM-NEXT: GRANT_PREDICATE, [$2], [$0] -> [WEST, RED] (t=6, inv_iters=1) +// ASM-NEXT: } (idx_per_ii=1) +// ASM-NEXT: { +// ASM-NEXT: PHI_START, [EAST, RED], [NORTH, RED] -> [WEST, RED] (t=2, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=2) +// ASM-NEXT: { +// ASM-NEXT: DATA_MOV, [WEST, RED] -> [$2] (t=3, inv_iters=0) +// ASM-NEXT: GRANT_PREDICATE, [$0], [$1] -> [WEST, RED] (t=8, inv_iters=1) +// ASM-NEXT: } (idx_per_ii=3) +// ASM-NEXT: { +// ASM-NEXT: DATA_MOV, [WEST, RED] -> [$0] (t=4, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=4) +// ASM: PE(3,2): +// ASM-NEXT: { +// ASM-NEXT: CONSTANT, [#0] -> [$0] (t=0, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=0) +// ASM-NEXT: { +// ASM-NEXT: GRANT_ONCE, [$0] -> [WEST, RED] (t=1, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=1) +// ASM: PE(1,3): +// ASM-NEXT: { +// ASM-NEXT: DATA_MOV, [SOUTH, RED] -> [EAST, RED] (t=3, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=3) +// ASM: PE(2,3): +// ASM-NEXT: { +// ASM-NEXT: DATA_MOV, [WEST, RED] -> [$0] (t=5, inv_iters=1) +// ASM-NEXT: } (idx_per_ii=0) +// ASM-NEXT: { +// ASM-NEXT: GRANT_PREDICATE, [$0], [SOUTH, RED] -> [SOUTH, RED] (t=6, inv_iters=1) +// ASM-NEXT: } (idx_per_ii=1) diff --git a/test/e2e/bicg/bicg_kernel.mlir b/test/e2e/bicg/bicg_kernel.mlir index 4f82b761..2467d27b 100644 --- a/test/e2e/bicg/bicg_kernel.mlir +++ b/test/e2e/bicg/bicg_kernel.mlir @@ -213,11 +213,9 @@ // ASM-NEXT: } (idx_per_ii=1) // ASM-NEXT: { // ASM-NEXT: GRANT_ONCE, [arg1] -> [NORTH, RED], [$0] (t=2, inv_iters=0) +// ASM-NEXT: DATA_MOV, [NORTH, RED] -> [EAST, RED] (t=2, inv_iters=0) // ASM-NEXT: } (idx_per_ii=2) // ASM-NEXT: { -// ASM-NEXT: DATA_MOV, [NORTH, RED] -> [EAST, RED] (t=3, inv_iters=0) -// ASM-NEXT: } (idx_per_ii=3) -// ASM-NEXT: { // ASM-NEXT: GRANT_PREDICATE, [$0], [NORTH, RED] -> [$0], [$1] (t=10, inv_iters=0) // ASM-NEXT: } (idx_per_ii=10) // ASM-NEXT: { @@ -238,14 +236,14 @@ // ASM-NEXT: GRANT_PREDICATE, [$0], [$1] -> [EAST, RED] (t=15, inv_iters=1) // ASM-NEXT: } (idx_per_ii=2) // ASM-NEXT: { -// ASM-NEXT: DATA_MOV, [WEST, RED] -> [EAST, RED] (t=4, inv_iters=0) -// ASM-NEXT: } (idx_per_ii=4) +// ASM-NEXT: DATA_MOV, [WEST, RED] -> [EAST, RED] (t=3, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=3) // ASM-NEXT: { // ASM-NEXT: DATA_MOV, [WEST, RED] -> [$1] (t=11, inv_iters=0) +// ASM-NEXT: DATA_MOV, [NORTH, RED] -> [EAST, RED] (t=11, inv_iters=0) // ASM-NEXT: } (idx_per_ii=11) // ASM-NEXT: { // ASM-NEXT: NOT, [WEST, RED] -> [$0], [$1] (t=12, inv_iters=0) -// ASM-NEXT: DATA_MOV, [NORTH, RED] -> [EAST, RED] (t=12, inv_iters=0) // ASM-NEXT: } (idx_per_ii=12) // RUN: mlir-neura-opt %t-kernel.mlir --view-op-graph 2>&1 | sed -n '/^digraph G {/,/^}$/p' > bicg_kernel_original.dot diff --git a/test/e2e/fir/fir_kernel.mlir b/test/e2e/fir/fir_kernel.mlir index d630822d..6dff9a74 100644 --- a/test/e2e/fir/fir_kernel.mlir +++ b/test/e2e/fir/fir_kernel.mlir @@ -143,9 +143,6 @@ // ASM-NEXT: } (idx_per_ii=2) // ASM: PE(1,2): // ASM-NEXT: { -// ASM-NEXT: DATA_MOV, [EAST, RED] -> [SOUTH, RED] (t=5, inv_iters=1) -// ASM-NEXT: } (idx_per_ii=0) -// ASM-NEXT: { // ASM-NEXT: ADD, [NORTH, RED], [SOUTH, RED] -> [SOUTH, RED], [$0] (t=6, inv_iters=1) // ASM-NEXT: } (idx_per_ii=1) // ASM-NEXT: { @@ -155,10 +152,12 @@ // ASM-NEXT: DATA_MOV, [EAST, RED] -> [$1] (t=3, inv_iters=0) // ASM-NEXT: RETURN_VALUE, [$0] (t=8, inv_iters=1) // ASM-NEXT: } (idx_per_ii=3) +// ASM-NEXT: { +// ASM-NEXT: DATA_MOV, [EAST, RED] -> [SOUTH, RED] (t=4, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=4) // ASM: PE(2,2): // ASM-NEXT: { // ASM-NEXT: GRANT_PREDICATE, [$0], [$1] -> [EAST, RED] (t=5, inv_iters=1) -// ASM-NEXT: DATA_MOV, [EAST, RED] -> [WEST, RED] (t=5, inv_iters=1) // ASM-NEXT: } (idx_per_ii=0) // ASM-NEXT: { // ASM-NEXT: GEP, [EAST, RED] -> [$0] (t=2, inv_iters=0) @@ -169,7 +168,36 @@ // ASM-NEXT: } (idx_per_ii=3) // ASM-NEXT: { // ASM-NEXT: NOT, [EAST, RED] -> [$1], [WEST, RED] (t=4, inv_iters=0) +// ASM-NEXT: DATA_MOV, [EAST, RED] -> [WEST, RED] (t=4, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=4) +// ASM: PE(3,2): +// ASM-NEXT: { +// ASM-NEXT: GRANT_ONCE, [#0] -> [$0] (t=0, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=0) +// ASM-NEXT: { +// ASM-NEXT: PHI_START, [$0], [WEST, RED] -> [NORTH, RED], [WEST, RED], [$0] (t=1, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=1) +// ASM-NEXT: { +// ASM-NEXT: ADD, [$0], [#1] -> [$0], [WEST, RED] (t=2, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=2) +// ASM-NEXT: { +// ASM-NEXT: ICMP_EQ, [$0], [#32] -> [WEST, RED] (t=3, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=3) +// ASM: PE(1,3): +// ASM-NEXT: { +// ASM-NEXT: DATA_MOV, [EAST, RED] -> [SOUTH, RED] (t=5, inv_iters=1) +// ASM-NEXT: } (idx_per_ii=0) +// ASM: PE(2,3): +// ASM-NEXT: { +// ASM-NEXT: MUL, [SOUTH, RED], [EAST, RED] -> [WEST, RED] (t=4, inv_iters=0) // ASM-NEXT: } (idx_per_ii=4) +// ASM: PE(3,3): +// ASM-NEXT: { +// ASM-NEXT: GEP, [SOUTH, RED] -> [$0] (t=2, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=2) +// ASM-NEXT: { +// ASM-NEXT: LOAD, [$0] -> [WEST, RED] (t=3, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=3) // RUN: mlir-neura-opt %t-kernel.mlir --view-op-graph 2>&1 | sed -n '/^digraph G {/,/^}$/p' > fir_kernel_original.dot // RUN: dot -Tpng fir_kernel_original.dot -o fir_kernel_original.png diff --git a/test/e2e/fir/fir_kernel_vec.mlir b/test/e2e/fir/fir_kernel_vec.mlir index f8d7eb14..89e47a88 100644 --- a/test/e2e/fir/fir_kernel_vec.mlir +++ b/test/e2e/fir/fir_kernel_vec.mlir @@ -138,9 +138,6 @@ // ASM-NEXT: } (idx_per_ii=3) // ASM: PE(1,2): // ASM-NEXT: { -// ASM-NEXT: DATA_MOV, [EAST, RED] -> [SOUTH, RED] (t=5, inv_iters=1) -// ASM-NEXT: } (idx_per_ii=0) -// ASM-NEXT: { // ASM-NEXT: VADD, [NORTH, RED], [SOUTH, RED] -> [SOUTH, RED], [$0] (t=6, inv_iters=1) // ASM-NEXT: } (idx_per_ii=1) // ASM-NEXT: { @@ -151,12 +148,12 @@ // ASM-NEXT: VECTOR.REDUCE.ADD, [$0] -> [$0] (t=8, inv_iters=1) // ASM-NEXT: } (idx_per_ii=3) // ASM-NEXT: { +// ASM-NEXT: DATA_MOV, [EAST, RED] -> [SOUTH, RED] (t=4, inv_iters=0) // ASM-NEXT: RETURN_VALUE, [$0] (t=9, inv_iters=1) // ASM-NEXT: } (idx_per_ii=4) // ASM: PE(2,2): // ASM-NEXT: { // ASM-NEXT: GRANT_PREDICATE, [$0], [$1] -> [EAST, RED] (t=5, inv_iters=1) -// ASM-NEXT: DATA_MOV, [EAST, RED] -> [WEST, RED] (t=5, inv_iters=1) // ASM-NEXT: } (idx_per_ii=0) // ASM-NEXT: { // ASM-NEXT: GEP, [EAST, RED] -> [$0] (t=2, inv_iters=0) @@ -167,4 +164,5 @@ // ASM-NEXT: } (idx_per_ii=3) // ASM-NEXT: { // ASM-NEXT: NOT, [EAST, RED] -> [$1], [WEST, RED] (t=4, inv_iters=0) +// ASM-NEXT: DATA_MOV, [EAST, RED] -> [WEST, RED] (t=4, inv_iters=0) // ASM-NEXT: } (idx_per_ii=4) diff --git a/test/neura/ctrl/branch_for.mlir b/test/neura/ctrl/branch_for.mlir index bb8aff85..74abdc63 100644 --- a/test/neura/ctrl/branch_for.mlir +++ b/test/neura/ctrl/branch_for.mlir @@ -229,9 +229,11 @@ func.func @loop_test() -> f32 { // YAML-NEXT: dst_operands: // YAML-NEXT: - operand: "EAST" // YAML-NEXT: color: "RED" +// YAML-NEXT: - index_per_ii: 1 +// YAML-NEXT: operations: // YAML-NEXT: - opcode: "DATA_MOV" -// YAML-NEXT: id: 190001 -// YAML-NEXT: time_step: 4 +// YAML-NEXT: id: 160001 +// YAML-NEXT: time_step: 5 // YAML-NEXT: invalid_iterations: 1 // YAML-NEXT: src_operands: // YAML-NEXT: - operand: "EAST" @@ -239,13 +241,48 @@ func.func @loop_test() -> f32 { // YAML-NEXT: dst_operands: // YAML-NEXT: - operand: "NORTH" // YAML-NEXT: color: "RED" +// YAML-NEXT: - index_per_ii: 2 +// YAML-NEXT: operations: +// YAML-NEXT: - opcode: "GRANT_ONCE" +// YAML-NEXT: id: 1 +// YAML-NEXT: time_step: 2 +// YAML-NEXT: invalid_iterations: 0 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "#0.000000" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - index_per_ii: 3 +// YAML-NEXT: operations: +// YAML-NEXT: - opcode: "PHI_START" +// YAML-NEXT: id: 8 +// YAML-NEXT: time_step: 3 +// YAML-NEXT: invalid_iterations: 0 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - operand: "NORTH" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "EAST" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - opcode: "DATA_MOV" +// YAML-NEXT: id: 190001 +// YAML-NEXT: time_step: 3 +// YAML-NEXT: invalid_iterations: 0 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "EAST" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "NORTH" +// YAML-NEXT: color: "RED" // ASM: # Compiled II: 4 // ASM: PE(0,0): // ASM-NEXT: { // ASM-NEXT: GRANT_ONCE, [#0] -> [EAST, RED] (t=0, inv_iters=0) -// ASM-NEXT: DATA_MOV, [EAST, RED] -> [NORTH, RED] (t=4, inv_iters=1) // ASM-NEXT: } (idx_per_ii=0) // ASM-NEXT: { // ASM-NEXT: DATA_MOV, [EAST, RED] -> [NORTH, RED] (t=5, inv_iters=1) @@ -255,6 +292,7 @@ func.func @loop_test() -> f32 { // ASM-NEXT: } (idx_per_ii=2) // ASM-NEXT: { // ASM-NEXT: PHI_START, [$0], [NORTH, RED] -> [EAST, RED] (t=3, inv_iters=0) +// ASM-NEXT: DATA_MOV, [EAST, RED] -> [NORTH, RED] (t=3, inv_iters=0) // ASM-NEXT: } (idx_per_ii=3) // ASM: PE(1,0): // ASM-NEXT: {