diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1a962e68c587c7..419414e5bd993d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -14011,7 +14011,7 @@ static void placeSources(ByteProvider &Src0, Src0s.push_back( {*Src0.Src, ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)), - Src1.SrcOffset / 4}); + Src0.SrcOffset / 4}); Src1s.push_back( {*Src1.Src, ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)), diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index 108d85e024ad76..15734094db42cd 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -3450,4 +3450,850 @@ entry: } +define amdgpu_kernel void @ByteOffsetCorrectness(ptr addrspace(1) noalias readonly align 16 %inptr0, ptr addrspace(1) noalias readonly align 16 %inptr1, ptr addrspace(1) noalias align 16 %inptr2) local_unnamed_addr { +; GFX7-LABEL: ByteOffsetCorrectness: +; GFX7: ; %bb.0: ; %.entry +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1] +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7-NEXT: s_cbranch_execz .LBB17_5 +; GFX7-NEXT: ; %bb.1: ; %.lr.ph.preheader +; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v2 +; GFX7-NEXT: v_mul_hi_u32_u24_e32 v2, 0x48, v0 +; GFX7-NEXT: v_mul_u32_u24_e32 v1, 0x48, v0 +; GFX7-NEXT: s_movk_i32 s0, 0x900 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v3, s0, v[1:2] +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 5, v3 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, v6, v0 +; GFX7-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s10, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; GFX7-NEXT: v_mov_b32_e32 v6, s9 +; GFX7-NEXT: 
v_add_i32_e32 v4, vcc, s8, v4 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x48 +; GFX7-NEXT: s_movk_i32 s10, 0xffe1 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v7, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7-NEXT: s_mov_b32 s11, -1 +; GFX7-NEXT: s_mov_b64 s[12:13], 0 +; GFX7-NEXT: .LBB17_2: ; %.lr.ph +; GFX7-NEXT: ; =>This Loop Header: Depth=1 +; GFX7-NEXT: ; Child Loop BB17_3 Depth 2 +; GFX7-NEXT: v_mov_b32_e32 v8, 0 +; GFX7-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX7-NEXT: .LBB17_3: ; %.preheader2 +; GFX7-NEXT: ; Parent Loop BB17_2 Depth=1 +; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX7-NEXT: buffer_load_sbyte v9, v[4:5], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_load_sbyte v10, v[4:5], s[0:3], 0 addr64 offset:1 +; GFX7-NEXT: buffer_load_sbyte v11, v[4:5], s[0:3], 0 addr64 offset:2 +; GFX7-NEXT: buffer_load_sbyte v12, v[4:5], s[0:3], 0 addr64 offset:3 +; GFX7-NEXT: buffer_load_sbyte v13, v[4:5], s[0:3], 0 addr64 offset:4 +; GFX7-NEXT: buffer_load_sbyte v14, v[4:5], s[0:3], 0 addr64 offset:5 +; GFX7-NEXT: buffer_load_sbyte v15, v[4:5], s[0:3], 0 addr64 offset:6 +; GFX7-NEXT: buffer_load_sbyte v16, v[4:5], s[0:3], 0 addr64 offset:7 +; GFX7-NEXT: buffer_load_sbyte v17, v[4:5], s[0:3], 0 addr64 offset:8 +; GFX7-NEXT: buffer_load_sbyte v18, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_load_sbyte v19, v[0:1], s[0:3], 0 addr64 offset:1 +; GFX7-NEXT: buffer_load_sbyte v20, v[0:1], s[0:3], 0 addr64 offset:2 +; GFX7-NEXT: buffer_load_sbyte v21, v[0:1], s[0:3], 0 addr64 offset:3 +; GFX7-NEXT: buffer_load_sbyte v22, v[0:1], s[0:3], 0 addr64 offset:4 +; GFX7-NEXT: buffer_load_sbyte v23, v[0:1], s[0:3], 0 addr64 offset:5 +; GFX7-NEXT: buffer_load_sbyte v24, v[0:1], s[0:3], 0 addr64 offset:6 +; GFX7-NEXT: buffer_load_sbyte v25, v[0:1], s[0:3], 0 addr64 offset:7 +; GFX7-NEXT: buffer_load_sbyte v26, v[0:1], s[0:3], 0 addr64 offset:8 +; GFX7-NEXT: 
s_add_u32 s0, s0, 9 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, s[0:1], v[6:7] +; GFX7-NEXT: s_and_b64 vcc, exec, vcc +; GFX7-NEXT: s_waitcnt vmcnt(8) +; GFX7-NEXT: v_mad_i32_i24 v8, v18, v9, v8 +; GFX7-NEXT: s_waitcnt vmcnt(7) +; GFX7-NEXT: v_mad_i32_i24 v8, v19, v10, v8 +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: v_mad_i32_i24 v8, v20, v11, v8 +; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_mad_i32_i24 v8, v21, v12, v8 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mad_i32_i24 v8, v22, v13, v8 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mad_i32_i24 v8, v23, v14, v8 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mad_i32_i24 v8, v24, v15, v8 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mad_i32_i24 v8, v25, v16, v8 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_i32_i24 v8, v26, v17, v8 +; GFX7-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX7-NEXT: ; %bb.4: ; %.110 +; GFX7-NEXT: ; in Loop: Header=BB17_2 Depth=1 +; GFX7-NEXT: v_lshl_b64 v[9:10], v[2:3], 2 +; GFX7-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], v[2:3] +; GFX7-NEXT: buffer_store_dword v8, v[9:10], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v2 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x900, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GFX7-NEXT: s_or_b64 s[12:13], s[0:1], s[12:13] +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[12:13] +; GFX7-NEXT: s_cbranch_execnz .LBB17_2 +; GFX7-NEXT: .LBB17_5: ; %._crit_edge +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: ByteOffsetCorrectness: +; GFX8: ; %bb.0: ; %.entry +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_cbranch_execz .LBB17_5 +; GFX8-NEXT: ; %bb.1: ; %.lr.ph.preheader +; 
GFX8-NEXT: v_add_u32_e32 v5, vcc, v3, v2 +; GFX8-NEXT: s_movk_i32 s0, 0x900 +; GFX8-NEXT: v_mul_hi_u32_u24_e32 v4, 0x900, v3 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, 0x900, v3 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v2, s0, v[3:4] +; GFX8-NEXT: s_movk_i32 s0, 0x48 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s0, v[1:2] +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 5, v5 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0 +; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], 0, 0, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, s6, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v4, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v2 +; GFX8-NEXT: s_movk_i32 s4, 0xffe1 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v4, v3, vcc +; GFX8-NEXT: s_mov_b32 s5, -1 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB17_2: ; %.lr.ph +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB17_3 Depth 2 +; GFX8-NEXT: v_mov_b32_e32 v10, 0 +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB17_3: ; %.preheader2 +; GFX8-NEXT: ; Parent Loop BB17_2 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v9, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v6 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v7, v3, vcc +; GFX8-NEXT: flat_load_sbyte v11, v[4:5] +; GFX8-NEXT: flat_load_sbyte v12, v[2:3] +; GFX8-NEXT: s_add_u32 s0, s0, 9 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0x48 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_i32_i24 v12, v12, v11, v10 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GFX8-NEXT: flat_load_sbyte v13, v[10:11] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v2 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, 
vcc +; GFX8-NEXT: flat_load_sbyte v10, v[10:11] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 2, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GFX8-NEXT: flat_load_sbyte v13, v[10:11] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 2, v2 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_sbyte v10, v[10:11] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 3, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GFX8-NEXT: flat_load_sbyte v13, v[10:11] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 3, v2 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_sbyte v10, v[10:11] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GFX8-NEXT: flat_load_sbyte v13, v[10:11] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 4, v2 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_sbyte v10, v[10:11] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 5, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GFX8-NEXT: flat_load_sbyte v13, v[10:11] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 5, v2 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_sbyte v10, v[10:11] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GFX8-NEXT: flat_load_sbyte v13, v[10:11] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v2 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_sbyte v10, v[10:11] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 7, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, 
vcc +; GFX8-NEXT: flat_load_sbyte v13, v[10:11] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 7, v2 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 8, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v2 +; GFX8-NEXT: flat_load_sbyte v10, v[10:11] +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_sbyte v4, v[4:5] +; GFX8-NEXT: flat_load_sbyte v2, v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_mad_i32_i24 v10, v10, v13, v12 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_i32_i24 v10, v2, v4, v10 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_3 +; GFX8-NEXT: ; %bb.4: ; %.110 +; GFX8-NEXT: ; in Loop: Header=BB17_2 Depth=1 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; GFX8-NEXT: flat_store_dword v[2:3], v10 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x900, v6 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x900, v8 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GFX8-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB17_2 +; GFX8-NEXT: .LBB17_5: ; %._crit_edge +; GFX8-NEXT: s_endpgm +; +; GFX9-NODL-LABEL: ByteOffsetCorrectness: +; GFX9-NODL: ; %bb.0: ; %.entry +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NODL-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1] +; GFX9-NODL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NODL-NEXT: s_cbranch_execz .LBB17_5 +; GFX9-NODL-NEXT: ; %bb.1: ; %.lr.ph.preheader +; GFX9-NODL-NEXT: v_add_u32_e32 v10, v3, v2 +; GFX9-NODL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 
+; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v1, 5, v10 +; GFX9-NODL-NEXT: s_movk_i32 s3, 0x900 +; GFX9-NODL-NEXT: v_mul_hi_u32_u24_e32 v9, 0x900, v2 +; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v8, 0x900, v2 +; GFX9-NODL-NEXT: v_add_co_u32_e32 v4, vcc, v1, v0 +; GFX9-NODL-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v3, s3, v[8:9] +; GFX9-NODL-NEXT: v_mul_hi_u32_u24_e32 v7, 0x48, v0 +; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v6, 0x48, v0 +; GFX9-NODL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, s3, v[6:7] +; GFX9-NODL-NEXT: s_movk_i32 s2, 0x48 +; GFX9-NODL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s2, v[1:2] +; GFX9-NODL-NEXT: v_addc_co_u32_e64 v5, s[0:1], 0, 0, vcc +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: v_mov_b32_e32 v8, s9 +; GFX9-NODL-NEXT: v_add_co_u32_e32 v0, vcc, s8, v6 +; GFX9-NODL-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v7, vcc +; GFX9-NODL-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 +; GFX9-NODL-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc +; GFX9-NODL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NODL-NEXT: v_add_co_u32_e32 v6, vcc, s10, v6 +; GFX9-NODL-NEXT: s_movk_i32 s6, 0xffe1 +; GFX9-NODL-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v7, vcc +; GFX9-NODL-NEXT: s_mov_b32 s7, -1 +; GFX9-NODL-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NODL-NEXT: .LBB17_2: ; %.lr.ph +; GFX9-NODL-NEXT: ; =>This Loop Header: Depth=1 +; GFX9-NODL-NEXT: ; Child Loop BB17_3 Depth 2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NODL-NEXT: s_mov_b64 s[10:11], 0 +; GFX9-NODL-NEXT: .LBB17_3: ; %.preheader2 +; GFX9-NODL-NEXT: ; Parent Loop BB17_2 Depth=1 +; GFX9-NODL-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v12, s11 +; GFX9-NODL-NEXT: v_add_co_u32_e32 v9, vcc, s10, v6 +; GFX9-NODL-NEXT: v_add_co_u32_e64 v11, s[0:1], s10, v0 +; GFX9-NODL-NEXT: v_add_co_u32_e64 v13, s[2:3], s10, v2 +; GFX9-NODL-NEXT: v_addc_co_u32_e64 v14, s[2:3], v3, v12, s[2:3] +; GFX9-NODL-NEXT: v_addc_co_u32_e32 v10, vcc, v7, v12, vcc +; GFX9-NODL-NEXT: v_addc_co_u32_e64 
v12, vcc, v1, v12, s[0:1] +; GFX9-NODL-NEXT: global_load_sbyte v15, v[13:14], off +; GFX9-NODL-NEXT: global_load_sbyte v16, v[11:12], off offset:1 +; GFX9-NODL-NEXT: global_load_sbyte v17, v[11:12], off offset:2 +; GFX9-NODL-NEXT: global_load_sbyte v18, v[11:12], off offset:3 +; GFX9-NODL-NEXT: global_load_sbyte v19, v[11:12], off offset:4 +; GFX9-NODL-NEXT: global_load_sbyte v20, v[11:12], off offset:5 +; GFX9-NODL-NEXT: global_load_sbyte v21, v[11:12], off offset:6 +; GFX9-NODL-NEXT: global_load_sbyte v22, v[11:12], off offset:7 +; GFX9-NODL-NEXT: global_load_sbyte v23, v[9:10], off +; GFX9-NODL-NEXT: global_load_sbyte v24, v[9:10], off offset:1 +; GFX9-NODL-NEXT: global_load_sbyte v25, v[9:10], off offset:2 +; GFX9-NODL-NEXT: global_load_sbyte v26, v[9:10], off offset:3 +; GFX9-NODL-NEXT: global_load_sbyte v27, v[9:10], off offset:4 +; GFX9-NODL-NEXT: global_load_sbyte v28, v[9:10], off offset:5 +; GFX9-NODL-NEXT: global_load_sbyte v29, v[9:10], off offset:6 +; GFX9-NODL-NEXT: ; kill: killed $vgpr11 killed $vgpr12 +; GFX9-NODL-NEXT: global_load_sbyte v11, v[9:10], off offset:7 +; GFX9-NODL-NEXT: global_load_sbyte v12, v[13:14], off offset:8 +; GFX9-NODL-NEXT: global_load_sbyte v30, v[9:10], off offset:8 +; GFX9-NODL-NEXT: s_add_u32 s10, s10, 9 +; GFX9-NODL-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NODL-NEXT: s_cmp_lg_u64 s[10:11], 0x48 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(9) +; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v23, v15, v8 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(8) +; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v24, v16, v8 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(7) +; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v25, v17, v8 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(6) +; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v26, v18, v8 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(5) +; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v27, v19, v8 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(4) +; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v28, v20, v8 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(3) +; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v29, v21, v8 +; GFX9-NODL-NEXT: 
s_waitcnt vmcnt(2) +; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v11, v22, v8 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v30, v12, v8 +; GFX9-NODL-NEXT: s_cbranch_scc1 .LBB17_3 +; GFX9-NODL-NEXT: ; %bb.4: ; %.110 +; GFX9-NODL-NEXT: ; in Loop: Header=BB17_2 Depth=1 +; GFX9-NODL-NEXT: v_lshlrev_b64 v[9:10], 2, v[4:5] +; GFX9-NODL-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-NODL-NEXT: v_add_co_u32_e32 v9, vcc, s4, v9 +; GFX9-NODL-NEXT: v_addc_co_u32_e32 v10, vcc, v11, v10, vcc +; GFX9-NODL-NEXT: global_store_dword v[9:10], v8, off +; GFX9-NODL-NEXT: v_add_co_u32_e32 v8, vcc, 32, v4 +; GFX9-NODL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc +; GFX9-NODL-NEXT: v_add_co_u32_e32 v0, vcc, 0x900, v0 +; GFX9-NODL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NODL-NEXT: v_add_co_u32_e32 v2, vcc, 0x900, v2 +; GFX9-NODL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NODL-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[4:5] +; GFX9-NODL-NEXT: v_add_co_u32_e32 v6, vcc, 0x900, v6 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-NODL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NODL-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-NODL-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NODL-NEXT: s_cbranch_execnz .LBB17_2 +; GFX9-NODL-NEXT: .LBB17_5: ; %._crit_edge +; GFX9-NODL-NEXT: s_endpgm +; +; GFX9-DL-LABEL: ByteOffsetCorrectness: +; GFX9-DL: ; %bb.0: ; %.entry +; GFX9-DL-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DL-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1] +; GFX9-DL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DL-NEXT: s_cbranch_execz .LBB17_5 +; GFX9-DL-NEXT: ; %bb.1: ; %.lr.ph.preheader +; GFX9-DL-NEXT: v_add_u32_e32 v10, v3, v2 +; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 5, v10 +; GFX9-DL-NEXT: s_movk_i32 s3, 0x900 +; GFX9-DL-NEXT: v_mul_hi_u32_u24_e32 v9, 0x900, v2 +; GFX9-DL-NEXT: 
v_mul_u32_u24_e32 v8, 0x900, v2 +; GFX9-DL-NEXT: v_add_co_u32_e32 v4, vcc, v1, v0 +; GFX9-DL-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v3, s3, v[8:9] +; GFX9-DL-NEXT: v_mul_hi_u32_u24_e32 v7, 0x48, v0 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, 0x48, v0 +; GFX9-DL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, s3, v[6:7] +; GFX9-DL-NEXT: s_movk_i32 s2, 0x48 +; GFX9-DL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s2, v[1:2] +; GFX9-DL-NEXT: v_addc_co_u32_e64 v5, s[0:1], 0, 0, vcc +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9 +; GFX9-DL-NEXT: v_add_co_u32_e32 v0, vcc, s8, v6 +; GFX9-DL-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v7, vcc +; GFX9-DL-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 +; GFX9-DL-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: v_add_co_u32_e32 v6, vcc, s10, v6 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xffe1 +; GFX9-DL-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v7, vcc +; GFX9-DL-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-DL-NEXT: s_mov_b32 s12, 0xc0c0400 +; GFX9-DL-NEXT: s_mov_b32 s9, -1 +; GFX9-DL-NEXT: s_mov_b32 s13, 0x4000c0c +; GFX9-DL-NEXT: .LBB17_2: ; %.lr.ph +; GFX9-DL-NEXT: ; =>This Loop Header: Depth=1 +; GFX9-DL-NEXT: ; Child Loop BB17_3 Depth 2 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DL-NEXT: s_mov_b64 s[10:11], 0 +; GFX9-DL-NEXT: .LBB17_3: ; %.preheader2 +; GFX9-DL-NEXT: ; Parent Loop BB17_2 Depth=1 +; GFX9-DL-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX9-DL-NEXT: v_mov_b32_e32 v12, s11 +; GFX9-DL-NEXT: v_add_co_u32_e32 v9, vcc, s10, v6 +; GFX9-DL-NEXT: v_add_co_u32_e64 v11, s[0:1], s10, v0 +; GFX9-DL-NEXT: v_add_co_u32_e64 v13, s[2:3], s10, v2 +; GFX9-DL-NEXT: v_addc_co_u32_e64 v14, s[2:3], v3, v12, s[2:3] +; GFX9-DL-NEXT: v_addc_co_u32_e32 v10, vcc, v7, v12, vcc +; GFX9-DL-NEXT: v_addc_co_u32_e64 v12, vcc, v1, v12, s[0:1] +; GFX9-DL-NEXT: global_load_sbyte v15, v[11:12], off offset:1 +; GFX9-DL-NEXT: global_load_sbyte v16, v[11:12], off offset:2 +; GFX9-DL-NEXT: global_load_sbyte v17, v[11:12], 
off offset:3 +; GFX9-DL-NEXT: global_load_sbyte v18, v[11:12], off offset:4 +; GFX9-DL-NEXT: global_load_sbyte v19, v[11:12], off offset:5 +; GFX9-DL-NEXT: global_load_sbyte v20, v[11:12], off offset:6 +; GFX9-DL-NEXT: global_load_sbyte v21, v[11:12], off offset:7 +; GFX9-DL-NEXT: global_load_sbyte v22, v[13:14], off +; GFX9-DL-NEXT: global_load_sbyte v23, v[13:14], off offset:8 +; GFX9-DL-NEXT: global_load_sbyte v24, v[9:10], off +; GFX9-DL-NEXT: global_load_sbyte v25, v[9:10], off offset:1 +; GFX9-DL-NEXT: global_load_sbyte v26, v[9:10], off offset:2 +; GFX9-DL-NEXT: global_load_sbyte v27, v[9:10], off offset:3 +; GFX9-DL-NEXT: global_load_sbyte v28, v[9:10], off offset:4 +; GFX9-DL-NEXT: global_load_sbyte v29, v[9:10], off offset:5 +; GFX9-DL-NEXT: ; kill: killed $vgpr13 killed $vgpr14 +; GFX9-DL-NEXT: ; kill: killed $vgpr11 killed $vgpr12 +; GFX9-DL-NEXT: global_load_sbyte v11, v[9:10], off offset:6 +; GFX9-DL-NEXT: global_load_sbyte v12, v[9:10], off offset:7 +; GFX9-DL-NEXT: global_load_sbyte v13, v[9:10], off offset:8 +; GFX9-DL-NEXT: s_add_u32 s10, s10, 9 +; GFX9-DL-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-DL-NEXT: s_cmp_lg_u64 s[10:11], 0x48 +; GFX9-DL-NEXT: s_waitcnt vmcnt(16) +; GFX9-DL-NEXT: v_perm_b32 v9, v16, v15, s12 +; GFX9-DL-NEXT: s_waitcnt vmcnt(14) +; GFX9-DL-NEXT: v_perm_b32 v10, v18, v17, s13 +; GFX9-DL-NEXT: v_or_b32_e32 v9, v10, v9 +; GFX9-DL-NEXT: s_waitcnt vmcnt(12) +; GFX9-DL-NEXT: v_perm_b32 v16, v20, v19, s12 +; GFX9-DL-NEXT: s_waitcnt vmcnt(9) +; GFX9-DL-NEXT: v_perm_b32 v17, v23, v21, s13 +; GFX9-DL-NEXT: s_waitcnt vmcnt(8) +; GFX9-DL-NEXT: v_mad_i32_i24 v8, v24, v22, v8 +; GFX9-DL-NEXT: s_waitcnt vmcnt(6) +; GFX9-DL-NEXT: v_perm_b32 v14, v26, v25, s12 +; GFX9-DL-NEXT: s_waitcnt vmcnt(4) +; GFX9-DL-NEXT: v_perm_b32 v15, v28, v27, s13 +; GFX9-DL-NEXT: v_or_b32_e32 v10, v15, v14 +; GFX9-DL-NEXT: v_dot4_i32_i8 v8, v10, v9, v8 +; GFX9-DL-NEXT: s_waitcnt vmcnt(2) +; GFX9-DL-NEXT: v_perm_b32 v11, v11, v29, s12 +; GFX9-DL-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-DL-NEXT: v_perm_b32 v12, v13, v12, s13 +; GFX9-DL-NEXT: v_or_b32_e32 v13, v17, v16 +; GFX9-DL-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX9-DL-NEXT: v_dot4_i32_i8 v8, v11, v13, v8 +; GFX9-DL-NEXT: s_cbranch_scc1 .LBB17_3 +; GFX9-DL-NEXT: ; %bb.4: ; %.110 +; GFX9-DL-NEXT: ; in Loop: Header=BB17_2 Depth=1 +; GFX9-DL-NEXT: v_lshlrev_b64 v[9:10], 2, v[4:5] +; GFX9-DL-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-DL-NEXT: v_add_co_u32_e32 v9, vcc, s4, v9 +; GFX9-DL-NEXT: v_addc_co_u32_e32 v10, vcc, v11, v10, vcc +; GFX9-DL-NEXT: global_store_dword v[9:10], v8, off +; GFX9-DL-NEXT: v_add_co_u32_e32 v8, vcc, 32, v4 +; GFX9-DL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc +; GFX9-DL-NEXT: v_add_co_u32_e32 v0, vcc, 0x900, v0 +; GFX9-DL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-DL-NEXT: v_add_co_u32_e32 v2, vcc, 0x900, v2 +; GFX9-DL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-DL-NEXT: v_cmp_lt_i64_e64 s[0:1], s[8:9], v[4:5] +; GFX9-DL-NEXT: v_add_co_u32_e32 v6, vcc, 0x900, v6 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-DL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-DL-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-DL-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-DL-NEXT: s_cbranch_execnz .LBB17_2 +; GFX9-DL-NEXT: .LBB17_5: ; %._crit_edge +; GFX9-DL-NEXT: s_endpgm +; +; GFX10-DL-LABEL: ByteOffsetCorrectness: +; GFX10-DL: ; %bb.0: ; %.entry +; GFX10-DL-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-DL-NEXT: v_cmp_gt_i64_e32 vcc_lo, 2, v[0:1] +; GFX10-DL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-DL-NEXT: s_cbranch_execz .LBB17_5 +; GFX10-DL-NEXT: ; %bb.1: ; %.lr.ph.preheader +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX10-DL-NEXT: v_mul_hi_u32_u24_e32 v5, 0x900, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, 0x900, v2 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v6, v3, v2 +; GFX10-DL-NEXT: 
v_mul_hi_u32_u24_e32 v2, 0x48, v0 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, 0x48, v0 +; GFX10-DL-NEXT: s_movk_i32 s2, 0xffe1 +; GFX10-DL-NEXT: v_mad_u64_u32 v[3:4], s0, 0x900, v3, v[4:5] +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 5, v6 +; GFX10-DL-NEXT: v_mad_u64_u32 v[6:7], s0, 0x900, v6, v[1:2] +; GFX10-DL-NEXT: s_mov_b32 s3, -1 +; GFX10-DL-NEXT: s_mov_b32 s6, 0 +; GFX10-DL-NEXT: v_mad_u64_u32 v[4:5], s0, 0x48, v0, v[3:4] +; GFX10-DL-NEXT: v_add_co_u32 v0, s0, v8, v0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_add_co_u32 v2, vcc_lo, s8, v6 +; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s9, v7, vcc_lo +; GFX10-DL-NEXT: v_add_co_u32 v4, vcc_lo, s8, v4 +; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s9, v5, vcc_lo +; GFX10-DL-NEXT: v_add_co_u32 v6, vcc_lo, s10, v6 +; GFX10-DL-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, 0, s0 +; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s11, v7, vcc_lo +; GFX10-DL-NEXT: .LBB17_2: ; %.lr.ph +; GFX10-DL-NEXT: ; =>This Loop Header: Depth=1 +; GFX10-DL-NEXT: ; Child Loop BB17_3 Depth 2 +; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-DL-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-DL-NEXT: .LBB17_3: ; %.preheader2 +; GFX10-DL-NEXT: ; Parent Loop BB17_2 Depth=1 +; GFX10-DL-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX10-DL-NEXT: v_add_co_u32 v9, vcc_lo, v4, s0 +; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v5, vcc_lo +; GFX10-DL-NEXT: v_add_co_u32 v11, vcc_lo, v2, s0 +; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, s1, v3, vcc_lo +; GFX10-DL-NEXT: v_add_co_u32 v13, vcc_lo, v6, s0 +; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, s1, v7, vcc_lo +; GFX10-DL-NEXT: s_clause 0x6 +; GFX10-DL-NEXT: global_load_sbyte v15, v[11:12], off offset:1 +; GFX10-DL-NEXT: global_load_sbyte v16, v[11:12], off offset:2 +; GFX10-DL-NEXT: global_load_sbyte v17, v[11:12], off offset:3 +; GFX10-DL-NEXT: global_load_sbyte v18, v[11:12], off offset:4 +; GFX10-DL-NEXT: global_load_sbyte v19, v[11:12], off offset:5 +; GFX10-DL-NEXT: global_load_sbyte 
v20, v[11:12], off offset:6 +; GFX10-DL-NEXT: global_load_sbyte v21, v[11:12], off offset:7 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: global_load_sbyte v22, v[9:10], off +; GFX10-DL-NEXT: global_load_sbyte v23, v[9:10], off offset:8 +; GFX10-DL-NEXT: s_clause 0x8 +; GFX10-DL-NEXT: global_load_sbyte v24, v[13:14], off +; GFX10-DL-NEXT: global_load_sbyte v25, v[13:14], off offset:1 +; GFX10-DL-NEXT: global_load_sbyte v26, v[13:14], off offset:2 +; GFX10-DL-NEXT: global_load_sbyte v27, v[13:14], off offset:3 +; GFX10-DL-NEXT: global_load_sbyte v28, v[13:14], off offset:4 +; GFX10-DL-NEXT: global_load_sbyte v29, v[13:14], off offset:5 +; GFX10-DL-NEXT: ; meta instruction +; GFX10-DL-NEXT: ; meta instruction +; GFX10-DL-NEXT: global_load_sbyte v9, v[13:14], off offset:6 +; GFX10-DL-NEXT: global_load_sbyte v10, v[13:14], off offset:7 +; GFX10-DL-NEXT: global_load_sbyte v11, v[13:14], off offset:8 +; GFX10-DL-NEXT: s_add_u32 s0, s0, 9 +; GFX10-DL-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-DL-NEXT: s_cmp_lg_u64 s[0:1], 0x48 +; GFX10-DL-NEXT: s_waitcnt vmcnt(16) +; GFX10-DL-NEXT: v_perm_b32 v12, v16, v15, 0xc0c0400 +; GFX10-DL-NEXT: s_waitcnt vmcnt(14) +; GFX10-DL-NEXT: v_perm_b32 v13, v18, v17, 0x4000c0c +; GFX10-DL-NEXT: s_waitcnt vmcnt(12) +; GFX10-DL-NEXT: v_perm_b32 v16, v20, v19, 0xc0c0400 +; GFX10-DL-NEXT: s_waitcnt vmcnt(9) +; GFX10-DL-NEXT: v_perm_b32 v17, v23, v21, 0x4000c0c +; GFX10-DL-NEXT: s_waitcnt vmcnt(8) +; GFX10-DL-NEXT: v_mad_i32_i24 v8, v24, v22, v8 +; GFX10-DL-NEXT: s_waitcnt vmcnt(6) +; GFX10-DL-NEXT: v_perm_b32 v14, v26, v25, 0xc0c0400 +; GFX10-DL-NEXT: s_waitcnt vmcnt(4) +; GFX10-DL-NEXT: v_perm_b32 v15, v28, v27, 0x4000c0c +; GFX10-DL-NEXT: s_waitcnt vmcnt(2) +; GFX10-DL-NEXT: v_perm_b32 v9, v9, v29, 0xc0c0400 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_perm_b32 v10, v11, v10, 0x4000c0c +; GFX10-DL-NEXT: v_or_b32_e32 v11, v13, v12 +; GFX10-DL-NEXT: v_or_b32_e32 v12, v15, v14 +; GFX10-DL-NEXT: v_or_b32_e32 v13, v17, v16 +; GFX10-DL-NEXT: 
v_or_b32_e32 v9, v10, v9 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v8, v12, v11 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v8, v9, v13 +; GFX10-DL-NEXT: s_cbranch_scc1 .LBB17_3 +; GFX10-DL-NEXT: ; %bb.4: ; %.110 +; GFX10-DL-NEXT: ; in Loop: Header=BB17_2 Depth=1 +; GFX10-DL-NEXT: v_lshlrev_b64 v[9:10], 2, v[0:1] +; GFX10-DL-NEXT: v_add_co_u32 v6, s0, 0x900, v6 +; GFX10-DL-NEXT: v_add_co_ci_u32_e64 v7, s0, 0, v7, s0 +; GFX10-DL-NEXT: v_add_co_u32 v9, vcc_lo, s4, v9 +; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s5, v10, vcc_lo +; GFX10-DL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 32 +; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v1, vcc_lo +; GFX10-DL-NEXT: v_add_co_u32 v2, vcc_lo, 0x900, v2 +; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-DL-NEXT: v_add_co_u32 v4, vcc_lo, 0x900, v4 +; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo +; GFX10-DL-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, v11 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, v12 +; GFX10-DL-NEXT: global_store_dword v[9:10], v8, off +; GFX10-DL-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-DL-NEXT: s_cbranch_execnz .LBB17_2 +; GFX10-DL-NEXT: .LBB17_5: ; %._crit_edge +; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: ByteOffsetCorrectness: +; GFX11-DL: ; %bb.0: ; %.entry +; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0x3ff, v0 +; GFX11-DL-NEXT: s_mov_b32 s0, exec_lo +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_cmpx_gt_i64_e32 2, v[1:2] +; GFX11-DL-NEXT: s_cbranch_execz .LBB17_5 +; GFX11-DL-NEXT: ; %bb.1: ; %.lr.ph.preheader +; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 20, 10 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-DL-NEXT: v_mul_hi_u32_u24_e32 v4, 0x48, v1 +; GFX11-DL-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 +; GFX11-DL-NEXT: v_mul_hi_u32_u24_e32 v3, 0x900, v5 +; GFX11-DL-NEXT: 
v_mul_u32_u24_e32 v2, 0x900, v5 +; GFX11-DL-NEXT: v_add_nc_u32_e32 v9, v0, v5 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_mad_u64_u32 v[5:6], null, 0x900, v0, v[2:3] +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v3, 0x48, v1 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 5, v9 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_mad_u64_u32 v[7:8], null, 0x900, v9, v[3:4] +; GFX11-DL-NEXT: v_mad_u64_u32 v[9:10], null, 0x48, v1, v[5:6] +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_add_co_u32 v0, s0, v0, v1 +; GFX11-DL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-DL-NEXT: v_add_co_u32 v2, vcc_lo, s4, v7 +; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v8, vcc_lo +; GFX11-DL-NEXT: v_add_co_u32 v4, vcc_lo, s4, v9 +; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s5, v10, vcc_lo +; GFX11-DL-NEXT: v_add_co_u32 v6, vcc_lo, s6, v7 +; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s7, v8, vcc_lo +; GFX11-DL-NEXT: s_movk_i32 s4, 0xffe1 +; GFX11-DL-NEXT: s_mov_b32 s5, -1 +; GFX11-DL-NEXT: s_mov_b32 s6, 0 +; GFX11-DL-NEXT: .LBB17_2: ; %.lr.ph +; GFX11-DL-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-DL-NEXT: ; Child Loop BB17_3 Depth 2 +; GFX11-DL-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-DL-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-DL-NEXT: .LBB17_3: ; %.preheader2 +; GFX11-DL-NEXT: ; Parent Loop BB17_2 Depth=1 +; GFX11-DL-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-DL-NEXT: v_add_co_u32 v9, vcc_lo, v4, s0 +; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v5, vcc_lo +; GFX11-DL-NEXT: v_add_co_u32 v11, vcc_lo, v2, s0 +; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, s1, v3, vcc_lo +; GFX11-DL-NEXT: v_add_co_u32 v13, vcc_lo, v6, s0 +; GFX11-DL-NEXT: 
v_add_co_ci_u32_e32 v14, vcc_lo, s1, v7, vcc_lo +; GFX11-DL-NEXT: s_clause 0x6 +; GFX11-DL-NEXT: global_load_i8 v15, v[11:12], off offset:1 +; GFX11-DL-NEXT: global_load_i8 v16, v[11:12], off offset:2 +; GFX11-DL-NEXT: global_load_i8 v17, v[11:12], off offset:3 +; GFX11-DL-NEXT: global_load_i8 v18, v[11:12], off offset:4 +; GFX11-DL-NEXT: global_load_i8 v19, v[11:12], off offset:5 +; GFX11-DL-NEXT: global_load_i8 v20, v[11:12], off offset:6 +; GFX11-DL-NEXT: global_load_i8 v11, v[11:12], off offset:7 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_i8 v12, v[9:10], off +; GFX11-DL-NEXT: global_load_i8 v9, v[9:10], off offset:8 +; GFX11-DL-NEXT: s_clause 0x8 +; GFX11-DL-NEXT: global_load_i8 v10, v[13:14], off +; GFX11-DL-NEXT: global_load_i8 v21, v[13:14], off offset:1 +; GFX11-DL-NEXT: global_load_i8 v22, v[13:14], off offset:2 +; GFX11-DL-NEXT: global_load_i8 v23, v[13:14], off offset:3 +; GFX11-DL-NEXT: global_load_i8 v24, v[13:14], off offset:4 +; GFX11-DL-NEXT: global_load_i8 v25, v[13:14], off offset:5 +; GFX11-DL-NEXT: global_load_i8 v26, v[13:14], off offset:6 +; GFX11-DL-NEXT: global_load_i8 v27, v[13:14], off offset:7 +; GFX11-DL-NEXT: global_load_i8 v13, v[13:14], off offset:8 +; GFX11-DL-NEXT: s_add_u32 s0, s0, 9 +; GFX11-DL-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-DL-NEXT: s_cmp_lg_u64 s[0:1], 0x48 +; GFX11-DL-NEXT: s_waitcnt vmcnt(9) +; GFX11-DL-NEXT: v_perm_b32 v9, v9, v11, 0x4000c0c +; GFX11-DL-NEXT: s_waitcnt vmcnt(8) +; GFX11-DL-NEXT: v_mad_i32_i24 v8, v10, v12, v8 +; GFX11-DL-NEXT: v_perm_b32 v10, v16, v15, 0xc0c0400 +; GFX11-DL-NEXT: v_perm_b32 v12, v18, v17, 0x4000c0c +; GFX11-DL-NEXT: s_waitcnt vmcnt(6) +; GFX11-DL-NEXT: v_perm_b32 v14, v22, v21, 0xc0c0400 +; GFX11-DL-NEXT: s_waitcnt vmcnt(4) +; GFX11-DL-NEXT: v_perm_b32 v15, v24, v23, 0x4000c0c +; GFX11-DL-NEXT: v_perm_b32 v16, v20, v19, 0xc0c0400 +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_perm_b32 v11, v26, v25, 
0xc0c0400 +; GFX11-DL-NEXT: v_or_b32_e32 v10, v12, v10 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_perm_b32 v13, v13, v27, 0x4000c0c +; GFX11-DL-NEXT: v_or_b32_e32 v12, v15, v14 +; GFX11-DL-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_or_b32_e32 v11, v13, v11 +; GFX11-DL-NEXT: v_dot4_i32_iu8 v8, v12, v10, v8 neg_lo:[1,1,0] +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_dot4_i32_iu8 v8, v11, v9, v8 neg_lo:[1,1,0] +; GFX11-DL-NEXT: s_cbranch_scc1 .LBB17_3 +; GFX11-DL-NEXT: ; %bb.4: ; %.110 +; GFX11-DL-NEXT: ; in Loop: Header=BB17_2 Depth=1 +; GFX11-DL-NEXT: v_lshlrev_b64 v[9:10], 2, v[0:1] +; GFX11-DL-NEXT: v_add_co_u32 v6, s0, 0x900, v6 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_add_co_ci_u32_e64 v7, s0, 0, v7, s0 +; GFX11-DL-NEXT: v_add_co_u32 v9, vcc_lo, s2, v9 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s3, v10, vcc_lo +; GFX11-DL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 32 +; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v1, vcc_lo +; GFX11-DL-NEXT: v_add_co_u32 v2, vcc_lo, 0x900, v2 +; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX11-DL-NEXT: v_add_co_u32 v4, vcc_lo, 0x900, v4 +; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo +; GFX11-DL-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-DL-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX11-DL-NEXT: global_store_b32 v[9:10], v8, off +; GFX11-DL-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-DL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-DL-NEXT: s_cbranch_execnz .LBB17_2 +; GFX11-DL-NEXT: .LBB17_5: ; %._crit_edge +; GFX11-DL-NEXT: s_endpgm +.entry: + %workitemx = tail call i32 @llvm.amdgcn.workitem.id.x() + %sworkitemx = sext i32 %workitemx to i64 + %workitemy 
= tail call i32 @llvm.amdgcn.workitem.id.y() + %sworkitemy = sext i32 %workitemy to i64 + %workitemz = tail call i32 @llvm.amdgcn.workitem.id.z() + %sworkitemz = sext i32 %workitemz to i64 + %ivtemp0 = add nsw i64 %sworkitemy, %sworkitemz + %ivtemp1 = shl nsw i64 %ivtemp0, 5 + %iv = add nsw i64 %ivtemp1, %sworkitemx + %cmp = icmp slt i64 %sworkitemx, 2 + br i1 %cmp, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %.entry, %.110 + %phi = phi i64 [ %outerlimit, %.110 ], [ %iv, %.entry ] + %outptr = getelementptr i32, ptr addrspace(1) %inptr2, i64 %phi + %scalarmul = mul nsw i64 %phi, 72 + br label %.preheader2 + +.preheader2: ; preds = %.lr.ph, %.preheader2 + %phi1 = phi i64 [ 0, %.lr.ph ], [ %limit, %.preheader2 ] + %.lcssa4.lcssa67 = phi i32 [ 0, %.lr.ph ], [ %ivadd9, %.preheader2 ] + %mul0 = mul nuw nsw i64 %phi1, 9 + %scalaradd = add nsw i64 %mul0, %scalarmul + %gep10 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %scalaradd + %l10 = load i8, ptr addrspace(1) %gep10, align 1 + %gep11 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %scalaradd + %l11 = load i8, ptr addrspace(1) %gep11, align 1 + %op11 = sext i8 %l10 to i32 + %op10 = sext i8 %l11 to i32 + %mul1 = mul nsw i32 %op10, %op11 + %ivadd1 = add i32 %mul1, %.lcssa4.lcssa67 + %off2 = add nsw i64 %scalaradd, 1 + %gep21 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off2 + %l21 = load i8, ptr addrspace(1) %gep21, align 1 + %gep20 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off2 + %l20 = load i8, ptr addrspace(1) %gep20, align 1 + %op21 = sext i8 %l21 to i32 + %op20 = sext i8 %l20 to i32 + %mul2 = mul nsw i32 %op20, %op21 + %ivadd2 = add i32 %mul2, %ivadd1 + %off3 = add nsw i64 %scalaradd, 2 + %gep31 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off3 + %l31 = load i8, ptr addrspace(1) %gep31, align 1 + %gep30 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off3 + %l30 = load i8, ptr addrspace(1) %gep30, align 1 + %op31 = sext i8 %l31 to i32 + %op30 = sext i8 %l30 to i32 + %mul3 = 
mul nsw i32 %op30, %op31 + %ivadd3 = add i32 %mul3, %ivadd2 + %off4 = add nsw i64 %scalaradd, 3 + %gep41 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off4 + %l41 = load i8, ptr addrspace(1) %gep41, align 1 + %gep40 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off4 + %l40 = load i8, ptr addrspace(1) %gep40, align 1 + %op41 = sext i8 %l41 to i32 + %op40 = sext i8 %l40 to i32 + %mul4 = mul nsw i32 %op40, %op41 + %ivadd4 = add i32 %mul4, %ivadd3 + %off5 = add nsw i64 %scalaradd, 4 + %gep51 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off5 + %l51 = load i8, ptr addrspace(1) %gep51, align 1 + %gep50 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off5 + %l50 = load i8, ptr addrspace(1) %gep50, align 1 + %op51 = sext i8 %l51 to i32 + %op50 = sext i8 %l50 to i32 + %mul5 = mul nsw i32 %op50, %op51 + %ivadd5 = add i32 %mul5, %ivadd4 + %off6 = add nsw i64 %scalaradd, 5 + %gep61 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off6 + %l61 = load i8, ptr addrspace(1) %gep61, align 1 + %gep60 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off6 + %l60 = load i8, ptr addrspace(1) %gep60, align 1 + %op61 = sext i8 %l61 to i32 + %op60 = sext i8 %l60 to i32 + %mul6 = mul nsw i32 %op60, %op61 + %ivadd6 = add i32 %mul6, %ivadd5 + %off7 = add nsw i64 %scalaradd, 6 + %gep71 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off7 + %l71 = load i8, ptr addrspace(1) %gep71, align 1 + %gep70 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off7 + %l70 = load i8, ptr addrspace(1) %gep70, align 1 + %op71 = sext i8 %l71 to i32 + %op70 = sext i8 %l70 to i32 + %mul7 = mul nsw i32 %op70, %op71 + %ivadd7 = add i32 %mul7, %ivadd6 + %off8 = add nsw i64 %scalaradd, 7 + %gep81 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off8 + %l81 = load i8, ptr addrspace(1) %gep81, align 1 + %gep80 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off8 + %l80 = load i8, ptr addrspace(1) %gep80, align 1 + %op81 = sext i8 %l81 to i32 + %op80 = sext i8 %l80 to i32 + %mul8 = mul nsw 
i32 %op80, %op81 + %ivadd8 = add i32 %mul8, %ivadd7 + %off9 = add nsw i64 %scalaradd, 8 + %gep91 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off9 + %l91 = load i8, ptr addrspace(1) %gep91, align 1 + %gep90 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off9 + %l90 = load i8, ptr addrspace(1) %gep90, align 1 + %op91 = sext i8 %l91 to i32 + %op90 = sext i8 %l90 to i32 + %mul9 = mul nsw i32 %op90, %op91 + %ivadd9 = add i32 %mul9, %ivadd8 + %limit = add nuw nsw i64 %phi1, 1 + %exitcond.not = icmp eq i64 %limit, 8 + br i1 %exitcond.not, label %.110, label %.preheader2 + +.110: ; preds = %.preheader2 + store i32 %ivadd9, ptr addrspace(1) %outptr, align 4 + %outerlimit = add nsw i64 %phi, 32 + %outerexitcond = icmp slt i64 %phi, -30 + br i1 %outerexitcond, label %.lr.ph, label %._crit_edge + +._crit_edge: ; preds = %.110, %.entry + ret void +} + + declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.workitem.id.y() +declare i32 @llvm.amdgcn.workitem.id.z()