-
Notifications
You must be signed in to change notification settings - Fork 11.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Change-Id: Ifa2ee3caaf13bc563119f79a241c3231557d401f
- Loading branch information
Showing
1 changed file
with
116 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 --start-before=amdgpu-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s | ||
|
||
; The first (A) operand of the v_dot4 is derived from the LHS of the mul chain (that is %l6080, %l7081, %l8082, %l9083). | ||
; These are the elements at zero-based indices 5, 6, 7 and 8 of the load %7. | ||
; Confirm that we are actually accessing these bytes. | ||
; | ||
; Previously, we used the dword offset from the corresponding byte in the second (B) operand. | ||
; The result was to access byte 3 of %7 instead of byte 7 (zero-based; i.e. a dword offset of 0 instead of 1). | ||
|
||
; Kernel whose body is a four-term sum of llvm.amdgcn.mul.i24 products of
; sign-extended i8 values; the final sum is stored to %outptr.
; %inptr2 is not referenced anywhere in the body.
define amdgpu_kernel void @ByteOffsetCorrectness(ptr addrspace(1) noalias readonly align 16 %inptr0, ptr addrspace(1) noalias readonly align 16 %inptr1, ptr addrspace(1) noalias align 16 %inptr2, ptr addrspace(1) %outptr) local_unnamed_addr #0 { | ||
; GFX11-LABEL: ByteOffsetCorrectness: | ||
; GFX11: ; %bb.0: ; %.entry | ||
; GFX11-NEXT: v_bfe_u32 v2, v0, 20, 10 | ||
; GFX11-NEXT: s_clause 0x1 | ||
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 | ||
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c | ||
; GFX11-NEXT: v_bfe_u32 v6, v0, 10, 10 | ||
; GFX11-NEXT: v_and_b32_e32 v7, 0x3ff, v0 | ||
; GFX11-NEXT: v_mul_hi_u32_u24_e32 v1, 0x900, v2 | ||
; GFX11-NEXT: v_mul_u32_u24_e32 v0, 0x900, v2 | ||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) | ||
; GFX11-NEXT: v_add_nc_u32_e32 v8, v6, v2 | ||
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0x900, v6, v[0:1] | ||
; GFX11-NEXT: v_mov_b32_e32 v6, 0 | ||
; GFX11-NEXT: v_mul_hi_u32_u24_e32 v3, 0x48, v7 | ||
; GFX11-NEXT: v_mul_u32_u24_e32 v2, 0x48, v7 | ||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) | ||
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, 0x900, v8, v[2:3] | ||
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x48, v7, v[4:5] | ||
; GFX11-NEXT: s_waitcnt lgkmcnt(0) | ||
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, s4, v0 | ||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) | ||
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s5, v1, vcc_lo | ||
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s4, v2 | ||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | ||
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v3, vcc_lo | ||
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, s6, v0 | ||
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo | ||
; GFX11-NEXT: s_clause 0x2 | ||
; GFX11-NEXT: global_load_i8 v7, v[4:5], off offset:7 | ||
; GFX11-NEXT: global_load_i8 v2, v[2:3], off offset:8 | ||
; GFX11-NEXT: global_load_d16_b16 v6, v[4:5], off offset:5 | ||
; GFX11-NEXT: s_clause 0x1 | ||
; GFX11-NEXT: global_load_i8 v3, v[0:1], off offset:8 | ||
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off | ||
; GFX11-NEXT: s_waitcnt vmcnt(0) | ||
; GFX11-NEXT: v_perm_b32 v0, v2, v7, 0x4000c0c | ||
; GFX11-NEXT: v_perm_b32 v2, v6, v6, 0xc0c0100 | ||
; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x4030201 | ||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) | ||
; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 | ||
; GFX11-NEXT: v_mov_b32_e32 v2, 0 | ||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v1, v0, 0 neg_lo:[1,1,0] | ||
; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] | ||
; GFX11-NEXT: s_endpgm | ||
.entry: | ||
; The kernarg segment pointer is requested here, but its result is not used
; by any later instruction in this function.
%ByteOffsetCorrectness.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() | ||
; Work-item ids in x, y and z, each sign-extended to i64 for the address
; arithmetic below.
%workitemx = tail call i32 @llvm.amdgcn.workitem.id.x() | ||
%sworkitemx = sext i32 %workitemx to i64 | ||
%workitemy = tail call i32 @llvm.amdgcn.workitem.id.y() | ||
%sworkitemy = sext i32 %workitemy to i64 | ||
%workitemz = tail call i32 @llvm.amdgcn.workitem.id.z() | ||
%sworkitemz = sext i32 %workitemz to i64 | ||
; %ivtemp0 = y + z feeds the offset %0 below. %ivtemp1 and %iv feed no load,
; store or address computation in this function.
%ivtemp0 = add i64 %sworkitemy, %sworkitemz | ||
%ivtemp1 = shl nsw i64 %ivtemp0, 5 | ||
%iv = add nsw i64 %ivtemp1, %sworkitemx | ||
; %2 = (y + z) * 2304 + x * 72. This byte offset is applied to %inptr0
; (%scevgep) and, further down, to %inptr1 (%scevgep55).
%0 = mul nsw i64 %ivtemp0, 2304 | ||
%1 = mul nsw i64 %sworkitemx, 72 | ||
%2 = add i64 %0, %1 | ||
%scevgep = getelementptr i8, ptr addrspace(1) %inptr0, i64 %2 | ||
; %6 = y * 2304 + z * 2304 + x * 72, computed separately from %2 and applied
; to %inptr0 to form %scevgep49.
%3 = mul nsw i64 %sworkitemy, 2304 | ||
%4 = mul nsw i64 %sworkitemz, 2304 | ||
%5 = add i64 %3, %4 | ||
%6 = add i64 %5, %1 | ||
%scevgep49 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %6 | ||
%scevgep55 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %2 | ||
; %scevgep54 is %scevgep49 + 0. The byte %l10 loaded from it is never used.
%scevgep54 = getelementptr i8, ptr addrspace(1) %scevgep49, i64 0 | ||
%l10 = load i8, ptr addrspace(1) %scevgep54, align 1 | ||
; Left-hand multiplicands: elements 5, 6, 7 and 8 of a 9-byte vector loaded
; from %scevgep55 + 0.
%scevgep58 = getelementptr i8, ptr addrspace(1) %scevgep55, i64 0 | ||
%7 = load <9 x i8>, ptr addrspace(1) %scevgep58, align 1 | ||
%l6080 = extractelement <9 x i8> %7, i32 5 | ||
%l7081 = extractelement <9 x i8> %7, i32 6 | ||
%l8082 = extractelement <9 x i8> %7, i32 7 | ||
%l9083 = extractelement <9 x i8> %7, i32 8 | ||
; Right-hand multiplicands for the first three products: elements 4, 5 and 6
; of a 7-byte vector loaded from %scevgep + 1, i.e. the bytes at offsets 5, 6
; and 7 from %scevgep.
%scevgep35 = getelementptr i8, ptr addrspace(1) %scevgep, i64 0 | ||
%scevgep36 = getelementptr i8, ptr addrspace(1) %scevgep35, i64 1 | ||
%8 = load <7 x i8>, ptr addrspace(1) %scevgep36, align 1 | ||
%l6188 = extractelement <7 x i8> %8, i32 4 | ||
%l7189 = extractelement <7 x i8> %8, i32 5 | ||
%l8190 = extractelement <7 x i8> %8, i32 6 | ||
; Product chain: each term is mul.i24(sext(lhs), sext(rhs)) added to the
; running sum. The first addend is the literal 0.
%op61 = sext i8 %l6188 to i32 | ||
%op60 = sext i8 %l6080 to i32 | ||
%mul6 = call i32 @llvm.amdgcn.mul.i24.i32(i32 %op60, i32 %op61) | ||
%ivadd6 = add i32 %mul6, 0 | ||
%op71 = sext i8 %l7189 to i32 | ||
%op70 = sext i8 %l7081 to i32 | ||
%mul7 = call i32 @llvm.amdgcn.mul.i24.i32(i32 %op70, i32 %op71) | ||
%ivadd7 = add i32 %mul7, %ivadd6 | ||
%op81 = sext i8 %l8190 to i32 | ||
%op80 = sext i8 %l8082 to i32 | ||
%mul8 = call i32 @llvm.amdgcn.mul.i24.i32(i32 %op80, i32 %op81) | ||
%ivadd8 = add i32 %mul8, %ivadd7 | ||
; The fourth right-hand multiplicand is a separate scalar byte loaded from
; %scevgep54 + 8, not an element of the <7 x i8> vector %8.
%scevgep53 = getelementptr i8, ptr addrspace(1) %scevgep54, i64 8 | ||
%l91 = load i8, ptr addrspace(1) %scevgep53, align 1 | ||
%op91 = sext i8 %l91 to i32 | ||
%op90 = sext i8 %l9083 to i32 | ||
%mul9 = call i32 @llvm.amdgcn.mul.i24.i32(i32 %op90, i32 %op91) | ||
%ivadd9 = add i32 %mul9, %ivadd8 | ||
; Write the accumulated sum to the output pointer.
store i32 %ivadd9, ptr addrspace(1) %outptr, align 4 | ||
ret void | ||
} | ||
|
||
declare i32 @llvm.amdgcn.workitem.id.x() | ||
declare i32 @llvm.amdgcn.workitem.id.y() | ||
declare i32 @llvm.amdgcn.workitem.id.z() |