[X86] isShuffleFoldableLoad - only check that the SDValue has one use (#126900)

We don't need the entire load node to have one use, just the loaded value. This prevents load chains from interfering with shuffle commutation.
RKSimon authored Feb 12, 2025
1 parent b101c35 commit 7647f47
Showing 6 changed files with 106 additions and 132 deletions.
2 changes: 1 addition & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12480,7 +12480,7 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
- return V->hasOneUse() &&
+ return V.hasOneUse() &&
ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
}
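
The change hinges on which `hasOneUse()` overload is called: `V->hasOneUse()` goes through `SDValue::operator->` to `SDNode::hasOneUse()`, which counts users of every result the node produces (for a load, both the loaded value and the output chain), whereas `V.hasOneUse()` is `SDValue::hasOneUse()`, which counts users of that one result only. Below is a minimal illustrative sketch, not code from the commit; the helper name is hypothetical, and only the `SDValue`/`SDNode` calls are real SelectionDAG API.

```cpp
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hypothetical helper (not in LLVM) contrasting the two overloads.
// A LoadSDNode produces two results: result 0 is the loaded value and
// result 1 is the output chain. When loads are threaded into a chain,
// the chain result gains extra users even though the loaded value may
// feed exactly one shuffle operand.
static bool loadedValueHasSingleUser(SDValue V) {
  // SDNode::hasOneUse(): counts users of *all* results (value and chain),
  // so a chained load can look multi-use and block the fold/commutation.
  bool WholeNodeHasOneUse = V->hasOneUse();
  (void)WholeNodeHasOneUse; // shown only for contrast

  // SDValue::hasOneUse(): counts users of this specific result number,
  // which is the property isShuffleFoldableLoad() actually needs.
  return V.hasOneUse();
}
```

The test updates below are consistent with this: with the loaded value now recognized as single-use, several AVX512 cases commute the shuffle and fold the load directly, e.g. a single `vpermd (%rdi), %zmm0, %zmm0` replaces a separate `vmovdqa` plus `vpermi2d`.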

@@ -1665,10 +1665,9 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i
;
; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; AVX512F-FAST: # %bb.0:
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7]
-; AVX512F-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1
-; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0
+; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15]
+; AVX512F-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0
+; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
@@ -1684,10 +1683,9 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i
;
; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; AVX512DQ-FAST: # %bb.0:
-; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7]
-; AVX512DQ-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1
-; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15]
+; AVX512DQ-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-FAST-NEXT: vzeroupper
; AVX512DQ-FAST-NEXT: retq
@@ -1703,10 +1701,9 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i
;
; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7]
-; AVX512BW-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15]
+; AVX512BW-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -201,8 +201,8 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
-; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
+; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
@@ -260,8 +260,8 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
@@ -319,8 +319,8 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
@@ -378,8 +378,8 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
26 changes: 12 additions & 14 deletions llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -403,13 +403,12 @@ define void @PR39483() {
;
; X86-AVX512-LABEL: PR39483:
; X86-AVX512: # %bb.0: # %entry
-; X86-AVX512-NEXT: vmovups 0, %zmm0
-; X86-AVX512-NEXT: vmovups 64, %ymm1
-; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23]
-; X86-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2
-; X86-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; X86-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1
-; X86-AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; X86-AVX512-NEXT: vmovups 64, %ymm0
+; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [18,21,24,27,30,1,4,7]
+; X86-AVX512-NEXT: vpermt2ps 0, %zmm1, %zmm0
+; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; X86-AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-AVX512-NEXT: vmovups %ymm0, (%eax)
;
; X64-AVX1-LABEL: PR39483:
@@ -444,13 +443,12 @@ define void @PR39483() {
;
; X64-AVX512-LABEL: PR39483:
; X64-AVX512: # %bb.0: # %entry
-; X64-AVX512-NEXT: vmovups 0, %zmm0
-; X64-AVX512-NEXT: vmovups 64, %ymm1
-; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23]
-; X64-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2
-; X64-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; X64-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1
-; X64-AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; X64-AVX512-NEXT: vmovups 64, %ymm0
+; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [18,21,24,27,30,1,4,7]
+; X64-AVX512-NEXT: vpermt2ps 0, %zmm1, %zmm0
+; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vmovups %ymm0, (%rax)
entry:
%wide.vec = load <24 x float>, ptr null, align 4
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/X86/vselect-avx.ll
@@ -377,14 +377,14 @@ define void @vselect_concat_splat() {
; AVX512-NEXT: vmovaps %ymm2, %ymm3
; AVX512-NEXT: vpermi2ps %ymm1, %ymm0, %ymm3
; AVX512-NEXT: vmovups 32, %xmm4
-; AVX512-NEXT: vmovups 0, %ymm5
-; AVX512-NEXT: vxorps %xmm6, %xmm6, %xmm6
-; AVX512-NEXT: vcmpneqps %xmm6, %xmm3, %k0
+; AVX512-NEXT: vxorps %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vcmpneqps %xmm5, %xmm3, %k0
; AVX512-NEXT: kshiftlw $4, %k0, %k1
; AVX512-NEXT: korw %k1, %k0, %k1
-; AVX512-NEXT: vpermt2ps %ymm4, %ymm2, %ymm5
; AVX512-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
-; AVX512-NEXT: vmovaps %ymm5, %ymm0 {%k1}
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,11,14,1,9,12,15,2]
+; AVX512-NEXT: vpermi2ps 0, %ymm4, %ymm1
+; AVX512-NEXT: vmovaps %ymm1, %ymm0 {%k1}
; AVX512-NEXT: vmovups %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
