Skip to content

Commit d46998b

Browse files
authored
[DAG] Add ISD::VECTOR_COMPRESS handling in computeKnownBits/ComputeNumSignBits (llvm#159692)
Resolves llvm#158332
1 parent 34dfbb0 commit d46998b

File tree

5 files changed

+413
-0
lines changed

5 files changed

+413
-0
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3480,6 +3480,17 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
       break;
     }
     break;
+  case ISD::VECTOR_COMPRESS: {
+    SDValue Vec = Op.getOperand(0);
+    SDValue PassThru = Op.getOperand(2);
+    Known = computeKnownBits(PassThru, DemandedElts, Depth + 1);
+    // If we don't know any bits, early out.
+    if (Known.isUnknown())
+      break;
+    Known2 = computeKnownBits(Vec, Depth + 1);
+    Known = Known.intersectWith(Known2);
+    break;
+  }
   case ISD::VECTOR_SHUFFLE: {
     assert(!Op.getValueType().isScalableVector());
     // Collect the known bits that are shared by every vector element referenced
@@ -4789,6 +4800,17 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
     }
     return Tmp;
 
+  case ISD::VECTOR_COMPRESS: {
+    SDValue Vec = Op.getOperand(0);
+    SDValue PassThru = Op.getOperand(2);
+    Tmp = ComputeNumSignBits(PassThru, DemandedElts, Depth + 1);
+    if (Tmp == 1)
+      return 1;
+    Tmp2 = ComputeNumSignBits(Vec, Depth + 1);
+    Tmp = std::min(Tmp, Tmp2);
+    return Tmp;
+  }
+
   case ISD::VECTOR_SHUFFLE: {
     // Collect the minimum number of sign bits that are shared by every vector
     // element referenced by the shuffle.

llvm/test/CodeGen/AArch64/sve-vector-compress.ll

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,42 @@ define <vscale x 4 x i4> @test_compress_illegal_element_type(<vscale x 4 x i4> %
   ret <vscale x 4 x i4> %out
 }
 
+define <vscale x 4 x i32> @test_compress_knownbits_zext(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_knownbits_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.s, z0.s, #0xffff
+; CHECK-NEXT:    cntp x8, p0, p0.s
+; CHECK-NEXT:    and z1.s, z1.s, #0x3
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %xvec = zext <vscale x 4 x i16> %vec to <vscale x 4 x i32>
+  %xpassthru = and <vscale x 4 x i32> %passthru, splat (i32 3)
+  %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %xvec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %xpassthru)
+  %res = and <vscale x 4 x i32> %out, splat (i32 65535)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @test_compress_numsignbits_sext(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_numsignbits_sext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    and z1.s, z1.s, #0x3
+; CHECK-NEXT:    cntp x8, p0, p0.s
+; CHECK-NEXT:    sxth z0.s, p1/m, z0.s
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %xvec = sext <vscale x 4 x i16> %vec to <vscale x 4 x i32>
+  %xpassthru = and <vscale x 4 x i32> %passthru, splat (i32 3)
+  %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %xvec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %xpassthru)
+  %shl = shl <vscale x 4 x i32> %out, splat (i32 16)
+  %res = ashr <vscale x 4 x i32> %shl, splat (i32 16)
+  ret <vscale x 4 x i32> %res
+}
+
 define <vscale x 8 x i32> @test_compress_large(<vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask) {
 ; CHECK-LABEL: test_compress_large:
 ; CHECK:       // %bb.0:

llvm/test/CodeGen/AArch64/vector-compress.ll

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,3 +471,116 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i
   %out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef)
   ret <3 x i3> %out
 }
+
+define <4 x i32> @test_compress_knownbits_zext_v4i16_4i32(<4 x i16> %vec, <4 x i1> %mask, <4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_knownbits_zext_v4i16_4i32:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    ushll.4s v1, v1, #0
+; CHECK-NEXT:    movi.4s v3, #1
+; CHECK-NEXT:    mov x14, sp
+; CHECK-NEXT:    movi.4s v4, #3
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    mov x13, sp
+; CHECK-NEXT:    mov x12, sp
+; CHECK-NEXT:    mov x15, sp
+; CHECK-NEXT:    shl.4s v1, v1, #31
+; CHECK-NEXT:    and.16b v2, v2, v4
+; CHECK-NEXT:    cmlt.4s v1, v1, #0
+; CHECK-NEXT:    str q2, [sp]
+; CHECK-NEXT:    and.16b v3, v1, v3
+; CHECK-NEXT:    mov.s w8, v1[1]
+; CHECK-NEXT:    mov.s w9, v1[2]
+; CHECK-NEXT:    mov.s w10, v1[3]
+; CHECK-NEXT:    fmov w11, s1
+; CHECK-NEXT:    addv.4s s1, v3
+; CHECK-NEXT:    and x16, x11, #0x1
+; CHECK-NEXT:    and x8, x8, #0x1
+; CHECK-NEXT:    bfi x14, x11, #2, #1
+; CHECK-NEXT:    add x8, x16, x8
+; CHECK-NEXT:    and x9, x9, #0x1
+; CHECK-NEXT:    and x10, x10, #0x1
+; CHECK-NEXT:    fmov w11, s1
+; CHECK-NEXT:    add x9, x8, x9
+; CHECK-NEXT:    mov w16, #3 ; =0x3
+; CHECK-NEXT:    add x10, x9, x10
+; CHECK-NEXT:    orr x8, x12, x8, lsl #2
+; CHECK-NEXT:    bfi x15, x9, #2, #2
+; CHECK-NEXT:    cmp x10, #3
+; CHECK-NEXT:    bfi x13, x11, #2, #2
+; CHECK-NEXT:    mov.s w11, v0[3]
+; CHECK-NEXT:    csel x9, x10, x16, lo
+; CHECK-NEXT:    ldr w13, [x13]
+; CHECK-NEXT:    str s0, [sp]
+; CHECK-NEXT:    st1.s { v0 }[1], [x14]
+; CHECK-NEXT:    st1.s { v0 }[2], [x8]
+; CHECK-NEXT:    orr x8, x12, x9, lsl #2
+; CHECK-NEXT:    csel w9, w11, w13, hi
+; CHECK-NEXT:    st1.s { v0 }[3], [x15]
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+entry:
+  %xvec = zext <4 x i16> %vec to <4 x i32>
+  %xpassthru = and <4 x i32> %passthru, splat (i32 3)
+  %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %xvec, <4 x i1> %mask, <4 x i32> %xpassthru)
+  %res = and <4 x i32> %out, splat (i32 65535)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_compress_numsignbits_sext_v4i16_4i32(<4 x i16> %vec, <4 x i1> %mask, <4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_numsignbits_sext_v4i16_4i32:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    ushll.4s v1, v1, #0
+; CHECK-NEXT:    movi.4s v3, #1
+; CHECK-NEXT:    mov x14, sp
+; CHECK-NEXT:    movi.4s v4, #3
+; CHECK-NEXT:    sshll.4s v0, v0, #0
+; CHECK-NEXT:    mov x13, sp
+; CHECK-NEXT:    mov x12, sp
+; CHECK-NEXT:    mov x15, sp
+; CHECK-NEXT:    shl.4s v1, v1, #31
+; CHECK-NEXT:    and.16b v2, v2, v4
+; CHECK-NEXT:    cmlt.4s v1, v1, #0
+; CHECK-NEXT:    str q2, [sp]
+; CHECK-NEXT:    and.16b v3, v1, v3
+; CHECK-NEXT:    mov.s w8, v1[1]
+; CHECK-NEXT:    mov.s w9, v1[2]
+; CHECK-NEXT:    mov.s w10, v1[3]
+; CHECK-NEXT:    fmov w11, s1
+; CHECK-NEXT:    addv.4s s1, v3
+; CHECK-NEXT:    and x16, x11, #0x1
+; CHECK-NEXT:    and x8, x8, #0x1
+; CHECK-NEXT:    bfi x14, x11, #2, #1
+; CHECK-NEXT:    add x8, x16, x8
+; CHECK-NEXT:    and x9, x9, #0x1
+; CHECK-NEXT:    and x10, x10, #0x1
+; CHECK-NEXT:    fmov w11, s1
+; CHECK-NEXT:    add x9, x8, x9
+; CHECK-NEXT:    mov w16, #3 ; =0x3
+; CHECK-NEXT:    add x10, x9, x10
+; CHECK-NEXT:    orr x8, x12, x8, lsl #2
+; CHECK-NEXT:    bfi x15, x9, #2, #2
+; CHECK-NEXT:    cmp x10, #3
+; CHECK-NEXT:    bfi x13, x11, #2, #2
+; CHECK-NEXT:    mov.s w11, v0[3]
+; CHECK-NEXT:    csel x9, x10, x16, lo
+; CHECK-NEXT:    ldr w13, [x13]
+; CHECK-NEXT:    str s0, [sp]
+; CHECK-NEXT:    st1.s { v0 }[1], [x14]
+; CHECK-NEXT:    st1.s { v0 }[2], [x8]
+; CHECK-NEXT:    orr x8, x12, x9, lsl #2
+; CHECK-NEXT:    csel w9, w11, w13, hi
+; CHECK-NEXT:    st1.s { v0 }[3], [x15]
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+entry:
+  %xvec = sext <4 x i16> %vec to <4 x i32>
+  %xpassthru = and <4 x i32> %passthru, splat(i32 3)
+  %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %xvec, <4 x i1> %mask, <4 x i32> %xpassthru)
+  %shl = shl <4 x i32> %out, splat(i32 16)
+  %res = ashr <4 x i32> %shl, splat(i32 16)
+  ret <4 x i32> %res
+}

llvm/test/CodeGen/RISCV/rvv/vector-compress.ll

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,39 @@ define <vscale x 4 x i32> @vector_compress_nxv4i32_passthru(<vscale x 4 x i32> %
   ret <vscale x 4 x i32> %ret
 }
 
+define <vscale x 4 x i32> @test_compress_nvx8f64_knownbits(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_nvx8f64_knownbits:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    vand.vi v8, v10, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, tu, ma
+; CHECK-NEXT:    vcompress.vm v8, v12, v0
+; CHECK-NEXT:    ret
+  %xvec = zext <vscale x 4 x i16> %vec to <vscale x 4 x i32>
+  %xpassthru = and <vscale x 4 x i32> %passthru, splat (i32 3)
+  %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %xvec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %xpassthru)
+  %res = and <vscale x 4 x i32> %out, splat (i32 65535)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @test_compress_nv8xf64_numsignbits(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_nv8xf64_numsignbits:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v12, v8
+; CHECK-NEXT:    vand.vi v8, v10, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, tu, ma
+; CHECK-NEXT:    vcompress.vm v8, v12, v0
+; CHECK-NEXT:    ret
+  %xvec = sext <vscale x 4 x i16> %vec to <vscale x 4 x i32>
+  %xpassthru = and <vscale x 4 x i32> %passthru, splat (i32 3)
+  %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %xvec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %xpassthru)
+  %shl = shl <vscale x 4 x i32> %out, splat (i32 16)
+  %res = ashr <vscale x 4 x i32> %shl, splat (i32 16)
+  ret <vscale x 4 x i32> %res
+}
+
 define <vscale x 8 x i32> @vector_compress_nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x i1> %mask) {
 ; CHECK-LABEL: vector_compress_nxv8i32:
 ; CHECK:       # %bb.0:

0 commit comments

Comments
 (0)