Skip to content

Commit add9079

Browse files
authored
[LoongArch] Broadcast repeated subsequence in build_vector instead of inserting per element (llvm#154533)
1 parent 7d1adab commit add9079

File tree

8 files changed

+222
-774
lines changed

8 files changed

+222
-774
lines changed

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2650,6 +2650,7 @@ static SDValue lowerBUILD_VECTORAsBroadCastLoad(BuildVectorSDNode *BVOp,
26502650
SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
26512651
SelectionDAG &DAG) const {
26522652
BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op);
2653+
MVT VT = Node->getSimpleValueType(0);
26532654
EVT ResTy = Op->getValueType(0);
26542655
unsigned NumElts = ResTy.getVectorNumElements();
26552656
SDLoc DL(Op);
@@ -2744,6 +2745,66 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
27442745
}
27452746

27462747
if (!IsConstant) {
2748+
// If the BUILD_VECTOR has a repeated pattern, use INSERT_VECTOR_ELT to fill
2749+
// the sub-sequence of the vector and then broadcast the sub-sequence.
2750+
//
2751+
// TODO: If the BUILD_VECTOR contains undef elements, consider falling
2752+
// back to using INSERT_VECTOR_ELT to materialize the vector, because it
2753+
// generates worse code in some cases. This could be further optimized
2754+
// with more consideration.
2755+
SmallVector<SDValue> Sequence;
2756+
BitVector UndefElements;
2757+
if (Node->getRepeatedSequence(Sequence, &UndefElements) &&
2758+
UndefElements.count() == 0) {
2759+
SDValue Vector = DAG.getUNDEF(ResTy);
2760+
SDValue FillVec = Vector;
2761+
EVT FillTy = ResTy;
2762+
2763+
// Use LSX instructions to fill the sub-sequence of a 256-bit vector,
2764+
// because the high part can be simply treated as undef.
2765+
if (Is256Vec) {
2766+
FillTy = ResTy.getHalfNumVectorElementsVT(*DAG.getContext());
2767+
FillVec = DAG.getExtractSubvector(DL, FillTy, Vector, 0);
2768+
}
2769+
2770+
SDValue Op0 = Sequence[0];
2771+
unsigned SeqLen = Sequence.size();
2772+
if (!Op0.isUndef())
2773+
FillVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, FillTy, Op0);
2774+
for (unsigned i = 1; i < SeqLen; ++i) {
2775+
SDValue Opi = Sequence[i];
2776+
if (Opi.isUndef())
2777+
continue;
2778+
FillVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, FillTy, FillVec, Opi,
2779+
DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
2780+
}
2781+
2782+
unsigned SplatLen = NumElts / SeqLen;
2783+
MVT SplatEltTy = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
2784+
MVT SplatTy = MVT::getVectorVT(SplatEltTy, SplatLen);
2785+
2786+
// If the size of the sub-sequence is half of a 256-bit vector, bitcast the
2787+
// vector to v4i64 type in order to match the pattern of XVREPLVE0Q.
2788+
if (SplatEltTy == MVT::i128)
2789+
SplatTy = MVT::v4i64;
2790+
2791+
SDValue SplatVec;
2792+
SDValue SrcVec = DAG.getBitcast(
2793+
SplatTy,
2794+
Is256Vec ? DAG.getInsertSubvector(DL, Vector, FillVec, 0) : FillVec);
2795+
if (Is256Vec) {
2796+
SplatVec =
2797+
DAG.getNode((SplatEltTy == MVT::i128) ? LoongArchISD::XVREPLVE0Q
2798+
: LoongArchISD::XVREPLVE0,
2799+
DL, SplatTy, SrcVec);
2800+
} else {
2801+
SplatVec = DAG.getNode(LoongArchISD::VREPLVEI, DL, SplatTy, SrcVec,
2802+
DAG.getConstant(0, DL, Subtarget.getGRLenVT()));
2803+
}
2804+
2805+
return DAG.getBitcast(ResTy, SplatVec);
2806+
}
2807+
27472808
// Use INSERT_VECTOR_ELT operations rather than expand to stores.
27482809
// The resulting code is the same length as the expansion, but it doesn't
27492810
// use memory operations.
@@ -7110,6 +7171,8 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
71107171
NODE_NAME_CASE(VREPLGR2VR)
71117172
NODE_NAME_CASE(XVPERMI)
71127173
NODE_NAME_CASE(XVPERM)
7174+
NODE_NAME_CASE(XVREPLVE0)
7175+
NODE_NAME_CASE(XVREPLVE0Q)
71137176
NODE_NAME_CASE(VPICK_SEXT_ELT)
71147177
NODE_NAME_CASE(VPICK_ZEXT_ELT)
71157178
NODE_NAME_CASE(VREPLVE)

llvm/lib/Target/LoongArch/LoongArchISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,8 @@ enum NodeType : unsigned {
146146
VREPLGR2VR,
147147
XVPERMI,
148148
XVPERM,
149+
XVREPLVE0,
150+
XVREPLVE0Q,
149151

150152
// Extended vector element extraction
151153
VPICK_SEXT_ELT,

llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,14 @@
1212

1313
def SDT_LoongArchXVPERM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
1414
SDTCisVec<2>, SDTCisInt<2>]>;
15+
def SDT_LoongArchXVREPLVE0 : SDTypeProfile<1, 1, [SDTCisVec<0>,
16+
SDTCisSameAs<0, 1>]>;
1517

1618
// Target nodes.
1719
def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_LoongArchV1RUimm>;
1820
def loongarch_xvperm: SDNode<"LoongArchISD::XVPERM", SDT_LoongArchXVPERM>;
21+
def loongarch_xvreplve0: SDNode<"LoongArchISD::XVREPLVE0", SDT_LoongArchXVREPLVE0>;
22+
def loongarch_xvreplve0q: SDNode<"LoongArchISD::XVREPLVE0Q", SDT_LoongArchXVREPLVE0>;
1923
def loongarch_xvmskltz: SDNode<"LoongArchISD::XVMSKLTZ", SDT_LoongArchVMSKCOND>;
2024
def loongarch_xvmskgez: SDNode<"LoongArchISD::XVMSKGEZ", SDT_LoongArchVMSKCOND>;
2125
def loongarch_xvmskeqz: SDNode<"LoongArchISD::XVMSKEQZ", SDT_LoongArchVMSKCOND>;
@@ -1886,11 +1890,26 @@ def : Pat<(loongarch_xvperm v8i32:$xj, v8i32:$xk),
18861890
def : Pat<(loongarch_xvperm v8f32:$xj, v8i32:$xk),
18871891
(XVPERM_W v8f32:$xj, v8i32:$xk)>;
18881892

1889-
// XVREPLVE0_{W/D}
1893+
// XVREPLVE0_{B/H/W/D/Q}
1894+
def : Pat<(loongarch_xvreplve0 v32i8:$xj),
1895+
(XVREPLVE0_B v32i8:$xj)>;
1896+
def : Pat<(loongarch_xvreplve0 v16i16:$xj),
1897+
(XVREPLVE0_H v16i16:$xj)>;
1898+
def : Pat<(loongarch_xvreplve0 v8i32:$xj),
1899+
(XVREPLVE0_W v8i32:$xj)>;
1900+
def : Pat<(loongarch_xvreplve0 v4i64:$xj),
1901+
(XVREPLVE0_D v4i64:$xj)>;
1902+
def : Pat<(loongarch_xvreplve0 v8f32:$xj),
1903+
(XVREPLVE0_W v8f32:$xj)>;
1904+
def : Pat<(loongarch_xvreplve0 v4f64:$xj),
1905+
(XVREPLVE0_D v4f64:$xj)>;
18901906
def : Pat<(lasxsplatf32 FPR32:$fj),
18911907
(XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>;
18921908
def : Pat<(lasxsplatf64 FPR64:$fj),
18931909
(XVREPLVE0_D (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64))>;
1910+
foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in
1911+
def : Pat<(vt (loongarch_xvreplve0q LASX256:$xj)),
1912+
(XVREPLVE0_Q LASX256:$xj)>;
18941913

18951914
// VSTELM
18961915
defm : VstelmPat<truncstorei8, v32i8, XVSTELM_B, simm8, uimm5>;

llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -82,11 +82,11 @@ def loongarch_vmskgez: SDNode<"LoongArchISD::VMSKGEZ", SDT_LoongArchVMSKCOND>;
8282
def loongarch_vmskeqz: SDNode<"LoongArchISD::VMSKEQZ", SDT_LoongArchVMSKCOND>;
8383
def loongarch_vmsknez: SDNode<"LoongArchISD::VMSKNEZ", SDT_LoongArchVMSKCOND>;
8484

85-
def immZExt1 : ImmLeaf<i64, [{return isUInt<1>(Imm);}]>;
86-
def immZExt2 : ImmLeaf<i64, [{return isUInt<2>(Imm);}]>;
87-
def immZExt3 : ImmLeaf<i64, [{return isUInt<3>(Imm);}]>;
88-
def immZExt4 : ImmLeaf<i64, [{return isUInt<4>(Imm);}]>;
89-
def immZExt8 : ImmLeaf<i64, [{return isUInt<8>(Imm);}]>;
85+
def immZExt1 : ImmLeaf<GRLenVT, [{return isUInt<1>(Imm);}]>;
86+
def immZExt2 : ImmLeaf<GRLenVT, [{return isUInt<2>(Imm);}]>;
87+
def immZExt3 : ImmLeaf<GRLenVT, [{return isUInt<3>(Imm);}]>;
88+
def immZExt4 : ImmLeaf<GRLenVT, [{return isUInt<4>(Imm);}]>;
89+
def immZExt8 : ImmLeaf<GRLenVT, [{return isUInt<8>(Imm);}]>;
9090

9191
class VecCond<SDPatternOperator OpNode, ValueType TyNode,
9292
RegisterClass RC = LSX128>
@@ -2026,15 +2026,15 @@ def : Pat<(loongarch_vilvh v4f32:$vj, v4f32:$vk),
20262026
def : Pat<(loongarch_vilvh v2f64:$vj, v2f64:$vk),
20272027
(VILVH_D v2f64:$vj, v2f64:$vk)>;
20282028

2029-
// VSHUF4I_{B/H/W}
2029+
// VSHUF4I_{B/H/W/D}
20302030
def : Pat<(loongarch_vshuf4i v16i8:$vj, immZExt8:$ui8),
20312031
(VSHUF4I_B v16i8:$vj, immZExt8:$ui8)>;
20322032
def : Pat<(loongarch_vshuf4i v8i16:$vj, immZExt8:$ui8),
2033-
(VSHUF4I_H v8i16:$vj, immZExt8:$ui8)>;
2033+
(VSHUF4I_H v8i16:$vj, immZExt8:$ui8)>;
20342034
def : Pat<(loongarch_vshuf4i v4i32:$vj, immZExt8:$ui8),
2035-
(VSHUF4I_W v4i32:$vj, immZExt8:$ui8)>;
2035+
(VSHUF4I_W v4i32:$vj, immZExt8:$ui8)>;
20362036
def : Pat<(loongarch_vshuf4i v4f32:$vj, immZExt8:$ui8),
2037-
(VSHUF4I_W v4f32:$vj, immZExt8:$ui8)>;
2037+
(VSHUF4I_W v4f32:$vj, immZExt8:$ui8)>;
20382038
def : Pat<(loongarch_vshuf4i_d v2i64:$vj, v2i64:$vk, immZExt8:$ui8),
20392039
(VSHUF4I_D v2i64:$vj, v2i64:$vk, immZExt8:$ui8)>;
20402040
def : Pat<(loongarch_vshuf4i_d v2f64:$vj, v2f64:$vk, immZExt8:$ui8),
@@ -2044,15 +2044,15 @@ def : Pat<(loongarch_vshuf4i_d v2f64:$vj, v2f64:$vk, immZExt8:$ui8),
20442044
def : Pat<(loongarch_vreplvei v16i8:$vj, immZExt4:$ui4),
20452045
(VREPLVEI_B v16i8:$vj, immZExt4:$ui4)>;
20462046
def : Pat<(loongarch_vreplvei v8i16:$vj, immZExt3:$ui3),
2047-
(VREPLVEI_H v8i16:$vj, immZExt3:$ui3)>;
2047+
(VREPLVEI_H v8i16:$vj, immZExt3:$ui3)>;
20482048
def : Pat<(loongarch_vreplvei v4i32:$vj, immZExt2:$ui2),
2049-
(VREPLVEI_W v4i32:$vj, immZExt2:$ui2)>;
2049+
(VREPLVEI_W v4i32:$vj, immZExt2:$ui2)>;
20502050
def : Pat<(loongarch_vreplvei v2i64:$vj, immZExt1:$ui1),
2051-
(VREPLVEI_D v2i64:$vj, immZExt1:$ui1)>;
2051+
(VREPLVEI_D v2i64:$vj, immZExt1:$ui1)>;
20522052
def : Pat<(loongarch_vreplvei v4f32:$vj, immZExt2:$ui2),
2053-
(VREPLVEI_W v4f32:$vj, immZExt2:$ui2)>;
2053+
(VREPLVEI_W v4f32:$vj, immZExt2:$ui2)>;
20542054
def : Pat<(loongarch_vreplvei v2f64:$vj, immZExt1:$ui1),
2055-
(VREPLVEI_D v2f64:$vj, immZExt1:$ui1)>;
2055+
(VREPLVEI_D v2f64:$vj, immZExt1:$ui1)>;
20562056

20572057
// VREPLVEI_{W/D}
20582058
def : Pat<(lsxsplatf32 FPR32:$fj),

llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll

Lines changed: 12 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,9 @@ define <4 x i64> @should_not_be_optimized(ptr %ptr, ptr %dst) {
99
; LA32-NEXT: ld.w $a2, $a0, 0
1010
; LA32-NEXT: ld.w $a0, $a0, 4
1111
; LA32-NEXT: st.w $a2, $a1, 0
12-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a2, 0
13-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 1
14-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a2, 2
15-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 3
16-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a2, 4
17-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 5
18-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a2, 6
19-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 7
12+
; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0
13+
; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1
14+
; LA32-NEXT: xvreplve0.d $xr0, $xr0
2015
; LA32-NEXT: st.w $a0, $a1, 4
2116
; LA32-NEXT: ret
2217
;
@@ -64,14 +59,9 @@ define <4 x i64> @xvldrepl_d_unaligned_offset(ptr %ptr) {
6459
; LA32: # %bb.0:
6560
; LA32-NEXT: ld.w $a1, $a0, 4
6661
; LA32-NEXT: ld.w $a0, $a0, 8
67-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a1, 0
68-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 1
69-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a1, 2
70-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 3
71-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a1, 4
72-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 5
73-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a1, 6
74-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 7
62+
; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 0
63+
; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1
64+
; LA32-NEXT: xvreplve0.d $xr0, $xr0
7565
; LA32-NEXT: ret
7666
;
7767
; LA64-LABEL: xvldrepl_d_unaligned_offset:
@@ -162,14 +152,9 @@ define <4 x i64> @xvldrepl_d(ptr %ptr) {
162152
; LA32: # %bb.0:
163153
; LA32-NEXT: ld.w $a1, $a0, 0
164154
; LA32-NEXT: ld.w $a0, $a0, 4
165-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a1, 0
166-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 1
167-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a1, 2
168-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 3
169-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a1, 4
170-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 5
171-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a1, 6
172-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 7
155+
; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 0
156+
; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1
157+
; LA32-NEXT: xvreplve0.d $xr0, $xr0
173158
; LA32-NEXT: ret
174159
;
175160
; LA64-LABEL: xvldrepl_d:
@@ -187,14 +172,9 @@ define <4 x i64> @xvldrepl_d_offset(ptr %ptr) {
187172
; LA32: # %bb.0:
188173
; LA32-NEXT: ld.w $a1, $a0, 264
189174
; LA32-NEXT: ld.w $a0, $a0, 268
190-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a1, 0
191-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 1
192-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a1, 2
193-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 3
194-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a1, 4
195-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 5
196-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a1, 6
197-
; LA32-NEXT: xvinsgr2vr.w $xr0, $a0, 7
175+
; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 0
176+
; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1
177+
; LA32-NEXT: xvreplve0.d $xr0, $xr0
198178
; LA32-NEXT: ret
199179
;
200180
; LA64-LABEL: xvldrepl_d_offset:

0 commit comments

Comments
 (0)