Skip to content

Commit f9b080c

Browse files
authored
[LoongArch] Custom legalizing ConstantFP to avoid float loads (llvm#158050)
This commit custom-legalizes `ConstantFP` using a code sequence rather than simply loading the FP values from the constant pool. A new option (`-loongarch-materialize-float-imm=<enum>`) is added to set the maximum number of instructions (including the code sequence that generates the value and the move of the value to an FPR) allowed to be used when materializing floating-point immediates. The default value of the option is `3` on both LA32 and LA64, which means: - For `f32` on both LA32 and LA64: `2 insts + movgr2fr.w` (covers all `f32` values); - For `f64` on LA64: `2 insts + movgr2fr.d`; - For `f64` on LA32: `1 inst + movgr2fr.w + movgr2frh.w` (same instruction latency as using the constant pool). The option accepts the values `0` and `2`-`6` (`6` behaves the same as `5` on LA64).
1 parent d46998b commit f9b080c

15 files changed

+1658
-1121
lines changed

llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ def NotBoolXor : PatFrags<(ops node:$val),
1717
// LoongArch specific DAG Nodes.
1818
//===----------------------------------------------------------------------===//
1919

20+
// Type profile used by LoongArchISD::MOVGR2FR_W: one result constrained to
// f32 and one operand constrained to i32.
def SDT_LoongArchMOVGR2FR_W
21+
: SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i32>]>;
2022
def SDT_LoongArchMOVGR2FR_W_LA64
2123
: SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i64>]>;
2224
def SDT_LoongArchMOVFR2GR_S_LA64
@@ -28,6 +30,8 @@ def SDT_LoongArchFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
2830
// ISD::BRCOND is custom-lowered to LoongArchISD::BRCOND for floating-point
2931
// comparisons to prevent recursive lowering.
3032
def loongarch_brcond : SDNode<"LoongArchISD::BRCOND", SDTBrcond, [SDNPHasChain]>;
33+
// DAG node for LoongArchISD::MOVGR2FR_W, typed by SDT_LoongArchMOVGR2FR_W
// (i32 operand, f32 result). No node properties are attached.
def loongarch_movgr2fr_w
34+
: SDNode<"LoongArchISD::MOVGR2FR_W", SDT_LoongArchMOVGR2FR_W>;
3135
def loongarch_movgr2fr_w_la64
3236
: SDNode<"LoongArchISD::MOVGR2FR_W_LA64", SDT_LoongArchMOVGR2FR_W_LA64>;
3337
def loongarch_movfr2gr_s_la64
@@ -185,6 +189,14 @@ def : PatFpr<fneg, FNEG_S, FPR32>;
185189
def : PatFpr<fabs, FABS_S, FPR32>;
186190
def : PatFpr<fsqrt, FSQRT_S, FPR32>;
187191
def : Pat<(fdiv fpimm1, (fsqrt FPR32:$fj)), (FRSQRT_S FPR32:$fj)>;
192+
// Select FRSQRT_S for 1.0 / sqrt(x) when the dividend is a GPR->FPR move of
// the integer constant 1065353216 (0x3F800000, the IEEE-754 single-precision
// bit pattern of 1.0) instead of an fpimm1 constant. The LA64 variant matches
// the i64-operand node, the LA32 variant the i32-operand node.
let Predicates = [HasBasicF, IsLA64] in {
193+
def : Pat<(fdiv (loongarch_movgr2fr_w_la64 (i64 1065353216)), (fsqrt FPR32:$fj)),
194+
(FRSQRT_S FPR32:$fj)>;
195+
} // Predicates = [HasBasicF, IsLA64]
196+
let Predicates = [HasBasicF, IsLA32] in {
197+
def : Pat<(fdiv (loongarch_movgr2fr_w (i32 1065353216)), (fsqrt FPR32:$fj)),
198+
(FRSQRT_S FPR32:$fj)>;
199+
} // Predicates = [HasBasicF, IsLA32]
188200
def : Pat<(fcanonicalize FPR32:$fj), (FMAX_S $fj, $fj)>;
189201
def : Pat<(is_fpclass FPR32:$fj, (i32 timm:$mask)),
190202
(SLTU R0, (ANDI (MOVFR2GR_S (FCLASS_S FPR32:$fj)),
@@ -295,6 +307,14 @@ def : Pat<(loongarch_ftint FPR32:$src), (FTINTRZ_W_S FPR32:$src)>;
295307

296308
// FP reciprocal operation
297309
def : Pat<(fdiv fpimm1, FPR32:$src), (FRECIP_S $src)>;
310+
// Select FRECIP_S for 1.0 / x when the dividend is a GPR->FPR move of the
// integer constant 1065353216 (0x3F800000, the IEEE-754 single-precision bit
// pattern of 1.0) instead of an fpimm1 constant. The LA64 variant matches the
// i64-operand node, the LA32 variant the i32-operand node.
let Predicates = [HasBasicF, IsLA64] in {
311+
def : Pat<(fdiv (loongarch_movgr2fr_w_la64 (i64 1065353216)), FPR32:$src),
312+
(FRECIP_S $src)>;
313+
} // Predicates = [HasBasicF, IsLA64]
314+
let Predicates = [HasBasicF, IsLA32] in {
315+
def : Pat<(fdiv (loongarch_movgr2fr_w (i32 1065353216)), FPR32:$src),
316+
(FRECIP_S $src)>;
317+
} // Predicates = [HasBasicF, IsLA32]
298318

299319
let Predicates = [HasFrecipe] in {
300320
// FP approximate reciprocal operation
@@ -350,6 +370,7 @@ def : PatFpr<frint, FRINT_S, FPR32>;
350370
let Predicates = [HasBasicF, IsLA32] in {
351371
// GPR -> FPR
352372
def : Pat<(bitconvert (i32 GPR:$src)), (MOVGR2FR_W GPR:$src)>;
373+
// Select the MOVGR2FR_W instruction for a loongarch_movgr2fr_w node with an
// i32 GPR operand.
def : Pat<(loongarch_movgr2fr_w (i32 GPR:$src)), (MOVGR2FR_W GPR:$src)>;
353374
// FPR -> GPR
354375
def : Pat<(i32 (bitconvert FPR32:$src)), (MOVFR2GR_S FPR32:$src)>;
355376
// int -> f32

llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,21 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13+
//===----------------------------------------------------------------------===//
14+
// LoongArch specific DAG Nodes.
15+
//===----------------------------------------------------------------------===//
16+
17+
// Type profile for MOVGR2FR_D: one f64 result, one i64 operand.
def SDT_LoongArchMOVGR2FR_D
18+
: SDTypeProfile<1, 1, [SDTCisVT<0, f64>, SDTCisVT<1, i64>]>;
19+
// Type profile for MOVGR2FR_D_LO_HI: one f64 result and two operands; the
// first operand is i32 and the second must have the same type as the first.
def SDT_LoongArchMOVGR2FR_D_LO_HI
20+
: SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
21+
SDTCisSameAs<1, 2>]>;
22+
23+
// DAG node that moves a 64-bit integer value into an f64 value.
def loongarch_movgr2fr_d
24+
: SDNode<"LoongArchISD::MOVGR2FR_D", SDT_LoongArchMOVGR2FR_D>;
// DAG node that produces an f64 value from two i32 operands (used as
// $lo, $hi by the selection pattern in this file).
def loongarch_movgr2fr_d_lo_hi
26+
: SDNode<"LoongArchISD::MOVGR2FR_D_LO_HI", SDT_LoongArchMOVGR2FR_D_LO_HI>;
27+
1328
//===----------------------------------------------------------------------===//
1429
// Instructions
1530
//===----------------------------------------------------------------------===//
@@ -147,6 +162,11 @@ def : PatFpr<fneg, FNEG_D, FPR64>;
147162
def : PatFpr<fabs, FABS_D, FPR64>;
148163
def : PatFpr<fsqrt, FSQRT_D, FPR64>;
149164
def : Pat<(fdiv fpimm1, (fsqrt FPR64:$fj)), (FRSQRT_D FPR64:$fj)>;
165+
// On LA32, select FRSQRT_D for 1.0 / sqrt(x) when the dividend is built by
// MOVGR2FR_D_LO_HI from the two 32-bit halves of double-precision 1.0
// (0x3FF0000000000000): low word 0, high word 1072693248 (0x3FF00000).
let Predicates = [IsLA32] in {
166+
def : Pat<(fdiv (loongarch_movgr2fr_d_lo_hi (i32 0), (i32 1072693248)),
167+
(fsqrt FPR64:$fj)),
168+
(FRSQRT_D FPR64:$fj)>;
169+
} // Predicates = [IsLA32]
150170
def : Pat<(fcopysign FPR64:$fj, FPR32:$fk),
151171
(FCOPYSIGN_D FPR64:$fj, (FCVT_D_S FPR32:$fk))>;
152172
def : Pat<(fcopysign FPR32:$fj, FPR64:$fk),
@@ -252,6 +272,10 @@ def : Pat<(f64 (fpextend FPR32:$src)), (FCVT_D_S FPR32:$src)>;
252272

253273
// FP reciprocal operation
254274
def : Pat<(fdiv fpimm1, FPR64:$src), (FRECIP_D $src)>;
275+
// On LA32, select FRECIP_D for 1.0 / x when the dividend is built by
// MOVGR2FR_D_LO_HI from the two 32-bit halves of double-precision 1.0
// (0x3FF0000000000000): low word 0, high word 1072693248 (0x3FF00000).
let Predicates = [IsLA32] in {
276+
def : Pat<(fdiv (loongarch_movgr2fr_d_lo_hi (i32 0), (i32 1072693248)), FPR64:$src),
277+
(FRECIP_D FPR64:$src)>;
278+
} // Predicates = [IsLA32]
255279

256280
let Predicates = [HasFrecipe] in {
257281
// FP approximate reciprocal operation
@@ -307,9 +331,13 @@ def : Pat<(f64 (sint_to_fp (i64 (sexti32 (i64 GPR:$src))))),
307331
def : Pat<(f64 (sint_to_fp GPR:$src)), (FFINT_D_L (MOVGR2FR_D GPR:$src))>;
308332

309333
def : Pat<(bitconvert GPR:$src), (MOVGR2FR_D GPR:$src)>;
334+
// Select the MOVGR2FR_D instruction for a loongarch_movgr2fr_d node.
def : Pat<(loongarch_movgr2fr_d GPR:$src), (MOVGR2FR_D GPR:$src)>;
310335
} // Predicates = [HasBasicD, IsLA64]
311336
let Predicates = [HasBasicD, IsLA32] in {
312337
def : Pat<(f64 (sint_to_fp (i32 GPR:$src))), (FFINT_D_W (MOVGR2FR_W GPR:$src))>;
338+
339+
// Build an f64 from two i32 GPR values with two instructions: MOVGR2FR_W_64
// takes $lo, and MOVGR2FRH_W combines that result with $hi.
// NOTE(review): presumably MOVGR2FRH_W writes $hi into the upper 32 bits of
// the FPR produced from $lo -- confirm against the instruction definitions.
def : Pat<(f64 (loongarch_movgr2fr_d_lo_hi (i32 GPR:$lo), (i32 GPR:$hi))),
340+
(MOVGR2FRH_W (MOVGR2FR_W_64 GPR:$lo), GPR:$hi)>;
313341
} // Predicates = [HasBasicD, IsLA32]
314342

315343
// Convert FP to int

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "LoongArchSubtarget.h"
1919
#include "MCTargetDesc/LoongArchBaseInfo.h"
2020
#include "MCTargetDesc/LoongArchMCTargetDesc.h"
21+
#include "MCTargetDesc/LoongArchMatInt.h"
2122
#include "llvm/ADT/SmallSet.h"
2223
#include "llvm/ADT/Statistic.h"
2324
#include "llvm/ADT/StringExtras.h"
@@ -41,6 +42,34 @@ using namespace llvm;
4142

4243
STATISTIC(NumTailCalls, "Number of tail calls");
4344

45+
// Values accepted by the -loongarch-materialize-float-imm option. Each
// enumerator's numeric value is the instruction budget it stands for, so it
// can be compared directly against an instruction count; 0 means "do not
// materialize" (the option describes it as "Use constant pool"). There is no
// enumerator for 1.
enum MaterializeFPImm {
46+
NoMaterializeFPImm = 0,
47+
MaterializeFPImm2Ins = 2,
48+
MaterializeFPImm3Ins = 3,
49+
MaterializeFPImm4Ins = 4,
50+
MaterializeFPImm5Ins = 5,
51+
MaterializeFPImm6Ins = 6
52+
};
53+
54+
// Hidden command-line option "loongarch-materialize-float-imm": the maximum
// number of instructions (value generation plus the move into an FPR) that
// may be used to materialize a floating-point immediate. Defaults to
// MaterializeFPImm3Ins; the accepted spellings are "0", "2", "3", "4", "5"
// and "6".
static cl::opt<MaterializeFPImm> MaterializeFPImmInsNum(
55+
"loongarch-materialize-float-imm", cl::Hidden,
56+
cl::desc("Maximum number of instructions used (including code sequence "
57+
"to generate the value and moving the value to FPR) when "
58+
"materializing floating-point immediates (default = 3)"),
59+
cl::init(MaterializeFPImm3Ins),
60+
cl::values(clEnumValN(NoMaterializeFPImm, "0", "Use constant pool"),
61+
clEnumValN(MaterializeFPImm2Ins, "2",
62+
"Materialize FP immediate within 2 instructions"),
63+
clEnumValN(MaterializeFPImm3Ins, "3",
64+
"Materialize FP immediate within 3 instructions"),
65+
clEnumValN(MaterializeFPImm4Ins, "4",
66+
"Materialize FP immediate within 4 instructions"),
67+
clEnumValN(MaterializeFPImm5Ins, "5",
68+
"Materialize FP immediate within 5 instructions"),
69+
clEnumValN(MaterializeFPImm6Ins, "6",
70+
"Materialize FP immediate within 6 instructions "
71+
"(behaves same as 5 on loongarch64)")));
72+
4473
static cl::opt<bool> ZeroDivCheck("loongarch-check-zero-division", cl::Hidden,
4574
cl::desc("Trap on integer division by zero."),
4675
cl::init(false));
@@ -190,6 +219,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
190219
setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
191220
setCondCodeAction(FPCCToExpand, MVT::f32, Expand);
192221

222+
setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
193223
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
194224
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
195225
setOperationAction(ISD::FMA, MVT::f32, Legal);
@@ -237,6 +267,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
237267
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
238268
setCondCodeAction(FPCCToExpand, MVT::f64, Expand);
239269

270+
setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
240271
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
241272
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
242273
setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);
@@ -557,10 +588,67 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
557588
case ISD::VECREDUCE_UMAX:
558589
case ISD::VECREDUCE_UMIN:
559590
return lowerVECREDUCE(Op, DAG);
591+
case ISD::ConstantFP:
592+
return lowerConstantFP(Op, DAG);
560593
}
561594
return SDValue();
562595
}
563596

597+
// Custom lowering for ISD::ConstantFP (f32 / f64).
//
// Builds the constant's bit pattern as an integer constant and moves it into
// a floating-point register through one of the LoongArchISD::MOVGR2FR_* nodes.
//
// Returns an empty SDValue (no custom lowering) when:
//   - the value is +0.0 or -0.0,
//   - isFPImmVLDILegal() accepts the value, or
//   - the estimated instruction count exceeds MaterializeFPImmInsNum and the
//     value is not exactly +1.0.
// Otherwise returns the MOVGR2FR_* node that produces the value.
SDValue LoongArchTargetLowering::lowerConstantFP(SDValue Op,
598+
SelectionDAG &DAG) const {
599+
EVT VT = Op.getValueType();
600+
ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
601+
const APFloat &FPVal = CFP->getValueAPF();
602+
SDLoc DL(CFP);
603+
604+
assert((VT == MVT::f32 && Subtarget.hasBasicF()) ||
605+
(VT == MVT::f64 && Subtarget.hasBasicD()));
606+
607+
// Zero (either +0.0 or -0.0) is not materialized here: return an empty
// SDValue and leave the node as it is.
608+
if (FPVal.isZero())
609+
return SDValue();
610+
611+
// If the immediate is legal for 'vldi' (LSX), do not lower it here so the
// cheaper 'vldi' instruction can be used instead.
612+
if (isFPImmVLDILegal(FPVal, VT))
613+
return SDValue();
614+
615+
// Take the raw bit pattern of the FP value; it is built as an integer and
// then moved to a floating-point register.
616+
APInt INTVal = FPVal.bitcastToAPInt();
617+
618+
// If more than MaterializeFPImmInsNum instructions will be used to
619+
// generate the INTVal and move it to float register, fallback to
620+
// use floating point load from the constant pool.
// The move itself costs 2 instructions for f64 on a non-64-bit subtarget
// (the lo/hi pair) and 1 instruction otherwise. +1.0 is always
// materialized regardless of the budget -- presumably so the fdiv-by-1.0
// patterns that match MOVGR2FR_* nodes can still select the reciprocal
// instructions (NOTE(review): confirm).
// NOTE(review): for f32 the estimate uses the sign-extended 32-bit pattern,
// while the LA64 f32 path below zero-extends the constant to i64; verify
// the two need the same number of instructions.
621+
auto Seq = LoongArchMatInt::generateInstSeq(INTVal.getSExtValue());
622+
int InsNum = Seq.size() + ((VT == MVT::f64 && !Subtarget.is64Bit()) ? 2 : 1);
623+
if (InsNum > MaterializeFPImmInsNum && !FPVal.isExactlyValue(+1.0))
624+
return SDValue();
625+
626+
switch (VT.getSimpleVT().SimpleTy) {
627+
default:
628+
llvm_unreachable("Unexpected floating point type!");
629+
// NOTE(review): this break follows llvm_unreachable and is never reached.
break;
630+
case MVT::f32: {
631+
// f32: i32 constant; on a 64-bit subtarget it is zero-extended to i64 and
// the _LA64 variant of the node is used.
SDValue NewVal = DAG.getConstant(INTVal, DL, MVT::i32);
632+
if (Subtarget.is64Bit())
633+
NewVal = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, NewVal);
634+
return DAG.getNode(Subtarget.is64Bit() ? LoongArchISD::MOVGR2FR_W_LA64
635+
: LoongArchISD::MOVGR2FR_W,
636+
DL, VT, NewVal);
637+
}
638+
case MVT::f64: {
639+
// f64 on a 64-bit subtarget: a single i64 constant moved with MOVGR2FR_D.
if (Subtarget.is64Bit()) {
640+
SDValue NewVal = DAG.getConstant(INTVal, DL, MVT::i64);
641+
return DAG.getNode(LoongArchISD::MOVGR2FR_D, DL, VT, NewVal);
642+
}
643+
// f64 otherwise: split the 64-bit pattern into its low and high 32-bit
// halves and combine them with MOVGR2FR_D_LO_HI (operand order: Lo, Hi).
SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
644+
SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
645+
return DAG.getNode(LoongArchISD::MOVGR2FR_D_LO_HI, DL, VT, Lo, Hi);
646+
}
647+
}
648+
649+
// Every switch path above returns or is unreachable, so control does not
// get here.
return SDValue();
650+
}
651+
564652
// Lower vecreduce_add using vhaddw instructions.
565653
// For Example:
566654
// call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
@@ -7152,7 +7240,10 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
71527240
NODE_NAME_CASE(SRL_W)
71537241
NODE_NAME_CASE(BSTRINS)
71547242
NODE_NAME_CASE(BSTRPICK)
7243+
NODE_NAME_CASE(MOVGR2FR_W)
71557244
NODE_NAME_CASE(MOVGR2FR_W_LA64)
7245+
NODE_NAME_CASE(MOVGR2FR_D)
7246+
NODE_NAME_CASE(MOVGR2FR_D_LO_HI)
71567247
NODE_NAME_CASE(MOVFR2GR_S_LA64)
71577248
NODE_NAME_CASE(FTINT)
71587249
NODE_NAME_CASE(BUILD_PAIR_F64)

llvm/lib/Target/LoongArch/LoongArchISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,10 @@ enum NodeType : unsigned {
5757
MOD_WU,
5858

5959
// FPR<->GPR transfer operations
60+
MOVGR2FR_W,
6061
MOVGR2FR_W_LA64,
62+
MOVGR2FR_D,
63+
MOVGR2FR_D_LO_HI,
6164
MOVFR2GR_S_LA64,
6265
MOVFCSR2GR,
6366
MOVGR2FCSR,
@@ -399,6 +402,7 @@ class LoongArchTargetLowering : public TargetLowering {
399402
SDValue lowerBF16_TO_FP(SDValue Op, SelectionDAG &DAG) const;
400403
SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
401404
SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
405+
SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
402406

403407
bool isFPImmLegal(const APFloat &Imm, EVT VT,
404408
bool ForCodeSize) const override;

0 commit comments

Comments
 (0)