AMDGPU/GlobalISel: Disable LCSSA pass
Disable the LCSSA pass in preparation for implementing temporal divergence
lowering in AMDGPU divergence lowering. This breaks all cases where SGPR or
i1 values are used outside of a cycle with a divergent exit.
Regenerate the regression tests for AMDGPU divergence lowering with LCSSA
disabled and switch them to the new register bank select. Also add the
RegBankLegalize rules required for these tests to pass.
Update IntrinsicLaneMaskAnalyzer to stop tracking LCSSA phis that are
lane masks.
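For context, here is a minimal C++ analogue (ours, not part of the commit) of the pattern the message describes: a value defined inside a cycle and used outside it past a divergent exit.

// In LCSSA form, LLVM inserts a single-input phi in the loop's exit block
// for any value that, like `v` below, is defined in a loop and used outside
// it. With LCSSA disabled, the use reads `v` directly across the cycle
// boundary. When the exit condition is divergent, lanes leave the loop in
// different iterations ("temporal divergence"), so each lane must observe
// its own last value of `v` at the external use.
int sumBelow(int n) { // n varies per lane, so the loop exit is divergent
  int v = 0;
  for (int i = 0; i < n; ++i)
    v += i;           // v defined inside the cycle
  return v;           // v used outside the cycle
}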
petar-avramovic committed Feb 24, 2025
1 parent f1252f5 commit 11a9bd2
Showing 20 changed files with 2,662 additions and 1,562 deletions.
llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp (12 changes: 2 additions & 10 deletions)
@@ -91,25 +91,17 @@ void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) {
GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI);
if (GI && GI->is(Intrinsic::amdgcn_if_break)) {
S32S64LaneMask.insert(MI.getOperand(3).getReg());
findLCSSAPhi(MI.getOperand(0).getReg());
S32S64LaneMask.insert(MI.getOperand(0).getReg());
}

if (MI.getOpcode() == AMDGPU::SI_IF ||
MI.getOpcode() == AMDGPU::SI_ELSE) {
findLCSSAPhi(MI.getOperand(0).getReg());
S32S64LaneMask.insert(MI.getOperand(0).getReg());
}
}
}
}

void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
S32S64LaneMask.insert(Reg);
for (const MachineInstr &LCSSAPhi : MRI.use_instructions(Reg)) {
if (LCSSAPhi.isPHI())
S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg());
}
}

static LLT getReadAnyLaneSplitTy(LLT Ty) {
if (Ty.isVector()) {
LLT ElTy = Ty.getElementType();
llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h (2 changes: 0 additions & 2 deletions)
@@ -47,8 +47,6 @@ class IntrinsicLaneMaskAnalyzer {

private:
void initLaneMaskIntrinsics(MachineFunction &MF);
// This will not be needed when we turn off LCSSA for global-isel.
void findLCSSAPhi(Register Reg);
};

void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp (6 changes: 6 additions & 0 deletions)
@@ -312,6 +312,12 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
}

// Opcodes that also support S1.
if (Opc == G_FREEZE &&
MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) {
RBLHelper.applyMappingTrivial(*MI);
continue;
}

if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT ||
Opc == AMDGPU::G_IMPLICIT_DEF)) {
Register Dst = MI->getOperand(0).getReg();
llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp (28 changes: 27 additions & 1 deletion)
@@ -134,6 +134,26 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
switch (Mapping.LoweringMethod) {
case DoNotLower:
return;
case VccExtToSel: {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
Register Src = MI.getOperand(1).getReg();
unsigned Opc = MI.getOpcode();
if (Ty == S32 || Ty == S16) {
auto True = B.buildConstant({VgprRB, Ty}, Opc == G_SEXT ? -1 : 1);
auto False = B.buildConstant({VgprRB, Ty}, 0);
B.buildSelect(MI.getOperand(0).getReg(), Src, True, False);
}
if (Ty == S64) {
auto True = B.buildConstant({VgprRB, S32}, Opc == G_SEXT ? -1 : 1);
auto False = B.buildConstant({VgprRB, S32}, 0);
auto Sel = B.buildSelect({VgprRB, S32}, Src, True, False);
B.buildMergeValues(
MI.getOperand(0).getReg(),
{Sel.getReg(0), Opc == G_SEXT ? Sel.getReg(0) : False.getReg(0)});
}
MI.eraseFromParent();
return;
}
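// Illustration (ours, not part of this diff): the values VccExtToSel
// selects between. For S32/S16 the result is the select itself; for S64
// the select produces the low half and the merge supplies the high half,
// reusing the select result for G_SEXT (replicating the 0/-1 sign bits)
// or zero for G_ZEXT. In plain C++ terms:
//
//   int32_t lo   = cond ? (isSext ? -1 : 1) : 0;        // buildSelect
//   uint32_t hi  = isSext ? (uint32_t)lo : 0u;          // second merge input
//   int64_t wide = ((int64_t)hi << 32) | (uint32_t)lo;  // buildMergeValues
//
// e.g. sext(true) == -1, zext(true) == 1, either extension of false == 0.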
case UniExtToSel: {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
auto True = B.buildConstant({SgprRB, Ty},
@@ -276,6 +296,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case Sgpr64:
case Vgpr64:
return LLT::scalar(64);
case VgprP0:
return LLT::pointer(0, 64);
case SgprP1:
case VgprP1:
return LLT::pointer(1, 64);
@@ -383,6 +405,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
return SgprRB;
case Vgpr32:
case Vgpr64:
case VgprP0:
case VgprP1:
case VgprP3:
case VgprP4:
@@ -425,6 +448,7 @@ void RegBankLegalizeHelper::applyMappingDst(
case SgprV4S32:
case Vgpr32:
case Vgpr64:
case VgprP0:
case VgprP1:
case VgprP3:
case VgprP4:
@@ -555,6 +579,7 @@ void RegBankLegalizeHelper::applyMappingSrc(
// vgpr scalars, pointers and vectors
case Vgpr32:
case Vgpr64:
case VgprP0:
case VgprP1:
case VgprP3:
case VgprP4:
@@ -653,7 +678,8 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
// We accept all types that can fit in some register class.
// Uniform G_PHIs have all sgpr registers.
// Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
if (Ty == LLT::scalar(32) || Ty == LLT::pointer(4, 64)) {
if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
Ty == LLT::pointer(4, 64)) {
return;
}

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (41 changes: 40 additions & 1 deletion)
@@ -50,6 +50,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::scalar(32);
case S64:
return MRI.getType(Reg) == LLT::scalar(64);
case P0:
return MRI.getType(Reg) == LLT::pointer(0, 64);
case P1:
return MRI.getType(Reg) == LLT::pointer(1, 64);
case P3:
@@ -58,6 +60,8 @@
return MRI.getType(Reg) == LLT::pointer(4, 64);
case P5:
return MRI.getType(Reg) == LLT::pointer(5, 32);
case V4S32:
return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
case B32:
return MRI.getType(Reg).getSizeInBits() == 32;
case B64:
@@ -78,6 +82,8 @@
return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg);
case UniS64:
return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg);
case UniP0:
return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniform(Reg);
case UniP1:
return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniform(Reg);
case UniP3:
@@ -104,6 +110,8 @@
return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg);
case DivS64:
return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg);
case DivP0:
return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg);
case DivP1:
return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg);
case DivP3:
@@ -315,13 +323,15 @@
Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) {
unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
if (!IRulesAlias.contains(IntrID)) {
MI.dump();
LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
llvm_unreachable("No rules defined for intrinsic opcode");
}
return IRules.at(IRulesAlias.at(IntrID));
}

if (!GRulesAlias.contains(Opc)) {
MI.dump();
LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
llvm_unreachable("No rules defined for generic opcode");
}
@@ -431,16 +441,21 @@
addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
.Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}})
.Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}})
.Div(B32, {{VgprB32}, {VgprB32, VgprB32}})
.Uni(B64, {{SgprB64}, {SgprB64, SgprB64}})
.Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
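// Note (ours): SplitTo32 here reflects that the VALU has no 64-bit bitwise
// instructions (s_and_b64 and friends exist only on the SALU side), so a
// divergent 64-bit G_AND/G_OR/G_XOR is unmerged into two 32-bit halves,
// operated on, and remerged. Conceptually:
//
//   uint64_t and64(uint64_t a, uint64_t b) {
//     uint32_t lo = (uint32_t)a & (uint32_t)b;                  // low G_AND
//     uint32_t hi = (uint32_t)(a >> 32) & (uint32_t)(b >> 32);  // high G_AND
//     return ((uint64_t)hi << 32) | lo;                         // G_MERGE_VALUES
//   }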

addRulesForGOpcs({G_SHL}, Standard)
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});

// Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT
// and G_FREEZE here; the rest is trivially regbankselected earlier.
addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}});
addRulesForGOpcs({G_CONSTANT})
.Any({{UniS1, _}, {{Sgpr32Trunc}, {None}, UniCstExt}});
addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}});

addRulesForGOpcs({G_ICMP})
.Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
@@ -471,6 +486,7 @@

addRulesForGOpcs({G_ZEXT, G_SEXT})
.Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
.Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
.Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}});
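// Sketch (ours) of what Ext32To64 presumably expands to for the S32 -> S64
// rules above: the high half is either zero (zext) or a copy of the sign
// bits (sext), then the two halves are merged:
//
//   int64_t sext32to64(int32_t lo) {
//     int32_t hi = lo >> 31;                 // arithmetic shift: sign bits
//     uint64_t merged = ((uint64_t)(uint32_t)hi << 32) | (uint32_t)lo;
//     return (int64_t)merged;                // G_MERGE_VALUES {lo, hi}
//   }
//   // For zext, hi is simply the constant 0.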

@@ -525,9 +541,12 @@

// clang-format off
addRulesForGOpcs({G_LOAD})
.Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}})

.Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
.Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
.Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
.Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}})
.Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
.Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})

@@ -556,15 +575,26 @@
// clang-format on

addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector)
.Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
.Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
.Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
.Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}});
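// Note (ours): buffer loads are VMEM instructions and always write VGPRs,
// so even a uniform result is first produced in VGPRs (UniInVgprS32 /
// UniInVgprV4S32) and only afterwards copied to SGPRs, conceptually via a
// read-any-lane copy (see buildReadAnyLane in AMDGPUGlobalISelUtils):
//
//   %v:vgpr(s32) = G_AMDGPU_BUFFER_LOAD ...   ; result lives in a VGPR
//   %s:sgpr(s32) = <read any lane of %v>      ; uniform value moved to SGPR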

addRulesForGOpcs({G_STORE})
.Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}})
.Any({{S32, P1}, {{}, {Vgpr32, VgprP1}}})
.Any({{S64, P1}, {{}, {Vgpr64, VgprP1}}})
.Any({{V4S32, P1}, {{}, {VgprV4S32, VgprP1}}});

addRulesForGOpcs({G_PTR_ADD}).Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}});
addRulesForGOpcs({G_AMDGPU_BUFFER_STORE})
.Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}});

addRulesForGOpcs({G_PTR_ADD})
.Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
.Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
.Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}});

addRulesForGOpcs({G_INTTOPTR}).Any({{UniP4}, {{SgprP4}, {Sgpr64}}});

addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});

@@ -580,15 +610,24 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);

addRulesForGOpcs({G_UITOFP})
.Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
.Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);

using namespace Intrinsic;

addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});

// This is an "intrinsic lane mask"; it was set to i32/i64 in LLVM IR.
addRulesForIOpcs({amdgcn_end_cf}).Any({{_, S32}, {{}, {None, Sgpr32}}});

addRulesForIOpcs({amdgcn_if_break}, Standard)
.Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}});

addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard)
.Div(S32, {{}, {Vgpr32, None, Vgpr32, Vgpr32}});

addRulesForIOpcs({amdgcn_readfirstlane})
.Any({{UniS32, _, DivS32}, {{}, {Sgpr32, None, Vgpr32}}});

} // end initialize rules
llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h (5 changes: 5 additions & 0 deletions)
@@ -50,16 +50,19 @@ enum UniformityLLTOpPredicateID {
DivS64,

// pointers
P0,
P1,
P3,
P4,
P5,

UniP0,
UniP1,
UniP3,
UniP4,
UniP5,

DivP0,
DivP1,
DivP3,
DivP4,
@@ -124,6 +127,7 @@ enum RegBankLLTMappingApplyID {
// vgpr scalars, pointers, vectors and B-types
Vgpr32,
Vgpr64,
VgprP0,
VgprP1,
VgprP3,
VgprP4,
@@ -162,6 +166,7 @@ enum RegBankLLTMappingApplyID {
// vgpr. Lower it to two S32 vgpr ANDs.
enum LoweringMethodID {
DoNotLower,
VccExtToSel,
UniExtToSel,
VgprToVccCopy,
SplitTo32,
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (10 changes: 8 additions & 2 deletions)
@@ -1382,7 +1382,11 @@ bool GCNPassConfig::addPreISel() {
// control flow modifications.
addPass(createAMDGPURewriteUndefForPHILegacyPass());

addPass(createLCSSAPass());
// SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel
// with -new-reg-bank-select and without any of the fallback options.
if (!getCGPassBuilderOption().EnableGlobalISelOption ||
!isGlobalISelAbortEnabled() || !NewRegBankSelect)
addPass(createLCSSAPass());
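// Equivalent reading (ours) of the guard above: LCSSA is still added unless
// all three conditions hold, i.e. it is skipped only for GlobalISel with
// abort enabled (no SelectionDAG fallback) and the new reg bank select:
//
//   bool SkipLCSSA = getCGPassBuilderOption().EnableGlobalISelOption &&
//                    isGlobalISelAbortEnabled() && NewRegBankSelect;
//   if (!SkipLCSSA)
//     addPass(createLCSSAPass());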

if (TM->getOptLevel() > CodeGenOptLevel::Less)
addPass(&AMDGPUPerfHintAnalysisLegacyID);
@@ -2086,7 +2090,9 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
// control flow modifications.
addPass(AMDGPURewriteUndefForPHIPass());

addPass(LCSSAPass());
if (!getCGPassBuilderOption().EnableGlobalISelOption ||
!isGlobalISelAbortEnabled() || !NewRegBankSelect)
addPass(LCSSAPass());

if (TM.getOptLevel() > CodeGenOptLevel::Less)
addPass(AMDGPUPerfHintAnalysisPass(TM));