8 changes: 8 additions & 0 deletions llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -617,6 +617,14 @@ class MachineRegisterInfo {
/// form, so there should only be one definition.
LLVM_ABI MachineInstr *getVRegDef(Register Reg) const;

/// getDomVRegDefInBasicBlock - Return the last machine instr that defines
/// the specified virtual register in the basic block, searching backwards
/// from instruction I (exclusive). Returns MBB.end() if no definition is
/// found.

Review comment: The comment seems a bit misleading, as "the last machine instr" while searching backwards implies the very first definition in the MBB. I think you can separate the two ideas: the last definition encountered in the MBB, and the fact that you search backwards from I.
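
A possible reword along those lines (a suggestion only, not part of the patch):

    /// Return the definition of \p Reg that is closest to, and strictly
    /// before, instruction \p I in \p MBB, found by scanning backwards from
    /// I (exclusive). Returns MBB.end() if the block contains no definition
    /// of \p Reg before I.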

LLVM_ABI MachineBasicBlock::iterator
getDomVRegDefInBasicBlock(Register Reg, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;

/// getUniqueVRegDef - Return the unique machine instr that defines the
/// specified virtual register or null if none is found. If there are
/// multiple definitions or no definition, return null.
13 changes: 13 additions & 0 deletions llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -674,3 +674,16 @@ bool MachineRegisterInfo::isReservedRegUnit(MCRegUnit Unit) const {
}
return false;
}

MachineBasicBlock::iterator MachineRegisterInfo::getDomVRegDefInBasicBlock(
Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
if (I == MBB.begin())

Review comment: This code can be simplified into a while (I != MBB.begin()) { ... } loop, right? Also, returning a MachineInstr * seems more straightforward.

Review comment: Yep. Turn it into a for/while loop and make this condition check the loop terminator. I second the idea of returning a MachineInstr*: all the machinery of checking I != MBB.end() at the call sites (after this function returns) can be simplified to !MI.
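
A minimal sketch of the refactor both comments describe, assuming the rest of the patch stays as posted (the modifiesRegister() check is kept from the current version; see the definesRegister() suggestion below):

    MachineInstr *MachineRegisterInfo::getDomVRegDefInBasicBlock(
        Register Reg, MachineBasicBlock &MBB,
        MachineBasicBlock::iterator I) const {
      // Walk backwards from I (exclusive) to the start of the block.
      while (I != MBB.begin()) {
        --I;
        if (I->modifiesRegister(Reg, getTargetRegisterInfo()))
          return &*I;
      }
      // Callers can now test `if (!MI)` instead of comparing against MBB.end().
      return nullptr;
    }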

return MBB.end();
// Iterate backwards from I (exclusive) to the beginning of the basic block
do {
--I;
if (I->modifiesRegister(Reg, getTargetRegisterInfo()))

Suggested change
if (I->modifiesRegister(Reg, getTargetRegisterInfo()))
if (I->definesRegister(Reg, getTargetRegisterInfo()))

Review comment: If you are looking only for instructions that fully define Reg, use the above suggestion, because the existing call also accounts for partial definitions!
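
For context, a hedged illustration of the difference (hypothetical MIR, not taken from this PR): a sub-register write partially defines the wider register, so modifiesRegister() reports it but definesRegister() does not.

    //   %0:sreg_64 = S_MOV_B64 0        ; full def: definesRegister() and
    //                                   ; modifiesRegister() both return true
    //   %0.sub0:sreg_64 = S_MOV_B32 1   ; partial def: modifiesRegister()
    //                                   ; returns true, definesRegister() does not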

return I;
} while (I != MBB.begin());
return MBB.end();
}
45 changes: 26 additions & 19 deletions llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp
@@ -1839,7 +1839,7 @@ void ControlFlowRewriter::rewrite() {
Opcode = AMDGPU::S_CBRANCH_SCC1;
} else {
Register CondReg = Info.OrigCondition;
if (!LMA.isSubsetOfExec(CondReg, *Node->Block)) {
if (!LMA.isSubsetOfExec(CondReg, *Node->Block, Node->Block->end())) {
CondReg = LMU.createLaneMaskReg();
BuildMI(*Node->Block, Node->Block->end(), {}, TII.get(LMC.AndOpc),
CondReg)
@@ -1937,7 +1937,8 @@ void ControlFlowRewriter::rewrite() {
}
} else {
CondReg = LaneOrigin.CondReg;
if (!LMA.isSubsetOfExec(LaneOrigin.CondReg, *LaneOrigin.Node->Block)) {
if (!LMA.isSubsetOfExec(LaneOrigin.CondReg, *LaneOrigin.Node->Block,
LaneOrigin.Node->Block->getFirstTerminator())) {
Register Prev = CondReg;
CondReg = LMU.createLaneMaskReg();
BuildMI(*LaneOrigin.Node->Block,
@@ -2033,28 +2034,34 @@ void ControlFlowRewriter::rewrite() {
CFGNodeInfo &PredInfo = NodeInfo.find(Pred)->second;
Register PrimaryExec = PredInfo.PrimarySuccessorExec;

MachineInstr *PrimaryExecDef;
for (;;) {
PrimaryExecDef = MRI.getVRegDef(PrimaryExec);
if (PrimaryExecDef->getOpcode() != AMDGPU::COPY)
break;
PrimaryExec = PrimaryExecDef->getOperand(1).getReg();
}
// Turning off this copy-chain optimization to retain the Accumulator as
// the PrimaryExec

// MachineInstr *PrimaryExecDef;
// for (;;) {
// PrimaryExecDef = MRI.getVRegDef(PrimaryExec);
// if (PrimaryExecDef->getOpcode() != AMDGPU::COPY)
// break;
// PrimaryExec = PrimaryExecDef->getOperand(1).getReg();
// }

// Rejoin = EXEC ^ PrimaryExec
//
// Fold immediately if PrimaryExec was obtained via XOR as well.
Register Rejoin;

if (PrimaryExecDef->getParent() == Pred->Block &&
PrimaryExecDef->getOpcode() == LMC.XorOpc &&
PrimaryExecDef->getOperand(1).isReg() &&
PrimaryExecDef->getOperand(2).isReg()) {
if (PrimaryExecDef->getOperand(1).getReg() == LMC.ExecReg)
Rejoin = PrimaryExecDef->getOperand(2).getReg();
else if (PrimaryExecDef->getOperand(2).getReg() == LMC.ExecReg)
Rejoin = PrimaryExecDef->getOperand(1).getReg();
}
// Turning off this XOR optimization since buildMergeLaneMasks() will not
// introduce an XOR instruction for creating the PrimaryExec

// if (PrimaryExecDef->getParent() == Pred->Block &&
// PrimaryExecDef->getOpcode() == LMC.XorOpc &&
// PrimaryExecDef->getOperand(1).isReg() &&
// PrimaryExecDef->getOperand(2).isReg()) {
// if (PrimaryExecDef->getOperand(1).getReg() == LMC.ExecReg)
// Rejoin = PrimaryExecDef->getOperand(2).getReg();
// else if (PrimaryExecDef->getOperand(2).getReg() == LMC.ExecReg)
// Rejoin = PrimaryExecDef->getOperand(1).getReg();
// }

if (!Rejoin) {
// Try to find a previously generated XOR (or merely masked) value
@@ -2091,7 +2098,7 @@ void ControlFlowRewriter::rewrite() {

LLVM_DEBUG(Function.dump());
}

Updater.insertAccumulatorResets();
Updater.cleanup();
}

125 changes: 73 additions & 52 deletions llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp
@@ -31,13 +31,14 @@ bool GCNLaneMaskUtils::maybeLaneMask(Register Reg) const {

/// Determine whether the lane-mask register \p Reg is a wave-wide constant.
/// If so, the value is stored in \p Val.
bool GCNLaneMaskUtils::isConstantLaneMask(Register Reg, bool &Val) const {
bool GCNLaneMaskUtils::isConstantLaneMask(
Register Reg, bool &Val, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const {
MachineRegisterInfo &MRI = MF.getRegInfo();

const MachineInstr *MI;
for (;;) {
MI = MRI.getVRegDef(Reg);
if (!MI) {
MI = MRI.getDomVRegDefInBasicBlock(Reg, MBB, MI);
if (MI == MBB.end()) {
// This can happen when called from GCNLaneMaskUpdater, where Reg can
// be a placeholder that has not yet been filled in.
return false;
@@ -100,18 +101,17 @@ Register GCNLaneMaskUtils::createLaneMaskReg() const {
/// properly masked, i.e. use PrevReg directly instead of
/// (PrevReg & ~EXEC), and don't add extra 1-bits to DstReg
/// beyond (CurReg & EXEC).
void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, Register DstReg,
Register PrevReg, Register CurReg,
GCNLaneMaskAnalysis *LMA,
bool accumulating) const {
/// \param isPrevZeroReg Indicates that PrevReg is a zero register.
void GCNLaneMaskUtils::buildMergeLaneMasks(
MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
Register DstReg, Register PrevReg, Register CurReg,
GCNLaneMaskAnalysis *LMA, bool accumulating, bool isPrevZeroReg) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
bool PrevVal = false;
bool PrevConstant = !PrevReg || isConstantLaneMask(PrevReg, PrevVal);
bool PrevConstant = !PrevReg || isPrevZeroReg;
bool CurVal = false;
bool CurConstant = isConstantLaneMask(CurReg, CurVal);
bool CurConstant = isConstantLaneMask(CurReg, CurVal, MBB, I);

assert(PrevReg || !accumulating);
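
For readers outside the patch, the merge built here conceptually computes a lane-wise select between the previous and current masks; a sketch of the intent, not code that is emitted verbatim:

    // PrevMasked = PrevReg & ~EXEC   // inactive lanes keep their old value
    // CurMasked  = CurReg  &  EXEC   // active lanes take the new value
    // DstReg     = PrevMasked | CurMasked
    // The PrevConstant/CurConstant, isSubsetOfExec() and isPrevZeroReg checks
    // exist to drop the ANDN2/AND steps when an operand is already known to
    // be properly masked.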

@@ -147,7 +147,7 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB,
}
if (!CurConstant) {
if ((PrevConstant && PrevVal) ||
(LMA && LMA->isSubsetOfExec(CurReg, MBB))) {
(LMA && LMA->isSubsetOfExec(CurReg, MBB, I))) {
CurMaskedReg = CurReg;
} else {
CurMaskedReg = createLaneMaskReg();
@@ -188,22 +188,26 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB,
/// (Reg & EXEC) == Reg when used in \p UseBlock.
bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg,
MachineBasicBlock &UseBlock,
MachineBasicBlock::iterator I,
unsigned RemainingDepth) {
MachineRegisterInfo &MRI = LMU.function()->getRegInfo();
MachineInstr *DefInstr = nullptr;
MachineBasicBlock::iterator DefInstr = UseBlock.end();
const AMDGPU::LaneMaskConstants &LMC = LMU.getLaneMaskConsts();

for (;;) {
if (!Register::isVirtualRegister(Reg)) {
if (Reg == LMC.ExecReg &&
(!DefInstr || DefInstr->getParent() == &UseBlock))
(DefInstr == UseBlock.end() || DefInstr->getParent() == &UseBlock))
return true;
return false;
}

DefInstr = MRI.getVRegDef(Reg);
DefInstr = MRI.getDomVRegDefInBasicBlock(Reg, UseBlock, I);
if (DefInstr == UseBlock.end())
return false;
if (DefInstr->getOpcode() == AMDGPU::COPY) {
Reg = DefInstr->getOperand(1).getReg();
I = DefInstr;
continue;
}

@@ -242,7 +246,7 @@ bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg,
if ((LikeOr || IsAnd || IsAndN2) &&
(DefInstr->getOperand(1).isReg() && DefInstr->getOperand(2).isReg())) {
bool FirstIsSubset = isSubsetOfExec(DefInstr->getOperand(1).getReg(),
UseBlock, RemainingDepth);
UseBlock, DefInstr, RemainingDepth);
if (!FirstIsSubset && (LikeOr || IsAndN2))
return SubsetOfExec.try_emplace(Reg, false).first->second;

@@ -252,7 +256,7 @@
}

bool SecondIsSubset = isSubsetOfExec(DefInstr->getOperand(2).getReg(),
UseBlock, RemainingDepth);
UseBlock, DefInstr, RemainingDepth);
if (!SecondIsSubset)
return SubsetOfExec.try_emplace(Reg, false).first->second;

@@ -268,14 +272,14 @@ void GCNLaneMaskUpdater::init(Register Reg) {
Processed = false;
Blocks.clear();
// SSAUpdater.Initialize(LMU.getLaneMaskConsts().LaneMaskRC);
SSAUpdater.Initialize(Reg);
Accumulator = {};
}

/// Optional cleanup, may remove stray instructions.
void GCNLaneMaskUpdater::cleanup() {
Processed = false;
Blocks.clear();

Accumulator = {};
MachineRegisterInfo &MRI = LMU.function()->getRegInfo();

if (ZeroReg && MRI.use_empty(ZeroReg)) {
@@ -330,7 +334,7 @@ void GCNLaneMaskUpdater::addAvailable(MachineBasicBlock &Block,
Register GCNLaneMaskUpdater::getValueInMiddleOfBlock(MachineBasicBlock &Block) {
if (!Processed)
process();
return SSAUpdater.GetValueInMiddleOfBlock(&Block);
return Accumulator;
}

/// Return the value at the end of the given block, i.e. after any change that
@@ -342,7 +346,7 @@ Register GCNLaneMaskUpdater::getValueInMiddleOfBlock(MachineBasicBlock &Block) {
Register GCNLaneMaskUpdater::getValueAtEndOfBlock(MachineBasicBlock &Block) {
if (!Processed)
process();
return SSAUpdater.GetValueAtEndOfBlock(&Block);
return Accumulator;
}

/// Return the value in \p Block after the value merge (if any).
@@ -352,15 +356,15 @@ Register GCNLaneMaskUpdater::getValueAfterMerge(MachineBasicBlock &Block) {

auto BlockIt = findBlockInfo(Block);
if (BlockIt != Blocks.end()) {
if (BlockIt->Merged)
return BlockIt->Merged;
if (BlockIt->Value)
return Accumulator;
if (BlockIt->Flags & ResetInMiddle)
return ZeroReg;
}

// We didn't merge anything in the block, but the block may still be
// ResetAtEnd, in which case we need the pre-reset value.
return SSAUpdater.GetValueInMiddleOfBlock(&Block);
return Accumulator;
}

/// Determine whether \p MI defines and/or uses SCC.
@@ -422,22 +426,22 @@ void GCNLaneMaskUpdater::process() {
.addImm(0);
}

// Add available values.
if (!Accumulator) {
Accumulator = LMU.createLaneMaskReg();
BuildMI(Entry, Entry.getFirstTerminator(), {},
TII->get(LMU.getLaneMaskConsts().MovOpc), Accumulator)
.addImm(0);
}

// Reset accumulator.
for (BlockInfo &Info : Blocks) {
assert(Accumulating || !Info.Flags);
assert(Info.Flags || Info.Value);

if (Info.Value)
Info.Merged = LMU.createLaneMaskReg();

SSAUpdater.AddAvailableValue(
Info.Block,
(Info.Value && !(Info.Flags & ResetAtEnd)) ? Info.Merged : ZeroReg);
if (!Info.Value || (Info.Flags & ResetAtEnd))
AccumulatorResetBlocks[Info.Block].insert(Accumulator);
}

if (Accumulating && !SSAUpdater.HasValueForBlock(&Entry))
SSAUpdater.AddAvailableValue(&Entry, ZeroReg);

// Once the SSA updater is ready, we can fill in all merge code, relying
// on the SSA updater to insert required PHIs.
for (BlockInfo &Info : Blocks) {
@@ -448,11 +452,8 @@
Register Previous;
if (Info.Block != &LMU.function()->front() &&
!(Info.Flags & ResetInMiddle)) {
Previous = SSAUpdater.GetValueInMiddleOfBlock(Info.Block);
if (Accumulating) {
assert(!MRI.getVRegDef(Previous) ||
MRI.getVRegDef(Previous)->getOpcode() != AMDGPU::IMPLICIT_DEF);
} else {
Previous = Accumulator;
if (!Accumulating) {
MachineInstr *PrevInstr = MRI.getVRegDef(Previous);
if (PrevInstr && PrevInstr->getOpcode() == AMDGPU::IMPLICIT_DEF) {
PotentiallyDead.insert(PrevInstr);
@@ -466,18 +467,19 @@

// Insert merge logic.
MachineBasicBlock::iterator insertPt = getSaluInsertionAtEnd(*Info.Block);
LMU.buildMergeLaneMasks(*Info.Block, insertPt, {}, Info.Merged, Previous,
Info.Value, LMA, Accumulating);

if (Info.Flags & ResetAtEnd) {
MachineInstr *mergeInstr = MRI.getVRegDef(Info.Merged);
if (mergeInstr->getOpcode() == AMDGPU::COPY &&
mergeInstr->getOperand(1).getReg().isVirtual()) {
assert(MRI.use_empty(Info.Merged));
Info.Merged = mergeInstr->getOperand(1).getReg();
mergeInstr->eraseFromParent();
}
}
LMU.buildMergeLaneMasks(*Info.Block, insertPt, {}, Accumulator, Previous,
Info.Value, LMA, Accumulating, Previous == ZeroReg);

// Switching off this optimization, since Accumulator will always have a use
// if (Info.Flags & ResetAtEnd) {
// MachineInstr *mergeInstr = MRI.getVRegDef(Info.Merged);
// if (mergeInstr->getOpcode() == AMDGPU::COPY &&
// mergeInstr->getOperand(1).getReg().isVirtual()) {
// assert(MRI.use_empty(Info.Merged));
// Info.Merged = mergeInstr->getOperand(1).getReg();
// mergeInstr->eraseFromParent();
// }
// }
}

Processed = true;
@@ -489,3 +491,22 @@
return llvm::find_if(
Blocks, [&](const auto &Entry) { return Entry.Block == &Block; });
}

void GCNLaneMaskUpdater::insertAccumulatorResets() {
const SIInstrInfo *TII =
LMU.function()->getSubtarget<GCNSubtarget>().getInstrInfo();
for (auto &Entry : AccumulatorResetBlocks) {
MachineBasicBlock *B = Entry.first;
DenseSet<Register> &Accumulators = Entry.second;
for (Register ACC : Accumulators) {
// Get first branch instruction.
MachineBasicBlock::iterator I = B->getFirstTerminator();
while (I != B->end() && !I->isBranch())
I++;
if (I == B->end())
I--;
BuildMI(*B, I, {}, TII->get(LMU.getLaneMaskConsts().MovOpc), ACC)
.addImm(0);
}
Review comment (@vg0204, Dec 11, 2025): Can't you insert all the resets one after another once you find the right place, rather than searching for the right insertion point for every accumulator to reset? Seems a bit expensive!

}
}
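
A hedged sketch of the reviewer's suggestion above: locate the insertion point once per block and emit every reset there, rather than re-scanning the terminators for each accumulator. Names mirror the patch; the placement policy is unchanged.

    void GCNLaneMaskUpdater::insertAccumulatorResets() {
      const SIInstrInfo *TII =
          LMU.function()->getSubtarget<GCNSubtarget>().getInstrInfo();
      for (auto &Entry : AccumulatorResetBlocks) {
        MachineBasicBlock *B = Entry.first;
        // Find the first branch once; fall back to the last instruction.
        MachineBasicBlock::iterator I = B->getFirstTerminator();
        while (I != B->end() && !I->isBranch())
          ++I;
        if (I == B->end() && !B->empty())
          --I;
        // Emit all resets for this block at the same point.
        for (Register ACC : Entry.second)
          BuildMI(*B, I, {}, TII->get(LMU.getLaneMaskConsts().MovOpc), ACC)
              .addImm(0);
      }
    }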