Skip to content

Commit

Permalink
[VPlan] Model branch cond to enter scalar epilogue in VPlan. (llvm#92651)
Browse files Browse the repository at this point in the history

This patch moves branch condition creation to enter the scalar epilogue
loop to VPlan. Modeling the branch in the middle block also requires
modeling the successor blocks. This is done using the recently
introduced VPIRBasicBlock.

Note that the middle.block is still created as part of the skeleton and
then patched in during VPlan execution. Unfortunately the skeleton needs
to create the middle.block early on, as it is also used for induction
resume value creation and is also needed to properly update the
dominator tree during skeleton creation.

After this patch lands, I plan to move induction resume value and phi
node creation in the scalar preheader to VPlan. Once that is done, we
should be able to create the middle.block in VPlan directly.

This is a re-worked version based on the earlier
https://reviews.llvm.org/D150398 and the main change is the use of
VPIRBasicBlock.

Depends on llvm#92525

PR: llvm#92651
  • Loading branch information
fhahn committed Jul 5, 2024
1 parent b546096 commit 99d6c6d
Show file tree
Hide file tree
Showing 103 changed files with 1,621 additions and 1,226 deletions.
116 changes: 34 additions & 82 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2964,34 +2964,6 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopScalarPreHeader =
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
nullptr, Twine(Prefix) + "scalar.ph");

// Set up the middle block terminator. Two cases:
// 1) If we know that we must execute the scalar epilogue, retain the existing
// unconditional branch from the middle block to the scalar preheader. In that
// case, there's no edge from the middle block to exit blocks and thus no
// need to update the immediate dominator of the exit blocks.
if (Cost->requiresScalarEpilogue(VF.isVector())) {
assert(
LoopMiddleBlock->getSingleSuccessor() == LoopScalarPreHeader &&
" middle block should have the scalar preheader as single successor");
return;
}

// 2) Otherwise, we must have a single unique exit block (due to how we
// implement the multiple exit case). In this case, set up a conditional
// branch from the middle block to the loop scalar preheader, and the
// exit block. completeLoopSkeleton will update the condition to use an
// iteration check, if required to decide whether to execute the remainder.
BranchInst *BrInst =
BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

// Update dominator for loop exit. During skeleton creation, only the vector
// pre-header and the middle block are created. The vector loop is entirely
// created during VPlan exection.
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
}

PHINode *InnerLoopVectorizer::createInductionResumeValue(
Expand Down Expand Up @@ -3088,51 +3060,6 @@ void InnerLoopVectorizer::createInductionResumeValues(
}
}

BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
// The trip counts should be cached by now.
Value *Count = getTripCount();
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);

auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

// Add a check in the middle block to see if we have completed
// all of the iterations in the first vector loop. Three cases:
// 1) If we require a scalar epilogue, there is no conditional branch as
// we unconditionally branch to the scalar preheader. Do nothing.
// 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
// Thus if tail is to be folded, we know we don't need to run the
// remainder and we can use the previous value for the condition (true).
// 3) Otherwise, construct a runtime check.
if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
!Cost->foldTailByMasking()) {
// Here we use the same DebugLoc as the scalar loop latch terminator instead
// of the corresponding compare because they may have ended up with
// different line numbers and we want to avoid awkward line stepping while
// debugging. Eg. if the compare has got a line number inside the loop.
// TODO: At the moment, CreateICmpEQ will simplify conditions with constant
// operands. Perform simplification directly on VPlan once the branch is
// modeled there.
IRBuilder<> B(LoopMiddleBlock->getTerminator());
B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
BI.setCondition(CmpN);
if (hasBranchWeightMD(*ScalarLatchTerm)) {
// Assume that `Count % VectorTripCount` is equally distributed.
unsigned TripCount = UF * VF.getKnownMinValue();
assert(TripCount > 0 && "trip count should not be zero");
const uint32_t Weights[] = {1, TripCount - 1};
setBranchWeights(BI, Weights, /*IsExpected=*/false);
}
}

#ifdef EXPENSIVE_CHECKS
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
#endif

return LoopVectorPreHeader;
}

std::pair<BasicBlock *, Value *>
InnerLoopVectorizer::createVectorizedLoopSkeleton(
const SCEV2ValueTy &ExpandedSCEVs) {
Expand All @@ -3155,17 +3082,18 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
| [ ]_| <-- vector loop (created during VPlan execution).
| |
| v
\ -[ ] <--- middle-block.
\ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
| | successors created during VPlan execution)
\/ |
/\ v
| ->[ ] <--- new preheader.
| ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
| |
(opt) v <-- edge from middle to exit iff epilogue is not required.
| [ ] \
| [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
\ |
\ v
>[ ] <-- exit block(s).
>[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
...
*/

Expand All @@ -3192,7 +3120,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
// Emit phis for the new starting index of the scalar loop.
createInductionResumeValues(ExpandedSCEVs);

return {completeLoopSkeleton(), nullptr};
return {LoopVectorPreHeader, nullptr};
}

// Fix up external users of the induction variable. At this point, we are
Expand Down Expand Up @@ -7477,6 +7405,9 @@ LoopVectorizationPlanner::executePlan(
std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
: State.ExpandedSCEVs);
#ifdef EXPENSIVE_CHECKS
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
#endif

// Only use noalias metadata when using memory checks guaranteeing no overlap
// across all iterations.
Expand Down Expand Up @@ -7557,6 +7488,18 @@ LoopVectorizationPlanner::executePlan(

ILV.printDebugTracesAtEnd();

// 4. Adjust branch weight of the branch in the middle block.
auto *MiddleTerm =
cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
if (MiddleTerm->isConditional() &&
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
// Assume that `Count % VectorTripCount` is equally distributed.
unsigned TripCount = State.UF * State.VF.getKnownMinValue();
assert(TripCount > 0 && "trip count should not be zero");
const uint32_t Weights[] = {1, TripCount - 1};
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
}

return {State.ExpandedSCEVs, ReductionResumeValues};
}

Expand Down Expand Up @@ -7613,7 +7556,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
// inductions in the epilogue loop are created before executing the plan for
// the epilogue loop.

return {completeLoopSkeleton(), nullptr};
return {LoopVectorPreHeader, nullptr};
}

void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
Expand Down Expand Up @@ -7802,7 +7745,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
{VecEpilogueIterationCountCheck,
EPI.VectorTripCount} /* AdditionalBypass */);

return {completeLoopSkeleton(), EPResumeVal};
return {LoopVectorPreHeader, EPResumeVal};
}

BasicBlock *
Expand Down Expand Up @@ -7847,7 +7790,6 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
setBranchWeights(BI, Weights, /*IsExpected=*/false);
}
ReplaceInstWithInst(Insert->getTerminator(), &BI);

LoopBypassBlocks.push_back(Insert);
return Insert;
}
Expand Down Expand Up @@ -8552,9 +8494,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// modified; a basic block for the vector pre-header, followed by a region for
// the vector loop, followed by the middle basic block. The skeleton vector
// loop region contains a header and latch basic blocks.

bool RequiresScalarEpilogueCheck =
LoopVectorizationPlanner::getDecisionAndClampRange(
[this](ElementCount VF) {
return !CM.requiresScalarEpilogue(VF.isVector());
},
Range);
VPlanPtr Plan = VPlan::createInitialVPlan(
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
*PSE.getSE(), OrigLoop->getLoopPreheader());
*PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(),
OrigLoop);
VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
Expand Down Expand Up @@ -8802,7 +8752,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// Create new empty VPlan
auto Plan = VPlan::createInitialVPlan(
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
*PSE.getSE(), OrigLoop->getLoopPreheader());
*PSE.getSE(), true, false, OrigLoop);

// Build hierarchical CFG
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
Expand Down Expand Up @@ -10163,6 +10113,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
}

assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
"DT not preserved correctly");
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT, true, &ExpandedSCEVs);
++LoopsEpilogueVectorized;
Expand Down
Loading

0 comments on commit 99d6c6d

Please sign in to comment.