Skip to content

Commit

Permalink
[LV]Initial support for safe distance in predicated DataWithEVL vecto…
Browse files Browse the repository at this point in the history
…rization mode.

Enabled initial support for max safe distance in DataWithEVL mode. If
max safe distance is required, need to emit special code:
CMP = icmp ult AVL, MAX_SAFE_DISTANCE
SAFE_AVL = select CMP, AVL, MAX_SAFE_DISTANCE
EVL = call i32 @llvm.experimental.get.vector.length(i64 SAFE_AVL)

while vectorize the loop in DataWithEVL tail folding mode.

Reviewers: fhahn

Reviewed By: fhahn

Pull Request: #102897
  • Loading branch information
alexey-bataev authored Oct 18, 2024
1 parent 7f2e937 commit f148d57
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 31 deletions.
30 changes: 22 additions & 8 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1427,12 +1427,9 @@ class LoopVectorizationCostModel {
// Override forced styles if needed.
// FIXME: use actual opcode/data type for analysis here.
// FIXME: Investigate opportunity for fixed vector factor.
bool EVLIsLegal =
IsScalableVF && UserIC <= 1 &&
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
!EnableVPlanNativePath &&
// FIXME: implement support for max safe dependency distance.
Legal->isSafeForAnyVectorWidth();
bool EVLIsLegal = UserIC <= 1 &&
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
!EnableVPlanNativePath;
if (!EVLIsLegal) {
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
Expand All @@ -1457,6 +1454,15 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}

/// Return maximum safe number of elements to be processed per vector
/// iteration, which do not prevent store-load forwarding and are safe with
/// regard to the memory dependencies. Required for EVL-based VPlans to
/// correctly calculate AVL (application vector length) as min(remaining AVL,
/// MaxSafeElements).
/// TODO: need to consider adjusting cost model to use this value as a
/// vectorization factor for EVL-based vectorization.
std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }

/// Returns true if the instructions in this block requires predication
/// for any reason, e.g. because tail folding now requires a predicate
/// or because the block in the original loop was predicated.
Expand Down Expand Up @@ -1608,6 +1614,12 @@ class LoopVectorizationCostModel {
/// true if scalable vectorization is supported and enabled.
std::optional<bool> IsScalableVectorizationAllowed;

/// Maximum safe number of elements to be processed per vector iteration,
/// which do not prevent store-load forwarding and are safe with regard to the
/// memory dependencies. Required for EVL-based veectorization, where this
/// value is used as the upper bound of the safe AVL.
std::optional<unsigned> MaxSafeElements;

/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
Expand Down Expand Up @@ -3858,6 +3870,8 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(

auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
if (!Legal->isSafeForAnyVectorWidth())
this->MaxSafeElements = MaxSafeElements;

LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
<< ".\n");
Expand Down Expand Up @@ -8686,8 +8700,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanTransforms::optimize(*Plan);
// TODO: try to put it close to addActiveLaneMask().
// Discard the plan if it is not EVL-compatible
if (CM.foldTailWithEVL() &&
!VPlanTransforms::tryAddExplicitVectorLength(*Plan))
if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
*Plan, CM.getMaxSafeElements()))
break;
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
Expand Down
9 changes: 6 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
return true;
switch (Opcode) {
case Instruction::ICmp:
case Instruction::Select:
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnCount:
case VPInstruction::CalculateTripCountMinusVF:
Expand Down Expand Up @@ -440,9 +441,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
return Builder.CreateCmp(getPredicate(), A, B, Name);
}
case Instruction::Select: {
Value *Cond = State.get(getOperand(0));
Value *Op1 = State.get(getOperand(1));
Value *Op2 = State.get(getOperand(2));
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
return Builder.CreateSelect(Cond, Op1, Op2, Name);
}
case VPInstruction::ActiveLaneMask: {
Expand Down Expand Up @@ -742,6 +744,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
default:
return false;
case Instruction::ICmp:
case Instruction::Select:
case VPInstruction::PtrAdd:
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
Expand Down
38 changes: 30 additions & 8 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1439,7 +1439,24 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
/// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
/// ...
///
bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
/// If MaxSafeElements is provided, the function adds the following recipes:
/// vector.ph:
/// ...
///
/// vector.body:
/// ...
/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
/// [ %NextEVLIV, %vector.body ]
/// %AVL = sub original TC, %EVLPhi
/// %cmp = cmp ult %AVL, MaxSafeElements
/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
/// ...
/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
/// ...
///
bool VPlanTransforms::tryAddExplicitVectorLength(
VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
// The transform updates all users of inductions to work based on EVL, instead
// of the VF directly. At the moment, widened inductions cannot be updated, so
Expand All @@ -1464,14 +1481,19 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
// Create the ExplicitVectorLengthPhi recipe in the main loop.
auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
EVLPhi->insertAfter(CanonicalIVPHI);
// TODO: Add support for MaxSafeDist for correct loop emission.
VPBuilder Builder(Header, Header->getFirstNonPhi());
// Compute original TC - IV as the AVL (application vector length).
auto *AVL = new VPInstruction(Instruction::Sub, {Plan.getTripCount(), EVLPhi},
DebugLoc(), "avl");
AVL->insertBefore(*Header, Header->getFirstNonPhi());
auto *VPEVL =
new VPInstruction(VPInstruction::ExplicitVectorLength, AVL, DebugLoc());
VPEVL->insertAfter(AVL);
VPValue *AVL = Builder.createNaryOp(
Instruction::Sub, {Plan.getTripCount(), EVLPhi}, DebugLoc(), "avl");
if (MaxSafeElements) {
// Support for MaxSafeDist for correct loop emission.
VPValue *AVLSafe = Plan.getOrAddLiveIn(
ConstantInt::get(CanonicalIVPHI->getScalarType(), *MaxSafeElements));
VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
}
auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
DebugLoc());

auto *CanonicalIVIncrement =
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanTransforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,9 @@ struct VPlanTransforms {
/// VPCanonicalIVPHIRecipe is only used to control the loop after
/// this transformation.
/// \returns true if the transformation succeeds, or false if it doesn't.
static bool tryAddExplicitVectorLength(VPlan &Plan);
static bool
tryAddExplicitVectorLength(VPlan &Plan,
const std::optional<unsigned> &MaxEVLSafeElements);

// For each Interleave Group in \p InterleaveGroups replace the Recipes
// widening its memory instructions with a single VPInterleaveRecipe at its
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -422,28 +422,37 @@ define void @no_high_lmul_or_interleave(ptr %p) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[TMP7]], 1
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 3002, [[TMP1]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], <i64 3001, i64 3001, i64 3001, i64 3001>
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 3002, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = icmp ult i64 [[AVL]], 1024
; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP9]], i64 [[AVL]], i64 1024
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 1, i1 true)
; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[EVL_BASED_IV]], 0
; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[TMP3]], i32 32, <4 x i1> [[TMP1]], <4 x i64> poison)
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr align 32 [[TMP3]], <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 1024
; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP4]]
; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
; IF-EVL-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_MASKED_LOAD]], ptr [[TMP6]], i32 32, <4 x i1> [[TMP1]])
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3004
; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; IF-EVL-NEXT: call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP6]], <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3004, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[LOOP:%.*]]
; IF-EVL: loop:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
Expand Down

0 comments on commit f148d57

Please sign in to comment.