Skip to content

Commit

Permalink
[SLP]Keep externally used GEPs as GEPs, if possible instead of extrac…
Browse files Browse the repository at this point in the history
…telement.

If the vectorized GEP instruction can be still kept as a scalar GEP,
better to keep it as scalar instead of extractelement. In many cases it
is more profitable.

Metric: size..text

Program                                                                          size..text
                                                                                 results     results0    diff
                        test-suite :: SingleSource/Benchmarks/Misc/oourafft.test    18911.00    19695.00  4.1%
                   test-suite :: SingleSource/Benchmarks/Misc-C++-EH/spirit.test    59987.00    60707.00  1.2%
       test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test  1392209.00  1392753.00  0.0%
        test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test  1392209.00  1392753.00  0.0%
           test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test  1087996.00  1088236.00  0.0%
                         test-suite :: MultiSource/Benchmarks/Bullet/bullet.test   309310.00   309342.00  0.0%
             test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test   664661.00   664693.00  0.0%
            test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test   664661.00   664693.00  0.0%
        test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12354636.00 12354908.00  0.0%
                  test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test  1152748.00  1152716.00 -0.0%
                       test-suite :: MultiSource/Applications/oggenc/oggenc.test   191787.00   191771.00 -0.0%
                     test-suite :: SingleSource/UnitTests/matrix-types-spec.test   480796.00   480476.00 -0.1%

Misc/oourafft - Extra code gets vectorized
Misc-C++-EH/spirit - same
CFP2017speed/638.imagick_s
CFP2017rate/538.imagick_r - same, extra code gets vectorized
CINT2006/400.perlbench - some extra 4 x ptr stores vectorized
Bullet/bullet - extra 4 x ptr store vectorized
CINT2017rate/525.x264_r
CINT2017speed/625.x264_s - same
CFP2017rate/526.blender_r - extra 8 x float stores (several), some extra
4 x ptr stores
CFP2006/453.povray - 2 x double loads/stores replaced by 4 x double
loads/stores
Applications/oggenc - extra code is vectorized
UnitTests/matrix-types-spec - extra code gets vectorized

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: llvm#88877
  • Loading branch information
alexey-bataev authored Apr 16, 2024
1 parent 5422eb0 commit c7657cf
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 25 deletions.
50 changes: 49 additions & 1 deletion llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1134,6 +1134,7 @@ class BoUpSLP {
MustGather.clear();
EntryToLastInstruction.clear();
ExternalUses.clear();
ExternalUsesAsGEPs.clear();
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
Expand Down Expand Up @@ -3154,6 +3155,10 @@ class BoUpSLP {
/// after vectorization.
UserList ExternalUses;

/// A list of GEPs which can be reaplced by scalar GEPs instead of
/// extractelement instructions.
SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;

/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;

Expand Down Expand Up @@ -5541,6 +5546,7 @@ void BoUpSLP::buildExternalUses(
<< FoundLane << " from " << *Scalar << ".\n");
ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
continue;
}
for (User *U : Scalar->users()) {
LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
Expand Down Expand Up @@ -9925,6 +9931,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
SmallVector<APInt> DemandedElts;
SmallDenseSet<Value *, 4> UsedInserts;
DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
Expand Down Expand Up @@ -10033,12 +10040,40 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
}
}
}
// Leave the GEPs as is, they are free in most cases and better to keep them
// as GEPs.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
if (!ValueToExtUses) {
ValueToExtUses.emplace();
for_each(enumerate(ExternalUses), [&](const auto &P) {
ValueToExtUses->try_emplace(P.value().Scalar, P.index());
});
}
// Can use original GEP, if no operands vectorized or they are marked as
// externally used already.
bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
if (!getTreeEntry(V))
return true;
auto It = ValueToExtUses->find(V);
if (It != ValueToExtUses->end()) {
// Replace all uses to avoid compiler crash.
ExternalUses[It->second].User = nullptr;
return true;
}
return false;
});
if (CanBeUsedAsGEP) {
ExtractCost += TTI->getInstructionCost(GEP, CostKind);
ExternalUsesAsGEPs.insert(EU.Scalar);
continue;
}
}

// If we plan to rewrite the tree in a smaller type, we will need to sign
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
auto It = MinBWs.find(getTreeEntry(EU.Scalar));
if (It != MinBWs.end()) {
auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
Expand Down Expand Up @@ -13161,6 +13196,8 @@ Value *BoUpSLP::vectorizeTree(
if (Scalar->getType() != Vec->getType()) {
Value *Ex = nullptr;
Value *ExV = nullptr;
auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
auto It = ScalarToEEs.find(Scalar);
if (It != ScalarToEEs.end()) {
// No need to emit many extracts, just move the only one in the
Expand All @@ -13186,6 +13223,15 @@ Value *BoUpSLP::vectorizeTree(
if (const TreeEntry *ETE = getTreeEntry(V))
V = ETE->VectorizedValue;
Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
} else if (ReplaceGEP) {
// Leave the GEPs as is, they are free in most cases and better to
// keep them as GEPs.
auto *CloneGEP = GEP->clone();
CloneGEP->insertBefore(*Builder.GetInsertBlock(),
Builder.GetInsertPoint());
if (GEP->hasName())
CloneGEP->takeName(GEP);
Ex = CloneGEP;
} else {
Ex = Builder.CreateExtractElement(Vec, Lane);
}
Expand Down Expand Up @@ -13224,6 +13270,8 @@ Value *BoUpSLP::vectorizeTree(
assert((ExternallyUsedValues.count(Scalar) ||
any_of(Scalar->users(),
[&](llvm::User *U) {
if (ExternalUsesAsGEPs.contains(U))
return true;
TreeEntry *UseEntry = getTreeEntry(U);
return UseEntry &&
(UseEntry->State == TreeEntry::Vectorize ||
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ define i32 @fn1() {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 11, i64 56>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP4]], align 8
; CHECK-NEXT: ret i32 undef
Expand Down Expand Up @@ -92,7 +92,7 @@ define void @externally_used_ptrs() {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 56, i64 11>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP5]], [[TMP6]]
Expand Down
17 changes: 9 additions & 8 deletions llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,33 +13,34 @@ define dso_local i32 @g() local_unnamed_addr {
; CHECK: while.body:
; CHECK-NEXT: [[C_022:%.*]] = phi ptr [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ]
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 1
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[C_022]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP9]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> <i64 1, i64 1>
; CHECK-NEXT: switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [
; CHECK-NEXT: i32 2, label [[SW_BB:%.*]]
; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]]
; CHECK-NEXT: i32 2, label [[SW_BB:%.*]]
; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]]
; CHECK-NEXT: ]
; CHECK: sw.bb:
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> <i64 2, i64 2>
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 1
; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2
; CHECK-NEXT: store i32 [[TMP7]], ptr [[INCDEC_PTR1]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> <i64 2, i64 2>
; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]]
; CHECK: sw.bb6:
; CHECK-NEXT: [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1
; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[INCDEC_PTR]] to i64
; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> <i64 2, i64 2>
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 0
; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP13]], align 4
; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]]
; CHECK: while.body.backedge:
; CHECK-NEXT: [[C_022_BE]] = phi ptr [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ]
; CHECK-NEXT: [[C_022_BE]] = phi ptr [ [[INCDEC_PTR1]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ]
; CHECK-NEXT: [[TMP14]] = phi <2 x ptr> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ]
; CHECK-NEXT: br label [[WHILE_BODY]]
; CHECK: while.end:
Expand Down
19 changes: 8 additions & 11 deletions llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,14 @@ define void @test(ptr %r, ptr %p, ptr %q) #0 {

define void @test2(ptr %a, ptr %b) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 1
; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 2
; CHECK-NEXT: [[I1:%.*]] = ptrtoint ptr [[A1]] to i64
; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 3
; CHECK-NEXT: [[I2:%.*]] = ptrtoint ptr [[B3]] to i64
; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A1]], align 8
; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8
; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[I1]], [[V1]]
; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[I2]], [[V2]]
; CHECK-NEXT: store i64 [[ADD1]], ptr [[A1]], align 8
; CHECK-NEXT: store i64 [[ADD2]], ptr [[A2]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A:%.*]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[B:%.*]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 1, i64 3>
; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[A1]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]
; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[A1]], align 8
; CHECK-NEXT: ret void
;
%a1 = getelementptr inbounds i64, ptr %a, i64 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 {
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP0:%.*]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr addrspace(1)> [[TMP3]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <4 x ptr addrspace(1)> [[TMP4]], <4 x i64> <i64 8, i64 12, i64 28, i64 24>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr addrspace(1)> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 8
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 2, i32 1, i32 2, i32 1>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ define void @allocas(ptr %a, ptr %b, ptr %c) {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[V1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[V2]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[V1]], i32 1
; CHECK-NEXT: store ptr [[TMP4]], ptr [[A:%.*]], align 8
; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[B:%.*]], align 8
; CHECK-NEXT: ret void
Expand Down Expand Up @@ -127,7 +127,7 @@ define void @stacksave2(ptr %a, ptr %b, ptr %c) {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[V1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[V2]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[V1]], i32 1
; CHECK-NEXT: store ptr [[TMP4]], ptr [[A:%.*]], align 8
; CHECK-NEXT: call void @use(ptr inalloca(i8) [[V2]]) #[[ATTR5:[0-9]+]]
; CHECK-NEXT: call void @llvm.stackrestore.p0(ptr [[STACK]])
Expand Down

0 comments on commit c7657cf

Please sign in to comment.