From c7657cf7d1ee57f9cb9133164536591a1842b43c Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 16 Apr 2024 14:54:06 -0400 Subject: [PATCH] [SLP]Keep externally used GEPs as GEPs, if possible instead of extractelement. If the vectorized GEP instruction can be still kept as a scalar GEP, better to keep it as scalar instead of extractelement. In many cases it is more profitable. Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Misc/oourafft.test 18911.00 19695.00 4.1% test-suite :: SingleSource/Benchmarks/Misc-C++-EH/spirit.test 59987.00 60707.00 1.2% test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1392209.00 1392753.00 0.0% test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1392209.00 1392753.00 0.0% test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test 1087996.00 1088236.00 0.0% test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 309310.00 309342.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 664661.00 664693.00 0.0% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 664661.00 664693.00 0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12354636.00 12354908.00 0.0% test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 1152748.00 1152716.00 -0.0% test-suite :: MultiSource/Applications/oggenc/oggenc.test 191787.00 191771.00 -0.0% test-suite :: SingleSource/UnitTests/matrix-types-spec.test 480796.00 480476.00 -0.1% Misc/oourafft - Extra code gets vectorized Misc-C++-EH/spirit - same CFP2017speed/638.imagick_s CFP2017rate/538.imagick_r - same, extra code gets vectorized CINT2006/400.perlbench - some extra 4 x ptr stores vectorized Bullet/bullet - extra 4 x ptr store vectorized CINT2017rate/525.x264_r CINT2017speed/625.x264_s - same CFP2017rate/526.blender_r - extra 8 x float stores (several), some extra 4 x ptr stores CFP2006/453.povray - 2 x double loads/stores replaced by 4 x double loads/stores Applications/oggenc - extra code is vectorized UnitTests/matrix-types-spec - extra code gets vectorized Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/88877 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 50 ++++++++++++++++++- .../SLPVectorizer/X86/extract_in_tree_user.ll | 4 +- .../SLPVectorizer/X86/geps-non-pow-2.ll | 17 ++++--- .../SLPVectorizer/X86/opaque-ptr.ll | 19 +++---- .../X86/reorder-reused-masked-gather2.ll | 2 +- .../SLPVectorizer/X86/stacksave-dependence.ll | 4 +- 6 files changed, 71 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0cd3ca32933ca2..7694627c3b0430 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1134,6 +1134,7 @@ class BoUpSLP { MustGather.clear(); EntryToLastInstruction.clear(); ExternalUses.clear(); + ExternalUsesAsGEPs.clear(); for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); BS->clear(); @@ -3154,6 +3155,10 @@ class BoUpSLP { /// after vectorization. UserList ExternalUses; + /// A list of GEPs which can be reaplced by scalar GEPs instead of + /// extractelement instructions. + SmallPtrSet ExternalUsesAsGEPs; + /// Values used only by @llvm.assume calls. SmallPtrSet EphValues; @@ -5541,6 +5546,7 @@ void BoUpSLP::buildExternalUses( << FoundLane << " from " << *Scalar << ".\n"); ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()); ExternalUses.emplace_back(Scalar, nullptr, FoundLane); + continue; } for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); @@ -9925,6 +9931,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { SmallVector DemandedElts; SmallDenseSet UsedInserts; DenseSet> VectorCasts; + std::optional> ValueToExtUses; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. if (!isa_and_nonnull(EU.User) && @@ -10033,12 +10040,40 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { } } } + // Leave the GEPs as is, they are free in most cases and better to keep them + // as GEPs. + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + if (auto *GEP = dyn_cast(EU.Scalar)) { + if (!ValueToExtUses) { + ValueToExtUses.emplace(); + for_each(enumerate(ExternalUses), [&](const auto &P) { + ValueToExtUses->try_emplace(P.value().Scalar, P.index()); + }); + } + // Can use original GEP, if no operands vectorized or they are marked as + // externally used already. + bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) { + if (!getTreeEntry(V)) + return true; + auto It = ValueToExtUses->find(V); + if (It != ValueToExtUses->end()) { + // Replace all uses to avoid compiler crash. + ExternalUses[It->second].User = nullptr; + return true; + } + return false; + }); + if (CanBeUsedAsGEP) { + ExtractCost += TTI->getInstructionCost(GEP, CostKind); + ExternalUsesAsGEPs.insert(EU.Scalar); + continue; + } + } // If we plan to rewrite the tree in a smaller type, we will need to sign // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; auto It = MinBWs.find(getTreeEntry(EU.Scalar)); if (It != MinBWs.end()) { auto *MinTy = IntegerType::get(F->getContext(), It->second.first); @@ -13161,6 +13196,8 @@ Value *BoUpSLP::vectorizeTree( if (Scalar->getType() != Vec->getType()) { Value *Ex = nullptr; Value *ExV = nullptr; + auto *GEP = dyn_cast(Scalar); + bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP); auto It = ScalarToEEs.find(Scalar); if (It != ScalarToEEs.end()) { // No need to emit many extracts, just move the only one in the @@ -13186,6 +13223,15 @@ Value *BoUpSLP::vectorizeTree( if (const TreeEntry *ETE = getTreeEntry(V)) V = ETE->VectorizedValue; Ex = Builder.CreateExtractElement(V, ES->getIndexOperand()); + } else if (ReplaceGEP) { + // Leave the GEPs as is, they are free in most cases and better to + // keep them as GEPs. + auto *CloneGEP = GEP->clone(); + CloneGEP->insertBefore(*Builder.GetInsertBlock(), + Builder.GetInsertPoint()); + if (GEP->hasName()) + CloneGEP->takeName(GEP); + Ex = CloneGEP; } else { Ex = Builder.CreateExtractElement(Vec, Lane); } @@ -13224,6 +13270,8 @@ Value *BoUpSLP::vectorizeTree( assert((ExternallyUsedValues.count(Scalar) || any_of(Scalar->users(), [&](llvm::User *U) { + if (ExternalUsesAsGEPs.contains(U)) + return true; TreeEntry *UseEntry = getTreeEntry(U); return UseEntry && (UseEntry->State == TreeEntry::Vectorize || diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll index 096f57d100a50f..c600d75ed1e8c4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -13,7 +13,7 @@ define i32 @fn1() { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> ; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP4]], align 8 ; CHECK-NEXT: ret i32 undef @@ -92,7 +92,7 @@ define void @externally_used_ptrs() { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP5]], [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll index aa679743583064..e459cd8c6955b0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -13,25 +13,26 @@ define dso_local i32 @g() local_unnamed_addr { ; CHECK: while.body: ; CHECK-NEXT: [[C_022:%.*]] = phi ptr [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[C_022]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP9]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> ; CHECK-NEXT: switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [ -; CHECK-NEXT: i32 2, label [[SW_BB:%.*]] -; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]] +; CHECK-NEXT: i32 2, label [[SW_BB:%.*]] +; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb: ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 1 -; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2 +; CHECK-NEXT: store i32 [[TMP7]], ptr [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: sw.bb6: ; CHECK-NEXT: [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2 +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[INCDEC_PTR]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> @@ -39,7 +40,7 @@ define dso_local i32 @g() local_unnamed_addr { ; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP13]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: while.body.backedge: -; CHECK-NEXT: [[C_022_BE]] = phi ptr [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] +; CHECK-NEXT: [[C_022_BE]] = phi ptr [ [[INCDEC_PTR1]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] ; CHECK-NEXT: [[TMP14]] = phi <2 x ptr> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ] ; CHECK-NEXT: br label [[WHILE_BODY]] ; CHECK: while.end: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll index 3801fa5c787b6d..c40be9690cce1d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll @@ -52,17 +52,14 @@ define void @test(ptr %r, ptr %p, ptr %q) #0 { define void @test2(ptr %a, ptr %b) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 1 -; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 2 -; CHECK-NEXT: [[I1:%.*]] = ptrtoint ptr [[A1]] to i64 -; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 3 -; CHECK-NEXT: [[I2:%.*]] = ptrtoint ptr [[B3]] to i64 -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A1]], align 8 -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8 -; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[I1]], [[V1]] -; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[I2]], [[V2]] -; CHECK-NEXT: store i64 [[ADD1]], ptr [[A1]], align 8 -; CHECK-NEXT: store i64 [[ADD2]], ptr [[A2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[B:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> +; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[A1]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[A1]], align 8 ; CHECK-NEXT: ret void ; %a1 = getelementptr inbounds i64, ptr %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll index ddc2a1b819041f..30f328293cdaa3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll @@ -9,7 +9,7 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP0:%.*]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr addrspace(1)> [[TMP3]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <4 x ptr addrspace(1)> [[TMP4]], <4 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr addrspace(1)> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 8 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 ; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> , <4 x float> poison) ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll b/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll index 0125e5fab089b2..e93c5244dfbe2c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll @@ -35,7 +35,7 @@ define void @allocas(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[V1]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[V2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[V1]], i32 1 ; CHECK-NEXT: store ptr [[TMP4]], ptr [[A:%.*]], align 8 ; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void @@ -127,7 +127,7 @@ define void @stacksave2(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[V1]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[V2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[V1]], i32 1 ; CHECK-NEXT: store ptr [[TMP4]], ptr [[A:%.*]], align 8 ; CHECK-NEXT: call void @use(ptr inalloca(i8) [[V2]]) #[[ATTR5:[0-9]+]] ; CHECK-NEXT: call void @llvm.stackrestore.p0(ptr [[STACK]])