Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SLP]Support revectorization of the previously vectorized scalars #133091

Conversation

alexey-bataev
Copy link
Member

If a scalar instruction is marked for vectorization in the tree, it
cannot, in general, be vectorized as part of another node in the same
tree. This may prevent some potentially profitable vectorization
opportunities, since some nodes end up being buildvector/gather nodes,
which add to the total cost.
This patch allows revectorization of previously vectorized scalars.

Created using spr 1.3.5
@llvmbot
Copy link
Member

llvmbot commented Mar 26, 2025

@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

If a scalar instruction is marked for vectorization in the tree, it
cannot, in general, be vectorized as part of another node in the same
tree. This may prevent some potentially profitable vectorization
opportunities, since some nodes end up being buildvector/gather nodes,
which add to the total cost.
This patch allows revectorization of previously vectorized scalars.


Patch is 34.05 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133091.diff

8 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+116-72)
  • (modified) llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll (+8-10)
  • (modified) llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll (+10-11)
  • (modified) llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll (+10-11)
  • (modified) llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll (+2-2)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll (+5-5)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll (+3-4)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll (+2-2)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 59a0408abbf04..af3f61b98590d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4078,11 +4078,6 @@ class BoUpSLP {
         if (isa<PoisonValue>(V))
           continue;
         auto It = ScalarToTreeEntries.find(V);
-        assert(
-            (It == ScalarToTreeEntries.end() ||
-             (It->getSecond().size() == 1 && It->getSecond().front() == Last) ||
-             doesNotNeedToBeScheduled(V)) &&
-            "Scalar already in tree!");
         if (It == ScalarToTreeEntries.end()) {
           ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
           (void)Processed.insert(V);
@@ -4342,6 +4337,9 @@ class BoUpSLP {
   private:
     /// Used for getting a "good" final ordering of instructions.
     int SchedulingPriority = 0;
+    /// True if this instruction (or bundle) is scheduled (or considered as
+    /// scheduled in the dry-run).
+    bool IsScheduled = false;
     /// The kind of the ScheduleEntity.
     const Kind K = Kind::ScheduleData;
 
@@ -4355,6 +4353,10 @@ class BoUpSLP {
         return SD->isReady();
       return cast<ScheduleBundle>(this)->isReady();
     }
+    /// Gets/sets if the bundle is scheduled.
+    bool isScheduled() const { return IsScheduled; }
+    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
+
     static bool classof(const ScheduleEntity *) { return true; }
   };
 
@@ -4427,10 +4429,6 @@ class BoUpSLP {
       IsScheduled = false;
     }
 
-    /// Gets/sets if the bundle is scheduled.
-    bool isScheduled() const { return IsScheduled; }
-    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
-
     /// Gets the number of unscheduled dependencies.
     int getUnscheduledDeps() const { return UnscheduledDeps; }
     /// Gets the number of dependencies.
@@ -4505,10 +4503,6 @@ class BoUpSLP {
     /// for scheduling.
     /// Note that this is negative as long as Dependencies is not calculated.
     int UnscheduledDeps = InvalidDeps;
-
-    /// True if this instruction is scheduled (or considered as scheduled in the
-    /// dry-run).
-    bool IsScheduled = false;
   };
 
 #ifndef NDEBUG
@@ -4553,11 +4547,6 @@ class BoUpSLP {
       }
     }
 
-    bool isScheduled() const {
-      return all_of(Bundle,
-                    [](const ScheduleData *SD) { return SD->isScheduled(); });
-    }
-
     /// Returns the number of unscheduled dependencies in the bundle.
     int unscheduledDepsInBundle() const {
       assert(*this && "bundle must not be empty");
@@ -4814,12 +4803,19 @@ class BoUpSLP {
         ProcessBundleMember(SD, nullptr);
       } else {
         ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
-        for_each(Bundle.getBundle(), [](ScheduleData *SD) {
-          SD->setScheduled(/*Scheduled=*/true);
-        });
+        Bundle.setScheduled(/*Scheduled=*/true);
         LLVM_DEBUG(dbgs() << "SLP:   schedule " << Bundle << "\n");
-        for (ScheduleData *SD : Bundle.getBundle())
-          ProcessBundleMember(SD, &Bundle);
+        for (ScheduleData *SD : Bundle.getBundle()) {
+          if (ArrayRef<ScheduleBundle *> SDBundles =
+                  getScheduleBundles(SD->getInst());
+              !SDBundles.empty() &&
+              all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
+                return SDBundle->isScheduled();
+              })) {
+            SD->setScheduled(/*Scheduled=*/true);
+            ProcessBundleMember(SD, &Bundle);
+          }
+        }
       }
     }
 
@@ -4851,7 +4847,8 @@ class BoUpSLP {
       }
 
       for (const ScheduleEntity *Bundle : ReadyInsts) {
-        assert(Bundle->isReady() && "item in ready list not ready?");
+        assert((Bundle->isReady() || Bundle->isScheduled()) &&
+               "item in ready list not ready?");
         (void)Bundle;
       }
     }
@@ -7553,7 +7550,7 @@ void BoUpSLP::buildExternalUses(
           // Some in-tree scalars will remain as scalar in vectorized
           // instructions. If that is the case, the one in FoundLane will
           // be used.
-          if (any_of(UseEntries, [&](TreeEntry *UseEntry) {
+          if (all_of(UseEntries, [&](TreeEntry *UseEntry) {
                 return UseEntry->State == TreeEntry::ScatterVectorize ||
                        !doesInTreeUserNeedToExtract(
                            Scalar, getRootEntryInstruction(*UseEntry), TLI,
@@ -9567,14 +9564,34 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   // We now know that this is a vector of instructions of the same type from
   // the same block.
 
-  // Check that none of the instructions in the bundle are already in the tree.
-  for (Value *V : VL) {
-    if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
-        doesNotNeedToBeScheduled(V))
-      continue;
-    if (isVectorized(V)) {
-      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
-                        << ") is already in tree.\n");
+  // Check that none of the instructions in the bundle are already in the tree
+  // and the node may be not profitable for the vectorization as the small
+  // alternate node.
+  if (S && S.isAltShuffle()) {
+    unsigned NumVectorized = 0;
+    unsigned NumExtracted = 0;
+    for (Value *V : VL) {
+      auto *I = dyn_cast<Instruction>(V);
+      if (!I || doesNotNeedToBeScheduled(V) ||
+          all_of(I->operands(), [&](const Use &U) {
+            return isa<ExtractElementInst>(U.get());
+          }))
+        continue;
+      if (isVectorized(V))
+        ++NumVectorized;
+      else if (!V->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
+        ++NumExtracted;
+    }
+    constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
+    if (NumVectorized > 0 &&
+        (VL.size() == 2 ||
+         (getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
+                         getWidenedType(VL.front()->getType(), VL.size()), {},
+                         Kind) +
+              NumExtracted >
+          VL.size() - NumVectorized))) {
+      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
+                           "node is not profitable.\n");
       if (TryToFindDuplicates(S)) {
         auto Invalid = ScheduleBundle::invalid();
         newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
@@ -9663,8 +9680,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 #endif
   if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
     LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
-    assert((!BS.getScheduleData(VL0) || BS.getScheduleBundles(VL0).empty()) &&
-           "tryScheduleBundle should not create bundle on failure");
     // Last chance to try to vectorize alternate node.
     if (S.isAltShuffle() && ReuseShuffleIndices.empty() &&
         TrySplitNode(SmallNodeSize, S))
@@ -12443,7 +12458,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   SmallBitVector UsedScalars(Sz, false);
   for (unsigned I = 0; I < Sz; ++I) {
     if (isa<Instruction>(UniqueValues[I]) &&
-        is_contained(getTreeEntries(UniqueValues[I]), E))
+        getTreeEntries(UniqueValues[I]).front() == E)
       continue;
     UsedScalars.set(I);
   }
@@ -13971,6 +13986,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
   for (ExternalUser &EU : ExternalUses) {
     ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
   }
+  SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
   for (ExternalUser &EU : ExternalUses) {
     // Uses by ephemeral values are free (because the ephemeral value will be
     // removed prior to code generation, and so the extraction will be
@@ -13978,6 +13994,12 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
     if (EphValues.count(EU.User))
       continue;
 
+    // Check if the scalar for the given user or all users is accounted already.
+    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
+        (EU.User &&
+         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
+      continue;
+
     // Used in unreachable blocks or in EH pads (rarely executed) or is
     // terminated with unreachable instruction.
     if (BasicBlock *UserParent =
@@ -14680,10 +14702,16 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       PHINode *UserPHI = UseEI.UserTE->State != TreeEntry::SplitVectorize
                              ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
                              : nullptr;
-      const Instruction *InsertPt =
+      Instruction *InsertPt =
           UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                   : &getLastInstructionInBundle(UseEI.UserTE);
       if (TEInsertPt == InsertPt) {
+        // If the schedulable insertion point is used in multiple entries - just
+        // exit, no known ordering at this point, available only after real
+        // scheduling.
+        if (!doesNotNeedToBeScheduled(InsertPt) &&
+            (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
+          continue;
         // If the users are the PHI nodes with the same incoming blocks - skip.
         if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
             TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
@@ -15395,19 +15423,29 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
 
   // Set the insert point to the beginning of the basic block if the entry
   // should not be scheduled.
-  const auto *It = BlocksSchedules.find(BB);
-  auto IsNotScheduledEntry = [&](const TreeEntry *E) {
+  auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
     if (E->isGather())
-      return false;
+      return nullptr;
     // Found previously that the instruction do not need to be scheduled.
-    return It == BlocksSchedules.end() || all_of(E->Scalars, [&](Value *V) {
-             if (!isa<Instruction>(V))
-               return true;
-             return It->second->getScheduleBundles(V).empty();
-           });
+    const auto *It = BlocksSchedules.find(BB);
+    if (It == BlocksSchedules.end())
+      return nullptr;
+    for (Value *V : E->Scalars) {
+      auto *I = dyn_cast<Instruction>(V);
+      if (!I || isa<PHINode>(I) || doesNotNeedToBeScheduled(I))
+        continue;
+      ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
+      if (Bundles.empty())
+        continue;
+      const auto *It = find_if(
+          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
+      if (It != Bundles.end())
+        return *It;
+    }
+    return nullptr;
   };
-  if (IsNotScheduledEntry(E) ||
-      (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
+  const ScheduleBundle *Bundle = FindScheduleBundle(E);
+  if (!E->isGather() && !Bundle) {
     if ((E->getOpcode() == Instruction::GetElementPtr &&
          any_of(E->Scalars,
                 [](Value *V) {
@@ -15433,19 +15471,10 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
   // scheduled, and the last instruction is VL.back(). So we start with
   // VL.back() and iterate over schedule data until we reach the end of the
   // bundle. The end of the bundle is marked by null ScheduleData.
-  if (It != BlocksSchedules.end() && !E->isGather()) {
-    Value *V = E->isOneOf(E->Scalars.back());
-    if (doesNotNeedToBeScheduled(V))
-      V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
-    if (ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(V);
-        !Bundles.empty()) {
-      const auto *It = find_if(
-          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
-      assert(It != Bundles.end() && "Failed to find bundle");
-      Res = (*It)->getBundle().back()->getInst();
-      return *Res;
-    }
-    assert(E->getOpcode() == Instruction::PHI && "Expected PHI");
+  if (Bundle) {
+    assert(!E->isGather() && "Gathered instructions should not be scheduled");
+    Res = Bundle->getBundle().back()->getInst();
+    return *Res;
   }
 
   // LastInst can still be null at this point if there's either not an entry
@@ -17851,13 +17880,13 @@ Value *BoUpSLP::vectorizeTree(
     const ExtraValueToDebugLocsMap &ExternallyUsedValues,
     Instruction *ReductionRoot,
     ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
+  // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
+  // need to rebuild it.
+  EntryToLastInstruction.clear();
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
     scheduleBlock(BSIter.second.get());
   }
-  // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
-  // need to rebuild it.
-  EntryToLastInstruction.clear();
 
   if (ReductionRoot)
     Builder.SetInsertPoint(ReductionRoot->getParent(),
@@ -18696,18 +18725,15 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     // dependencies. As soon as the bundle is "ready" it means that there are no
     // cyclic dependencies and we can schedule it. Note that's important that we
     // don't "schedule" the bundle yet.
-    SmallPtrSet<const ScheduleBundle *, 16> Visited;
     while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
            !ReadyInsts.empty()) {
       ScheduleEntity *Picked = ReadyInsts.pop_back_val();
-      const auto *PickedBundle = dyn_cast<ScheduleBundle>(Picked);
-      if (PickedBundle && !Visited.insert(PickedBundle).second) {
-        assert(PickedBundle->isScheduled() && "bundle must be scheduled");
+      if (Picked->isScheduled()) {
+        if (Picked == &Bundle)
+          break;
         continue;
       }
-      assert((PickedBundle ? PickedBundle->isReady()
-                           : cast<ScheduleData>(Picked)->isReady()) &&
-             "must be ready to schedule");
+      assert(Picked->isReady() && "must be ready to schedule");
       schedule(Picked, ReadyInsts);
       if (Picked == &Bundle)
         break;
@@ -18761,8 +18787,16 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
   TryScheduleBundleImpl(ReSchedule, Bundle);
   if (!Bundle.isReady()) {
     for (ScheduleData *BD : Bundle.getBundle()) {
-      if (BD->isReady())
-        ReadyInsts.insert(BD);
+      if (BD->isReady()) {
+        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
+        if (Bundles.empty()) {
+          ReadyInsts.insert(BD);
+          continue;
+        }
+        for (ScheduleBundle *B : Bundles)
+          if (B->isReady())
+            ReadyInsts.insert(B);
+      }
     }
     ScheduledBundlesList.pop_back();
     for (Value *V : VL) {
@@ -19093,6 +19127,11 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
       SD->setScheduled(/*Scheduled=*/false);
       SD->resetUnscheduledDeps();
     }
+    for (ScheduleBundle *Bundle: getScheduleBundles(I)) {
+      assert(isInSchedulingRegion(*Bundle) &&
+             "ScheduleBundle not in scheduling region");
+      Bundle->setScheduled(/*Scheduled=*/false);
+    }
   }
   ReadyInsts.clear();
 }
@@ -19151,6 +19190,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   Instruction *LastScheduledInst = BS->ScheduleEnd;
 
   // Do the "real" scheduling.
+  SmallPtrSet<Instruction *, 16> Scheduled;
   while (!ReadyInsts.empty()) {
     auto *Picked = *ReadyInsts.begin();
     ReadyInsts.erase(ReadyInsts.begin());
@@ -19160,10 +19200,14 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
     if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
       for (const ScheduleData *BundleMember : Bundle->getBundle()) {
         Instruction *PickedInst = BundleMember->getInst();
+        if (!Scheduled.insert(PickedInst).second)
+          continue;
         if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
           PickedInst->moveAfter(LastScheduledInst->getPrevNode());
         LastScheduledInst = PickedInst;
       }
+      EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
+                                         LastScheduledInst);
     } else {
       auto *SD = cast<ScheduleData>(Picked);
       Instruction *PickedInst = SD->getInst();
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
index 3cab4a4da3f8e..fcd3bfc3f323a 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
@@ -39,28 +39,26 @@ define void @test() {
 ; CHECK:       [[BB77]]:
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP12]], float [[I70]], i32 0
-; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <2 x float> poison, float [[I68]], i32 0
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <2 x float> [[TMP30]], float [[I66]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x float> [[TMP14]], float [[I68]], i32 2
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x float> [[TMP19]], float [[I66]], i32 3
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <8 x float> [[TMP16]], float [[I67]], i32 6
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[I69]], i32 7
 ; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x float> [[TMP39]], <16 x float> [[TMP25]], <16 x i32> <i32 poison, i32 poison, i32 2, i32 3, i32 18, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 poison, i32 poison>
 ; CHECK-NEXT:    br label %[[BB78:.*]]
 ; CHECK:       [[BB78]]:
 ; CHECK-NEXT:    [[TMP15:%.*]] = phi <8 x float> [ [[TMP17]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = phi <2 x float> [ [[TMP31]], %[[BB77]] ], [ [[TMP37:%.*]], %[[BB78]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = phi <8 x float> [ [[TMP21]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> <i32 0, i32 3, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 2, i32 6, i32 2, i32 3, i32 0, i32 7, i32 6, i32 6>
 ; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 2, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 17, i32 2, i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> [[TMP22]], <16 x i32> <i32 0, i32 1, i32...
[truncated]

Copy link

github-actions bot commented Mar 26, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Created using spr 1.3.5
Copy link
Contributor

@gbossu gbossu left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as the previous time, I'm still discovering the code base, so do what you want with my comments ;)

@@ -7553,7 +7550,7 @@ void BoUpSLP::buildExternalUses(
// Some in-tree scalars will remain as scalar in vectorized
// instructions. If that is the case, the one in FoundLane will
// be used.
if (any_of(UseEntries, [&](TreeEntry *UseEntry) {
if (all_of(UseEntries, [&](TreeEntry *UseEntry) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checking: After changing any_of to all_of, is the comment higher up still up to date?

Copy link
Member Author

@alexey-bataev alexey-bataev Mar 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the comment is still correct. It is not related to the check itself; it just describes the common logic, which is unchanged. Previously UseEntries could contain only one entry; now there may be multiple.

Created using spr 1.3.5
@alexey-bataev
Copy link
Member Author

Ping!

@hiraditya
Copy link
Collaborator

LGTM, unless @RKSimon has any more feedback.

Created using spr 1.3.5
Created using spr 1.3.5
continue;
if (isVectorized(V))
Vectorized.clearBit(Idx);
else if (!V->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Super-Nit: Given that you have auto *I = dyn_cast<Instruction>(V);, maybe use I everywhere, instead of sometimes V and sometimes I?

Created using spr 1.3.5
Copy link
Contributor

@gbossu gbossu left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For what it's worth, as a SLP Vectorizer beginner, the change looks okay to me.

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM - cheers

Created using spr 1.3.5
@alexey-bataev alexey-bataev merged commit 0e3049c into main Apr 1, 2025
5 of 9 checks passed
@alexey-bataev alexey-bataev deleted the users/alexey-bataev/spr/slpsupport-revectorization-of-the-previously-vectorized-scalars branch April 1, 2025 18:30
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Apr 1, 2025
…scalars

If the scalar instructions is marked for the vectorization in the tree,
it cannot be vectorized as part of the another node in the same tree, in
general. It may prevent some potentially profitable vectorization
opportunities, since some nodes end up being buildvector/gather nodes,
which add to the total cost.
Patch allows revectorization of the previously vectorized scalars.

Reviewers: hiraditya, RKSimon

Reviewed By: RKSimon, hiraditya

Pull Request: llvm/llvm-project#133091
Ankur-0429 pushed a commit to Ankur-0429/llvm-project that referenced this pull request Apr 2, 2025
If the scalar instructions is marked for the vectorization in the tree,
it cannot be vectorized as part of the another node in the same tree, in
general. It may prevent some potentially profitable vectorization
opportunities, since some nodes end up being buildvector/gather nodes,
which add to the total cost.
Patch allows revectorization of the previously vectorized scalars.

Reviewers: hiraditya, RKSimon

Reviewed By: RKSimon, hiraditya

Pull Request: llvm#133091
@alexfh
Copy link
Contributor

alexfh commented Apr 7, 2025

Hi @alexey-bataev, we see Clang crashes after 0e3049c. I'm working on a reduce test case, but please revert in the meantime.

@alexfh
Copy link
Contributor

alexfh commented Apr 7, 2025

Test case (also in https://gcc.godbolt.org/z/nsjaqfoT8):

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Reduced reproducer (also at https://gcc.godbolt.org/z/nsjaqfoT8) for a
; crash observed after commit 0e3049c (PR #133091): compiling this module
; with `clang -O3` crashed in the "Machine Instruction Scheduler" pass
; (see the stack dump below in this thread).
; The function performs many i8 load / `or` / store operations at fixed
; offsets from %0, forming a pattern the SLP vectorizer attempts to
; (re)vectorize. Do not simplify further: the exact sequence of offsets,
; alignments, and reused values is what triggers the crash.
define void @_f(ptr %0, i8 %1, i8 %2) #0 {
  ; Broadcast stores of (%1 | 1) to consecutive bytes 13520..13527
  ; (byte 13523 gets (%2 | 1) instead).
  %4 = or i8 %1, 1
  %5 = getelementptr i8, ptr %0, i64 13520
  store i8 %4, ptr %5, align 4
  %6 = getelementptr i8, ptr %0, i64 13521
  store i8 %4, ptr %6, align 1
  %7 = getelementptr i8, ptr %0, i64 13522
  store i8 %4, ptr %7, align 2
  %8 = load i8, ptr %0, align 2
  %9 = or i8 %2, 1
  %10 = getelementptr i8, ptr %0, i64 13523
  store i8 %9, ptr %10, align 1
  %11 = getelementptr i8, ptr %0, i64 13524
  store i8 %4, ptr %11, align 4
  %12 = getelementptr i8, ptr %0, i64 13525
  store i8 %4, ptr %12, align 1
  %13 = getelementptr i8, ptr %0, i64 13526
  store i8 %4, ptr %13, align 2
  %14 = getelementptr i8, ptr %0, i64 13445
  %15 = load i8, ptr %14, align 1
  %16 = getelementptr i8, ptr %0, i64 13527
  store i8 %4, ptr %16, align 1
  ; Load bytes 13428..13435, or each with 1, store to 13528..13535.
  %17 = getelementptr i8, ptr %0, i64 13428
  %18 = load i8, ptr %17, align 4
  %19 = or i8 %18, 1
  %20 = getelementptr i8, ptr %0, i64 13528
  store i8 %19, ptr %20, align 4
  %21 = getelementptr i8, ptr %0, i64 13429
  %22 = load i8, ptr %21, align 1
  %23 = or i8 %22, 1
  %24 = getelementptr i8, ptr %0, i64 13529
  store i8 %23, ptr %24, align 1
  %25 = getelementptr i8, ptr %0, i64 13430
  %26 = load i8, ptr %25, align 2
  %27 = or i8 %26, 1
  %28 = getelementptr i8, ptr %0, i64 13530
  store i8 %27, ptr %28, align 2
  %29 = getelementptr i8, ptr %0, i64 13431
  %30 = load i8, ptr %29, align 1
  %31 = or i8 %30, 1
  %32 = getelementptr i8, ptr %0, i64 13531
  store i8 %31, ptr %32, align 1
  %33 = getelementptr i8, ptr %0, i64 13432
  %34 = load i8, ptr %33, align 4
  %35 = or i8 %34, 1
  %36 = getelementptr i8, ptr %0, i64 13532
  store i8 %35, ptr %36, align 4
  %37 = getelementptr i8, ptr %0, i64 13433
  %38 = load i8, ptr %37, align 1
  %39 = or i8 %38, 1
  %40 = getelementptr i8, ptr %0, i64 13533
  store i8 %39, ptr %40, align 1
  %41 = getelementptr i8, ptr %0, i64 13434
  %42 = load i8, ptr %41, align 2
  %43 = or i8 %42, 1
  %44 = getelementptr i8, ptr %0, i64 13534
  store i8 %43, ptr %44, align 2
  %45 = getelementptr i8, ptr %0, i64 13435
  %46 = load i8, ptr %45, align 1
  %47 = or i8 %46, 1
  %48 = getelementptr i8, ptr %0, i64 13535
  store i8 %47, ptr %48, align 1
  ; Pairwise ors of previously loaded bytes stored to 13536..13551,
  ; deliberately in a shuffled (non-consecutive) order. The reuse of
  ; earlier loads (%18, %22, %26, ... ) across these stores is the part
  ; that exercises revectorization of already-vectorized scalars.
  %49 = getelementptr i8, ptr %0, i64 13442
  %50 = load i8, ptr %49, align 2
  %51 = or i8 %50, %15
  %52 = getelementptr i8, ptr %0, i64 13550
  store i8 %51, ptr %52, align 2
  %53 = or i8 %42, %50
  %54 = getelementptr i8, ptr %0, i64 13542
  store i8 %53, ptr %54, align 2
  %55 = getelementptr i8, ptr %0, i64 13438
  %56 = load i8, ptr %55, align 2
  %57 = or i8 %56, %8
  %58 = getelementptr i8, ptr %0, i64 13546
  store i8 %57, ptr %58, align 2
  %59 = or i8 %26, %56
  %60 = getelementptr i8, ptr %0, i64 13538
  store i8 %59, ptr %60, align 2
  %61 = getelementptr i8, ptr %0, i64 13440
  %62 = load i8, ptr %61, align 4
  %63 = or i8 %62, %15
  %64 = getelementptr i8, ptr %0, i64 13548
  store i8 %63, ptr %64, align 4
  %65 = or i8 %34, %62
  %66 = getelementptr i8, ptr %0, i64 13540
  store i8 %65, ptr %66, align 4
  %67 = getelementptr i8, ptr %0, i64 13436
  %68 = load i8, ptr %67, align 4
  %69 = getelementptr i8, ptr %0, i64 13444
  %70 = load i8, ptr %69, align 4
  %71 = or i8 %68, %70
  %72 = getelementptr i8, ptr %0, i64 13544
  store i8 %71, ptr %72, align 4
  %73 = or i8 %18, %68
  %74 = getelementptr i8, ptr %0, i64 13536
  store i8 %73, ptr %74, align 4
  %75 = getelementptr i8, ptr %0, i64 13443
  %76 = load i8, ptr %75, align 1
  %77 = or i8 %76, %15
  %78 = getelementptr i8, ptr %0, i64 13551
  store i8 %77, ptr %78, align 1
  %79 = or i8 %46, %76
  %80 = getelementptr i8, ptr %0, i64 13543
  store i8 %79, ptr %80, align 1
  %81 = getelementptr i8, ptr %0, i64 13439
  %82 = load i8, ptr %81, align 1
  %83 = or i8 %82, %15
  %84 = getelementptr i8, ptr %0, i64 13547
  store i8 %83, ptr %84, align 1
  %85 = or i8 %30, %82
  %86 = getelementptr i8, ptr %0, i64 13539
  store i8 %85, ptr %86, align 1
  %87 = getelementptr i8, ptr %0, i64 13441
  %88 = load i8, ptr %87, align 1
  %89 = or i8 %88, %15
  %90 = getelementptr i8, ptr %0, i64 13549
  store i8 %89, ptr %90, align 1
  %91 = or i8 %38, %88
  %92 = getelementptr i8, ptr %0, i64 13541
  store i8 %91, ptr %92, align 1
  %93 = getelementptr i8, ptr %0, i64 13437
  %94 = load i8, ptr %93, align 1
  %95 = load i8, ptr %0, align 1
  %96 = or i8 %94, %95
  %97 = getelementptr i8, ptr %0, i64 13545
  store i8 %96, ptr %97, align 1
  %98 = or i8 %22, %94
  %99 = getelementptr i8, ptr %0, i64 13537
  store i8 %98, ptr %99, align 1
  ret void
}

attributes #0 = { "prefer-vector-width"="128" "target-features"="+avx" }
Stack dump:
0.      Program arguments: ./clang-bad -O3 -c -o /dev/null reduced.ll
1.      Code generation
2.      Running pass 'Function Pass Manager' on module 'reduced.ll'.
3.      Running pass 'Machine Instruction Scheduler' on function '@_f'

@alexfh
Copy link
Contributor

alexfh commented Apr 7, 2025

Sent #134604. I'll land it as soon as the premerge checks finish.

@alexey-bataev
Copy link
Member Author

Test case (also in https://gcc.godbolt.org/z/nsjaqfoT8):

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @_f(ptr %0, i8 %1, i8 %2) #0 {
  %4 = or i8 %1, 1
  %5 = getelementptr i8, ptr %0, i64 13520
  store i8 %4, ptr %5, align 4
  %6 = getelementptr i8, ptr %0, i64 13521
  store i8 %4, ptr %6, align 1
  %7 = getelementptr i8, ptr %0, i64 13522
  store i8 %4, ptr %7, align 2
  %8 = load i8, ptr %0, align 2
  %9 = or i8 %2, 1
  %10 = getelementptr i8, ptr %0, i64 13523
  store i8 %9, ptr %10, align 1
  %11 = getelementptr i8, ptr %0, i64 13524
  store i8 %4, ptr %11, align 4
  %12 = getelementptr i8, ptr %0, i64 13525
  store i8 %4, ptr %12, align 1
  %13 = getelementptr i8, ptr %0, i64 13526
  store i8 %4, ptr %13, align 2
  %14 = getelementptr i8, ptr %0, i64 13445
  %15 = load i8, ptr %14, align 1
  %16 = getelementptr i8, ptr %0, i64 13527
  store i8 %4, ptr %16, align 1
  %17 = getelementptr i8, ptr %0, i64 13428
  %18 = load i8, ptr %17, align 4
  %19 = or i8 %18, 1
  %20 = getelementptr i8, ptr %0, i64 13528
  store i8 %19, ptr %20, align 4
  %21 = getelementptr i8, ptr %0, i64 13429
  %22 = load i8, ptr %21, align 1
  %23 = or i8 %22, 1
  %24 = getelementptr i8, ptr %0, i64 13529
  store i8 %23, ptr %24, align 1
  %25 = getelementptr i8, ptr %0, i64 13430
  %26 = load i8, ptr %25, align 2
  %27 = or i8 %26, 1
  %28 = getelementptr i8, ptr %0, i64 13530
  store i8 %27, ptr %28, align 2
  %29 = getelementptr i8, ptr %0, i64 13431
  %30 = load i8, ptr %29, align 1
  %31 = or i8 %30, 1
  %32 = getelementptr i8, ptr %0, i64 13531
  store i8 %31, ptr %32, align 1
  %33 = getelementptr i8, ptr %0, i64 13432
  %34 = load i8, ptr %33, align 4
  %35 = or i8 %34, 1
  %36 = getelementptr i8, ptr %0, i64 13532
  store i8 %35, ptr %36, align 4
  %37 = getelementptr i8, ptr %0, i64 13433
  %38 = load i8, ptr %37, align 1
  %39 = or i8 %38, 1
  %40 = getelementptr i8, ptr %0, i64 13533
  store i8 %39, ptr %40, align 1
  %41 = getelementptr i8, ptr %0, i64 13434
  %42 = load i8, ptr %41, align 2
  %43 = or i8 %42, 1
  %44 = getelementptr i8, ptr %0, i64 13534
  store i8 %43, ptr %44, align 2
  %45 = getelementptr i8, ptr %0, i64 13435
  %46 = load i8, ptr %45, align 1
  %47 = or i8 %46, 1
  %48 = getelementptr i8, ptr %0, i64 13535
  store i8 %47, ptr %48, align 1
  %49 = getelementptr i8, ptr %0, i64 13442
  %50 = load i8, ptr %49, align 2
  %51 = or i8 %50, %15
  %52 = getelementptr i8, ptr %0, i64 13550
  store i8 %51, ptr %52, align 2
  %53 = or i8 %42, %50
  %54 = getelementptr i8, ptr %0, i64 13542
  store i8 %53, ptr %54, align 2
  %55 = getelementptr i8, ptr %0, i64 13438
  %56 = load i8, ptr %55, align 2
  %57 = or i8 %56, %8
  %58 = getelementptr i8, ptr %0, i64 13546
  store i8 %57, ptr %58, align 2
  %59 = or i8 %26, %56
  %60 = getelementptr i8, ptr %0, i64 13538
  store i8 %59, ptr %60, align 2
  %61 = getelementptr i8, ptr %0, i64 13440
  %62 = load i8, ptr %61, align 4
  %63 = or i8 %62, %15
  %64 = getelementptr i8, ptr %0, i64 13548
  store i8 %63, ptr %64, align 4
  %65 = or i8 %34, %62
  %66 = getelementptr i8, ptr %0, i64 13540
  store i8 %65, ptr %66, align 4
  %67 = getelementptr i8, ptr %0, i64 13436
  %68 = load i8, ptr %67, align 4
  %69 = getelementptr i8, ptr %0, i64 13444
  %70 = load i8, ptr %69, align 4
  %71 = or i8 %68, %70
  %72 = getelementptr i8, ptr %0, i64 13544
  store i8 %71, ptr %72, align 4
  %73 = or i8 %18, %68
  %74 = getelementptr i8, ptr %0, i64 13536
  store i8 %73, ptr %74, align 4
  %75 = getelementptr i8, ptr %0, i64 13443
  %76 = load i8, ptr %75, align 1
  %77 = or i8 %76, %15
  %78 = getelementptr i8, ptr %0, i64 13551
  store i8 %77, ptr %78, align 1
  %79 = or i8 %46, %76
  %80 = getelementptr i8, ptr %0, i64 13543
  store i8 %79, ptr %80, align 1
  %81 = getelementptr i8, ptr %0, i64 13439
  %82 = load i8, ptr %81, align 1
  %83 = or i8 %82, %15
  %84 = getelementptr i8, ptr %0, i64 13547
  store i8 %83, ptr %84, align 1
  %85 = or i8 %30, %82
  %86 = getelementptr i8, ptr %0, i64 13539
  store i8 %85, ptr %86, align 1
  %87 = getelementptr i8, ptr %0, i64 13441
  %88 = load i8, ptr %87, align 1
  %89 = or i8 %88, %15
  %90 = getelementptr i8, ptr %0, i64 13549
  store i8 %89, ptr %90, align 1
  %91 = or i8 %38, %88
  %92 = getelementptr i8, ptr %0, i64 13541
  store i8 %91, ptr %92, align 1
  %93 = getelementptr i8, ptr %0, i64 13437
  %94 = load i8, ptr %93, align 1
  %95 = load i8, ptr %0, align 1
  %96 = or i8 %94, %95
  %97 = getelementptr i8, ptr %0, i64 13545
  store i8 %96, ptr %97, align 1
  %98 = or i8 %22, %94
  %99 = getelementptr i8, ptr %0, i64 13537
  store i8 %98, ptr %99, align 1
  ret void
}

attributes #0 = { "prefer-vector-width"="128" "target-features"="+avx" }
Stack dump:
0.      Program arguments: ./clang-bad -O3 -c -o /dev/null reduced.ll
1.      Code generation
2.      Running pass 'Function Pass Manager' on module 'reduced.ll'.
3.      Running pass 'Machine Instruction Scheduler' on function '@_f'

Test case (also in https://gcc.godbolt.org/z/nsjaqfoT8):

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @_f(ptr %0, i8 %1, i8 %2) #0 {
  %4 = or i8 %1, 1
  %5 = getelementptr i8, ptr %0, i64 13520
  store i8 %4, ptr %5, align 4
  %6 = getelementptr i8, ptr %0, i64 13521
  store i8 %4, ptr %6, align 1
  %7 = getelementptr i8, ptr %0, i64 13522
  store i8 %4, ptr %7, align 2
  %8 = load i8, ptr %0, align 2
  %9 = or i8 %2, 1
  %10 = getelementptr i8, ptr %0, i64 13523
  store i8 %9, ptr %10, align 1
  %11 = getelementptr i8, ptr %0, i64 13524
  store i8 %4, ptr %11, align 4
  %12 = getelementptr i8, ptr %0, i64 13525
  store i8 %4, ptr %12, align 1
  %13 = getelementptr i8, ptr %0, i64 13526
  store i8 %4, ptr %13, align 2
  %14 = getelementptr i8, ptr %0, i64 13445
  %15 = load i8, ptr %14, align 1
  %16 = getelementptr i8, ptr %0, i64 13527
  store i8 %4, ptr %16, align 1
  %17 = getelementptr i8, ptr %0, i64 13428
  %18 = load i8, ptr %17, align 4
  %19 = or i8 %18, 1
  %20 = getelementptr i8, ptr %0, i64 13528
  store i8 %19, ptr %20, align 4
  %21 = getelementptr i8, ptr %0, i64 13429
  %22 = load i8, ptr %21, align 1
  %23 = or i8 %22, 1
  %24 = getelementptr i8, ptr %0, i64 13529
  store i8 %23, ptr %24, align 1
  %25 = getelementptr i8, ptr %0, i64 13430
  %26 = load i8, ptr %25, align 2
  %27 = or i8 %26, 1
  %28 = getelementptr i8, ptr %0, i64 13530
  store i8 %27, ptr %28, align 2
  %29 = getelementptr i8, ptr %0, i64 13431
  %30 = load i8, ptr %29, align 1
  %31 = or i8 %30, 1
  %32 = getelementptr i8, ptr %0, i64 13531
  store i8 %31, ptr %32, align 1
  %33 = getelementptr i8, ptr %0, i64 13432
  %34 = load i8, ptr %33, align 4
  %35 = or i8 %34, 1
  %36 = getelementptr i8, ptr %0, i64 13532
  store i8 %35, ptr %36, align 4
  %37 = getelementptr i8, ptr %0, i64 13433
  %38 = load i8, ptr %37, align 1
  %39 = or i8 %38, 1
  %40 = getelementptr i8, ptr %0, i64 13533
  store i8 %39, ptr %40, align 1
  %41 = getelementptr i8, ptr %0, i64 13434
  %42 = load i8, ptr %41, align 2
  %43 = or i8 %42, 1
  %44 = getelementptr i8, ptr %0, i64 13534
  store i8 %43, ptr %44, align 2
  %45 = getelementptr i8, ptr %0, i64 13435
  %46 = load i8, ptr %45, align 1
  %47 = or i8 %46, 1
  %48 = getelementptr i8, ptr %0, i64 13535
  store i8 %47, ptr %48, align 1
  %49 = getelementptr i8, ptr %0, i64 13442
  %50 = load i8, ptr %49, align 2
  %51 = or i8 %50, %15
  %52 = getelementptr i8, ptr %0, i64 13550
  store i8 %51, ptr %52, align 2
  %53 = or i8 %42, %50
  %54 = getelementptr i8, ptr %0, i64 13542
  store i8 %53, ptr %54, align 2
  %55 = getelementptr i8, ptr %0, i64 13438
  %56 = load i8, ptr %55, align 2
  %57 = or i8 %56, %8
  %58 = getelementptr i8, ptr %0, i64 13546
  store i8 %57, ptr %58, align 2
  %59 = or i8 %26, %56
  %60 = getelementptr i8, ptr %0, i64 13538
  store i8 %59, ptr %60, align 2
  %61 = getelementptr i8, ptr %0, i64 13440
  %62 = load i8, ptr %61, align 4
  %63 = or i8 %62, %15
  %64 = getelementptr i8, ptr %0, i64 13548
  store i8 %63, ptr %64, align 4
  %65 = or i8 %34, %62
  %66 = getelementptr i8, ptr %0, i64 13540
  store i8 %65, ptr %66, align 4
  %67 = getelementptr i8, ptr %0, i64 13436
  %68 = load i8, ptr %67, align 4
  %69 = getelementptr i8, ptr %0, i64 13444
  %70 = load i8, ptr %69, align 4
  %71 = or i8 %68, %70
  %72 = getelementptr i8, ptr %0, i64 13544
  store i8 %71, ptr %72, align 4
  %73 = or i8 %18, %68
  %74 = getelementptr i8, ptr %0, i64 13536
  store i8 %73, ptr %74, align 4
  %75 = getelementptr i8, ptr %0, i64 13443
  %76 = load i8, ptr %75, align 1
  %77 = or i8 %76, %15
  %78 = getelementptr i8, ptr %0, i64 13551
  store i8 %77, ptr %78, align 1
  %79 = or i8 %46, %76
  %80 = getelementptr i8, ptr %0, i64 13543
  store i8 %79, ptr %80, align 1
  %81 = getelementptr i8, ptr %0, i64 13439
  %82 = load i8, ptr %81, align 1
  %83 = or i8 %82, %15
  %84 = getelementptr i8, ptr %0, i64 13547
  store i8 %83, ptr %84, align 1
  %85 = or i8 %30, %82
  %86 = getelementptr i8, ptr %0, i64 13539
  store i8 %85, ptr %86, align 1
  %87 = getelementptr i8, ptr %0, i64 13441
  %88 = load i8, ptr %87, align 1
  %89 = or i8 %88, %15
  %90 = getelementptr i8, ptr %0, i64 13549
  store i8 %89, ptr %90, align 1
  %91 = or i8 %38, %88
  %92 = getelementptr i8, ptr %0, i64 13541
  store i8 %91, ptr %92, align 1
  %93 = getelementptr i8, ptr %0, i64 13437
  %94 = load i8, ptr %93, align 1
  %95 = load i8, ptr %0, align 1
  %96 = or i8 %94, %95
  %97 = getelementptr i8, ptr %0, i64 13545
  store i8 %96, ptr %97, align 1
  %98 = or i8 %22, %94
  %99 = getelementptr i8, ptr %0, i64 13537
  store i8 %98, ptr %99, align 1
  ret void
}

attributes #0 = { "prefer-vector-width"="128" "target-features"="+avx" }
Stack dump:
0.      Program arguments: ./clang-bad -O3 -c -o /dev/null reduced.ll
1.      Code generation
2.      Running pass 'Function Pass Manager' on module 'reduced.ll'.
3.      Running pass 'Machine Instruction Scheduler' on function '@_f'

Thanks for the reproducer!

alexey-bataev added a commit that referenced this pull request Apr 7, 2025
…n SplitVectorize nodes

If the last instruction in the SplitVectorize node is vectorized and
scheduled as part of some bundles, the SplitVectorize node might be
placed in the wrong order, leading to a compiler crash. Need to check if
the vectorized node has a vector value and place the SplitVectorize node after the vector instruction to prevent a compiler crash.

Fixes issue reported in #133091 (comment)
@alexey-bataev
Copy link
Member Author

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @_f(ptr %0, i8 %1, i8 %2) #0 {
%4 = or i8 %1, 1
%5 = getelementptr i8, ptr %0, i64 13520
store i8 %4, ptr %5, align 4
%6 = getelementptr i8, ptr %0, i64 13521
store i8 %4, ptr %6, align 1
%7 = getelementptr i8, ptr %0, i64 13522
store i8 %4, ptr %7, align 2
%8 = load i8, ptr %0, align 2
%9 = or i8 %2, 1
%10 = getelementptr i8, ptr %0, i64 13523
store i8 %9, ptr %10, align 1
%11 = getelementptr i8, ptr %0, i64 13524
store i8 %4, ptr %11, align 4
%12 = getelementptr i8, ptr %0, i64 13525
store i8 %4, ptr %12, align 1
%13 = getelementptr i8, ptr %0, i64 13526
store i8 %4, ptr %13, align 2
%14 = getelementptr i8, ptr %0, i64 13445
%15 = load i8, ptr %14, align 1
%16 = getelementptr i8, ptr %0, i64 13527
store i8 %4, ptr %16, align 1
%17 = getelementptr i8, ptr %0, i64 13428
%18 = load i8, ptr %17, align 4
%19 = or i8 %18, 1
%20 = getelementptr i8, ptr %0, i64 13528
store i8 %19, ptr %20, align 4
%21 = getelementptr i8, ptr %0, i64 13429
%22 = load i8, ptr %21, align 1
%23 = or i8 %22, 1
%24 = getelementptr i8, ptr %0, i64 13529
store i8 %23, ptr %24, align 1
%25 = getelementptr i8, ptr %0, i64 13430
%26 = load i8, ptr %25, align 2
%27 = or i8 %26, 1
%28 = getelementptr i8, ptr %0, i64 13530
store i8 %27, ptr %28, align 2
%29 = getelementptr i8, ptr %0, i64 13431
%30 = load i8, ptr %29, align 1
%31 = or i8 %30, 1
%32 = getelementptr i8, ptr %0, i64 13531
store i8 %31, ptr %32, align 1
%33 = getelementptr i8, ptr %0, i64 13432
%34 = load i8, ptr %33, align 4
%35 = or i8 %34, 1
%36 = getelementptr i8, ptr %0, i64 13532
store i8 %35, ptr %36, align 4
%37 = getelementptr i8, ptr %0, i64 13433
%38 = load i8, ptr %37, align 1
%39 = or i8 %38, 1
%40 = getelementptr i8, ptr %0, i64 13533
store i8 %39, ptr %40, align 1
%41 = getelementptr i8, ptr %0, i64 13434
%42 = load i8, ptr %41, align 2
%43 = or i8 %42, 1
%44 = getelementptr i8, ptr %0, i64 13534
store i8 %43, ptr %44, align 2
%45 = getelementptr i8, ptr %0, i64 13435
%46 = load i8, ptr %45, align 1
%47 = or i8 %46, 1
%48 = getelementptr i8, ptr %0, i64 13535
store i8 %47, ptr %48, align 1
%49 = getelementptr i8, ptr %0, i64 13442
%50 = load i8, ptr %49, align 2
%51 = or i8 %50, %15
%52 = getelementptr i8, ptr %0, i64 13550
store i8 %51, ptr %52, align 2
%53 = or i8 %42, %50
%54 = getelementptr i8, ptr %0, i64 13542
store i8 %53, ptr %54, align 2
%55 = getelementptr i8, ptr %0, i64 13438
%56 = load i8, ptr %55, align 2
%57 = or i8 %56, %8
%58 = getelementptr i8, ptr %0, i64 13546
store i8 %57, ptr %58, align 2
%59 = or i8 %26, %56
%60 = getelementptr i8, ptr %0, i64 13538
store i8 %59, ptr %60, align 2
%61 = getelementptr i8, ptr %0, i64 13440
%62 = load i8, ptr %61, align 4
%63 = or i8 %62, %15
%64 = getelementptr i8, ptr %0, i64 13548
store i8 %63, ptr %64, align 4
%65 = or i8 %34, %62
%66 = getelementptr i8, ptr %0, i64 13540
store i8 %65, ptr %66, align 4
%67 = getelementptr i8, ptr %0, i64 13436
%68 = load i8, ptr %67, align 4
%69 = getelementptr i8, ptr %0, i64 13444
%70 = load i8, ptr %69, align 4
%71 = or i8 %68, %70
%72 = getelementptr i8, ptr %0, i64 13544
store i8 %71, ptr %72, align 4
%73 = or i8 %18, %68
%74 = getelementptr i8, ptr %0, i64 13536
store i8 %73, ptr %74, align 4
%75 = getelementptr i8, ptr %0, i64 13443
%76 = load i8, ptr %75, align 1
%77 = or i8 %76, %15
%78 = getelementptr i8, ptr %0, i64 13551
store i8 %77, ptr %78, align 1
%79 = or i8 %46, %76
%80 = getelementptr i8, ptr %0, i64 13543
store i8 %79, ptr %80, align 1
%81 = getelementptr i8, ptr %0, i64 13439
%82 = load i8, ptr %81, align 1
%83 = or i8 %82, %15
%84 = getelementptr i8, ptr %0, i64 13547
store i8 %83, ptr %84, align 1
%85 = or i8 %30, %82
%86 = getelementptr i8, ptr %0, i64 13539
store i8 %85, ptr %86, align 1
%87 = getelementptr i8, ptr %0, i64 13441
%88 = load i8, ptr %87, align 1
%89 = or i8 %88, %15
%90 = getelementptr i8, ptr %0, i64 13549
store i8 %89, ptr %90, align 1
%91 = or i8 %38, %88
%92 = getelementptr i8, ptr %0, i64 13541
store i8 %91, ptr %92, align 1
%93 = getelementptr i8, ptr %0, i64 13437
%94 = load i8, ptr %93, align 1
%95 = load i8, ptr %0, align 1
%96 = or i8 %94, %95
%97 = getelementptr i8, ptr %0, i64 13545
store i8 %96, ptr %97, align 1
%98 = or i8 %22, %94
%99 = getelementptr i8, ptr %0, i64 13537
store i8 %98, ptr %99, align 1
ret void
}

attributes #0 = { "prefer-vector-width"="128" "target-features"="+avx" }

Test case (also in https://gcc.godbolt.org/z/nsjaqfoT8):

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @_f(ptr %0, i8 %1, i8 %2) #0 {
  %4 = or i8 %1, 1
  %5 = getelementptr i8, ptr %0, i64 13520
  store i8 %4, ptr %5, align 4
  %6 = getelementptr i8, ptr %0, i64 13521
  store i8 %4, ptr %6, align 1
  %7 = getelementptr i8, ptr %0, i64 13522
  store i8 %4, ptr %7, align 2
  %8 = load i8, ptr %0, align 2
  %9 = or i8 %2, 1
  %10 = getelementptr i8, ptr %0, i64 13523
  store i8 %9, ptr %10, align 1
  %11 = getelementptr i8, ptr %0, i64 13524
  store i8 %4, ptr %11, align 4
  %12 = getelementptr i8, ptr %0, i64 13525
  store i8 %4, ptr %12, align 1
  %13 = getelementptr i8, ptr %0, i64 13526
  store i8 %4, ptr %13, align 2
  %14 = getelementptr i8, ptr %0, i64 13445
  %15 = load i8, ptr %14, align 1
  %16 = getelementptr i8, ptr %0, i64 13527
  store i8 %4, ptr %16, align 1
  %17 = getelementptr i8, ptr %0, i64 13428
  %18 = load i8, ptr %17, align 4
  %19 = or i8 %18, 1
  %20 = getelementptr i8, ptr %0, i64 13528
  store i8 %19, ptr %20, align 4
  %21 = getelementptr i8, ptr %0, i64 13429
  %22 = load i8, ptr %21, align 1
  %23 = or i8 %22, 1
  %24 = getelementptr i8, ptr %0, i64 13529
  store i8 %23, ptr %24, align 1
  %25 = getelementptr i8, ptr %0, i64 13430
  %26 = load i8, ptr %25, align 2
  %27 = or i8 %26, 1
  %28 = getelementptr i8, ptr %0, i64 13530
  store i8 %27, ptr %28, align 2
  %29 = getelementptr i8, ptr %0, i64 13431
  %30 = load i8, ptr %29, align 1
  %31 = or i8 %30, 1
  %32 = getelementptr i8, ptr %0, i64 13531
  store i8 %31, ptr %32, align 1
  %33 = getelementptr i8, ptr %0, i64 13432
  %34 = load i8, ptr %33, align 4
  %35 = or i8 %34, 1
  %36 = getelementptr i8, ptr %0, i64 13532
  store i8 %35, ptr %36, align 4
  %37 = getelementptr i8, ptr %0, i64 13433
  %38 = load i8, ptr %37, align 1
  %39 = or i8 %38, 1
  %40 = getelementptr i8, ptr %0, i64 13533
  store i8 %39, ptr %40, align 1
  %41 = getelementptr i8, ptr %0, i64 13434
  %42 = load i8, ptr %41, align 2
  %43 = or i8 %42, 1
  %44 = getelementptr i8, ptr %0, i64 13534
  store i8 %43, ptr %44, align 2
  %45 = getelementptr i8, ptr %0, i64 13435
  %46 = load i8, ptr %45, align 1
  %47 = or i8 %46, 1
  %48 = getelementptr i8, ptr %0, i64 13535
  store i8 %47, ptr %48, align 1
  %49 = getelementptr i8, ptr %0, i64 13442
  %50 = load i8, ptr %49, align 2
  %51 = or i8 %50, %15
  %52 = getelementptr i8, ptr %0, i64 13550
  store i8 %51, ptr %52, align 2
  %53 = or i8 %42, %50
  %54 = getelementptr i8, ptr %0, i64 13542
  store i8 %53, ptr %54, align 2
  %55 = getelementptr i8, ptr %0, i64 13438
  %56 = load i8, ptr %55, align 2
  %57 = or i8 %56, %8
  %58 = getelementptr i8, ptr %0, i64 13546
  store i8 %57, ptr %58, align 2
  %59 = or i8 %26, %56
  %60 = getelementptr i8, ptr %0, i64 13538
  store i8 %59, ptr %60, align 2
  %61 = getelementptr i8, ptr %0, i64 13440
  %62 = load i8, ptr %61, align 4
  %63 = or i8 %62, %15
  %64 = getelementptr i8, ptr %0, i64 13548
  store i8 %63, ptr %64, align 4
  %65 = or i8 %34, %62
  %66 = getelementptr i8, ptr %0, i64 13540
  store i8 %65, ptr %66, align 4
  %67 = getelementptr i8, ptr %0, i64 13436
  %68 = load i8, ptr %67, align 4
  %69 = getelementptr i8, ptr %0, i64 13444
  %70 = load i8, ptr %69, align 4
  %71 = or i8 %68, %70
  %72 = getelementptr i8, ptr %0, i64 13544
  store i8 %71, ptr %72, align 4
  %73 = or i8 %18, %68
  %74 = getelementptr i8, ptr %0, i64 13536
  store i8 %73, ptr %74, align 4
  %75 = getelementptr i8, ptr %0, i64 13443
  %76 = load i8, ptr %75, align 1
  %77 = or i8 %76, %15
  %78 = getelementptr i8, ptr %0, i64 13551
  store i8 %77, ptr %78, align 1
  %79 = or i8 %46, %76
  %80 = getelementptr i8, ptr %0, i64 13543
  store i8 %79, ptr %80, align 1
  %81 = getelementptr i8, ptr %0, i64 13439
  %82 = load i8, ptr %81, align 1
  %83 = or i8 %82, %15
  %84 = getelementptr i8, ptr %0, i64 13547
  store i8 %83, ptr %84, align 1
  %85 = or i8 %30, %82
  %86 = getelementptr i8, ptr %0, i64 13539
  store i8 %85, ptr %86, align 1
  %87 = getelementptr i8, ptr %0, i64 13441
  %88 = load i8, ptr %87, align 1
  %89 = or i8 %88, %15
  %90 = getelementptr i8, ptr %0, i64 13549
  store i8 %89, ptr %90, align 1
  %91 = or i8 %38, %88
  %92 = getelementptr i8, ptr %0, i64 13541
  store i8 %91, ptr %92, align 1
  %93 = getelementptr i8, ptr %0, i64 13437
  %94 = load i8, ptr %93, align 1
  %95 = load i8, ptr %0, align 1
  %96 = or i8 %94, %95
  %97 = getelementptr i8, ptr %0, i64 13545
  store i8 %96, ptr %97, align 1
  %98 = or i8 %22, %94
  %99 = getelementptr i8, ptr %0, i64 13537
  store i8 %98, ptr %99, align 1
  ret void
}

attributes #0 = { "prefer-vector-width"="128" "target-features"="+avx" }
Stack dump:
0.      Program arguments: ./clang-bad -O3 -c -o /dev/null reduced.ll
1.      Code generation
2.      Running pass 'Function Pass Manager' on module 'reduced.ll'.
3.      Running pass 'Machine Instruction Scheduler' on function '@_f'

Fixed in f413772

llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Apr 7, 2025
…struction in SplitVectorize nodes

If the last instruction in the SplitVectorize node is vectorized and
scheduled as part of some bundles, the SplitVectorize node might be
placed in the wrong order, leading to a compiler crash. Need to check if
the vectorized node has a vector value and place the SplitVectorize node after the vector instruction to prevent a compiler crash.

Fixes issue reported in llvm/llvm-project#133091 (comment)
@alexfh
Copy link
Contributor

alexfh commented Apr 7, 2025

Fixed in f413772

Thanks! Will try with the original failures.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

6 participants