[SLP]Initial support for (masked)loads + compress and (masked)interleaved #132099

Conversation

alexey-bataev
Member

Added initial support for (masked)loads + compress and
(masked)interleaved loads.

Created using spr 1.3.5
@llvmbot
Member

llvmbot commented Mar 19, 2025

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-vectorizers

Author: Alexey Bataev (alexey-bataev)

Changes

Added initial support for (masked)loads + compress and
(masked)interleaved loads.
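
As a rough illustration (a minimal sketch using LLVM's IRBuilder, not code from this patch; the helper name and the fixed offsets {0, 2, 4, 6} are assumptions made for the example), the idea is to replace several gap-strided scalar loads with one wide load of the whole accessed range, masked only when that range is not known to be dereferenceable, followed by a compressing shuffle that keeps just the requested lanes:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Sketch only: load elements at offsets {0, 2, 4, 6} from BasePtr as a single
// 7-element load plus a compressing shuffle. NeedMask models the case where
// the covered range is not provably dereferenceable.
static Value *emitLoadCompressSketch(IRBuilderBase &Builder, Value *BasePtr,
                                     Type *ScalarTy, Align Alignment,
                                     bool NeedMask) {
  auto *LoadVecTy = FixedVectorType::get(ScalarTy, 7);
  Value *Wide;
  if (NeedMask) {
    // Enable only the lanes that the scalar code actually reads.
    SmallVector<Constant *> MaskElts;
    for (int I = 0; I != 7; ++I)
      MaskElts.push_back(ConstantInt::getBool(
          Builder.getContext(), I == 0 || I == 2 || I == 4 || I == 6));
    Wide = Builder.CreateMaskedLoad(LoadVecTy, BasePtr, Alignment,
                                    ConstantVector::get(MaskElts),
                                    PoisonValue::get(LoadVecTy));
  } else {
    Wide = Builder.CreateAlignedLoad(LoadVecTy, BasePtr, Alignment);
  }
  // The compress mask extracts the loaded lanes in order.
  return Builder.CreateShuffleVector(Wide, ArrayRef<int>{0, 2, 4, 6});
}

The patch only picks this form when its cost model finds it cheaper than gathering or scalarizing; when the gap between lanes is a uniform interval, it additionally considers an interleaved load if the target reports that access type as legal.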


Patch is 109.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132099.diff

14 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+325-34)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll (+7-10)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll (+6-16)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll (+6-10)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll (+52-110)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll (+52-110)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll (+5-6)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll (+28-24)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll (+5-7)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll (+4-7)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll (+4-8)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll (+4-4)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll (+3-8)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll (+5-6)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1d9d80bd69def..f9905cc7c3307 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Analysis/DemandedBits.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -1378,7 +1379,8 @@ class BoUpSLP {
     Gather,
     Vectorize,
     ScatterVectorize,
-    StridedVectorize
+    StridedVectorize,
+    MaskedLoadCompressVectorize
   };
 
   using ValueList = SmallVector<Value *, 8>;
@@ -3378,6 +3380,7 @@ class BoUpSLP {
       Vectorize,         ///< The node is regularly vectorized.
       ScatterVectorize,  ///< Masked scatter/gather node.
       StridedVectorize,  ///< Strided loads (and stores)
+      MaskedLoadCompressVectorize, ///< Masked load with compress.
       NeedToGather,      ///< Gather/buildvector node.
       CombinedVectorize, ///< Vectorized node, combined with its user into more
                          ///< complex node like select/cmp to minmax, mul/add to
@@ -3604,6 +3607,9 @@ class BoUpSLP {
       case StridedVectorize:
         dbgs() << "StridedVectorize\n";
         break;
+      case MaskedLoadCompressVectorize:
+        dbgs() << "MaskedLoadCompressVectorize\n";
+        break;
       case NeedToGather:
         dbgs() << "NeedToGather\n";
         break;
@@ -4650,7 +4656,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
     if (Entry->isGather())
       return "color=red";
     if (Entry->State == TreeEntry::ScatterVectorize ||
-        Entry->State == TreeEntry::StridedVectorize)
+        Entry->State == TreeEntry::StridedVectorize ||
+        Entry->State == TreeEntry::MaskedLoadCompressVectorize)
       return "color=blue";
     return "";
   }
@@ -5214,6 +5221,145 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
   return Builder.CreateShuffleVector(Vec, Mask);
 }
 
+/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
+/// with \p Order.
+static void buildCompressMask(ArrayRef<Value *> PointerOps,
+                              ArrayRef<unsigned> Order, Type *ScalarTy,
+                              const DataLayout &DL, ScalarEvolution &SE,
+                              SmallVectorImpl<int> &CompressMask) {
+  const unsigned Sz = PointerOps.size();
+  CompressMask.assign(Sz, PoisonMaskElem);
+  // The first element is always set.
+  CompressMask[0] = 0;
+  Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
+  for (unsigned I : seq<unsigned>(1, Sz)) {
+    Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
+    unsigned Pos = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
+    CompressMask[I] = Pos;
+  }
+}
+
+/// Checks if the \p VL can be transformed to a (masked)load + compress or
+/// (masked) interleaved load.
+static bool isMaskedLoadCompress(
+    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
+    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
+    const DominatorTree &DT, const TargetLibraryInfo &TLI,
+    const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
+    unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
+    VectorType *&LoadVecTy) {
+  InterleaveFactor = 0;
+  Type *ScalarTy = VL.front()->getType();
+  const unsigned Sz = VL.size();
+  auto *VecTy = getWidenedType(ScalarTy, Sz);
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  // Check external uses.
+  for (const auto [I, V] : enumerate(VL)) {
+    if (AreAllUsersVectorized(V))
+      continue;
+    InstructionCost ExtractCost =
+        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I);
+    InstructionCost ScalarCost =
+        TTI.getInstructionCost(cast<Instruction>(V), CostKind);
+    if (ExtractCost <= ScalarCost)
+      return false;
+  }
+  Value *Ptr0;
+  Value *PtrN;
+  if (Order.empty()) {
+    Ptr0 = PointerOps.front();
+    PtrN = PointerOps.back();
+  } else {
+    Ptr0 = PointerOps[Order.front()];
+    PtrN = PointerOps[Order.back()];
+  }
+  std::optional<int> Diff =
+      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+  if (!Diff)
+    return false;
+  const unsigned MaxRegSize =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedValue();
+  // Check for very large distances between elements.
+  if (*Diff / Sz >= MaxRegSize / 8)
+    return false;
+  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
+  LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
+  auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
+  bool IsNotMasked = isSafeToLoadUnconditionally(
+      Ptr0, LoadVecTy, CommonAlignment, DL,
+      cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
+      &TLI);
+  // TODO: perform the analysis of each scalar load for better
+  // safe-load-unconditionally analysis.
+  buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
+  assert(CompressMask.size() >= 2 && "At least two elements are required");
+  IsMasked = !IsNotMasked;
+  auto [ScalarGEPCost, VectorGEPCost] =
+      getGEPCosts(TTI, PointerOps, PointerOps.front(),
+                  Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
+  // The cost of scalar loads.
+  InstructionCost ScalarLoadsCost =
+      std::accumulate(VL.begin(), VL.end(), InstructionCost(),
+                      [&](InstructionCost C, Value *V) {
+                        return C + TTI.getInstructionCost(cast<Instruction>(V),
+                                                          CostKind);
+                      }) +
+      ScalarGEPCost;
+  APInt DemandedElts = APInt::getAllOnes(Sz);
+  InstructionCost GatherCost =
+      getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+                               /*Insert=*/true,
+                               /*Extract=*/false, CostKind) +
+      ScalarLoadsCost;
+  InstructionCost LoadCost = 0;
+  if (IsNotMasked)
+    LoadCost =
+        TTI.getMemoryOpCost(Instruction::Load, LoadVecTy,
+                            IsNotMasked ? LI->getAlign() : CommonAlignment,
+                            LI->getPointerAddressSpace(), CostKind);
+  else
+    LoadCost =
+        TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
+                                  LI->getPointerAddressSpace(), CostKind);
+  SmallVector<int> Mask;
+  if (!Order.empty())
+    inversePermutation(Order, Mask);
+  if (int Interval = CompressMask[1] - CompressMask[0];
+      Interval > 0 && all_of(enumerate(CompressMask), [&](const auto &D) {
+        return static_cast<unsigned>(D.value()) == D.index() * Interval;
+      })) {
+    // Check for potential segmented(interleaved) loads.
+    if (TTI.isLegalInterleavedAccessType(
+            LoadVecTy, Interval, IsNotMasked ? LI->getAlign() : CommonAlignment,
+            LI->getPointerAddressSpace())) {
+      InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
+          Instruction::Load, LoadVecTy, Interval, std::nullopt,
+          IsNotMasked ? LI->getAlign() : CommonAlignment,
+          LI->getPointerAddressSpace(), CostKind, !IsNotMasked);
+      if (!Mask.empty())
+        InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
+                                            LoadVecTy, CompressMask, CostKind);
+      if (InterleavedCost < GatherCost) {
+        InterleaveFactor = Interval;
+        return true;
+      }
+    }
+  }
+  if (!Order.empty()) {
+    SmallVector<int> NewMask(Sz, PoisonMaskElem);
+    for (unsigned I : seq<unsigned>(Sz)) {
+      NewMask[I] = CompressMask[Mask[I]];
+    }
+    CompressMask.swap(NewMask);
+  }
+  InstructionCost CompressCost = ::getShuffleCost(
+      TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
+  InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
+  return TotalVecCost < GatherCost;
+}
+
 BoUpSLP::LoadsState
 BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                            SmallVectorImpl<unsigned> &Order,
@@ -5285,9 +5431,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
     // Check that the sorted loads are consecutive.
     if (static_cast<unsigned>(*Diff) == Sz - 1)
       return LoadsState::Vectorize;
-    if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
-        TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
-      return LoadsState::Gather;
     // Simple check if not a strided access - clear order.
     bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
     // Try to generate strided load node if:
@@ -5343,7 +5486,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
         }
       }
     }
+    [[maybe_unused]] bool IsMasked;
+    [[maybe_unused]] unsigned InterleaveFactor;
+    [[maybe_unused]] SmallVector<int> CompressMask;
+    [[maybe_unused]] VectorType *LoadVecTy;
+    if (isMaskedLoadCompress(
+            VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI,
+            [&](Value *V) {
+              return areAllUsersVectorized(cast<Instruction>(V),
+                                           UserIgnoreList);
+            },
+            IsMasked, InterleaveFactor, CompressMask, LoadVecTy))
+      return LoadsState::MaskedLoadCompressVectorize;
   }
+  if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
+      TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
+    return LoadsState::Gather;
   // Correctly identify compare the cost of loads + shuffles rather than
   // strided/masked gather loads. Returns true if vectorized + shuffles
   // representation is better than just gather.
@@ -5436,7 +5594,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
         }
         // If need the reorder - consider as high-cost masked gather for now.
         if ((LS == LoadsState::Vectorize ||
-             LS == LoadsState::StridedVectorize) &&
+             LS == LoadsState::StridedVectorize ||
+             LS == LoadsState::MaskedLoadCompressVectorize) &&
             !Order.empty() && !isReverseOrder(Order))
           LS = LoadsState::ScatterVectorize;
         States.push_back(LS);
@@ -5501,6 +5660,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                                                   CommonAlignment, CostKind) +
                        VectorGEPCost;
           break;
+        case LoadsState::MaskedLoadCompressVectorize:
+          VecLdCost += TTI.getMaskedMemoryOpCost(
+                           Instruction::Load, SubVecTy, CommonAlignment,
+                           LI0->getPointerAddressSpace(), CostKind) +
+                       VectorGEPCost +
+                       ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
+                                        {}, CostKind);
+          break;
         case LoadsState::ScatterVectorize:
           VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
                                                   LI0->getPointerOperand(),
@@ -5874,7 +6041,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
     return std::nullopt;
   if (TE.State == TreeEntry::SplitVectorize ||
       ((TE.State == TreeEntry::Vectorize ||
-        TE.State == TreeEntry::StridedVectorize) &&
+        TE.State == TreeEntry::StridedVectorize ||
+        TE.State == TreeEntry::MaskedLoadCompressVectorize) &&
        (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
         (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
     assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
@@ -6061,7 +6229,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
       OrdersType CurrentOrder;
       LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                          CurrentOrder, PointerOps);
-      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
+      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
+          Res == LoadsState::MaskedLoadCompressVectorize)
         return std::move(CurrentOrder);
     }
     // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
@@ -6301,7 +6470,8 @@ void BoUpSLP::reorderTopToBottom() {
       VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
-            TE->State == TreeEntry::SplitVectorize) ||
+            TE->State == TreeEntry::SplitVectorize ||
+            TE->State == TreeEntry::MaskedLoadCompressVectorize) ||
           !TE->ReuseShuffleIndices.empty())
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
       if (TE->State == TreeEntry::Vectorize &&
@@ -6478,7 +6648,8 @@ void BoUpSLP::reorderTopToBottom() {
       if ((TE->State == TreeEntry::SplitVectorize &&
            TE->ReuseShuffleIndices.empty()) ||
           ((TE->State == TreeEntry::Vectorize ||
-            TE->State == TreeEntry::StridedVectorize) &&
+            TE->State == TreeEntry::StridedVectorize ||
+            TE->State == TreeEntry::MaskedLoadCompressVectorize) &&
            (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
                 InsertElementInst>(TE->getMainOp()) ||
             (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
@@ -6526,6 +6697,8 @@ bool BoUpSLP::canReorderOperands(
           return OpData.first == I &&
                  (OpData.second->State == TreeEntry::Vectorize ||
                   OpData.second->State == TreeEntry::StridedVectorize ||
+                  OpData.second->State ==
+                      TreeEntry::MaskedLoadCompressVectorize ||
                   OpData.second->State == TreeEntry::SplitVectorize);
         }))
       continue;
@@ -6540,6 +6713,7 @@ bool BoUpSLP::canReorderOperands(
       // node, just reorder reuses mask.
       if (TE->State != TreeEntry::Vectorize &&
           TE->State != TreeEntry::StridedVectorize &&
+          TE->State != TreeEntry::MaskedLoadCompressVectorize &&
           TE->State != TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
         GatherOps.push_back(TE);
@@ -6550,6 +6724,7 @@ bool BoUpSLP::canReorderOperands(
                  [&Gather, UserTE, I](TreeEntry *TE) {
                    assert(TE->State != TreeEntry::Vectorize &&
                           TE->State != TreeEntry::StridedVectorize &&
+                          TE->State != TreeEntry::MaskedLoadCompressVectorize &&
                           TE->State != TreeEntry::SplitVectorize &&
                           "Only non-vectorized nodes are expected.");
                    if (TE->UserTreeIndex.UserTE == UserTE &&
@@ -6586,6 +6761,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
     if (TE->State != TreeEntry::Vectorize &&
         TE->State != TreeEntry::StridedVectorize &&
+        TE->State != TreeEntry::MaskedLoadCompressVectorize &&
         TE->State != TreeEntry::SplitVectorize)
       NonVectorized.push_back(TE.get());
     if (std::optional<OrdersType> CurrentOrder =
@@ -6593,6 +6769,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       Queue.push(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
+            TE->State == TreeEntry::MaskedLoadCompressVectorize ||
             TE->State == TreeEntry::SplitVectorize) ||
           !TE->ReuseShuffleIndices.empty())
         GathersToOrders.insert(TE.get());
@@ -6621,6 +6798,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     for (TreeEntry *TE : OrderedOps) {
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
+            TE->State == TreeEntry::MaskedLoadCompressVectorize ||
             TE->State == TreeEntry::SplitVectorize ||
             (TE->isGather() && GathersToOrders.contains(TE))) ||
           !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
@@ -6918,6 +7096,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         // Gathers are processed separately.
         if (TE->State != TreeEntry::Vectorize &&
             TE->State != TreeEntry::StridedVectorize &&
+            TE->State != TreeEntry::MaskedLoadCompressVectorize &&
             TE->State != TreeEntry::SplitVectorize &&
             (TE->State != TreeEntry::ScatterVectorize ||
              TE->ReorderIndices.empty()))
@@ -6950,7 +7129,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         Data.first->reorderOperands(Mask);
       if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
           Data.first->isAltShuffle() ||
-          Data.first->State == TreeEntry::StridedVectorize) {
+          Data.first->State == TreeEntry::StridedVectorize ||
+          Data.first->State == TreeEntry::MaskedLoadCompressVectorize) {
         reorderScalars(Data.first->Scalars, Mask);
         reorderOrder(Data.first->ReorderIndices, MaskOrder,
                      /*BottomOrder=*/true);
@@ -7722,22 +7902,31 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
               // just exit.
               unsigned ConsecutiveNodesSize = 0;
               if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
-                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
-                         [&, Slice = Slice](const auto &P) {
-                           const auto *It = find_if(Slice, [&](Value *V) {
-                             return std::get<1>(P).contains(V);
-                           });
-                           if (It == Slice.end())
-                             return false;
-                           ArrayRef<Value *> VL =
-                               VectorizableTree[std::get<0>(P)]->Scalars;
-                           ConsecutiveNodesSize += VL.size();
-                           unsigned Start = std::distance(Slice.begin(), It);
-                           unsigned Sz = Slice.size() - Start;
-                           return Sz < VL.size() ||
-                                  Slice.slice(std::distance(Slice.begin(), It),
-                                              VL.size()) != VL;
-                         }))
+                  any_of(
+                      zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+                      [&, Slice = Slice](const auto &P) {
+                        const auto *It = find_if(Slice, [&](Value *V) {
+                          return std::get<1>(P).contains(V);
+                        });
+                        if (It == Slice.end())
+                          return false;
+                        const TreeEntry &TE = *VectorizableTree[std::get<0>(P)];
+                        ArrayRef<Value *> VL = TE.Scalars;
+                        OrdersType Order;
+                        SmallVector<Value *> PointerOps;
+                        LoadsState State =
+                            canVectorizeLoads(VL, VL.front(), Order,
+                                              PointerOps);
+                        if (State == LoadsState::ScatterVectorize ||
+                            State == LoadsState::MaskedLoadCompressVectorize)
+                          return false;
+                        ConsecutiveNodesSize += VL.size();
+                        unsigned Start = std::distance(Slice.begin(), It);
+                        unsigned Sz = Slice.size() - Start;
+                        return Sz < VL.size() ||
+                               Slice.slice(std::distance(Slice.begin(), It),
+                                           VL.size()) != VL;
+                      }))
                 continue;
               // Try to build long masked gather loads.
               UserMaxVF = bit_ceil(UserMaxVF);
@@ -8216,6 +8405,13 @@ BoUpSLP::TreeEntr...
[truncated]


github-actions bot commented Mar 19, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Created using spr 1.3.5
@alexey-bataev
Member Author

Supposed to fix 130872, but currently the fix only applies for AVX512. Better pointer analysis is required to avoid masked loads.
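
For context, the kind of source pattern this targets looks like the sketch below (a made-up C++ example, not taken from issue 130872): a few loads separated by a constant gap, which can now be vectorized as one wide load of the covered range plus a compress shuffle; unless pointer analysis proves the whole range dereferenceable, that wide load has to be masked, which currently limits the benefit to targets with cheap masked loads such as AVX512.

struct Pair { double X, Y; };

// Reads the X fields of P[0..3], i.e. elements {0, 2, 4, 6} of the 7-element
// range starting at P. A single unmasked 7-wide load would also touch the Y
// fields of P[0..2], which the scalar code never reads, so it is only safe
// when those bytes are known to be dereferenceable; otherwise a masked load
// is needed.
double sumX(const Pair *P) {
  return P[0].X + P[1].X + P[2].X + P[3].X;
}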

@@ -5214,6 +5221,145 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
return Builder.CreateShuffleVector(Vec, Mask);
}

/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
/// with \p Order.
static void buildCompressMask(ArrayRef<Value *> PointerOps,
Collaborator

Worth returning true (or std::optional) here if the compression stride is uniform?

Member Author

Not sure if this is possible at all. Can you share a scenario?

Created using spr 1.3.5
@alexey-bataev
Member Author

Ping!

Created using spr 1.3.5

@RKSimon left a comment

Better comments describing what's going on?

Created using spr 1.3.5
@alexey-bataev
Member Author

Ping!

Created using spr 1.3.5

@RKSimon left a comment

LGTM - cheers

Created using spr 1.3.5
@alexey-bataev merged commit 0bec0f5 into main Apr 3, 2025
6 of 9 checks passed
@alexey-bataev deleted the users/alexey-bataev/spr/slpinitial-support-for-maskedloads-compress-and-maskedinterleaved branch April 3, 2025 17:21
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Apr 3, 2025
…ked)interleaved

Added initial support for (masked)loads + compress and
(masked)interleaved loads.

Reviewers: RKSimon, hiraditya

Reviewed By: RKSimon

Pull Request: llvm/llvm-project#132099
alexey-bataev added a commit that referenced this pull request Apr 3, 2025
…aved

Added initial support for (masked)loads + compress and
(masked)interleaved loads.

Reviewers: RKSimon, hiraditya

Reviewed By: RKSimon

Pull Request: #132099
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Apr 3, 2025
…ked)interleaved

Added initial support for (masked)loads + compress and
(masked)interleaved loads.

Reviewers: RKSimon, hiraditya

Reviewed By: RKSimon

Pull Request: llvm/llvm-project#132099
@antmox
Contributor

antmox commented Apr 4, 2025

Hi!

Could this be the cause of the clang-aarch64-sve-vls-2stage failure? https://lab.llvm.org/buildbot/#/builders/4/builds/6023

clang: /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/llvm/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:12938: auto llvm::slpvectorizer::BoUpSLP::getEntryCost(const TreeEntry *, ArrayRef<Value *>, SmallPtrSetImpl<Value *> &)::(anonymous class)::operator()(InstructionCost) const: Assertion `IsVectorized && "Expected to be vectorized"' failed.

 #0 0x0000bd5856bb64c0 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2.install/bin/clang+0x81e64c0)
 #1 0x0000bd5856bb43ec llvm::sys::RunSignalHandlers() (/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2.install/bin/clang+0x81e43ec)
 #2 0x0000bd5856b1aed4 CrashRecoverySignalHandler(int) CrashRecoveryContext.cpp:0:0
 #3 0x0000e439c64788f8 (linux-vdso.so.1+0x8f8)
 #4 0x0000e439c5ecf1f0 __pthread_kill_implementation ./nptl/pthread_kill.c:44:76
 #5 0x0000e439c5e8a67c gsignal ./signal/../sysdeps/posix/raise.c:27:6
 #6 0x0000e439c5e77130 abort ./stdlib/abort.c:81:7
 #7 0x0000e439c5e83fd4 __assert_fail_base ./assert/assert.c:91:7
 #8 0x0000e439c5e8404c (/lib/aarch64-linux-gnu/libc.so.6+0x3404c)
 #9 0x0000bd5858457080 llvm::InstructionCost llvm::function_ref<llvm::InstructionCost (llvm::InstructionCost)>::callback_fn<llvm::slpvectorizer::BoUpSLP::getEntryCost(llvm::slpvectorizer::BoUpSLP::TreeEntry const*, llvm::ArrayRef<llvm::Value*>, llvm::SmallPtrSetImpl<llvm::Value*>&)::$_16>(long, llvm::InstructionCost) SLPVectorizer.cpp:0:0
#10 0x0000bd58583bc314 llvm::slpvectorizer::BoUpSLP::getEntryCost(llvm::slpvectorizer::BoUpSLP::TreeEntry const*, llvm::ArrayRef<llvm::Value*>, llvm::SmallPtrSetImpl<llvm::Value*>&)::$_2::operator()(llvm::function_ref<llvm::InstructionCost (unsigned int)>, llvm::function_ref<llvm::InstructionCost (llvm::InstructionCost)>) const SLPVectorizer.cpp:0:0
#11 0x0000bd58583b7d00 llvm::slpvectorizer::BoUpSLP::getEntryCost(llvm::slpvectorizer::BoUpSLP::TreeEntry const*, llvm::ArrayRef<llvm::Value*>, llvm::SmallPtrSetImpl<llvm::Value*>&) (/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2.install/bin/clang+0x99e7d00)
#12 0x0000bd58583c1ed0 llvm::slpvectorizer::BoUpSLP::getTreeCost(llvm::ArrayRef<llvm::Value*>, llvm::InstructionCost) (/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2.install/bin/clang+0x99f1ed0)
#13 0x0000bd58583f54d8 llvm::SLPVectorizerPass::tryToVectorizeList(llvm::ArrayRef<llvm::Value*>, llvm::slpvectorizer::BoUpSLP&, bool) (/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2.install/bin/clang+0x9a254d8)
#14 0x0000bd58583f6290 llvm::SLPVectorizerPass::tryToVectorize(llvm::Instruction*, llvm::slpvectorizer::BoUpSLP&) (/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2.install/bin/clang+0x9a26290)
#15 0x0000bd58583f8ddc llvm::SLPVectorizerPass::vectorizeRootInstruction(llvm::PHINode*, llvm::Instruction*, llvm::BasicBlock*, llvm::slpvectorizer::BoUpSLP&) (/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2.install/bin/clang+0x9a28ddc)
#16 0x0000bd58583eeb0c llvm::SLPVectorizerPass::vectorizeChainsInBlock(llvm::BasicBlock*, llvm::slpvectorizer::BoUpSLP&) (/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2.install/bin/clang+0x9a1eb0c)
#17 0x0000bd58583ec324 llvm::SLPVectorizerPass::runImpl(llvm::Function&, llvm::ScalarEvolution*, llvm::TargetTransformInfo*, llvm::TargetLibraryInfo*, llvm::AAResults*, llvm::LoopInfo*, llvm::DominatorTree*, llvm::AssumptionCache*, llvm::DemandedBits*, llvm::OptimizationRemarkEmitter*) (/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2.install/bin/clang+0x9a1c324)
#18 0x0000bd58583eb8a0 llvm::SLPVectorizerPass::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2.install/bin/clang+0x9a1b8a0)
#19 0x0000bd58565e2358 llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2.install/bin/clang+0x7c12358)

@alexey-bataev
Member Author

Could you maybe try to provide a reproducer? I don't have access to Arm-based machines and cannot reproduce it locally.

@antmox
Contributor

antmox commented Apr 4, 2025

Sure, I'll try to do that.

@alexey-bataev
Member Author

sure, I'll try to do that

Thanks!!!

@gregbedwell
Collaborator

We have also been hitting this crash since this commit. I've put a reproducer in #134411.

alexey-bataev added a commit that referenced this pull request Apr 4, 2025
…aved

Added initial support for (masked)loads + compress and
(masked)interleaved loads.

Reviewers: RKSimon, hiraditya

Reviewed By: RKSimon

Pull Request: #132099
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Apr 4, 2025
…ked)interleaved

Added initial support for (masked)loads + compress and
(masked)interleaved loads.

Reviewers: RKSimon, hiraditya

Reviewed By: RKSimon

Pull Request: llvm/llvm-project#132099
@antmox
Contributor

antmox commented Apr 7, 2025

Hi. Probably useless now, but here it is:

! { dg-do compile }
! { dg-options "-O3 -ffast-math -fdump-tree-reassoc1 --param max-completely-peeled-insns=200" }
      subroutine anisonl(w,vo,anisox,s,ii1,jj1,weight)
      integer ii1,jj1,i1,iii1,j1,jjj1,k1,l1,m1,n1
      real*8 w(3,3),vo(3,3),anisox(3,3,3,3),s(60,60),weight
!
!     This routine replaces the following lines in e_c3d.f for
!     an anisotropic material
!
                      do i1=1,3
                        iii1=ii1+i1-1
                        do j1=1,3
                          jjj1=jj1+j1-1
                          do k1=1,3
                            do l1=1,3
                              s(iii1,jjj1)=s(iii1,jjj1)
     &                          +anisox(i1,k1,j1,l1)*w(k1,l1)*weight
                              do m1=1,3
                                s(iii1,jjj1)=s(iii1,jjj1)
     &                              +anisox(i1,k1,m1,l1)*w(k1,l1)
     &                                 *vo(j1,m1)*weight
     &                              +anisox(m1,k1,j1,l1)*w(k1,l1)
     &                                 *vo(i1,m1)*weight
                                do n1=1,3
                                  s(iii1,jjj1)=s(iii1,jjj1)
     &                              +anisox(m1,k1,n1,l1)
     &                              *w(k1,l1)*vo(i1,m1)*vo(j1,n1)*weight
                                enddo
                              enddo
                            enddo
                          enddo
                        enddo
                      enddo

      return
      end

! There should be 22 multiplications left after un-distributing
! weigth, w(k1,l1), vo(i1,m1) and vo(j1,m1) on the innermost two
! unrolled loops.

! { dg-final { scan-tree-dump-times "\[0-9\] \\\* " 22 "reassoc1" } }

stage2.install/bin/flang -fc1 -triple aarch64-unknown-linux-gnu -emit-obj -mrelocation-model pic -pic-level 2 -pic-is-pie -ffast-math -target-cpu neoverse-512tvb -target-feature +outline-atomics -target-feature +v8.4a -target-feature +aes -target-feature +bf16 -target-feature +ccdp -target-feature +ccidx -target-feature +ccpp -target-feature +complxnum -target-feature +crc -target-feature +dotprod -target-feature +fp-armv8 -target-feature +fp16fml -target-feature +fullfp16 -target-feature +i8mm -target-feature +jsconv -target-feature +lse -target-feature +neon -target-feature +pauth -target-feature +perfmon -target-feature +rand -target-feature +ras -target-feature +rcpc -target-feature +rdm -target-feature +sha2 -target-feature +sha3 -target-feature +sm4 -target-feature +spe -target-feature +ssbs -target-feature +sve -mvscale-max=2 -mvscale-min=2 -vectorize-loops -vectorize-slp -fversion-loops-for-stride -mframe-pointer=non-leaf -mllvm -treat-scalable-fixed-error-as-warning=false -O3 -o reassoc_4.o -x f95-cpp-input reassoc_4.f
