diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index e2a127ff35be..fb7bf098f860 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -23,6 +23,7 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/IR/FMF.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" @@ -1714,6 +1715,9 @@ class TargetTransformInfo { /// \return The maximum number of function arguments the target supports. unsigned getMaxNumArgs() const; + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const; + /// @} private: @@ -2088,6 +2092,8 @@ class TargetTransformInfo::Concept { getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; + virtual Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const = 0; }; template @@ -2815,6 +2821,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } + + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const override { + return Impl.computeVectorLength(Builder, AVL, VF); + } }; template diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 1d8f523e9792..4195dcaa6394 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -19,6 +19,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" @@ -908,6 +909,21 @@ class TargetTransformInfoImplBase { unsigned getMaxNumArgs() const { return UINT_MAX; } + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + if (!VF.isScalable()) { + return ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()); + } + + Constant *EC = + ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue()); + Value *VLMax = Builder.CreateVScale(EC, "vlmax"); + Value *VL = Builder.CreateZExtOrTrunc(AVL, Builder.getInt32Ty(), "vl"); + + return Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::umin, + {VLMax, VL}, nullptr, "evl"); + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. 
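The default computeVectorLength above clamps the requested length to the number of lanes the target provides. As an illustrative sketch (not part of the patch; value names are made up apart from the "vlmax"/"vl"/"evl" twines used in the code): for a scalable VF with a known minimum of 2 lanes and a 64-bit AVL %avl, the emitted IR is expected to look roughly like

  %vscale = call i32 @llvm.vscale.i32()
  %vlmax  = mul i32 %vscale, 2
  %vl     = trunc i64 %avl to i32
  %evl    = call i32 @llvm.umin.i32(i32 %vlmax, i32 %vl)

while a fixed VF simply returns the constant number of lanes.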
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 5e7bdcdf72a4..0a9b2cfd266a 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -35,6 +35,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -2558,6 +2559,21 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { InstructionCost getVectorSplitCost() { return 1; } + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + if (!VF.isScalable()) { + return ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()); + } + + Constant *EC = + ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue()); + Value *VLMax = Builder.CreateVScale(EC, "vlmax"); + Value *VL = Builder.CreateZExtOrTrunc(AVL, Builder.getInt32Ty(), "vl"); + + return Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::umin, + {VLMax, VL}, nullptr, "evl"); + } + /// @} }; diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index 8940bebd2c9a..560897a04052 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -564,6 +564,9 @@ class VPIntrinsic : public IntrinsicInst { /// The llvm.vp.* intrinsics for this instruction Opcode static Intrinsic::ID getForOpcode(unsigned OC); + /// The llvm.vp.* intrinsics for this intrinsic ID + static Intrinsic::ID getForIntrinsicID(Intrinsic::ID IID); + // Whether \p ID is a VP intrinsic ID. static bool isVPIntrinsic(Intrinsic::ID); diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h index 301edaed70fe..654486f210ef 100644 --- a/llvm/include/llvm/IR/VectorBuilder.h +++ b/llvm/include/llvm/IR/VectorBuilder.h @@ -57,6 +57,10 @@ class VectorBuilder { return RetType(); } + Value *createVectorInstruction(Intrinsic::ID VPID, Type *ReturnTy, + ArrayRef VecOpArray, + const Twine &Name = Twine()); + public: VectorBuilder(IRBuilderBase &Builder, Behavior ErrorHandling = Behavior::ReportAndAbort) @@ -89,9 +93,19 @@ class VectorBuilder { // \p Opcode The functional instruction opcode of the emitted intrinsic. // \p ReturnTy The return type of the operation. // \p VecOpArray The operand list. - Value *createVectorInstruction(unsigned Opcode, Type *ReturnTy, - ArrayRef VecOpArray, - const Twine &Name = Twine()); + Value *createVectorInstructionFromOpcode(unsigned Opcode, Type *ReturnTy, + ArrayRef VecOpArray, + const Twine &Name = Twine()); + + // Emit a VP intrinsic call that mimics a regular intrinsic. + // This operation behaves according to the VectorBuilderBehavior. + // \p IID The functional intrinsic ID of the emitted VP intrinsic. + // \p ReturnTy The return type of the operation. + // \p VecOpArray The operand list. 
+ Value *createVectorInstructionFromIntrinsicID(Intrinsic::ID IID, + Type *ReturnTy, + ArrayRef VecOpArray, + const Twine &Name = Twine()); }; } // namespace llvm diff --git a/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h b/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h new file mode 100644 index 000000000000..ce59854dbb95 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h @@ -0,0 +1,55 @@ +#ifndef LLVM_TRANSFORMS_VECTORPREDICATION_H +#define LLVM_TRANSFORMS_VECTORPREDICATION_H + +#include "llvm/ADT/MapVector.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +using InstToMaskEVLMap = DenseMap>; + +struct BlockData { + // Vector that stores all vector predicated memory writing operations found in + // the basic block. If after phase 1 is empty, then the basic block can be + // skipped by following phases. + SmallVector MemoryWritingVPInstructions; + + // Store all instructions of the basic block (in the same order as they are + // found), assigning to each the list of users. Skip PHIs and terminators. + MapVector> TopologicalGraph; + + // Map each full-length vector operation eligible to be transformed to a + // vector predication one with the (mask,evl) pair of its first vector + // predicated memory writing operation user. + InstToMaskEVLMap VecOpsToTransform; + + // Ordered list representing the reverse order of how the basic block has to + // be transformed due to the new vector predicated instructions. + SmallVector NewBBReverseOrder; + + BlockData() = default; +}; + +class VectorPredicationPass : public PassInfoMixin { +private: + // List of instructions to be replaced by the new VP operations and that later + // should be removed, if possible. + DenseMap OldInstructionsToRemove; + + void analyseBasicBlock(BasicBlock &BB, BlockData &BBInfo); + void findCandidateVectorOperations(BasicBlock &BB, BlockData &BBInfo); + void addNewUsersToMasksAndEVLs(BasicBlock &BB, BlockData &BBInfo); + void buildNewBasicBlockSchedule(BasicBlock &BB, BlockData &BBInfo); + void emitNewBasicBlockSchedule(BasicBlock &BB, BlockData &BBInfo); + void transformCandidateVectorOperations(BasicBlock &BB, BlockData &BBInfo); + + void removeOldInstructions(); + +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static StringRef name() { return "VectorPredicationPass"; } +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORPREDICATION_H diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 3f76dfdaac31..b79047627d1f 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1264,6 +1264,12 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } +Value *TargetTransformInfo::computeVectorLength(IRBuilderBase &Builder, + Value *AVL, + ElementCount VF) const { + return TTIImpl->computeVectorLength(Builder, AVL, VF); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 7a3b708e7400..3aa33dfc2afd 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -604,6 +604,19 @@ Intrinsic::ID VPIntrinsic::getForOpcode(unsigned IROPC) { return Intrinsic::not_intrinsic; } +Intrinsic::ID VPIntrinsic::getForIntrinsicID(Intrinsic::ID IID) { + switch (IID) { + default: + 
break; + +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) break; +#define VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTR) case Intrinsic::INTR: +#define END_REGISTER_VP_INTRINSIC(VPID) return Intrinsic::VPID; +#include "llvm/IR/VPIntrinsics.def" + } + return Intrinsic::not_intrinsic; +} + bool VPIntrinsic::canIgnoreVectorLengthParam() const { using namespace PatternMatch; diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp index c07bc0561fba..c94bc5b180f5 100644 --- a/llvm/lib/IR/VectorBuilder.cpp +++ b/llvm/lib/IR/VectorBuilder.cpp @@ -51,13 +51,30 @@ Value &VectorBuilder::requestEVL() { return *ConstantInt::get(IntTy, StaticVectorLength.getFixedValue()); } -Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy, - ArrayRef InstOpArray, - const Twine &Name) { +Value *VectorBuilder::createVectorInstructionFromOpcode( + unsigned Opcode, Type *ReturnTy, ArrayRef InstOpArray, + const Twine &Name) { auto VPID = VPIntrinsic::getForOpcode(Opcode); if (VPID == Intrinsic::not_intrinsic) return returnWithError("No VPIntrinsic for this opcode"); + return createVectorInstruction(VPID, ReturnTy, InstOpArray, Name); +} + +Value *VectorBuilder::createVectorInstructionFromIntrinsicID( + Intrinsic::ID IID, Type *ReturnTy, ArrayRef InstOpArray, + const Twine &Name) { + auto VPID = VPIntrinsic::getForIntrinsicID(IID); + if (VPID == Intrinsic::not_intrinsic) + return returnWithError("No VPIntrinsic for this Intrinsic"); + + return createVectorInstruction(VPID, ReturnTy, InstOpArray, Name); +} + +Value *VectorBuilder::createVectorInstruction(Intrinsic::ID VPID, + Type *ReturnTy, + ArrayRef InstOpArray, + const Twine &Name) { auto MaskPosOpt = VPIntrinsic::getMaskParamPos(VPID); auto VLenPosOpt = VPIntrinsic::getVectorLengthParamPos(VPID); size_t NumInstParams = InstOpArray.size(); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index f94bd422c6b5..973d6cd7d17a 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -281,6 +281,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/Transforms/Vectorize/VectorPredication.h" #include using namespace llvm; diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 5c6c391049a7..82ba63b5d0ae 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -135,6 +135,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/Transforms/Vectorize/VectorPredication.h" using namespace llvm; @@ -285,6 +286,11 @@ cl::opt EnableMemProfContextDisambiguation( extern cl::opt EnableInferAlignmentPass; } // namespace llvm +static cl::opt + EnableVectorPredication("enable-vector-predication", cl::init(false), + cl::Hidden, + cl::desc("Enable VectorPredicationPass.")); + PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; @@ -1297,6 +1303,10 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, /*AllowSpeculation=*/true), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); + // Try to vector predicate vectorized functions. 
+ if (EnableVectorPredication) + FPM.addPass(VectorPredicationPass()); + // Now that we've vectorized and unrolled loops, we may have more refined // alignment information, try to re-derive it here. FPM.addPass(AlignmentFromAssumptionsPass()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 82ce040c6496..6ad9cb1c44de 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -420,6 +420,7 @@ FUNCTION_PASS("tsan", ThreadSanitizerPass()) FUNCTION_PASS("typepromotion", TypePromotionPass(TM)) FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass()) FUNCTION_PASS("vector-combine", VectorCombinePass()) +FUNCTION_PASS("vector-predication", VectorPredicationPass()) FUNCTION_PASS("verify", VerifierPass()) FUNCTION_PASS("verify", DominatorTreeVerifierPass()) FUNCTION_PASS("verify", LoopVerifierPass()) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 1a9abaea8111..8f25709d95fd 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -14,6 +14,7 @@ #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsRISCV.h" #include #include using namespace llvm; @@ -1848,3 +1849,36 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, C2.NumIVMuls, C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost); } + +Value *RISCVTTIImpl::computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + // Maps a VF to a (SEW, LMUL) pair. + // NOTE: we assume ELEN = 64. + const std::map> + VFToSEWLMUL = {{1, {3, 0}}, {2, {3, 1}}, {4, {3, 2}}, {8, {3, 3}}, + {16, {2, 3}}, {32, {1, 3}}, {64, {0, 3}}}; + + assert(AVL->getType()->isIntegerTy() && + "Requested vector length should be an integer."); + assert(VFToSEWLMUL.find(VF.getKnownMinValue()) != VFToSEWLMUL.end() && + "Invalid value for LMUL argument."); + auto VFToSEWLMULVal = VFToSEWLMUL.at(VF.getKnownMinValue()); + + Value *AVLArg = Builder.CreateZExtOrTrunc(AVL, Builder.getInt64Ty()); + Constant *SEWArg = + ConstantInt::get(Builder.getInt64Ty(), VFToSEWLMULVal.first); + Constant *LMULArg = + ConstantInt::get(Builder.getInt64Ty(), VFToSEWLMULVal.second); + Value *EVLRes = + Builder.CreateIntrinsic(Intrinsic::riscv_vsetvli, {AVLArg->getType()}, + {AVLArg, SEWArg, LMULArg}, nullptr, "vl"); + + // NOTE: evl type is required to be i32. 
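  // Illustrative sketch (an assumption, consistent with the RISC-V test added
  // below): for a scalable VF of 2, i.e. the {SEW, LMUL} encoding pair {3, 1}
  // from the table above, the sequence built here comes out roughly as
  //   %vl  = call i64 @llvm.riscv.vsetvli.i64(i64 %avl, i64 3, i64 1)
  //   %evl = trunc i64 %vl to i32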
+ Value *EVL = Builder.CreateZExtOrTrunc(EVLRes, Builder.getInt32Ty()); + if (!VF.isScalable()) { + EVL = Builder.CreateBinaryIntrinsic( + Intrinsic::umin, + ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()), EVL); + } + return EVL; +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index d2592be75000..cd30f16fc6c0 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -377,6 +377,9 @@ class RISCVTTIImpl : public BasicTTIImplBase { bool shouldFoldTerminatingConditionAfterLSR() const { return true; } + + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const; }; } // end namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 9674094024b9..5574b33d9bc2 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -5,6 +5,7 @@ add_llvm_component_library(LLVMVectorize SLPVectorizer.cpp Vectorize.cpp VectorCombine.cpp + VectorPredication.cpp VPlan.cpp VPlanAnalysis.cpp VPlanHCFGBuilder.cpp diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b590fb4685a3..02cf0aaef5fa 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -411,6 +411,10 @@ static constexpr uint32_t MemCheckBypassWeights[] = {1, 127}; // after prolog. See `emitIterationCountCheck`. static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; +cl::opt UseVectorPredicationIntrinsics( + "use-vp-intrinsics", cl::init(false), cl::Hidden, + cl::desc("Use Vector Predication intrinsics during vectorization.")); + /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type. @@ -2792,6 +2796,11 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { if (VectorTripCount) return VectorTripCount; + // With VP intrinsics, we require tail-folding by masking; this way, we + // operate on a number of elements equal to the original loop trip count. + if (UseVectorPredicationIntrinsics) + return VectorTripCount = getTripCount(); + Value *TC = getTripCount(); IRBuilder<> Builder(InsertBlock->getTerminator()); @@ -2828,6 +2837,7 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { // the step does not evenly divide the trip count, no adjustment is necessary // since there will already be scalar iterations. Note that the minimum // iterations check ensures that N >= Step. + // TODO: we should probably honor the cost model also with VP intrinsics. 
if (Cost->requiresScalarEpilogue(VF.isVector())) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); @@ -6316,9 +6326,12 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, } bool Reverse = ConsecutiveStride < 0; - if (Reverse) + if (Reverse) { + if (UseVectorPredicationIntrinsics) + return InstructionCost::getInvalid(); Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, std::nullopt, CostKind, 0); + } return Cost; } @@ -8234,12 +8247,13 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, Reverse || Decision == LoopVectorizationCostModel::CM_Widen; if (LoadInst *Load = dyn_cast(I)) - return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, - Consecutive, Reverse); + return new VPWidenMemoryInstructionRecipe( + *Load, Operands[0], Mask, Plan->getEVLPhi(), Consecutive, Reverse); StoreInst *Store = cast(I); return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], - Mask, Consecutive, Reverse); + Mask, Plan->getEVLPhi(), + Consecutive, Reverse); } /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also @@ -8257,10 +8271,12 @@ createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); if (auto *TruncI = dyn_cast(PhiOrTrunc)) { - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, + Plan.getEVLPhi()); } assert(isa(PhiOrTrunc) && "must be a phi node here"); - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, + Plan.getEVLPhi()); } VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( @@ -8692,32 +8708,64 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // Add the necessary canonical IV and branch recipes required to control the // loop. -static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, - DebugLoc DL) { - Value *StartIdx = ConstantInt::get(IdxTy, 0); - auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); +static VPInstruction *addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, + bool HasNUW, DebugLoc DL, + const TargetTransformInfo *TTI) { + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); + + // Add the EVL recipe, used to calculate the correct IV increment. + VPEVLPHIRecipe *EVLRecipe = nullptr; + // TODO: TTI should be able to indicate if a target prefers vector predication + // intrinsics. + if (UseVectorPredicationIntrinsics) { + EVLRecipe = new VPEVLPHIRecipe(Plan.getTripCount(), TTI); + Header->insert(EVLRecipe, Header->begin()); + } // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. + Value *StartIdx = ConstantInt::get(IdxTy, 0); + auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); - VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); - VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); Header->insert(CanonicalIVPHI, Header->begin()); // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar - // IV by VF * UF. - auto *CanonicalIVIncrement = + // IV either by VF * UF or by the EVL values. 
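  // Illustration (an assumption about how the pieces compose, not patch text):
  // with unroll factor UF = 2 the EVL-based increment effectively computes
  //   index.next = index + (evl.part0 + evl.part1)
  // whereas the classic form computes index.next = index + VF * UF.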
+ VPInstruction *CanonicalIVIncrement = nullptr; + if (EVLRecipe) + CanonicalIVIncrement = + new VPInstruction(Instruction::Add, {CanonicalIVPHI, EVLRecipe}, + {HasNUW, false}, DL, "index.next"); + else + CanonicalIVIncrement = new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL, "index.next"); + CanonicalIVPHI->addOperand(CanonicalIVIncrement); + // If we are working with vector predication instrinsics, add a NextEVL + // VPInstruction to calculate the remaining elements number. + VPInstruction *NextEVL = nullptr; + if (EVLRecipe) { + NextEVL = + new VPInstruction(VPInstruction::NextEVL, + {EVLRecipe, CanonicalIVIncrement}, DL, "evl.next"); + EVLRecipe->addOperand(NextEVL); + } + VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); + if (NextEVL) { + EB->appendRecipe(NextEVL); + } // Add the BranchOnCount VPInstruction to the latch. VPInstruction *BranchBack = new VPInstruction(VPInstruction::BranchOnCount, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); EB->appendRecipe(BranchBack); + + return NextEVL; } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the @@ -8807,7 +8855,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // When not folding the tail, we know that the induction increment will not // overflow. bool HasNUW = Style == TailFoldingStyle::None; - addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); + auto *NextEVL = addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), + HasNUW, DL, &TTI); // Proactively create header mask. Masks for other blocks are created on // demand. @@ -8982,7 +9031,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool WithoutRuntimeCheck = Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, - WithoutRuntimeCheck); + WithoutRuntimeCheck, NextEVL); } return Plan; } @@ -9022,7 +9071,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // is guaranteed to not wrap. 
bool HasNUW = true; addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, - DebugLoc()); + DebugLoc(), &TTI); return Plan; } @@ -9529,7 +9578,7 @@ lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr, } else { VectorBuilder VBuilder(Builder); VBuilder.setEVL(EVLPart).setMask(Mask); - Call = cast(VBuilder.createVectorInstruction( + Call = cast(VBuilder.createVectorInstructionFromOpcode( Instruction::Store, Type::getVoidTy(EVLPart->getContext()), {StoredVal, Addr})); } @@ -9553,7 +9602,7 @@ static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder, } else { VectorBuilder VBuilder(Builder); VBuilder.setEVL(EVLPart).setMask(Mask); - Call = cast(VBuilder.createVectorInstruction( + Call = cast(VBuilder.createVectorInstructionFromOpcode( Instruction::Load, DataTy, Addr, "vp.op.load")); } Call->addParamAttr( @@ -9580,8 +9629,15 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); - bool isMaskRequired = getMask(); - if (isMaskRequired) { + VPValue *VPMask = getMask(); + VPValue *VPEVL = getEVL(); + if (VPEVL && (!VPMask || (isa(VPMask) && + dyn_cast(VPMask)->getOpcode() == + VPInstruction::ActiveLaneMask))) { + auto *MaskTy = VectorType::get(Builder.getInt1Ty(), State.VF); + for (unsigned Part = 0; Part < State.UF; ++Part) + BlockInMaskParts[Part] = ConstantInt::getTrue(MaskTy); + } else if (VPMask) { // Mask reversal is only neede for non-all-one (null) masks, as reverse of a // null all-one mask is a null mask. for (unsigned Part = 0; Part < State.UF; ++Part) { @@ -9623,7 +9679,14 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { PartPtr = Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds); } else { - Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + Value *Increment = nullptr; + if (VPEVL) { + Increment = Builder.getInt32(0); // EVL is always an i32. + for (unsigned int P = 0; P < Part; P++) + Increment = Builder.CreateAdd(Increment, State.get(VPEVL, P)); + } else { + Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + } PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds); } @@ -9631,7 +9694,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { }; auto MaskValue = [&](unsigned Part) -> Value * { - if (isMaskRequired) + if (VPMask) return BlockInMaskParts[Part]; return nullptr; }; @@ -9659,10 +9722,19 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { StoredVal, CreateGatherScatter, MaskValue(Part), EVLPart, Alignment); } else if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *MaskPart = + (VPMask || VPEVL) ? 
BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(getAddr(), Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); + if (VPEVL) { + auto *PtrsTy = cast(VectorGep->getType()); + Value *Operands[] = {StoredVal, VectorGep, MaskPart, + State.get(VPEVL, Part)}; + NewSI = Builder.CreateIntrinsic(Intrinsic::vp_scatter, + {DataTy, PtrsTy}, Operands); + } else { + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, + MaskPart); + } } else { if (isReverse()) { // If we store to reverse consecutive memory locations, then we need @@ -9673,11 +9745,17 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { } auto *VecPtr = CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); - if (isMaskRequired) + if (VPEVL) { + Value *Operands[] = {StoredVal, VecPtr, BlockInMaskParts[Part], + State.get(VPEVL, Part)}; + NewSI = Builder.CreateIntrinsic( + Intrinsic::vp_store, {DataTy, VecPtr->getType()}, Operands); + } else if (VPMask) { NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, BlockInMaskParts[Part]); - else + } else { NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } } State.addMetadata(NewSI, SI); } @@ -9704,21 +9782,37 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { : CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))), CreateGatherScatter, MaskValue(Part), EVLPart, Alignment); } else if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *MaskPart = + (VPMask || VPEVL) ? BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(getAddr(), Part); - NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, - nullptr, "wide.masked.gather"); + if (VPEVL) { + auto *PtrsTy = cast(VectorGep->getType()); + Value *Operands[] = {VectorGep, MaskPart, State.get(VPEVL, Part)}; + NewLI = Builder.CreateIntrinsic(Intrinsic::vp_gather, {DataTy, PtrsTy}, + Operands, nullptr, "vp.gather"); + } else { + NewLI = + Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, + nullptr, "wide.masked.gather"); + } State.addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); - if (isMaskRequired) + if (VPEVL) { + Value *Operands[] = {VecPtr, BlockInMaskParts[Part], + State.get(VPEVL, Part)}; + NewLI = Builder.CreateIntrinsic(Intrinsic::vp_load, + {DataTy, VecPtr->getType()}, Operands, + nullptr, "vp.load"); + } else if (VPMask) { NewLI = Builder.CreateMaskedLoad( DataTy, VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), "wide.masked.load"); - else + } else { NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + } // Add metadata to the load, but setVectorValue to the reverse shuffle. State.addMetadata(NewLI, LI); @@ -10516,6 +10610,11 @@ LoopVectorizeResult LoopVectorizePass::runImpl( PreservedAnalyses LoopVectorizePass::run(Function &F, FunctionAnalysisManager &AM) { + assert((!UseVectorPredicationIntrinsics || + PreferPredicateOverEpilogue == + PreferPredicateTy::PredicateOrDontVectorize) && + "Tail folding required when using VP intrinsics."); + auto &LI = AM.getResult(F); // There are no loops in the function. Return before computing other expensive // analyses. 
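Taken together, the LoopVectorize changes above are intended to produce a vector body of roughly the following shape when -use-vp-intrinsics is enabled (an illustrative sketch with made-up value names, single unrolled part; only the phi/vsetvli/trunc lines are pinned down by the RISC-V test at the end of the patch):

  vector.body:
    %index      = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    %evl.phi    = phi i64 [ %N, %vector.ph ], [ %evl.next, %vector.body ]
    %evl        = ... TTI computeVectorLength(%evl.phi) ...       ; vsetvli + trunc on RISC-V
    %va         = call <vscale x 2 x double> @llvm.vp.load...(ptr %a, <vscale x 2 x i1> %allones, i32 %evl)
    %vb         = call <vscale x 2 x double> @llvm.vp.load...(ptr %b, <vscale x 2 x i1> %allones, i32 %evl)
    %vc         = fadd <vscale x 2 x double> %va, %vb             ; still full-length here; the separate
                                                                   ; VectorPredicationPass can later turn it into llvm.vp.fadd
    call void @llvm.vp.store...(<vscale x 2 x double> %vc, ptr %c, <vscale x 2 x i1> %allones, i32 %evl)
    %index.next = add i64 %index, %evl.zext                        ; EVL widened to the IV type
    %evl.next   = sub i64 %N, %index.next                          ; elements still to be processed
    br i1 %done, label %middle.block, label %vector.body           ; branch-on-count against the original trip count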
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1d7df9c9575a..0ac8d43acb11 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -708,6 +708,16 @@ VPlan::~VPlan() { delete BackedgeTakenCount; } +VPEVLPHIRecipe *VPlan::getEVLPhi() { + VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + if (isa(&R)) + return cast(&R); + } + + return nullptr; +} + VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) { VPBasicBlock *Preheader = new VPBasicBlock("ph"); VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); @@ -821,6 +831,13 @@ void VPlan::execute(VPTransformState *State) { } auto *PhiR = cast(&R); + if (auto *EVLPhi = dyn_cast(PhiR)) { + PHINode *Phi = EVLPhi->getPhi(); + Phi->addIncoming(State->get(EVLPhi->getBackedgeValue(), State->UF - 1), + VectorLatchBB); + continue; + } + // For canonical IV, first-order recurrences and in-order reduction phis, // only a single part is generated, which provides the last part from the // previous iteration. For non-ordered reductions all UF parts are diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0ca668abbe60..ab1d4b73aa62 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -771,10 +771,10 @@ class VPRecipeBase : public ilist_node_with_parent, /// Returns the underlying instruction, if the recipe is a VPValue or nullptr /// otherwise. Instruction *getUnderlyingInstr() { - return cast(getVPSingleValue()->getUnderlyingValue()); + return cast_or_null(getVPSingleValue()->getUnderlyingValue()); } const Instruction *getUnderlyingInstr() const { - return cast(getVPSingleValue()->getUnderlyingValue()); + return cast_or_null(getVPSingleValue()->getUnderlyingValue()); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1069,7 +1069,8 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue { // Increment the canonical IV separately for each unrolled part. CanonicalIVIncrementForPart, BranchOnCount, - BranchOnCond + BranchOnCond, + NextEVL }; private: @@ -1452,20 +1453,28 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe { TruncInst *Trunc; const InductionDescriptor &IndDesc; + void addEVL(VPValue *EVLRecipe) { + if (EVLRecipe) + addOperand(EVLRecipe); + } + public: VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, - const InductionDescriptor &IndDesc) + const InductionDescriptor &IndDesc, + VPValue *EVLRecipe) : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start), IV(IV), Trunc(nullptr), IndDesc(IndDesc) { addOperand(Step); + addEVL(EVLRecipe); } VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, const InductionDescriptor &IndDesc, - TruncInst *Trunc) + TruncInst *Trunc, VPValue *EVLRecipe) : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, Trunc, Start), IV(IV), Trunc(Trunc), IndDesc(IndDesc) { addOperand(Step); + addEVL(EVLRecipe); } ~VPWidenIntOrFpInductionRecipe() override = default; @@ -1500,6 +1509,12 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe { VPValue *getStepValue() { return getOperand(1); } const VPValue *getStepValue() const { return getOperand(1); } + /// Return the EVL value of the current loop iteration. + VPValue *getEVL() { return getNumOperands() == 3 ? 
getOperand(2) : nullptr; } + const VPValue *getEVL() const { + return getNumOperands() == 3 ? getOperand(2) : nullptr; + } + /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { return Trunc; } @@ -1988,8 +2003,8 @@ class VPPredInstPHIRecipe : public VPRecipeBase, public VPValue { /// A Recipe for widening load/store operations. /// The recipe uses the following VPValues: -/// - For load: Address, optional mask -/// - For store: Address, stored value, optional mask +/// - For load: Address, optional mask, optional evl +/// - For store: Address, stored value, optional mask, optional evl /// TODO: We currently execute only per-part unless a specific instance is /// provided. class VPWidenMemoryInstructionRecipe : public VPRecipeBase { @@ -2001,33 +2016,41 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { // Whether the consecutive loaded/stored addresses are in reverse order. bool Reverse; - void setMask(VPValue *Mask) { - if (!Mask) - return; - addOperand(Mask); - } + // Whether the instruction has a not all-ones mask. + bool Masked = false; + + // Whether a vector length is available to the instruction. + bool HasVL = false; - bool isMasked() const { - return isStore() ? getNumOperands() == 3 : getNumOperands() == 2; + void setMaskAndEVL(VPValue *Mask, VPValue *VPEVL) { + if (Mask) { + this->Masked = true; + addOperand(Mask); + } + + if (VPEVL) { + this->HasVL = true; + addOperand(VPEVL); + } } public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, - bool Consecutive, bool Reverse) + VPValue *EVL, bool Consecutive, bool Reverse) : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); new VPValue(this, &Load); - setMask(Mask); + setMaskAndEVL(Mask, EVL); } VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredValue, VPValue *Mask, - bool Consecutive, bool Reverse) + VPValue *EVL, bool Consecutive, bool Reverse) : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr, StoredValue}), Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - setMask(Mask); + setMaskAndEVL(Mask, EVL); } VP_CLASSOF_IMPL(VPDef::VPWidenMemoryInstructionSC) @@ -2040,8 +2063,15 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { - // Mask is optional and therefore the last operand. - return isMasked() ? getOperand(getNumOperands() - 1) : nullptr; + return Masked ? (HasVL ? getOperand(getNumOperands() - 2) + : getOperand(getNumOperands() - 1)) + : nullptr; + } + + /// Return the evl used by this recipe. If we are working with full-length + /// vectors, return nullptr. + VPValue *getEVL() const { + return HasVL ? getOperand(getNumOperands() - 1) : nullptr; } /// Returns true if this recipe is a store. 
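With the trailing operands added above, the operand layout of VPWidenMemoryInstructionRecipe becomes (a summary of the accessors, not additional patch text):

  load  : {Addr [, Mask] [, EVL]}
  store : {Addr, StoredValue [, Mask] [, EVL]}

so getMask() reads the second-to-last operand when an EVL is present and getEVL() always reads the last one.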
@@ -2190,6 +2220,33 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { #endif }; +class VPEVLPHIRecipe : public VPHeaderPHIRecipe { + const TargetTransformInfo *TTI; + PHINode *Phi = nullptr; + +public: + VPEVLPHIRecipe(VPValue *StartEVL, const TargetTransformInfo *TTI) + : VPHeaderPHIRecipe(VPDef::VPWidenEVLSC, nullptr, StartEVL), TTI(TTI) {} + + ~VPEVLPHIRecipe() override = default; + + VP_CLASSOF_IMPL(VPDef::VPWidenEVLSC) + + PHINode *getPhi() const { return Phi; } + + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPDef::VPWidenEVLSC; + } + + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for generating the phi node for the current index of elements, /// adjusted in accordance with EVL value. It starts at StartIV value and gets /// incremented by EVL in each iteration of the vector loop. @@ -2795,6 +2852,10 @@ class VPlan { return cast(&*EntryVPBB->begin()); } + /// Find and return the VPEVLPHIRecipe from the header - there should be only + /// one at most. If there isn't one, then return nullptr. + VPEVLPHIRecipe *getEVLPhi(); + void addLiveOut(PHINode *PN, VPValue *V); void removeLiveOut(PHINode *PN) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 5e0344a14df5..25658b278648 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -38,6 +38,7 @@ using VectorParts = SmallVector; namespace llvm { extern cl::opt EnableVPlanNativePath; } +extern cl::opt UseVectorPredicationIntrinsics; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME @@ -274,12 +275,25 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, IRBuilderBase &Builder = State.Builder; Builder.SetCurrentDebugLocation(getDebugLoc()); - if (Instruction::isBinaryOp(getOpcode())) { + unsigned Opc = getOpcode(); + if (Instruction::isBinaryOp(Opc)) { if (Part != 0 && vputils::onlyFirstPartUsed(this)) return State.get(this, 0); Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); + Value *B = nullptr; + + if (UseVectorPredicationIntrinsics && Opc == Instruction::Add) { + // We have the EVL value available to use. 
+ VPValue *VPEVL = getOperand(1); + Value *Step = State.get(VPEVL, 0); + for (unsigned P = 1; P < State.UF; P++) + Step = Builder.CreateAdd(Step, State.get(VPEVL, P)); + + B = Builder.CreateZExtOrTrunc(Step, A->getType()); + } else + B = State.get(getOperand(1), Part); + auto *Res = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); if (auto *I = dyn_cast(Res)) @@ -439,6 +453,19 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); return CondBr; } + case VPInstruction::NextEVL: { + Value *Next = nullptr; + if (Part == 0) { + auto *EVLRecipe = cast(getOperand(0)); + Value *StartEVL = EVLRecipe->getOperand(0)->getUnderlyingValue(); + Value *IVIncrement = State.get(getOperand(1), 0); + + Next = Builder.CreateSub(StartEVL, IVIncrement, "evl.next"); + } else { + Next = State.get(this, 0); + } + return Next; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -521,6 +548,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::BranchOnCount: O << "branch-on-count"; break; + case VPInstruction::NextEVL: + O << "next-evl"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -968,24 +998,27 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { MulOp = Instruction::FMul; } - // Multiply the vectorization factor by the step using integer or - // floating-point arithmetic as appropriate. - Type *StepType = Step->getType(); - Value *RuntimeVF; - if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); - else - RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); - Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); - - // Create a vector splat to use in the induction update. - // - // FIXME: If the step is non-constant, we create the vector splat with - // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't - // handle a constant vector splat. - Value *SplatVF = isa(Mul) - ? ConstantVector::getSplat(State.VF, cast(Mul)) - : Builder.CreateVectorSplat(State.VF, Mul); + Value *SplatVF = nullptr; + if (!getEVL()) { + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Type *StepType = Step->getType(); + Value *RuntimeVF; + if (Step->getType()->isFloatingPointTy()) + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); + else + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); + Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); + + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it + // doesn't handle a constant vector splat. + SplatVF = isa(Mul) + ? ConstantVector::getSplat(State.VF, cast(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); + } Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -1000,8 +1033,26 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { if (isa(EntryVal)) State.addMetadata(LastInduction, EntryVal); - LastInduction = cast( - Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + if (auto *EVLRecipe = getEVL()) { + // Ensure the types match. 
+ Type *DestTy = LastInduction->getType()->getScalarType(); + Value *EVL = State.get(EVLRecipe, Part); + if (DestTy->isIntegerTy()) { + EVL = Builder.CreateZExtOrTrunc(EVL, DestTy); + } else { + assert(DestTy->isFloatingPointTy()); + EVL = Builder.CreateUIToFP(EVL, DestTy); + } + // Multiply the EVL by the step using integer or floating-point + // arithmetic as appropriate. + Value *Mul = Builder.CreateBinOp(MulOp, Step, EVL); + Value *SplatEVL = Builder.CreateVectorSplat(State.VF, Mul); + LastInduction = cast( + Builder.CreateBinOp(AddOp, LastInduction, SplatEVL, "step.add.vl")); + } else { + LastInduction = cast( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + } LastInduction->setDebugLoc(EntryVal->getDebugLoc()); } @@ -1033,6 +1084,9 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, #endif bool VPWidenIntOrFpInductionRecipe::isCanonical() const { + if (getEVL()) + return false; + // The step may be defined by a recipe in the preheader (e.g. if it requires // SCEV expansion), but for the canonical induction the step is required to be // 1, which is represented as live-in. @@ -1770,3 +1824,30 @@ void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, printOperands(O, SlotTracker); } #endif + +void VPEVLPHIRecipe::execute(VPTransformState &State) { + Value *StartEVL = getOperand(0)->getUnderlyingValue(); + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + this->Phi = State.Builder.CreatePHI(StartEVL->getType(), 2, "evl.phi"); + this->Phi->addIncoming(StartEVL, VectorPH); + + Value *PrevEVL = State.Builder.CreateZExtOrTrunc( + cast(this->Phi), State.Builder.getInt32Ty(), "evl.phi.cast"); + Value *EVL = nullptr; + for (unsigned Part = 0; Part < State.UF; Part++) { + if (EVL) + PrevEVL = State.Builder.CreateSub(PrevEVL, EVL); + EVL = TTI->computeVectorLength(State.Builder, PrevEVL, State.VF); + State.set(this, EVL, Part); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPEVLPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EVL-PHI "; + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 466259cb196c..22dc894babc4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -53,7 +53,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPValue *Start = Plan->getVPValueOrAddLiveIn(II->getStartValue()); VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE); - NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II); + NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II, + Plan->getEVLPhi()); } else { Plan->addVPValue(Phi, VPPhi); continue; @@ -66,11 +67,12 @@ void VPlanTransforms::VPInstructionsToVPRecipes( if (LoadInst *Load = dyn_cast(Inst)) { NewRecipe = new VPWidenMemoryInstructionRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, - false /*Consecutive*/, false /*Reverse*/); + nullptr /*EVL*/, false /*Consecutive*/, false /*Reverse*/); } else if (StoreInst *Store = dyn_cast(Inst)) { NewRecipe = new VPWidenMemoryInstructionRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), - nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/); + nullptr /*Mask*/, nullptr /*EVL*/, false /*Consecutive*/, + false /*Reverse*/); } else if 
(GetElementPtrInst *GEP = dyn_cast(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); } else if (CallInst *CI = dyn_cast(Inst)) { @@ -1040,7 +1042,8 @@ void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) { // branch-on-cond %Negated // static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( - VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { + VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck, + VPInstruction *NextEVL) { VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); auto *CanonicalIVPHI = Plan.getCanonicalIV(); @@ -1066,6 +1069,9 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( // When the loop is guarded by a runtime overflow check for the loop // induction variable increment by VF, we can increment the value before // the get.active.lane mask and use the unmodified tripcount. + if (NextEVL) { + EB->insert(NextEVL, EB->end()--); + } IncrementValue = CanonicalIVIncrement; TripCount = TC; } else { @@ -1102,6 +1108,10 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( "active.lane.mask.next"); LaneMaskPhi->addOperand(ALM); + if (DataAndControlFlowWithoutRuntimeCheck && NextEVL) { + EB->insert(NextEVL, EB->end()--); + } + // Replace the original terminator with BranchOnCond. We have to invert the // mask here because a true condition means jumping to the exit block. auto *NotMask = Builder.createNot(ALM, DL); @@ -1151,7 +1161,8 @@ static void replaceHeaderPredicateWithIdiom( void VPlanTransforms::addActiveLaneMask( VPlan &Plan, bool UseActiveLaneMaskForControlFlow, - bool DataAndControlFlowWithoutRuntimeCheck) { + bool DataAndControlFlowWithoutRuntimeCheck, + VPInstruction *NextEVL) { assert((!DataAndControlFlowWithoutRuntimeCheck || UseActiveLaneMaskForControlFlow) && "DataAndControlFlowWithoutRuntimeCheck implies " @@ -1167,7 +1178,7 @@ void VPlanTransforms::addActiveLaneMask( VPRecipeBase *LaneMask; if (UseActiveLaneMaskForControlFlow) { LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( - Plan, DataAndControlFlowWithoutRuntimeCheck); + Plan, DataAndControlFlowWithoutRuntimeCheck, NextEVL); } else { LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask, {WideCanonicalIV, Plan.getTripCount()}, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index a4bc7a23072c..7e1f65bc16ec 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -75,7 +75,8 @@ struct VPlanTransforms { /// UseActiveLaneMaskForControlFlow. static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, - bool DataAndControlFlowWithoutRuntimeCheck); + bool DataAndControlFlowWithoutRuntimeCheck, + VPInstruction *NextEVL); /// Insert truncates and extends for any truncated recipe. Redundant casts /// will be folded later. 
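As a hand-worked illustration of the evl.phi / evl.next pair these transforms wire up (assumed numbers, single unrolled part, using the generic umin-based computeVectorLength): with an original trip count of 10 and vlmax = 4 the loop runs three iterations:

  iteration 1: evl.phi = 10  ->  evl = 4, index.next = 4,  evl.next = 10 - 4 = 6
  iteration 2: evl.phi = 6   ->  evl = 4, index.next = 8,  evl.next = 10 - 8 = 2
  iteration 3: evl.phi = 2   ->  evl = 2, index.next = 10, branch-on-count exits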
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 22dbf7571dd9..a4db8b5c5d02 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -367,6 +367,7 @@ class VPDef {
     VPActiveLaneMaskPHISC,
     VPEVLBasedIVPHISC,
     VPFirstOrderRecurrencePHISC,
+    VPWidenEVLSC,
     VPWidenPHISC,
     VPWidenIntOrFpInductionSC,
     VPWidenPointerInductionSC,
diff --git a/llvm/lib/Transforms/Vectorize/VectorPredication.cpp b/llvm/lib/Transforms/Vectorize/VectorPredication.cpp
new file mode 100644
index 000000000000..bbebcba38e91
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VectorPredication.cpp
@@ -0,0 +1,358 @@
+#include "llvm/Transforms/Vectorize/VectorPredication.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/VectorBuilder.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#define DEBUG_TYPE "vector-predication"
+STATISTIC(Transforms, "Number of full-length -> evl vector transformations.");
+
+using namespace llvm;
+
+// Map each instruction to its uses and save all memory writing vector
+// predicated instructions found in the basic block.
+void VectorPredicationPass::analyseBasicBlock(BasicBlock &BB,
+                                              BlockData &BBInfo) {
+  // Store all memory accessing instructions: all these instructions have to be
+  // chained, so that their relative order can be preserved when rewriting the
+  // basic block.
+  SmallVector ToBeChainedInstructions;
+
+  for (Instruction &I : BB) {
+    if (isa(I) || I.isTerminator())
+      continue;
+
+    SmallPtrSet IUsers;
+    for (User *IU : I.users()) {
+      assert(isa(IU) && "Unexpected behaviour.");
+      auto *IUInst = cast(IU);
+      if (IUInst->getParent() != I.getParent())
+        continue;
+      if (isa(IUInst) || IUInst->isTerminator())
+        continue;
+
+      IUsers.insert(IUInst);
+    }
+    BBInfo.TopologicalGraph.insert({&I, IUsers});
+
+    if (I.mayReadOrWriteMemory() || I.mayHaveSideEffects())
+      ToBeChainedInstructions.push_back(&I);
+
+    if (auto *CI = dyn_cast(&I)) {
+      if (auto *CF = CI->getCalledFunction()) {
+        Intrinsic::ID ID = CF->getIntrinsicID();
+        if (ID == Intrinsic::vp_store || ID == Intrinsic::vp_scatter) {
+          BBInfo.MemoryWritingVPInstructions.push_back(&I);
+        }
+      }
+    }
+  }
+
+  if (ToBeChainedInstructions.size() > 1) {
+    for (unsigned I = 0; I < ToBeChainedInstructions.size() - 1; I++) {
+      auto *Parent = ToBeChainedInstructions[I];
+      auto *Child = ToBeChainedInstructions[I + 1];
+      BBInfo.TopologicalGraph[Parent].insert(Child);
+    }
+  }
+}
+
+namespace {
+void findCandidateVectorOperation(BasicBlock &BB, Value *Op, Value *Mask,
+                                  Value *EVL, BlockData &BBInfo) {
+  auto *OpInst = dyn_cast(Op);
+  if (!OpInst)
+    return;
+
+  if (OpInst->getParent() != &BB)
+    return;
+
+  Intrinsic::ID VPID = Intrinsic::not_intrinsic;
+  unsigned Opcode = OpInst->getOpcode();
+  if (Opcode == Instruction::Call) {
+    if (auto *CF = cast(OpInst)->getCalledFunction())
+      VPID = VPIntrinsic::getForIntrinsicID(CF->getIntrinsicID());
+  } else
+    VPID = VPIntrinsic::getForOpcode(OpInst->getOpcode());
+  if (VPID == Intrinsic::not_intrinsic)
+    return;
+
+  // If the instruction is already present in the map, it means it was already
+  // visited starting from a previous memory writing vp operation.
+  if (!BBInfo.VecOpsToTransform
+           .insert(std::make_pair(OpInst, std::make_pair(Mask, EVL)))
+           .second) {
+    // We need to check if the new mask and evl values differ from the old ones:
+    // - if they are the same, then there is nothing to do;
+    // - if only the mask differs, we use an all-ones mask;
+    // - otherwise, we remove the instruction from the map (i.e., no
+    //   transformation should happen)
+    // NOTE: maybe, instead of giving up, we could split case 3 into two
+    // more cases: if only the EVLs differ, we use VLMAX with the mask; if both
+    // the mask and the EVL differ, we use an all-ones mask and VLMAX (even if
+    // semantically it means not doing anything).
+    auto It = BBInfo.VecOpsToTransform.find(OpInst);
+    assert(It != BBInfo.VecOpsToTransform.end());
+    Value *OldMask, *OldEVL;
+    std::tie(OldMask, OldEVL) = It->second;
+
+    if (Mask == OldMask && EVL == OldEVL)
+      return;
+
+    BBInfo.VecOpsToTransform.erase(OpInst);
+    if (EVL == OldEVL) {
+      BBInfo.VecOpsToTransform.insert(
+          std::make_pair(OpInst, std::make_pair(nullptr, EVL)));
+    }
+  }
+
+  // Recursively visit OpInst operands.
+  switch (VPID) {
+  default:
+    for (auto *OpVal : OpInst->operand_values())
+      findCandidateVectorOperation(BB, OpVal, Mask, EVL, BBInfo);
+    break;
+  case Intrinsic::vp_select: {
+    auto CanBackPropagateCondOpAsMask = [&](Value *CondOp) -> bool {
+      if (!CondOp->getType()->isVectorTy())
+        return false;
+
+      auto *CondInstr = dyn_cast(CondOp);
+      if (!CondInstr)
+        return false;
+      if (CondInstr->getParent() != &BB)
+        return false;
+      if (auto *ALM = dyn_cast(CondInstr);
+          ALM && ALM->getCalledFunction()->getIntrinsicID() ==
+                     Intrinsic::get_active_lane_mask)
+        return false;
+
+      return true;
+    };
+
+    Value *Cond = OpInst->getOperand(0);
+    Value *TrueOp = OpInst->getOperand(1);
+    Value *FalseOp = OpInst->getOperand(2);
+    // If the condition argument is a vector, we backpropagate it as the mask
+    // for the true branch and its negation as the mask for the false one.
+    if (CanBackPropagateCondOpAsMask(Cond)) {
+      auto *CondInstr = cast(Cond);
+      IRBuilder<> Builder(CondInstr);
+      auto *CondNot = cast(Builder.CreateNot(Cond));
+      SmallPtrSet CondNotUsers;
+      BBInfo.TopologicalGraph.insert({CondNot, CondNotUsers});
+      BBInfo.TopologicalGraph[CondInstr].insert(CondNot);
+
+      findCandidateVectorOperation(BB, Cond, nullptr, EVL, BBInfo);
+      findCandidateVectorOperation(BB, CondNot, nullptr, EVL, BBInfo);
+
+      findCandidateVectorOperation(BB, TrueOp, Cond, EVL, BBInfo);
+      findCandidateVectorOperation(BB, FalseOp, CondNot, EVL, BBInfo);
+    } else {
+      findCandidateVectorOperation(BB, TrueOp, nullptr, EVL, BBInfo);
+      findCandidateVectorOperation(BB, FalseOp, nullptr, EVL, BBInfo);
+    }
+    break;
+  }
+  }
+}
+} // namespace
+
+// For each vector predicated memory writing operation of the basic block, go
+// back to the instruction defining the stored vector and verify that it is a
+// vector operation. Add it to the list of instructions to be transformed into
+// vector predicated ones, then recursively repeat the process for its vector
+// arguments.
+void VectorPredicationPass::findCandidateVectorOperations(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.MemoryWritingVPInstructions.empty()) + return; + + for (Instruction *I : BBInfo.MemoryWritingVPInstructions) { + assert(I->getParent() == &BB && "This is not the right basic block"); + auto *VPI = cast(I); + Value *StoredOperand = VPI->getMemoryDataParam(); + Value *MaskOperand = VPI->getMaskParam(); + Value *EVLOperand = VPI->getVectorLengthParam(); + // First, visit the mask operand (assigning an allones mask to this branch) + // and only then visit the stored operand. + findCandidateVectorOperation(BB, MaskOperand, nullptr, EVLOperand, BBInfo); + findCandidateVectorOperation(BB, StoredOperand, MaskOperand, EVLOperand, + BBInfo); + } +} + +// Add the candidates as users of the mask and of the evl linked to each of +// them, but only if they belong to the same basic block. +void VectorPredicationPass::addNewUsersToMasksAndEVLs(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + for (auto [K, V] : BBInfo.VecOpsToTransform) { + if (auto *MaskInst = dyn_cast_if_present(V.first); + MaskInst && MaskInst->getParent() == &BB) + BBInfo.TopologicalGraph[MaskInst].insert(K); + if (auto *EVLInst = dyn_cast(V.second); + EVLInst && EVLInst->getParent() == &BB) + BBInfo.TopologicalGraph[EVLInst].insert(K); + } +} + +// Topologically sort, preserving as much as possible the original order. +void VectorPredicationPass::buildNewBasicBlockSchedule(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + while (!BBInfo.TopologicalGraph.empty()) { + Instruction *Inst = nullptr; + for (auto B = BBInfo.TopologicalGraph.rbegin(), + E = BBInfo.TopologicalGraph.rend(); + B != E; B++) { + if (B->second.empty()) { + Inst = B->first; + break; + } + } + assert(Inst && "Failed to empty topological graph!"); + + BBInfo.NewBBReverseOrder.push_back(Inst); + BBInfo.TopologicalGraph.erase(Inst); + + for (auto B = BBInfo.TopologicalGraph.begin(), + E = BBInfo.TopologicalGraph.end(); + B != E; B++) { + B->second.erase(Inst); + } + } +} + +// Modify the basic block based on the topological order generated. +void VectorPredicationPass::emitNewBasicBlockSchedule(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + Instruction *InsertPoint = BB.getTerminator(); + for (Instruction *I : BBInfo.NewBBReverseOrder) { + I->moveBefore(InsertPoint); + InsertPoint = I; + } +} + +// Transform candidates to vector predicated instructions. 
+
+// Transform the candidates into vector-predicated instructions.
+void VectorPredicationPass::transformCandidateVectorOperations(
+    BasicBlock &BB, BlockData &BBInfo) {
+  if (BBInfo.VecOpsToTransform.empty())
+    return;
+
+  for (auto [I, P] : BBInfo.VecOpsToTransform) {
+    Value *Mask, *EVL;
+    std::tie(Mask, EVL) = P;
+
+    IRBuilder<> Builder(I);
+    unsigned int OpcodeOrIID = I->getOpcode();
+    Type *RetTy = I->getType();
+    SmallVector<Value *> Operands(I->value_op_begin(), I->value_op_end());
+    bool IsCall = false;
+    switch (OpcodeOrIID) {
+    case Instruction::Call: {
+      // For intrinsic calls, drop the callee operand and switch to the
+      // functional intrinsic ID.
+      Operands.clear();
+      auto *CI = cast<CallInst>(I);
+      for (auto &Op : CI->operands()) {
+        if (Op == CI->getCalledOperand())
+          continue;
+        Operands.push_back(Op.get());
+      }
+      OpcodeOrIID = CI->getCalledFunction()->getIntrinsicID();
+      IsCall = true;
+      break;
+    }
+    case Instruction::FCmp:
+    case Instruction::ICmp: {
+      // Compares carry their predicate as a metadata operand of the VP call.
+      Operands.clear();
+      auto *CmpI = cast<CmpInst>(I);
+      Value *PredOp = MetadataAsValue::get(
+          Builder.getContext(),
+          MDString::get(Builder.getContext(),
+                        CmpInst::getPredicateName(CmpI->getPredicate())));
+      Operands = {CmpI->getOperand(0), CmpI->getOperand(1), PredOp};
+      break;
+    }
+    case Instruction::Select: {
+      if (!I->getOperand(0)->getType()->isVectorTy()) {
+        // Scalar condition: splat it so it can feed llvm.vp.select.
+        Operands.clear();
+        Value *Op1 = I->getOperand(1);
+        Value *Op2 = I->getOperand(2);
+        Value *Cond = Builder.CreateVectorSplat(
+            cast<VectorType>(Op1->getType())->getElementCount(),
+            I->getOperand(0), "select.cond.splat");
+        Operands = {Cond, Op1, Op2};
+      } else if (auto *ALM = dyn_cast<CallInst>(I->getOperand(0));
+                 ALM && ALM->getCalledFunction()->getIntrinsicID() ==
+                            Intrinsic::get_active_lane_mask) {
+        // Ignore the select: the vector length operand already takes care of
+        // keeping track of the active elements.
+        I->replaceAllUsesWith(I->getOperand(1));
+        OldInstructionsToRemove.insert(std::make_pair(I, nullptr));
+
+        continue;
+      }
+      break;
+    }
+    default:
+      break;
+    }
+
+    // A null mask means an unmasked operation, hence we use an all-ones mask.
+    if (!Mask)
+      Mask = ConstantInt::getTrue(RetTy->getWithNewType(Builder.getInt1Ty()));
+
+    VectorBuilder VecBuilder(Builder);
+    VecBuilder.setMask(Mask).setEVL(EVL);
+    Value *NewVPOp = nullptr;
+    if (IsCall)
+      NewVPOp = VecBuilder.createVectorInstructionFromIntrinsicID(
+          OpcodeOrIID, RetTy, Operands, "vp.op");
+    else
+      NewVPOp = VecBuilder.createVectorInstructionFromOpcode(OpcodeOrIID, RetTy,
+                                                             Operands, "vp.op");
+
+    Transforms++; // Stats
+    OldInstructionsToRemove.insert(std::make_pair(I, NewVPOp));
+  }
+}
+
+// Remove the old instructions, if possible.
+void VectorPredicationPass::removeOldInstructions() {
+  for (auto [I, NewVPOp] : OldInstructionsToRemove) {
+    if (NewVPOp)
+      I->replaceAllUsesWith(NewVPOp);
+    if (isInstructionTriviallyDead(I))
+      I->eraseFromParent();
+  }
+}
+
+PreservedAnalyses VectorPredicationPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  assert(OldInstructionsToRemove.empty() &&
+         "Map should be cleared at the end of each run of the pass.");
+
+  for (BasicBlock &BB : F) {
+    BlockData BBInfo;
+
+    analyseBasicBlock(BB, BBInfo);
+    findCandidateVectorOperations(BB, BBInfo);
+    addNewUsersToMasksAndEVLs(BB, BBInfo);
+    buildNewBasicBlockSchedule(BB, BBInfo);
+    emitNewBasicBlockSchedule(BB, BBInfo);
+    transformCandidateVectorOperations(BB, BBInfo);
+  }
+
+  removeOldInstructions();
+  OldInstructionsToRemove.clear();
+
+  // TODO: think about which analyses are preserved.
+ return PreservedAnalyses::none(); +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll new file mode 100644 index 000000000000..03134f36c6ab --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll @@ -0,0 +1,140 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=loop-vectorize -use-vp-intrinsics -prefer-predicate-over-epilogue=predicate-dont-vectorize -o - < %s | FileCheck %s + +; ModuleID = 'custom/simple.c' +source_filename = "custom/simple.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) +; C[I] = A[I] + B[I]; +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C1]], [[A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP6]], 8 +; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[C1]], [[B3]] +; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI_CAST:%.*]] = trunc i64 [[EVL_PHI]] to i32 +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[EVL_PHI_CAST]] to i64 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP13]], i64 3, i64 1) +; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call @llvm.experimental.stepvector.nxv2i64() 
+; CHECK-NEXT: [[TMP17:%.*]] = add zeroinitializer, [[TMP16]] +; CHECK-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[VEC_IV]], i32 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP18]], i64 [[N]]) +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i32 0 +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i32 0 +; CHECK-NEXT: [[VP_LOAD5:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP22]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP23:%.*]] = fadd [[VP_LOAD]], [[VP_LOAD5]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, ptr [[TMP24]], i32 0 +; CHECK-NEXT: call void @llvm.vp.store.nxv2f64.p0( [[TMP23]], ptr [[TMP25]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP26]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_08]] +; CHECK-NEXT: [[TMP28:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_08]] +; CHECK-NEXT: [[TMP29:%.*]] = load double, ptr [[ARRAYIDX1]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_08]] +; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp7 = icmp sgt i64 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %I.08 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.08 + %0 = load double, ptr %arrayidx, align 
8, !tbaa !4 + %arrayidx1 = getelementptr inbounds double, ptr %B, i64 %I.08 + %1 = load double, ptr %arrayidx1, align 8, !tbaa !4 + %add = fadd double %0, %1 + %arrayidx2 = getelementptr inbounds double, ptr %C, i64 %I.08 + store double %add, ptr %arrayidx2, align 8, !tbaa !4 + %inc = add nuw nsw i64 %I.08, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !8 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9} +!9 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/Transforms/VectorPredication/if-elif-else.ll b/llvm/test/Transforms/VectorPredication/if-elif-else.ll new file mode 100644 index 000000000000..8241f17102c4 --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-elif-else.ll @@ -0,0 +1,270 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-elif-else.c' +source_filename = "custom/if-elif-else.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) { +; if (N < 50) +; C[I] = A[I] + B[I]; +; else if (N > 75) +; C[I] = A[I] * B[I]; +; else +; C[I] = 2 * A[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP30:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP30]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[N]], 50 +; CHECK-NEXT: [[CMP4:%.*]] = icmp ugt i64 [[N]], 75 +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 10) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: 
for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 3 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[UGLYGEP32:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[UGLYGEP33:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP32]], [[C]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND034:%.*]] = icmp ugt ptr [[UGLYGEP33]], [[C]] +; CHECK-NEXT: [[BOUND135:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]] +; CHECK-NEXT: [[FOUND_CONFLICT36:%.*]] = and i1 [[BOUND034]], [[BOUND135]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT36]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT37:%.*]] = insertelement poison, i1 [[CMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT38:%.*]] = shufflevector [[BROADCAST_SPLATINSERT37]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT39:%.*]] = insertelement poison, i1 [[CMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT40:%.*]] = shufflevector [[BROADCAST_SPLATINSERT39]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor [[BROADCAST_SPLAT38]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP6:%.*]] = select [[TMP5]], [[BROADCAST_SPLAT40]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = select [[BROADCAST_SPLAT38]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[BROADCAST_SPLAT40]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP8]], i64 3, i64 0) +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP10]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]), !tbaa [[TBAA4:![0-9]+]], !alias.scope !8 +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD41:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], [[TMP6]], i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !11 +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD41]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; CHECK-NEXT: [[VP_LOAD42:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], [[BROADCAST_SPLAT38]], i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !11 +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD42]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 
[[TMP9]]) +; CHECK-NEXT: [[VP_OP4:%.*]] = call @llvm.vp.select.nxv1f64( [[TMP7]], [[VP_OP1]], [[VP_OP]], i32 [[TMP9]]) +; CHECK-NEXT: [[VP_OP3:%.*]] = call @llvm.vp.select.nxv1f64( [[TMP6]], [[VP_OP2]], [[VP_OP4]], i32 [[TMP9]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP3]], ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !13, !noalias !15 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_END_LOOPEXIT44:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_031:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_031]] +; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_031]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP14]], [[TMP15]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else: +; CHECK-NEXT: br i1 [[CMP4]], label [[IF_THEN5:%.*]], label [[IF_ELSE9:%.*]] +; CHECK: if.then5: +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_031]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX7]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP14]], [[TMP16]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else9: +; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP14]], 2.000000e+00 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[ADD_SINK:%.*]] = phi double [ [[ADD]], [[IF_THEN]] ], [ [[MUL11]], [[IF_ELSE9]] ], [ [[MUL]], [[IF_THEN5]] ] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_031]] +; CHECK-NEXT: store double [[ADD_SINK]], ptr [[ARRAYIDX3]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_031]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit44: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp30 = icmp sgt i64 %N, 0 + br i1 %cmp30, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %cmp1 = icmp ult i64 %N, 50 + %cmp4 = icmp ugt i64 %N, 75 + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 10) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader, label %vector.memcheck + +for.body.preheader: ; preds = %vector.memcheck, %for.body.lr.ph + br label %for.body + +vector.memcheck: ; preds = %for.body.lr.ph + %4 = shl i64 %N, 3 + %uglygep = getelementptr i8, ptr %C, i64 %4 + %uglygep32 = getelementptr i8, ptr %A, i64 %4 + %uglygep33 = getelementptr i8, ptr %B, i64 %4 + %bound0 = icmp ugt ptr %uglygep32, %C + %bound1 = icmp ugt ptr %uglygep, %A + %found.conflict = and i1 
%bound0, %bound1 + %bound034 = icmp ugt ptr %uglygep33, %C + %bound135 = icmp ugt ptr %uglygep, %B + %found.conflict36 = and i1 %bound034, %bound135 + %conflict.rdx = or i1 %found.conflict, %found.conflict36 + br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph + +vector.ph: ; preds = %vector.memcheck + %broadcast.splatinsert37 = insertelement poison, i1 %cmp1, i64 0 + %broadcast.splat38 = shufflevector %broadcast.splatinsert37, poison, zeroinitializer + %broadcast.splatinsert39 = insertelement poison, i1 %cmp4, i64 0 + %broadcast.splat40 = shufflevector %broadcast.splatinsert39, poison, zeroinitializer + %5 = xor %broadcast.splat38, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) + %6 = select %5, %broadcast.splat40, zeroinitializer + %7 = select %broadcast.splat38, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), %broadcast.splat40 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %evl.phi = phi i64 [ %N, %vector.ph ], [ %evl.next, %vector.body ] + %8 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %8, i64 3, i64 0) + %9 = trunc i64 %vl to i32 + %10 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %10, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %9), !tbaa !4, !alias.scope !8 + %11 = fmul %vp.load, shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer) + %12 = getelementptr double, ptr %B, i64 %index + %vp.load41 = call @llvm.vp.load.nxv1f64.p0(ptr %12, %6, i32 %9), !tbaa !4, !alias.scope !11 + %13 = fmul %vp.load, %vp.load41 + %vp.load42 = call @llvm.vp.load.nxv1f64.p0(ptr %12, %broadcast.splat38, i32 %9), !tbaa !4, !alias.scope !11 + %14 = fadd %vp.load, %vp.load42 + %predphi = select %7, %14, %11 + %predphi43 = select %6, %13, %predphi + %15 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %predphi43, ptr %15, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %9), !tbaa !4, !alias.scope !13, !noalias !15 + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %16 = icmp eq i64 %index.next, %N + br i1 %16, label %for.end.loopexit44, label %vector.body, !llvm.loop !16 + +for.body: ; preds = %for.body.preheader, %for.inc + %I.031 = phi i64 [ %inc, %for.inc ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.031 + %17 = load double, ptr %arrayidx, align 8, !tbaa !4 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %for.body + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.031 + %18 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %17, %18 + br label %for.inc + +if.else: ; preds = %for.body + br i1 %cmp4, label %if.then5, label %if.else9 + +if.then5: ; preds = %if.else + %arrayidx7 = getelementptr inbounds double, ptr %B, i64 %I.031 + %19 = load double, ptr %arrayidx7, align 8, !tbaa !4 + %mul = fmul double %17, %19 + br label %for.inc + +if.else9: ; preds = %if.else + %mul11 = fmul double %17, 2.000000e+00 + br label %for.inc + +for.inc: ; preds = %if.then, %if.else9, %if.then5 + %add.sink = phi double [ %add, %if.then ], [ %mul11, %if.else9 ], [ %mul, %if.then5 ] + %arrayidx3 = getelementptr inbounds double, ptr %C, i64 %I.031 + store double %add.sink, ptr %arrayidx3, align 
8, !tbaa !4 + %inc = add nuw nsw i64 %I.031, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !20 + +for.end.loopexit: ; preds = %for.inc + br label %for.end + +for.end.loopexit44: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit44, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = !{!9} +!9 = distinct !{!9, !10} +!10 = distinct !{!10, !"LVerDomain"} +!11 = !{!12} +!12 = distinct !{!12, !10} +!13 = !{!14} +!14 = distinct !{!14, !10} +!15 = !{!9, !12} +!16 = distinct !{!16, !17, !18, !19} +!17 = !{!"llvm.loop.mustprogress"} +!18 = !{!"llvm.loop.isvectorized", i32 1} +!19 = !{!"llvm.loop.unroll.runtime.disable"} +!20 = distinct !{!20, !17, !18} diff --git a/llvm/test/Transforms/VectorPredication/if-elif-else_not-uniform.ll b/llvm/test/Transforms/VectorPredication/if-elif-else_not-uniform.ll new file mode 100644 index 000000000000..071c42c5ed6b --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-elif-else_not-uniform.ll @@ -0,0 +1,316 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s 
+ +; ModuleID = 'if-elif-else_not-uniform.c' +source_filename = "if-elif-else_not-uniform.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B, double *K) { +; long I; +; for (I = 0; I < N; I++) { +; if (K[I] < 50) +; C[I] = A[I] + B[I]; +; else if (K[I] > 75) +; C[I] = A[I] * B[I]; +; else +; C[I] = 2 * A[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %K) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @addVec +; CHECK-SAME: (i64 noundef [[N:%.*]], ptr nocapture noundef writeonly [[C:%.*]], ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef readonly [[K:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP33:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP33]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 12) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP3]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY_PREHEADER50:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader50: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[N]], 3 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[C]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP35:%.*]] = getelementptr i8, ptr [[K]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP36:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP35]], [[C]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[K]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND038:%.*]] = icmp ugt ptr [[SCEVGEP36]], [[C]] +; CHECK-NEXT: [[BOUND139:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT40:%.*]] = and i1 [[BOUND038]], [[BOUND139]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT40]] +; CHECK-NEXT: [[BOUND041:%.*]] = icmp ugt ptr [[SCEVGEP37]], [[C]] +; CHECK-NEXT: [[BOUND142:%.*]] = icmp ugt ptr [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[FOUND_CONFLICT43:%.*]] = and i1 [[BOUND041]], [[BOUND142]] +; CHECK-NEXT: [[CONFLICT_RDX44:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT43]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX44]], label [[FOR_BODY_PREHEADER50]], label [[VECTOR_BODY_PREHEADER:%.*]] +; CHECK: vector.body.preheader: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ], [ [[N]], [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP6]], i64 3, i64 1) +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[VL]] to i32 +; 
CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[N]]) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[K]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]), !tbaa [[TBAA7:![0-9]+]], !alias.scope !11 +; CHECK-NEXT: [[VP_OP7:%.*]] = call @llvm.vp.fcmp.nxv2f64( [[VP_LOAD]], shufflevector ( insertelement ( poison, double 5.000000e+01, i64 0), poison, zeroinitializer), metadata !"olt", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP3:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[VP_OP3]], zeroinitializer +; CHECK-NEXT: [[VP_OP16:%.*]] = call @llvm.vp.fcmp.nxv2f64( [[VP_LOAD]], shufflevector ( insertelement ( poison, double 7.500000e+01, i64 0), poison, zeroinitializer), metadata !"ogt", [[TMP9]], i32 [[TMP7]]) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD45:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP10]], [[VP_OP3]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !14 +; CHECK-NEXT: [[VP_OP8:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP9]], i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP12:%.*]] = call @llvm.vp.select.nxv2i1( [[VP_OP3]], [[VP_OP8]], zeroinitializer, i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP14:%.*]] = call @llvm.vp.fmul.nxv2f64( [[VP_LOAD45]], shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer), [[VP_OP12]], i32 [[TMP7]]) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_OP5:%.*]] = call @llvm.vp.select.nxv2i1( [[VP_OP3]], [[VP_OP16]], zeroinitializer, i32 [[TMP7]]) +; CHECK-NEXT: [[VP_LOAD46:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP11]], [[VP_OP5]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !16 +; CHECK-NEXT: [[VP_OP11:%.*]] = call @llvm.vp.fmul.nxv2f64( [[VP_LOAD45]], [[VP_LOAD46]], [[VP_OP5]], i32 [[TMP7]]) +; CHECK-NEXT: [[VP_LOAD47:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP10]], [[VP_OP7]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !14 +; CHECK-NEXT: [[VP_LOAD48:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP11]], [[VP_OP7]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !16 +; CHECK-NEXT: [[VP_OP4:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP6:%.*]] = call @llvm.vp.fadd.nxv2f64( [[VP_LOAD47]], [[VP_LOAD48]], [[VP_OP4]], i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP13:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP15:%.*]] = call @llvm.vp.select.nxv2f64( [[VP_OP12]], [[VP_OP14]], [[VP_OP6]], i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.or.nxv2i1( [[VP_OP7]], [[VP_OP12]], shufflevector ( insertelement ( poison, i1 true, i64 
0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP9:%.*]] = call @llvm.vp.or.nxv2i1( [[VP_OP2]], [[VP_OP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.select.nxv2f64( [[VP_OP5]], [[VP_OP11]], [[VP_OP15]], i32 [[TMP7]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2f64.p0( [[VP_OP]], ptr [[TMP12]], [[VP_OP9]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !18, !noalias !20 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: [[VP_OP10:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_END_LOOPEXIT51:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_034:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER50]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[K]], i64 [[I_034]] +; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt double [[TMP14]], 5.000000e+01 +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_034]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_034]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX3]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP15]], [[TMP16]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else: +; CHECK-NEXT: [[CMP6:%.*]] = fcmp ogt double [[TMP14]], 7.500000e+01 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_034]] +; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[ARRAYIDX8]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: br i1 [[CMP6]], label [[IF_THEN7:%.*]], label [[IF_ELSE11:%.*]] +; CHECK: if.then7: +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_034]] +; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr [[ARRAYIDX9]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP17]], [[TMP18]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else11: +; CHECK-NEXT: [[MUL13:%.*]] = fmul double [[TMP17]], 2.000000e+00 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[ADD_SINK:%.*]] = phi double [ [[ADD]], [[IF_THEN]] ], [ [[MUL13]], [[IF_ELSE11]] ], [ [[MUL]], [[IF_THEN7]] ] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_034]] +; CHECK-NEXT: store double [[ADD_SINK]], ptr [[ARRAYIDX4]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_034]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], 
label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit51: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp33 = icmp sgt i64 %N, 0 + br i1 %cmp33, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = shl nuw nsw i64 %1, 1 + %3 = call i64 @llvm.umax.i64(i64 %2, i64 12) + %4 = icmp ugt i64 %3, %0 + br i1 %4, label %for.body.preheader50, label %vector.memcheck + +for.body.preheader50: ; preds = %vector.memcheck, %for.body.preheader + br label %for.body + +vector.memcheck: ; preds = %for.body.preheader + %5 = shl i64 %N, 3 + %scevgep = getelementptr i8, ptr %C, i64 %5 + %scevgep35 = getelementptr i8, ptr %K, i64 %5 + %scevgep36 = getelementptr i8, ptr %A, i64 %5 + %scevgep37 = getelementptr i8, ptr %B, i64 %5 + %bound0 = icmp ugt ptr %scevgep35, %C + %bound1 = icmp ugt ptr %scevgep, %K + %found.conflict = and i1 %bound0, %bound1 + %bound038 = icmp ugt ptr %scevgep36, %C + %bound139 = icmp ugt ptr %scevgep, %A + %found.conflict40 = and i1 %bound038, %bound139 + %conflict.rdx = or i1 %found.conflict, %found.conflict40 + %bound041 = icmp ugt ptr %scevgep37, %C + %bound142 = icmp ugt ptr %scevgep, %B + %found.conflict43 = and i1 %bound041, %bound142 + %conflict.rdx44 = or i1 %conflict.rdx, %found.conflict43 + br i1 %conflict.rdx44, label %for.body.preheader50, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + br label %vector.body + +vector.body: ; preds = %vector.body.preheader, %vector.body + %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ] + %evl.phi = phi i64 [ %evl.next, %vector.body ], [ %N, %vector.body.preheader ] + %6 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %6, i64 3, i64 1) + %7 = trunc i64 %vl to i32 + %active.lane.mask = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index, i64 %N) + %8 = getelementptr inbounds double, ptr %K, i64 %index + %vp.load = call @llvm.vp.load.nxv2f64.p0(ptr %8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %7), !tbaa !7, !alias.scope !11 + %9 = fcmp olt %vp.load, shufflevector ( insertelement ( poison, double 5.000000e+01, i64 0), poison, zeroinitializer) + %10 = fcmp ogt %vp.load, shufflevector ( insertelement ( poison, double 7.500000e+01, i64 0), poison, zeroinitializer) + %11 = getelementptr double, ptr %A, i64 %index + %12 = xor %9, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) + %13 = select %active.lane.mask, %12, zeroinitializer + %vp.load45 = call @llvm.vp.load.nxv2f64.p0(ptr %11, %13, i32 %7), !tbaa !7, !alias.scope !14 + %14 = fmul %vp.load45, shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer) + %15 = getelementptr double, ptr %B, i64 %index + %16 = select %13, %10, zeroinitializer + %vp.load46 = call @llvm.vp.load.nxv2f64.p0(ptr %15, %16, i32 %7), !tbaa !7, !alias.scope !16 + %17 = fmul %vp.load45, %vp.load46 + %18 = select %active.lane.mask, %9, zeroinitializer + %vp.load47 = call @llvm.vp.load.nxv2f64.p0(ptr %11, %18, i32 %7), !tbaa !7, !alias.scope !14 + %vp.load48 = call @llvm.vp.load.nxv2f64.p0(ptr %15, %18, i32 %7), !tbaa !7, !alias.scope !16 + %19 = fadd %vp.load47, %vp.load48 + %20 = xor %10, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, 
zeroinitializer) + %21 = select %13, %20, zeroinitializer + %predphi = select %21, %14, %19 + %predphi49 = select %16, %17, %predphi + %22 = getelementptr inbounds double, ptr %C, i64 %index + %23 = or %18, %21 + %24 = or %23, %16 + call void @llvm.vp.store.nxv2f64.p0( %predphi49, ptr %22, %24, i32 %7), !tbaa !7, !alias.scope !18, !noalias !20 + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %25 = icmp eq i64 %index.next, %N + br i1 %25, label %for.end.loopexit51, label %vector.body, !llvm.loop !21 + +for.body: ; preds = %for.body.preheader50, %for.inc + %I.034 = phi i64 [ %inc, %for.inc ], [ 0, %for.body.preheader50 ] + %arrayidx = getelementptr inbounds double, ptr %K, i64 %I.034 + %26 = load double, ptr %arrayidx, align 8, !tbaa !7 + %cmp1 = fcmp olt double %26, 5.000000e+01 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %for.body + %arrayidx2 = getelementptr inbounds double, ptr %A, i64 %I.034 + %27 = load double, ptr %arrayidx2, align 8, !tbaa !7 + %arrayidx3 = getelementptr inbounds double, ptr %B, i64 %I.034 + %28 = load double, ptr %arrayidx3, align 8, !tbaa !7 + %add = fadd double %27, %28 + br label %for.inc + +if.else: ; preds = %for.body + %cmp6 = fcmp ogt double %26, 7.500000e+01 + %arrayidx8 = getelementptr inbounds double, ptr %A, i64 %I.034 + %29 = load double, ptr %arrayidx8, align 8, !tbaa !7 + br i1 %cmp6, label %if.then7, label %if.else11 + +if.then7: ; preds = %if.else + %arrayidx9 = getelementptr inbounds double, ptr %B, i64 %I.034 + %30 = load double, ptr %arrayidx9, align 8, !tbaa !7 + %mul = fmul double %29, %30 + br label %for.inc + +if.else11: ; preds = %if.else + %mul13 = fmul double %29, 2.000000e+00 + br label %for.inc + +for.inc: ; preds = %if.then, %if.else11, %if.then7 + %add.sink = phi double [ %add, %if.then ], [ %mul13, %if.else11 ], [ %mul, %if.then7 ] + %arrayidx4 = getelementptr inbounds double, ptr %C, i64 %I.034 + store double %add.sink, ptr %arrayidx4, align 8, !tbaa !7 + %inc = add nuw nsw i64 %I.034, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !25 + +for.end.loopexit: ; preds = %for.inc + br label %for.end + +for.end.loopexit51: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit51, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv2i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv2f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv2f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" 
"target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-smaia,-experimental-ssaia,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zcmt,-experimental-zfa,-experimental-zicond,-experimental-zihintntl,-experimental-ztso,-experimental-zvbb,-experimental-zvbc,-experimental-zvfh,-experimental-zvkg,-experimental-zvkn,-experimental-zvkned,-experimental-zvkng,-experimental-zvknha,-experimental-zvknhb,-experimental-zvks,-experimental-zvksed,-experimental-zvksg,-experimental-zvksh,-experimental-zvkt,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xsfvcp,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zicntr,-zihintpause,-zihpm,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5} +!llvm.ident = !{!6} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"PIC Level", i32 2} +!3 = !{i32 7, !"PIE Level", i32 2} +!4 = !{i32 7, !"uwtable", i32 2} +!5 = !{i32 8, !"SmallDataLimit", i32 8} +!6 = !{!"clang version 17.0.0"} +!7 = !{!8, !8, i64 0} +!8 = !{!"double", !9, i64 0} +!9 = !{!"omnipotent char", !10, i64 0} +!10 = !{!"Simple C/C++ TBAA"} +!11 = !{!12} +!12 = distinct !{!12, !13} +!13 = distinct !{!13, !"LVerDomain"} +!14 = !{!15} +!15 = distinct !{!15, !13} +!16 = !{!17} +!17 = distinct !{!17, !13} +!18 = !{!19} +!19 = distinct !{!19, !13} +!20 = !{!12, !15, !17} +!21 = distinct !{!21, !22, !23, !24} +!22 = !{!"llvm.loop.mustprogress"} +!23 = !{!"llvm.loop.isvectorized", i32 1} +!24 = !{!"llvm.loop.unroll.runtime.disable"} +!25 = distinct !{!25, !22, !23} diff --git a/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll b/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll new file mode 100644 index 000000000000..ed8f28feeffc --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-else2.c' +source_filename = "custom/if-else2.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) { +; if (N < 50) +; C[I] = A[I] + B[I]; +; else +; C[I] = A[I] * B[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: 
[[B22:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A21:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C20:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP18:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP18]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[N]], 50 +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C20]], [[A21]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C20]], [[B22]] +; CHECK-NEXT: [[DIFF_CHECK23:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK23]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY_PREHEADER:%.*]] +; CHECK: vector.body.preheader: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ], [ [[N]], [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP9]], i64 3, i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD24:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[SELECT_COND_SPLAT_SPLATINSERT:%.*]] = insertelement poison, i1 [[CMP1]], i64 0 +; CHECK-NEXT: [[SELECT_COND_SPLAT_SPLAT:%.*]] = shufflevector [[SELECT_COND_SPLAT_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.select.nxv1f64( [[SELECT_COND_SPLAT_SPLAT]], [[VP_OP]], [[VP_OP2]], i32 [[TMP10]]) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP1]], ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 
[[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[FOR_END_LOOPEXIT25:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_019]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_019]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[MUL_SINK:%.*]] = select i1 [[CMP1]], double [[ADD]], double [[MUL]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_019]] +; CHECK-NEXT: store double [[MUL_SINK]], ptr [[TMP17]], align 8 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_019]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit25: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B22 = ptrtoint ptr %B to i64 + %A21 = ptrtoint ptr %A to i64 + %C20 = ptrtoint ptr %C to i64 + %cmp18 = icmp sgt i64 %N, 0 + br i1 %cmp18, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %cmp1 = icmp ult i64 %N, 50 + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 8) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader, label %vector.memcheck + +for.body.preheader: ; preds = %vector.memcheck, %for.body.lr.ph + br label %for.body + +vector.memcheck: ; preds = %for.body.lr.ph + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C20, %A21 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C20, %B22 + %diff.check23 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check23 + br i1 %conflict.rdx, label %for.body.preheader, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + br label %vector.body + +vector.body: ; preds = %vector.body.preheader, %vector.body + %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ] + %evl.phi = phi i64 [ %evl.next, %vector.body ], [ %N, %vector.body.preheader ] + %9 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %9, i64 3, i64 0) + %10 = trunc i64 %vl to i32 + %11 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %11, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %12 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load24 = call @llvm.vp.load.nxv1f64.p0(ptr %12, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %13 = fadd %vp.load, %vp.load24 + %14 = fmul %vp.load, %vp.load24 + %15 = select i1 %cmp1, %13, %14 + %16 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %15, ptr %16, shufflevector ( insertelement ( poison, i1 
true, i64 0), poison, zeroinitializer), i32 %10) + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %17 = icmp eq i64 %index.next, %N + br i1 %17, label %for.end.loopexit25, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader, %for.body + %I.019 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.019 + %18 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.019 + %19 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %18, %19 + %mul = fmul double %18, %19 + %mul.sink = select i1 %cmp1, double %add, double %mul + %20 = getelementptr inbounds double, ptr %C, i64 %I.019 + store double %mul.sink, ptr %20, align 8 + %inc = add nuw nsw i64 %I.019, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit25: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit25, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = 
!{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} diff --git a/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll b/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll new file mode 100644 index 000000000000..34e4c63c12af --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll @@ -0,0 +1,220 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-else1.c' +source_filename = "custom/if-else1.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) { +; if (I < 50) +; C[I] = A[I] + B[I]; +; else +; C[I] = A[I] * B[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B22:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A21:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C20:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP18:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP18]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER25:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader25: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C20]], [[A21]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C20]], [[B22]] +; CHECK-NEXT: [[DIFF_CHECK23:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK23]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER25]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv1i64() +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP10]], i64 3, i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[VL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VP_OP2:%.*]] 
= call @llvm.vp.icmp.nxv1i64( [[VEC_IND]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer), metadata !"ult", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD24:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP3:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], [[VP_OP2]], i32 [[TMP11]]) +; CHECK-NEXT: [[VP_OP4:%.*]] = call @llvm.vp.xor.nxv1i1( [[VP_OP2]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], [[VP_OP4]], i32 [[TMP11]]) +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.select.nxv1f64( [[VP_OP2]], [[VP_OP3]], [[VP_OP1]], i32 [[TMP11]]) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP]], ptr [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP15]], label [[FOR_END_LOOPEXIT26:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER25]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[I_019]], 50 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_019]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_019]] +; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[MUL_SINK:%.*]] = select i1 [[CMP1]], double [[ADD]], double [[MUL]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_019]] +; CHECK-NEXT: store double [[MUL_SINK]], ptr [[TMP18]], align 8 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_019]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit26: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B22 = ptrtoint ptr %B to i64 + %A21 = ptrtoint ptr %A to i64 + %C20 = ptrtoint ptr %C to i64 + %cmp18 = icmp sgt i64 %N, 0 + br i1 %cmp18, label %for.body.preheader, label %for.end + 
+for.body.preheader: ; preds = %entry + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 8) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader25, label %vector.memcheck + +for.body.preheader25: ; preds = %vector.memcheck, %for.body.preheader + br label %for.body + +vector.memcheck: ; preds = %for.body.preheader + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C20, %A21 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C20, %B22 + %diff.check23 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check23 + br i1 %conflict.rdx, label %for.body.preheader25, label %vector.ph + +vector.ph: ; preds = %vector.memcheck + %9 = call @llvm.experimental.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %evl.phi = phi i64 [ %N, %vector.ph ], [ %evl.next, %vector.body ] + %vec.ind = phi [ %9, %vector.ph ], [ %vec.ind.next, %vector.body ] + %10 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %10, i64 3, i64 0) + %11 = trunc i64 %vl to i32 + %.splatinsert = insertelement poison, i64 %vl, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %12 = icmp ult %vec.ind, shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) + %13 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %13, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11), !tbaa !4 + %14 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load24 = call @llvm.vp.load.nxv1f64.p0(ptr %14, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11), !tbaa !4 + %15 = fadd %vp.load, %vp.load24 + %16 = fmul %vp.load, %vp.load24 + %17 = select %12, %15, %16 + %18 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %17, ptr %18, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11) + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %vec.ind.next = add %vec.ind, %.splat + %19 = icmp eq i64 %index.next, %N + br i1 %19, label %for.end.loopexit26, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader25, %for.body + %I.019 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader25 ] + %cmp1 = icmp ult i64 %I.019, 50 + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.019 + %20 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.019 + %21 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %20, %21 + %mul = fmul double %20, %21 + %mul.sink = select i1 %cmp1, double %add, double %mul + %22 = getelementptr inbounds double, ptr %C, i64 %I.019 + store double %mul.sink, ptr %22, align 8 + %inc = add nuw nsw i64 %I.019, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit26: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit26, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind 
speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} diff --git a/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll b/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll new file mode 100644 index 000000000000..116d883572ee --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/simple.c' +source_filename = "custom/simple.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) +; C[I] = A[I] + B[I]; +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B11:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A10:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; 
CHECK-NEXT: [[C9:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 10) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER14:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader14: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C9]], [[A10]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C9]], [[B11]] +; CHECK-NEXT: [[DIFF_CHECK12:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK12]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER14]], label [[VECTOR_BODY_PREHEADER:%.*]] +; CHECK: vector.body.preheader: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ], [ [[N]], [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP9]], i64 3, i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD13:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP]], ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[FOR_END_LOOPEXIT15:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER14]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_08]] +; CHECK-NEXT: 
[[TMP16:%.*]] = load double, ptr [[ARRAYIDX1]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_08]] +; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit15: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B11 = ptrtoint ptr %B to i64 + %A10 = ptrtoint ptr %A to i64 + %C9 = ptrtoint ptr %C to i64 + %cmp7 = icmp sgt i64 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 10) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader14, label %vector.memcheck + +for.body.preheader14: ; preds = %vector.memcheck, %for.body.preheader + br label %for.body + +vector.memcheck: ; preds = %for.body.preheader + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C9, %A10 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C9, %B11 + %diff.check12 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check12 + br i1 %conflict.rdx, label %for.body.preheader14, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + br label %vector.body + +vector.body: ; preds = %vector.body.preheader, %vector.body + %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ] + %evl.phi = phi i64 [ %evl.next, %vector.body ], [ %N, %vector.body.preheader ] + %9 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %9, i64 3, i64 0) + %10 = trunc i64 %vl to i32 + %11 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %11, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %12 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load13 = call @llvm.vp.load.nxv1f64.p0(ptr %12, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %13 = fadd %vp.load, %vp.load13 + %14 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %13, ptr %14, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %15 = icmp eq i64 %index.next, %N + br i1 %15, label %for.end.loopexit15, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader14, %for.body + %I.08 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader14 ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.08 + %16 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx1 = getelementptr inbounds double, ptr %B, i64 %I.08 + %17 = load double, ptr %arrayidx1, align 8, !tbaa !4 + %add = fadd double %16, %17 + %arrayidx2 = getelementptr inbounds double, ptr %C, i64 %I.08 + store double %add, ptr %arrayidx2, align 8, !tbaa !4 + %inc = add nuw nsw i64 %I.08, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, 
label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit15: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit15, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} diff --git a/llvm/unittests/IR/VectorBuilderTest.cpp b/llvm/unittests/IR/VectorBuilderTest.cpp index 4f9e9d7c494d..7b0109a77b3e 100644 --- a/llvm/unittests/IR/VectorBuilderTest.cpp +++ b/llvm/unittests/IR/VectorBuilderTest.cpp @@ -66,8 +66,8 @@ TEST_F(VectorBuilderTest, TestCreateBinaryInstructions) { bool IsFP = (#INSTCLASS)[0] == 'F'; \ auto *ValueTy = IsFP ? 
FloatVecTy : IntVecTy;                                       \
     Value *Op = UndefValue::get(ValueTy);                                     \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,    \
-                                             {Op, Op});                       \
+    auto *I = VBuild.createVectorInstructionFromOpcode(Instruction::OPCODE,   \
+                                                       ValueTy, {Op, Op});    \
     ASSERT_TRUE(isa<VPIntrinsic>(I));                                         \
     auto *VPIntrin = cast<VPIntrinsic>(I);                                    \
     ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                              \
@@ -116,8 +116,8 @@ TEST_F(VectorBuilderTest, TestCreateBinaryInstructions_FixedVector_NoMask) {
     bool IsFP = (#INSTCLASS)[0] == 'F';                                       \
     Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                             \
     Value *Op = UndefValue::get(ValueTy);                                     \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,    \
-                                             {Op, Op});                       \
+    auto *I = VBuild.createVectorInstructionFromOpcode(Instruction::OPCODE,   \
+                                                       ValueTy, {Op, Op});    \
     ASSERT_TRUE(isa<VPIntrinsic>(I));                                         \
     auto *VPIntrin = cast<VPIntrinsic>(I);                                    \
     ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                              \
@@ -162,8 +162,8 @@ TEST_F(VectorBuilderTest, TestCreateBinaryInstructions_FixedVector_NoEVL) {
     bool IsFP = (#INSTCLASS)[0] == 'F';                                       \
     Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                             \
     Value *Op = UndefValue::get(ValueTy);                                     \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,    \
-                                             {Op, Op});                       \
+    auto *I = VBuild.createVectorInstructionFromOpcode(Instruction::OPCODE,   \
+                                                       ValueTy, {Op, Op});    \
     ASSERT_TRUE(isa<VPIntrinsic>(I));                                         \
     auto *VPIntrin = cast<VPIntrinsic>(I);                                    \
     ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                              \
@@ -197,8 +197,8 @@ TEST_F(VectorBuilderTest,
     bool IsFP = (#INSTCLASS)[0] == 'F';                                       \
     Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                             \
     Value *Op = UndefValue::get(ValueTy);                                     \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,    \
-                                             {Op, Op});                       \
+    auto *I = VBuild.createVectorInstructionFromOpcode(Instruction::OPCODE,   \
+                                                       ValueTy, {Op, Op});    \
     ASSERT_TRUE(isa<VPIntrinsic>(I));                                         \
     auto *VPIntrin = cast<VPIntrinsic>(I);                                    \
     ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                              \
@@ -227,8 +227,8 @@ TEST_F(VectorBuilderTest, TestCreateLoadStore) {
 
   // vp.load
   auto LoadVPID = VPIntrinsic::getForOpcode(Instruction::Load);
-  auto *LoadIntrin = VBuild.createVectorInstruction(Instruction::Load,
-                                                    FloatVecTy, {FloatVecPtr});
+  auto *LoadIntrin = VBuild.createVectorInstructionFromOpcode(
+      Instruction::Load, FloatVecTy, {FloatVecPtr});
   ASSERT_TRUE(isa<VPIntrinsic>(LoadIntrin));
   auto *VPLoad = cast<VPIntrinsic>(LoadIntrin);
   ASSERT_EQ(VPLoad->getIntrinsicID(), LoadVPID);
@@ -237,8 +237,8 @@ TEST_F(VectorBuilderTest, TestCreateLoadStore) {
   // vp.store
   auto *VoidTy = Builder.getVoidTy();
   auto StoreVPID = VPIntrinsic::getForOpcode(Instruction::Store);
-  auto *StoreIntrin = VBuild.createVectorInstruction(Instruction::Store, VoidTy,
-                                                     {FloatVec, FloatVecPtr});
+  auto *StoreIntrin = VBuild.createVectorInstructionFromOpcode(
+      Instruction::Store, VoidTy, {FloatVec, FloatVecPtr});
   ASSERT_TRUE(isa<VPIntrinsic>(LoadIntrin));
   auto *VPStore = cast<VPIntrinsic>(StoreIntrin);
   ASSERT_EQ(VPStore->getIntrinsicID(), StoreVPID);
@@ -257,7 +257,8 @@ TEST_F(VectorBuilderTest, TestFail_SilentlyReturnNone) {
   auto *VoidTy = Builder.getVoidTy();
   VectorBuilder VBuild(Builder, VectorBuilder::Behavior::SilentlyReturnNone);
   VBuild.setMask(Mask).setEVL(EVL);
-  auto *Val = VBuild.createVectorInstruction(Instruction::Br, VoidTy, {});
+  auto *Val =
+      VBuild.createVectorInstructionFromOpcode(Instruction::Br, VoidTy, {});
   ASSERT_EQ(Val, nullptr);
 }
 
@@ -272,8 +273,11 @@ TEST_F(VectorBuilderTest, TestFail_ReportAndAbort) {
   auto *VoidTy = Builder.getVoidTy();
   VectorBuilder VBuild(Builder, VectorBuilder::Behavior::ReportAndAbort);
   VBuild.setMask(Mask).setEVL(EVL);
-  ASSERT_DEATH({ VBuild.createVectorInstruction(Instruction::Br, VoidTy, {}); },
-               "No VPIntrinsic for this opcode");
+  ASSERT_DEATH(
+      {
+        VBuild.createVectorInstructionFromOpcode(Instruction::Br, VoidTy, {});
+      },
+      "No VPIntrinsic for this opcode");
 }
 
 } // end anonymous namespace
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 65d241feeab2..539701822bfb 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1036,7 +1036,8 @@ TEST(VPRecipeTest, CastVPWidenMemoryInstructionRecipeToVPUserAndVPDef) {
       new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1));
   VPValue Addr;
   VPValue Mask;
-  VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, true, false);
+  VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, nullptr, true,
+                                        false);
   EXPECT_TRUE(isa<VPUser>(&Recipe));
   VPRecipeBase *BaseR = &Recipe;
   EXPECT_TRUE(isa<VPUser>(BaseR));
@@ -1131,7 +1132,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
       new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1));
   VPValue Addr;
   VPValue Mask;
-  VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, true, false);
+  VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, nullptr, true,
+                                        false);
   EXPECT_FALSE(Recipe.mayHaveSideEffects());
   EXPECT_TRUE(Recipe.mayReadFromMemory());
   EXPECT_FALSE(Recipe.mayWriteToMemory());
@@ -1145,8 +1147,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
     VPValue Addr;
     VPValue Mask;
     VPValue StoredV;
-    VPWidenMemoryInstructionRecipe Recipe(*Store, &Addr, &StoredV, &Mask, false,
-                                          false);
+    VPWidenMemoryInstructionRecipe Recipe(*Store, &Addr, &StoredV, &Mask,
+                                          nullptr, false, false);
     EXPECT_TRUE(Recipe.mayHaveSideEffects());
     EXPECT_FALSE(Recipe.mayReadFromMemory());
     EXPECT_TRUE(Recipe.mayWriteToMemory());
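
A minimal sketch (not part of the patch) of how the widened-load recipe from the VPlanTest.cpp hunks above might be constructed once a real value is supplied for the new operand. The assumption that this fourth operand carries an explicit-vector-length (EVL) VPValue, and the EVL name itself, are illustrative only; everything else mirrors the existing test setup shown in the hunks:

  // Sketch under the assumption that the new (fourth) constructor operand is
  // an EVL VPValue; the nullptr used in the updated tests is replaced here.
  LLVMContext C;
  IntegerType *Int32 = IntegerType::get(C, 32);
  auto *Int32Ptr = PointerType::get(Int32, 0);
  auto *Load =
      new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1));
  VPValue Addr;
  VPValue Mask;
  VPValue EVL; // hypothetical EVL operand passed instead of nullptr
  VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, &EVL, true, false);
  EXPECT_TRUE(Recipe.mayReadFromMemory()); // a widened load still reads memory
  delete Load;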