diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index e2a127ff35be..fb7bf098f860 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -23,6 +23,7 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/IR/FMF.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" @@ -1714,6 +1715,9 @@ class TargetTransformInfo { /// \return The maximum number of function arguments the target supports. unsigned getMaxNumArgs() const; + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const; + /// @} private: @@ -2088,6 +2092,8 @@ class TargetTransformInfo::Concept { getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; + virtual Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const = 0; }; template @@ -2815,6 +2821,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } + + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const override { + return Impl.computeVectorLength(Builder, AVL, VF); + } }; template diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 1d8f523e9792..4195dcaa6394 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -19,6 +19,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" @@ -908,6 +909,21 @@ class TargetTransformInfoImplBase { unsigned getMaxNumArgs() const { return UINT_MAX; } + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + if (!VF.isScalable()) { + return ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()); + } + + Constant *EC = + ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue()); + Value *VLMax = Builder.CreateVScale(EC, "vlmax"); + Value *VL = Builder.CreateZExtOrTrunc(AVL, Builder.getInt32Ty(), "vl"); + + return Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::umin, + {VLMax, VL}, nullptr, "evl"); + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. 
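The default computeVectorLength above clamps the requested length to the number of lanes the target provides. As an illustrative sketch (not part of the patch; value names are made up apart from the "vlmax"/"vl"/"evl" twines used in the code): for a scalable VF with a known minimum of 2 lanes and a 64-bit AVL %avl, the emitted IR is expected to look roughly like

  %vscale = call i32 @llvm.vscale.i32()
  %vlmax  = mul i32 %vscale, 2
  %vl     = trunc i64 %avl to i32
  %evl    = call i32 @llvm.umin.i32(i32 %vlmax, i32 %vl)

while a fixed VF simply returns the constant number of lanes.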
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 5e7bdcdf72a4..0a9b2cfd266a 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -35,6 +35,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -2558,6 +2559,21 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { InstructionCost getVectorSplitCost() { return 1; } + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + if (!VF.isScalable()) { + return ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()); + } + + Constant *EC = + ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue()); + Value *VLMax = Builder.CreateVScale(EC, "vlmax"); + Value *VL = Builder.CreateZExtOrTrunc(AVL, Builder.getInt32Ty(), "vl"); + + return Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::umin, + {VLMax, VL}, nullptr, "evl"); + } + /// @} }; diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index 8940bebd2c9a..560897a04052 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -564,6 +564,9 @@ class VPIntrinsic : public IntrinsicInst { /// The llvm.vp.* intrinsics for this instruction Opcode static Intrinsic::ID getForOpcode(unsigned OC); + /// The llvm.vp.* intrinsics for this intrinsic ID + static Intrinsic::ID getForIntrinsicID(Intrinsic::ID IID); + // Whether \p ID is a VP intrinsic ID. static bool isVPIntrinsic(Intrinsic::ID); diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h index 301edaed70fe..654486f210ef 100644 --- a/llvm/include/llvm/IR/VectorBuilder.h +++ b/llvm/include/llvm/IR/VectorBuilder.h @@ -57,6 +57,10 @@ class VectorBuilder { return RetType(); } + Value *createVectorInstruction(Intrinsic::ID VPID, Type *ReturnTy, + ArrayRef VecOpArray, + const Twine &Name = Twine()); + public: VectorBuilder(IRBuilderBase &Builder, Behavior ErrorHandling = Behavior::ReportAndAbort) @@ -89,9 +93,19 @@ class VectorBuilder { // \p Opcode The functional instruction opcode of the emitted intrinsic. // \p ReturnTy The return type of the operation. // \p VecOpArray The operand list. - Value *createVectorInstruction(unsigned Opcode, Type *ReturnTy, - ArrayRef VecOpArray, - const Twine &Name = Twine()); + Value *createVectorInstructionFromOpcode(unsigned Opcode, Type *ReturnTy, + ArrayRef VecOpArray, + const Twine &Name = Twine()); + + // Emit a VP intrinsic call that mimics a regular intrinsic. + // This operation behaves according to the VectorBuilderBehavior. + // \p IID The functional intrinsic ID of the emitted VP intrinsic. + // \p ReturnTy The return type of the operation. + // \p VecOpArray The operand list. 
+ Value *createVectorInstructionFromIntrinsicID(Intrinsic::ID IID, + Type *ReturnTy, + ArrayRef VecOpArray, + const Twine &Name = Twine()); }; } // namespace llvm diff --git a/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h b/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h new file mode 100644 index 000000000000..ce59854dbb95 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h @@ -0,0 +1,55 @@ +#ifndef LLVM_TRANSFORMS_VECTORPREDICATION_H +#define LLVM_TRANSFORMS_VECTORPREDICATION_H + +#include "llvm/ADT/MapVector.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +using InstToMaskEVLMap = DenseMap>; + +struct BlockData { + // Vector that stores all vector predicated memory writing operations found in + // the basic block. If after phase 1 is empty, then the basic block can be + // skipped by following phases. + SmallVector MemoryWritingVPInstructions; + + // Store all instructions of the basic block (in the same order as they are + // found), assigning to each the list of users. Skip PHIs and terminators. + MapVector> TopologicalGraph; + + // Map each full-length vector operation eligible to be transformed to a + // vector predication one with the (mask,evl) pair of its first vector + // predicated memory writing operation user. + InstToMaskEVLMap VecOpsToTransform; + + // Ordered list representing the reverse order of how the basic block has to + // be transformed due to the new vector predicated instructions. + SmallVector NewBBReverseOrder; + + BlockData() = default; +}; + +class VectorPredicationPass : public PassInfoMixin { +private: + // List of instructions to be replaced by the new VP operations and that later + // should be removed, if possible. + DenseMap OldInstructionsToRemove; + + void analyseBasicBlock(BasicBlock &BB, BlockData &BBInfo); + void findCandidateVectorOperations(BasicBlock &BB, BlockData &BBInfo); + void addNewUsersToMasksAndEVLs(BasicBlock &BB, BlockData &BBInfo); + void buildNewBasicBlockSchedule(BasicBlock &BB, BlockData &BBInfo); + void emitNewBasicBlockSchedule(BasicBlock &BB, BlockData &BBInfo); + void transformCandidateVectorOperations(BasicBlock &BB, BlockData &BBInfo); + + void removeOldInstructions(); + +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static StringRef name() { return "VectorPredicationPass"; } +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORPREDICATION_H diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 3f76dfdaac31..b79047627d1f 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1264,6 +1264,12 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } +Value *TargetTransformInfo::computeVectorLength(IRBuilderBase &Builder, + Value *AVL, + ElementCount VF) const { + return TTIImpl->computeVectorLength(Builder, AVL, VF); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 7a3b708e7400..3aa33dfc2afd 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -604,6 +604,19 @@ Intrinsic::ID VPIntrinsic::getForOpcode(unsigned IROPC) { return Intrinsic::not_intrinsic; } +Intrinsic::ID VPIntrinsic::getForIntrinsicID(Intrinsic::ID IID) { + switch (IID) { + default: + 
break; + +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) break; +#define VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTR) case Intrinsic::INTR: +#define END_REGISTER_VP_INTRINSIC(VPID) return Intrinsic::VPID; +#include "llvm/IR/VPIntrinsics.def" + } + return Intrinsic::not_intrinsic; +} + bool VPIntrinsic::canIgnoreVectorLengthParam() const { using namespace PatternMatch; diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp index c07bc0561fba..c94bc5b180f5 100644 --- a/llvm/lib/IR/VectorBuilder.cpp +++ b/llvm/lib/IR/VectorBuilder.cpp @@ -51,13 +51,30 @@ Value &VectorBuilder::requestEVL() { return *ConstantInt::get(IntTy, StaticVectorLength.getFixedValue()); } -Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy, - ArrayRef InstOpArray, - const Twine &Name) { +Value *VectorBuilder::createVectorInstructionFromOpcode( + unsigned Opcode, Type *ReturnTy, ArrayRef InstOpArray, + const Twine &Name) { auto VPID = VPIntrinsic::getForOpcode(Opcode); if (VPID == Intrinsic::not_intrinsic) return returnWithError("No VPIntrinsic for this opcode"); + return createVectorInstruction(VPID, ReturnTy, InstOpArray, Name); +} + +Value *VectorBuilder::createVectorInstructionFromIntrinsicID( + Intrinsic::ID IID, Type *ReturnTy, ArrayRef InstOpArray, + const Twine &Name) { + auto VPID = VPIntrinsic::getForIntrinsicID(IID); + if (VPID == Intrinsic::not_intrinsic) + return returnWithError("No VPIntrinsic for this Intrinsic"); + + return createVectorInstruction(VPID, ReturnTy, InstOpArray, Name); +} + +Value *VectorBuilder::createVectorInstruction(Intrinsic::ID VPID, + Type *ReturnTy, + ArrayRef InstOpArray, + const Twine &Name) { auto MaskPosOpt = VPIntrinsic::getMaskParamPos(VPID); auto VLenPosOpt = VPIntrinsic::getVectorLengthParamPos(VPID); size_t NumInstParams = InstOpArray.size(); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index f94bd422c6b5..973d6cd7d17a 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -281,6 +281,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/Transforms/Vectorize/VectorPredication.h" #include using namespace llvm; diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 5c6c391049a7..82ba63b5d0ae 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -135,6 +135,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/Transforms/Vectorize/VectorPredication.h" using namespace llvm; @@ -285,6 +286,11 @@ cl::opt EnableMemProfContextDisambiguation( extern cl::opt EnableInferAlignmentPass; } // namespace llvm +static cl::opt + EnableVectorPredication("enable-vector-predication", cl::init(false), + cl::Hidden, + cl::desc("Enable VectorPredicationPass.")); + PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; @@ -1297,6 +1303,10 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, /*AllowSpeculation=*/true), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); + // Try to vector predicate vectorized functions. 
+ if (EnableVectorPredication) + FPM.addPass(VectorPredicationPass()); + // Now that we've vectorized and unrolled loops, we may have more refined // alignment information, try to re-derive it here. FPM.addPass(AlignmentFromAssumptionsPass()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 82ce040c6496..6ad9cb1c44de 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -420,6 +420,7 @@ FUNCTION_PASS("tsan", ThreadSanitizerPass()) FUNCTION_PASS("typepromotion", TypePromotionPass(TM)) FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass()) FUNCTION_PASS("vector-combine", VectorCombinePass()) +FUNCTION_PASS("vector-predication", VectorPredicationPass()) FUNCTION_PASS("verify", VerifierPass()) FUNCTION_PASS("verify", DominatorTreeVerifierPass()) FUNCTION_PASS("verify", LoopVerifierPass()) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 1a9abaea8111..8f25709d95fd 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -14,6 +14,7 @@ #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsRISCV.h" #include #include using namespace llvm; @@ -1848,3 +1849,36 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, C2.NumIVMuls, C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost); } + +Value *RISCVTTIImpl::computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + // Maps a VF to a (SEW, LMUL) pair. + // NOTE: we assume ELEN = 64. + const std::map> + VFToSEWLMUL = {{1, {3, 0}}, {2, {3, 1}}, {4, {3, 2}}, {8, {3, 3}}, + {16, {2, 3}}, {32, {1, 3}}, {64, {0, 3}}}; + + assert(AVL->getType()->isIntegerTy() && + "Requested vector length should be an integer."); + assert(VFToSEWLMUL.find(VF.getKnownMinValue()) != VFToSEWLMUL.end() && + "Invalid value for LMUL argument."); + auto VFToSEWLMULVal = VFToSEWLMUL.at(VF.getKnownMinValue()); + + Value *AVLArg = Builder.CreateZExtOrTrunc(AVL, Builder.getInt64Ty()); + Constant *SEWArg = + ConstantInt::get(Builder.getInt64Ty(), VFToSEWLMULVal.first); + Constant *LMULArg = + ConstantInt::get(Builder.getInt64Ty(), VFToSEWLMULVal.second); + Value *EVLRes = + Builder.CreateIntrinsic(Intrinsic::riscv_vsetvli, {AVLArg->getType()}, + {AVLArg, SEWArg, LMULArg}, nullptr, "vl"); + + // NOTE: evl type is required to be i32. 
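  // Illustrative sketch (an assumption, consistent with the RISC-V test added
  // below): for a scalable VF of 2, i.e. the {SEW, LMUL} encoding pair {3, 1}
  // from the table above, the sequence built here comes out roughly as
  //   %vl  = call i64 @llvm.riscv.vsetvli.i64(i64 %avl, i64 3, i64 1)
  //   %evl = trunc i64 %vl to i32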
+ Value *EVL = Builder.CreateZExtOrTrunc(EVLRes, Builder.getInt32Ty()); + if (!VF.isScalable()) { + EVL = Builder.CreateBinaryIntrinsic( + Intrinsic::umin, + ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()), EVL); + } + return EVL; +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index d2592be75000..cd30f16fc6c0 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -377,6 +377,9 @@ class RISCVTTIImpl : public BasicTTIImplBase { bool shouldFoldTerminatingConditionAfterLSR() const { return true; } + + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const; }; } // end namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 9674094024b9..5574b33d9bc2 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -5,6 +5,7 @@ add_llvm_component_library(LLVMVectorize SLPVectorizer.cpp Vectorize.cpp VectorCombine.cpp + VectorPredication.cpp VPlan.cpp VPlanAnalysis.cpp VPlanHCFGBuilder.cpp diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b590fb4685a3..02cf0aaef5fa 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -411,6 +411,10 @@ static constexpr uint32_t MemCheckBypassWeights[] = {1, 127}; // after prolog. See `emitIterationCountCheck`. static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; +cl::opt UseVectorPredicationIntrinsics( + "use-vp-intrinsics", cl::init(false), cl::Hidden, + cl::desc("Use Vector Predication intrinsics during vectorization.")); + /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type. @@ -2792,6 +2796,11 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { if (VectorTripCount) return VectorTripCount; + // With VP intrinsics, we require tail-folding by masking; this way, we + // operate on a number of elements equal to the original loop trip count. + if (UseVectorPredicationIntrinsics) + return VectorTripCount = getTripCount(); + Value *TC = getTripCount(); IRBuilder<> Builder(InsertBlock->getTerminator()); @@ -2828,6 +2837,7 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { // the step does not evenly divide the trip count, no adjustment is necessary // since there will already be scalar iterations. Note that the minimum // iterations check ensures that N >= Step. + // TODO: we should probably honor the cost model also with VP intrinsics. 
if (Cost->requiresScalarEpilogue(VF.isVector())) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); @@ -6316,9 +6326,12 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, } bool Reverse = ConsecutiveStride < 0; - if (Reverse) + if (Reverse) { + if (UseVectorPredicationIntrinsics) + return InstructionCost::getInvalid(); Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, std::nullopt, CostKind, 0); + } return Cost; } @@ -8234,12 +8247,13 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, Reverse || Decision == LoopVectorizationCostModel::CM_Widen; if (LoadInst *Load = dyn_cast(I)) - return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, - Consecutive, Reverse); + return new VPWidenMemoryInstructionRecipe( + *Load, Operands[0], Mask, Plan->getEVLPhi(), Consecutive, Reverse); StoreInst *Store = cast(I); return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], - Mask, Consecutive, Reverse); + Mask, Plan->getEVLPhi(), + Consecutive, Reverse); } /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also @@ -8257,10 +8271,12 @@ createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); if (auto *TruncI = dyn_cast(PhiOrTrunc)) { - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, + Plan.getEVLPhi()); } assert(isa(PhiOrTrunc) && "must be a phi node here"); - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, + Plan.getEVLPhi()); } VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( @@ -8692,32 +8708,64 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // Add the necessary canonical IV and branch recipes required to control the // loop. -static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, - DebugLoc DL) { - Value *StartIdx = ConstantInt::get(IdxTy, 0); - auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); +static VPInstruction *addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, + bool HasNUW, DebugLoc DL, + const TargetTransformInfo *TTI) { + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); + + // Add the EVL recipe, used to calculate the correct IV increment. + VPEVLPHIRecipe *EVLRecipe = nullptr; + // TODO: TTI should be able to indicate if a target prefers vector predication + // intrinsics. + if (UseVectorPredicationIntrinsics) { + EVLRecipe = new VPEVLPHIRecipe(Plan.getTripCount(), TTI); + Header->insert(EVLRecipe, Header->begin()); + } // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. + Value *StartIdx = ConstantInt::get(IdxTy, 0); + auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); - VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); - VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); Header->insert(CanonicalIVPHI, Header->begin()); // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar - // IV by VF * UF. - auto *CanonicalIVIncrement = + // IV either by VF * UF or by the EVL values. 
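  // Illustration (an assumption about how the pieces compose, not patch text):
  // with unroll factor UF = 2 the EVL-based increment effectively computes
  //   index.next = index + (evl.part0 + evl.part1)
  // whereas the classic form computes index.next = index + VF * UF.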
+ VPInstruction *CanonicalIVIncrement = nullptr; + if (EVLRecipe) + CanonicalIVIncrement = + new VPInstruction(Instruction::Add, {CanonicalIVPHI, EVLRecipe}, + {HasNUW, false}, DL, "index.next"); + else + CanonicalIVIncrement = new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL, "index.next"); + CanonicalIVPHI->addOperand(CanonicalIVIncrement); + // If we are working with vector predication instrinsics, add a NextEVL + // VPInstruction to calculate the remaining elements number. + VPInstruction *NextEVL = nullptr; + if (EVLRecipe) { + NextEVL = + new VPInstruction(VPInstruction::NextEVL, + {EVLRecipe, CanonicalIVIncrement}, DL, "evl.next"); + EVLRecipe->addOperand(NextEVL); + } + VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); + if (NextEVL) { + EB->appendRecipe(NextEVL); + } // Add the BranchOnCount VPInstruction to the latch. VPInstruction *BranchBack = new VPInstruction(VPInstruction::BranchOnCount, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); EB->appendRecipe(BranchBack); + + return NextEVL; } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the @@ -8807,7 +8855,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // When not folding the tail, we know that the induction increment will not // overflow. bool HasNUW = Style == TailFoldingStyle::None; - addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); + auto *NextEVL = addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), + HasNUW, DL, &TTI); // Proactively create header mask. Masks for other blocks are created on // demand. @@ -8982,7 +9031,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool WithoutRuntimeCheck = Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, - WithoutRuntimeCheck); + WithoutRuntimeCheck, NextEVL); } return Plan; } @@ -9022,7 +9071,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // is guaranteed to not wrap. 
bool HasNUW = true; addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, - DebugLoc()); + DebugLoc(), &TTI); return Plan; } @@ -9529,7 +9578,7 @@ lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr, } else { VectorBuilder VBuilder(Builder); VBuilder.setEVL(EVLPart).setMask(Mask); - Call = cast(VBuilder.createVectorInstruction( + Call = cast(VBuilder.createVectorInstructionFromOpcode( Instruction::Store, Type::getVoidTy(EVLPart->getContext()), {StoredVal, Addr})); } @@ -9553,7 +9602,7 @@ static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder, } else { VectorBuilder VBuilder(Builder); VBuilder.setEVL(EVLPart).setMask(Mask); - Call = cast(VBuilder.createVectorInstruction( + Call = cast(VBuilder.createVectorInstructionFromOpcode( Instruction::Load, DataTy, Addr, "vp.op.load")); } Call->addParamAttr( @@ -9580,8 +9629,15 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); - bool isMaskRequired = getMask(); - if (isMaskRequired) { + VPValue *VPMask = getMask(); + VPValue *VPEVL = getEVL(); + if (VPEVL && (!VPMask || (isa(VPMask) && + dyn_cast(VPMask)->getOpcode() == + VPInstruction::ActiveLaneMask))) { + auto *MaskTy = VectorType::get(Builder.getInt1Ty(), State.VF); + for (unsigned Part = 0; Part < State.UF; ++Part) + BlockInMaskParts[Part] = ConstantInt::getTrue(MaskTy); + } else if (VPMask) { // Mask reversal is only neede for non-all-one (null) masks, as reverse of a // null all-one mask is a null mask. for (unsigned Part = 0; Part < State.UF; ++Part) { @@ -9623,7 +9679,14 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { PartPtr = Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds); } else { - Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + Value *Increment = nullptr; + if (VPEVL) { + Increment = Builder.getInt32(0); // EVL is always an i32. + for (unsigned int P = 0; P < Part; P++) + Increment = Builder.CreateAdd(Increment, State.get(VPEVL, P)); + } else { + Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + } PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds); } @@ -9631,7 +9694,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { }; auto MaskValue = [&](unsigned Part) -> Value * { - if (isMaskRequired) + if (VPMask) return BlockInMaskParts[Part]; return nullptr; }; @@ -9659,10 +9722,19 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { StoredVal, CreateGatherScatter, MaskValue(Part), EVLPart, Alignment); } else if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *MaskPart = + (VPMask || VPEVL) ? 
BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(getAddr(), Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); + if (VPEVL) { + auto *PtrsTy = cast(VectorGep->getType()); + Value *Operands[] = {StoredVal, VectorGep, MaskPart, + State.get(VPEVL, Part)}; + NewSI = Builder.CreateIntrinsic(Intrinsic::vp_scatter, + {DataTy, PtrsTy}, Operands); + } else { + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, + MaskPart); + } } else { if (isReverse()) { // If we store to reverse consecutive memory locations, then we need @@ -9673,11 +9745,17 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { } auto *VecPtr = CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); - if (isMaskRequired) + if (VPEVL) { + Value *Operands[] = {StoredVal, VecPtr, BlockInMaskParts[Part], + State.get(VPEVL, Part)}; + NewSI = Builder.CreateIntrinsic( + Intrinsic::vp_store, {DataTy, VecPtr->getType()}, Operands); + } else if (VPMask) { NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, BlockInMaskParts[Part]); - else + } else { NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } } State.addMetadata(NewSI, SI); } @@ -9704,21 +9782,37 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { : CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))), CreateGatherScatter, MaskValue(Part), EVLPart, Alignment); } else if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *MaskPart = + (VPMask || VPEVL) ? BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(getAddr(), Part); - NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, - nullptr, "wide.masked.gather"); + if (VPEVL) { + auto *PtrsTy = cast(VectorGep->getType()); + Value *Operands[] = {VectorGep, MaskPart, State.get(VPEVL, Part)}; + NewLI = Builder.CreateIntrinsic(Intrinsic::vp_gather, {DataTy, PtrsTy}, + Operands, nullptr, "vp.gather"); + } else { + NewLI = + Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, + nullptr, "wide.masked.gather"); + } State.addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); - if (isMaskRequired) + if (VPEVL) { + Value *Operands[] = {VecPtr, BlockInMaskParts[Part], + State.get(VPEVL, Part)}; + NewLI = Builder.CreateIntrinsic(Intrinsic::vp_load, + {DataTy, VecPtr->getType()}, Operands, + nullptr, "vp.load"); + } else if (VPMask) { NewLI = Builder.CreateMaskedLoad( DataTy, VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), "wide.masked.load"); - else + } else { NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + } // Add metadata to the load, but setVectorValue to the reverse shuffle. State.addMetadata(NewLI, LI); @@ -10516,6 +10610,11 @@ LoopVectorizeResult LoopVectorizePass::runImpl( PreservedAnalyses LoopVectorizePass::run(Function &F, FunctionAnalysisManager &AM) { + assert((!UseVectorPredicationIntrinsics || + PreferPredicateOverEpilogue == + PreferPredicateTy::PredicateOrDontVectorize) && + "Tail folding required when using VP intrinsics."); + auto &LI = AM.getResult(F); // There are no loops in the function. Return before computing other expensive // analyses. 
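Taken together, the LoopVectorize changes above are intended to produce a vector body of roughly the following shape when -use-vp-intrinsics is enabled (an illustrative sketch with made-up value names, single unrolled part; only the phi/vsetvli/trunc lines are pinned down by the RISC-V test at the end of the patch):

  vector.body:
    %index      = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    %evl.phi    = phi i64 [ %N, %vector.ph ], [ %evl.next, %vector.body ]
    %evl        = ... TTI computeVectorLength(%evl.phi) ...       ; vsetvli + trunc on RISC-V
    %va         = call <vscale x 2 x double> @llvm.vp.load...(ptr %a, <vscale x 2 x i1> %allones, i32 %evl)
    %vb         = call <vscale x 2 x double> @llvm.vp.load...(ptr %b, <vscale x 2 x i1> %allones, i32 %evl)
    %vc         = fadd <vscale x 2 x double> %va, %vb             ; still full-length here; the separate
                                                                   ; VectorPredicationPass can later turn it into llvm.vp.fadd
    call void @llvm.vp.store...(<vscale x 2 x double> %vc, ptr %c, <vscale x 2 x i1> %allones, i32 %evl)
    %index.next = add i64 %index, %evl.zext                        ; EVL widened to the IV type
    %evl.next   = sub i64 %N, %index.next                          ; elements still to be processed
    br i1 %done, label %middle.block, label %vector.body           ; branch-on-count against the original trip count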
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1d7df9c9575a..0ac8d43acb11 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -708,6 +708,16 @@ VPlan::~VPlan() { delete BackedgeTakenCount; } +VPEVLPHIRecipe *VPlan::getEVLPhi() { + VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + if (isa(&R)) + return cast(&R); + } + + return nullptr; +} + VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) { VPBasicBlock *Preheader = new VPBasicBlock("ph"); VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); @@ -821,6 +831,13 @@ void VPlan::execute(VPTransformState *State) { } auto *PhiR = cast(&R); + if (auto *EVLPhi = dyn_cast(PhiR)) { + PHINode *Phi = EVLPhi->getPhi(); + Phi->addIncoming(State->get(EVLPhi->getBackedgeValue(), State->UF - 1), + VectorLatchBB); + continue; + } + // For canonical IV, first-order recurrences and in-order reduction phis, // only a single part is generated, which provides the last part from the // previous iteration. For non-ordered reductions all UF parts are diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0ca668abbe60..ab1d4b73aa62 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -771,10 +771,10 @@ class VPRecipeBase : public ilist_node_with_parent, /// Returns the underlying instruction, if the recipe is a VPValue or nullptr /// otherwise. Instruction *getUnderlyingInstr() { - return cast(getVPSingleValue()->getUnderlyingValue()); + return cast_or_null(getVPSingleValue()->getUnderlyingValue()); } const Instruction *getUnderlyingInstr() const { - return cast(getVPSingleValue()->getUnderlyingValue()); + return cast_or_null(getVPSingleValue()->getUnderlyingValue()); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1069,7 +1069,8 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue { // Increment the canonical IV separately for each unrolled part. CanonicalIVIncrementForPart, BranchOnCount, - BranchOnCond + BranchOnCond, + NextEVL }; private: @@ -1452,20 +1453,28 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe { TruncInst *Trunc; const InductionDescriptor &IndDesc; + void addEVL(VPValue *EVLRecipe) { + if (EVLRecipe) + addOperand(EVLRecipe); + } + public: VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, - const InductionDescriptor &IndDesc) + const InductionDescriptor &IndDesc, + VPValue *EVLRecipe) : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start), IV(IV), Trunc(nullptr), IndDesc(IndDesc) { addOperand(Step); + addEVL(EVLRecipe); } VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, const InductionDescriptor &IndDesc, - TruncInst *Trunc) + TruncInst *Trunc, VPValue *EVLRecipe) : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, Trunc, Start), IV(IV), Trunc(Trunc), IndDesc(IndDesc) { addOperand(Step); + addEVL(EVLRecipe); } ~VPWidenIntOrFpInductionRecipe() override = default; @@ -1500,6 +1509,12 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe { VPValue *getStepValue() { return getOperand(1); } const VPValue *getStepValue() const { return getOperand(1); } + /// Return the EVL value of the current loop iteration. + VPValue *getEVL() { return getNumOperands() == 3 ? 
getOperand(2) : nullptr; } + const VPValue *getEVL() const { + return getNumOperands() == 3 ? getOperand(2) : nullptr; + } + /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { return Trunc; } @@ -1988,8 +2003,8 @@ class VPPredInstPHIRecipe : public VPRecipeBase, public VPValue { /// A Recipe for widening load/store operations. /// The recipe uses the following VPValues: -/// - For load: Address, optional mask -/// - For store: Address, stored value, optional mask +/// - For load: Address, optional mask, optional evl +/// - For store: Address, stored value, optional mask, optional evl /// TODO: We currently execute only per-part unless a specific instance is /// provided. class VPWidenMemoryInstructionRecipe : public VPRecipeBase { @@ -2001,33 +2016,41 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { // Whether the consecutive loaded/stored addresses are in reverse order. bool Reverse; - void setMask(VPValue *Mask) { - if (!Mask) - return; - addOperand(Mask); - } + // Whether the instruction has a not all-ones mask. + bool Masked = false; + + // Whether a vector length is available to the instruction. + bool HasVL = false; - bool isMasked() const { - return isStore() ? getNumOperands() == 3 : getNumOperands() == 2; + void setMaskAndEVL(VPValue *Mask, VPValue *VPEVL) { + if (Mask) { + this->Masked = true; + addOperand(Mask); + } + + if (VPEVL) { + this->HasVL = true; + addOperand(VPEVL); + } } public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, - bool Consecutive, bool Reverse) + VPValue *EVL, bool Consecutive, bool Reverse) : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); new VPValue(this, &Load); - setMask(Mask); + setMaskAndEVL(Mask, EVL); } VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredValue, VPValue *Mask, - bool Consecutive, bool Reverse) + VPValue *EVL, bool Consecutive, bool Reverse) : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr, StoredValue}), Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - setMask(Mask); + setMaskAndEVL(Mask, EVL); } VP_CLASSOF_IMPL(VPDef::VPWidenMemoryInstructionSC) @@ -2040,8 +2063,15 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { - // Mask is optional and therefore the last operand. - return isMasked() ? getOperand(getNumOperands() - 1) : nullptr; + return Masked ? (HasVL ? getOperand(getNumOperands() - 2) + : getOperand(getNumOperands() - 1)) + : nullptr; + } + + /// Return the evl used by this recipe. If we are working with full-length + /// vectors, return nullptr. + VPValue *getEVL() const { + return HasVL ? getOperand(getNumOperands() - 1) : nullptr; } /// Returns true if this recipe is a store. 
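With the trailing operands added above, the operand layout of VPWidenMemoryInstructionRecipe becomes (a summary of the accessors, not additional patch text):

  load  : {Addr [, Mask] [, EVL]}
  store : {Addr, StoredValue [, Mask] [, EVL]}

so getMask() reads the second-to-last operand when an EVL is present and getEVL() always reads the last one.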
@@ -2190,6 +2220,33 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { #endif }; +class VPEVLPHIRecipe : public VPHeaderPHIRecipe { + const TargetTransformInfo *TTI; + PHINode *Phi = nullptr; + +public: + VPEVLPHIRecipe(VPValue *StartEVL, const TargetTransformInfo *TTI) + : VPHeaderPHIRecipe(VPDef::VPWidenEVLSC, nullptr, StartEVL), TTI(TTI) {} + + ~VPEVLPHIRecipe() override = default; + + VP_CLASSOF_IMPL(VPDef::VPWidenEVLSC) + + PHINode *getPhi() const { return Phi; } + + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPDef::VPWidenEVLSC; + } + + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for generating the phi node for the current index of elements, /// adjusted in accordance with EVL value. It starts at StartIV value and gets /// incremented by EVL in each iteration of the vector loop. @@ -2795,6 +2852,10 @@ class VPlan { return cast(&*EntryVPBB->begin()); } + /// Find and return the VPEVLPHIRecipe from the header - there should be only + /// one at most. If there isn't one, then return nullptr. + VPEVLPHIRecipe *getEVLPhi(); + void addLiveOut(PHINode *PN, VPValue *V); void removeLiveOut(PHINode *PN) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 5e0344a14df5..25658b278648 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -38,6 +38,7 @@ using VectorParts = SmallVector; namespace llvm { extern cl::opt EnableVPlanNativePath; } +extern cl::opt UseVectorPredicationIntrinsics; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME @@ -274,12 +275,25 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, IRBuilderBase &Builder = State.Builder; Builder.SetCurrentDebugLocation(getDebugLoc()); - if (Instruction::isBinaryOp(getOpcode())) { + unsigned Opc = getOpcode(); + if (Instruction::isBinaryOp(Opc)) { if (Part != 0 && vputils::onlyFirstPartUsed(this)) return State.get(this, 0); Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); + Value *B = nullptr; + + if (UseVectorPredicationIntrinsics && Opc == Instruction::Add) { + // We have the EVL value available to use. 
+ VPValue *VPEVL = getOperand(1); + Value *Step = State.get(VPEVL, 0); + for (unsigned P = 1; P < State.UF; P++) + Step = Builder.CreateAdd(Step, State.get(VPEVL, P)); + + B = Builder.CreateZExtOrTrunc(Step, A->getType()); + } else + B = State.get(getOperand(1), Part); + auto *Res = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); if (auto *I = dyn_cast(Res)) @@ -439,6 +453,19 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); return CondBr; } + case VPInstruction::NextEVL: { + Value *Next = nullptr; + if (Part == 0) { + auto *EVLRecipe = cast(getOperand(0)); + Value *StartEVL = EVLRecipe->getOperand(0)->getUnderlyingValue(); + Value *IVIncrement = State.get(getOperand(1), 0); + + Next = Builder.CreateSub(StartEVL, IVIncrement, "evl.next"); + } else { + Next = State.get(this, 0); + } + return Next; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -521,6 +548,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::BranchOnCount: O << "branch-on-count"; break; + case VPInstruction::NextEVL: + O << "next-evl"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -968,24 +998,27 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { MulOp = Instruction::FMul; } - // Multiply the vectorization factor by the step using integer or - // floating-point arithmetic as appropriate. - Type *StepType = Step->getType(); - Value *RuntimeVF; - if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); - else - RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); - Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); - - // Create a vector splat to use in the induction update. - // - // FIXME: If the step is non-constant, we create the vector splat with - // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't - // handle a constant vector splat. - Value *SplatVF = isa(Mul) - ? ConstantVector::getSplat(State.VF, cast(Mul)) - : Builder.CreateVectorSplat(State.VF, Mul); + Value *SplatVF = nullptr; + if (!getEVL()) { + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Type *StepType = Step->getType(); + Value *RuntimeVF; + if (Step->getType()->isFloatingPointTy()) + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); + else + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); + Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); + + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it + // doesn't handle a constant vector splat. + SplatVF = isa(Mul) + ? ConstantVector::getSplat(State.VF, cast(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); + } Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -1000,8 +1033,26 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { if (isa(EntryVal)) State.addMetadata(LastInduction, EntryVal); - LastInduction = cast( - Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + if (auto *EVLRecipe = getEVL()) { + // Ensure the types match. 
+ Type *DestTy = LastInduction->getType()->getScalarType(); + Value *EVL = State.get(EVLRecipe, Part); + if (DestTy->isIntegerTy()) { + EVL = Builder.CreateZExtOrTrunc(EVL, DestTy); + } else { + assert(DestTy->isFloatingPointTy()); + EVL = Builder.CreateUIToFP(EVL, DestTy); + } + // Multiply the EVL by the step using integer or floating-point + // arithmetic as appropriate. + Value *Mul = Builder.CreateBinOp(MulOp, Step, EVL); + Value *SplatEVL = Builder.CreateVectorSplat(State.VF, Mul); + LastInduction = cast( + Builder.CreateBinOp(AddOp, LastInduction, SplatEVL, "step.add.vl")); + } else { + LastInduction = cast( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + } LastInduction->setDebugLoc(EntryVal->getDebugLoc()); } @@ -1033,6 +1084,9 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, #endif bool VPWidenIntOrFpInductionRecipe::isCanonical() const { + if (getEVL()) + return false; + // The step may be defined by a recipe in the preheader (e.g. if it requires // SCEV expansion), but for the canonical induction the step is required to be // 1, which is represented as live-in. @@ -1770,3 +1824,30 @@ void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, printOperands(O, SlotTracker); } #endif + +void VPEVLPHIRecipe::execute(VPTransformState &State) { + Value *StartEVL = getOperand(0)->getUnderlyingValue(); + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + this->Phi = State.Builder.CreatePHI(StartEVL->getType(), 2, "evl.phi"); + this->Phi->addIncoming(StartEVL, VectorPH); + + Value *PrevEVL = State.Builder.CreateZExtOrTrunc( + cast(this->Phi), State.Builder.getInt32Ty(), "evl.phi.cast"); + Value *EVL = nullptr; + for (unsigned Part = 0; Part < State.UF; Part++) { + if (EVL) + PrevEVL = State.Builder.CreateSub(PrevEVL, EVL); + EVL = TTI->computeVectorLength(State.Builder, PrevEVL, State.VF); + State.set(this, EVL, Part); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPEVLPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EVL-PHI "; + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 466259cb196c..22dc894babc4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -53,7 +53,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPValue *Start = Plan->getVPValueOrAddLiveIn(II->getStartValue()); VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE); - NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II); + NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II, + Plan->getEVLPhi()); } else { Plan->addVPValue(Phi, VPPhi); continue; @@ -66,11 +67,12 @@ void VPlanTransforms::VPInstructionsToVPRecipes( if (LoadInst *Load = dyn_cast(Inst)) { NewRecipe = new VPWidenMemoryInstructionRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, - false /*Consecutive*/, false /*Reverse*/); + nullptr /*EVL*/, false /*Consecutive*/, false /*Reverse*/); } else if (StoreInst *Store = dyn_cast(Inst)) { NewRecipe = new VPWidenMemoryInstructionRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), - nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/); + nullptr /*Mask*/, nullptr /*EVL*/, false /*Consecutive*/, + false /*Reverse*/); } else if 
(GetElementPtrInst *GEP = dyn_cast(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); } else if (CallInst *CI = dyn_cast(Inst)) { @@ -1040,7 +1042,8 @@ void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) { // branch-on-cond %Negated // static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( - VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { + VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck, + VPInstruction *NextEVL) { VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); auto *CanonicalIVPHI = Plan.getCanonicalIV(); @@ -1066,6 +1069,9 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( // When the loop is guarded by a runtime overflow check for the loop // induction variable increment by VF, we can increment the value before // the get.active.lane mask and use the unmodified tripcount. + if (NextEVL) { + EB->insert(NextEVL, EB->end()--); + } IncrementValue = CanonicalIVIncrement; TripCount = TC; } else { @@ -1102,6 +1108,10 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( "active.lane.mask.next"); LaneMaskPhi->addOperand(ALM); + if (DataAndControlFlowWithoutRuntimeCheck && NextEVL) { + EB->insert(NextEVL, EB->end()--); + } + // Replace the original terminator with BranchOnCond. We have to invert the // mask here because a true condition means jumping to the exit block. auto *NotMask = Builder.createNot(ALM, DL); @@ -1151,7 +1161,8 @@ static void replaceHeaderPredicateWithIdiom( void VPlanTransforms::addActiveLaneMask( VPlan &Plan, bool UseActiveLaneMaskForControlFlow, - bool DataAndControlFlowWithoutRuntimeCheck) { + bool DataAndControlFlowWithoutRuntimeCheck, + VPInstruction *NextEVL) { assert((!DataAndControlFlowWithoutRuntimeCheck || UseActiveLaneMaskForControlFlow) && "DataAndControlFlowWithoutRuntimeCheck implies " @@ -1167,7 +1178,7 @@ void VPlanTransforms::addActiveLaneMask( VPRecipeBase *LaneMask; if (UseActiveLaneMaskForControlFlow) { LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( - Plan, DataAndControlFlowWithoutRuntimeCheck); + Plan, DataAndControlFlowWithoutRuntimeCheck, NextEVL); } else { LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask, {WideCanonicalIV, Plan.getTripCount()}, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index a4bc7a23072c..7e1f65bc16ec 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -75,7 +75,8 @@ struct VPlanTransforms { /// UseActiveLaneMaskForControlFlow. static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, - bool DataAndControlFlowWithoutRuntimeCheck); + bool DataAndControlFlowWithoutRuntimeCheck, + VPInstruction *NextEVL); /// Insert truncates and extends for any truncated recipe. Redundant casts /// will be folded later. 
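As a hand-worked illustration of the evl.phi / evl.next pair these transforms wire up (assumed numbers, single unrolled part, using the generic umin-based computeVectorLength): with an original trip count of 10 and vlmax = 4 the loop runs three iterations:

  iteration 1: evl.phi = 10  ->  evl = 4, index.next = 4,  evl.next = 10 - 4 = 6
  iteration 2: evl.phi = 6   ->  evl = 4, index.next = 8,  evl.next = 10 - 8 = 2
  iteration 3: evl.phi = 2   ->  evl = 2, index.next = 10, branch-on-count exits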
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 22dbf7571dd9..a4db8b5c5d02 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -367,6 +367,7 @@ class VPDef {
     VPActiveLaneMaskPHISC,
     VPEVLBasedIVPHISC,
     VPFirstOrderRecurrencePHISC,
+    VPWidenEVLSC,
     VPWidenPHISC,
     VPWidenIntOrFpInductionSC,
     VPWidenPointerInductionSC,
diff --git a/llvm/lib/Transforms/Vectorize/VectorPredication.cpp b/llvm/lib/Transforms/Vectorize/VectorPredication.cpp
new file mode 100644
index 000000000000..bbebcba38e91
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VectorPredication.cpp
@@ -0,0 +1,358 @@
+#include "llvm/Transforms/Vectorize/VectorPredication.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/VectorBuilder.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#define DEBUG_TYPE "vector-predication"
+STATISTIC(Transforms, "Number of full-length -> evl vector transformations.");
+
+using namespace llvm;
+
+// Map each instruction to its uses and save all memory writing vector
+// predicated instructions found in the basic block.
+void VectorPredicationPass::analyseBasicBlock(BasicBlock &BB,
+                                              BlockData &BBInfo) {
+  // Store all memory accessing instructions: all these instructions have to be
+  // chained, so that their relative order can be preserved when rewriting the
+  // basic block.
+  SmallVector ToBeChainedInstructions;
+
+  for (Instruction &I : BB) {
+    if (isa(I) || I.isTerminator())
+      continue;
+
+    SmallPtrSet IUsers;
+    for (User *IU : I.users()) {
+      assert(isa(IU) && "Unexpected behaviour.");
+      auto *IUInst = cast(IU);
+      if (IUInst->getParent() != I.getParent())
+        continue;
+      if (isa(IUInst) || IUInst->isTerminator())
+        continue;
+
+      IUsers.insert(IUInst);
+    }
+    BBInfo.TopologicalGraph.insert({&I, IUsers});
+
+    if (I.mayReadOrWriteMemory() || I.mayHaveSideEffects())
+      ToBeChainedInstructions.push_back(&I);
+
+    if (auto *CI = dyn_cast(&I)) {
+      if (auto *CF = CI->getCalledFunction()) {
+        Intrinsic::ID ID = CF->getIntrinsicID();
+        if (ID == Intrinsic::vp_store || ID == Intrinsic::vp_scatter) {
+          BBInfo.MemoryWritingVPInstructions.push_back(&I);
+        }
+      }
+    }
+  }
+
+  if (ToBeChainedInstructions.size() > 1) {
+    for (unsigned I = 0; I < ToBeChainedInstructions.size() - 1; I++) {
+      auto *Parent = ToBeChainedInstructions[I];
+      auto *Child = ToBeChainedInstructions[I + 1];
+      BBInfo.TopologicalGraph[Parent].insert(Child);
+    }
+  }
+}
+
+namespace {
+void findCandidateVectorOperation(BasicBlock &BB, Value *Op, Value *Mask,
+                                  Value *EVL, BlockData &BBInfo) {
+  auto *OpInst = dyn_cast(Op);
+  if (!OpInst)
+    return;
+
+  if (OpInst->getParent() != &BB)
+    return;
+
+  Intrinsic::ID VPID = Intrinsic::not_intrinsic;
+  unsigned Opcode = OpInst->getOpcode();
+  if (Opcode == Instruction::Call) {
+    if (auto *CF = cast(OpInst)->getCalledFunction())
+      VPID = VPIntrinsic::getForIntrinsicID(CF->getIntrinsicID());
+  } else
+    VPID = VPIntrinsic::getForOpcode(OpInst->getOpcode());
+  if (VPID == Intrinsic::not_intrinsic)
+    return;
+
+  // If the instruction is already present in the map, it means it was already
+  // visited starting from a previous memory writing vp operation.
+  if (!BBInfo.VecOpsToTransform
+           .insert(std::make_pair(OpInst, std::make_pair(Mask, EVL)))
+           .second) {
+    // We need to check if the new mask and evl values differ from the old ones:
+    // - if they are the same, then there is nothing to do;
+    // - if only the mask differs, we use an all-ones mask;
+    // - otherwise, we remove the instruction from the map (i.e., no
+    //   transformation should happen)
+    // NOTE: maybe, instead of giving up, we could split case 3 into two
+    // more cases: if only the EVLs differ, we use VLMAX with the mask; if both
+    // the mask and the EVL differ, we use an all-ones mask and VLMAX (even if
+    // semantically it means not doing anything).
+    auto It = BBInfo.VecOpsToTransform.find(OpInst);
+    assert(It != BBInfo.VecOpsToTransform.end());
+    Value *OldMask, *OldEVL;
+    std::tie(OldMask, OldEVL) = It->second;
+
+    if (Mask == OldMask && EVL == OldEVL)
+      return;
+
+    BBInfo.VecOpsToTransform.erase(OpInst);
+    if (EVL == OldEVL) {
+      BBInfo.VecOpsToTransform.insert(
+          std::make_pair(OpInst, std::make_pair(nullptr, EVL)));
+    }
+  }
+
+  // Recursively visit OpInst operands.
+  switch (VPID) {
+  default:
+    for (auto *OpVal : OpInst->operand_values())
+      findCandidateVectorOperation(BB, OpVal, Mask, EVL, BBInfo);
+    break;
+  case Intrinsic::vp_select: {
+    auto CanBackPropagateCondOpAsMask = [&](Value *CondOp) -> bool {
+      if (!CondOp->getType()->isVectorTy())
+        return false;
+
+      auto *CondInstr = dyn_cast(CondOp);
+      if (!CondInstr)
+        return false;
+      if (CondInstr->getParent() != &BB)
+        return false;
+      if (auto *ALM = dyn_cast(CondInstr);
+          ALM && ALM->getCalledFunction()->getIntrinsicID() ==
+                     Intrinsic::get_active_lane_mask)
+        return false;
+
+      return true;
+    };
+
+    Value *Cond = OpInst->getOperand(0);
+    Value *TrueOp = OpInst->getOperand(1);
+    Value *FalseOp = OpInst->getOperand(2);
+    // If the condition argument is a vector, we backpropagate it as the mask
+    // for the true branch and its negation as the mask for the false one.
+    if (CanBackPropagateCondOpAsMask(Cond)) {
+      auto *CondInstr = cast(Cond);
+      IRBuilder<> Builder(CondInstr);
+      auto *CondNot = cast(Builder.CreateNot(Cond));
+      SmallPtrSet CondNotUsers;
+      BBInfo.TopologicalGraph.insert({CondNot, CondNotUsers});
+      BBInfo.TopologicalGraph[CondInstr].insert(CondNot);
+
+      findCandidateVectorOperation(BB, Cond, nullptr, EVL, BBInfo);
+      findCandidateVectorOperation(BB, CondNot, nullptr, EVL, BBInfo);
+
+      findCandidateVectorOperation(BB, TrueOp, Cond, EVL, BBInfo);
+      findCandidateVectorOperation(BB, FalseOp, CondNot, EVL, BBInfo);
+    } else {
+      findCandidateVectorOperation(BB, TrueOp, nullptr, EVL, BBInfo);
+      findCandidateVectorOperation(BB, FalseOp, nullptr, EVL, BBInfo);
+    }
+    break;
+  }
+  }
+}
+} // namespace
+
+// For each vector predicated memory writing operation of the basic block, go
+// back to the instruction defining the stored vector and verify that it is a
+// vector operation. Add it to the list of instructions to be transformed into
+// vector predicated ones, then recursively repeat the process for its vector
+// arguments.
+void VectorPredicationPass::findCandidateVectorOperations(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.MemoryWritingVPInstructions.empty()) + return; + + for (Instruction *I : BBInfo.MemoryWritingVPInstructions) { + assert(I->getParent() == &BB && "This is not the right basic block"); + auto *VPI = cast(I); + Value *StoredOperand = VPI->getMemoryDataParam(); + Value *MaskOperand = VPI->getMaskParam(); + Value *EVLOperand = VPI->getVectorLengthParam(); + // First, visit the mask operand (assigning an allones mask to this branch) + // and only then visit the stored operand. + findCandidateVectorOperation(BB, MaskOperand, nullptr, EVLOperand, BBInfo); + findCandidateVectorOperation(BB, StoredOperand, MaskOperand, EVLOperand, + BBInfo); + } +} + +// Add the candidates as users of the mask and of the evl linked to each of +// them, but only if they belong to the same basic block. +void VectorPredicationPass::addNewUsersToMasksAndEVLs(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + for (auto [K, V] : BBInfo.VecOpsToTransform) { + if (auto *MaskInst = dyn_cast_if_present(V.first); + MaskInst && MaskInst->getParent() == &BB) + BBInfo.TopologicalGraph[MaskInst].insert(K); + if (auto *EVLInst = dyn_cast(V.second); + EVLInst && EVLInst->getParent() == &BB) + BBInfo.TopologicalGraph[EVLInst].insert(K); + } +} + +// Topologically sort, preserving as much as possible the original order. +void VectorPredicationPass::buildNewBasicBlockSchedule(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + while (!BBInfo.TopologicalGraph.empty()) { + Instruction *Inst = nullptr; + for (auto B = BBInfo.TopologicalGraph.rbegin(), + E = BBInfo.TopologicalGraph.rend(); + B != E; B++) { + if (B->second.empty()) { + Inst = B->first; + break; + } + } + assert(Inst && "Failed to empty topological graph!"); + + BBInfo.NewBBReverseOrder.push_back(Inst); + BBInfo.TopologicalGraph.erase(Inst); + + for (auto B = BBInfo.TopologicalGraph.begin(), + E = BBInfo.TopologicalGraph.end(); + B != E; B++) { + B->second.erase(Inst); + } + } +} + +// Modify the basic block based on the topological order generated. +void VectorPredicationPass::emitNewBasicBlockSchedule(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + Instruction *InsertPoint = BB.getTerminator(); + for (Instruction *I : BBInfo.NewBBReverseOrder) { + I->moveBefore(InsertPoint); + InsertPoint = I; + } +} + +// Transform candidates to vector predicated instructions. 
+
+// Transform the candidates into vector-predicated instructions.
+void VectorPredicationPass::transformCandidateVectorOperations(
+    BasicBlock &BB, BlockData &BBInfo) {
+  if (BBInfo.VecOpsToTransform.empty())
+    return;
+
+  for (auto [I, P] : BBInfo.VecOpsToTransform) {
+    Value *Mask, *EVL;
+    std::tie(Mask, EVL) = P;
+
+    IRBuilder<> Builder(I);
+    unsigned int OpcodeOrIID = I->getOpcode();
+    Type *RetTy = I->getType();
+    SmallVector<Value *> Operands(I->value_op_begin(), I->value_op_end());
+    bool IsCall = false;
+    switch (OpcodeOrIID) {
+    case Instruction::Call: {
+      // For intrinsic calls, drop the callee operand and switch to the
+      // functional intrinsic ID.
+      Operands.clear();
+      auto *CI = cast<CallInst>(I);
+      for (auto &Op : CI->operands()) {
+        if (Op == CI->getCalledOperand())
+          continue;
+        Operands.push_back(Op.get());
+      }
+      OpcodeOrIID = CI->getCalledFunction()->getIntrinsicID();
+      IsCall = true;
+      break;
+    }
+    case Instruction::FCmp:
+    case Instruction::ICmp: {
+      // Compares carry their predicate as a metadata operand of the VP call.
+      Operands.clear();
+      auto *CmpI = cast<CmpInst>(I);
+      Value *PredOp = MetadataAsValue::get(
+          Builder.getContext(),
+          MDString::get(Builder.getContext(),
+                        CmpInst::getPredicateName(CmpI->getPredicate())));
+      Operands = {CmpI->getOperand(0), CmpI->getOperand(1), PredOp};
+      break;
+    }
+    case Instruction::Select: {
+      if (!I->getOperand(0)->getType()->isVectorTy()) {
+        // Scalar condition: splat it so it can feed llvm.vp.select.
+        Operands.clear();
+        Value *Op1 = I->getOperand(1);
+        Value *Op2 = I->getOperand(2);
+        Value *Cond = Builder.CreateVectorSplat(
+            cast<VectorType>(Op1->getType())->getElementCount(),
+            I->getOperand(0), "select.cond.splat");
+        Operands = {Cond, Op1, Op2};
+      } else if (auto *ALM = dyn_cast<CallInst>(I->getOperand(0));
+                 ALM && ALM->getCalledFunction()->getIntrinsicID() ==
+                            Intrinsic::get_active_lane_mask) {
+        // Ignore the select: the vector length operand already takes care of
+        // keeping track of the active elements.
+        I->replaceAllUsesWith(I->getOperand(1));
+        OldInstructionsToRemove.insert(std::make_pair(I, nullptr));
+
+        continue;
+      }
+      break;
+    }
+    default:
+      break;
+    }
+
+    // A null mask means an unmasked operation, hence we use an all-ones mask.
+    if (!Mask)
+      Mask = ConstantInt::getTrue(RetTy->getWithNewType(Builder.getInt1Ty()));
+
+    VectorBuilder VecBuilder(Builder);
+    VecBuilder.setMask(Mask).setEVL(EVL);
+    Value *NewVPOp = nullptr;
+    if (IsCall)
+      NewVPOp = VecBuilder.createVectorInstructionFromIntrinsicID(
+          OpcodeOrIID, RetTy, Operands, "vp.op");
+    else
+      NewVPOp = VecBuilder.createVectorInstructionFromOpcode(OpcodeOrIID, RetTy,
+                                                             Operands, "vp.op");
+
+    Transforms++; // Stats
+    OldInstructionsToRemove.insert(std::make_pair(I, NewVPOp));
+  }
+}
+
+// Remove the old instructions, if possible.
+void VectorPredicationPass::removeOldInstructions() {
+  for (auto [I, NewVPOp] : OldInstructionsToRemove) {
+    if (NewVPOp)
+      I->replaceAllUsesWith(NewVPOp);
+    if (isInstructionTriviallyDead(I))
+      I->eraseFromParent();
+  }
+}
+
+PreservedAnalyses VectorPredicationPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  assert(OldInstructionsToRemove.empty() &&
+         "Map should be cleared at the end of each run of the pass.");
+
+  for (BasicBlock &BB : F) {
+    BlockData BBInfo;
+
+    analyseBasicBlock(BB, BBInfo);
+    findCandidateVectorOperations(BB, BBInfo);
+    addNewUsersToMasksAndEVLs(BB, BBInfo);
+    buildNewBasicBlockSchedule(BB, BBInfo);
+    emitNewBasicBlockSchedule(BB, BBInfo);
+    transformCandidateVectorOperations(BB, BBInfo);
+  }
+
+  removeOldInstructions();
+  OldInstructionsToRemove.clear();
+
+  // TODO: think about which analyses are preserved.
+ return PreservedAnalyses::none(); +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll new file mode 100644 index 000000000000..03134f36c6ab --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll @@ -0,0 +1,140 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=loop-vectorize -use-vp-intrinsics -prefer-predicate-over-epilogue=predicate-dont-vectorize -o - < %s | FileCheck %s + +; ModuleID = 'custom/simple.c' +source_filename = "custom/simple.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) +; C[I] = A[I] + B[I]; +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C1]], [[A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP6]], 8 +; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[C1]], [[B3]] +; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI_CAST:%.*]] = trunc i64 [[EVL_PHI]] to i32 +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[EVL_PHI_CAST]] to i64 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP13]], i64 3, i64 1) +; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call @llvm.experimental.stepvector.nxv2i64() 
+; CHECK-NEXT: [[TMP17:%.*]] = add zeroinitializer, [[TMP16]] +; CHECK-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[VEC_IV]], i32 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP18]], i64 [[N]]) +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i32 0 +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i32 0 +; CHECK-NEXT: [[VP_LOAD5:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP22]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP23:%.*]] = fadd [[VP_LOAD]], [[VP_LOAD5]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, ptr [[TMP24]], i32 0 +; CHECK-NEXT: call void @llvm.vp.store.nxv2f64.p0( [[TMP23]], ptr [[TMP25]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP26]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_08]] +; CHECK-NEXT: [[TMP28:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_08]] +; CHECK-NEXT: [[TMP29:%.*]] = load double, ptr [[ARRAYIDX1]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_08]] +; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp7 = icmp sgt i64 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %I.08 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.08 + %0 = load double, ptr %arrayidx, align 
8, !tbaa !4 + %arrayidx1 = getelementptr inbounds double, ptr %B, i64 %I.08 + %1 = load double, ptr %arrayidx1, align 8, !tbaa !4 + %add = fadd double %0, %1 + %arrayidx2 = getelementptr inbounds double, ptr %C, i64 %I.08 + store double %add, ptr %arrayidx2, align 8, !tbaa !4 + %inc = add nuw nsw i64 %I.08, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !8 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9} +!9 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/Transforms/VectorPredication/if-elif-else.ll b/llvm/test/Transforms/VectorPredication/if-elif-else.ll new file mode 100644 index 000000000000..8241f17102c4 --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-elif-else.ll @@ -0,0 +1,270 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-elif-else.c' +source_filename = "custom/if-elif-else.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) { +; if (N < 50) +; C[I] = A[I] + B[I]; +; else if (N > 75) +; C[I] = A[I] * B[I]; +; else +; C[I] = 2 * A[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP30:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP30]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[N]], 50 +; CHECK-NEXT: [[CMP4:%.*]] = icmp ugt i64 [[N]], 75 +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 10) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: 
for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 3 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[UGLYGEP32:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[UGLYGEP33:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP32]], [[C]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND034:%.*]] = icmp ugt ptr [[UGLYGEP33]], [[C]] +; CHECK-NEXT: [[BOUND135:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]] +; CHECK-NEXT: [[FOUND_CONFLICT36:%.*]] = and i1 [[BOUND034]], [[BOUND135]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT36]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT37:%.*]] = insertelement poison, i1 [[CMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT38:%.*]] = shufflevector [[BROADCAST_SPLATINSERT37]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT39:%.*]] = insertelement poison, i1 [[CMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT40:%.*]] = shufflevector [[BROADCAST_SPLATINSERT39]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor [[BROADCAST_SPLAT38]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP6:%.*]] = select [[TMP5]], [[BROADCAST_SPLAT40]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = select [[BROADCAST_SPLAT38]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[BROADCAST_SPLAT40]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP8]], i64 3, i64 0) +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP10]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]), !tbaa [[TBAA4:![0-9]+]], !alias.scope !8 +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD41:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], [[TMP6]], i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !11 +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD41]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; CHECK-NEXT: [[VP_LOAD42:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], [[BROADCAST_SPLAT38]], i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !11 +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD42]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 
[[TMP9]]) +; CHECK-NEXT: [[VP_OP4:%.*]] = call @llvm.vp.select.nxv1f64( [[TMP7]], [[VP_OP1]], [[VP_OP]], i32 [[TMP9]]) +; CHECK-NEXT: [[VP_OP3:%.*]] = call @llvm.vp.select.nxv1f64( [[TMP6]], [[VP_OP2]], [[VP_OP4]], i32 [[TMP9]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP3]], ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !13, !noalias !15 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_END_LOOPEXIT44:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_031:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_031]] +; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_031]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP14]], [[TMP15]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else: +; CHECK-NEXT: br i1 [[CMP4]], label [[IF_THEN5:%.*]], label [[IF_ELSE9:%.*]] +; CHECK: if.then5: +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_031]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX7]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP14]], [[TMP16]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else9: +; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP14]], 2.000000e+00 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[ADD_SINK:%.*]] = phi double [ [[ADD]], [[IF_THEN]] ], [ [[MUL11]], [[IF_ELSE9]] ], [ [[MUL]], [[IF_THEN5]] ] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_031]] +; CHECK-NEXT: store double [[ADD_SINK]], ptr [[ARRAYIDX3]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_031]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit44: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp30 = icmp sgt i64 %N, 0 + br i1 %cmp30, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %cmp1 = icmp ult i64 %N, 50 + %cmp4 = icmp ugt i64 %N, 75 + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 10) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader, label %vector.memcheck + +for.body.preheader: ; preds = %vector.memcheck, %for.body.lr.ph + br label %for.body + +vector.memcheck: ; preds = %for.body.lr.ph + %4 = shl i64 %N, 3 + %uglygep = getelementptr i8, ptr %C, i64 %4 + %uglygep32 = getelementptr i8, ptr %A, i64 %4 + %uglygep33 = getelementptr i8, ptr %B, i64 %4 + %bound0 = icmp ugt ptr %uglygep32, %C + %bound1 = icmp ugt ptr %uglygep, %A + %found.conflict = and i1 
%bound0, %bound1 + %bound034 = icmp ugt ptr %uglygep33, %C + %bound135 = icmp ugt ptr %uglygep, %B + %found.conflict36 = and i1 %bound034, %bound135 + %conflict.rdx = or i1 %found.conflict, %found.conflict36 + br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph + +vector.ph: ; preds = %vector.memcheck + %broadcast.splatinsert37 = insertelement poison, i1 %cmp1, i64 0 + %broadcast.splat38 = shufflevector %broadcast.splatinsert37, poison, zeroinitializer + %broadcast.splatinsert39 = insertelement poison, i1 %cmp4, i64 0 + %broadcast.splat40 = shufflevector %broadcast.splatinsert39, poison, zeroinitializer + %5 = xor %broadcast.splat38, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) + %6 = select %5, %broadcast.splat40, zeroinitializer + %7 = select %broadcast.splat38, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), %broadcast.splat40 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %evl.phi = phi i64 [ %N, %vector.ph ], [ %evl.next, %vector.body ] + %8 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %8, i64 3, i64 0) + %9 = trunc i64 %vl to i32 + %10 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %10, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %9), !tbaa !4, !alias.scope !8 + %11 = fmul %vp.load, shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer) + %12 = getelementptr double, ptr %B, i64 %index + %vp.load41 = call @llvm.vp.load.nxv1f64.p0(ptr %12, %6, i32 %9), !tbaa !4, !alias.scope !11 + %13 = fmul %vp.load, %vp.load41 + %vp.load42 = call @llvm.vp.load.nxv1f64.p0(ptr %12, %broadcast.splat38, i32 %9), !tbaa !4, !alias.scope !11 + %14 = fadd %vp.load, %vp.load42 + %predphi = select %7, %14, %11 + %predphi43 = select %6, %13, %predphi + %15 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %predphi43, ptr %15, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %9), !tbaa !4, !alias.scope !13, !noalias !15 + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %16 = icmp eq i64 %index.next, %N + br i1 %16, label %for.end.loopexit44, label %vector.body, !llvm.loop !16 + +for.body: ; preds = %for.body.preheader, %for.inc + %I.031 = phi i64 [ %inc, %for.inc ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.031 + %17 = load double, ptr %arrayidx, align 8, !tbaa !4 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %for.body + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.031 + %18 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %17, %18 + br label %for.inc + +if.else: ; preds = %for.body + br i1 %cmp4, label %if.then5, label %if.else9 + +if.then5: ; preds = %if.else + %arrayidx7 = getelementptr inbounds double, ptr %B, i64 %I.031 + %19 = load double, ptr %arrayidx7, align 8, !tbaa !4 + %mul = fmul double %17, %19 + br label %for.inc + +if.else9: ; preds = %if.else + %mul11 = fmul double %17, 2.000000e+00 + br label %for.inc + +for.inc: ; preds = %if.then, %if.else9, %if.then5 + %add.sink = phi double [ %add, %if.then ], [ %mul11, %if.else9 ], [ %mul, %if.then5 ] + %arrayidx3 = getelementptr inbounds double, ptr %C, i64 %I.031 + store double %add.sink, ptr %arrayidx3, align 
8, !tbaa !4 + %inc = add nuw nsw i64 %I.031, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !20 + +for.end.loopexit: ; preds = %for.inc + br label %for.end + +for.end.loopexit44: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit44, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = !{!9} +!9 = distinct !{!9, !10} +!10 = distinct !{!10, !"LVerDomain"} +!11 = !{!12} +!12 = distinct !{!12, !10} +!13 = !{!14} +!14 = distinct !{!14, !10} +!15 = !{!9, !12} +!16 = distinct !{!16, !17, !18, !19} +!17 = !{!"llvm.loop.mustprogress"} +!18 = !{!"llvm.loop.isvectorized", i32 1} +!19 = !{!"llvm.loop.unroll.runtime.disable"} +!20 = distinct !{!20, !17, !18} diff --git a/llvm/test/Transforms/VectorPredication/if-elif-else_not-uniform.ll b/llvm/test/Transforms/VectorPredication/if-elif-else_not-uniform.ll new file mode 100644 index 000000000000..071c42c5ed6b --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-elif-else_not-uniform.ll @@ -0,0 +1,316 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s 
+ +; ModuleID = 'if-elif-else_not-uniform.c' +source_filename = "if-elif-else_not-uniform.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B, double *K) { +; long I; +; for (I = 0; I < N; I++) { +; if (K[I] < 50) +; C[I] = A[I] + B[I]; +; else if (K[I] > 75) +; C[I] = A[I] * B[I]; +; else +; C[I] = 2 * A[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %K) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @addVec +; CHECK-SAME: (i64 noundef [[N:%.*]], ptr nocapture noundef writeonly [[C:%.*]], ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef readonly [[K:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP33:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP33]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 12) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP3]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY_PREHEADER50:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader50: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[N]], 3 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[C]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP35:%.*]] = getelementptr i8, ptr [[K]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP36:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP35]], [[C]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[K]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND038:%.*]] = icmp ugt ptr [[SCEVGEP36]], [[C]] +; CHECK-NEXT: [[BOUND139:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT40:%.*]] = and i1 [[BOUND038]], [[BOUND139]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT40]] +; CHECK-NEXT: [[BOUND041:%.*]] = icmp ugt ptr [[SCEVGEP37]], [[C]] +; CHECK-NEXT: [[BOUND142:%.*]] = icmp ugt ptr [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[FOUND_CONFLICT43:%.*]] = and i1 [[BOUND041]], [[BOUND142]] +; CHECK-NEXT: [[CONFLICT_RDX44:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT43]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX44]], label [[FOR_BODY_PREHEADER50]], label [[VECTOR_BODY_PREHEADER:%.*]] +; CHECK: vector.body.preheader: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ], [ [[N]], [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP6]], i64 3, i64 1) +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[VL]] to i32 +; 
CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[N]]) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[K]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]), !tbaa [[TBAA7:![0-9]+]], !alias.scope !11 +; CHECK-NEXT: [[VP_OP7:%.*]] = call @llvm.vp.fcmp.nxv2f64( [[VP_LOAD]], shufflevector ( insertelement ( poison, double 5.000000e+01, i64 0), poison, zeroinitializer), metadata !"olt", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP3:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[VP_OP3]], zeroinitializer +; CHECK-NEXT: [[VP_OP16:%.*]] = call @llvm.vp.fcmp.nxv2f64( [[VP_LOAD]], shufflevector ( insertelement ( poison, double 7.500000e+01, i64 0), poison, zeroinitializer), metadata !"ogt", [[TMP9]], i32 [[TMP7]]) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD45:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP10]], [[VP_OP3]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !14 +; CHECK-NEXT: [[VP_OP8:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP9]], i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP12:%.*]] = call @llvm.vp.select.nxv2i1( [[VP_OP3]], [[VP_OP8]], zeroinitializer, i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP14:%.*]] = call @llvm.vp.fmul.nxv2f64( [[VP_LOAD45]], shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer), [[VP_OP12]], i32 [[TMP7]]) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_OP5:%.*]] = call @llvm.vp.select.nxv2i1( [[VP_OP3]], [[VP_OP16]], zeroinitializer, i32 [[TMP7]]) +; CHECK-NEXT: [[VP_LOAD46:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP11]], [[VP_OP5]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !16 +; CHECK-NEXT: [[VP_OP11:%.*]] = call @llvm.vp.fmul.nxv2f64( [[VP_LOAD45]], [[VP_LOAD46]], [[VP_OP5]], i32 [[TMP7]]) +; CHECK-NEXT: [[VP_LOAD47:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP10]], [[VP_OP7]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !14 +; CHECK-NEXT: [[VP_LOAD48:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP11]], [[VP_OP7]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !16 +; CHECK-NEXT: [[VP_OP4:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP6:%.*]] = call @llvm.vp.fadd.nxv2f64( [[VP_LOAD47]], [[VP_LOAD48]], [[VP_OP4]], i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP13:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP15:%.*]] = call @llvm.vp.select.nxv2f64( [[VP_OP12]], [[VP_OP14]], [[VP_OP6]], i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.or.nxv2i1( [[VP_OP7]], [[VP_OP12]], shufflevector ( insertelement ( poison, i1 true, i64 
0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP9:%.*]] = call @llvm.vp.or.nxv2i1( [[VP_OP2]], [[VP_OP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.select.nxv2f64( [[VP_OP5]], [[VP_OP11]], [[VP_OP15]], i32 [[TMP7]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2f64.p0( [[VP_OP]], ptr [[TMP12]], [[VP_OP9]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !18, !noalias !20 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: [[VP_OP10:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_END_LOOPEXIT51:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_034:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER50]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[K]], i64 [[I_034]] +; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt double [[TMP14]], 5.000000e+01 +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_034]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_034]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX3]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP15]], [[TMP16]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else: +; CHECK-NEXT: [[CMP6:%.*]] = fcmp ogt double [[TMP14]], 7.500000e+01 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_034]] +; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[ARRAYIDX8]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: br i1 [[CMP6]], label [[IF_THEN7:%.*]], label [[IF_ELSE11:%.*]] +; CHECK: if.then7: +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_034]] +; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr [[ARRAYIDX9]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP17]], [[TMP18]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else11: +; CHECK-NEXT: [[MUL13:%.*]] = fmul double [[TMP17]], 2.000000e+00 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[ADD_SINK:%.*]] = phi double [ [[ADD]], [[IF_THEN]] ], [ [[MUL13]], [[IF_ELSE11]] ], [ [[MUL]], [[IF_THEN7]] ] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_034]] +; CHECK-NEXT: store double [[ADD_SINK]], ptr [[ARRAYIDX4]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_034]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], 
label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit51: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp33 = icmp sgt i64 %N, 0 + br i1 %cmp33, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = shl nuw nsw i64 %1, 1 + %3 = call i64 @llvm.umax.i64(i64 %2, i64 12) + %4 = icmp ugt i64 %3, %0 + br i1 %4, label %for.body.preheader50, label %vector.memcheck + +for.body.preheader50: ; preds = %vector.memcheck, %for.body.preheader + br label %for.body + +vector.memcheck: ; preds = %for.body.preheader + %5 = shl i64 %N, 3 + %scevgep = getelementptr i8, ptr %C, i64 %5 + %scevgep35 = getelementptr i8, ptr %K, i64 %5 + %scevgep36 = getelementptr i8, ptr %A, i64 %5 + %scevgep37 = getelementptr i8, ptr %B, i64 %5 + %bound0 = icmp ugt ptr %scevgep35, %C + %bound1 = icmp ugt ptr %scevgep, %K + %found.conflict = and i1 %bound0, %bound1 + %bound038 = icmp ugt ptr %scevgep36, %C + %bound139 = icmp ugt ptr %scevgep, %A + %found.conflict40 = and i1 %bound038, %bound139 + %conflict.rdx = or i1 %found.conflict, %found.conflict40 + %bound041 = icmp ugt ptr %scevgep37, %C + %bound142 = icmp ugt ptr %scevgep, %B + %found.conflict43 = and i1 %bound041, %bound142 + %conflict.rdx44 = or i1 %conflict.rdx, %found.conflict43 + br i1 %conflict.rdx44, label %for.body.preheader50, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + br label %vector.body + +vector.body: ; preds = %vector.body.preheader, %vector.body + %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ] + %evl.phi = phi i64 [ %evl.next, %vector.body ], [ %N, %vector.body.preheader ] + %6 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %6, i64 3, i64 1) + %7 = trunc i64 %vl to i32 + %active.lane.mask = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index, i64 %N) + %8 = getelementptr inbounds double, ptr %K, i64 %index + %vp.load = call @llvm.vp.load.nxv2f64.p0(ptr %8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %7), !tbaa !7, !alias.scope !11 + %9 = fcmp olt %vp.load, shufflevector ( insertelement ( poison, double 5.000000e+01, i64 0), poison, zeroinitializer) + %10 = fcmp ogt %vp.load, shufflevector ( insertelement ( poison, double 7.500000e+01, i64 0), poison, zeroinitializer) + %11 = getelementptr double, ptr %A, i64 %index + %12 = xor %9, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) + %13 = select %active.lane.mask, %12, zeroinitializer + %vp.load45 = call @llvm.vp.load.nxv2f64.p0(ptr %11, %13, i32 %7), !tbaa !7, !alias.scope !14 + %14 = fmul %vp.load45, shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer) + %15 = getelementptr double, ptr %B, i64 %index + %16 = select %13, %10, zeroinitializer + %vp.load46 = call @llvm.vp.load.nxv2f64.p0(ptr %15, %16, i32 %7), !tbaa !7, !alias.scope !16 + %17 = fmul %vp.load45, %vp.load46 + %18 = select %active.lane.mask, %9, zeroinitializer + %vp.load47 = call @llvm.vp.load.nxv2f64.p0(ptr %11, %18, i32 %7), !tbaa !7, !alias.scope !14 + %vp.load48 = call @llvm.vp.load.nxv2f64.p0(ptr %15, %18, i32 %7), !tbaa !7, !alias.scope !16 + %19 = fadd %vp.load47, %vp.load48 + %20 = xor %10, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, 
zeroinitializer) + %21 = select %13, %20, zeroinitializer + %predphi = select %21, %14, %19 + %predphi49 = select %16, %17, %predphi + %22 = getelementptr inbounds double, ptr %C, i64 %index + %23 = or %18, %21 + %24 = or %23, %16 + call void @llvm.vp.store.nxv2f64.p0( %predphi49, ptr %22, %24, i32 %7), !tbaa !7, !alias.scope !18, !noalias !20 + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %25 = icmp eq i64 %index.next, %N + br i1 %25, label %for.end.loopexit51, label %vector.body, !llvm.loop !21 + +for.body: ; preds = %for.body.preheader50, %for.inc + %I.034 = phi i64 [ %inc, %for.inc ], [ 0, %for.body.preheader50 ] + %arrayidx = getelementptr inbounds double, ptr %K, i64 %I.034 + %26 = load double, ptr %arrayidx, align 8, !tbaa !7 + %cmp1 = fcmp olt double %26, 5.000000e+01 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %for.body + %arrayidx2 = getelementptr inbounds double, ptr %A, i64 %I.034 + %27 = load double, ptr %arrayidx2, align 8, !tbaa !7 + %arrayidx3 = getelementptr inbounds double, ptr %B, i64 %I.034 + %28 = load double, ptr %arrayidx3, align 8, !tbaa !7 + %add = fadd double %27, %28 + br label %for.inc + +if.else: ; preds = %for.body + %cmp6 = fcmp ogt double %26, 7.500000e+01 + %arrayidx8 = getelementptr inbounds double, ptr %A, i64 %I.034 + %29 = load double, ptr %arrayidx8, align 8, !tbaa !7 + br i1 %cmp6, label %if.then7, label %if.else11 + +if.then7: ; preds = %if.else + %arrayidx9 = getelementptr inbounds double, ptr %B, i64 %I.034 + %30 = load double, ptr %arrayidx9, align 8, !tbaa !7 + %mul = fmul double %29, %30 + br label %for.inc + +if.else11: ; preds = %if.else + %mul13 = fmul double %29, 2.000000e+00 + br label %for.inc + +for.inc: ; preds = %if.then, %if.else11, %if.then7 + %add.sink = phi double [ %add, %if.then ], [ %mul13, %if.else11 ], [ %mul, %if.then7 ] + %arrayidx4 = getelementptr inbounds double, ptr %C, i64 %I.034 + store double %add.sink, ptr %arrayidx4, align 8, !tbaa !7 + %inc = add nuw nsw i64 %I.034, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !25 + +for.end.loopexit: ; preds = %for.inc + br label %for.end + +for.end.loopexit51: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit51, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv2i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv2f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv2f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" 
"target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-smaia,-experimental-ssaia,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zcmt,-experimental-zfa,-experimental-zicond,-experimental-zihintntl,-experimental-ztso,-experimental-zvbb,-experimental-zvbc,-experimental-zvfh,-experimental-zvkg,-experimental-zvkn,-experimental-zvkned,-experimental-zvkng,-experimental-zvknha,-experimental-zvknhb,-experimental-zvks,-experimental-zvksed,-experimental-zvksg,-experimental-zvksh,-experimental-zvkt,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xsfvcp,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zicntr,-zihintpause,-zihpm,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5} +!llvm.ident = !{!6} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"PIC Level", i32 2} +!3 = !{i32 7, !"PIE Level", i32 2} +!4 = !{i32 7, !"uwtable", i32 2} +!5 = !{i32 8, !"SmallDataLimit", i32 8} +!6 = !{!"clang version 17.0.0"} +!7 = !{!8, !8, i64 0} +!8 = !{!"double", !9, i64 0} +!9 = !{!"omnipotent char", !10, i64 0} +!10 = !{!"Simple C/C++ TBAA"} +!11 = !{!12} +!12 = distinct !{!12, !13} +!13 = distinct !{!13, !"LVerDomain"} +!14 = !{!15} +!15 = distinct !{!15, !13} +!16 = !{!17} +!17 = distinct !{!17, !13} +!18 = !{!19} +!19 = distinct !{!19, !13} +!20 = !{!12, !15, !17} +!21 = distinct !{!21, !22, !23, !24} +!22 = !{!"llvm.loop.mustprogress"} +!23 = !{!"llvm.loop.isvectorized", i32 1} +!24 = !{!"llvm.loop.unroll.runtime.disable"} +!25 = distinct !{!25, !22, !23} diff --git a/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll b/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll new file mode 100644 index 000000000000..ed8f28feeffc --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-else2.c' +source_filename = "custom/if-else2.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) { +; if (N < 50) +; C[I] = A[I] + B[I]; +; else +; C[I] = A[I] * B[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: 
[[B22:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A21:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C20:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP18:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP18]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[N]], 50 +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C20]], [[A21]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C20]], [[B22]] +; CHECK-NEXT: [[DIFF_CHECK23:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK23]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY_PREHEADER:%.*]] +; CHECK: vector.body.preheader: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ], [ [[N]], [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP9]], i64 3, i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD24:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[SELECT_COND_SPLAT_SPLATINSERT:%.*]] = insertelement poison, i1 [[CMP1]], i64 0 +; CHECK-NEXT: [[SELECT_COND_SPLAT_SPLAT:%.*]] = shufflevector [[SELECT_COND_SPLAT_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.select.nxv1f64( [[SELECT_COND_SPLAT_SPLAT]], [[VP_OP]], [[VP_OP2]], i32 [[TMP10]]) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP1]], ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 
[[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[FOR_END_LOOPEXIT25:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_019]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_019]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[MUL_SINK:%.*]] = select i1 [[CMP1]], double [[ADD]], double [[MUL]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_019]] +; CHECK-NEXT: store double [[MUL_SINK]], ptr [[TMP17]], align 8 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_019]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit25: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B22 = ptrtoint ptr %B to i64 + %A21 = ptrtoint ptr %A to i64 + %C20 = ptrtoint ptr %C to i64 + %cmp18 = icmp sgt i64 %N, 0 + br i1 %cmp18, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %cmp1 = icmp ult i64 %N, 50 + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 8) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader, label %vector.memcheck + +for.body.preheader: ; preds = %vector.memcheck, %for.body.lr.ph + br label %for.body + +vector.memcheck: ; preds = %for.body.lr.ph + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C20, %A21 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C20, %B22 + %diff.check23 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check23 + br i1 %conflict.rdx, label %for.body.preheader, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + br label %vector.body + +vector.body: ; preds = %vector.body.preheader, %vector.body + %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ] + %evl.phi = phi i64 [ %evl.next, %vector.body ], [ %N, %vector.body.preheader ] + %9 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %9, i64 3, i64 0) + %10 = trunc i64 %vl to i32 + %11 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %11, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %12 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load24 = call @llvm.vp.load.nxv1f64.p0(ptr %12, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %13 = fadd %vp.load, %vp.load24 + %14 = fmul %vp.load, %vp.load24 + %15 = select i1 %cmp1, %13, %14 + %16 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %15, ptr %16, shufflevector ( insertelement ( poison, i1 
true, i64 0), poison, zeroinitializer), i32 %10) + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %17 = icmp eq i64 %index.next, %N + br i1 %17, label %for.end.loopexit25, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader, %for.body + %I.019 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.019 + %18 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.019 + %19 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %18, %19 + %mul = fmul double %18, %19 + %mul.sink = select i1 %cmp1, double %add, double %mul + %20 = getelementptr inbounds double, ptr %C, i64 %I.019 + store double %mul.sink, ptr %20, align 8 + %inc = add nuw nsw i64 %I.019, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit25: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit25, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = 
!{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} diff --git a/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll b/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll new file mode 100644 index 000000000000..34e4c63c12af --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll @@ -0,0 +1,220 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-else1.c' +source_filename = "custom/if-else1.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) { +; if (I < 50) +; C[I] = A[I] + B[I]; +; else +; C[I] = A[I] * B[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B22:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A21:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C20:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP18:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP18]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER25:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader25: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C20]], [[A21]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C20]], [[B22]] +; CHECK-NEXT: [[DIFF_CHECK23:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK23]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER25]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv1i64() +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP10]], i64 3, i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[VL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VP_OP2:%.*]] 
= call @llvm.vp.icmp.nxv1i64( [[VEC_IND]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer), metadata !"ult", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD24:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP3:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], [[VP_OP2]], i32 [[TMP11]]) +; CHECK-NEXT: [[VP_OP4:%.*]] = call @llvm.vp.xor.nxv1i1( [[VP_OP2]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], [[VP_OP4]], i32 [[TMP11]]) +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.select.nxv1f64( [[VP_OP2]], [[VP_OP3]], [[VP_OP1]], i32 [[TMP11]]) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP]], ptr [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP15]], label [[FOR_END_LOOPEXIT26:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER25]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[I_019]], 50 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_019]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_019]] +; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[MUL_SINK:%.*]] = select i1 [[CMP1]], double [[ADD]], double [[MUL]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_019]] +; CHECK-NEXT: store double [[MUL_SINK]], ptr [[TMP18]], align 8 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_019]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit26: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B22 = ptrtoint ptr %B to i64 + %A21 = ptrtoint ptr %A to i64 + %C20 = ptrtoint ptr %C to i64 + %cmp18 = icmp sgt i64 %N, 0 + br i1 %cmp18, label %for.body.preheader, label %for.end + 
+for.body.preheader: ; preds = %entry + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 8) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader25, label %vector.memcheck + +for.body.preheader25: ; preds = %vector.memcheck, %for.body.preheader + br label %for.body + +vector.memcheck: ; preds = %for.body.preheader + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C20, %A21 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C20, %B22 + %diff.check23 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check23 + br i1 %conflict.rdx, label %for.body.preheader25, label %vector.ph + +vector.ph: ; preds = %vector.memcheck + %9 = call @llvm.experimental.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %evl.phi = phi i64 [ %N, %vector.ph ], [ %evl.next, %vector.body ] + %vec.ind = phi [ %9, %vector.ph ], [ %vec.ind.next, %vector.body ] + %10 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %10, i64 3, i64 0) + %11 = trunc i64 %vl to i32 + %.splatinsert = insertelement poison, i64 %vl, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %12 = icmp ult %vec.ind, shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) + %13 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %13, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11), !tbaa !4 + %14 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load24 = call @llvm.vp.load.nxv1f64.p0(ptr %14, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11), !tbaa !4 + %15 = fadd %vp.load, %vp.load24 + %16 = fmul %vp.load, %vp.load24 + %17 = select %12, %15, %16 + %18 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %17, ptr %18, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11) + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %vec.ind.next = add %vec.ind, %.splat + %19 = icmp eq i64 %index.next, %N + br i1 %19, label %for.end.loopexit26, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader25, %for.body + %I.019 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader25 ] + %cmp1 = icmp ult i64 %I.019, 50 + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.019 + %20 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.019 + %21 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %20, %21 + %mul = fmul double %20, %21 + %mul.sink = select i1 %cmp1, double %add, double %mul + %22 = getelementptr inbounds double, ptr %C, i64 %I.019 + store double %mul.sink, ptr %22, align 8 + %inc = add nuw nsw i64 %I.019, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit26: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit26, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind 
speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} diff --git a/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll b/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll new file mode 100644 index 000000000000..116d883572ee --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/simple.c' +source_filename = "custom/simple.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) +; C[I] = A[I] + B[I]; +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B11:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A10:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; 
CHECK-NEXT: [[C9:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 10) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER14:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader14: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C9]], [[A10]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C9]], [[B11]] +; CHECK-NEXT: [[DIFF_CHECK12:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK12]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER14]], label [[VECTOR_BODY_PREHEADER:%.*]] +; CHECK: vector.body.preheader: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ], [ [[N]], [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP9]], i64 3, i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD13:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP]], ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[FOR_END_LOOPEXIT15:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER14]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_08]] +; CHECK-NEXT: 
[[TMP16:%.*]] = load double, ptr [[ARRAYIDX1]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_08]] +; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit15: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B11 = ptrtoint ptr %B to i64 + %A10 = ptrtoint ptr %A to i64 + %C9 = ptrtoint ptr %C to i64 + %cmp7 = icmp sgt i64 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 10) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader14, label %vector.memcheck + +for.body.preheader14: ; preds = %vector.memcheck, %for.body.preheader + br label %for.body + +vector.memcheck: ; preds = %for.body.preheader + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C9, %A10 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C9, %B11 + %diff.check12 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check12 + br i1 %conflict.rdx, label %for.body.preheader14, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + br label %vector.body + +vector.body: ; preds = %vector.body.preheader, %vector.body + %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ] + %evl.phi = phi i64 [ %evl.next, %vector.body ], [ %N, %vector.body.preheader ] + %9 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %9, i64 3, i64 0) + %10 = trunc i64 %vl to i32 + %11 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %11, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %12 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load13 = call @llvm.vp.load.nxv1f64.p0(ptr %12, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %13 = fadd %vp.load, %vp.load13 + %14 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %13, ptr %14, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %15 = icmp eq i64 %index.next, %N + br i1 %15, label %for.end.loopexit15, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader14, %for.body + %I.08 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader14 ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.08 + %16 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx1 = getelementptr inbounds double, ptr %B, i64 %I.08 + %17 = load double, ptr %arrayidx1, align 8, !tbaa !4 + %add = fadd double %16, %17 + %arrayidx2 = getelementptr inbounds double, ptr %C, i64 %I.08 + store double %add, ptr %arrayidx2, align 8, !tbaa !4 + %inc = add nuw nsw i64 %I.08, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, 
label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit15: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit15, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} diff --git a/llvm/unittests/IR/VectorBuilderTest.cpp b/llvm/unittests/IR/VectorBuilderTest.cpp index 4f9e9d7c494d..7b0109a77b3e 100644 --- a/llvm/unittests/IR/VectorBuilderTest.cpp +++ b/llvm/unittests/IR/VectorBuilderTest.cpp @@ -66,8 +66,8 @@ TEST_F(VectorBuilderTest, TestCreateBinaryInstructions) { bool IsFP = (#INSTCLASS)[0] == 'F'; \ auto *ValueTy = IsFP ? 
FloatVecTy : IntVecTy;                                       \
     Value *Op = UndefValue::get(ValueTy);                                     \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,    \
-                                             {Op, Op});                       \
+    auto *I = VBuild.createVectorInstructionFromOpcode(Instruction::OPCODE,   \
+                                                       ValueTy, {Op, Op});    \
     ASSERT_TRUE(isa<VPIntrinsic>(I));                                         \
     auto *VPIntrin = cast<VPIntrinsic>(I);                                    \
     ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                              \
@@ -116,8 +116,8 @@ TEST_F(VectorBuilderTest, TestCreateBinaryInstructions_FixedVector_NoMask) {
     bool IsFP = (#INSTCLASS)[0] == 'F';                                       \
     Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                             \
     Value *Op = UndefValue::get(ValueTy);                                     \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,    \
-                                             {Op, Op});                       \
+    auto *I = VBuild.createVectorInstructionFromOpcode(Instruction::OPCODE,   \
+                                                       ValueTy, {Op, Op});    \
     ASSERT_TRUE(isa<VPIntrinsic>(I));                                         \
     auto *VPIntrin = cast<VPIntrinsic>(I);                                    \
     ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                              \
@@ -162,8 +162,8 @@ TEST_F(VectorBuilderTest, TestCreateBinaryInstructions_FixedVector_NoEVL) {
     bool IsFP = (#INSTCLASS)[0] == 'F';                                       \
     Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                             \
     Value *Op = UndefValue::get(ValueTy);                                     \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,    \
-                                             {Op, Op});                       \
+    auto *I = VBuild.createVectorInstructionFromOpcode(Instruction::OPCODE,   \
+                                                       ValueTy, {Op, Op});    \
     ASSERT_TRUE(isa<VPIntrinsic>(I));                                         \
     auto *VPIntrin = cast<VPIntrinsic>(I);                                    \
     ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                              \
@@ -197,8 +197,8 @@ TEST_F(VectorBuilderTest,
     bool IsFP = (#INSTCLASS)[0] == 'F';                                       \
     Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                             \
     Value *Op = UndefValue::get(ValueTy);                                     \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,    \
-                                             {Op, Op});                       \
+    auto *I = VBuild.createVectorInstructionFromOpcode(Instruction::OPCODE,   \
+                                                       ValueTy, {Op, Op});    \
     ASSERT_TRUE(isa<VPIntrinsic>(I));                                         \
     auto *VPIntrin = cast<VPIntrinsic>(I);                                    \
     ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                              \
@@ -227,8 +227,8 @@ TEST_F(VectorBuilderTest, TestCreateLoadStore) {
 
   // vp.load
   auto LoadVPID = VPIntrinsic::getForOpcode(Instruction::Load);
-  auto *LoadIntrin = VBuild.createVectorInstruction(Instruction::Load,
-                                                    FloatVecTy, {FloatVecPtr});
+  auto *LoadIntrin = VBuild.createVectorInstructionFromOpcode(
+      Instruction::Load, FloatVecTy, {FloatVecPtr});
   ASSERT_TRUE(isa<VPIntrinsic>(LoadIntrin));
   auto *VPLoad = cast<VPIntrinsic>(LoadIntrin);
   ASSERT_EQ(VPLoad->getIntrinsicID(), LoadVPID);
@@ -237,8 +237,8 @@ TEST_F(VectorBuilderTest, TestCreateLoadStore) {
   // vp.store
   auto *VoidTy = Builder.getVoidTy();
   auto StoreVPID = VPIntrinsic::getForOpcode(Instruction::Store);
-  auto *StoreIntrin = VBuild.createVectorInstruction(Instruction::Store, VoidTy,
-                                                     {FloatVec, FloatVecPtr});
+  auto *StoreIntrin = VBuild.createVectorInstructionFromOpcode(
+      Instruction::Store, VoidTy, {FloatVec, FloatVecPtr});
   ASSERT_TRUE(isa<VPIntrinsic>(LoadIntrin));
   auto *VPStore = cast<VPIntrinsic>(StoreIntrin);
   ASSERT_EQ(VPStore->getIntrinsicID(), StoreVPID);
@@ -257,7 +257,8 @@ TEST_F(VectorBuilderTest, TestFail_SilentlyReturnNone) {
   auto *VoidTy = Builder.getVoidTy();
   VectorBuilder VBuild(Builder, VectorBuilder::Behavior::SilentlyReturnNone);
   VBuild.setMask(Mask).setEVL(EVL);
-  auto *Val = VBuild.createVectorInstruction(Instruction::Br, VoidTy, {});
+  auto *Val =
+      VBuild.createVectorInstructionFromOpcode(Instruction::Br, VoidTy, {});
   ASSERT_EQ(Val, nullptr);
 }
 
@@ -272,8 +273,11 @@ TEST_F(VectorBuilderTest, TestFail_ReportAndAbort) {
   auto *VoidTy = Builder.getVoidTy();
   VectorBuilder VBuild(Builder, VectorBuilder::Behavior::ReportAndAbort);
   VBuild.setMask(Mask).setEVL(EVL);
-  ASSERT_DEATH({ VBuild.createVectorInstruction(Instruction::Br, VoidTy, {}); },
-               "No VPIntrinsic for this opcode");
+  ASSERT_DEATH(
+      {
+        VBuild.createVectorInstructionFromOpcode(Instruction::Br, VoidTy, {});
+      },
+      "No VPIntrinsic for this opcode");
 }
 
 } // end anonymous namespace
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 65d241feeab2..539701822bfb 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1036,7 +1036,8 @@ TEST(VPRecipeTest, CastVPWidenMemoryInstructionRecipeToVPUserAndVPDef) {
       new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1));
   VPValue Addr;
   VPValue Mask;
-  VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, true, false);
+  VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, nullptr, true,
+                                        false);
   EXPECT_TRUE(isa<VPUser>(&Recipe));
   VPRecipeBase *BaseR = &Recipe;
   EXPECT_TRUE(isa<VPUser>(BaseR));
@@ -1131,7 +1132,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
       new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1));
   VPValue Addr;
   VPValue Mask;
-  VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, true, false);
+  VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, nullptr, true,
+                                        false);
   EXPECT_FALSE(Recipe.mayHaveSideEffects());
   EXPECT_TRUE(Recipe.mayReadFromMemory());
   EXPECT_FALSE(Recipe.mayWriteToMemory());
@@ -1145,8 +1147,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
     VPValue Addr;
     VPValue Mask;
     VPValue StoredV;
-    VPWidenMemoryInstructionRecipe Recipe(*Store, &Addr, &StoredV, &Mask, false,
-                                          false);
+    VPWidenMemoryInstructionRecipe Recipe(*Store, &Addr, &StoredV, &Mask,
+                                          nullptr, false, false);
     EXPECT_TRUE(Recipe.mayHaveSideEffects());
     EXPECT_FALSE(Recipe.mayReadFromMemory());
     EXPECT_TRUE(Recipe.mayWriteToMemory());
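
A minimal sketch (not part of the patch) of how the widened-load recipe from the VPlanTest.cpp hunks above might be constructed once a real value is supplied for the new operand. The assumption that this fourth operand carries an explicit-vector-length (EVL) VPValue, and the EVL name itself, are illustrative only; everything else mirrors the existing test setup shown in the hunks:

  // Sketch under the assumption that the new (fourth) constructor operand is
  // an EVL VPValue; the nullptr used in the updated tests is replaced here.
  LLVMContext C;
  IntegerType *Int32 = IntegerType::get(C, 32);
  auto *Int32Ptr = PointerType::get(Int32, 0);
  auto *Load =
      new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1));
  VPValue Addr;
  VPValue Mask;
  VPValue EVL; // hypothetical EVL operand passed instead of nullptr
  VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, &EVL, true, false);
  EXPECT_TRUE(Recipe.mayReadFromMemory()); // a widened load still reads memory
  delete Load;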