From db755a202e6ea40d4429f89c4b14e6c815250d3c Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 19 Dec 2023 11:10:31 +0000 Subject: [PATCH] [vecz] Add support for masking cmpxchg instructions This finishes off the support for masked atomic instructions. The scheme is essentially identical to that of atomic RMW instructions, except that the instruction returns a literal struct containing the value and a boolean success value. These must also be packetized for efficient results. The packetization of literal struct types - those unnamed structs returned by cmpxchg - has gone through some refactoring, so now the vectorized type of `{ i32, i1 }` is (e.g.) `{ <4 x i32>, <4 x i1> }`. This in practice makes it much more efficient to extract component vectors from the structs. We can see the effect on codegen in the associated LIT tests. It also makes it possible to scalably vectorize cmpxchg instructions. Note that now even unmasked cmpxchg instructions generate the masked builtin, albeit with an "all true" mask. This is to help maintain a uniform internal representation of the packetized literal structure type. --- .../analysis/uniform_value_analysis.cpp | 2 + .../source/include/vectorization_context.h | 46 ++- .../control_flow_conversion_pass.cpp | 84 ++-- .../transform/packetization_helpers.cpp | 17 +- .../vecz/source/transform/packetizer.cpp | 205 ++++++++-- .../vecz/source/vectorization_context.cpp | 384 +++++++++++------- .../test/lit/llvm/ScalableVectors/cmpxchg.ll | 37 +- .../llvm/VectorPredication/masked_atomics.ll | 106 +++++ .../compiler/vecz/test/lit/llvm/cmpxchg.ll | 115 ++---- .../vecz/test/lit/llvm/diverging_atomic.ll | 46 --- .../vecz/test/lit/llvm/masked_cmpxchg.ll | 105 +++++ .../test/lit/llvm/masked_cmpxchg_scalar.ll | 48 +++ 12 files changed, 845 insertions(+), 350 deletions(-) create mode 100644 modules/compiler/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll delete mode 100644 modules/compiler/vecz/test/lit/llvm/diverging_atomic.ll create mode 100644 modules/compiler/vecz/test/lit/llvm/masked_cmpxchg.ll create mode 100644 modules/compiler/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll diff --git a/modules/compiler/vecz/source/analysis/uniform_value_analysis.cpp b/modules/compiler/vecz/source/analysis/uniform_value_analysis.cpp index 3f4f495c1..e2a696aa0 100644 --- a/modules/compiler/vecz/source/analysis/uniform_value_analysis.cpp +++ b/modules/compiler/vecz/source/analysis/uniform_value_analysis.cpp @@ -198,6 +198,8 @@ void UniformValueResult::findVectorLeaves( Op->isMaskedScatterGatherMemOp())) { IsCallLeaf = true; } + } else if (Ctx.isMaskedAtomicFunction(*CI->getCalledFunction())) { + IsCallLeaf = true; } if (IsCallLeaf) { Leaves.push_back(CI); diff --git a/modules/compiler/vecz/source/include/vectorization_context.h b/modules/compiler/vecz/source/include/vectorization_context.h index 3d580525c..ad140dd60 100644 --- a/modules/compiler/vecz/source/include/vectorization_context.h +++ b/modules/compiler/vecz/source/include/vectorization_context.h @@ -153,36 +153,59 @@ class VectorizationContext { /// @return The masked version of the function llvm::Function *getOrCreateMaskedFunction(llvm::CallInst *CI); - struct MaskedAtomicRMW { + /// @brief Represents either an atomicrmw or cmpxchg operation. + /// + /// Most fields are shared, with the exception of CmpXchgFailureOrdering and + /// IsWeak, which are only to be set for cmpxchg, and BinOp, which is only to + /// be set to a valid value for atomicrmw. 
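+  ///
+  /// These fields are encoded in (and parsed back out of) the name of the
+  /// masked atomic builtin, e.g.
+  /// __vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b.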
+  struct MaskedAtomic {
     llvm::Type *PointerTy;
     llvm::Type *ValTy;
+    /// @brief Must be set to BAD_BINOP for cmpxchg instructions
     llvm::AtomicRMWInst::BinOp BinOp;
     llvm::Align Align;
     bool IsVolatile = false;
     llvm::SyncScope::ID SyncScope;
     llvm::AtomicOrdering Ordering;
+    /// @brief Must be set for cmpxchg instructions
+    std::optional<llvm::AtomicOrdering> CmpXchgFailureOrdering = std::nullopt;
+    /// @brief Must only be set for cmpxchg instructions
+    bool IsWeak = false;
     // Vectorization info
     llvm::ElementCount VF;
     bool IsVectorPredicated = false;
+
+    /// @brief Returns true if this MaskedAtomic represents a cmpxchg operation.
+    bool isCmpXchg() const {
+      if (CmpXchgFailureOrdering.has_value()) {
+        // 'binop' only applies to atomicrmw
+        assert(BinOp == llvm::AtomicRMWInst::BAD_BINOP &&
+               "Invalid MaskedAtomic state");
+        return true;
+      }
+      // 'weak' only applies to cmpxchg
+      assert(!IsWeak && "Invalid MaskedAtomic state");
+      return false;
+    }
   };

-  /// @brief Check if the given function is a masked version of an atomic RMW
-  /// operation.
+  /// @brief Check if the given function is a masked version of an atomicrmw or
+  /// cmpxchg operation.
   ///
   /// @param[in] F The function to check
-  /// @return A MaskedAtomicRMW instance detailing the atomic operation if the
-  /// function is a masked atomic RMW, or std::nullopt otherwise
-  std::optional<MaskedAtomicRMW> isMaskedAtomicRMWFunction(
+  /// @return A MaskedAtomic instance detailing the atomic operation if the
+  /// function is a masked atomic, or std::nullopt otherwise
+  std::optional<MaskedAtomic> isMaskedAtomicFunction(
       const llvm::Function &F) const;

   /// @brief Get (if it exists already) or create the function representing the
-  /// masked version of an atomic RMW operation.
+  /// masked version of an atomicrmw/cmpxchg operation.
   ///
   /// @param[in] I Atomic to be masked
   /// @param[in] Choices Choices to mangle into the function name
   /// @param[in] VF The vectorization factor of the atomic operation
   /// @return The masked version of the function
-  llvm::Function *getOrCreateMaskedAtomicRMWFunction(
-      MaskedAtomicRMW &I, const VectorizationChoices &Choices,
+  llvm::Function *getOrCreateMaskedAtomicFunction(
+      MaskedAtomic &I, const VectorizationChoices &Choices,
       llvm::ElementCount VF);

   /// @brief Create a VectorizationUnit to use to vectorize the given scalar
@@ -296,10 +319,9 @@ class VectorizationContext {
   /// @brief Emit the body for a masked atomic builtin
   ///
   /// @param[in] F The empty (declaration only) function to emit the body in
-  /// @param[in] MA The MaskedAtomicRMW information
+  /// @param[in] MA The MaskedAtomic information
   /// @returns true on success, false otherwise
-  bool emitMaskedAtomicRMWBody(llvm::Function &F,
-                               const MaskedAtomicRMW &MA) const;
+  bool emitMaskedAtomicBody(llvm::Function &F, const MaskedAtomic &MA) const;

   /// @brief Helper for non-vectorization tasks.
TargetInfo &VTI; diff --git a/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp b/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp index e16eab41b..f7649b5b4 100644 --- a/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp +++ b/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp @@ -214,15 +214,14 @@ class ControlFlowConversionState::Impl : public ControlFlowConversionState { /// @return true if it is valid to mask this call, false otherwise bool applyMaskToCall(CallInst *CI, Value *mask, DeletionMap &toDelete); - /// @brief Attempt to apply a mask to an AtomicRMW instruction via a builtin + /// @brief Attempt to apply a mask to an atomic instruction via a builtin /// call. /// - /// @param[in] atomicI The atomic instruction to apply the mask to + /// @param[in] I The (atomic) instruction to apply the mask to /// @param[in] mask The mask to apply to the masked atomic /// @param[out] toDelete mapping of deleted unmasked operations /// @return true if it is valid to mask this atomic, false otherwise - bool applyMaskToAtomicRMW(AtomicRMWInst &atomicI, Value *mask, - DeletionMap &toDelete); + bool applyMaskToAtomic(Instruction &I, Value *mask, DeletionMap &toDelete); /// @brief Linearize a CFG. /// @return true if no problem occurred, false otherwise. @@ -1138,9 +1137,7 @@ Error ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) { } } else if (I.isAtomic() && !isa(&I)) { // Turn atomics into calls to masked builtins if possible. - // FIXME: We don't yet support masked cmpxchg instructions. - if (auto *atomicI = dyn_cast(&I); - !atomicI || !applyMaskToAtomicRMW(*atomicI, mask, toDelete)) { + if (!applyMaskToAtomic(I, mask, toDelete)) { return makeStringError("Could not apply mask to atomic instruction", I); } } else if (auto *branch = dyn_cast(&I)) { @@ -1372,41 +1369,66 @@ bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI, return true; } -bool ControlFlowConversionState::Impl::applyMaskToAtomicRMW( - AtomicRMWInst &atomicI, Value *mask, DeletionMap &toDelete) { - LLVM_DEBUG(dbgs() << "vecz-cf: Now at AtomicRMWInst " << atomicI << "\n"); +bool ControlFlowConversionState::Impl::applyMaskToAtomic( + Instruction &I, Value *mask, DeletionMap &toDelete) { + LLVM_DEBUG(dbgs() << "vecz-cf: Now at atomic inst " << I << "\n"); - VectorizationContext::MaskedAtomicRMW MA; - MA.Align = atomicI.getAlign(); - MA.BinOp = atomicI.getOperation(); - MA.IsVectorPredicated = VU.choices().vectorPredication(); - MA.IsVolatile = atomicI.isVolatile(); - MA.Ordering = atomicI.getOrdering(); - MA.SyncScope = atomicI.getSyncScopeID(); + SmallVector maskedFnArgs; + VectorizationContext::MaskedAtomic MA; MA.VF = ElementCount::getFixed(1); - MA.ValTy = atomicI.getType(); - MA.PointerTy = atomicI.getPointerOperand()->getType(); + MA.IsVectorPredicated = VU.choices().vectorPredication(); + + if (auto *atomicI = dyn_cast(&I)) { + MA.Align = atomicI->getAlign(); + MA.BinOp = atomicI->getOperation(); + MA.IsVolatile = atomicI->isVolatile(); + MA.Ordering = atomicI->getOrdering(); + MA.SyncScope = atomicI->getSyncScopeID(); + MA.ValTy = atomicI->getType(); + MA.PointerTy = atomicI->getPointerOperand()->getType(); + + // Set up the arguments to this function + maskedFnArgs = {atomicI->getPointerOperand(), atomicI->getValOperand(), + mask}; + + } else if (auto *cmpxchgI = dyn_cast(&I)) { + MA.Align = cmpxchgI->getAlign(); + MA.BinOp = AtomicRMWInst::BAD_BINOP; + MA.IsWeak = cmpxchgI->isWeak(); + 
MA.IsVolatile = cmpxchgI->isVolatile(); + MA.Ordering = cmpxchgI->getSuccessOrdering(); + MA.CmpXchgFailureOrdering = cmpxchgI->getFailureOrdering(); + MA.SyncScope = cmpxchgI->getSyncScopeID(); + MA.ValTy = cmpxchgI->getCompareOperand()->getType(); + MA.PointerTy = cmpxchgI->getPointerOperand()->getType(); + + // Set up the arguments to this function + maskedFnArgs = {cmpxchgI->getPointerOperand(), + cmpxchgI->getCompareOperand(), cmpxchgI->getNewValOperand(), + mask}; + } else { + return false; + } + // Create the new function and replace the old one with it // Get the masked function - Function *newFunction = Ctx.getOrCreateMaskedAtomicRMWFunction( + Function *maskedAtomicFn = Ctx.getOrCreateMaskedAtomicFunction( MA, VU.choices(), ElementCount::getFixed(1)); - VECZ_FAIL_IF(!newFunction); - SmallVector fnArgs = {atomicI.getPointerOperand(), - atomicI.getValOperand(), mask}; + VECZ_FAIL_IF(!maskedAtomicFn); // We don't have a vector length just yet - pass in one as a dummy. if (MA.IsVectorPredicated) { - fnArgs.push_back( - ConstantInt::get(IntegerType::getInt32Ty(atomicI.getContext()), 1)); + maskedFnArgs.push_back( + ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 1)); } - CallInst *newCI = CallInst::Create(newFunction, fnArgs, "", &atomicI); - VECZ_FAIL_IF(!newCI); + CallInst *maskedCI = CallInst::Create(maskedAtomicFn, maskedFnArgs, "", &I); + VECZ_FAIL_IF(!maskedCI); - atomicI.replaceAllUsesWith(newCI); - toDelete.emplace_back(&atomicI, newCI); + I.replaceAllUsesWith(maskedCI); + toDelete.emplace_back(&I, maskedCI); - LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << atomicI << "\n"); - LLVM_DEBUG(dbgs() << " with " << *newCI << "\n"); + LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << I << "\n"); + LLVM_DEBUG(dbgs() << " with " << *maskedCI << "\n"); return true; } diff --git a/modules/compiler/vecz/source/transform/packetization_helpers.cpp b/modules/compiler/vecz/source/transform/packetization_helpers.cpp index 11e954e73..ad08844ad 100644 --- a/modules/compiler/vecz/source/transform/packetization_helpers.cpp +++ b/modules/compiler/vecz/source/transform/packetization_helpers.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -46,6 +47,18 @@ using namespace vecz; namespace { inline Type *getWideType(Type *ty, ElementCount factor) { if (!ty->isVectorTy()) { + // The wide type of a struct literal is the wide type of each of its + // elements. 
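+    // For example, { i32, i1 } widened by a factor of 4 becomes
+    // { <4 x i32>, <4 x i1> }.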
+ if (auto *structTy = dyn_cast(ty); + structTy && structTy->isLiteral()) { + SmallVector wideElts(structTy->elements()); + for (unsigned i = 0, e = wideElts.size(); i != e; i++) { + wideElts[i] = getWideType(wideElts[i], factor); + } + return StructType::get(ty->getContext(), wideElts); + } else if (structTy) { + VECZ_ERROR("Can't create wide type for structure type"); + } return VectorType::get(ty, factor); } bool const isScalable = isa(ty); @@ -694,7 +707,9 @@ const Packetizer::Result &Packetizer::Result::broadcast(unsigned width) const { auto &F = packetizer.F; Value *result = nullptr; const auto &TI = packetizer.context().targetInfo(); - if (isa(scalar)) { + if (isa(scalar)) { + result = PoisonValue::get(getWideType(ty, factor)); + } else if (isa(scalar)) { result = UndefValue::get(getWideType(ty, factor)); } else if (ty->isVectorTy() && factor.isScalable()) { IRBuilder<> B(buildAfter(scalar, F)); diff --git a/modules/compiler/vecz/source/transform/packetizer.cpp b/modules/compiler/vecz/source/transform/packetizer.cpp index 9b80bca86..3191f4d86 100644 --- a/modules/compiler/vecz/source/transform/packetizer.cpp +++ b/modules/compiler/vecz/source/transform/packetizer.cpp @@ -302,14 +302,14 @@ class Packetizer::Impl : public Packetizer { /// /// @return Packetized instruction. ValuePacket packetizeMemOp(MemOp &Op); - /// @brief Packetize a masked atomic RMW operation. + /// @brief Packetize a masked atomicrmw or cmpxchg operation. /// - /// @param[in] CI Masked atomic RMW builtin call to packetize. - /// @param[in] AtomicInfo Information about the masked atomic RMW. + /// @param[in] CI Masked atomic builtin call to packetize. + /// @param[in] AtomicInfo Information about the masked atomic. /// /// @return Packetized instruction. - ValuePacket packetizeMaskedAtomicRMW( - CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo); + ValuePacket packetizeMaskedAtomic( + CallInst &CI, VectorizationContext::MaskedAtomic AtomicInfo); /// @brief Packetize a GEP instruction. /// /// @param[in] GEP Instruction to packetize. @@ -334,6 +334,12 @@ class Packetizer::Impl : public Packetizer { /// /// @return Packetized instruction. ValuePacket packetizeFreeze(FreezeInst *FreezeI); + /// @brief Packetize an atomic cmpxchg instruction. + /// + /// @param[in] AtomicI Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeAtomicCmpXchg(AtomicCmpXchgInst *AtomicI); /// @brief Packetize a unary operator instruction. /// /// @param[in] UnOp Instruction to packetize. @@ -402,6 +408,22 @@ class Packetizer::Impl : public Packetizer { /// /// @return Packetized instruction. ValuePacket packetizeExtractElement(ExtractElementInst *ExtractElement); + /// @brief Packetize an insert value instruction. + /// + /// Only packetizes inserts into literal struct types. + /// + /// @param[in] InsertValue Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeInsertValue(InsertValueInst *InsertValue); + /// @brief Packetize an extract value instruction. + /// + /// Only packetizes extracts from literal struct types. + /// + /// @param[in] ExtractValue Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeExtractValue(ExtractValueInst *ExtractValue); /// @brief Packetize a shuffle vector instruction. /// /// @param[in] Shuffle Instruction to packetize. 
@@ -1157,12 +1179,21 @@ Packetizer::Result Packetizer::Impl::packetizeInstruction(Instruction *Ins) { case Instruction::ExtractElement: results = packetizeExtractElement(cast(Ins)); break; + case Instruction::InsertValue: + results = packetizeInsertValue(cast(Ins)); + break; + case Instruction::ExtractValue: + results = packetizeExtractValue(cast(Ins)); + break; case Instruction::ShuffleVector: results = packetizeShuffleVector(cast(Ins)); break; case Instruction::Freeze: results = packetizeFreeze(cast(Ins)); break; + case Instruction::AtomicCmpXchg: + results = packetizeAtomicCmpXchg(cast(Ins)); + break; } if (auto res = getPacketizationResult(Ins, results, /*update stats*/ true)) { @@ -2102,8 +2133,8 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) { return packetizeMemOp(*MaskedOp); } } - if (auto AtomicInfo = Ctx.isMaskedAtomicRMWFunction(*Callee)) { - return packetizeMaskedAtomicRMW(*CI, *AtomicInfo); + if (auto AtomicInfo = Ctx.isMaskedAtomicFunction(*Callee)) { + return packetizeMaskedAtomic(*CI, *AtomicInfo); } } @@ -2778,16 +2809,18 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) { return results; } -ValuePacket Packetizer::Impl::packetizeMaskedAtomicRMW( - CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo) { +ValuePacket Packetizer::Impl::packetizeMaskedAtomic( + CallInst &CI, VectorizationContext::MaskedAtomic AtomicInfo) { ValuePacket results; - Value *const ptr = CI.getArgOperand(0); - Value *const val = CI.getArgOperand(1); - Value *const mask = CI.getArgOperand(2); + bool const IsCmpXchg = AtomicInfo.isCmpXchg(); - assert(AtomicInfo.ValTy == val->getType() && "AtomicInfo mismatch"); - auto const packetWidth = getPacketWidthForType(val->getType()); + Value *const ptrArg = CI.getArgOperand(0); + Value *const valOrCmpArg = CI.getArgOperand(1); + Value *const maskArg = CI.getArgOperand(2 + IsCmpXchg); + + assert(AtomicInfo.ValTy == valOrCmpArg->getType() && "AtomicInfo mismatch"); + auto const packetWidth = getPacketWidthForType(valOrCmpArg->getType()); if (VL && packetWidth != 1) { emitVeczRemarkMissed(&F, &CI, @@ -2795,20 +2828,29 @@ ValuePacket Packetizer::Impl::packetizeMaskedAtomicRMW( return {}; } - ValuePacket valPacket; - Result valResult = packetize(val); + ValuePacket valOrCmpPacket; + Result valResult = packetize(valOrCmpArg); PACK_FAIL_IF(!valResult); - valResult.getPacketValues(packetWidth, valPacket); - PACK_FAIL_IF(valPacket.empty()); + valResult.getPacketValues(packetWidth, valOrCmpPacket); + PACK_FAIL_IF(valOrCmpPacket.empty()); + + ValuePacket newValPacket; + if (IsCmpXchg) { + Value *const newValArg = CI.getArgOperand(2); + Result newValResult = packetize(newValArg); + PACK_FAIL_IF(!newValResult); + newValResult.getPacketValues(packetWidth, newValPacket); + PACK_FAIL_IF(newValPacket.empty()); + } ValuePacket ptrPacket; - Result ptrResult = packetize(ptr); + Result ptrResult = packetize(ptrArg); PACK_FAIL_IF(!ptrResult); ptrResult.getPacketValues(packetWidth, ptrPacket); PACK_FAIL_IF(ptrPacket.empty()); ValuePacket maskPacket; - Result maskResult = packetize(mask); + Result maskResult = packetize(maskArg); PACK_FAIL_IF(!maskResult); maskResult.getPacketValues(packetWidth, maskPacket); PACK_FAIL_IF(maskPacket.empty()); @@ -2817,16 +2859,20 @@ ValuePacket Packetizer::Impl::packetizeMaskedAtomicRMW( IC.deleteInstructionLater(&CI); for (unsigned i = 0; i != packetWidth; ++i) { - auto *const ptrI = ptrPacket[i]; - auto *const valI = valPacket[i]; + auto *const ptr = ptrPacket[i]; + auto *const valOrCmp = valOrCmpPacket[i]; - 
AtomicInfo.ValTy = valI->getType(); - AtomicInfo.PointerTy = ptrI->getType(); + AtomicInfo.ValTy = valOrCmp->getType(); + AtomicInfo.PointerTy = ptr->getType(); auto *maskedAtomicF = - Ctx.getOrCreateMaskedAtomicRMWFunction(AtomicInfo, Choices, SimdWidth); + Ctx.getOrCreateMaskedAtomicFunction(AtomicInfo, Choices, SimdWidth); PACK_FAIL_IF(!maskedAtomicF); - SmallVector args = {ptrI, valI, maskPacket[i]}; + SmallVector args = {ptr, valOrCmp}; + if (IsCmpXchg) { + args.push_back(newValPacket[i]); + } + args.push_back(maskPacket[i]); if (AtomicInfo.IsVectorPredicated) { assert(VL && "Missing vector length"); args.push_back(VL); @@ -2991,6 +3037,49 @@ ValuePacket Packetizer::Impl::packetizeFreeze(FreezeInst *FreezeI) { return results; } +ValuePacket Packetizer::Impl::packetizeAtomicCmpXchg( + AtomicCmpXchgInst *AtomicI) { + ValuePacket results; + + VectorizationContext::MaskedAtomic MA; + MA.VF = SimdWidth; + MA.IsVectorPredicated = VU.choices().vectorPredication(); + + MA.Align = AtomicI->getAlign(); + MA.BinOp = AtomicRMWInst::BAD_BINOP; + MA.IsWeak = AtomicI->isWeak(); + MA.IsVolatile = AtomicI->isVolatile(); + MA.Ordering = AtomicI->getSuccessOrdering(); + MA.CmpXchgFailureOrdering = AtomicI->getFailureOrdering(); + MA.SyncScope = AtomicI->getSyncScopeID(); + + IRBuilder<> B(AtomicI); + + // Set up the arguments to this function + Value *Ptr = packetize(AtomicI->getPointerOperand()).getAsValue(); + Value *Cmp = packetize(AtomicI->getCompareOperand()).getAsValue(); + Value *New = packetize(AtomicI->getNewValOperand()).getAsValue(); + + MA.ValTy = Cmp->getType(); + MA.PointerTy = Ptr->getType(); + + auto *const TrueMask = createAllTrueMask(B, SimdWidth); + SmallVector MaskedFnArgs = {Ptr, Cmp, New, TrueMask}; + if (VL) { + MaskedFnArgs.push_back(VL); + } + + Function *MaskedAtomicFn = + Ctx.getOrCreateMaskedAtomicFunction(MA, VU.choices(), SimdWidth); + PACK_FAIL_IF(!MaskedAtomicFn); + + CallInst *MaskedCI = B.CreateCall(MaskedAtomicFn, MaskedFnArgs); + + results.push_back(MaskedCI); + + return results; +} + ValuePacket Packetizer::Impl::packetizeUnaryOp(UnaryOperator *UnOp) { ValuePacket results; @@ -3716,6 +3805,70 @@ ValuePacket Packetizer::Impl::packetizeExtractElement( return results; } +ValuePacket Packetizer::Impl::packetizeInsertValue( + InsertValueInst *InsertValue) { + ValuePacket results; + + Value *const Val = InsertValue->getInsertedValueOperand(); + Value *const Aggregate = InsertValue->getAggregateOperand(); + + // We can only packetize literal struct types + if (auto *StructTy = dyn_cast(Aggregate->getType()); + !StructTy || !StructTy->isLiteral()) { + return results; + } + + Value *PackAggregate = packetizeIfVarying(Aggregate); + PACK_FAIL_IF(!PackAggregate); + + Value *PackVal = packetizeIfVarying(Val); + PACK_FAIL_IF(!PackVal); + + bool const IsValVarying = Val != PackVal; + bool const IsAggregateVarying = Aggregate != PackAggregate; + if (!IsAggregateVarying && IsValVarying) { + // If the aggregate wasn't varying but the value was + PackAggregate = packetize(Aggregate).getAsValue(); + } else if (IsAggregateVarying && !IsValVarying) { + // If the aggregate was varying but the value wasn't + PackVal = packetize(Val).getAsValue(); + } else if (!IsAggregateVarying && !IsValVarying) { + // If both were uniform + return results; + } + + IRBuilder<> B(buildAfter(InsertValue, F)); + + results.push_back( + B.CreateInsertValue(PackAggregate, PackVal, InsertValue->getIndices())); + + IC.deleteInstructionLater(InsertValue); + return results; +} + +ValuePacket 
Packetizer::Impl::packetizeExtractValue( + ExtractValueInst *ExtractValue) { + ValuePacket results; + + Value *const Aggregate = ExtractValue->getAggregateOperand(); + // We can only packetize literal struct types + if (auto *StructTy = dyn_cast(Aggregate->getType()); + !StructTy || !StructTy->isLiteral()) { + return results; + } + + Value *PackAggregate = packetizeIfVarying(Aggregate); + PACK_FAIL_IF(!PackAggregate); + + IRBuilder<> B(buildAfter(ExtractValue, F)); + + results.push_back( + B.CreateExtractValue(PackAggregate, ExtractValue->getIndices())); + + IC.deleteInstructionLater(ExtractValue); + return results; +} + ValuePacket Packetizer::Impl::packetizeShuffleVector( ShuffleVectorInst *Shuffle) { Value *const srcA = Shuffle->getOperand(0); diff --git a/modules/compiler/vecz/source/vectorization_context.cpp b/modules/compiler/vecz/source/vectorization_context.cpp index 821d62936..6880bbfc4 100644 --- a/modules/compiler/vecz/source/vectorization_context.cpp +++ b/modules/compiler/vecz/source/vectorization_context.cpp @@ -374,8 +374,8 @@ Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) { return newFunction; } -std::optional -VectorizationContext::isMaskedAtomicRMWFunction(const Function &F) const { +std::optional +VectorizationContext::isMaskedAtomicFunction(const Function &F) const { auto VFInfo = decodeVectorizedFunctionName(F.getName()); if (!VFInfo) { return std::nullopt; @@ -383,55 +383,69 @@ VectorizationContext::isMaskedAtomicRMWFunction(const Function &F) const { auto [FnNameStr, VF, Choices] = *VFInfo; llvm::StringRef FnName = FnNameStr; - if (!FnName.consume_front("masked_atomicrmw_")) { + if (!FnName.consume_front("masked_")) { return std::nullopt; } - VectorizationContext::MaskedAtomicRMW AtomicInfo; + bool IsCmpXchg = FnName.consume_front("cmpxchg_"); + if (!IsCmpXchg && !FnName.consume_front("atomicrmw_")) { + return std::nullopt; + } + VectorizationContext::MaskedAtomic AtomicInfo; AtomicInfo.VF = VF; AtomicInfo.IsVectorPredicated = Choices.vectorPredication(); + if (IsCmpXchg) { + AtomicInfo.IsWeak = FnName.consume_front("weak_"); + } AtomicInfo.IsVolatile = FnName.consume_front("volatile_"); - if (FnName.consume_front("xchg")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xchg; - } else if (FnName.consume_front("add")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Add; - } else if (FnName.consume_front("sub")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Sub; - } else if (FnName.consume_front("and")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::And; - } else if (FnName.consume_front("nand")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Nand; - } else if (FnName.consume_front("or")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Or; - } else if (FnName.consume_front("xor")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xor; - } else if (FnName.consume_front("max")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Max; - } else if (FnName.consume_front("min")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Min; - } else if (FnName.consume_front("umax")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMax; - } else if (FnName.consume_front("umin")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMin; - } else if (FnName.consume_front("fadd")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::FAdd; - } else if (FnName.consume_front("fsub")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::FSub; - } else if (FnName.consume_front("fmax")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMax; - } else if (FnName.consume_front("fmin")) { - AtomicInfo.BinOp = 
AtomicRMWInst::BinOp::FMin; - } else if (FnName.consume_front("uincwrap")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::UIncWrap; - } else if (FnName.consume_front("udecwrap")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::UDecWrap; + if (IsCmpXchg) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::BAD_BINOP; } else { - return std::nullopt; + if (FnName.consume_front("xchg")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xchg; + } else if (FnName.consume_front("add")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Add; + } else if (FnName.consume_front("sub")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Sub; + } else if (FnName.consume_front("and")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::And; + } else if (FnName.consume_front("nand")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Nand; + } else if (FnName.consume_front("or")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Or; + } else if (FnName.consume_front("xor")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xor; + } else if (FnName.consume_front("max")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Max; + } else if (FnName.consume_front("min")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Min; + } else if (FnName.consume_front("umax")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMax; + } else if (FnName.consume_front("umin")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMin; + } else if (FnName.consume_front("fadd")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::FAdd; + } else if (FnName.consume_front("fsub")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::FSub; + } else if (FnName.consume_front("fmax")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMax; + } else if (FnName.consume_front("fmin")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMin; + } else if (FnName.consume_front("uincwrap")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::UIncWrap; + } else if (FnName.consume_front("udecwrap")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::UDecWrap; + } else { + return std::nullopt; + } + if (!FnName.consume_front("_")) { + return std::nullopt; + } } - if (!FnName.consume_front("_align")) { + if (!FnName.consume_front("align")) { return std::nullopt; } @@ -446,26 +460,38 @@ VectorizationContext::isMaskedAtomicRMWFunction(const Function &F) const { return std::nullopt; } - if (FnName.consume_front("acquire")) { - AtomicInfo.Ordering = AtomicOrdering::Acquire; - } else if (FnName.consume_front("acqrel")) { - AtomicInfo.Ordering = AtomicOrdering::AcquireRelease; - } else if (FnName.consume_front("monotonic")) { - AtomicInfo.Ordering = AtomicOrdering::Monotonic; - } else if (FnName.consume_front("notatomic")) { - AtomicInfo.Ordering = AtomicOrdering::NotAtomic; - } else if (FnName.consume_front("release")) { - AtomicInfo.Ordering = AtomicOrdering::Release; - } else if (FnName.consume_front("seqcst")) { - AtomicInfo.Ordering = AtomicOrdering::SequentiallyConsistent; - } else if (FnName.consume_front("unordered")) { - AtomicInfo.Ordering = AtomicOrdering::Unordered; + auto demangleOrdering = [&FnName]() -> std::optional { + if (FnName.consume_front("acquire_")) { + return AtomicOrdering::Acquire; + } else if (FnName.consume_front("acqrel_")) { + return AtomicOrdering::AcquireRelease; + } else if (FnName.consume_front("monotonic_")) { + return AtomicOrdering::Monotonic; + } else if (FnName.consume_front("notatomic_")) { + return AtomicOrdering::NotAtomic; + } else if (FnName.consume_front("release_")) { + return AtomicOrdering::Release; + } else if (FnName.consume_front("seqcst_")) { + return AtomicOrdering::SequentiallyConsistent; + } else if 
(FnName.consume_front("unordered_")) { + return AtomicOrdering::Unordered; + } else { + return std::nullopt; + } + }; + + if (auto Ordering = demangleOrdering()) { + AtomicInfo.Ordering = *Ordering; } else { return std::nullopt; } - if (!FnName.consume_front("_")) { - return std::nullopt; + if (IsCmpXchg) { + if (auto Ordering = demangleOrdering()) { + AtomicInfo.CmpXchgFailureOrdering = *Ordering; + } else { + return std::nullopt; + } } unsigned SyncScopeID = 0; @@ -489,91 +515,114 @@ VectorizationContext::isMaskedAtomicRMWFunction(const Function &F) const { return AtomicInfo; } -Function *VectorizationContext::getOrCreateMaskedAtomicRMWFunction( - MaskedAtomicRMW &I, const VectorizationChoices &Choices, ElementCount VF) { +Function *VectorizationContext::getOrCreateMaskedAtomicFunction( + MaskedAtomic &I, const VectorizationChoices &Choices, ElementCount VF) { + bool const isCmpXchg = I.isCmpXchg(); LLVMContext &ctx = I.ValTy->getContext(); SmallVector argTys; argTys.push_back(I.PointerTy); argTys.push_back(I.ValTy); + if (isCmpXchg) { + argTys.push_back(I.ValTy); + } // Add one extra argument for the mask, which is always the same length // (scalar or vector) as the value type. auto *i1Ty = Type::getInt1Ty(ctx); - argTys.push_back( + auto *maskTy = !I.ValTy->isVectorTy() ? dyn_cast(i1Ty) - : VectorType::get(i1Ty, - cast(I.ValTy)->getElementCount())); + : VectorType::get(i1Ty, cast(I.ValTy)->getElementCount()); + argTys.push_back(maskTy); if (Choices.vectorPredication()) { argTys.push_back(Type::getInt32Ty(ctx)); } std::string maskedFnName; raw_string_ostream O(maskedFnName); - O << "masked_atomicrmw_"; + O << (isCmpXchg ? "masked_cmpxchg_" : "masked_atomicrmw_"); + + if (I.IsWeak) { + assert(isCmpXchg && "Bad MaskedAtomic state"); + O << "weak_"; + } if (I.IsVolatile) { O << "volatile_"; } + if (!isCmpXchg) { #define BINOP_CASE(BINOP, STR) \ case AtomicRMWInst::BINOP: \ O << (STR); \ break - switch (I.BinOp) { - BINOP_CASE(Xchg, "xchg"); - BINOP_CASE(Add, "add"); - BINOP_CASE(Sub, "sub"); - BINOP_CASE(And, "and"); - BINOP_CASE(Nand, "nand"); - BINOP_CASE(Or, "or"); - BINOP_CASE(Xor, "xor"); - BINOP_CASE(Max, "max"); - BINOP_CASE(Min, "min"); - BINOP_CASE(UMax, "umax"); - BINOP_CASE(UMin, "umin"); - BINOP_CASE(FAdd, "fadd"); - BINOP_CASE(FSub, "fsub"); - BINOP_CASE(FMax, "fmax"); - BINOP_CASE(FMin, "fmin"); - BINOP_CASE(UIncWrap, "uincwrap"); - BINOP_CASE(UDecWrap, "udecwrap"); - case llvm::AtomicRMWInst::BAD_BINOP: - return nullptr; - } + switch (I.BinOp) { + BINOP_CASE(Xchg, "xchg"); + BINOP_CASE(Add, "add"); + BINOP_CASE(Sub, "sub"); + BINOP_CASE(And, "and"); + BINOP_CASE(Nand, "nand"); + BINOP_CASE(Or, "or"); + BINOP_CASE(Xor, "xor"); + BINOP_CASE(Max, "max"); + BINOP_CASE(Min, "min"); + BINOP_CASE(UMax, "umax"); + BINOP_CASE(UMin, "umin"); + BINOP_CASE(FAdd, "fadd"); + BINOP_CASE(FSub, "fsub"); + BINOP_CASE(FMax, "fmax"); + BINOP_CASE(FMin, "fmin"); + BINOP_CASE(UIncWrap, "uincwrap"); + BINOP_CASE(UDecWrap, "udecwrap"); + case llvm::AtomicRMWInst::BAD_BINOP: + return nullptr; + } #undef BINOP_CASE + O << "_"; + } + + O << "align" << I.Align.value() << "_"; - O << "_align" << I.Align.value() << "_"; // Mangle ordering - switch (I.Ordering) { - default: - O << static_cast(I.Ordering); - break; - case AtomicOrdering::Acquire: - O << "acquire"; - break; - case AtomicOrdering::AcquireRelease: - O << "acqrel"; - break; - case AtomicOrdering::Monotonic: - O << "monotonic"; - break; - case AtomicOrdering::NotAtomic: - O << "notatomic"; - break; - case AtomicOrdering::Release: - O << 
"release"; - break; - case AtomicOrdering::SequentiallyConsistent: - O << "seqcst"; - break; - case AtomicOrdering::Unordered: - O << "unordered"; - break; + auto mangleOrdering = [&O](AtomicOrdering Ordering) { + switch (Ordering) { + default: + O << static_cast(Ordering); + break; + case AtomicOrdering::Acquire: + O << "acquire"; + break; + case AtomicOrdering::AcquireRelease: + O << "acqrel"; + break; + case AtomicOrdering::Monotonic: + O << "monotonic"; + break; + case AtomicOrdering::NotAtomic: + O << "notatomic"; + break; + case AtomicOrdering::Release: + O << "release"; + break; + case AtomicOrdering::SequentiallyConsistent: + O << "seqcst"; + break; + case AtomicOrdering::Unordered: + O << "unordered"; + break; + } + }; + + mangleOrdering(I.Ordering); + // Failure Ordering + if (I.CmpXchgFailureOrdering) { + O << "_"; + mangleOrdering(*I.CmpXchgFailureOrdering); } + // Syncscope O << "_" << static_cast(I.SyncScope) << "_"; @@ -588,9 +637,11 @@ Function *VectorizationContext::getOrCreateMaskedAtomicRMWFunction( maskedFnName = getVectorizedFunctionName(maskedFnName, VF, Choices, /*IsBuiltin=*/true); + Type *maskedFnRetTy = isCmpXchg ? StructType::get(I.ValTy, maskTy) : I.ValTy; + // Create the function type FunctionType *maskedFnTy = - FunctionType::get(I.ValTy, argTys, /*isVarArg=*/false); + FunctionType::get(maskedFnRetTy, argTys, /*isVarArg=*/false); return getOrCreateInternalBuiltin(maskedFnName, maskedFnTy); } @@ -687,8 +738,8 @@ bool VectorizationContext::defineInternalBuiltin(Function *F) { return emitSubgroupScanBody(*F, isInclusive, opKind, isVP); } - if (auto AtomicInfo = isMaskedAtomicRMWFunction(*F)) { - return emitMaskedAtomicRMWBody(*F, *AtomicInfo); + if (auto AtomicInfo = isMaskedAtomicFunction(*F)) { + return emitMaskedAtomicBody(*F, *AtomicInfo); } return false; @@ -1008,40 +1059,54 @@ bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive, return true; } -bool VectorizationContext::emitMaskedAtomicRMWBody( - Function &F, const VectorizationContext::MaskedAtomicRMW &MA) const { +bool VectorizationContext::emitMaskedAtomicBody( + Function &F, const VectorizationContext::MaskedAtomic &MA) const { LLVMContext &Ctx = F.getContext(); + bool IsCmpXchg = MA.isCmpXchg(); auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F); + IRBuilder<> B(EntryBB); + + BasicBlock *LoopEntryBB = EntryBB; + if (MA.IsVectorPredicated) { + auto *const VL = F.getArg(3 + IsCmpXchg); + // Early exit if the vector length is zero. We're going to unconditionally + // jump into the loop after this. + auto *const EarlyExitBB = BasicBlock::Create(Ctx, "earlyexit", &F); + auto *const CmpZero = + B.CreateICmpEQ(VL, ConstantInt::get(VL->getType(), 0)); + + LoopEntryBB = BasicBlock::Create(Ctx, "loopentry", &F); + + B.CreateCondBr(CmpZero, EarlyExitBB, LoopEntryBB); + + B.SetInsertPoint(EarlyExitBB); + B.CreateRet(PoisonValue::get(F.getReturnType())); + } + + B.SetInsertPoint(LoopEntryBB); + auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F); auto *const PtrArg = F.getArg(0); auto *const ValArg = F.getArg(1); - Value *MaskArg = F.getArg(2); + Value *MaskArg = F.getArg(2 + IsCmpXchg); const bool IsVector = ValArg->getType()->isVectorTy(); - IRBuilder<> B(EntryBB); Value *const IdxStart = B.getInt32(0); ConstantInt *const KnownMin = B.getInt32(MA.VF.getKnownMinValue()); - Value *IdxEnd = !MA.VF.isScalable() ? 
KnownMin : B.CreateVScale(KnownMin); - - // For vector-predicated masked atomics, we have to merge the incoming mask - // with a mask corresponding to the number of elements left active by the - // runtime vector length. - if (MA.IsVectorPredicated) { - auto *const VL = F.getArg(3); - auto *const IndexTy = VectorType::get(VL->getType(), MA.VF); - auto *const step = B.CreateStepVector(IndexTy); - auto *const VLMask = B.CreateICmpULT(step, B.CreateVectorSplat(MA.VF, VL)); - MaskArg = B.CreateAnd(MaskArg, VLMask); - } + Value *IdxEnd = + MA.IsVectorPredicated + ? F.getArg(3 + IsCmpXchg) + : (!MA.VF.isScalable() ? KnownMin : B.CreateVScale(KnownMin)); Value *RetVal = nullptr; + Value *RetSuccessVal = nullptr; auto CreateLoopBody = [&MA, &F, &ExitBB, PtrArg, ValArg, MaskArg, &RetVal, - IsVector]( + &RetSuccessVal, IsVector, IsCmpXchg]( BasicBlock *BB, Value *Idx, ArrayRef IVs, MutableArrayRef IVsNext) -> BasicBlock * { IRBuilder<> IRB(BB); @@ -1066,14 +1131,39 @@ bool VectorizationContext::emitMaskedAtomicRMWBody( Ptr = IRB.CreateExtractElement(PtrArg, Idx, "ptr"); Val = IRB.CreateExtractElement(ValArg, Idx, "val"); } - auto *const AtomicRMW = IRB.CreateAtomicRMW(MA.BinOp, Ptr, Val, MA.Align, - MA.Ordering, MA.SyncScope); - AtomicRMW->setVolatile(MA.IsVolatile); - if (IsVector) { - RetVal = IRB.CreateInsertElement(IVs[0], AtomicRMW, Idx, "retvec"); + if (IsCmpXchg) { + Value *NewValArg = F.getArg(2); + Value *NewVal = NewValArg; + if (IsVector) { + NewVal = IRB.CreateExtractElement(NewValArg, Idx, "newval"); + } + auto *const CmpXchg = + IRB.CreateAtomicCmpXchg(Ptr, Val, NewVal, MA.Align, MA.Ordering, + *MA.CmpXchgFailureOrdering, MA.SyncScope); + CmpXchg->setWeak(MA.IsWeak); + CmpXchg->setVolatile(MA.IsVolatile); + + if (IsVector) { + RetVal = IRB.CreateInsertElement( + IVs[0], IRB.CreateExtractValue(CmpXchg, 0), Idx, "retvec"); + RetSuccessVal = IRB.CreateInsertElement( + IVs[1], IRB.CreateExtractValue(CmpXchg, 1), Idx, "retsuccess"); + } else { + RetVal = IRB.CreateExtractValue(CmpXchg, 0); + RetSuccessVal = IRB.CreateExtractValue(CmpXchg, 1); + } + } else { - RetVal = AtomicRMW; + auto *const AtomicRMW = IRB.CreateAtomicRMW( + MA.BinOp, Ptr, Val, MA.Align, MA.Ordering, MA.SyncScope); + AtomicRMW->setVolatile(MA.IsVolatile); + + if (IsVector) { + RetVal = IRB.CreateInsertElement(IVs[0], AtomicRMW, Idx, "retvec"); + } else { + RetVal = AtomicRMW; + } } IRB.CreateBr(ElseBB); @@ -1089,6 +1179,15 @@ bool VectorizationContext::emitMaskedAtomicRMWBody( } IVsNext[0] = RetVal; + if (IsCmpXchg) { + auto *MergePhi = + IRB.CreatePHI(RetSuccessVal->getType(), 2, "mergesuccess"); + MergePhi->addIncoming(IVs[1], BB); + MergePhi->addIncoming(RetSuccessVal, IfBB); + RetSuccessVal = MergePhi; + IVsNext[1] = RetSuccessVal; + } + // Move the exit block right to the end of the function. 
ExitBB->moveAfter(ElseBB); @@ -1100,11 +1199,22 @@ bool VectorizationContext::emitMaskedAtomicRMWBody( Opts.IVs.push_back(PoisonValue::get(MA.ValTy)); Opts.loopIVNames.push_back("retvec.prev"); } - compiler::utils::createLoop(EntryBB, ExitBB, IdxStart, IdxEnd, Opts, + if (IsCmpXchg) { + Opts.IVs.push_back(PoisonValue::get(MaskArg->getType())); + Opts.loopIVNames.push_back("retsuccess.prev"); + } + compiler::utils::createLoop(LoopEntryBB, ExitBB, IdxStart, IdxEnd, Opts, CreateLoopBody); B.SetInsertPoint(ExitBB); - B.CreateRet(RetVal); + if (IsCmpXchg) { + Value *RetStruct = PoisonValue::get(F.getReturnType()); + RetStruct = B.CreateInsertValue(RetStruct, RetVal, 0); + RetStruct = B.CreateInsertValue(RetStruct, RetSuccessVal, 1); + B.CreateRet(RetStruct); + } else { + B.CreateRet(RetVal); + } return true; } diff --git a/modules/compiler/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll b/modules/compiler/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll index 7558c2907..85b4c865d 100644 --- a/modules/compiler/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll +++ b/modules/compiler/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll @@ -14,39 +14,62 @@ ; ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -; RUN: veczc -w 4 -vecz-scalable -vecz-passes=packetizer,verify \ -; RUN: --pass-remarks-missed=vecz -S < %s 2>&1 | FileCheck %s +; RUN: veczc -w 4 -vecz-scalable -vecz-passes=packetizer,verify -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "spir64-unknown-unknown" -; Note: we can't currently scalably packetize this kernel, due to the struct -; type. -; CHECK: Vecz: Could not packetize %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 +; CHECK: define spir_kernel void @__vecz_nxv4_test_fn(ptr %p, ptr %q, ptr %r) define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) { entry: +; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement poison, ptr %p, i64 0 +; CHECK: [[SPLAT_PTR:%.*]] = shufflevector [[SPLAT_PTR_INS]], poison, zeroinitializer %call = call i64 @__mux_get_global_id(i32 0) +; Test that this cmpxchg is packetized by generating a call to an all-true masked version. 
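+; Since the original cmpxchg was unmasked, the mask operand is a splat of i1 true, and
+; the builtin returns the packetized struct { <vscale x 4 x i32>, <vscale x 4 x i1> }.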
+; CHECK: [[A0:%.*]] = call { , } @__vecz_b_nxv4_masked_cmpxchg_align4_acquire_monotonic_1_u9nxv4u3ptru5nxv4ju5nxv4ju5nxv4b( +; CHECK-SAME: [[SPLAT_PTR]], +; CHECK-SAME: shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-SAME: shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) +; CHECK-SAME: shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic +; CHECK: [[EXT0:%.*]] = extractvalue { , } [[A0]], 0 %val0 = extractvalue { i32, i1 } %old0, 0 +; CHECK: [[EXT1:%.*]] = extractvalue { , } [[A0]], 1 %success0 = extractvalue { i32, i1 } %old0, 1 %out = getelementptr i32, ptr %q, i64 %call +; Stored as a vector +; CHECK: store [[EXT0]], ptr store i32 %val0, ptr %out, align 4 +; CHECK: [[PTR:%.*]] = getelementptr i8, ptr %r, i64 %call %outsuccess = getelementptr i8, ptr %r, i64 %call +; CHECK: [[ZEXT0:%.*]] = zext [[EXT1]] to %outbyte = zext i1 %success0 to i8 +; Stored as a vector +; CHECK: store [[ZEXT0]], ptr [[PTR]], align 1 store i8 %outbyte, ptr %outsuccess, align 1 ; Test a couple of insert/extract patterns ; Test inserting a uniform value into a varying literal struct +; CHECK: [[INS0:%.*]] = insertvalue { , } [[A0]], zeroinitializer, 1 +; CHECK: [[EXT2:%.*]] = extractvalue { , } [[INS0]], 1 +; CHECK: [[ZEXT1:%.*]] = zext [[EXT2]] to +; CHECK: store [[ZEXT1]], ptr [[PTR]], align 1 %testinsertconst = insertvalue { i32, i1 } %old0, i1 false, 1 %testextract0 = extractvalue { i32, i1 } %testinsertconst, 1 %outbyte0 = zext i1 %testextract0 to i8 store i8 %outbyte0, ptr %outsuccess, align 1 ; Test inserting a varying value into a varying literal struct +; CHECK: [[LD:%.*]] = load , ptr +; CHECK: [[VBOOL:%.*]] = trunc [[LD]] to +; CHECK: [[INS1:%.*]] = insertvalue { , } [[A0]], [[VBOOL]], 1 +; CHECK: [[EXT3:%.*]] = extractvalue { , } [[INS1]], 1 +; CHECK: [[ZEXT2:%.*]] = zext [[EXT3]] to +; CHECK: store [[ZEXT2]], ptr [[PTR]], align 1 %byte1 = load i8, ptr %outsuccess, align 1 %bool1 = trunc i8 %byte1 to i1 %testinsertvarying0 = insertvalue { i32, i1 } %old0, i1 %bool1, 1 @@ -55,6 +78,10 @@ entry: store i8 %outbyte1, ptr %outsuccess, align 1 ; Test inserting a varying value into a uniform literal struct +; CHECK: [[INS2:%.*]] = insertvalue { , } poison, [[VBOOL]], 1 +; CHECK: [[EXT4:%.*]] = extractvalue { , } [[INS2]], 1 +; CHECK: [[ZEXT3:%.*]] = zext [[EXT4]] to +; CHECK: store [[ZEXT3]], ptr [[PTR]], align 1 %testinsertvarying1 = insertvalue { i32, i1 } poison, i1 %bool1, 1 %testextract2 = extractvalue { i32, i1 } %testinsertvarying1, 1 %outbyte2 = zext i1 %testextract2 to i8 diff --git a/modules/compiler/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll b/modules/compiler/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll new file mode 100644 index 000000000..35a478caa --- /dev/null +++ b/modules/compiler/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll @@ -0,0 +1,106 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=define-builtins,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_fn( %p) { + %ret0 = call @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b( %p, zeroinitializer, zeroinitializer, i32 4) + %ret1 = call { , } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b( %p, zeroinitializer, zeroinitializer, zeroinitializer, i32 4) + ret void +} + +declare @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b( %p, %val, %mask, i32 %vl) + +declare { , } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b( %p, %cmp, %newval, %mask, i32 %vl) + +; CHECK: define @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b( %p, %val, %mask, i32 %vl) { +; CHECK: entry: +; CHECK: [[VLZERO:%.*]] = icmp eq i32 %vl, 0 +; CHECK: br i1 [[VLZERO]], label %earlyexit, label %loopentry + +; CHECK: earlyexit: +; CHECK: ret poison + +; CHECK: loopentry: +; CHECK: br label %loopIR + +; CHECK: loopIR: +; CHECK: [[IDX:%.*]] = phi i32 [ 0, %loopentry ], [ [[INC:%.*]], %if.else ] +; CHECK: [[RET_PREV:%.*]] = phi [ poison, %loopentry ], [ [[MERGE:%.*]], %if.else ] +; CHECK: [[MASKELT:%.*]] = extractelement %mask, i32 [[IDX]] +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[PTR:%.*]] = extractelement %p, i32 [[IDX]] +; CHECK: [[VAL:%.*]] = extractelement %val, i32 [[IDX]] +; CHECK: [[ATOM:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VAL]] acquire, align 4 +; CHECK: [[RET_NEXT:%.*]] = insertelement [[RET_PREV]], i32 [[ATOM]], i32 [[IDX]] +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[MERGE:%.*]] = phi [ [[RET_PREV]], %loopIR ], [ [[RET_NEXT]], %if.then ] +; CHECK: [[INC]] = add i32 [[IDX]], 1 +; CHECK: [[CMP:%.*]] = icmp ult i32 [[INC]], %vl +; CHECK: br i1 [[CMP]], label %loopIR, label %exit + +; CHECK: exit: +; CHECK: ret [[MERGE]] + +; CHECK: define { , } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b( %p, %cmp, %newval, %mask, i32 %vl) { +; CHECK: entry: +; CHECK: [[VLZERO:%.*]] = icmp eq i32 %vl, 0 +; CHECK: br i1 [[VLZERO]], label %earlyexit, label %loopentry + +; CHECK: earlyexit: +; CHECK: ret { , } poison + +; CHECK: loopentry: +; CHECK: br label %loopIR + +; CHECK: loopIR: +; CHECK: [[IDX:%.*]] = phi i32 [ 0, %loopentry ], [ [[INC:%.*]], %if.else ] +; CHECK: [[RET_PREV:%.*]] = phi [ poison, %loopentry ], [ [[MERGE:%.*]], %if.else ] +; CHECK: [[SUCCESS_PREV:%.*]] = phi [ poison, %loopentry ], [ [[MERGE_SUCCESS:%.*]], %if.else ] +; CHECK: [[MASKELT:%.*]] = extractelement %mask, i32 [[IDX]] +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[PTR:%.*]] = extractelement %p, i32 [[IDX]] +; CHECK: [[CMP:%.*]] = extractelement %cmp, i32 [[IDX]] +; CHECK: [[NEWVAL:%.*]] = extractelement %newval, i32 [[IDX]] +; CHECK: [[ATOM:%.*]] = cmpxchg ptr [[PTR]], i32 [[CMP]], i32 [[NEWVAL]] acquire acquire, align 4 +; CHECK: [[EXT0:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0 +; CHECK: [[RET:%.*]] = insertelement 
[[RET_PREV]], i32 [[EXT0]], i32 [[IDX]] +; CHECK: [[EXT1:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1 +; CHECK: [[SUCCESS:%.*]] = insertelement [[SUCCESS_PREV]], i1 [[EXT1]], i32 [[IDX]] +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[MERGE:%.*]] = phi [ [[RET_PREV]], %loopIR ], [ [[RET]], %if.then ] +; CHECK: [[MERGE_SUCCESS:%.*]] = phi [ [[SUCCESS_PREV]], %loopIR ], [ [[SUCCESS]], %if.then ] +; CHECK: [[INC]] = add i32 [[IDX]], 1 +; CHECK: [[CMP:%.*]] = icmp ult i32 [[INC]], %vl +; CHECK: br i1 [[CMP]], label %loopIR, label %exit + +; CHECK: exit: +; CHECK: [[RETTMP:%.*]] = insertvalue { , } poison, [[MERGE]], 0 +; CHECK: [[RETVAL:%.*]] = insertvalue { , } [[RETTMP]], [[MERGE_SUCCESS]], 1 +; CHECK: ret { , } [[RETVAL]] diff --git a/modules/compiler/vecz/test/lit/llvm/cmpxchg.ll b/modules/compiler/vecz/test/lit/llvm/cmpxchg.ll index d62486409..bf2175364 100644 --- a/modules/compiler/vecz/test/lit/llvm/cmpxchg.ll +++ b/modules/compiler/vecz/test/lit/llvm/cmpxchg.ll @@ -22,51 +22,29 @@ target triple = "spir64-unknown-unknown" ; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p, ptr %q, ptr %r) define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) { entry: +; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0 +; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer %call = call i64 @__mux_get_global_id(i32 0) -; Test that this cmpxchg is scalarized. Not ideal, but hey. -; CHECK: [[A0:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 -; CHECK: [[A1:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 -; CHECK: [[A2:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 -; CHECK: [[A3:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 - -; Then we insert the values into a strange struct -; CHECK: [[INS0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[A0]], 0 -; CHECK: [[INS1:%.*]] = insertvalue [4 x { i32, i1 }] [[INS0]], { i32, i1 } [[A1]], 1 -; CHECK: [[INS2:%.*]] = insertvalue [4 x { i32, i1 }] [[INS1]], { i32, i1 } [[A2]], 2 -; CHECK: [[INS3:%.*]] = insertvalue [4 x { i32, i1 }] [[INS2]], { i32, i1 } [[A3]], 3 +; Test that this cmpxchg is packetized by generating a call to an all-true masked version. +; CHECK: [[A0:%.*]] = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> , +; CHECK-SAME: <4 x i32> , +; CHECK-SAME: <4 x i1> %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic - -; To extract from this result, we extract each element individually then insert -; each into a vector. 
-; CHECK: [[ELT0_0_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 0, 0 -; CHECK: [[ELT0_0_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 1, 0 -; CHECK: [[ELT0_0_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 2, 0 -; CHECK: [[ELT0_0_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 3, 0 -; CHECK: [[INS0V0_0:%.*]] = insertelement <4 x i32> undef, i32 [[ELT0_0_0]], i32 0 -; CHECK: [[INS0V0_1:%.*]] = insertelement <4 x i32> [[INS0V0_0]], i32 [[ELT0_0_1]], i32 1 -; CHECK: [[INS0V0_2:%.*]] = insertelement <4 x i32> [[INS0V0_1]], i32 [[ELT0_0_2]], i32 2 -; CHECK: [[INS0V0_3:%.*]] = insertelement <4 x i32> [[INS0V0_2]], i32 [[ELT0_0_3]], i32 3 +; CHECK: [[EXT0:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[A0]], 0 %val0 = extractvalue { i32, i1 } %old0, 0 -; Same again here -; CHECK: [[ELT1_0_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 0, 1 -; CHECK: [[ELT1_0_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 1, 1 -; CHECK: [[ELT1_0_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 2, 1 -; CHECK: [[ELT1_0_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 3, 1 -; CHECK: [[INS1V0_0:%.*]] = insertelement <4 x i1> undef, i1 [[ELT1_0_0]], i32 0 -; CHECK: [[INS1V0_1:%.*]] = insertelement <4 x i1> [[INS1V0_0]], i1 [[ELT1_0_1]], i32 1 -; CHECK: [[INS1V0_2:%.*]] = insertelement <4 x i1> [[INS1V0_1]], i1 [[ELT1_0_2]], i32 2 -; CHECK: [[INS1V0_3:%.*]] = insertelement <4 x i1> [[INS1V0_2]], i1 [[ELT1_0_3]], i32 3 +; CHECK: [[EXT1:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[A0]], 1 %success0 = extractvalue { i32, i1 } %old0, 1 %out = getelementptr i32, ptr %q, i64 %call ; Stored as a vector -; CHECK: store <4 x i32> [[INS0V0_3]], ptr +; CHECK: store <4 x i32> [[EXT0]], ptr store i32 %val0, ptr %out, align 4 ; CHECK: [[PTR:%.*]] = getelementptr i8, ptr %r, i64 %call %outsuccess = getelementptr i8, ptr %r, i64 %call -; CHECK: [[ZEXT0:%.*]] = zext <4 x i1> [[INS1V0_3]] to <4 x i8> +; CHECK: [[ZEXT0:%.*]] = zext <4 x i1> [[EXT1]] to <4 x i8> %outbyte = zext i1 %success0 to i8 ; Stored as a vector ; CHECK: store <4 x i8> [[ZEXT0]], ptr [[PTR]], align 1 @@ -74,25 +52,10 @@ entry: ; Test a couple of insert/extract patterns -; Test inserting a uniform value into a varying literal struct -; This is very inefficient -; CHECK: [[INSS0_0:%.*]] = insertvalue { i32, i1 } [[A0]], i1 false, 1 -; CHECK: [[INSS0_1:%.*]] = insertvalue { i32, i1 } [[A1]], i1 false, 1 -; CHECK: [[INSS0_2:%.*]] = insertvalue { i32, i1 } [[A2]], i1 false, 1 -; CHECK: [[INSS0_3:%.*]] = insertvalue { i32, i1 } [[A3]], i1 false, 1 -; CHECK: [[INSS1_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS0_0]], 0 -; CHECK: [[INSS1_1:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_0]], { i32, i1 } [[INSS0_1]], 1 -; CHECK: [[INSS1_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_1]], { i32, i1 } [[INSS0_2]], 2 -; CHECK: [[INSS1_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_2]], { i32, i1 } [[INSS0_3]], 3 -; CHECK: [[EXTS1_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 0, 1 -; CHECK: [[EXTS1_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 1, 1 -; CHECK: [[EXTS1_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 2, 1 -; CHECK: [[EXTS1_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 3, 1 -; CHECK: [[INS1V1_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS1_0]], i32 0 -; CHECK: [[INS1V1_1:%.*]] = insertelement <4 x i1> [[INS1V1_0]], i1 [[EXTS1_1]], i32 1 -; CHECK: [[INS1V1_2:%.*]] = insertelement <4 x i1> [[INS1V1_1]], i1 [[EXTS1_2]], i32 2 -; CHECK: [[INS1V1_3:%.*]] = insertelement <4 x i1> [[INS1V1_2]], 
i1 [[EXTS1_3]], i32 3 -; CHECK: [[ZEXT1:%.*]] = zext <4 x i1> [[INS1V1_3]] to <4 x i8> + ; Test inserting a uniform value into a varying literal struct +; CHECK: [[INS0:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[A0]], <4 x i1> zeroinitializer, 1 +; CHECK: [[EXT2:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS0]], 1 +; CHECK: [[ZEXT1:%.*]] = zext <4 x i1> [[EXT2]] to <4 x i8> ; CHECK: store <4 x i8> [[ZEXT1]], ptr [[PTR]], align 1 %testinsertconst = insertvalue { i32, i1 } %old0, i1 false, 1 %testextract0 = extractvalue { i32, i1 } %testinsertconst, 1 @@ -100,29 +63,11 @@ entry: store i8 %outbyte0, ptr %outsuccess, align 1 ; Test inserting a varying value into a varying literal struct -; CHECK: [[V4I8_LD:%.*]] = load <4 x i8>, ptr %outsuccess, align 1 -; CHECK: [[TRUNC:%.*]] = trunc <4 x i8> [[V4I8_LD]] to <4 x i1> -; CHECK: [[EXTV0_0:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 0 -; CHECK: [[EXTV0_1:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 1 -; CHECK: [[EXTV0_2:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 2 -; CHECK: [[EXTV0_3:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 3 -; CHECK: [[INSS2_0:%.*]] = insertvalue { i32, i1 } [[A0]], i1 [[EXTV0_0]], 1 -; CHECK: [[INSS2_1:%.*]] = insertvalue { i32, i1 } [[A1]], i1 [[EXTV0_1]], 1 -; CHECK: [[INSS2_2:%.*]] = insertvalue { i32, i1 } [[A2]], i1 [[EXTV0_2]], 1 -; CHECK: [[INSS2_3:%.*]] = insertvalue { i32, i1 } [[A3]], i1 [[EXTV0_3]], 1 -; CHECK: [[INSS3_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS2_0]], 0 -; CHECK: [[INSS3_1:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_0]], { i32, i1 } [[INSS2_1]], 1 -; CHECK: [[INSS3_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_1]], { i32, i1 } [[INSS2_2]], 2 -; CHECK: [[INSS3_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_2]], { i32, i1 } [[INSS2_3]], 3 -; CHECK: [[EXTS3_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 0, 1 -; CHECK: [[EXTS3_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 1, 1 -; CHECK: [[EXTS3_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 2, 1 -; CHECK: [[EXTS3_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 3, 1 -; CHECK: [[INS1V2_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS3_0]], i32 0 -; CHECK: [[INS1V2_1:%.*]] = insertelement <4 x i1> [[INS1V2_0]], i1 [[EXTS3_1]], i32 1 -; CHECK: [[INS1V2_2:%.*]] = insertelement <4 x i1> [[INS1V2_1]], i1 [[EXTS3_2]], i32 2 -; CHECK: [[INS1V2_3:%.*]] = insertelement <4 x i1> [[INS1V2_2]], i1 [[EXTS3_3]], i32 3 -; CHECK: [[ZEXT2:%.*]] = zext <4 x i1> [[INS1V2_3]] to <4 x i8> +; CHECK: [[LD:%.*]] = load <4 x i8>, ptr +; CHECK: [[VBOOL:%.*]] = trunc <4 x i8> [[LD]] to <4 x i1> +; CHECK: [[INS1:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[A0]], <4 x i1> [[VBOOL]], 1 +; CHECK: [[EXT3:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS1]], 1 +; CHECK: [[ZEXT2:%.*]] = zext <4 x i1> [[EXT3]] to <4 x i8> ; CHECK: store <4 x i8> [[ZEXT2]], ptr [[PTR]], align 1 %byte1 = load i8, ptr %outsuccess, align 1 %bool1 = trunc i8 %byte1 to i1 @@ -132,23 +77,9 @@ entry: store i8 %outbyte1, ptr %outsuccess, align 1 ; Test inserting a varying value into a uniform literal struct -; CHECK: [[INSS4_0:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_0]], 1 -; CHECK: [[INSS4_1:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_1]], 1 -; CHECK: [[INSS4_2:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_2]], 1 -; CHECK: [[INSS4_3:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_3]], 1 -; CHECK: [[INSS5_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS4_0]], 0 -; CHECK: [[INSS5_1:%.*]] = 
insertvalue [4 x { i32, i1 }] [[INSS5_0]], { i32, i1 } [[INSS4_1]], 1 -; CHECK: [[INSS5_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS5_1]], { i32, i1 } [[INSS4_2]], 2 -; CHECK: [[INSS5_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS5_2]], { i32, i1 } [[INSS4_3]], 3 -; CHECK: [[EXTS5_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 0, 1 -; CHECK: [[EXTS5_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 1, 1 -; CHECK: [[EXTS5_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 2, 1 -; CHECK: [[EXTS5_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 3, 1 -; CHECK: [[INS2V3_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS5_0]], i32 0 -; CHECK: [[INS2V3_1:%.*]] = insertelement <4 x i1> [[INS2V3_0]], i1 [[EXTS5_1]], i32 1 -; CHECK: [[INS2V3_2:%.*]] = insertelement <4 x i1> [[INS2V3_1]], i1 [[EXTS5_2]], i32 2 -; CHECK: [[INS2V3_3:%.*]] = insertelement <4 x i1> [[INS2V3_2]], i1 [[EXTS5_3]], i32 3 -; CHECK: [[ZEXT3:%.*]] = zext <4 x i1> [[INS2V3_3]] to <4 x i8> +; CHECK: [[INS2:%.*]] = insertvalue { <4 x i32>, <4 x i1> } poison, <4 x i1> [[VBOOL]], 1 +; CHECK: [[EXT4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS2]], 1 +; CHECK: [[ZEXT3:%.*]] = zext <4 x i1> [[EXT4]] to <4 x i8> ; CHECK: store <4 x i8> [[ZEXT3]], ptr [[PTR]], align 1 %testinsertvarying1 = insertvalue { i32, i1 } poison, i1 %bool1, 1 %testextract2 = extractvalue { i32, i1 } %testinsertvarying1, 1 diff --git a/modules/compiler/vecz/test/lit/llvm/diverging_atomic.ll b/modules/compiler/vecz/test/lit/llvm/diverging_atomic.ll deleted file mode 100644 index b6beaae1e..000000000 --- a/modules/compiler/vecz/test/lit/llvm/diverging_atomic.ll +++ /dev/null @@ -1,46 +0,0 @@ -; Copyright (C) Codeplay Software Limited -; -; Licensed under the Apache License, Version 2.0 (the "License") with LLVM -; Exceptions; you may not use this file except in compliance with the License. -; You may obtain a copy of the License at -; -; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt -; -; Unless required by applicable law or agreed to in writing, software -; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -; License for the specific language governing permissions and limitations -; under the License. 
-; -; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -; RUN: veczc -w 4 -vecz-passes=cfg-convert,verify -S \ -; RUN: --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s - -target triple = "spir64-unknown-unknown" -target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" - -; CHECK: Vecz: Could not apply masks for function "kernel" -; CHECK-NEXT: note: Could not apply mask to atomic instruction -; CHECK-SAME: atomic_success = cmpxchg ptr %arrayidx.in, i32 2, i32 4 acq_rel monotonic, align 4 - -define spir_kernel void @kernel(ptr %in, ptr %out) { -entry: - %gid = tail call i64 @__mux_get_global_id(i32 0) - %cmp = icmp eq i64 %gid, 0 - br i1 %cmp, label %if.then, label %end - -if.then: - %arrayidx.in = getelementptr inbounds i32, ptr %in, i64 %gid - %atomic_success = cmpxchg ptr %arrayidx.in, i32 2, i32 4 acq_rel monotonic, align 4 - %atomic = extractvalue { i32, i1 } %atomic_success, 0 - br label %end - -end: - %merge = phi i32 [ 0, %entry ], [ %atomic, %if.then ] - %arrayidx.out = getelementptr inbounds i32, ptr %out, i64 %gid - store i32 %merge, ptr %arrayidx.out, align 4 - ret void -} - -declare i64 @__mux_get_global_id(i32) diff --git a/modules/compiler/vecz/test/lit/llvm/masked_cmpxchg.ll b/modules/compiler/vecz/test/lit/llvm/masked_cmpxchg.ll new file mode 100644 index 000000000..73aec6dfc --- /dev/null +++ b/modules/compiler/vecz/test/lit/llvm/masked_cmpxchg.ll @@ -0,0 +1,105 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=cfg-convert,verify,packetizer,define-builtins,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p, ptr %q, ptr %r) +define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) { +entry: +; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0 +; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> , + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp sgt i64 3, %call +; CHECK: [[VEC_PTR:%.*]] = getelementptr i32, ptr %p, <4 x i64> + %wi_p_i32 = getelementptr i32, ptr %p, i64 %call + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry +; CHECK: [[CALL:%.*]] = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> , +; CHECK-SAME: <4 x i32> , <4 x i1> [[CMP]] + %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic + %val0 = extractvalue { i32, i1 } %old0, 0 + %success0 = extractvalue { i32, i1 } %old0, 1 + + %out = getelementptr i32, ptr %q, i64 %call + store i32 %val0, ptr %out, align 4 + + %outsuccess = getelementptr i8, ptr %r, i64 %call + %outbyte = zext i1 %success0 to i8 + store i8 %outbyte, ptr %outsuccess, align 1 + + ; Test a couple of insert/extract patterns +; CHECK: [[INS:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[CALL]], <4 x i1> [[CMP]], 1 +; CHECK: [[EXT:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS]], 1 + %testinsert = insertvalue { i32, i1 } %old0, i1 %cmp, 1 + %testextract = extractvalue { i32, i1 } %testinsert, 1 + + %outbyte0 = zext i1 %testextract to i8 + store i8 %outbyte0, ptr %outsuccess, align 1 + +; CHECK: = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_weak_volatile_align8_monotonic_seqcst_0_Dv4_u3ptrDv4_jDv4_jDv4_b( + %old1 = cmpxchg weak volatile ptr %wi_p_i32, i32 1, i32 2 syncscope("singlethread") monotonic seq_cst, align 8 + + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; CHECK: define { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[CMPS:%1]], <4 x i32> [[NEWS:%2]], <4 x i1> [[MASK:%3]]) { +; CHECK: entry: +; CHECK: br label %loopIR + +; CHECK: loopIR: +; CHECK: [[IDX:%.*]] = phi i32 [ 0, %entry ], [ [[IDX_NEXT:%.*]], %if.else ] +; CHECK: [[PREV:%.*]] = phi <4 x i32> [ poison, %entry ], [ [[MERGE:%.*]], %if.else ] +; CHECK: [[PREVSUCCESS:%.*]] = phi <4 x i1> [ poison, %entry ], [ [[MERGESUCCESS:%.*]], %if.else ] +; CHECK: [[MASKELT:%.*]] = extractelement <4 x i1> [[MASK]], i32 [[IDX]] +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[PTR:%.*]] = extractelement <4 x ptr> [[PTRS]], i32 [[IDX]] +; CHECK: [[CMP:%.*]] = extractelement <4 x i32> [[CMPS]], i32 [[IDX]] +; CHECK: [[NEW:%.*]] = extractelement <4 x i32> [[NEWS]], i32 [[IDX]] +; CHECK: [[ATOM:%.*]] = cmpxchg ptr [[PTR]], i32 [[CMP]], i32 [[NEW]] acquire monotonic, align 4 +; CHECK: [[VAL:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0 +; CHECK: [[RET:%.*]] = insertelement <4 x i32> [[PREV]], i32 [[VAL]], i32 [[IDX]] +; CHECK: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1 +; 
CHECK: [[RETSUCCESS:%.*]] = insertelement <4 x i1> [[PREVSUCCESS]], i1 [[SUCCESS]], i32 [[IDX]] +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[MERGE]] = phi <4 x i32> [ [[PREV]], %loopIR ], [ [[RET]], %if.then ] +; CHECK: [[MERGESUCCESS]] = phi <4 x i1> [ [[PREVSUCCESS]], %loopIR ], [ [[RETSUCCESS]], %if.then ] +; CHECK: [[IDX_NEXT]] = add i32 [[IDX]], 1 + +; CHECK: exit: +; CHECK: [[INS0:%.*]] = insertvalue { <4 x i32>, <4 x i1> } poison, <4 x i32> [[MERGE]], 0 +; CHECK: [[INS1:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[INS0]], <4 x i1> [[MERGESUCCESS]], 1 +; CHECK: ret { <4 x i32>, <4 x i1> } [[INS1]] + +; Assume that all masked cmpxchg operations follow the logic above. Just +; check that the right cmpxchg instruction is being generated. +; CHECK: define { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_weak_volatile_align8_monotonic_seqcst_0_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[CMPS:%1]], <4 x i32> [[NEWS:%2]], <4 x i1> [[MASK:%3]]) { +; CHECK: cmpxchg weak volatile ptr {{%.*}}, i32 {{%.*}}, i32 {{%.*}} syncscope("singlethread") monotonic seq_cst, align 8 + +declare i64 @__mux_get_global_id(i32) diff --git a/modules/compiler/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll b/modules/compiler/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll new file mode 100644 index 000000000..831b6cca8 --- /dev/null +++ b/modules/compiler/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=define-builtins,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_fn(ptr %p) { + %ret = call { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 1, i32 2, i1 true) + ret void +} + +declare { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 %cmp, i32 %newval, i1 %mask) + +; CHECK: define { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 %cmp, i32 %newval, i1 %mask) { +; CHECK: entry: +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 %mask, false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[ATOM:%.*]] = cmpxchg ptr %p, i32 %cmp, i32 %newval acquire monotonic, align 4 +; CHECK: [[EXT0:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0 +; CHECK: [[EXT1:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1 +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[RETVAL:%.*]] = phi i32 [ poison, %entry ], [ [[EXT0]], %if.then ] +; CHECK: [[RETSUCC:%.*]] = phi i1 [ poison, %entry ], [ [[EXT1]], %if.then ] +; CHECK: br label %exit + +; CHECK: exit: +; CHECK: [[INS0:%.*]] = insertvalue { i32, i1 } poison, i32 [[RETVAL]], 0 +; CHECK: [[INS1:%.*]] = insertvalue { i32, i1 } [[INS0]], i1 [[RETSUCC]], 1 +; CHECK: ret { i32, i1 } [[INS1]]
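
Note (not part of the patch): the masked cmpxchg builtins defined above, both the v4 vector variant and the v1 scalar variant, follow the same pattern that the CHECK lines describe: each active lane performs an ordinary scalar cmpxchg and its result is merged back into the per-lane return values, while masked-off lanes perform no memory access and keep their poison placeholders. As a rough, non-normative C sketch of those semantics only — the function and type names are invented, a fixed VF of 4 and i32 elements are assumed, and the Clang/GCC __atomic_compare_exchange_n builtin stands in for the scalar `cmpxchg ... acquire monotonic` the pass actually emits per lane:

    #include <stdbool.h>
    #include <stdint.h>

    struct cmpxchg_v4_result {
      int32_t value[4];  // previous memory contents, one element per lane
      bool success[4];   // per-lane success flag
    };

    static struct cmpxchg_v4_result masked_cmpxchg_v4(int32_t *ptrs[4],
                                                      const int32_t cmp[4],
                                                      const int32_t newval[4],
                                                      const bool mask[4]) {
      struct cmpxchg_v4_result result;
      for (int lane = 0; lane < 4; ++lane) {
        if (!mask[lane]) {
          continue;  // masked-off lane: no memory access, result slot left unset
        }
        int32_t expected = cmp[lane];
        // Stand-in for the scalar cmpxchg performed for each active lane.
        result.success[lane] = __atomic_compare_exchange_n(
            ptrs[lane], &expected, newval[lane], /*weak=*/false,
            __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
        // On success this is the prior value; on failure it is the value
        // currently in memory, matching cmpxchg's first result element.
        result.value[lane] = expected;
      }
      return result;
    }

The unset result slots for inactive lanes correspond to the poison values threaded through the phi nodes in the generated IR above.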