diff --git a/modules/compiler/vecz/source/analysis/uniform_value_analysis.cpp b/modules/compiler/vecz/source/analysis/uniform_value_analysis.cpp
index 3f4f495c1..e2a696aa0 100644
--- a/modules/compiler/vecz/source/analysis/uniform_value_analysis.cpp
+++ b/modules/compiler/vecz/source/analysis/uniform_value_analysis.cpp
@@ -198,6 +198,8 @@ void UniformValueResult::findVectorLeaves(
                  Op->isMaskedScatterGatherMemOp())) {
         IsCallLeaf = true;
       }
+    } else if (Ctx.isMaskedAtomicFunction(*CI->getCalledFunction())) {
+      IsCallLeaf = true;
     }
     if (IsCallLeaf) {
       Leaves.push_back(CI);
diff --git a/modules/compiler/vecz/source/include/vectorization_context.h b/modules/compiler/vecz/source/include/vectorization_context.h
index 3d580525c..ad140dd60 100644
--- a/modules/compiler/vecz/source/include/vectorization_context.h
+++ b/modules/compiler/vecz/source/include/vectorization_context.h
@@ -153,36 +153,59 @@ class VectorizationContext {
   /// @return The masked version of the function
   llvm::Function *getOrCreateMaskedFunction(llvm::CallInst *CI);
 
-  struct MaskedAtomicRMW {
+  /// @brief Represents either an atomicrmw or cmpxchg operation.
+  ///
+  /// Most fields are shared, with the exception of CmpXchgFailureOrdering and
+  /// IsWeak, which are only to be set for cmpxchg, and BinOp, which is only to
+  /// be set to a valid value for atomicrmw.
+  struct MaskedAtomic {
     llvm::Type *PointerTy;
     llvm::Type *ValTy;
+    /// @brief Must be set to BAD_BINOP for cmpxchg instructions
     llvm::AtomicRMWInst::BinOp BinOp;
     llvm::Align Align;
     bool IsVolatile = false;
     llvm::SyncScope::ID SyncScope;
     llvm::AtomicOrdering Ordering;
+    /// @brief Must be set for cmpxchg instructions
+    std::optional<llvm::AtomicOrdering> CmpXchgFailureOrdering = std::nullopt;
+    /// @brief Must only be set for cmpxchg instructions
+    bool IsWeak = false;
     // Vectorization info
     llvm::ElementCount VF;
     bool IsVectorPredicated = false;
+
+    /// @brief Returns true if this MaskedAtomic represents a cmpxchg operation.
+    bool isCmpXchg() const {
+      if (CmpXchgFailureOrdering.has_value()) {
+        // 'binop' only applies to atomicrmw
+        assert(BinOp == llvm::AtomicRMWInst::BAD_BINOP &&
+               "Invalid MaskedAtomic state");
+        return true;
+      }
+      // 'weak' only applies to cmpxchg
+      assert(!IsWeak && "Invalid MaskedAtomic state");
+      return false;
+    }
   };
 
-  /// @brief Check if the given function is a masked version of an atomic RMW
-  /// operation.
+  /// @brief Check if the given function is a masked version of an atomicrmw or
+  /// cmpxchg operation.
   ///
   /// @param[in] F The function to check
-  /// @return A MaskedAtomicRMW instance detailing the atomic operation if the
-  /// function is a masked atomic RMW, or std::nullopt otherwise
-  std::optional<MaskedAtomicRMW> isMaskedAtomicRMWFunction(
+  /// @return A MaskedAtomic instance detailing the atomic operation if the
+  /// function is a masked atomic, or std::nullopt otherwise
+  std::optional<MaskedAtomic> isMaskedAtomicFunction(
       const llvm::Function &F) const;
 
   /// @brief Get (if it exists already) or create the function representing the
-  /// masked version of an atomic RMW operation.
+  /// masked version of an atomicrmw/cmpxchg operation.
   ///
   /// @param[in] I Atomic to be masked
   /// @param[in] Choices Choices to mangle into the function name
   /// @param[in] VF The vectorization factor of the atomic operation
   /// @return The masked version of the function
-  llvm::Function *getOrCreateMaskedAtomicRMWFunction(
-      MaskedAtomicRMW &I, const VectorizationChoices &Choices,
+  llvm::Function *getOrCreateMaskedAtomicFunction(
+      MaskedAtomic &I, const VectorizationChoices &Choices,
       llvm::ElementCount VF);
 
   /// @brief Create a VectorizationUnit to use to vectorize the given scalar
@@ -296,10 +319,9 @@ class VectorizationContext {
   /// @brief Emit the body for a masked atomic builtin
   ///
   /// @param[in] F The empty (declaration only) function to emit the body in
-  /// @param[in] MA The MaskedAtomicRMW information
+  /// @param[in] MA The MaskedAtomic information
   /// @returns true on success, false otherwise
-  bool emitMaskedAtomicRMWBody(llvm::Function &F,
-                               const MaskedAtomicRMW &MA) const;
+  bool emitMaskedAtomicBody(llvm::Function &F, const MaskedAtomic &MA) const;
 
   /// @brief Helper for non-vectorization tasks.
   TargetInfo &VTI;
diff --git a/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp b/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp
index e16eab41b..f7649b5b4 100644
--- a/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -214,15 +214,14 @@ class ControlFlowConversionState::Impl : public ControlFlowConversionState {
   /// @return true if it is valid to mask this call, false otherwise
   bool applyMaskToCall(CallInst *CI, Value *mask, DeletionMap &toDelete);
 
-  /// @brief Attempt to apply a mask to an AtomicRMW instruction via a builtin
+  /// @brief Attempt to apply a mask to an atomic instruction via a builtin
   /// call.
   ///
-  /// @param[in] atomicI The atomic instruction to apply the mask to
+  /// @param[in] I The (atomic) instruction to apply the mask to
   /// @param[in] mask The mask to apply to the masked atomic
   /// @param[out] toDelete mapping of deleted unmasked operations
   /// @return true if it is valid to mask this atomic, false otherwise
-  bool applyMaskToAtomicRMW(AtomicRMWInst &atomicI, Value *mask,
-                            DeletionMap &toDelete);
+  bool applyMaskToAtomic(Instruction &I, Value *mask, DeletionMap &toDelete);
   /// @brief Linearize a CFG.
   /// @return true if no problem occurred, false otherwise.
@@ -1138,9 +1137,7 @@ Error ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) {
       }
     } else if (I.isAtomic() && !isa<FenceInst>(&I)) {
       // Turn atomics into calls to masked builtins if possible.
-      // FIXME: We don't yet support masked cmpxchg instructions.
-      if (auto *atomicI = dyn_cast<AtomicRMWInst>(&I);
-          !atomicI || !applyMaskToAtomicRMW(*atomicI, mask, toDelete)) {
+      if (!applyMaskToAtomic(I, mask, toDelete)) {
         return makeStringError("Could not apply mask to atomic instruction", I);
       }
     } else if (auto *branch = dyn_cast<BranchInst>(&I)) {
@@ -1372,41 +1369,66 @@ bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
   return true;
 }
 
-bool ControlFlowConversionState::Impl::applyMaskToAtomicRMW(
-    AtomicRMWInst &atomicI, Value *mask, DeletionMap &toDelete) {
-  LLVM_DEBUG(dbgs() << "vecz-cf: Now at AtomicRMWInst " << atomicI << "\n");
+bool ControlFlowConversionState::Impl::applyMaskToAtomic(
+    Instruction &I, Value *mask, DeletionMap &toDelete) {
+  LLVM_DEBUG(dbgs() << "vecz-cf: Now at atomic inst " << I << "\n");
 
-  VectorizationContext::MaskedAtomicRMW MA;
-  MA.Align = atomicI.getAlign();
-  MA.BinOp = atomicI.getOperation();
-  MA.IsVectorPredicated = VU.choices().vectorPredication();
-  MA.IsVolatile = atomicI.isVolatile();
-  MA.Ordering = atomicI.getOrdering();
-  MA.SyncScope = atomicI.getSyncScopeID();
+  SmallVector<Value *> maskedFnArgs;
+  VectorizationContext::MaskedAtomic MA;
   MA.VF = ElementCount::getFixed(1);
-  MA.ValTy = atomicI.getType();
-  MA.PointerTy = atomicI.getPointerOperand()->getType();
+  MA.IsVectorPredicated = VU.choices().vectorPredication();
+
+  if (auto *atomicI = dyn_cast<AtomicRMWInst>(&I)) {
+    MA.Align = atomicI->getAlign();
+    MA.BinOp = atomicI->getOperation();
+    MA.IsVolatile = atomicI->isVolatile();
+    MA.Ordering = atomicI->getOrdering();
+    MA.SyncScope = atomicI->getSyncScopeID();
+    MA.ValTy = atomicI->getType();
+    MA.PointerTy = atomicI->getPointerOperand()->getType();
+
+    // Set up the arguments to this function
+    maskedFnArgs = {atomicI->getPointerOperand(), atomicI->getValOperand(),
+                    mask};
+
+  } else if (auto *cmpxchgI = dyn_cast<AtomicCmpXchgInst>(&I)) {
+    MA.Align = cmpxchgI->getAlign();
+    MA.BinOp = AtomicRMWInst::BAD_BINOP;
+    MA.IsWeak = cmpxchgI->isWeak();
+    MA.IsVolatile = cmpxchgI->isVolatile();
+    MA.Ordering = cmpxchgI->getSuccessOrdering();
+    MA.CmpXchgFailureOrdering = cmpxchgI->getFailureOrdering();
+    MA.SyncScope = cmpxchgI->getSyncScopeID();
+    MA.ValTy = cmpxchgI->getCompareOperand()->getType();
+    MA.PointerTy = cmpxchgI->getPointerOperand()->getType();
+
+    // Set up the arguments to this function
+    maskedFnArgs = {cmpxchgI->getPointerOperand(),
+                    cmpxchgI->getCompareOperand(), cmpxchgI->getNewValOperand(),
+                    mask};
+  } else {
+    return false;
+  }
+
   // Create the new function and replace the old one with it
   // Get the masked function
-  Function *newFunction = Ctx.getOrCreateMaskedAtomicRMWFunction(
+  Function *maskedAtomicFn = Ctx.getOrCreateMaskedAtomicFunction(
       MA, VU.choices(), ElementCount::getFixed(1));
-  VECZ_FAIL_IF(!newFunction);
-  SmallVector<Value *> fnArgs = {atomicI.getPointerOperand(),
-                                 atomicI.getValOperand(), mask};
+  VECZ_FAIL_IF(!maskedAtomicFn);
 
   // We don't have a vector length just yet - pass in one as a dummy.
   if (MA.IsVectorPredicated) {
-    fnArgs.push_back(
-        ConstantInt::get(IntegerType::getInt32Ty(atomicI.getContext()), 1));
+    maskedFnArgs.push_back(
+        ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 1));
   }
 
-  CallInst *newCI = CallInst::Create(newFunction, fnArgs, "", &atomicI);
-  VECZ_FAIL_IF(!newCI);
+  CallInst *maskedCI = CallInst::Create(maskedAtomicFn, maskedFnArgs, "", &I);
+  VECZ_FAIL_IF(!maskedCI);
 
-  atomicI.replaceAllUsesWith(newCI);
-  toDelete.emplace_back(&atomicI, newCI);
+  I.replaceAllUsesWith(maskedCI);
+  toDelete.emplace_back(&I, maskedCI);
 
-  LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << atomicI << "\n");
-  LLVM_DEBUG(dbgs() << "          with " << *newCI << "\n");
+  LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << I << "\n");
+  LLVM_DEBUG(dbgs() << "          with " << *maskedCI << "\n");
 
   return true;
 }
diff --git a/modules/compiler/vecz/source/transform/packetization_helpers.cpp b/modules/compiler/vecz/source/transform/packetization_helpers.cpp
index 11e954e73..ad08844ad 100644
--- a/modules/compiler/vecz/source/transform/packetization_helpers.cpp
+++ b/modules/compiler/vecz/source/transform/packetization_helpers.cpp
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -46,6 +47,18 @@ using namespace vecz;
 namespace {
 inline Type *getWideType(Type *ty, ElementCount factor) {
   if (!ty->isVectorTy()) {
+    // The wide type of a struct literal is the wide type of each of its
+    // elements.
+    if (auto *structTy = dyn_cast<StructType>(ty);
+        structTy && structTy->isLiteral()) {
+      SmallVector<Type *> wideElts(structTy->elements());
+      for (unsigned i = 0, e = wideElts.size(); i != e; i++) {
+        wideElts[i] = getWideType(wideElts[i], factor);
+      }
+      return StructType::get(ty->getContext(), wideElts);
+    } else if (structTy) {
+      VECZ_ERROR("Can't create wide type for structure type");
+    }
     return VectorType::get(ty, factor);
   }
   bool const isScalable = isa<ScalableVectorType>(ty);
@@ -694,7 +707,9 @@ const Packetizer::Result &Packetizer::Result::broadcast(unsigned width) const {
   auto &F = packetizer.F;
   Value *result = nullptr;
   const auto &TI = packetizer.context().targetInfo();
-  if (isa<UndefValue>(scalar)) {
+  if (isa<PoisonValue>(scalar)) {
+    result = PoisonValue::get(getWideType(ty, factor));
+  } else if (isa<UndefValue>(scalar)) {
     result = UndefValue::get(getWideType(ty, factor));
   } else if (ty->isVectorTy() && factor.isScalable()) {
     IRBuilder<> B(buildAfter(scalar, F));
diff --git a/modules/compiler/vecz/source/transform/packetizer.cpp b/modules/compiler/vecz/source/transform/packetizer.cpp
index 9b80bca86..3191f4d86 100644
--- a/modules/compiler/vecz/source/transform/packetizer.cpp
+++ b/modules/compiler/vecz/source/transform/packetizer.cpp
@@ -302,14 +302,14 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized instruction.
   ValuePacket packetizeMemOp(MemOp &Op);
-  /// @brief Packetize a masked atomic RMW operation.
+  /// @brief Packetize a masked atomicrmw or cmpxchg operation.
   ///
-  /// @param[in] CI Masked atomic RMW builtin call to packetize.
-  /// @param[in] AtomicInfo Information about the masked atomic RMW.
+  /// @param[in] CI Masked atomic builtin call to packetize.
+  /// @param[in] AtomicInfo Information about the masked atomic.
   ///
   /// @return Packetized instruction.
-  ValuePacket packetizeMaskedAtomicRMW(
-      CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo);
+  ValuePacket packetizeMaskedAtomic(
+      CallInst &CI, VectorizationContext::MaskedAtomic AtomicInfo);
   /// @brief Packetize a GEP instruction.
   ///
   /// @param[in] GEP Instruction to packetize.
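For reference, the overall lowering is: a scalar cmpxchg under a mask is rewritten into a call to an internal masked builtin (control_flow_conversion_pass.cpp above), and the packetizer then widens that call by the vectorization factor (packetizeMaskedAtomic below). The sketch that follows is illustrative only; the operand names %ptrs, %cmps, %news and %mask are placeholders, while the builtin name and the { <4 x i32>, <4 x i1> } return type are the ones checked by the cmpxchg.ll test later in this patch:

  ; scalar, unmasked form
  %old = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic

  ; masked and packetized by a factor of 4 (illustrative operands)
  %old.v = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> %ptrs, <4 x i32> %cmps, <4 x i32> %news, <4 x i1> %mask)
  %val.v = extractvalue { <4 x i32>, <4 x i1> } %old.v, 0
  %ok.v = extractvalue { <4 x i32>, <4 x i1> } %old.v, 1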
@@ -334,6 +334,12 @@ class Packetizer::Impl : public Packetizer { /// /// @return Packetized instruction. ValuePacket packetizeFreeze(FreezeInst *FreezeI); + /// @brief Packetize an atomic cmpxchg instruction. + /// + /// @param[in] AtomicI Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeAtomicCmpXchg(AtomicCmpXchgInst *AtomicI); /// @brief Packetize a unary operator instruction. /// /// @param[in] UnOp Instruction to packetize. @@ -402,6 +408,22 @@ class Packetizer::Impl : public Packetizer { /// /// @return Packetized instruction. ValuePacket packetizeExtractElement(ExtractElementInst *ExtractElement); + /// @brief Packetize an insert value instruction. + /// + /// Only packetizes inserts into literal struct types. + /// + /// @param[in] InsertValue Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeInsertValue(InsertValueInst *InsertValue); + /// @brief Packetize an extract value instruction. + /// + /// Only packetizes extracts from literal struct types. + /// + /// @param[in] ExtractValue Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeExtractValue(ExtractValueInst *ExtractValue); /// @brief Packetize a shuffle vector instruction. /// /// @param[in] Shuffle Instruction to packetize. @@ -1157,12 +1179,21 @@ Packetizer::Result Packetizer::Impl::packetizeInstruction(Instruction *Ins) { case Instruction::ExtractElement: results = packetizeExtractElement(cast(Ins)); break; + case Instruction::InsertValue: + results = packetizeInsertValue(cast(Ins)); + break; + case Instruction::ExtractValue: + results = packetizeExtractValue(cast(Ins)); + break; case Instruction::ShuffleVector: results = packetizeShuffleVector(cast(Ins)); break; case Instruction::Freeze: results = packetizeFreeze(cast(Ins)); break; + case Instruction::AtomicCmpXchg: + results = packetizeAtomicCmpXchg(cast(Ins)); + break; } if (auto res = getPacketizationResult(Ins, results, /*update stats*/ true)) { @@ -2102,8 +2133,8 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) { return packetizeMemOp(*MaskedOp); } } - if (auto AtomicInfo = Ctx.isMaskedAtomicRMWFunction(*Callee)) { - return packetizeMaskedAtomicRMW(*CI, *AtomicInfo); + if (auto AtomicInfo = Ctx.isMaskedAtomicFunction(*Callee)) { + return packetizeMaskedAtomic(*CI, *AtomicInfo); } } @@ -2778,16 +2809,18 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) { return results; } -ValuePacket Packetizer::Impl::packetizeMaskedAtomicRMW( - CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo) { +ValuePacket Packetizer::Impl::packetizeMaskedAtomic( + CallInst &CI, VectorizationContext::MaskedAtomic AtomicInfo) { ValuePacket results; - Value *const ptr = CI.getArgOperand(0); - Value *const val = CI.getArgOperand(1); - Value *const mask = CI.getArgOperand(2); + bool const IsCmpXchg = AtomicInfo.isCmpXchg(); - assert(AtomicInfo.ValTy == val->getType() && "AtomicInfo mismatch"); - auto const packetWidth = getPacketWidthForType(val->getType()); + Value *const ptrArg = CI.getArgOperand(0); + Value *const valOrCmpArg = CI.getArgOperand(1); + Value *const maskArg = CI.getArgOperand(2 + IsCmpXchg); + + assert(AtomicInfo.ValTy == valOrCmpArg->getType() && "AtomicInfo mismatch"); + auto const packetWidth = getPacketWidthForType(valOrCmpArg->getType()); if (VL && packetWidth != 1) { emitVeczRemarkMissed(&F, &CI, @@ -2795,20 +2828,29 @@ ValuePacket Packetizer::Impl::packetizeMaskedAtomicRMW( return {}; } - ValuePacket valPacket; - 
Result valResult = packetize(val); + ValuePacket valOrCmpPacket; + Result valResult = packetize(valOrCmpArg); PACK_FAIL_IF(!valResult); - valResult.getPacketValues(packetWidth, valPacket); - PACK_FAIL_IF(valPacket.empty()); + valResult.getPacketValues(packetWidth, valOrCmpPacket); + PACK_FAIL_IF(valOrCmpPacket.empty()); + + ValuePacket newValPacket; + if (IsCmpXchg) { + Value *const newValArg = CI.getArgOperand(2); + Result newValResult = packetize(newValArg); + PACK_FAIL_IF(!newValResult); + newValResult.getPacketValues(packetWidth, newValPacket); + PACK_FAIL_IF(newValPacket.empty()); + } ValuePacket ptrPacket; - Result ptrResult = packetize(ptr); + Result ptrResult = packetize(ptrArg); PACK_FAIL_IF(!ptrResult); ptrResult.getPacketValues(packetWidth, ptrPacket); PACK_FAIL_IF(ptrPacket.empty()); ValuePacket maskPacket; - Result maskResult = packetize(mask); + Result maskResult = packetize(maskArg); PACK_FAIL_IF(!maskResult); maskResult.getPacketValues(packetWidth, maskPacket); PACK_FAIL_IF(maskPacket.empty()); @@ -2817,16 +2859,20 @@ ValuePacket Packetizer::Impl::packetizeMaskedAtomicRMW( IC.deleteInstructionLater(&CI); for (unsigned i = 0; i != packetWidth; ++i) { - auto *const ptrI = ptrPacket[i]; - auto *const valI = valPacket[i]; + auto *const ptr = ptrPacket[i]; + auto *const valOrCmp = valOrCmpPacket[i]; - AtomicInfo.ValTy = valI->getType(); - AtomicInfo.PointerTy = ptrI->getType(); + AtomicInfo.ValTy = valOrCmp->getType(); + AtomicInfo.PointerTy = ptr->getType(); auto *maskedAtomicF = - Ctx.getOrCreateMaskedAtomicRMWFunction(AtomicInfo, Choices, SimdWidth); + Ctx.getOrCreateMaskedAtomicFunction(AtomicInfo, Choices, SimdWidth); PACK_FAIL_IF(!maskedAtomicF); - SmallVector args = {ptrI, valI, maskPacket[i]}; + SmallVector args = {ptr, valOrCmp}; + if (IsCmpXchg) { + args.push_back(newValPacket[i]); + } + args.push_back(maskPacket[i]); if (AtomicInfo.IsVectorPredicated) { assert(VL && "Missing vector length"); args.push_back(VL); @@ -2991,6 +3037,49 @@ ValuePacket Packetizer::Impl::packetizeFreeze(FreezeInst *FreezeI) { return results; } +ValuePacket Packetizer::Impl::packetizeAtomicCmpXchg( + AtomicCmpXchgInst *AtomicI) { + ValuePacket results; + + VectorizationContext::MaskedAtomic MA; + MA.VF = SimdWidth; + MA.IsVectorPredicated = VU.choices().vectorPredication(); + + MA.Align = AtomicI->getAlign(); + MA.BinOp = AtomicRMWInst::BAD_BINOP; + MA.IsWeak = AtomicI->isWeak(); + MA.IsVolatile = AtomicI->isVolatile(); + MA.Ordering = AtomicI->getSuccessOrdering(); + MA.CmpXchgFailureOrdering = AtomicI->getFailureOrdering(); + MA.SyncScope = AtomicI->getSyncScopeID(); + + IRBuilder<> B(AtomicI); + + // Set up the arguments to this function + Value *Ptr = packetize(AtomicI->getPointerOperand()).getAsValue(); + Value *Cmp = packetize(AtomicI->getCompareOperand()).getAsValue(); + Value *New = packetize(AtomicI->getNewValOperand()).getAsValue(); + + MA.ValTy = Cmp->getType(); + MA.PointerTy = Ptr->getType(); + + auto *const TrueMask = createAllTrueMask(B, SimdWidth); + SmallVector MaskedFnArgs = {Ptr, Cmp, New, TrueMask}; + if (VL) { + MaskedFnArgs.push_back(VL); + } + + Function *MaskedAtomicFn = + Ctx.getOrCreateMaskedAtomicFunction(MA, VU.choices(), SimdWidth); + PACK_FAIL_IF(!MaskedAtomicFn); + + CallInst *MaskedCI = B.CreateCall(MaskedAtomicFn, MaskedFnArgs); + + results.push_back(MaskedCI); + + return results; +} + ValuePacket Packetizer::Impl::packetizeUnaryOp(UnaryOperator *UnOp) { ValuePacket results; @@ -3716,6 +3805,70 @@ ValuePacket Packetizer::Impl::packetizeExtractElement( return 
results; } +ValuePacket Packetizer::Impl::packetizeInsertValue( + InsertValueInst *InsertValue) { + ValuePacket results; + + Value *const Val = InsertValue->getInsertedValueOperand(); + Value *const Aggregate = InsertValue->getAggregateOperand(); + + // We can only packetize literal struct types + if (auto *StructTy = dyn_cast(Aggregate->getType()); + !StructTy || !StructTy->isLiteral()) { + return results; + } + + Value *PackAggregate = packetizeIfVarying(Aggregate); + PACK_FAIL_IF(!PackAggregate); + + Value *PackVal = packetizeIfVarying(Val); + PACK_FAIL_IF(!PackVal); + + bool const IsValVarying = Val != PackVal; + bool const IsAggregateVarying = Aggregate != PackAggregate; + if (!IsAggregateVarying && IsValVarying) { + // If the aggregate wasn't varying but the value was + PackAggregate = packetize(Aggregate).getAsValue(); + } else if (IsAggregateVarying && !IsValVarying) { + // If the aggregate was varying but the value wasn't + PackVal = packetize(Val).getAsValue(); + } else if (!IsAggregateVarying && !IsValVarying) { + // If both were uniform + return results; + } + + IRBuilder<> B(buildAfter(InsertValue, F)); + + results.push_back( + B.CreateInsertValue(PackAggregate, PackVal, InsertValue->getIndices())); + + IC.deleteInstructionLater(InsertValue); + return results; +} + +ValuePacket Packetizer::Impl::packetizeExtractValue( + ExtractValueInst *ExtractValue) { + ValuePacket results; + + Value *const Aggregate = ExtractValue->getAggregateOperand(); + // We can only packetize literal struct types + if (auto *StructTy = dyn_cast(Aggregate->getType()); + !StructTy || !StructTy->isLiteral()) { + return results; + } + + Value *PackAggregate = packetizeIfVarying(Aggregate); + PACK_FAIL_IF(!PackAggregate); + + IRBuilder<> B(buildAfter(ExtractValue, F)); + + results.push_back( + B.CreateExtractValue(PackAggregate, ExtractValue->getIndices())); + + IC.deleteInstructionLater(ExtractValue); + return results; +} + ValuePacket Packetizer::Impl::packetizeShuffleVector( ShuffleVectorInst *Shuffle) { Value *const srcA = Shuffle->getOperand(0); diff --git a/modules/compiler/vecz/source/vectorization_context.cpp b/modules/compiler/vecz/source/vectorization_context.cpp index 821d62936..6880bbfc4 100644 --- a/modules/compiler/vecz/source/vectorization_context.cpp +++ b/modules/compiler/vecz/source/vectorization_context.cpp @@ -374,8 +374,8 @@ Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) { return newFunction; } -std::optional -VectorizationContext::isMaskedAtomicRMWFunction(const Function &F) const { +std::optional +VectorizationContext::isMaskedAtomicFunction(const Function &F) const { auto VFInfo = decodeVectorizedFunctionName(F.getName()); if (!VFInfo) { return std::nullopt; @@ -383,55 +383,69 @@ VectorizationContext::isMaskedAtomicRMWFunction(const Function &F) const { auto [FnNameStr, VF, Choices] = *VFInfo; llvm::StringRef FnName = FnNameStr; - if (!FnName.consume_front("masked_atomicrmw_")) { + if (!FnName.consume_front("masked_")) { return std::nullopt; } - VectorizationContext::MaskedAtomicRMW AtomicInfo; + bool IsCmpXchg = FnName.consume_front("cmpxchg_"); + if (!IsCmpXchg && !FnName.consume_front("atomicrmw_")) { + return std::nullopt; + } + VectorizationContext::MaskedAtomic AtomicInfo; AtomicInfo.VF = VF; AtomicInfo.IsVectorPredicated = Choices.vectorPredication(); + if (IsCmpXchg) { + AtomicInfo.IsWeak = FnName.consume_front("weak_"); + } AtomicInfo.IsVolatile = FnName.consume_front("volatile_"); - if (FnName.consume_front("xchg")) { - AtomicInfo.BinOp = 
AtomicRMWInst::BinOp::Xchg; - } else if (FnName.consume_front("add")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Add; - } else if (FnName.consume_front("sub")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Sub; - } else if (FnName.consume_front("and")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::And; - } else if (FnName.consume_front("nand")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Nand; - } else if (FnName.consume_front("or")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Or; - } else if (FnName.consume_front("xor")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xor; - } else if (FnName.consume_front("max")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Max; - } else if (FnName.consume_front("min")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::Min; - } else if (FnName.consume_front("umax")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMax; - } else if (FnName.consume_front("umin")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMin; - } else if (FnName.consume_front("fadd")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::FAdd; - } else if (FnName.consume_front("fsub")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::FSub; - } else if (FnName.consume_front("fmax")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMax; - } else if (FnName.consume_front("fmin")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMin; - } else if (FnName.consume_front("uincwrap")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::UIncWrap; - } else if (FnName.consume_front("udecwrap")) { - AtomicInfo.BinOp = AtomicRMWInst::BinOp::UDecWrap; + if (IsCmpXchg) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::BAD_BINOP; } else { - return std::nullopt; + if (FnName.consume_front("xchg")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xchg; + } else if (FnName.consume_front("add")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Add; + } else if (FnName.consume_front("sub")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Sub; + } else if (FnName.consume_front("and")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::And; + } else if (FnName.consume_front("nand")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Nand; + } else if (FnName.consume_front("or")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Or; + } else if (FnName.consume_front("xor")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xor; + } else if (FnName.consume_front("max")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Max; + } else if (FnName.consume_front("min")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Min; + } else if (FnName.consume_front("umax")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMax; + } else if (FnName.consume_front("umin")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMin; + } else if (FnName.consume_front("fadd")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::FAdd; + } else if (FnName.consume_front("fsub")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::FSub; + } else if (FnName.consume_front("fmax")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMax; + } else if (FnName.consume_front("fmin")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMin; + } else if (FnName.consume_front("uincwrap")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::UIncWrap; + } else if (FnName.consume_front("udecwrap")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::UDecWrap; + } else { + return std::nullopt; + } + if (!FnName.consume_front("_")) { + return std::nullopt; + } } - if (!FnName.consume_front("_align")) { + if (!FnName.consume_front("align")) { return std::nullopt; } @@ -446,26 +460,38 @@ VectorizationContext::isMaskedAtomicRMWFunction(const Function &F) const { return std::nullopt; } 
- if (FnName.consume_front("acquire")) { - AtomicInfo.Ordering = AtomicOrdering::Acquire; - } else if (FnName.consume_front("acqrel")) { - AtomicInfo.Ordering = AtomicOrdering::AcquireRelease; - } else if (FnName.consume_front("monotonic")) { - AtomicInfo.Ordering = AtomicOrdering::Monotonic; - } else if (FnName.consume_front("notatomic")) { - AtomicInfo.Ordering = AtomicOrdering::NotAtomic; - } else if (FnName.consume_front("release")) { - AtomicInfo.Ordering = AtomicOrdering::Release; - } else if (FnName.consume_front("seqcst")) { - AtomicInfo.Ordering = AtomicOrdering::SequentiallyConsistent; - } else if (FnName.consume_front("unordered")) { - AtomicInfo.Ordering = AtomicOrdering::Unordered; + auto demangleOrdering = [&FnName]() -> std::optional { + if (FnName.consume_front("acquire_")) { + return AtomicOrdering::Acquire; + } else if (FnName.consume_front("acqrel_")) { + return AtomicOrdering::AcquireRelease; + } else if (FnName.consume_front("monotonic_")) { + return AtomicOrdering::Monotonic; + } else if (FnName.consume_front("notatomic_")) { + return AtomicOrdering::NotAtomic; + } else if (FnName.consume_front("release_")) { + return AtomicOrdering::Release; + } else if (FnName.consume_front("seqcst_")) { + return AtomicOrdering::SequentiallyConsistent; + } else if (FnName.consume_front("unordered_")) { + return AtomicOrdering::Unordered; + } else { + return std::nullopt; + } + }; + + if (auto Ordering = demangleOrdering()) { + AtomicInfo.Ordering = *Ordering; } else { return std::nullopt; } - if (!FnName.consume_front("_")) { - return std::nullopt; + if (IsCmpXchg) { + if (auto Ordering = demangleOrdering()) { + AtomicInfo.CmpXchgFailureOrdering = *Ordering; + } else { + return std::nullopt; + } } unsigned SyncScopeID = 0; @@ -489,91 +515,114 @@ VectorizationContext::isMaskedAtomicRMWFunction(const Function &F) const { return AtomicInfo; } -Function *VectorizationContext::getOrCreateMaskedAtomicRMWFunction( - MaskedAtomicRMW &I, const VectorizationChoices &Choices, ElementCount VF) { +Function *VectorizationContext::getOrCreateMaskedAtomicFunction( + MaskedAtomic &I, const VectorizationChoices &Choices, ElementCount VF) { + bool const isCmpXchg = I.isCmpXchg(); LLVMContext &ctx = I.ValTy->getContext(); SmallVector argTys; argTys.push_back(I.PointerTy); argTys.push_back(I.ValTy); + if (isCmpXchg) { + argTys.push_back(I.ValTy); + } // Add one extra argument for the mask, which is always the same length // (scalar or vector) as the value type. auto *i1Ty = Type::getInt1Ty(ctx); - argTys.push_back( + auto *maskTy = !I.ValTy->isVectorTy() ? dyn_cast(i1Ty) - : VectorType::get(i1Ty, - cast(I.ValTy)->getElementCount())); + : VectorType::get(i1Ty, cast(I.ValTy)->getElementCount()); + argTys.push_back(maskTy); if (Choices.vectorPredication()) { argTys.push_back(Type::getInt32Ty(ctx)); } std::string maskedFnName; raw_string_ostream O(maskedFnName); - O << "masked_atomicrmw_"; + O << (isCmpXchg ? 
"masked_cmpxchg_" : "masked_atomicrmw_"); + + if (I.IsWeak) { + assert(isCmpXchg && "Bad MaskedAtomic state"); + O << "weak_"; + } if (I.IsVolatile) { O << "volatile_"; } + if (!isCmpXchg) { #define BINOP_CASE(BINOP, STR) \ case AtomicRMWInst::BINOP: \ O << (STR); \ break - switch (I.BinOp) { - BINOP_CASE(Xchg, "xchg"); - BINOP_CASE(Add, "add"); - BINOP_CASE(Sub, "sub"); - BINOP_CASE(And, "and"); - BINOP_CASE(Nand, "nand"); - BINOP_CASE(Or, "or"); - BINOP_CASE(Xor, "xor"); - BINOP_CASE(Max, "max"); - BINOP_CASE(Min, "min"); - BINOP_CASE(UMax, "umax"); - BINOP_CASE(UMin, "umin"); - BINOP_CASE(FAdd, "fadd"); - BINOP_CASE(FSub, "fsub"); - BINOP_CASE(FMax, "fmax"); - BINOP_CASE(FMin, "fmin"); - BINOP_CASE(UIncWrap, "uincwrap"); - BINOP_CASE(UDecWrap, "udecwrap"); - case llvm::AtomicRMWInst::BAD_BINOP: - return nullptr; - } + switch (I.BinOp) { + BINOP_CASE(Xchg, "xchg"); + BINOP_CASE(Add, "add"); + BINOP_CASE(Sub, "sub"); + BINOP_CASE(And, "and"); + BINOP_CASE(Nand, "nand"); + BINOP_CASE(Or, "or"); + BINOP_CASE(Xor, "xor"); + BINOP_CASE(Max, "max"); + BINOP_CASE(Min, "min"); + BINOP_CASE(UMax, "umax"); + BINOP_CASE(UMin, "umin"); + BINOP_CASE(FAdd, "fadd"); + BINOP_CASE(FSub, "fsub"); + BINOP_CASE(FMax, "fmax"); + BINOP_CASE(FMin, "fmin"); + BINOP_CASE(UIncWrap, "uincwrap"); + BINOP_CASE(UDecWrap, "udecwrap"); + case llvm::AtomicRMWInst::BAD_BINOP: + return nullptr; + } #undef BINOP_CASE + O << "_"; + } + + O << "align" << I.Align.value() << "_"; - O << "_align" << I.Align.value() << "_"; // Mangle ordering - switch (I.Ordering) { - default: - O << static_cast(I.Ordering); - break; - case AtomicOrdering::Acquire: - O << "acquire"; - break; - case AtomicOrdering::AcquireRelease: - O << "acqrel"; - break; - case AtomicOrdering::Monotonic: - O << "monotonic"; - break; - case AtomicOrdering::NotAtomic: - O << "notatomic"; - break; - case AtomicOrdering::Release: - O << "release"; - break; - case AtomicOrdering::SequentiallyConsistent: - O << "seqcst"; - break; - case AtomicOrdering::Unordered: - O << "unordered"; - break; + auto mangleOrdering = [&O](AtomicOrdering Ordering) { + switch (Ordering) { + default: + O << static_cast(Ordering); + break; + case AtomicOrdering::Acquire: + O << "acquire"; + break; + case AtomicOrdering::AcquireRelease: + O << "acqrel"; + break; + case AtomicOrdering::Monotonic: + O << "monotonic"; + break; + case AtomicOrdering::NotAtomic: + O << "notatomic"; + break; + case AtomicOrdering::Release: + O << "release"; + break; + case AtomicOrdering::SequentiallyConsistent: + O << "seqcst"; + break; + case AtomicOrdering::Unordered: + O << "unordered"; + break; + } + }; + + mangleOrdering(I.Ordering); + // Failure Ordering + if (I.CmpXchgFailureOrdering) { + O << "_"; + mangleOrdering(*I.CmpXchgFailureOrdering); } + // Syncscope O << "_" << static_cast(I.SyncScope) << "_"; @@ -588,9 +637,11 @@ Function *VectorizationContext::getOrCreateMaskedAtomicRMWFunction( maskedFnName = getVectorizedFunctionName(maskedFnName, VF, Choices, /*IsBuiltin=*/true); + Type *maskedFnRetTy = isCmpXchg ? 
StructType::get(I.ValTy, maskTy) : I.ValTy; + // Create the function type FunctionType *maskedFnTy = - FunctionType::get(I.ValTy, argTys, /*isVarArg=*/false); + FunctionType::get(maskedFnRetTy, argTys, /*isVarArg=*/false); return getOrCreateInternalBuiltin(maskedFnName, maskedFnTy); } @@ -687,8 +738,8 @@ bool VectorizationContext::defineInternalBuiltin(Function *F) { return emitSubgroupScanBody(*F, isInclusive, opKind, isVP); } - if (auto AtomicInfo = isMaskedAtomicRMWFunction(*F)) { - return emitMaskedAtomicRMWBody(*F, *AtomicInfo); + if (auto AtomicInfo = isMaskedAtomicFunction(*F)) { + return emitMaskedAtomicBody(*F, *AtomicInfo); } return false; @@ -1008,40 +1059,54 @@ bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive, return true; } -bool VectorizationContext::emitMaskedAtomicRMWBody( - Function &F, const VectorizationContext::MaskedAtomicRMW &MA) const { +bool VectorizationContext::emitMaskedAtomicBody( + Function &F, const VectorizationContext::MaskedAtomic &MA) const { LLVMContext &Ctx = F.getContext(); + bool IsCmpXchg = MA.isCmpXchg(); auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F); + IRBuilder<> B(EntryBB); + + BasicBlock *LoopEntryBB = EntryBB; + if (MA.IsVectorPredicated) { + auto *const VL = F.getArg(3 + IsCmpXchg); + // Early exit if the vector length is zero. We're going to unconditionally + // jump into the loop after this. + auto *const EarlyExitBB = BasicBlock::Create(Ctx, "earlyexit", &F); + auto *const CmpZero = + B.CreateICmpEQ(VL, ConstantInt::get(VL->getType(), 0)); + + LoopEntryBB = BasicBlock::Create(Ctx, "loopentry", &F); + + B.CreateCondBr(CmpZero, EarlyExitBB, LoopEntryBB); + + B.SetInsertPoint(EarlyExitBB); + B.CreateRet(PoisonValue::get(F.getReturnType())); + } + + B.SetInsertPoint(LoopEntryBB); + auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F); auto *const PtrArg = F.getArg(0); auto *const ValArg = F.getArg(1); - Value *MaskArg = F.getArg(2); + Value *MaskArg = F.getArg(2 + IsCmpXchg); const bool IsVector = ValArg->getType()->isVectorTy(); - IRBuilder<> B(EntryBB); Value *const IdxStart = B.getInt32(0); ConstantInt *const KnownMin = B.getInt32(MA.VF.getKnownMinValue()); - Value *IdxEnd = !MA.VF.isScalable() ? KnownMin : B.CreateVScale(KnownMin); - - // For vector-predicated masked atomics, we have to merge the incoming mask - // with a mask corresponding to the number of elements left active by the - // runtime vector length. - if (MA.IsVectorPredicated) { - auto *const VL = F.getArg(3); - auto *const IndexTy = VectorType::get(VL->getType(), MA.VF); - auto *const step = B.CreateStepVector(IndexTy); - auto *const VLMask = B.CreateICmpULT(step, B.CreateVectorSplat(MA.VF, VL)); - MaskArg = B.CreateAnd(MaskArg, VLMask); - } + Value *IdxEnd = + MA.IsVectorPredicated + ? F.getArg(3 + IsCmpXchg) + : (!MA.VF.isScalable() ? 
KnownMin : B.CreateVScale(KnownMin)); Value *RetVal = nullptr; + Value *RetSuccessVal = nullptr; auto CreateLoopBody = [&MA, &F, &ExitBB, PtrArg, ValArg, MaskArg, &RetVal, - IsVector]( + &RetSuccessVal, IsVector, IsCmpXchg]( BasicBlock *BB, Value *Idx, ArrayRef IVs, MutableArrayRef IVsNext) -> BasicBlock * { IRBuilder<> IRB(BB); @@ -1066,14 +1131,39 @@ bool VectorizationContext::emitMaskedAtomicRMWBody( Ptr = IRB.CreateExtractElement(PtrArg, Idx, "ptr"); Val = IRB.CreateExtractElement(ValArg, Idx, "val"); } - auto *const AtomicRMW = IRB.CreateAtomicRMW(MA.BinOp, Ptr, Val, MA.Align, - MA.Ordering, MA.SyncScope); - AtomicRMW->setVolatile(MA.IsVolatile); - if (IsVector) { - RetVal = IRB.CreateInsertElement(IVs[0], AtomicRMW, Idx, "retvec"); + if (IsCmpXchg) { + Value *NewValArg = F.getArg(2); + Value *NewVal = NewValArg; + if (IsVector) { + NewVal = IRB.CreateExtractElement(NewValArg, Idx, "newval"); + } + auto *const CmpXchg = + IRB.CreateAtomicCmpXchg(Ptr, Val, NewVal, MA.Align, MA.Ordering, + *MA.CmpXchgFailureOrdering, MA.SyncScope); + CmpXchg->setWeak(MA.IsWeak); + CmpXchg->setVolatile(MA.IsVolatile); + + if (IsVector) { + RetVal = IRB.CreateInsertElement( + IVs[0], IRB.CreateExtractValue(CmpXchg, 0), Idx, "retvec"); + RetSuccessVal = IRB.CreateInsertElement( + IVs[1], IRB.CreateExtractValue(CmpXchg, 1), Idx, "retsuccess"); + } else { + RetVal = IRB.CreateExtractValue(CmpXchg, 0); + RetSuccessVal = IRB.CreateExtractValue(CmpXchg, 1); + } + } else { - RetVal = AtomicRMW; + auto *const AtomicRMW = IRB.CreateAtomicRMW( + MA.BinOp, Ptr, Val, MA.Align, MA.Ordering, MA.SyncScope); + AtomicRMW->setVolatile(MA.IsVolatile); + + if (IsVector) { + RetVal = IRB.CreateInsertElement(IVs[0], AtomicRMW, Idx, "retvec"); + } else { + RetVal = AtomicRMW; + } } IRB.CreateBr(ElseBB); @@ -1089,6 +1179,15 @@ bool VectorizationContext::emitMaskedAtomicRMWBody( } IVsNext[0] = RetVal; + if (IsCmpXchg) { + auto *MergePhi = + IRB.CreatePHI(RetSuccessVal->getType(), 2, "mergesuccess"); + MergePhi->addIncoming(IVs[1], BB); + MergePhi->addIncoming(RetSuccessVal, IfBB); + RetSuccessVal = MergePhi; + IVsNext[1] = RetSuccessVal; + } + // Move the exit block right to the end of the function. 
ExitBB->moveAfter(ElseBB); @@ -1100,11 +1199,22 @@ bool VectorizationContext::emitMaskedAtomicRMWBody( Opts.IVs.push_back(PoisonValue::get(MA.ValTy)); Opts.loopIVNames.push_back("retvec.prev"); } - compiler::utils::createLoop(EntryBB, ExitBB, IdxStart, IdxEnd, Opts, + if (IsCmpXchg) { + Opts.IVs.push_back(PoisonValue::get(MaskArg->getType())); + Opts.loopIVNames.push_back("retsuccess.prev"); + } + compiler::utils::createLoop(LoopEntryBB, ExitBB, IdxStart, IdxEnd, Opts, CreateLoopBody); B.SetInsertPoint(ExitBB); - B.CreateRet(RetVal); + if (IsCmpXchg) { + Value *RetStruct = PoisonValue::get(F.getReturnType()); + RetStruct = B.CreateInsertValue(RetStruct, RetVal, 0); + RetStruct = B.CreateInsertValue(RetStruct, RetSuccessVal, 1); + B.CreateRet(RetStruct); + } else { + B.CreateRet(RetVal); + } return true; } diff --git a/modules/compiler/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll b/modules/compiler/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll index 7558c2907..85b4c865d 100644 --- a/modules/compiler/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll +++ b/modules/compiler/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll @@ -14,39 +14,62 @@ ; ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -; RUN: veczc -w 4 -vecz-scalable -vecz-passes=packetizer,verify \ -; RUN: --pass-remarks-missed=vecz -S < %s 2>&1 | FileCheck %s +; RUN: veczc -w 4 -vecz-scalable -vecz-passes=packetizer,verify -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "spir64-unknown-unknown" -; Note: we can't currently scalably packetize this kernel, due to the struct -; type. -; CHECK: Vecz: Could not packetize %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 +; CHECK: define spir_kernel void @__vecz_nxv4_test_fn(ptr %p, ptr %q, ptr %r) define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) { entry: +; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement poison, ptr %p, i64 0 +; CHECK: [[SPLAT_PTR:%.*]] = shufflevector [[SPLAT_PTR_INS]], poison, zeroinitializer %call = call i64 @__mux_get_global_id(i32 0) +; Test that this cmpxchg is packetized by generating a call to an all-true masked version. 
+; CHECK: [[A0:%.*]] = call { , } @__vecz_b_nxv4_masked_cmpxchg_align4_acquire_monotonic_1_u9nxv4u3ptru5nxv4ju5nxv4ju5nxv4b( +; CHECK-SAME: [[SPLAT_PTR]], +; CHECK-SAME: shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-SAME: shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) +; CHECK-SAME: shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic +; CHECK: [[EXT0:%.*]] = extractvalue { , } [[A0]], 0 %val0 = extractvalue { i32, i1 } %old0, 0 +; CHECK: [[EXT1:%.*]] = extractvalue { , } [[A0]], 1 %success0 = extractvalue { i32, i1 } %old0, 1 %out = getelementptr i32, ptr %q, i64 %call +; Stored as a vector +; CHECK: store [[EXT0]], ptr store i32 %val0, ptr %out, align 4 +; CHECK: [[PTR:%.*]] = getelementptr i8, ptr %r, i64 %call %outsuccess = getelementptr i8, ptr %r, i64 %call +; CHECK: [[ZEXT0:%.*]] = zext [[EXT1]] to %outbyte = zext i1 %success0 to i8 +; Stored as a vector +; CHECK: store [[ZEXT0]], ptr [[PTR]], align 1 store i8 %outbyte, ptr %outsuccess, align 1 ; Test a couple of insert/extract patterns ; Test inserting a uniform value into a varying literal struct +; CHECK: [[INS0:%.*]] = insertvalue { , } [[A0]], zeroinitializer, 1 +; CHECK: [[EXT2:%.*]] = extractvalue { , } [[INS0]], 1 +; CHECK: [[ZEXT1:%.*]] = zext [[EXT2]] to +; CHECK: store [[ZEXT1]], ptr [[PTR]], align 1 %testinsertconst = insertvalue { i32, i1 } %old0, i1 false, 1 %testextract0 = extractvalue { i32, i1 } %testinsertconst, 1 %outbyte0 = zext i1 %testextract0 to i8 store i8 %outbyte0, ptr %outsuccess, align 1 ; Test inserting a varying value into a varying literal struct +; CHECK: [[LD:%.*]] = load , ptr +; CHECK: [[VBOOL:%.*]] = trunc [[LD]] to +; CHECK: [[INS1:%.*]] = insertvalue { , } [[A0]], [[VBOOL]], 1 +; CHECK: [[EXT3:%.*]] = extractvalue { , } [[INS1]], 1 +; CHECK: [[ZEXT2:%.*]] = zext [[EXT3]] to +; CHECK: store [[ZEXT2]], ptr [[PTR]], align 1 %byte1 = load i8, ptr %outsuccess, align 1 %bool1 = trunc i8 %byte1 to i1 %testinsertvarying0 = insertvalue { i32, i1 } %old0, i1 %bool1, 1 @@ -55,6 +78,10 @@ entry: store i8 %outbyte1, ptr %outsuccess, align 1 ; Test inserting a varying value into a uniform literal struct +; CHECK: [[INS2:%.*]] = insertvalue { , } poison, [[VBOOL]], 1 +; CHECK: [[EXT4:%.*]] = extractvalue { , } [[INS2]], 1 +; CHECK: [[ZEXT3:%.*]] = zext [[EXT4]] to +; CHECK: store [[ZEXT3]], ptr [[PTR]], align 1 %testinsertvarying1 = insertvalue { i32, i1 } poison, i1 %bool1, 1 %testextract2 = extractvalue { i32, i1 } %testinsertvarying1, 1 %outbyte2 = zext i1 %testextract2 to i8 diff --git a/modules/compiler/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll b/modules/compiler/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll new file mode 100644 index 000000000..35a478caa --- /dev/null +++ b/modules/compiler/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll @@ -0,0 +1,106 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=define-builtins,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_fn( %p) { + %ret0 = call @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b( %p, zeroinitializer, zeroinitializer, i32 4) + %ret1 = call { , } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b( %p, zeroinitializer, zeroinitializer, zeroinitializer, i32 4) + ret void +} + +declare @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b( %p, %val, %mask, i32 %vl) + +declare { , } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b( %p, %cmp, %newval, %mask, i32 %vl) + +; CHECK: define @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b( %p, %val, %mask, i32 %vl) { +; CHECK: entry: +; CHECK: [[VLZERO:%.*]] = icmp eq i32 %vl, 0 +; CHECK: br i1 [[VLZERO]], label %earlyexit, label %loopentry + +; CHECK: earlyexit: +; CHECK: ret poison + +; CHECK: loopentry: +; CHECK: br label %loopIR + +; CHECK: loopIR: +; CHECK: [[IDX:%.*]] = phi i32 [ 0, %loopentry ], [ [[INC:%.*]], %if.else ] +; CHECK: [[RET_PREV:%.*]] = phi [ poison, %loopentry ], [ [[MERGE:%.*]], %if.else ] +; CHECK: [[MASKELT:%.*]] = extractelement %mask, i32 [[IDX]] +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[PTR:%.*]] = extractelement %p, i32 [[IDX]] +; CHECK: [[VAL:%.*]] = extractelement %val, i32 [[IDX]] +; CHECK: [[ATOM:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VAL]] acquire, align 4 +; CHECK: [[RET_NEXT:%.*]] = insertelement [[RET_PREV]], i32 [[ATOM]], i32 [[IDX]] +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[MERGE:%.*]] = phi [ [[RET_PREV]], %loopIR ], [ [[RET_NEXT]], %if.then ] +; CHECK: [[INC]] = add i32 [[IDX]], 1 +; CHECK: [[CMP:%.*]] = icmp ult i32 [[INC]], %vl +; CHECK: br i1 [[CMP]], label %loopIR, label %exit + +; CHECK: exit: +; CHECK: ret [[MERGE]] + +; CHECK: define { , } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b( %p, %cmp, %newval, %mask, i32 %vl) { +; CHECK: entry: +; CHECK: [[VLZERO:%.*]] = icmp eq i32 %vl, 0 +; CHECK: br i1 [[VLZERO]], label %earlyexit, label %loopentry + +; CHECK: earlyexit: +; CHECK: ret { , } poison + +; CHECK: loopentry: +; CHECK: br label %loopIR + +; CHECK: loopIR: +; CHECK: [[IDX:%.*]] = phi i32 [ 0, %loopentry ], [ [[INC:%.*]], %if.else ] +; CHECK: [[RET_PREV:%.*]] = phi [ poison, %loopentry ], [ [[MERGE:%.*]], %if.else ] +; CHECK: [[SUCCESS_PREV:%.*]] = phi [ poison, %loopentry ], [ [[MERGE_SUCCESS:%.*]], %if.else ] +; CHECK: [[MASKELT:%.*]] = extractelement %mask, i32 [[IDX]] +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[PTR:%.*]] = extractelement %p, i32 [[IDX]] +; CHECK: [[CMP:%.*]] = extractelement %cmp, i32 [[IDX]] +; CHECK: [[NEWVAL:%.*]] = extractelement %newval, i32 [[IDX]] +; CHECK: [[ATOM:%.*]] = cmpxchg ptr [[PTR]], i32 [[CMP]], i32 [[NEWVAL]] acquire acquire, align 4 +; CHECK: [[EXT0:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0 +; CHECK: [[RET:%.*]] = insertelement 
[[RET_PREV]], i32 [[EXT0]], i32 [[IDX]] +; CHECK: [[EXT1:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1 +; CHECK: [[SUCCESS:%.*]] = insertelement [[SUCCESS_PREV]], i1 [[EXT1]], i32 [[IDX]] +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[MERGE:%.*]] = phi [ [[RET_PREV]], %loopIR ], [ [[RET]], %if.then ] +; CHECK: [[MERGE_SUCCESS:%.*]] = phi [ [[SUCCESS_PREV]], %loopIR ], [ [[SUCCESS]], %if.then ] +; CHECK: [[INC]] = add i32 [[IDX]], 1 +; CHECK: [[CMP:%.*]] = icmp ult i32 [[INC]], %vl +; CHECK: br i1 [[CMP]], label %loopIR, label %exit + +; CHECK: exit: +; CHECK: [[RETTMP:%.*]] = insertvalue { , } poison, [[MERGE]], 0 +; CHECK: [[RETVAL:%.*]] = insertvalue { , } [[RETTMP]], [[MERGE_SUCCESS]], 1 +; CHECK: ret { , } [[RETVAL]] diff --git a/modules/compiler/vecz/test/lit/llvm/cmpxchg.ll b/modules/compiler/vecz/test/lit/llvm/cmpxchg.ll index d62486409..bf2175364 100644 --- a/modules/compiler/vecz/test/lit/llvm/cmpxchg.ll +++ b/modules/compiler/vecz/test/lit/llvm/cmpxchg.ll @@ -22,51 +22,29 @@ target triple = "spir64-unknown-unknown" ; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p, ptr %q, ptr %r) define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) { entry: +; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0 +; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer %call = call i64 @__mux_get_global_id(i32 0) -; Test that this cmpxchg is scalarized. Not ideal, but hey. -; CHECK: [[A0:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 -; CHECK: [[A1:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 -; CHECK: [[A2:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 -; CHECK: [[A3:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 - -; Then we insert the values into a strange struct -; CHECK: [[INS0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[A0]], 0 -; CHECK: [[INS1:%.*]] = insertvalue [4 x { i32, i1 }] [[INS0]], { i32, i1 } [[A1]], 1 -; CHECK: [[INS2:%.*]] = insertvalue [4 x { i32, i1 }] [[INS1]], { i32, i1 } [[A2]], 2 -; CHECK: [[INS3:%.*]] = insertvalue [4 x { i32, i1 }] [[INS2]], { i32, i1 } [[A3]], 3 +; Test that this cmpxchg is packetized by generating a call to an all-true masked version. +; CHECK: [[A0:%.*]] = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> , +; CHECK-SAME: <4 x i32> , +; CHECK-SAME: <4 x i1> %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic - -; To extract from this result, we extract each element individually then insert -; each into a vector. 
-; CHECK: [[ELT0_0_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 0, 0 -; CHECK: [[ELT0_0_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 1, 0 -; CHECK: [[ELT0_0_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 2, 0 -; CHECK: [[ELT0_0_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 3, 0 -; CHECK: [[INS0V0_0:%.*]] = insertelement <4 x i32> undef, i32 [[ELT0_0_0]], i32 0 -; CHECK: [[INS0V0_1:%.*]] = insertelement <4 x i32> [[INS0V0_0]], i32 [[ELT0_0_1]], i32 1 -; CHECK: [[INS0V0_2:%.*]] = insertelement <4 x i32> [[INS0V0_1]], i32 [[ELT0_0_2]], i32 2 -; CHECK: [[INS0V0_3:%.*]] = insertelement <4 x i32> [[INS0V0_2]], i32 [[ELT0_0_3]], i32 3 +; CHECK: [[EXT0:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[A0]], 0 %val0 = extractvalue { i32, i1 } %old0, 0 -; Same again here -; CHECK: [[ELT1_0_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 0, 1 -; CHECK: [[ELT1_0_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 1, 1 -; CHECK: [[ELT1_0_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 2, 1 -; CHECK: [[ELT1_0_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 3, 1 -; CHECK: [[INS1V0_0:%.*]] = insertelement <4 x i1> undef, i1 [[ELT1_0_0]], i32 0 -; CHECK: [[INS1V0_1:%.*]] = insertelement <4 x i1> [[INS1V0_0]], i1 [[ELT1_0_1]], i32 1 -; CHECK: [[INS1V0_2:%.*]] = insertelement <4 x i1> [[INS1V0_1]], i1 [[ELT1_0_2]], i32 2 -; CHECK: [[INS1V0_3:%.*]] = insertelement <4 x i1> [[INS1V0_2]], i1 [[ELT1_0_3]], i32 3 +; CHECK: [[EXT1:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[A0]], 1 %success0 = extractvalue { i32, i1 } %old0, 1 %out = getelementptr i32, ptr %q, i64 %call ; Stored as a vector -; CHECK: store <4 x i32> [[INS0V0_3]], ptr +; CHECK: store <4 x i32> [[EXT0]], ptr store i32 %val0, ptr %out, align 4 ; CHECK: [[PTR:%.*]] = getelementptr i8, ptr %r, i64 %call %outsuccess = getelementptr i8, ptr %r, i64 %call -; CHECK: [[ZEXT0:%.*]] = zext <4 x i1> [[INS1V0_3]] to <4 x i8> +; CHECK: [[ZEXT0:%.*]] = zext <4 x i1> [[EXT1]] to <4 x i8> %outbyte = zext i1 %success0 to i8 ; Stored as a vector ; CHECK: store <4 x i8> [[ZEXT0]], ptr [[PTR]], align 1 @@ -74,25 +52,10 @@ entry: ; Test a couple of insert/extract patterns -; Test inserting a uniform value into a varying literal struct -; This is very inefficient -; CHECK: [[INSS0_0:%.*]] = insertvalue { i32, i1 } [[A0]], i1 false, 1 -; CHECK: [[INSS0_1:%.*]] = insertvalue { i32, i1 } [[A1]], i1 false, 1 -; CHECK: [[INSS0_2:%.*]] = insertvalue { i32, i1 } [[A2]], i1 false, 1 -; CHECK: [[INSS0_3:%.*]] = insertvalue { i32, i1 } [[A3]], i1 false, 1 -; CHECK: [[INSS1_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS0_0]], 0 -; CHECK: [[INSS1_1:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_0]], { i32, i1 } [[INSS0_1]], 1 -; CHECK: [[INSS1_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_1]], { i32, i1 } [[INSS0_2]], 2 -; CHECK: [[INSS1_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_2]], { i32, i1 } [[INSS0_3]], 3 -; CHECK: [[EXTS1_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 0, 1 -; CHECK: [[EXTS1_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 1, 1 -; CHECK: [[EXTS1_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 2, 1 -; CHECK: [[EXTS1_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 3, 1 -; CHECK: [[INS1V1_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS1_0]], i32 0 -; CHECK: [[INS1V1_1:%.*]] = insertelement <4 x i1> [[INS1V1_0]], i1 [[EXTS1_1]], i32 1 -; CHECK: [[INS1V1_2:%.*]] = insertelement <4 x i1> [[INS1V1_1]], i1 [[EXTS1_2]], i32 2 -; CHECK: [[INS1V1_3:%.*]] = insertelement <4 x i1> [[INS1V1_2]], 
i1 [[EXTS1_3]], i32 3
-; CHECK: [[ZEXT1:%.*]] = zext <4 x i1> [[INS1V1_3]] to <4 x i8>
+ ; Test inserting a uniform value into a varying literal struct
+; CHECK: [[INS0:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[A0]], <4 x i1> zeroinitializer, 1
+; CHECK: [[EXT2:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS0]], 1
+; CHECK: [[ZEXT1:%.*]] = zext <4 x i1> [[EXT2]] to <4 x i8>
 ; CHECK: store <4 x i8> [[ZEXT1]], ptr [[PTR]], align 1
 %testinsertconst = insertvalue { i32, i1 } %old0, i1 false, 1
 %testextract0 = extractvalue { i32, i1 } %testinsertconst, 1
@@ -100,29 +63,11 @@ entry:
 store i8 %outbyte0, ptr %outsuccess, align 1
 ; Test inserting a varying value into a varying literal struct
-; CHECK: [[V4I8_LD:%.*]] = load <4 x i8>, ptr %outsuccess, align 1
-; CHECK: [[TRUNC:%.*]] = trunc <4 x i8> [[V4I8_LD]] to <4 x i1>
-; CHECK: [[EXTV0_0:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 0
-; CHECK: [[EXTV0_1:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 1
-; CHECK: [[EXTV0_2:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 2
-; CHECK: [[EXTV0_3:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 3
-; CHECK: [[INSS2_0:%.*]] = insertvalue { i32, i1 } [[A0]], i1 [[EXTV0_0]], 1
-; CHECK: [[INSS2_1:%.*]] = insertvalue { i32, i1 } [[A1]], i1 [[EXTV0_1]], 1
-; CHECK: [[INSS2_2:%.*]] = insertvalue { i32, i1 } [[A2]], i1 [[EXTV0_2]], 1
-; CHECK: [[INSS2_3:%.*]] = insertvalue { i32, i1 } [[A3]], i1 [[EXTV0_3]], 1
-; CHECK: [[INSS3_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS2_0]], 0
-; CHECK: [[INSS3_1:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_0]], { i32, i1 } [[INSS2_1]], 1
-; CHECK: [[INSS3_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_1]], { i32, i1 } [[INSS2_2]], 2
-; CHECK: [[INSS3_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_2]], { i32, i1 } [[INSS2_3]], 3
-; CHECK: [[EXTS3_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 0, 1
-; CHECK: [[EXTS3_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 1, 1
-; CHECK: [[EXTS3_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 2, 1
-; CHECK: [[EXTS3_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 3, 1
-; CHECK: [[INS1V2_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS3_0]], i32 0
-; CHECK: [[INS1V2_1:%.*]] = insertelement <4 x i1> [[INS1V2_0]], i1 [[EXTS3_1]], i32 1
-; CHECK: [[INS1V2_2:%.*]] = insertelement <4 x i1> [[INS1V2_1]], i1 [[EXTS3_2]], i32 2
-; CHECK: [[INS1V2_3:%.*]] = insertelement <4 x i1> [[INS1V2_2]], i1 [[EXTS3_3]], i32 3
-; CHECK: [[ZEXT2:%.*]] = zext <4 x i1> [[INS1V2_3]] to <4 x i8>
+; CHECK: [[LD:%.*]] = load <4 x i8>, ptr
+; CHECK: [[VBOOL:%.*]] = trunc <4 x i8> [[LD]] to <4 x i1>
+; CHECK: [[INS1:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[A0]], <4 x i1> [[VBOOL]], 1
+; CHECK: [[EXT3:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS1]], 1
+; CHECK: [[ZEXT2:%.*]] = zext <4 x i1> [[EXT3]] to <4 x i8>
 ; CHECK: store <4 x i8> [[ZEXT2]], ptr [[PTR]], align 1
 %byte1 = load i8, ptr %outsuccess, align 1
 %bool1 = trunc i8 %byte1 to i1
@@ -132,23 +77,9 @@ entry:
 store i8 %outbyte1, ptr %outsuccess, align 1
 ; Test inserting a varying value into a uniform literal struct
-; CHECK: [[INSS4_0:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_0]], 1
-; CHECK: [[INSS4_1:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_1]], 1
-; CHECK: [[INSS4_2:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_2]], 1
-; CHECK: [[INSS4_3:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_3]], 1
-; CHECK: [[INSS5_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS4_0]], 0
-; CHECK: [[INSS5_1:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS5_0]], { i32, i1 } [[INSS4_1]], 1
-; CHECK: [[INSS5_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS5_1]], { i32, i1 } [[INSS4_2]], 2
-; CHECK: [[INSS5_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS5_2]], { i32, i1 } [[INSS4_3]], 3
-; CHECK: [[EXTS5_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 0, 1
-; CHECK: [[EXTS5_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 1, 1
-; CHECK: [[EXTS5_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 2, 1
-; CHECK: [[EXTS5_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 3, 1
-; CHECK: [[INS2V3_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS5_0]], i32 0
-; CHECK: [[INS2V3_1:%.*]] = insertelement <4 x i1> [[INS2V3_0]], i1 [[EXTS5_1]], i32 1
-; CHECK: [[INS2V3_2:%.*]] = insertelement <4 x i1> [[INS2V3_1]], i1 [[EXTS5_2]], i32 2
-; CHECK: [[INS2V3_3:%.*]] = insertelement <4 x i1> [[INS2V3_2]], i1 [[EXTS5_3]], i32 3
-; CHECK: [[ZEXT3:%.*]] = zext <4 x i1> [[INS2V3_3]] to <4 x i8>
+; CHECK: [[INS2:%.*]] = insertvalue { <4 x i32>, <4 x i1> } poison, <4 x i1> [[VBOOL]], 1
+; CHECK: [[EXT4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS2]], 1
+; CHECK: [[ZEXT3:%.*]] = zext <4 x i1> [[EXT4]] to <4 x i8>
 ; CHECK: store <4 x i8> [[ZEXT3]], ptr [[PTR]], align 1
 %testinsertvarying1 = insertvalue { i32, i1 } poison, i1 %bool1, 1
 %testextract2 = extractvalue { i32, i1 } %testinsertvarying1, 1
diff --git a/modules/compiler/vecz/test/lit/llvm/diverging_atomic.ll b/modules/compiler/vecz/test/lit/llvm/diverging_atomic.ll
deleted file mode 100644
index b6beaae1e..000000000
--- a/modules/compiler/vecz/test/lit/llvm/diverging_atomic.ll
+++ /dev/null
@@ -1,46 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -w 4 -vecz-passes=cfg-convert,verify -S \
-; RUN: --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s
-
-target triple = "spir64-unknown-unknown"
-target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; CHECK: Vecz: Could not apply masks for function "kernel"
-; CHECK-NEXT: note: Could not apply mask to atomic instruction
-; CHECK-SAME: atomic_success = cmpxchg ptr %arrayidx.in, i32 2, i32 4 acq_rel monotonic, align 4
-
-define spir_kernel void @kernel(ptr %in, ptr %out) {
-entry:
- %gid = tail call i64 @__mux_get_global_id(i32 0)
- %cmp = icmp eq i64 %gid, 0
- br i1 %cmp, label %if.then, label %end
-
-if.then:
- %arrayidx.in = getelementptr inbounds i32, ptr %in, i64 %gid
- %atomic_success = cmpxchg ptr %arrayidx.in, i32 2, i32 4 acq_rel monotonic, align 4
- %atomic = extractvalue { i32, i1 } %atomic_success, 0
- br label %end
-
-end:
- %merge = phi i32 [ 0, %entry ], [ %atomic, %if.then ]
- %arrayidx.out = getelementptr inbounds i32, ptr %out, i64 %gid
- store i32 %merge, ptr %arrayidx.out, align 4
- ret void
-}
-
-declare i64 @__mux_get_global_id(i32)
diff --git a/modules/compiler/vecz/test/lit/llvm/masked_cmpxchg.ll b/modules/compiler/vecz/test/lit/llvm/masked_cmpxchg.ll
new file mode 100644
index 000000000..73aec6dfc
--- /dev/null
+++ b/modules/compiler/vecz/test/lit/llvm/masked_cmpxchg.ll
@@ -0,0 +1,105 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -vecz-passes=cfg-convert,verify,packetizer,define-builtins,verify -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p, ptr %q, ptr %r)
+define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) {
+entry:
+; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0
+; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer
+; CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> <i64 3, i64 3, i64 3, i64 3>,
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %cmp = icmp sgt i64 3, %call
+; CHECK: [[VEC_PTR:%.*]] = getelementptr i32, ptr %p, <4 x i64>
+ %wi_p_i32 = getelementptr i32, ptr %p, i64 %call
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+; CHECK: [[CALL:%.*]] = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b(
+; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+; CHECK-SAME: <4 x i32> <i32 2, i32 2, i32 2, i32 2>, <4 x i1> [[CMP]]
+ %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic
+ %val0 = extractvalue { i32, i1 } %old0, 0
+ %success0 = extractvalue { i32, i1 } %old0, 1
+
+ %out = getelementptr i32, ptr %q, i64 %call
+ store i32 %val0, ptr %out, align 4
+
+ %outsuccess = getelementptr i8, ptr %r, i64 %call
+ %outbyte = zext i1 %success0 to i8
+ store i8 %outbyte, ptr %outsuccess, align 1
+
+ ; Test a couple of insert/extract patterns
+; CHECK: [[INS:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[CALL]], <4 x i1> [[CMP]], 1
+; CHECK: [[EXT:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS]], 1
+ %testinsert = insertvalue { i32, i1 } %old0, i1 %cmp, 1
+ %testextract = extractvalue { i32, i1 } %testinsert, 1
+
+ %outbyte0 = zext i1 %testextract to i8
+ store i8 %outbyte0, ptr %outsuccess, align 1
+
+; CHECK: = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_weak_volatile_align8_monotonic_seqcst_0_Dv4_u3ptrDv4_jDv4_jDv4_b(
+ %old1 = cmpxchg weak volatile ptr %wi_p_i32, i32 1, i32 2 syncscope("singlethread") monotonic seq_cst, align 8
+
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+; CHECK: define { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[CMPS:%1]], <4 x i32> [[NEWS:%2]], <4 x i1> [[MASK:%3]]) {
+; CHECK: entry:
+; CHECK: br label %loopIR
+
+; CHECK: loopIR:
+; CHECK: [[IDX:%.*]] = phi i32 [ 0, %entry ], [ [[IDX_NEXT:%.*]], %if.else ]
+; CHECK: [[PREV:%.*]] = phi <4 x i32> [ poison, %entry ], [ [[MERGE:%.*]], %if.else ]
+; CHECK: [[PREVSUCCESS:%.*]] = phi <4 x i1> [ poison, %entry ], [ [[MERGESUCCESS:%.*]], %if.else ]
+; CHECK: [[MASKELT:%.*]] = extractelement <4 x i1> [[MASK]], i32 [[IDX]]
+; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false
+; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else

+; CHECK: if.then:
+; CHECK: [[PTR:%.*]] = extractelement <4 x ptr> [[PTRS]], i32 [[IDX]]
+; CHECK: [[CMP:%.*]] = extractelement <4 x i32> [[CMPS]], i32 [[IDX]]
+; CHECK: [[NEW:%.*]] = extractelement <4 x i32> [[NEWS]], i32 [[IDX]]
+; CHECK: [[ATOM:%.*]] = cmpxchg ptr [[PTR]], i32 [[CMP]], i32 [[NEW]] acquire monotonic, align 4
+; CHECK: [[VAL:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0
+; CHECK: [[RET:%.*]] = insertelement <4 x i32> [[PREV]], i32 [[VAL]], i32 [[IDX]]
+; CHECK: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1
+; CHECK: [[RETSUCCESS:%.*]] = insertelement <4 x i1> [[PREVSUCCESS]], i1 [[SUCCESS]], i32 [[IDX]]
+; CHECK: br label %if.else
+
+; CHECK: if.else:
+; CHECK: [[MERGE]] = phi <4 x i32> [ [[PREV]], %loopIR ], [ [[RET]], %if.then ]
+; CHECK: [[MERGESUCCESS]] = phi <4 x i1> [ [[PREVSUCCESS]], %loopIR ], [ [[RETSUCCESS]], %if.then ]
+; CHECK: [[IDX_NEXT]] = add i32 [[IDX]], 1
+
+; CHECK: exit:
+; CHECK: [[INS0:%.*]] = insertvalue { <4 x i32>, <4 x i1> } poison, <4 x i32> [[MERGE]], 0
+; CHECK: [[INS1:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[INS0]], <4 x i1> [[MERGESUCCESS]], 1
+; CHECK: ret { <4 x i32>, <4 x i1> } [[INS1]]
+
+; Assume that all masked cmpxchg operations follow the logic above. Just
+; check that the right cmpxchg instruction is being generated.
+; CHECK: define { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_weak_volatile_align8_monotonic_seqcst_0_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[CMPS:%1]], <4 x i32> [[NEWS:%2]], <4 x i1> [[MASK:%3]]) {
+; CHECK: cmpxchg weak volatile ptr {{%.*}}, i32 {{%.*}}, i32 {{%.*}} syncscope("singlethread") monotonic seq_cst, align 8
+
+declare i64 @__mux_get_global_id(i32)
diff --git a/modules/compiler/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll b/modules/compiler/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll
new file mode 100644
index 000000000..831b6cca8
--- /dev/null
+++ b/modules/compiler/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll
@@ -0,0 +1,48 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=define-builtins,verify -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_fn(ptr %p) {
+ %ret = call { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 1, i32 2, i1 true)
+ ret void
+}
+
+declare { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 %cmp, i32 %newval, i1 %mask)
+
+; CHECK: define { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 %cmp, i32 %newval, i1 %mask) {
+; CHECK: entry:
+; CHECK: [[MASKCMP:%.*]] = icmp ne i1 %mask, false
+; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else
+
+; CHECK: if.then:
+; CHECK: [[ATOM:%.*]] = cmpxchg ptr %p, i32 %cmp, i32 %newval acquire monotonic, align 4
+; CHECK: [[EXT0:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0
+; CHECK: [[EXT1:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1
+; CHECK: br label %if.else
+
+; CHECK: if.else:
+; CHECK: [[RETVAL:%.*]] = phi i32 [ poison, %entry ], [ [[EXT0]], %if.then ]
+; CHECK: [[RETSUCC:%.*]] = phi i1 [ poison, %entry ], [ [[EXT1]], %if.then ]
+; CHECK: br label %exit
+
+; CHECK: exit:
+; CHECK: [[INS0:%.*]] = insertvalue { i32, i1 } poison, i32 [[RETVAL]], 0
+; CHECK: [[INS1:%.*]] = insertvalue { i32, i1 } [[INS0]], i1 [[RETSUCC]], 1
+; CHECK: ret { i32, i1 } [[INS1]]