Skip to content

Commit

Permalink
[vecz] Add support for masking atomic RMW instructions
Browse files Browse the repository at this point in the history
This commit allows the vectorizer to vectorize kernels in which there
are atomic RMW instructions that need to be masked for control-flow
purposes, e.g., in a divergent if/else or a loop.

It follows a fairly simple paradigm - similar to how we mask loads and
stores - involving:

* Control-flow conversion replacing the atomic with a call to an
  'internal' vecz builtin
* The packetizer widening this builtin, and replacing the call with
  another call (with packetized arguments)
* The post-vectorization `DefineBuiltinsPass` running and providing
  function bodies for these masked atomic builtins

The builtins themselves are simply loops over the entire vectorized
arguments, conditionally performing the atomic operation on each element,
one by one in sequence, depending on the mask. Performing the operations
sequentially (i.e., not performing the whole atomic operation at once)
should be correct, since it is undefined how work-items run in parallel
and which work-items would "win" if there was any contention on the atomic
memory addresses. Note that this is also essentially how plain atomics are
vectorized: by scalarizing them.

There isn't yet support for the atomic cmpxchg instructions - those will
be done separately.
  • Loading branch information
frasercrmck committed Dec 18, 2023
1 parent 65e6dfd commit f188fe5
Show file tree
Hide file tree
Showing 11 changed files with 695 additions and 160 deletions.
45 changes: 44 additions & 1 deletion modules/compiler/vecz/source/include/vectorization_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,11 @@

#include <llvm/ADT/DenseMap.h>
#include <llvm/Analysis/IVDescriptors.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/PassManager.h>
#include <llvm/IR/ValueHandle.h>
#include <llvm/Support/AtomicOrdering.h>
#include <llvm/Support/TypeSize.h>
#include <llvm/Transforms/Utils/ValueMapper.h>
#include <multi_llvm/multi_llvm.h>
Expand Down Expand Up @@ -150,14 +153,46 @@ class VectorizationContext {
/// @return The masked version of the function
llvm::Function *getOrCreateMaskedFunction(llvm::CallInst *CI);

/// @brief Describes an atomic RMW operation that is represented by a masked
/// internal builtin.
///
/// Every member has a default so that a default-constructed instance never
/// holds indeterminate values; the pointer/enum defaults are deliberately
/// "invalid" sentinels that callers are expected to overwrite.
struct MaskedAtomicRMW {
  /// @brief Type of the pointer operand of the atomic operation.
  llvm::Type *PointerTy = nullptr;
  /// @brief Type of the value operand (and result) of the atomic operation.
  llvm::Type *ValTy = nullptr;
  /// @brief The read-modify-write operation to perform.
  llvm::AtomicRMWInst::BinOp BinOp = llvm::AtomicRMWInst::BAD_BINOP;
  /// @brief Alignment of the atomic operation.
  llvm::Align Align;
  /// @brief Whether the atomic operation is volatile.
  bool IsVolatile = false;
  /// @brief Synchronization scope of the atomic operation.
  llvm::SyncScope::ID SyncScope = llvm::SyncScope::System;
  /// @brief Memory ordering of the atomic operation. NotAtomic is not a valid
  /// ordering for an atomic RMW, so it marks the field as not yet set.
  llvm::AtomicOrdering Ordering = llvm::AtomicOrdering::NotAtomic;
  // Vectorization info
  /// @brief Vectorization factor the masked builtin is created for.
  llvm::ElementCount VF;
  /// @brief Whether the masked builtin takes an extra vector-length operand.
  bool IsVectorPredicated = false;
};

/// @brief Check if the given function is a masked version of an atomic RMW
/// operation.
///
/// The packetizer uses this to recognise calls to masked atomic RMW builtins
/// so that they can be widened.
///
/// NOTE(review): presumably the description is recovered from the mangled
/// function name - confirm against the implementation.
///
/// @param[in] F The function to check
/// @return A MaskedAtomicRMW instance detailing the atomic operation if the
/// function is a masked atomic RMW, or std::nullopt otherwise
std::optional<MaskedAtomicRMW> isMaskedAtomicRMWFunction(
const llvm::Function &F) const;
/// @brief Get (if it exists already) or create the function representing the
/// masked version of an atomic RMW operation.
///
/// @param[in] I Description of the atomic RMW operation to be masked. Taken
/// by non-const reference. NOTE(review): the callee may update this
/// description (e.g. its VF) - confirm against the implementation.
/// @param[in] Choices Choices to mangle into the function name
/// @param[in] VF The vectorization factor of the atomic operation
/// @return The masked version of the function. Callers check the result for
/// null and treat that as a failure.
llvm::Function *getOrCreateMaskedAtomicRMWFunction(
MaskedAtomicRMW &I, const VectorizationChoices &Choices,
llvm::ElementCount VF);

/// @brief Create a VectorizationUnit to use to vectorize the given scalar
/// function.
///
/// The lifetime of the returned VectorizationUnit is managed by the
/// VectorizationContext.
///
/// @param[in] F Function to vectorize.
/// @param[in] Width VF vectorization factor to use.
/// @param[in] VF vectorization factor to use.
/// @param[in] Dimension SIMD dimension to use (0 => x, 1 => y, 2 => z).
/// @param[in] Ch Vectorization Choices for the vectorization.
VectorizationUnit *createVectorizationUnit(llvm::Function &F,
Expand Down Expand Up @@ -258,6 +293,14 @@ class VectorizationContext {
bool emitSubgroupScanBody(llvm::Function &F, bool IsInclusive,
llvm::RecurKind OpKind, bool IsVP) const;

/// @brief Emit the body for a masked atomic builtin
///
/// NOTE(review): per the commit description, the emitted body is expected to
/// loop over the vectorized arguments and conditionally perform the atomic
/// operation for each element in sequence, depending on the mask - confirm
/// against the implementation.
///
/// @param[in] F The empty (declaration only) function to emit the body in
/// @param[in] MA The MaskedAtomicRMW information
/// @returns true on success, false otherwise
bool emitMaskedAtomicRMWBody(llvm::Function &F,
const MaskedAtomicRMW &MA) const;

/// @brief Helper for non-vectorization tasks.
TargetInfo &VTI;
/// @brief Module in which the vectorization happens.
Expand Down
16 changes: 15 additions & 1 deletion modules/compiler/vecz/source/include/vectorization_helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,25 @@ class VectorizationChoices;
/// @param[in] ScalarName Name of the original function.
/// @param[in] VF vectorization factor of the vectorized function.
/// @param[in] Choices choices used for vectorization
/// @param[in] IsBuiltin True if this is an internal builtin.
///
/// @return Name for the vectorized function.
std::string getVectorizedFunctionName(llvm::StringRef ScalarName,
llvm::ElementCount VF,
VectorizationChoices Choices);
VectorizationChoices Choices,
bool IsBuiltin = false);

/// @brief Parses a name generated for a vectorized function
///
/// This is the counterpart of getVectorizedFunctionName: it takes a name in
/// the format that function produces and recovers the components it was
/// built from.
///
/// @see getVectorizedFunctionName.
///
/// @param[in] Name Name of the vectorized function.
///
/// @return A tuple containing, in order, the original name of the function,
/// the element count, and the choices it was encoded with. Returns
/// std::nullopt on failure.
std::optional<std::tuple<std::string, llvm::ElementCount, VectorizationChoices>>
decodeVectorizedFunctionName(llvm::StringRef Name);

/// @brief Clone the scalar function's body into the function to vectorize,
/// vectorizing function argument types where required.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,13 @@
#include <llvm/Analysis/ValueTracking.h>
#include <llvm/IR/BasicBlock.h>
#include <llvm/IR/CFG.h>
#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/Dominators.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Instructions.h>
#include <llvm/Support/Debug.h>
#include <llvm/Support/Error.h>
#include <llvm/Support/TypeSize.h>
#include <llvm/Support/raw_ostream.h>

#include <queue>
Expand Down Expand Up @@ -211,6 +214,16 @@ class ControlFlowConversionState::Impl : public ControlFlowConversionState {
/// @return true if it is valid to mask this call, false otherwise
bool applyMaskToCall(CallInst *CI, Value *mask, DeletionMap &toDelete);

/// @brief Attempt to apply a mask to an AtomicRMW instruction via a builtin
/// call.
///
/// On success, all uses of the atomic instruction are replaced with a call to
/// a masked atomic RMW builtin, and the original instruction is recorded in
/// the deletion map rather than erased immediately.
///
/// @param[in] atomicI The atomic instruction to apply the mask to
/// @param[in] mask The mask to apply to the masked atomic
/// @param[out] toDelete mapping of deleted unmasked operations
/// @return true if it is valid to mask this atomic, false otherwise
bool applyMaskToAtomicRMW(AtomicRMWInst &atomicI, Value *mask,
DeletionMap &toDelete);

/// @brief Linearize a CFG.
/// @return true if no problem occurred, false otherwise.
bool partiallyLinearizeCFG();
Expand Down Expand Up @@ -1124,9 +1137,12 @@ Error ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) {
return makeStringError("Could not apply mask to call instruction", I);
}
} else if (I.isAtomic() && !isa<FenceInst>(&I)) {
// We need to apply masks to atomic functions, but it is currently not
// implemented. See CA-3294.
return makeStringError("Could not apply mask to atomic instruction", I);
// Turn atomics into calls to masked builtins if possible.
// FIXME: We don't yet support masked cmpxchg instructions.
if (auto *atomicI = dyn_cast<AtomicRMWInst>(&I);
!atomicI || !applyMaskToAtomicRMW(*atomicI, mask, toDelete)) {
return makeStringError("Could not apply mask to atomic instruction", I);
}
} else if (auto *branch = dyn_cast<BranchInst>(&I)) {
// We have to be careful with infinite loops, because if they exist on a
// divergent code path, they will always be entered and will hang the
Expand Down Expand Up @@ -1356,6 +1372,45 @@ bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
return true;
}

/// @brief Replace an atomic RMW instruction with a call to a masked builtin.
///
/// The properties of the atomic are gathered into a MaskedAtomicRMW
/// description, the matching masked builtin is requested from the
/// vectorization context, and a call to it is inserted before the atomic. The
/// atomic's uses are redirected to the call and the atomic itself is queued
/// for deletion.
///
/// @param[in] atomicI The atomic instruction to apply the mask to
/// @param[in] mask The mask to apply to the masked atomic
/// @param[out] toDelete mapping of deleted unmasked operations
/// @return true on success, false if the masked builtin or the call to it
/// could not be created
bool ControlFlowConversionState::Impl::applyMaskToAtomicRMW(
    AtomicRMWInst &atomicI, Value *mask, DeletionMap &toDelete) {
  LLVM_DEBUG(dbgs() << "vecz-cf: Now at AtomicRMWInst " << atomicI << "\n");

  Value *const pointerOp = atomicI.getPointerOperand();
  Value *const valueOp = atomicI.getValOperand();
  // The masked builtin is requested at a fixed vectorization factor of one
  // here; it is the packetizer that later widens the call.
  auto const scalarVF = ElementCount::getFixed(1);

  // Describe the atomic operation we want a masked builtin for.
  VectorizationContext::MaskedAtomicRMW info;
  info.PointerTy = pointerOp->getType();
  info.ValTy = atomicI.getType();
  info.BinOp = atomicI.getOperation();
  info.Align = atomicI.getAlign();
  info.IsVolatile = atomicI.isVolatile();
  info.SyncScope = atomicI.getSyncScopeID();
  info.Ordering = atomicI.getOrdering();
  info.VF = scalarVF;
  info.IsVectorPredicated = VU.choices().vectorPredication();

  // Look up (or create) the masked builtin matching that description.
  Function *const maskedFn =
      Ctx.getOrCreateMaskedAtomicRMWFunction(info, VU.choices(), scalarVF);
  VECZ_FAIL_IF(!maskedFn);

  // The builtin takes the atomic's operands followed by the mask.
  SmallVector<Value *, 8> callArgs;
  callArgs.push_back(pointerOp);
  callArgs.push_back(valueOp);
  callArgs.push_back(mask);
  if (info.IsVectorPredicated) {
    // We don't have a vector length just yet - pass in one as a dummy.
    auto *const i32Ty = IntegerType::getInt32Ty(atomicI.getContext());
    callArgs.push_back(ConstantInt::get(i32Ty, 1));
  }

  // Insert the call immediately before the atomic it stands in for.
  CallInst *const maskedCall =
      CallInst::Create(maskedFn, callArgs, "", &atomicI);
  VECZ_FAIL_IF(!maskedCall);

  // Redirect users to the call; the atomic itself is only erased later.
  atomicI.replaceAllUsesWith(maskedCall);
  toDelete.emplace_back(&atomicI, maskedCall);

  LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << atomicI << "\n");
  LLVM_DEBUG(dbgs() << "          with " << *maskedCall << "\n");

  return true;
}

bool ControlFlowConversionState::Impl::partiallyLinearizeCFG() {
// Two methods are possible to transform the divergent loops into uniform
// ones:
Expand Down
72 changes: 72 additions & 0 deletions modules/compiler/vecz/source/transform/packetizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
#include "memory_operations.h"
#include "transform/instantiation_pass.h"
#include "transform/packetization_helpers.h"
#include "vectorization_context.h"
#include "vectorization_unit.h"
#include "vecz/vecz_choices.h"
#include "vecz/vecz_target_info.h"
Expand Down Expand Up @@ -301,6 +302,14 @@ class Packetizer::Impl : public Packetizer {
///
/// @return Packetized instruction.
ValuePacket packetizeMemOp(MemOp &Op);
/// @brief Packetize a masked atomic RMW operation.
///
/// The pointer, value and mask operands of the call are packetized, and one
/// call to a widened masked atomic RMW builtin is emitted per packet element.
///
/// @param[in] CI Masked atomic RMW builtin call to packetize.
/// @param[in] AtomicInfo Information about the masked atomic RMW. Taken by
/// value, as the implementation adjusts its types for each packet element.
///
/// @return Packetized instruction. An empty packet is returned when
/// vector predication is requested with a packet width other than 1.
ValuePacket packetizeMaskedAtomicRMW(
CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo);
/// @brief Packetize a GEP instruction.
///
/// @param[in] GEP Instruction to packetize.
Expand Down Expand Up @@ -2093,6 +2102,9 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
return packetizeMemOp(*MaskedOp);
}
}
if (auto AtomicInfo = Ctx.isMaskedAtomicRMWFunction(*Callee)) {
return packetizeMaskedAtomicRMW(*CI, *AtomicInfo);
}
}

auto const Builtin = Ctx.builtins().analyzeBuiltin(*Callee);
Expand Down Expand Up @@ -2766,6 +2778,66 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
return results;
}

/// @brief Packetize a call to a masked atomic RMW builtin.
///
/// The value, pointer and mask operands of the call are packetized (in that
/// order), and for every element of the resulting packets a call to the
/// matching widened masked atomic RMW builtin is emitted. The original call is
/// scheduled for deletion.
///
/// @param[in] CI Masked atomic RMW builtin call to packetize.
/// @param[in] AtomicInfo Information about the masked atomic RMW; this local
/// copy has its value and pointer types updated for each packet element.
///
/// @return The packet of widened builtin calls, or an empty packet if vector
/// predication is active and the packet width is not 1.
ValuePacket Packetizer::Impl::packetizeMaskedAtomicRMW(
    CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo) {
  ValuePacket widenedCalls;

  // Operand layout of the masked builtin: pointer, value, mask.
  Value *const scalarPtr = CI.getArgOperand(0);
  Value *const scalarVal = CI.getArgOperand(1);
  Value *const scalarMask = CI.getArgOperand(2);

  assert(AtomicInfo.ValTy == scalarVal->getType() && "AtomicInfo mismatch");
  auto const width = getPacketWidthForType(scalarVal->getType());

  // A vector length can only be applied to a single-element packet.
  if (VL && width != 1) {
    emitVeczRemarkMissed(&F, &CI,
                         "Can not vector-predicate packets larger than 1");
    return {};
  }

  // Packetize the value operand first...
  ValuePacket vals;
  Result packedVal = packetize(scalarVal);
  PACK_FAIL_IF(!packedVal);
  packedVal.getPacketValues(width, vals);
  PACK_FAIL_IF(vals.empty());

  // ...then the pointer operand...
  ValuePacket ptrs;
  Result packedPtr = packetize(scalarPtr);
  PACK_FAIL_IF(!packedPtr);
  packedPtr.getPacketValues(width, ptrs);
  PACK_FAIL_IF(ptrs.empty());

  // ...and finally the mask operand.
  ValuePacket masks;
  Result packedMask = packetize(scalarMask);
  PACK_FAIL_IF(!packedMask);
  packedMask.getPacketValues(width, masks);
  PACK_FAIL_IF(masks.empty());

  // New calls are inserted before the original, which is removed later on.
  IRBuilder<> B(&CI);
  IC.deleteInstructionLater(&CI);

  for (unsigned idx = 0; idx < width; ++idx) {
    Value *const widePtr = ptrs[idx];
    Value *const wideVal = vals[idx];

    // Request the builtin matching the widened operand types.
    AtomicInfo.ValTy = wideVal->getType();
    AtomicInfo.PointerTy = widePtr->getType();
    auto *const wideFn =
        Ctx.getOrCreateMaskedAtomicRMWFunction(AtomicInfo, Choices, SimdWidth);
    PACK_FAIL_IF(!wideFn);

    SmallVector<Value *, 4> callArgs;
    callArgs.push_back(widePtr);
    callArgs.push_back(wideVal);
    callArgs.push_back(masks[idx]);
    // Vector-predicated builtins take the vector length as a trailing operand.
    if (AtomicInfo.IsVectorPredicated) {
      assert(VL && "Missing vector length");
      callArgs.push_back(VL);
    }

    widenedCalls.push_back(B.CreateCall(wideFn, callArgs));
  }

  return widenedCalls;
}

void Packetizer::Impl::vectorizeDI(Instruction *, Value *) {
// FIXME: Reinstate support for vectorizing debug info
return;
Expand Down
Loading

0 comments on commit f188fe5

Please sign in to comment.