[vecz] Add support for masking atomic RMW instructions

This commit allows the vectorizer to vectorize kernels in which there are atomic RMW instructions that need to be masked for control-flow purposes: in a divergent if/else, in a loop, etc. It follows a fairly simple paradigm - similar to how we mask loads and stores - involving: * Control-flow conversion replacing the atomic with a call to an 'internal' vecz builtin * The packetizer widening this builtin, and replacing the call with another call (with packetized arguments) * The post-vectorization `DefineBuiltinsPass` running and providing function bodies for these masked atomic builtins The builtins themselves are simply loops over the entire vectorized arguments, conditionally doing an atomic operation one by one in sequence depending on the mask. This should be correct, even though the whole vector atomic operation is not performed at once, since it is undefined how work-items run in parallel and which work-items would "win" if there were any contention in the atomic memory addresses. Note that this is also essentially how plain atomics are vectorized: by scalarizing them. There isn't yet support for the atomic cmpxchg instructions - those will be done separately.
uxlfoundation · Dec 18, 2023 · f188fe5 · f188fe5
1 parent 65e6dfd
commit f188fe5
Show file tree

Hide file tree

Showing 11 changed files with 695 additions and 160 deletions.
diff --git a/modules/compiler/vecz/source/include/vectorization_context.h b/modules/compiler/vecz/source/include/vectorization_context.h
@@ -23,8 +23,11 @@
 
 #include <llvm/ADT/DenseMap.h>
 #include <llvm/Analysis/IVDescriptors.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/LLVMContext.h>
 #include <llvm/IR/PassManager.h>
 #include <llvm/IR/ValueHandle.h>
+#include <llvm/Support/AtomicOrdering.h>
 #include <llvm/Support/TypeSize.h>
 #include <llvm/Transforms/Utils/ValueMapper.h>
 #include <multi_llvm/multi_llvm.h>
@@ -150,14 +153,46 @@ class VectorizationContext {
   /// @return The masked version of the function
   llvm::Function *getOrCreateMaskedFunction(llvm::CallInst *CI);
 
+  /// @brief Describes a masked atomic RMW operation: the properties of the
+  /// underlying atomic instruction plus how it is vectorized.
+  ///
+  /// Every field has a default so that a default-constructed instance never
+  /// holds indeterminate values.
+  struct MaskedAtomicRMW {
+    /// @brief Type of the pointer operand.
+    llvm::Type *PointerTy = nullptr;
+    /// @brief Type of the value operand.
+    llvm::Type *ValTy = nullptr;
+    /// @brief The read-modify-write operation; BAD_BINOP until set.
+    llvm::AtomicRMWInst::BinOp BinOp = llvm::AtomicRMWInst::BAD_BINOP;
+    /// @brief Alignment of the atomic access.
+    llvm::Align Align;
+    /// @brief Whether the atomic operation is volatile.
+    bool IsVolatile = false;
+    /// @brief Synchronization scope of the atomic operation.
+    llvm::SyncScope::ID SyncScope = llvm::SyncScope::System;
+    /// @brief Memory ordering of the atomic; NotAtomic until set.
+    llvm::AtomicOrdering Ordering = llvm::AtomicOrdering::NotAtomic;
+    // Vectorization info
+    /// @brief The vectorization factor of the operation.
+    llvm::ElementCount VF;
+    /// @brief Whether the operation is vector-predicated.
+    bool IsVectorPredicated = false;
+  };
+
+  /// @brief Check if the given function is a masked version of an atomic RMW
+  /// operation.
+  ///
+  /// @param[in] F The function to check
+  /// @return A MaskedAtomicRMW instance detailing the atomic operation if the
+  /// function is a masked atomic RMW, or std::nullopt otherwise
+  std::optional<MaskedAtomicRMW> isMaskedAtomicRMWFunction(
+      const llvm::Function &F) const;
+  /// @brief Get (if it already exists) or create the function representing the
+  /// masked version of an atomic RMW operation.
+  ///
+  /// @param[in] I Description of the atomic RMW operation to be masked
+  /// @param[in] Choices Choices to mangle into the function name
+  /// @param[in] VF The vectorization factor of the atomic operation
+  /// @return The masked atomic RMW function (callers treat null as failure)
+  llvm::Function *getOrCreateMaskedAtomicRMWFunction(
+      MaskedAtomicRMW &I, const VectorizationChoices &Choices,
+      llvm::ElementCount VF);
+
   /// @brief Create a VectorizationUnit to use to vectorize the given scalar
   /// function.
   ///
   /// The lifetime of the returned VectorizationUnit is managed by the
   /// VectorizationContext.
   ///
   /// @param[in] F Function to vectorize.
-  /// @param[in] Width VF vectorization factor to use.
+  /// @param[in] VF vectorization factor to use.
   /// @param[in] Dimension SIMD dimension to use (0 => x, 1 => y, 2 => z).
   /// @param[in] Ch Vectorization Choices for the vectorization.
   VectorizationUnit *createVectorizationUnit(llvm::Function &F,
@@ -258,6 +293,14 @@ class VectorizationContext {
   bool emitSubgroupScanBody(llvm::Function &F, bool IsInclusive,
                             llvm::RecurKind OpKind, bool IsVP) const;
 
+  /// @brief Emit the body for a masked atomic builtin
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] MA The MaskedAtomicRMW information
+  /// @returns true on success, false otherwise
+  bool emitMaskedAtomicRMWBody(llvm::Function &F,
+                               const MaskedAtomicRMW &MA) const;
+
   /// @brief Helper for non-vectorization tasks.
   TargetInfo &VTI;
   /// @brief Module in which the vectorization happens.

diff --git a/modules/compiler/vecz/source/include/vectorization_helpers.h b/modules/compiler/vecz/source/include/vectorization_helpers.h
@@ -36,11 +36,25 @@ class VectorizationChoices;
 /// @param[in] ScalarName Name of the original function.
 /// @param[in] VF vectorization factor of the vectorized function.
 /// @param[in] Choices choices used for vectorization
+/// @param[in] IsBuiltin True if this is an internal builtin.
 ///
 /// @return Name for the vectorized function.
 std::string getVectorizedFunctionName(llvm::StringRef ScalarName,
                                       llvm::ElementCount VF,
-                                      VectorizationChoices Choices);
+                                      VectorizationChoices Choices,
+                                      bool IsBuiltin = false);
+
+/// @brief Parses a name generated for a vectorized function
+///
+/// @see getVectorizedFunctionName.
+///
+/// @param[in] Name Name of the vectorized function.
+///
+/// @return A tuple containing the original name of the function, and the
+/// element count and choices it was encoded with. Returns std::nullopt on
+/// failure.
+std::optional<std::tuple<std::string, llvm::ElementCount, VectorizationChoices>>
+decodeVectorizedFunctionName(llvm::StringRef Name);
 
 /// @brief Clone the scalar function's body into the function to vectorize,
 /// vectorizing function argument types where required.

diff --git a/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp b/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -27,10 +27,13 @@
 #include <llvm/Analysis/ValueTracking.h>
 #include <llvm/IR/BasicBlock.h>
 #include <llvm/IR/CFG.h>
+#include <llvm/IR/DerivedTypes.h>
 #include <llvm/IR/Dominators.h>
 #include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/Error.h>
+#include <llvm/Support/TypeSize.h>
 #include <llvm/Support/raw_ostream.h>
 
 #include <queue>
@@ -211,6 +214,16 @@ class ControlFlowConversionState::Impl : public ControlFlowConversionState {
   /// @return true if it is valid to mask this call, false otherwise
   bool applyMaskToCall(CallInst *CI, Value *mask, DeletionMap &toDelete);
 
+  /// @brief Attempt to apply a mask to an AtomicRMW instruction via a builtin
+  /// call.
+  ///
+  /// @param[in] atomicI The atomic instruction to apply the mask to
+  /// @param[in] mask The mask to apply to the masked atomic
+  /// @param[out] toDelete mapping of deleted unmasked operations
+  /// @return true if it is valid to mask this atomic, false otherwise
+  bool applyMaskToAtomicRMW(AtomicRMWInst &atomicI, Value *mask,
+                            DeletionMap &toDelete);
+
   /// @brief Linearize a CFG.
   /// @return true if no problem occurred, false otherwise.
   bool partiallyLinearizeCFG();
@@ -1124,9 +1137,12 @@ Error ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) {
         return makeStringError("Could not apply mask to call instruction", I);
       }
     } else if (I.isAtomic() && !isa<FenceInst>(&I)) {
-      // We need to apply masks to atomic functions, but it is currently not
-      // implemented. See CA-3294.
-      return makeStringError("Could not apply mask to atomic instruction", I);
+      // Turn atomics into calls to masked builtins if possible.
+      // FIXME: We don't yet support masked cmpxchg instructions.
+      if (auto *atomicI = dyn_cast<AtomicRMWInst>(&I);
+          !atomicI || !applyMaskToAtomicRMW(*atomicI, mask, toDelete)) {
+        return makeStringError("Could not apply mask to atomic instruction", I);
+      }
     } else if (auto *branch = dyn_cast<BranchInst>(&I)) {
       // We have to be careful with infinite loops, because if they exist on a
       // divergent code path, they will always be entered and will hang the
@@ -1356,6 +1372,45 @@ bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
   return true;
 }
 
+// Replaces an atomic RMW instruction with a call to the equivalent masked
+// atomic builtin, passing the given mask. The original instruction is recorded
+// in toDelete along with its replacement call. Returns false if the builtin or
+// the call could not be created.
+bool ControlFlowConversionState::Impl::applyMaskToAtomicRMW(
+    AtomicRMWInst &atomicI, Value *mask, DeletionMap &toDelete) {
+  LLVM_DEBUG(dbgs() << "vecz-cf: Now at AtomicRMWInst " << atomicI << "\n");
+
+  // The builtin is requested with a vectorization factor of one here.
+  auto const unitVF = ElementCount::getFixed(1);
+  Value *const pointerOp = atomicI.getPointerOperand();
+
+  // Capture everything needed to describe the atomic operation.
+  VectorizationContext::MaskedAtomicRMW info;
+  info.PointerTy = pointerOp->getType();
+  info.ValTy = atomicI.getType();
+  info.BinOp = atomicI.getOperation();
+  info.Align = atomicI.getAlign();
+  info.IsVolatile = atomicI.isVolatile();
+  info.SyncScope = atomicI.getSyncScopeID();
+  info.Ordering = atomicI.getOrdering();
+  info.VF = unitVF;
+  info.IsVectorPredicated = VU.choices().vectorPredication();
+
+  // Look up (or declare) the masked builtin matching this atomic.
+  Function *maskedFn =
+      Ctx.getOrCreateMaskedAtomicRMWFunction(info, VU.choices(), unitVF);
+  VECZ_FAIL_IF(!maskedFn);
+
+  // Builtin operands, in order: pointer, value, mask[, vector length].
+  SmallVector<Value *, 8> callArgs;
+  callArgs.push_back(pointerOp);
+  callArgs.push_back(atomicI.getValOperand());
+  callArgs.push_back(mask);
+  if (info.IsVectorPredicated) {
+    // We don't have a vector length just yet - pass in one as a dummy.
+    auto *const i32Ty = IntegerType::getInt32Ty(atomicI.getContext());
+    callArgs.push_back(ConstantInt::get(i32Ty, 1));
+  }
+
+  // Insert the call immediately before the atomic it replaces.
+  CallInst *maskedCall = CallInst::Create(maskedFn, callArgs, "", &atomicI);
+  VECZ_FAIL_IF(!maskedCall);
+
+  atomicI.replaceAllUsesWith(maskedCall);
+  toDelete.emplace_back(&atomicI, maskedCall);
+
+  LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << atomicI << "\n");
+  LLVM_DEBUG(dbgs() << "          with " << *maskedCall << "\n");
+
+  return true;
+}
+
 bool ControlFlowConversionState::Impl::partiallyLinearizeCFG() {
   // Two methods are possible to transform the divergent loops into uniform
   // ones:

diff --git a/modules/compiler/vecz/source/transform/packetizer.cpp b/modules/compiler/vecz/source/transform/packetizer.cpp
@@ -55,6 +55,7 @@
 #include "memory_operations.h"
 #include "transform/instantiation_pass.h"
 #include "transform/packetization_helpers.h"
+#include "vectorization_context.h"
 #include "vectorization_unit.h"
 #include "vecz/vecz_choices.h"
 #include "vecz/vecz_target_info.h"
@@ -301,6 +302,14 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized instruction.
   ValuePacket packetizeMemOp(MemOp &Op);
+  /// @brief Packetize a masked atomic RMW operation.
+  ///
+  /// @param[in] CI Masked atomic RMW builtin call to packetize.
+  /// @param[in] AtomicInfo Information about the masked atomic RMW.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeMaskedAtomicRMW(
+      CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo);
   /// @brief Packetize a GEP instruction.
   ///
   /// @param[in] GEP Instruction to packetize.
@@ -2093,6 +2102,9 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
         return packetizeMemOp(*MaskedOp);
       }
     }
+    if (auto AtomicInfo = Ctx.isMaskedAtomicRMWFunction(*Callee)) {
+      return packetizeMaskedAtomicRMW(*CI, *AtomicInfo);
+    }
   }
 
   auto const Builtin = Ctx.builtins().analyzeBuiltin(*Callee);
@@ -2766,6 +2778,66 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
   return results;
 }
 
+// Packetizes a call to a masked atomic RMW builtin: each operand (value,
+// pointer, mask) is packetized, then one call to the widened builtin is
+// emitted per packet element. Returns an empty packet on failure.
+ValuePacket Packetizer::Impl::packetizeMaskedAtomicRMW(
+    CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo) {
+  ValuePacket widenedCalls;
+
+  // Builtin operands, in order: pointer, value, mask.
+  Value *const val = CI.getArgOperand(1);
+  assert(AtomicInfo.ValTy == val->getType() && "AtomicInfo mismatch");
+  auto const numParts = getPacketWidthForType(val->getType());
+
+  if (VL && numParts != 1) {
+    emitVeczRemarkMissed(&F, &CI,
+                         "Can not vector-predicate packets larger than 1");
+    return {};
+  }
+
+  // Operands are packetized in the order: value, pointer, mask.
+  ValuePacket valParts;
+  Result valRes = packetize(val);
+  PACK_FAIL_IF(!valRes);
+  valRes.getPacketValues(numParts, valParts);
+  PACK_FAIL_IF(valParts.empty());
+
+  ValuePacket ptrParts;
+  Result ptrRes = packetize(CI.getArgOperand(0));
+  PACK_FAIL_IF(!ptrRes);
+  ptrRes.getPacketValues(numParts, ptrParts);
+  PACK_FAIL_IF(ptrParts.empty());
+
+  ValuePacket maskParts;
+  Result maskRes = packetize(CI.getArgOperand(2));
+  PACK_FAIL_IF(!maskRes);
+  maskRes.getPacketValues(numParts, maskParts);
+  PACK_FAIL_IF(maskParts.empty());
+
+  // New calls are inserted before the original call, which is scheduled for
+  // later deletion.
+  IRBuilder<> builder(&CI);
+  IC.deleteInstructionLater(&CI);
+
+  for (unsigned part = 0; part < numParts; ++part) {
+    auto *const ptrPart = ptrParts[part];
+    auto *const valPart = valParts[part];
+
+    // The widened builtin is looked up using the packetized operand types.
+    AtomicInfo.PointerTy = ptrPart->getType();
+    AtomicInfo.ValTy = valPart->getType();
+    auto *widenedFn =
+        Ctx.getOrCreateMaskedAtomicRMWFunction(AtomicInfo, Choices, SimdWidth);
+    PACK_FAIL_IF(!widenedFn);
+
+    SmallVector<Value *, 4> callArgs = {ptrPart, valPart, maskParts[part]};
+    if (AtomicInfo.IsVectorPredicated) {
+      assert(VL && "Missing vector length");
+      callArgs.push_back(VL);
+    }
+
+    widenedCalls.push_back(builder.CreateCall(widenedFn, callArgs));
+  }
+
+  return widenedCalls;
+}
+
 void Packetizer::Impl::vectorizeDI(Instruction *, Value *) {
   // FIXME: Reinstate support for vectorizing debug info
   return;