Merge pull request #264 from frasercrmck/vecz-masked-atomics

[vecz] Add support for masking atomic RMW instructions
uxlfoundation · Dec 19, 2023 · 1d872df · 1d872df
2 parents 51b4ae8 + f188fe5
commit 1d872df
Show file tree

Hide file tree

Showing 11 changed files with 695 additions and 160 deletions.
diff --git a/modules/compiler/vecz/source/include/vectorization_context.h b/modules/compiler/vecz/source/include/vectorization_context.h
@@ -23,8 +23,11 @@
 
 #include <llvm/ADT/DenseMap.h>
 #include <llvm/Analysis/IVDescriptors.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/LLVMContext.h>
 #include <llvm/IR/PassManager.h>
 #include <llvm/IR/ValueHandle.h>
+#include <llvm/Support/AtomicOrdering.h>
 #include <llvm/Support/TypeSize.h>
 #include <llvm/Transforms/Utils/ValueMapper.h>
 #include <multi_llvm/multi_llvm.h>
@@ -150,14 +153,46 @@ class VectorizationContext {
   /// @return The masked version of the function
   llvm::Function *getOrCreateMaskedFunction(llvm::CallInst *CI);
 
+  /// @brief Description of an atomic RMW operation and of its vectorization.
+  struct MaskedAtomicRMW {
+    llvm::Type *PointerTy = nullptr;  ///< Type of the pointer operand.
+    llvm::Type *ValTy = nullptr;      ///< Type of the value operand.
+    llvm::AtomicRMWInst::BinOp BinOp = llvm::AtomicRMWInst::BAD_BINOP;
+    llvm::Align Align;
+    bool IsVolatile = false;
+    llvm::SyncScope::ID SyncScope = llvm::SyncScope::System;
+    llvm::AtomicOrdering Ordering = llvm::AtomicOrdering::NotAtomic;
+    llvm::ElementCount VF;            ///< Vectorization factor.
+    bool IsVectorPredicated = false;  ///< Whether a vector length is passed.
+  };
+
+  /// @brief Check if the given function is a masked version of an atomic RMW
+  /// operation.
+  ///
+  /// @param[in] F The function to check
+  /// @return A MaskedAtomicRMW instance describing the atomic operation if F
+  /// is a masked atomic RMW function, or std::nullopt otherwise
+  std::optional<MaskedAtomicRMW> isMaskedAtomicRMWFunction(
+      const llvm::Function &F) const;
+  /// @brief Get or create the function implementing a masked atomic RMW.
+  ///
+  /// @param[in] I Description of the atomic RMW operation to be masked.
+  /// NOTE(review): passed by non-const reference - confirm if it is modified.
+  /// @param[in] Choices Choices to mangle into the function name
+  /// @param[in] VF The vectorization factor of the atomic operation
+  /// @return The function implementing the masked atomic RMW operation
+  llvm::Function *getOrCreateMaskedAtomicRMWFunction(
+      MaskedAtomicRMW &I, const VectorizationChoices &Choices,
+      llvm::ElementCount VF);
+
   /// @brief Create a VectorizationUnit to use to vectorize the given scalar
   /// function.
   ///
   /// The lifetime of the returned VectorizationUnit is managed by the
   /// VectorizationContext.
   ///
   /// @param[in] F Function to vectorize.
-  /// @param[in] Width VF vectorization factor to use.
+  /// @param[in] VF vectorization factor to use.
   /// @param[in] Dimension SIMD dimension to use (0 => x, 1 => y, 2 => z).
   /// @param[in] Ch Vectorization Choices for the vectorization.
   VectorizationUnit *createVectorizationUnit(llvm::Function &F,
@@ -258,6 +293,14 @@ class VectorizationContext {
   bool emitSubgroupScanBody(llvm::Function &F, bool IsInclusive,
                             llvm::RecurKind OpKind, bool IsVP) const;
 
+  /// @brief Emit the body for a masked atomic RMW builtin
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] MA The MaskedAtomicRMW information describing the operation
+  /// @return true on success, false otherwise
+  bool emitMaskedAtomicRMWBody(llvm::Function &F,
+                               const MaskedAtomicRMW &MA) const;
+
   /// @brief Helper for non-vectorization tasks.
   TargetInfo &VTI;
   /// @brief Module in which the vectorization happens.

diff --git a/modules/compiler/vecz/source/include/vectorization_helpers.h b/modules/compiler/vecz/source/include/vectorization_helpers.h
@@ -36,11 +36,25 @@ class VectorizationChoices;
 /// @param[in] ScalarName Name of the original function.
 /// @param[in] VF vectorization factor of the vectorized function.
 /// @param[in] Choices choices used for vectorization
+/// @param[in] IsBuiltin True if this is an internal builtin.
 ///
 /// @return Name for the vectorized function.
 std::string getVectorizedFunctionName(llvm::StringRef ScalarName,
                                       llvm::ElementCount VF,
-                                      VectorizationChoices Choices);
+                                      VectorizationChoices Choices,
+                                      bool IsBuiltin = false);
+
+/// @brief Parses a name generated for a vectorized function, recovering the
+/// components it was built from.
+///
+/// @see getVectorizedFunctionName.
+///
+/// @param[in] Name Name of the vectorized function.
+///
+/// @return A tuple of the original function name, the element count and the
+/// choices the name was encoded with, or std::nullopt on failure.
+std::optional<std::tuple<std::string, llvm::ElementCount, VectorizationChoices>>
+decodeVectorizedFunctionName(llvm::StringRef Name);
 
 /// @brief Clone the scalar function's body into the function to vectorize,
 /// vectorizing function argument types where required.

diff --git a/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp b/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -27,10 +27,13 @@
 #include <llvm/Analysis/ValueTracking.h>
 #include <llvm/IR/BasicBlock.h>
 #include <llvm/IR/CFG.h>
+#include <llvm/IR/DerivedTypes.h>
 #include <llvm/IR/Dominators.h>
 #include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/Error.h>
+#include <llvm/Support/TypeSize.h>
 #include <llvm/Support/raw_ostream.h>
 
 #include <queue>
@@ -211,6 +214,16 @@ class ControlFlowConversionState::Impl : public ControlFlowConversionState {
   /// @return true if it is valid to mask this call, false otherwise
   bool applyMaskToCall(CallInst *CI, Value *mask, DeletionMap &toDelete);
 
+  /// @brief Attempt to apply a mask to an AtomicRMW instruction by replacing
+  /// it with a call to a masked builtin.
+  ///
+  /// @param[in] atomicI The atomic instruction to apply the mask to
+  /// @param[in] mask The mask to apply to the masked atomic
+  /// @param[in,out] toDelete Gains the (atomic, replacement call) pair
+  /// @return true if the atomic was replaced by a masked call, false otherwise
+  bool applyMaskToAtomicRMW(AtomicRMWInst &atomicI, Value *mask,
+                            DeletionMap &toDelete);
+
   /// @brief Linearize a CFG.
   /// @return true if no problem occurred, false otherwise.
   bool partiallyLinearizeCFG();
@@ -1124,9 +1137,12 @@ Error ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) {
         return makeStringError("Could not apply mask to call instruction", I);
       }
     } else if (I.isAtomic() && !isa<FenceInst>(&I)) {
-      // We need to apply masks to atomic functions, but it is currently not
-      // implemented. See CA-3294.
-      return makeStringError("Could not apply mask to atomic instruction", I);
+      // Turn atomics into calls to masked builtins if possible.
+      // FIXME: We don't yet support masked cmpxchg instructions.
+      if (auto *atomicI = dyn_cast<AtomicRMWInst>(&I);
+          !atomicI || !applyMaskToAtomicRMW(*atomicI, mask, toDelete)) {
+        return makeStringError("Could not apply mask to atomic instruction", I);
+      }
     } else if (auto *branch = dyn_cast<BranchInst>(&I)) {
       // We have to be careful with infinite loops, because if they exist on a
       // divergent code path, they will always be entered and will hang the
@@ -1356,6 +1372,45 @@ bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
   return true;
 }
 
+bool ControlFlowConversionState::Impl::applyMaskToAtomicRMW(
+    AtomicRMWInst &atomicI, Value *mask, DeletionMap &toDelete) {
+  LLVM_DEBUG(dbgs() << "vecz-cf: Now at AtomicRMWInst " << atomicI << "\n");
+
+  auto const scalarVF = ElementCount::getFixed(1);
+  Value *const ptrOp = atomicI.getPointerOperand();
+  Value *const valOp = atomicI.getValOperand();
+  // Describe the atomic, with a vectorization factor of one at this stage.
+  VectorizationContext::MaskedAtomicRMW info;
+  info.PointerTy = ptrOp->getType();
+  info.ValTy = atomicI.getType();
+  info.BinOp = atomicI.getOperation();
+  info.Align = atomicI.getAlign();
+  info.IsVolatile = atomicI.isVolatile();
+  info.SyncScope = atomicI.getSyncScopeID();
+  info.Ordering = atomicI.getOrdering();
+  info.VF = scalarVF;
+  info.IsVectorPredicated = VU.choices().vectorPredication();
+  // Get (or create) the masked builtin matching this description.
+  Function *const maskedFn =
+      Ctx.getOrCreateMaskedAtomicRMWFunction(info, VU.choices(), scalarVF);
+  VECZ_FAIL_IF(!maskedFn);
+
+  // We don't have a vector length just yet - pass in one as a dummy.
+  SmallVector<Value *, 8> callArgs = {ptrOp, valOp, mask};
+  if (info.IsVectorPredicated) {
+    auto *const i32Ty = IntegerType::getInt32Ty(atomicI.getContext());
+    callArgs.push_back(ConstantInt::get(i32Ty, 1));
+  }
+  auto *const maskedCall = CallInst::Create(maskedFn, callArgs, "", &atomicI);
+  VECZ_FAIL_IF(!maskedCall);
+  // Users now read the call's result; record the atomic as replaced.
+  atomicI.replaceAllUsesWith(maskedCall);
+  toDelete.emplace_back(&atomicI, maskedCall);
+  LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << atomicI << "\n");
+  LLVM_DEBUG(dbgs() << "          with " << *maskedCall << "\n");
+  return true;
+}
+
 bool ControlFlowConversionState::Impl::partiallyLinearizeCFG() {
   // Two methods are possible to transform the divergent loops into uniform
   // ones:

diff --git a/modules/compiler/vecz/source/transform/packetizer.cpp b/modules/compiler/vecz/source/transform/packetizer.cpp
@@ -55,6 +55,7 @@
 #include "memory_operations.h"
 #include "transform/instantiation_pass.h"
 #include "transform/packetization_helpers.h"
+#include "vectorization_context.h"
 #include "vectorization_unit.h"
 #include "vecz/vecz_choices.h"
 #include "vecz/vecz_target_info.h"
@@ -301,6 +302,14 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized instruction.
   ValuePacket packetizeMemOp(MemOp &Op);
+  /// @brief Packetize a masked atomic RMW operation.
+  ///
+  /// @param[in] CI Masked atomic RMW builtin call to packetize.
+  /// @param[in] AtomicInfo Information about the masked atomic RMW. Taken by
+  /// value: its pointer and value types are overwritten for each sub-packet.
+  /// @return Packetized instruction.
+  ValuePacket packetizeMaskedAtomicRMW(
+      CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo);
   /// @brief Packetize a GEP instruction.
   ///
   /// @param[in] GEP Instruction to packetize.
@@ -2093,6 +2102,9 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
         return packetizeMemOp(*MaskedOp);
       }
     }
+    if (auto AtomicInfo = Ctx.isMaskedAtomicRMWFunction(*Callee)) {
+      return packetizeMaskedAtomicRMW(*CI, *AtomicInfo);
+    }
   }
 
   auto const Builtin = Ctx.builtins().analyzeBuiltin(*Callee);
@@ -2766,6 +2778,66 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
   return results;
 }
 
+ValuePacket Packetizer::Impl::packetizeMaskedAtomicRMW(
+    CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo) {
+  ValuePacket results;
+  // The masked atomic builtin takes the pointer, the value, then the mask.
+  Value *const ptr = CI.getArgOperand(0);
+  Value *const val = CI.getArgOperand(1);
+  Value *const mask = CI.getArgOperand(2);
+  // The packet width is chosen from the type of the value operand.
+  assert(AtomicInfo.ValTy == val->getType() && "AtomicInfo mismatch");
+  auto const packetWidth = getPacketWidthForType(val->getType());
+  // A vector length (VL) is only supported when there is a single packet.
+  if (VL && packetWidth != 1) {
+    emitVeczRemarkMissed(&F, &CI,
+                         "Can not vector-predicate packets larger than 1");
+    return {};
+  }
+  // Packetize the value, pointer and mask operands in turn.
+  ValuePacket valPacket;
+  Result valResult = packetize(val);
+  PACK_FAIL_IF(!valResult);
+  valResult.getPacketValues(packetWidth, valPacket);
+  PACK_FAIL_IF(valPacket.empty());
+
+  ValuePacket ptrPacket;
+  Result ptrResult = packetize(ptr);
+  PACK_FAIL_IF(!ptrResult);
+  ptrResult.getPacketValues(packetWidth, ptrPacket);
+  PACK_FAIL_IF(ptrPacket.empty());
+
+  ValuePacket maskPacket;
+  Result maskResult = packetize(mask);
+  PACK_FAIL_IF(!maskResult);
+  maskResult.getPacketValues(packetWidth, maskPacket);
+  PACK_FAIL_IF(maskPacket.empty());
+  // Emit the replacement calls before CI; CI is registered for later deletion.
+  IRBuilder<> B(&CI);
+  IC.deleteInstructionLater(&CI);
+  // One masked atomic call is created per sub-packet.
+  for (unsigned i = 0; i != packetWidth; ++i) {
+    auto *const ptrI = ptrPacket[i];
+    auto *const valI = valPacket[i];
+    // The builtin is looked up using the types of this sub-packet's operands.
+    AtomicInfo.ValTy = valI->getType();
+    AtomicInfo.PointerTy = ptrI->getType();
+    auto *maskedAtomicF =
+        Ctx.getOrCreateMaskedAtomicRMWFunction(AtomicInfo, Choices, SimdWidth);
+    PACK_FAIL_IF(!maskedAtomicF);
+    // When vector-predicated, the vector length is the trailing argument.
+    SmallVector<Value *, 4> args = {ptrI, valI, maskPacket[i]};
+    if (AtomicInfo.IsVectorPredicated) {
+      assert(VL && "Missing vector length");
+      args.push_back(VL);
+    }
+
+    results.push_back(B.CreateCall(maskedAtomicF, args));
+  }
+
+  return results;
+}
+
 void Packetizer::Impl::vectorizeDI(Instruction *, Value *) {
   // FIXME: Reinstate support for vectorizing debug info
   return;