From f188fe5ec7e6f9d6be4b072a6d1b7e69efda552e Mon Sep 17 00:00:00 2001
From: Fraser Cormack
Date: Thu, 14 Dec 2023 16:27:41 +0000
Subject: [PATCH] [vecz] Add support for masking atomic RMW instructions

This commit allows the vectorizer to vectorize kernels containing atomic
RMW instructions that need to be masked for control-flow purposes: in a
divergent if/else, a loop, etc.

It follows a fairly simple paradigm - similar to how we mask loads and
stores - involving:

* Control-flow conversion replacing the atomic with a call to an
  'internal' vecz builtin
* The packetizer widening this builtin, and replacing the call with
  another call (with packetized arguments)
* The post-vectorization `DefineBuiltinsPass` running and providing
  function bodies for these masked atomic builtins

The builtins themselves are simply loops over the vectorized arguments,
performing the scalar atomic operations one by one in sequence and
skipping any lane whose mask bit is unset. This is correct even though
the vector's worth of atomic operations is not performed as a single
atomic unit: the order in which work-items execute is undefined, as is
which work-item would "win" were there any contention on the atomic
memory addresses. Note that this is also essentially how plain
(unmasked) atomics are vectorized: by scalarizing them.

There isn't yet support for the atomic cmpxchg instructions - those will
be added separately.
---
 .../source/include/vectorization_context.h    |  45 ++-
 .../source/include/vectorization_helpers.h    |  16 +-
 .../control_flow_conversion_pass.cpp          |  61 +++-
 .../vecz/source/transform/packetizer.cpp      |  72 ++++
 .../vecz/source/vectorization_context.cpp     | 331 ++++++++++++++++++
 .../vecz/source/vectorization_helpers.cpp     |  44 ++-
 .../vecz/test/lit/llvm/Boscc/printf.ll        | 125 -------
 .../vecz/test/lit/llvm/diverging_atomic.ll    |   5 +-
 .../vecz/test/lit/llvm/masked_atomics.ll      |  87 +++++
 .../test/lit/llvm/masked_atomics_scalar.ll    |  43 +++
 source/cl/test/UnitCL/source/C11Atomics.cpp   |  26 +-
 11 files changed, 695 insertions(+), 160 deletions(-)
 delete mode 100644 modules/compiler/vecz/test/lit/llvm/Boscc/printf.ll
 create mode 100644 modules/compiler/vecz/test/lit/llvm/masked_atomics.ll
 create mode 100644 modules/compiler/vecz/test/lit/llvm/masked_atomics_scalar.ll

diff --git a/modules/compiler/vecz/source/include/vectorization_context.h b/modules/compiler/vecz/source/include/vectorization_context.h
index 8c119eee3..3d580525c 100644
--- a/modules/compiler/vecz/source/include/vectorization_context.h
+++ b/modules/compiler/vecz/source/include/vectorization_context.h
@@ -23,8 +23,11 @@
 #include
 #include
+#include
+#include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -150,6 +153,38 @@ class VectorizationContext {
   /// @return The masked version of the function
   llvm::Function *getOrCreateMaskedFunction(llvm::CallInst *CI);
 
+  struct MaskedAtomicRMW {
+    llvm::Type *PointerTy;
+    llvm::Type *ValTy;
+    llvm::AtomicRMWInst::BinOp BinOp;
+    llvm::Align Align;
+    bool IsVolatile = false;
+    llvm::SyncScope::ID SyncScope;
+    llvm::AtomicOrdering Ordering;
+    // Vectorization info
+    llvm::ElementCount VF;
+    bool IsVectorPredicated = false;
+  };
+
+  /// @brief Check if the given function is a masked version of an atomic RMW
+  /// operation.
+ /// + /// @param[in] F The function to check + /// @return A MaskedAtomicRMW instance detailing the atomic operation if the + /// function is a masked atomic RMW, or std::nullopt otherwise + std::optional isMaskedAtomicRMWFunction( + const llvm::Function &F) const; + /// @brief Get (if it exists already) or create the function representing the + /// masked version of an atomic RMW operation. + /// + /// @param[in] I Atomic to be masked + /// @param[in] Choices Choices to mangle into the function name + /// @param[in] VF The vectorization factor of the atomic operation + /// @return The masked version of the function + llvm::Function *getOrCreateMaskedAtomicRMWFunction( + MaskedAtomicRMW &I, const VectorizationChoices &Choices, + llvm::ElementCount VF); + /// @brief Create a VectorizationUnit to use to vectorize the given scalar /// function. /// @@ -157,7 +192,7 @@ class VectorizationContext { /// VectorizationContext. /// /// @param[in] F Function to vectorize. - /// @param[in] Width VF vectorization factor to use. + /// @param[in] VF vectorization factor to use. /// @param[in] Dimension SIMD dimension to use (0 => x, 1 => y, 2 => z). /// @param[in] Ch Vectorization Choices for the vectorization. VectorizationUnit *createVectorizationUnit(llvm::Function &F, @@ -258,6 +293,14 @@ class VectorizationContext { bool emitSubgroupScanBody(llvm::Function &F, bool IsInclusive, llvm::RecurKind OpKind, bool IsVP) const; + /// @brief Emit the body for a masked atomic builtin + /// + /// @param[in] F The empty (declaration only) function to emit the body in + /// @param[in] MA The MaskedAtomicRMW information + /// @returns true on success, false otherwise + bool emitMaskedAtomicRMWBody(llvm::Function &F, + const MaskedAtomicRMW &MA) const; + /// @brief Helper for non-vectorization tasks. TargetInfo &VTI; /// @brief Module in which the vectorization happens. diff --git a/modules/compiler/vecz/source/include/vectorization_helpers.h b/modules/compiler/vecz/source/include/vectorization_helpers.h index adba458a0..febd373bf 100644 --- a/modules/compiler/vecz/source/include/vectorization_helpers.h +++ b/modules/compiler/vecz/source/include/vectorization_helpers.h @@ -36,11 +36,25 @@ class VectorizationChoices; /// @param[in] ScalarName Name of the original function. /// @param[in] VF vectorization factor of the vectorized function. /// @param[in] Choices choices used for vectorization +/// @param[in] IsBuiltin True if this is an internal builtin. /// /// @return Name for the vectorized function. std::string getVectorizedFunctionName(llvm::StringRef ScalarName, llvm::ElementCount VF, - VectorizationChoices Choices); + VectorizationChoices Choices, + bool IsBuiltin = false); + +/// @brief Parses a name generated for a vectorized function +/// +/// @see getVectorizedFunctionName. +/// +/// @param[in] Name Name of the vectorized function. +/// +/// @return A tuple containing the original name of the function, and the +/// element count and choices it was encoded with. Returns std::nullopt on +/// failure. +std::optional> +decodeVectorizedFunctionName(llvm::StringRef Name); /// @brief Clone the scalar function's body into the function to vectorize, /// vectorizing function argument types where required. 
diff --git a/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp b/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp index 874a793d9..e16eab41b 100644 --- a/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp +++ b/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp @@ -27,10 +27,13 @@ #include #include #include +#include #include #include +#include #include #include +#include #include #include @@ -211,6 +214,16 @@ class ControlFlowConversionState::Impl : public ControlFlowConversionState { /// @return true if it is valid to mask this call, false otherwise bool applyMaskToCall(CallInst *CI, Value *mask, DeletionMap &toDelete); + /// @brief Attempt to apply a mask to an AtomicRMW instruction via a builtin + /// call. + /// + /// @param[in] atomicI The atomic instruction to apply the mask to + /// @param[in] mask The mask to apply to the masked atomic + /// @param[out] toDelete mapping of deleted unmasked operations + /// @return true if it is valid to mask this atomic, false otherwise + bool applyMaskToAtomicRMW(AtomicRMWInst &atomicI, Value *mask, + DeletionMap &toDelete); + /// @brief Linearize a CFG. /// @return true if no problem occurred, false otherwise. bool partiallyLinearizeCFG(); @@ -1124,9 +1137,12 @@ Error ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) { return makeStringError("Could not apply mask to call instruction", I); } } else if (I.isAtomic() && !isa(&I)) { - // We need to apply masks to atomic functions, but it is currently not - // implemented. See CA-3294. - return makeStringError("Could not apply mask to atomic instruction", I); + // Turn atomics into calls to masked builtins if possible. + // FIXME: We don't yet support masked cmpxchg instructions. + if (auto *atomicI = dyn_cast(&I); + !atomicI || !applyMaskToAtomicRMW(*atomicI, mask, toDelete)) { + return makeStringError("Could not apply mask to atomic instruction", I); + } } else if (auto *branch = dyn_cast(&I)) { // We have to be careful with infinite loops, because if they exist on a // divergent code path, they will always be entered and will hang the @@ -1356,6 +1372,45 @@ bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI, return true; } +bool ControlFlowConversionState::Impl::applyMaskToAtomicRMW( + AtomicRMWInst &atomicI, Value *mask, DeletionMap &toDelete) { + LLVM_DEBUG(dbgs() << "vecz-cf: Now at AtomicRMWInst " << atomicI << "\n"); + + VectorizationContext::MaskedAtomicRMW MA; + MA.Align = atomicI.getAlign(); + MA.BinOp = atomicI.getOperation(); + MA.IsVectorPredicated = VU.choices().vectorPredication(); + MA.IsVolatile = atomicI.isVolatile(); + MA.Ordering = atomicI.getOrdering(); + MA.SyncScope = atomicI.getSyncScopeID(); + MA.VF = ElementCount::getFixed(1); + MA.ValTy = atomicI.getType(); + MA.PointerTy = atomicI.getPointerOperand()->getType(); + // Create the new function and replace the old one with it + // Get the masked function + Function *newFunction = Ctx.getOrCreateMaskedAtomicRMWFunction( + MA, VU.choices(), ElementCount::getFixed(1)); + VECZ_FAIL_IF(!newFunction); + SmallVector fnArgs = {atomicI.getPointerOperand(), + atomicI.getValOperand(), mask}; + // We don't have a vector length just yet - pass in one as a dummy. 
+ if (MA.IsVectorPredicated) { + fnArgs.push_back( + ConstantInt::get(IntegerType::getInt32Ty(atomicI.getContext()), 1)); + } + + CallInst *newCI = CallInst::Create(newFunction, fnArgs, "", &atomicI); + VECZ_FAIL_IF(!newCI); + + atomicI.replaceAllUsesWith(newCI); + toDelete.emplace_back(&atomicI, newCI); + + LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << atomicI << "\n"); + LLVM_DEBUG(dbgs() << " with " << *newCI << "\n"); + + return true; +} + bool ControlFlowConversionState::Impl::partiallyLinearizeCFG() { // Two methods are possible to transform the divergent loops into uniform // ones: diff --git a/modules/compiler/vecz/source/transform/packetizer.cpp b/modules/compiler/vecz/source/transform/packetizer.cpp index 8615245f8..9b80bca86 100644 --- a/modules/compiler/vecz/source/transform/packetizer.cpp +++ b/modules/compiler/vecz/source/transform/packetizer.cpp @@ -55,6 +55,7 @@ #include "memory_operations.h" #include "transform/instantiation_pass.h" #include "transform/packetization_helpers.h" +#include "vectorization_context.h" #include "vectorization_unit.h" #include "vecz/vecz_choices.h" #include "vecz/vecz_target_info.h" @@ -301,6 +302,14 @@ class Packetizer::Impl : public Packetizer { /// /// @return Packetized instruction. ValuePacket packetizeMemOp(MemOp &Op); + /// @brief Packetize a masked atomic RMW operation. + /// + /// @param[in] CI Masked atomic RMW builtin call to packetize. + /// @param[in] AtomicInfo Information about the masked atomic RMW. + /// + /// @return Packetized instruction. + ValuePacket packetizeMaskedAtomicRMW( + CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo); /// @brief Packetize a GEP instruction. /// /// @param[in] GEP Instruction to packetize. @@ -2093,6 +2102,9 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) { return packetizeMemOp(*MaskedOp); } } + if (auto AtomicInfo = Ctx.isMaskedAtomicRMWFunction(*Callee)) { + return packetizeMaskedAtomicRMW(*CI, *AtomicInfo); + } } auto const Builtin = Ctx.builtins().analyzeBuiltin(*Callee); @@ -2766,6 +2778,66 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) { return results; } +ValuePacket Packetizer::Impl::packetizeMaskedAtomicRMW( + CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo) { + ValuePacket results; + + Value *const ptr = CI.getArgOperand(0); + Value *const val = CI.getArgOperand(1); + Value *const mask = CI.getArgOperand(2); + + assert(AtomicInfo.ValTy == val->getType() && "AtomicInfo mismatch"); + auto const packetWidth = getPacketWidthForType(val->getType()); + + if (VL && packetWidth != 1) { + emitVeczRemarkMissed(&F, &CI, + "Can not vector-predicate packets larger than 1"); + return {}; + } + + ValuePacket valPacket; + Result valResult = packetize(val); + PACK_FAIL_IF(!valResult); + valResult.getPacketValues(packetWidth, valPacket); + PACK_FAIL_IF(valPacket.empty()); + + ValuePacket ptrPacket; + Result ptrResult = packetize(ptr); + PACK_FAIL_IF(!ptrResult); + ptrResult.getPacketValues(packetWidth, ptrPacket); + PACK_FAIL_IF(ptrPacket.empty()); + + ValuePacket maskPacket; + Result maskResult = packetize(mask); + PACK_FAIL_IF(!maskResult); + maskResult.getPacketValues(packetWidth, maskPacket); + PACK_FAIL_IF(maskPacket.empty()); + + IRBuilder<> B(&CI); + IC.deleteInstructionLater(&CI); + + for (unsigned i = 0; i != packetWidth; ++i) { + auto *const ptrI = ptrPacket[i]; + auto *const valI = valPacket[i]; + + AtomicInfo.ValTy = valI->getType(); + AtomicInfo.PointerTy = ptrI->getType(); + auto *maskedAtomicF = + 
Ctx.getOrCreateMaskedAtomicRMWFunction(AtomicInfo, Choices, SimdWidth); + PACK_FAIL_IF(!maskedAtomicF); + + SmallVector args = {ptrI, valI, maskPacket[i]}; + if (AtomicInfo.IsVectorPredicated) { + assert(VL && "Missing vector length"); + args.push_back(VL); + } + + results.push_back(B.CreateCall(maskedAtomicF, args)); + } + + return results; +} + void Packetizer::Impl::vectorizeDI(Instruction *, Value *) { // FIXME: Reinstate support for vectorizing debug info return; diff --git a/modules/compiler/vecz/source/vectorization_context.cpp b/modules/compiler/vecz/source/vectorization_context.cpp index e4fbbf4d6..821d62936 100644 --- a/modules/compiler/vecz/source/vectorization_context.cpp +++ b/modules/compiler/vecz/source/vectorization_context.cpp @@ -22,11 +22,17 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include #include +#include #include "analysis/vectorization_unit_analysis.h" #include "debugging.h" @@ -368,6 +374,227 @@ Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) { return newFunction; } +std::optional +VectorizationContext::isMaskedAtomicRMWFunction(const Function &F) const { + auto VFInfo = decodeVectorizedFunctionName(F.getName()); + if (!VFInfo) { + return std::nullopt; + } + auto [FnNameStr, VF, Choices] = *VFInfo; + + llvm::StringRef FnName = FnNameStr; + if (!FnName.consume_front("masked_atomicrmw_")) { + return std::nullopt; + } + VectorizationContext::MaskedAtomicRMW AtomicInfo; + + AtomicInfo.VF = VF; + AtomicInfo.IsVectorPredicated = Choices.vectorPredication(); + + AtomicInfo.IsVolatile = FnName.consume_front("volatile_"); + + if (FnName.consume_front("xchg")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xchg; + } else if (FnName.consume_front("add")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Add; + } else if (FnName.consume_front("sub")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Sub; + } else if (FnName.consume_front("and")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::And; + } else if (FnName.consume_front("nand")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Nand; + } else if (FnName.consume_front("or")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Or; + } else if (FnName.consume_front("xor")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xor; + } else if (FnName.consume_front("max")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Max; + } else if (FnName.consume_front("min")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::Min; + } else if (FnName.consume_front("umax")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMax; + } else if (FnName.consume_front("umin")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMin; + } else if (FnName.consume_front("fadd")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::FAdd; + } else if (FnName.consume_front("fsub")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::FSub; + } else if (FnName.consume_front("fmax")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMax; + } else if (FnName.consume_front("fmin")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMin; + } else if (FnName.consume_front("uincwrap")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::UIncWrap; + } else if (FnName.consume_front("udecwrap")) { + AtomicInfo.BinOp = AtomicRMWInst::BinOp::UDecWrap; + } else { + return std::nullopt; + } + + if (!FnName.consume_front("_align")) { + return std::nullopt; + } + + uint64_t Alignment = 0; + if (FnName.consumeInteger(/*Radix=*/10, Alignment)) { + return std::nullopt; + } + + AtomicInfo.Align = Align(Alignment); + + if (!FnName.consume_front("_")) { + return 
std::nullopt; + } + + if (FnName.consume_front("acquire")) { + AtomicInfo.Ordering = AtomicOrdering::Acquire; + } else if (FnName.consume_front("acqrel")) { + AtomicInfo.Ordering = AtomicOrdering::AcquireRelease; + } else if (FnName.consume_front("monotonic")) { + AtomicInfo.Ordering = AtomicOrdering::Monotonic; + } else if (FnName.consume_front("notatomic")) { + AtomicInfo.Ordering = AtomicOrdering::NotAtomic; + } else if (FnName.consume_front("release")) { + AtomicInfo.Ordering = AtomicOrdering::Release; + } else if (FnName.consume_front("seqcst")) { + AtomicInfo.Ordering = AtomicOrdering::SequentiallyConsistent; + } else if (FnName.consume_front("unordered")) { + AtomicInfo.Ordering = AtomicOrdering::Unordered; + } else { + return std::nullopt; + } + + if (!FnName.consume_front("_")) { + return std::nullopt; + } + + unsigned SyncScopeID = 0; + if (FnName.consumeInteger(/*Radix=*/10, SyncScopeID)) { + return std::nullopt; + } + + AtomicInfo.SyncScope = static_cast(SyncScopeID); + + if (!FnName.consume_front("_")) { + return std::nullopt; + } + + // Note - we just assume the rest of the builtin name is okay, here. It + // should be mangled types, but vecz builtins use a strange mangling system, + // purely for uniqueness and not to infer types. Types are always assumed to + // be inferrable from the function parameters. + AtomicInfo.PointerTy = F.getFunctionType()->getParamType(0); + AtomicInfo.ValTy = F.getFunctionType()->getParamType(1); + + return AtomicInfo; +} + +Function *VectorizationContext::getOrCreateMaskedAtomicRMWFunction( + MaskedAtomicRMW &I, const VectorizationChoices &Choices, ElementCount VF) { + LLVMContext &ctx = I.ValTy->getContext(); + + SmallVector argTys; + + argTys.push_back(I.PointerTy); + argTys.push_back(I.ValTy); + // Add one extra argument for the mask, which is always the same length + // (scalar or vector) as the value type. + auto *i1Ty = Type::getInt1Ty(ctx); + argTys.push_back( + !I.ValTy->isVectorTy() + ? 
dyn_cast(i1Ty) + : VectorType::get(i1Ty, + cast(I.ValTy)->getElementCount())); + if (Choices.vectorPredication()) { + argTys.push_back(Type::getInt32Ty(ctx)); + } + + std::string maskedFnName; + raw_string_ostream O(maskedFnName); + O << "masked_atomicrmw_"; + + if (I.IsVolatile) { + O << "volatile_"; + } + +#define BINOP_CASE(BINOP, STR) \ + case AtomicRMWInst::BINOP: \ + O << (STR); \ + break + + switch (I.BinOp) { + BINOP_CASE(Xchg, "xchg"); + BINOP_CASE(Add, "add"); + BINOP_CASE(Sub, "sub"); + BINOP_CASE(And, "and"); + BINOP_CASE(Nand, "nand"); + BINOP_CASE(Or, "or"); + BINOP_CASE(Xor, "xor"); + BINOP_CASE(Max, "max"); + BINOP_CASE(Min, "min"); + BINOP_CASE(UMax, "umax"); + BINOP_CASE(UMin, "umin"); + BINOP_CASE(FAdd, "fadd"); + BINOP_CASE(FSub, "fsub"); + BINOP_CASE(FMax, "fmax"); + BINOP_CASE(FMin, "fmin"); + BINOP_CASE(UIncWrap, "uincwrap"); + BINOP_CASE(UDecWrap, "udecwrap"); + case llvm::AtomicRMWInst::BAD_BINOP: + return nullptr; + } + +#undef BINOP_CASE + + O << "_align" << I.Align.value() << "_"; + // Mangle ordering + switch (I.Ordering) { + default: + O << static_cast(I.Ordering); + break; + case AtomicOrdering::Acquire: + O << "acquire"; + break; + case AtomicOrdering::AcquireRelease: + O << "acqrel"; + break; + case AtomicOrdering::Monotonic: + O << "monotonic"; + break; + case AtomicOrdering::NotAtomic: + O << "notatomic"; + break; + case AtomicOrdering::Release: + O << "release"; + break; + case AtomicOrdering::SequentiallyConsistent: + O << "seqcst"; + break; + case AtomicOrdering::Unordered: + O << "unordered"; + break; + } + // Syncscope + O << "_" << static_cast(I.SyncScope) << "_"; + + // Mangle types + compiler::utils::NameMangler mangler(&ctx); + for (auto *ty : argTys) { + VECZ_FAIL_IF(!mangler.mangleType( + O, ty, + compiler::utils::TypeQualifiers(compiler::utils::eTypeQualNone))); + } + + maskedFnName = + getVectorizedFunctionName(maskedFnName, VF, Choices, /*IsBuiltin=*/true); + + // Create the function type + FunctionType *maskedFnTy = + FunctionType::get(I.ValTy, argTys, /*isVarArg=*/false); + + return getOrCreateInternalBuiltin(maskedFnName, maskedFnTy); +} + namespace { std::optional> isSubgroupScan( StringRef fnName, Type *const ty) { @@ -460,6 +687,10 @@ bool VectorizationContext::defineInternalBuiltin(Function *F) { return emitSubgroupScanBody(*F, isInclusive, opKind, isVP); } + if (auto AtomicInfo = isMaskedAtomicRMWFunction(*F)) { + return emitMaskedAtomicRMWBody(*F, *AtomicInfo); + } + return false; } @@ -777,6 +1008,106 @@ bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive, return true; } +bool VectorizationContext::emitMaskedAtomicRMWBody( + Function &F, const VectorizationContext::MaskedAtomicRMW &MA) const { + LLVMContext &Ctx = F.getContext(); + + auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F); + + auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F); + + auto *const PtrArg = F.getArg(0); + auto *const ValArg = F.getArg(1); + Value *MaskArg = F.getArg(2); + + const bool IsVector = ValArg->getType()->isVectorTy(); + + IRBuilder<> B(EntryBB); + Value *const IdxStart = B.getInt32(0); + ConstantInt *const KnownMin = B.getInt32(MA.VF.getKnownMinValue()); + Value *IdxEnd = !MA.VF.isScalable() ? KnownMin : B.CreateVScale(KnownMin); + + // For vector-predicated masked atomics, we have to merge the incoming mask + // with a mask corresponding to the number of elements left active by the + // runtime vector length. 
+ if (MA.IsVectorPredicated) { + auto *const VL = F.getArg(3); + auto *const IndexTy = VectorType::get(VL->getType(), MA.VF); + auto *const step = B.CreateStepVector(IndexTy); + auto *const VLMask = B.CreateICmpULT(step, B.CreateVectorSplat(MA.VF, VL)); + MaskArg = B.CreateAnd(MaskArg, VLMask); + } + + Value *RetVal = nullptr; + + auto CreateLoopBody = [&MA, &F, &ExitBB, PtrArg, ValArg, MaskArg, &RetVal, + IsVector]( + BasicBlock *BB, Value *Idx, ArrayRef IVs, + MutableArrayRef IVsNext) -> BasicBlock * { + IRBuilder<> IRB(BB); + + Value *MaskElt = MaskArg; + if (IsVector) { + MaskElt = IRB.CreateExtractElement(MaskArg, Idx, "mask"); + } + auto *const MaskCmp = + IRB.CreateICmpNE(MaskElt, IRB.getInt1(false), "mask.cmp"); + + auto *const IfBB = BasicBlock::Create(F.getContext(), "if.then", &F); + auto *const ElseBB = BasicBlock::Create(F.getContext(), "if.else", &F); + + IRB.CreateCondBr(MaskCmp, IfBB, ElseBB); + + { + IRB.SetInsertPoint(IfBB); + Value *Ptr = PtrArg; + Value *Val = ValArg; + if (IsVector) { + Ptr = IRB.CreateExtractElement(PtrArg, Idx, "ptr"); + Val = IRB.CreateExtractElement(ValArg, Idx, "val"); + } + auto *const AtomicRMW = IRB.CreateAtomicRMW(MA.BinOp, Ptr, Val, MA.Align, + MA.Ordering, MA.SyncScope); + AtomicRMW->setVolatile(MA.IsVolatile); + + if (IsVector) { + RetVal = IRB.CreateInsertElement(IVs[0], AtomicRMW, Idx, "retvec"); + } else { + RetVal = AtomicRMW; + } + + IRB.CreateBr(ElseBB); + } + + { + IRB.SetInsertPoint(ElseBB); + + auto *MergePhi = IRB.CreatePHI(RetVal->getType(), 2, "merge"); + MergePhi->addIncoming(IVs[0], BB); + MergePhi->addIncoming(RetVal, IfBB); + RetVal = MergePhi; + } + IVsNext[0] = RetVal; + + // Move the exit block right to the end of the function. + ExitBB->moveAfter(ElseBB); + + return ElseBB; + }; + + compiler::utils::CreateLoopOpts Opts; + { + Opts.IVs.push_back(PoisonValue::get(MA.ValTy)); + Opts.loopIVNames.push_back("retvec.prev"); + } + compiler::utils::createLoop(EntryBB, ExitBB, IdxStart, IdxEnd, Opts, + CreateLoopBody); + + B.SetInsertPoint(ExitBB); + B.CreateRet(RetVal); + return true; +} + Function *VectorizationContext::getInternalVectorEquivalent( Function *ScalarFn, unsigned SimdWidth) { // Handle masked memory loads and stores. diff --git a/modules/compiler/vecz/source/vectorization_helpers.cpp b/modules/compiler/vecz/source/vectorization_helpers.cpp index 0385dec20..b65a9c793 100644 --- a/modules/compiler/vecz/source/vectorization_helpers.cpp +++ b/modules/compiler/vecz/source/vectorization_helpers.cpp @@ -22,8 +22,11 @@ #include #include #include +#include #include +#include + #include "debugging.h" #include "vectorization_context.h" #include "vectorization_unit.h" @@ -146,14 +149,49 @@ SmallVector createArgumentPlaceholders( namespace vecz { std::string getVectorizedFunctionName(StringRef ScalarName, ElementCount VF, - VectorizationChoices Choices) { + VectorizationChoices Choices, + bool IsBuiltin) { Twine Prefix = Twine(VF.isScalable() ? "nxv" : "v"); Twine IsVP = Twine(Choices.vectorPredication() ? "_vp_" : "_"); - return (Twine("__vecz_") + Prefix + Twine(VF.getKnownMinValue()) + IsVP + - ScalarName) + return ((IsBuiltin ? 
VectorizationContext::InternalBuiltinPrefix + : Twine("__vecz_")) + + Prefix + Twine(VF.getKnownMinValue()) + IsVP + ScalarName) .str(); } +std::optional> +decodeVectorizedFunctionName(StringRef Name) { + if (!Name.consume_front(VectorizationContext::InternalBuiltinPrefix)) { + if (!Name.consume_front("__vecz_")) { + return std::nullopt; + } + } + + ElementCount VF; + bool Scalable = false; + if (Name.consume_front("nxv")) { + Scalable = true; + } else if (!Name.consume_front("v")) { + return std::nullopt; + } + + unsigned KnownMin = 0; + if (Name.consumeInteger(10, KnownMin)) { + return std::nullopt; + } + + VF = ElementCount::get(KnownMin, Scalable); + + VectorizationChoices Choices; + if (Name.consume_front("_vp_")) { + Choices.enableVectorPredication(); + } else if (!Name.consume_front("_")) { + return std::nullopt; + } + + return std::make_tuple(Name.str(), VF, Choices); +} + Function *cloneFunctionToVector(VectorizationUnit const &VU) { auto *const VectorizedFn = declareFunction(VU); VECZ_ERROR_IF(!VectorizedFn, "declareFunction failed to initialize"); diff --git a/modules/compiler/vecz/test/lit/llvm/Boscc/printf.ll b/modules/compiler/vecz/test/lit/llvm/Boscc/printf.ll deleted file mode 100644 index d3a4c0600..000000000 --- a/modules/compiler/vecz/test/lit/llvm/Boscc/printf.ll +++ /dev/null @@ -1,125 +0,0 @@ -; Copyright (C) Codeplay Software Limited -; -; Licensed under the Apache License, Version 2.0 (the "License") with LLVM -; Exceptions; you may not use this file except in compliance with the License. -; You may obtain a copy of the License at -; -; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt -; -; Unless required by applicable law or agreed to in writing, software -; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -; License for the specific language governing permissions and limitations -; under the License. -; -; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -; TODO(CA-1981): Using `not` in qemu does not work. -; REQUIRES: native -; RUN: not veczc -k printf_add -vecz-simd-width=4 -S -vecz-passes=cfg-convert -vecz-choices=LinearizeBOSCC < %s 2>&1 | FileCheck %s - -; This test just checks that we don't crash while converting the control flow. -; LinearizeBOSCC would leave behind an invalid function when control flow fails -; some time afterwards. This could trigger verification failures or crashes -; depending on which passes were run later. 
- -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "spir64-unknown-unknown" - -define spir_kernel void @printf_add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out, i32 addrspace(1)* %status, i8 addrspace(1)* %x) { -entry: - %in1.addr = alloca i32 addrspace(1)*, align 8 - %in2.addr = alloca i32 addrspace(1)*, align 8 - %out.addr = alloca i32 addrspace(1)*, align 8 - %status.addr = alloca i32 addrspace(1)*, align 8 - %tid = alloca i64, align 8 - %sum = alloca i32, align 4 - store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8 - store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8 - store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8 - store i32 addrspace(1)* %status, i32 addrspace(1)** %status.addr, align 8 - %call = call i64 @__mux_get_global_id(i32 0) #4 - store i64 %call, i64* %tid, align 8 - %0 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8 - %1 = load i64, i64* %tid, align 8 - %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 %1 - %2 = load i32, i32 addrspace(1)* %arrayidx, align 4 - %3 = load i32 addrspace(1)*, i32 addrspace(1)** %in2.addr, align 8 - %4 = load i64, i64* %tid, align 8 - %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %3, i64 %4 - %5 = load i32, i32 addrspace(1)* %arrayidx1, align 4 - %add = add nsw i32 %2, %5 - store i32 %add, i32* %sum, align 4 - %6 = load i32, i32* %sum, align 4 - %7 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8 - %8 = load i64, i64* %tid, align 8 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %7, i64 %8 - store i32 %6, i32 addrspace(1)* %arrayidx2, align 4 - %9 = load i64, i64* %tid, align 8 - %conv = trunc i64 %9 to i32 - %10 = load i32, i32* %sum, align 4 - %11 = call i64 @__mux_get_num_groups(i32 0) - %12 = trunc i64 %11 to i32 - %13 = call i64 @__mux_get_num_groups(i32 1) - %14 = trunc i64 %13 to i32 - %15 = call i64 @__mux_get_num_groups(i32 2) - %16 = trunc i64 %15 to i32 - %17 = call i64 @__mux_get_group_id(i32 0) - %18 = trunc i64 %17 to i32 - %19 = call i64 @__mux_get_group_id(i32 1) - %20 = trunc i64 %19 to i32 - %21 = call i64 @__mux_get_group_id(i32 2) - %22 = trunc i64 %21 to i32 - %23 = mul i32 %12, %20 - %24 = mul i32 %14, %16 - %25 = mul i32 %22, %24 - %26 = add i32 %23, %25 - %27 = add i32 %18, %26 - %28 = mul i32 %14, %16 - %29 = mul i32 %12, %28 - %30 = udiv i32 1048576, %29 - %31 = and i32 %30, -4 - %32 = mul i32 %27, %31 - %33 = getelementptr i8, i8 addrspace(1)* %x, i32 %32 - %34 = bitcast i8 addrspace(1)* %33 to i32 addrspace(1)* - %35 = bitcast i8 addrspace(1)* %33 to i32 addrspace(1)* - %36 = atomicrmw add i32 addrspace(1)* %35, i32 12 acq_rel - %37 = add i32 %36, 12 - %38 = icmp ugt i32 %37, %31 - br i1 %38, label %early_return.i, label %store.i - -early_return.i: ; preds = %entry - %39 = bitcast i8 addrspace(1)* %33 to i32 addrspace(1)* - %40 = getelementptr i32, i32 addrspace(1)* %39, i32 1 - %41 = atomicrmw add i32 addrspace(1)* %40, i32 12 acq_rel - br label %.exit - -store.i: ; preds = %entry - %42 = getelementptr i8, i8 addrspace(1)* %33, i32 %36 - %43 = bitcast i8 addrspace(1)* %42 to i32 addrspace(1)* - store i32 0, i32 addrspace(1)* %43, align 1 - %44 = add i32 %36, 4 - %45 = getelementptr i8, i8 addrspace(1)* %33, i32 %44 - %46 = bitcast i8 addrspace(1)* %45 to i32 addrspace(1)* - store i32 %conv, i32 addrspace(1)* %46, align 1 - %47 = add i32 %36, 8 - %48 = getelementptr i8, i8 addrspace(1)* %33, i32 %47 - %49 = bitcast i8 addrspace(1)* %48 to 
i32 addrspace(1)* - store i32 %10, i32 addrspace(1)* %49, align 1 - br label %.exit - -.exit: ; preds = %store.i, %early_return.i - %call31 = phi i32 [ -1, %early_return.i ], [ 0, %store.i ] - %50 = load i32 addrspace(1)*, i32 addrspace(1)** %status.addr, align 8 - %51 = load i64, i64* %tid, align 8 - %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %50, i64 %51 - store i32 %call31, i32 addrspace(1)* %arrayidx4, align 4 - ret void -} - -declare i64 @__mux_get_global_id(i32) -declare i64 @__mux_get_group_id(i32) -declare i64 @__mux_get_num_groups(i32) - -; We can't vectorize this control flow -; CHECK: Error: Failed to vectorize function 'printf_add' diff --git a/modules/compiler/vecz/test/lit/llvm/diverging_atomic.ll b/modules/compiler/vecz/test/lit/llvm/diverging_atomic.ll index de4501013..b6beaae1e 100644 --- a/modules/compiler/vecz/test/lit/llvm/diverging_atomic.ll +++ b/modules/compiler/vecz/test/lit/llvm/diverging_atomic.ll @@ -22,7 +22,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK: Vecz: Could not apply masks for function "kernel" ; CHECK-NEXT: note: Could not apply mask to atomic instruction -; CHECK-SAME: %atomic = atomicrmw add ptr %arrayidx.in, i32 2 monotonic, align 4 +; CHECK-SAME: atomic_success = cmpxchg ptr %arrayidx.in, i32 2, i32 4 acq_rel monotonic, align 4 define spir_kernel void @kernel(ptr %in, ptr %out) { entry: @@ -32,7 +32,8 @@ entry: if.then: %arrayidx.in = getelementptr inbounds i32, ptr %in, i64 %gid - %atomic = atomicrmw add ptr %arrayidx.in, i32 2 monotonic, align 4 + %atomic_success = cmpxchg ptr %arrayidx.in, i32 2, i32 4 acq_rel monotonic, align 4 + %atomic = extractvalue { i32, i1 } %atomic_success, 0 br label %end end: diff --git a/modules/compiler/vecz/test/lit/llvm/masked_atomics.ll b/modules/compiler/vecz/test/lit/llvm/masked_atomics.ll new file mode 100644 index 000000000..2f11e37c2 --- /dev/null +++ b/modules/compiler/vecz/test/lit/llvm/masked_atomics.ll @@ -0,0 +1,87 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=cfg-convert,verify,packetizer,define-builtins,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p) +define spir_kernel void @test_fn(ptr %p) { +entry: +; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0 +; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> , + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp sgt i64 3, %call +; CHECK: [[VEC_PTR:%.*]] = getelementptr i32, ptr %p, <4 x i64> + %wi_p_i32 = getelementptr i32, ptr %p, i64 %call + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry +; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> , <4 x i1> [[CMP]] + %old0 = atomicrmw add ptr %p, i32 1 acquire +; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x i32> , <4 x i1> [[CMP]] + %old1 = atomicrmw add ptr %wi_p_i32, i32 1 acquire +; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_umin_align2_monotonic_1_Dv4_u3ptrDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x i32> , <4 x i1> [[CMP]] + %old2 = atomicrmw umin ptr %wi_p_i32, i32 1 monotonic, align 2 +; CHECK: = call <4 x float> @__vecz_b_v4_masked_atomicrmw_volatile_fmax_align4_seqcst_0_Dv4_u3ptrDv4_fDv4_b( +; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x float> , <4 x i1> [[CMP]] + %old3 = atomicrmw volatile fmax ptr %wi_p_i32, float 1.0 syncscope("singlethread") seq_cst + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; CHECK: define <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[VALS:%1]], <4 x i1> [[MASK:%2]]) { +; CHECK: entry: +; CHECK: br label %loopIR + +; CHECK: loopIR: +; CHECK: [[IDX:%.*]] = phi i32 [ 0, %entry ], [ [[IDX_NEXT:%.*]], %if.else ] +; CHECK: [[PREV:%.*]] = phi <4 x i32> [ poison, %entry ], [ [[MERGE:%.*]], %if.else ] +; CHECK: [[MASKELT:%.*]] = extractelement <4 x i1> [[MASK]], i32 [[IDX]] +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[PTR:%.*]] = extractelement <4 x ptr> [[PTRS]], i32 [[IDX]] +; CHECK: [[VAL:%.*]] = extractelement <4 x i32> [[VALS]], i32 [[IDX]] +; CHECK: [[ATOM:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VAL]] acquire, align 4 +; CHECK: [[RET:%.*]] = insertelement <4 x i32> [[PREV]], i32 [[ATOM]], i32 [[IDX]] +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[MERGE]] = phi <4 x i32> [ [[PREV]], %loopIR ], [ [[RET]], %if.then ] +; CHECK: [[IDX_NEXT]] = add i32 [[IDX]], 1 + +; CHECK: exit: +; CHECK: ret <4 x i32> [[MERGE]] + +; Assume that all masked atomicrmw operations follow the logic above. Just +; check that the right atomicrmw instruction is being generated. 
+; CHECK: define <4 x i32> @__vecz_b_v4_masked_atomicrmw_umin_align2_monotonic_1_Dv4_u3ptrDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[VALS:%1]], <4 x i1> [[MASK:%2]]) { +; CHECK: atomicrmw umin ptr {{%.*}}, i32 {{%.*}} monotonic, align 2 + + +; CHECK: define <4 x float> @__vecz_b_v4_masked_atomicrmw_volatile_fmax_align4_seqcst_0_Dv4_u3ptrDv4_fDv4_b(<4 x ptr> [[PTRS:%0]], <4 x float> [[VALS:%1]], <4 x i1> [[MASK:%2]]) { +; CHECK: atomicrmw volatile fmax ptr {{%.*}}, float {{%.*}} syncscope("singlethread") seq_cst, align 4 + +declare i64 @__mux_get_global_id(i32) diff --git a/modules/compiler/vecz/test/lit/llvm/masked_atomics_scalar.ll b/modules/compiler/vecz/test/lit/llvm/masked_atomics_scalar.ll new file mode 100644 index 000000000..6cab589dd --- /dev/null +++ b/modules/compiler/vecz/test/lit/llvm/masked_atomics_scalar.ll @@ -0,0 +1,43 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=define-builtins,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_fn(ptr %p) { + %ret = call i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i32 1, i1 true) + ret void +} + +declare i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i32 %val, i1 %mask) + +; CHECK: define i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i32 %val, i1 %mask) { +; CHECK: entry: +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 %mask, false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[ATOM:%.*]] = atomicrmw add ptr %p, i32 %val acquire, align 4 +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[RET:%.*]] = phi i32 [ poison, %entry ], [ [[ATOM]], %if.then ] +; CHECK: br label %exit + +; CHECK: exit: +; CHECK: ret i32 [[RET]] diff --git a/source/cl/test/UnitCL/source/C11Atomics.cpp b/source/cl/test/UnitCL/source/C11Atomics.cpp index 0fbf82bb5..b4c759080 100644 --- a/source/cl/test/UnitCL/source/C11Atomics.cpp +++ b/source/cl/test/UnitCL/source/C11Atomics.cpp @@ -567,11 +567,6 @@ class FetchTest : public C11AtomicTestBase { const std::function &)> &init_ref_fn, const std::function &)> &op_ref_fn, bool clamp = false) { - // This test makes use of control flow in the kernel. Control flow - // conversion is not supported for atomics so we need to make sure this - // isn't registered as a failure when the vectorizer fails. See CA-3294. - fail_if_not_vectorized_ = false; - // Generate the random input. std::vector input_data(kts::N, T{}); if (!clamp) { @@ -1125,10 +1120,6 @@ TEST_P(FetchTest, C11Atomics_40_Fetch_Local_Or_Ulong) { } TEST_P(FetchTest, C11Atomics_41_Fetch_Global_Xor_Int) { - // This test makes use of control flow in the kernel. 
Control flow conversion - // is not supported for atomics so we need to make sure this isn't registered - // as a failure when the vectorizer fails. See CA-3294. - fail_if_not_vectorized_ = false; const auto xor_ref = [](size_t, const std::vector &input) { return std::accumulate(std::next(std::begin(input)), std::end(input), input[0], @@ -1141,10 +1132,6 @@ TEST_P(FetchTest, C11Atomics_41_Fetch_Global_Xor_Long) { if (!UCL::hasAtomic64Support(device)) { GTEST_SKIP(); } - // This test makes use of control flow in the kernel. Control flow conversion - // is not supported for atomics so we need to make sure this isn't registered - // as a failure when the vectorizer fails. See CA-3294. - fail_if_not_vectorized_ = false; const auto xor_ref = [](size_t, const std::vector &input) { return std::accumulate(std::next(std::begin(input)), std::end(input), input[0], @@ -1154,10 +1141,6 @@ TEST_P(FetchTest, C11Atomics_41_Fetch_Global_Xor_Long) { doTest(firstEltReference, xor_ref); } TEST_P(FetchTest, C11Atomics_41_Fetch_Global_Xor_Uint) { - // This test makes use of control flow in the kernel. Control flow conversion - // is not supported for atomics so we need to make sure this isn't registered - // as a failure when the vectorizer fails. See CA-3294. - fail_if_not_vectorized_ = false; const auto xor_ref = [](size_t, const std::vector &input) { return std::accumulate(std::next(std::begin(input)), std::end(input), input[0], @@ -1169,10 +1152,6 @@ TEST_P(FetchTest, C11Atomics_41_Fetch_Global_Xor_Ulong) { if (!UCL::hasAtomic64Support(device)) { GTEST_SKIP(); } - // This test makes use of control flow in the kernel. Control flow conversion - // is not supported for atomics so we need to make sure this isn't registered - // as a failure when the vectorizer fails. See CA-3294. - fail_if_not_vectorized_ = false; const auto xor_ref = [](size_t, const std::vector &input) { return std::accumulate( std::next(std::begin(input)), std::end(input), input[0], @@ -1472,10 +1451,7 @@ class FetchTruthTableTest this->AddBuildOption("-cl-std=CL3.0"); - // This test makes use of control flow in the kernel or very small local - // sizes. Control flow conversion is not supported for atomics so we need - // to make sure this isn't registered as a failure when the vectorizer - // fails. See CA-3294. + // This test only uses uniform inputs so the vectorizer doesn't vectorize. this->fail_if_not_vectorized_ = false; }