Merge pull request #138 from frasercrmck/fix-vecz-alignment
[vecz] Fix alignment of load operations
frasercrmck authored Sep 20, 2023
2 parents 4f56ab5 + 91bf24e · commit 00f82df
Showing 13 changed files with 87 additions and 40 deletions.
2 changes: 2 additions & 0 deletions modules/compiler/vecz/include/vecz/vecz_target_info.h
@@ -79,13 +79,15 @@ class TargetInfo {
/// @param[in] ptr Memory address to load a vector value from.
/// @param[in] stride Distance in elements between two lanes in memory.
/// A stride of one represents a contiguous load.
+  /// @param[in] alignment The alignment of the load, in bytes.
/// @param[in] evl 'effective vector length' of the operation. Must be
/// pre-scaled for vector operations. If null, the operation is unpredicated:
/// it is executed on all lanes.
///
/// @return IR value that results from the vector load.
virtual llvm::Value *createLoad(llvm::IRBuilder<> &builder, llvm::Type *ty,
llvm::Value *ptr, llvm::Value *stride,
+                                  unsigned alignment,
llvm::Value *evl = nullptr) const;

/// @brief Create a vector store. If a stride greater than one is used, the
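As a hedged illustration of the new interface (not code from this change): a caller packetizing a scalar load would now pass the original operation's alignment through instead of letting createLoad derive one from the element type. The helper name widenLoad, the include paths, and the vecz namespace are assumptions.

    #include <llvm/IR/IRBuilder.h>
    #include <llvm/IR/Instructions.h>
    #include <vecz/vecz_target_info.h>  // assumed include path for TargetInfo

    // Sketch: widen a scalar load into a vector load of VecTy, preserving
    // the alignment of the original memory operation.
    llvm::Value *widenLoad(const vecz::TargetInfo &VTI, llvm::IRBuilder<> &B,
                           llvm::LoadInst &Orig, llvm::Type *VecTy) {
      const unsigned Alignment =
          static_cast<unsigned>(Orig.getAlign().value());
      // A stride of one means a contiguous load; a null EVL leaves the
      // operation unpredicated (executed on all lanes).
      return VTI.createLoad(B, VecTy, Orig.getPointerOperand(),
                            /*stride*/ B.getInt64(1), Alignment,
                            /*evl*/ nullptr);
    }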
4 changes: 2 additions & 2 deletions modules/compiler/vecz/source/transform/packetizer.cpp
@@ -2158,8 +2158,8 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride,
Twine(name, ".incr"));
}
-      results.push_back(
-          VTI.createLoad(B, getWideType(dataTy, factor), ptr, one, EVL));
+      results.push_back(VTI.createLoad(B, getWideType(dataTy, factor), ptr,
+                                       one, alignment, EVL));
}
} else {
auto *const one = B.getInt64(1);
11 changes: 7 additions & 4 deletions modules/compiler/vecz/source/transform/passes.cpp
@@ -123,6 +123,7 @@ PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
Value *Data = MaskedOp->getDataOperand();
Value *Ptr = MaskedOp->getPointerOperand();
Type *DataTy = MaskedOp->getDataType();
+      auto Alignment = BuiltinDesc->getAlignment();
if (MaskedOp->isLoad()) {
Value *Load = nullptr;
if (DataTy->isVectorTy()) {
@@ -133,11 +134,13 @@ PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
if (isa<ScalableVectorType>(DataTy)) {
continue;
}
-        Load = VTI.createLoad(B, CI->getType(), Ptr, B.getInt64(1));
+        Load =
+            VTI.createLoad(B, CI->getType(), Ptr, B.getInt64(1), Alignment);
} else {
-        Load = B.CreateLoad(CI->getType(), Ptr, /*isVolatile*/ false,
-                            CI->getName());
+        Load = B.CreateAlignedLoad(CI->getType(), Ptr, Align(Alignment),
+                                   /*isVolatile*/ false, CI->getName());
}
+      Load->takeName(CI);
CI->replaceAllUsesWith(Load);
} else {
if (DataTy->isVectorTy()) {
@@ -151,7 +154,7 @@ PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
VTI.createStore(B, Data, Ptr, B.getInt64(1),
BuiltinDesc->getAlignment());
} else {
-        B.CreateStore(Data, Ptr);
+        B.CreateAlignedStore(Data, Ptr, Align(Alignment));
}
}
ToDelete.push_back(CI);
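The net effect of this hunk, as a standalone sketch: when a masked memory builtin's mask is known all-true, the pass replaces the call with a plain memory operation carrying the builtin's alignment rather than the type's natural alignment. simplifyMaskedLoad is an illustrative name, not vecz API; it assumes the mask has already been proven all-true.

    #include <llvm/IR/IRBuilder.h>
    #include <llvm/IR/Instructions.h>
    #include <llvm/Support/Alignment.h>

    // Sketch: rewrite an all-true masked load builtin call CI into a plain
    // aligned load; this is exact, because every lane was enabled anyway.
    llvm::Value *simplifyMaskedLoad(llvm::IRBuilder<> &B, llvm::CallInst &CI,
                                    llvm::Value *Ptr, unsigned Alignment) {
      llvm::LoadInst *Load = B.CreateAlignedLoad(CI.getType(), Ptr,
                                                 llvm::Align(Alignment));
      Load->takeName(&CI);          // keep the original value's name
      CI.replaceAllUsesWith(Load);  // users now see the plain load
      return Load;
    }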
10 changes: 5 additions & 5 deletions modules/compiler/vecz/source/vector_target_info.cpp
@@ -73,7 +73,8 @@ bool isLegalMaskedScatter(const TargetTransformInfo &TTI, Type *Ty,
TargetInfo::TargetInfo(TargetMachine *tm) : TM_(tm) {}

Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
-                              Value *Stride, Value *EVL) const {
+                              Value *Stride, unsigned Alignment,
+                              Value *EVL) const {
if (!Ptr || !Stride || !Ty->isVectorTy()) {
return nullptr;
}
@@ -90,10 +91,9 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
PointerType *VecPtrTy = Ty->getPointerTo(PtrTy->getAddressSpace());
Value *VecPtr = B.CreateBitCast(Ptr, VecPtrTy);
if (CIntStride && CIntStride->getSExtValue() == 1) {
-    unsigned Align = EleTy->getScalarSizeInBits() / 8;
if (EVL) {
const Function *F = B.GetInsertBlock()->getParent();
-      auto const Legality = isVPLoadLegal(F, Ty, Align);
+      auto const Legality = isVPLoadLegal(F, Ty, Alignment);
if (!Legality.isVPLegal()) {
emitVeczRemarkMissed(F,
"Could not create a VP load as the target "
@@ -106,7 +106,7 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtr->getType()};
return B.CreateIntrinsic(llvm::Intrinsic::vp_load, Tys, Args);
}
-    return B.CreateAlignedLoad(Ty, VecPtr, MaybeAlign(Align));
+    return B.CreateAlignedLoad(Ty, VecPtr, MaybeAlign(Alignment));
}

if (EVL) {
@@ -1112,7 +1112,7 @@ bool TargetInfo::optimizeInterleavedGroup(IRBuilder<> &B,
}
Value *Load = nullptr;
if (!HasMask) {
-      Load = createLoad(B, VecTy, AddressN, getSizeInt(B, 1));
+      Load = createLoad(B, VecTy, AddressN, getSizeInt(B, 1), Align);
} else {
Value *Mask = VecMasks[i];
Load =
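The deleted line above is the heart of the fix: the alignment used to be recomputed from the element type, so a 16-byte-aligned load of <16 x float> came out as only align 4 (as the test updates below confirm). A hedged before/after illustration; neither helper is vecz code.

    #include <llvm/IR/Instructions.h>
    #include <llvm/IR/Type.h>

    // What the old code effectively used: one lane's width in bytes
    // (getScalarSizeInBits on a vector type returns the element size).
    unsigned elementAlign(llvm::Type *VecTy) {
      return VecTy->getScalarSizeInBits() / 8;  // <16 x float> -> 4
    }

    // What the fix threads through instead: the alignment of the original
    // memory operation itself.
    unsigned operationAlign(const llvm::LoadInst &Orig) {
      return static_cast<unsigned>(Orig.getAlign().value());  // e.g. 16
    }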
@@ -115,7 +115,7 @@ entry:
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{(i32|i64)}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x float> undef)
; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
-; CHECK-NEXT: [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
; CHECK-NEXT: [[TMP4:%.*]] = fadd <vscale x 16 x float> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
; CHECK-NEXT: store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
@@ -125,7 +125,7 @@ entry:
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
-; CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 16
; CHECK-NEXT: [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
; CHECK-NEXT: [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
; CHECK-NEXT: [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 8388607, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
@@ -160,7 +160,7 @@ entry:
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
; CHECK-NEXT: store <vscale x 16 x i32> [[TMP3]], ptr addrspace(1) [[ARRAYIDX4]], align 16
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
-; CHECK-NEXT: [[TMP6:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
; CHECK-NEXT: [[V46:%.*]] = fadd <vscale x 16 x float> [[TMP6]], [[TMP1]]
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
; CHECK-NEXT: store <vscale x 16 x float> [[V46]], ptr addrspace(1) [[ARRAYIDX3]], align 16
@@ -38,7 +38,7 @@ entry:

; CHECK: define spir_kernel void @__vecz_v4_runtime_index

-; CHECK: %[[INTO:.+]] = load <16 x i32>, ptr %arrayidx, align 4
+; CHECK: %[[INTO:.+]] = load <16 x i32>, ptr %arrayidx, align 16
; CHECK: %[[LD:.+]] = load <4 x i32>, ptr
; CHECK: %[[ADD:.+]] = add <4 x i32> %[[LD]], <i32 0, i32 4, i32 8, i32 12>

@@ -62,7 +62,7 @@ entry:
; CHECK: %idx = call i64 @__mux_get_global_id(i32 0)
; CHECK: %a = getelementptr <2 x i32>, ptr %pa, i64 %idx
; CHECK: %b = getelementptr <2 x i32>, ptr %pb, i64 %idx
-; CHECK: %[[T0:.*]] = load <8 x i32>, ptr %a, align 4
+; CHECK: %[[T0:.*]] = load <8 x i32>, ptr %a, align 8
; CHECK: %[[RES2:.+]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %[[T0]], i1 true)
; CHECK: store <8 x i32> %[[RES2]], ptr %b, align 8
; CHECK: ret void
@@ -41,10 +41,10 @@ entry:

; It checks that the zexts and add of <4 x i32> get widened by a factor of 8,
; to produce PAIRs of <16 x i32>s.
-; CHECK: %[[LDA0:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
-; CHECK: %[[LDA1:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB0:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB1:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA0:.+]] = load <16 x i32>, ptr %{{.+}}, align 16
+; CHECK: %[[LDA1:.+]] = load <16 x i32>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB0:.+]] = load <16 x i32>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB1:.+]] = load <16 x i32>, ptr %{{.+}}, align 16
; CHECK: %[[XA0:.+]] = zext <16 x i32> %[[LDA0]] to <16 x i64>
; CHECK: %[[XA1:.+]] = zext <16 x i32> %[[LDA1]] to <16 x i64>
; CHECK: %[[XB0:.+]] = zext <16 x i32> %[[LDB0]] to <16 x i64>
@@ -69,8 +69,8 @@ entry:
; CHECK: %a = getelementptr <2 x float>, ptr %pa, i64 %idx
; CHECK: %b = getelementptr <2 x float>, ptr %pb, i64 %idx
; CHECK: %c = getelementptr <2 x float>, ptr %pc, i64 %idx
-; CHECK: [[T0:%.*]] = load <8 x float>, ptr %a, align 4
-; CHECK: [[T1:%.*]] = load <8 x float>, ptr %b, align 4
+; CHECK: [[T0:%.*]] = load <8 x float>, ptr %a, align 8
+; CHECK: [[T1:%.*]] = load <8 x float>, ptr %b, align 8
; CHECK: %res1 = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[T0]], <8 x float> [[T1]])
; CHECK: store <8 x float> %res1, ptr %c, align 8
; CHECK: ret void
12 changes: 6 additions & 6 deletions modules/compiler/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
@@ -43,12 +43,12 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)

; It checks that the fma intrinsic of <4 x float> gets widened by a factor of 8,
; to produce a PAIR of <16 x float>s.
-; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %[[LDA0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]])
; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %[[LDA1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]])
; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16
@@ -43,12 +43,12 @@ declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)

; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8,
; to produce a PAIR of <16 x float>s.
-; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[LDA0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]])
; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[LDA1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]])
; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16
@@ -53,12 +53,12 @@ declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)

; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8,
; to produce a PAIR of <16 x float>s.
-; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16

; CHECK: loop:
; CHECK: %[[ACC0:.+]] = phi <16 x float> [ %[[FMA0:.+]], %loop ], [ %[[LDA0]], %entry ]
42 changes: 42 additions & 0 deletions modules/compiler/vecz/test/lit/llvm/simplify-masked-memops.ll
@@ -0,0 +1,42 @@
; Copyright (C) Codeplay Software Limited
;
; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
; Exceptions; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
; License for the specific language governing permissions and limitations
; under the License.
;
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

; RUN: veczc -k foo -vecz-passes=mask-memops -S < %s | FileCheck %s

define void @foo(i16 %x, i32 %y, ptr addrspace(1) %p) {
entry:
call void @__vecz_b_masked_store2_tu3ptrU3AS1b(i16 %x, ptr addrspace(1) %p, i1 true)
call void @__vecz_b_masked_store2_ju3ptrU3AS1b(i32 %y, ptr addrspace(1) %p, i1 true)
%f = call float @__vecz_b_masked_load2_fu3ptrU3AS1b(ptr addrspace(1) %p, i1 true)
%v4f = call <4 x float> @__vecz_b_masked_load2_Dv4_fu3ptrU3AS1Dv4_b(ptr addrspace(1) %p, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
ret void
}

; Check we correctly set the alignment on the optimized loads and stores. The
; alignment must come from the builtin, not from the natural/preferred
; alignment for that type.
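; (The trailing "2" in masked_load2/masked_store2 appears to encode the
; builtin's 2-byte alignment, which is what the CHECK lines below expect.)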
; CHECK: define void @__vecz_v4_foo(i16 %x, i32 %y, ptr addrspace(1) %p)
; CHECK: entry:
; CHECK: store i16 %x, ptr addrspace(1) %p, align 2
; CHECK-NEXT: store i32 %y, ptr addrspace(1) %p, align 2
; CHECK-NEXT: %f = load float, ptr addrspace(1) %p, align 2
; CHECK-NEXT: %v4f = load <4 x float>, ptr addrspace(1) %p, align 2
; CHECK-NEXT: ret void

declare void @__vecz_b_masked_store2_tu3ptrU3AS1b(i16, ptr addrspace(1), i1)
declare void @__vecz_b_masked_store2_ju3ptrU3AS1b(i32, ptr addrspace(1), i1)
declare float @__vecz_b_masked_load2_fu3ptrU3AS1b(ptr addrspace(1), i1)
declare <4 x float> @__vecz_b_masked_load2_Dv4_fu3ptrU3AS1Dv4_b(ptr addrspace(1), <4 x i1>)
