Merge pull request #138 from frasercrmck/fix-vecz-alignment
[vecz] Fix alignment of load operations
frasercrmck authored Sep 20, 2023
2 parents 4f56ab5 + 91bf24e · commit 00f82df
Showing 13 changed files with 87 additions and 40 deletions.
2 changes: 2 additions & 0 deletions modules/compiler/vecz/include/vecz/vecz_target_info.h
@@ -79,13 +79,15 @@ class TargetInfo {
/// @param[in] ptr Memory address to load a vector value from.
/// @param[in] stride Distance in elements between two lanes in memory.
/// A stride of one represents a contiguous load.
+  /// @param[in] alignment The alignment of the load, in bytes.
/// @param[in] evl 'effective vector length' of the operation. Must be
/// pre-scaled for vector operations. If null, the operation is unpredicated:
/// it is executed on all lanes.
///
/// @return IR value that results from the vector load.
virtual llvm::Value *createLoad(llvm::IRBuilder<> &builder, llvm::Type *ty,
llvm::Value *ptr, llvm::Value *stride,
+                                  unsigned alignment,
llvm::Value *evl = nullptr) const;

/// @brief Create a vector store. If a stride greater than one is used, the
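As a hedged illustration of the new interface (not code from this change): a caller packetizing a scalar load would now pass the original operation's alignment through instead of letting createLoad derive one from the element type. The helper name widenLoad, the include paths, and the vecz namespace are assumptions.

    #include <llvm/IR/IRBuilder.h>
    #include <llvm/IR/Instructions.h>
    #include <vecz/vecz_target_info.h>  // assumed include path for TargetInfo

    // Sketch: widen a scalar load into a vector load of VecTy, preserving
    // the alignment of the original memory operation.
    llvm::Value *widenLoad(const vecz::TargetInfo &VTI, llvm::IRBuilder<> &B,
                           llvm::LoadInst &Orig, llvm::Type *VecTy) {
      const unsigned Alignment =
          static_cast<unsigned>(Orig.getAlign().value());
      // A stride of one means a contiguous load; a null EVL leaves the
      // operation unpredicated (executed on all lanes).
      return VTI.createLoad(B, VecTy, Orig.getPointerOperand(),
                            /*stride*/ B.getInt64(1), Alignment,
                            /*evl*/ nullptr);
    }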
4 changes: 2 additions & 2 deletions modules/compiler/vecz/source/transform/packetizer.cpp
@@ -2158,8 +2158,8 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride,
Twine(name, ".incr"));
}
-      results.push_back(
-          VTI.createLoad(B, getWideType(dataTy, factor), ptr, one, EVL));
+      results.push_back(VTI.createLoad(B, getWideType(dataTy, factor), ptr,
+                                       one, alignment, EVL));
}
} else {
auto *const one = B.getInt64(1);
11 changes: 7 additions & 4 deletions modules/compiler/vecz/source/transform/passes.cpp
@@ -123,6 +123,7 @@ PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
Value *Data = MaskedOp->getDataOperand();
Value *Ptr = MaskedOp->getPointerOperand();
Type *DataTy = MaskedOp->getDataType();
+      auto Alignment = BuiltinDesc->getAlignment();
if (MaskedOp->isLoad()) {
Value *Load = nullptr;
if (DataTy->isVectorTy()) {
@@ -133,11 +134,13 @@ PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
if (isa<ScalableVectorType>(DataTy)) {
continue;
}
-        Load = VTI.createLoad(B, CI->getType(), Ptr, B.getInt64(1));
+        Load =
+            VTI.createLoad(B, CI->getType(), Ptr, B.getInt64(1), Alignment);
} else {
-        Load = B.CreateLoad(CI->getType(), Ptr, /*isVolatile*/ false,
-                            CI->getName());
+        Load = B.CreateAlignedLoad(CI->getType(), Ptr, Align(Alignment),
+                                   /*isVolatile*/ false, CI->getName());
}
+      Load->takeName(CI);
CI->replaceAllUsesWith(Load);
} else {
if (DataTy->isVectorTy()) {
@@ -151,7 +154,7 @@ PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
VTI.createStore(B, Data, Ptr, B.getInt64(1),
BuiltinDesc->getAlignment());
} else {
-        B.CreateStore(Data, Ptr);
+        B.CreateAlignedStore(Data, Ptr, Align(Alignment));
}
}
ToDelete.push_back(CI);
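The net effect of this hunk, as a standalone sketch: when a masked memory builtin's mask is known all-true, the pass replaces the call with a plain memory operation carrying the builtin's alignment rather than the type's natural alignment. simplifyMaskedLoad is an illustrative name, not vecz API; it assumes the mask has already been proven all-true.

    #include <llvm/IR/IRBuilder.h>
    #include <llvm/IR/Instructions.h>
    #include <llvm/Support/Alignment.h>

    // Sketch: rewrite an all-true masked load builtin call CI into a plain
    // aligned load; this is exact, because every lane was enabled anyway.
    llvm::Value *simplifyMaskedLoad(llvm::IRBuilder<> &B, llvm::CallInst &CI,
                                    llvm::Value *Ptr, unsigned Alignment) {
      llvm::LoadInst *Load = B.CreateAlignedLoad(CI.getType(), Ptr,
                                                 llvm::Align(Alignment));
      Load->takeName(&CI);          // keep the original value's name
      CI.replaceAllUsesWith(Load);  // users now see the plain load
      return Load;
    }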
10 changes: 5 additions & 5 deletions modules/compiler/vecz/source/vector_target_info.cpp
@@ -73,7 +73,8 @@ bool isLegalMaskedScatter(const TargetTransformInfo &TTI, Type *Ty,
TargetInfo::TargetInfo(TargetMachine *tm) : TM_(tm) {}

Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
-                              Value *Stride, Value *EVL) const {
+                              Value *Stride, unsigned Alignment,
+                              Value *EVL) const {
if (!Ptr || !Stride || !Ty->isVectorTy()) {
return nullptr;
}
@@ -90,10 +91,9 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
PointerType *VecPtrTy = Ty->getPointerTo(PtrTy->getAddressSpace());
Value *VecPtr = B.CreateBitCast(Ptr, VecPtrTy);
if (CIntStride && CIntStride->getSExtValue() == 1) {
-    unsigned Align = EleTy->getScalarSizeInBits() / 8;
if (EVL) {
const Function *F = B.GetInsertBlock()->getParent();
-      auto const Legality = isVPLoadLegal(F, Ty, Align);
+      auto const Legality = isVPLoadLegal(F, Ty, Alignment);
if (!Legality.isVPLegal()) {
emitVeczRemarkMissed(F,
"Could not create a VP load as the target "
@@ -106,7 +106,7 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtr->getType()};
return B.CreateIntrinsic(llvm::Intrinsic::vp_load, Tys, Args);
}
-    return B.CreateAlignedLoad(Ty, VecPtr, MaybeAlign(Align));
+    return B.CreateAlignedLoad(Ty, VecPtr, MaybeAlign(Alignment));
}

if (EVL) {
@@ -1112,7 +1112,7 @@ bool TargetInfo::optimizeInterleavedGroup(IRBuilder<> &B,
}
Value *Load = nullptr;
if (!HasMask) {
-      Load = createLoad(B, VecTy, AddressN, getSizeInt(B, 1));
+      Load = createLoad(B, VecTy, AddressN, getSizeInt(B, 1), Align);
} else {
Value *Mask = VecMasks[i];
Load =
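The deleted line above is the heart of the fix: the alignment used to be recomputed from the element type, so a 16-byte-aligned load of <16 x float> came out as only align 4 (as the test updates below confirm). A hedged before/after illustration; neither helper is vecz code.

    #include <llvm/IR/Instructions.h>
    #include <llvm/IR/Type.h>

    // What the old code effectively used: one lane's width in bytes
    // (getScalarSizeInBits on a vector type returns the element size).
    unsigned elementAlign(llvm::Type *VecTy) {
      return VecTy->getScalarSizeInBits() / 8;  // <16 x float> -> 4
    }

    // What the fix threads through instead: the alignment of the original
    // memory operation itself.
    unsigned operationAlign(const llvm::LoadInst &Orig) {
      return static_cast<unsigned>(Orig.getAlign().value());  // e.g. 16
    }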
@@ -115,7 +115,7 @@ entry:
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{(i32|i64)}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x float> undef)
; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
-; CHECK-NEXT: [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
; CHECK-NEXT: [[TMP4:%.*]] = fadd <vscale x 16 x float> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
; CHECK-NEXT: store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
@@ -125,7 +125,7 @@ entry:
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
-; CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 16
; CHECK-NEXT: [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
; CHECK-NEXT: [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
; CHECK-NEXT: [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 8388607, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
@@ -160,7 +160,7 @@ entry:
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
; CHECK-NEXT: store <vscale x 16 x i32> [[TMP3]], ptr addrspace(1) [[ARRAYIDX4]], align 16
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
-; CHECK-NEXT: [[TMP6:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
; CHECK-NEXT: [[V46:%.*]] = fadd <vscale x 16 x float> [[TMP6]], [[TMP1]]
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
; CHECK-NEXT: store <vscale x 16 x float> [[V46]], ptr addrspace(1) [[ARRAYIDX3]], align 16
@@ -38,7 +38,7 @@ entry:

; CHECK: define spir_kernel void @__vecz_v4_runtime_index

-; CHECK: %[[INTO:.+]] = load <16 x i32>, ptr %arrayidx, align 4
+; CHECK: %[[INTO:.+]] = load <16 x i32>, ptr %arrayidx, align 16
; CHECK: %[[LD:.+]] = load <4 x i32>, ptr
; CHECK: %[[ADD:.+]] = add <4 x i32> %[[LD]], <i32 0, i32 4, i32 8, i32 12>

@@ -62,7 +62,7 @@ entry:
; CHECK: %idx = call i64 @__mux_get_global_id(i32 0)
; CHECK: %a = getelementptr <2 x i32>, ptr %pa, i64 %idx
; CHECK: %b = getelementptr <2 x i32>, ptr %pb, i64 %idx
-; CHECK: %[[T0:.*]] = load <8 x i32>, ptr %a, align 4
+; CHECK: %[[T0:.*]] = load <8 x i32>, ptr %a, align 8
; CHECK: %[[RES2:.+]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %[[T0]], i1 true)
; CHECK: store <8 x i32> %[[RES2]], ptr %b, align 8
; CHECK: ret void
@@ -41,10 +41,10 @@ entry:

; It checks that the zexts and add of <4 x i32> get widened by a factor of 8,
; to produce PAIRs of <16 x i32>s.
-; CHECK: %[[LDA0:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
-; CHECK: %[[LDA1:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB0:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB1:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA0:.+]] = load <16 x i32>, ptr %{{.+}}, align 16
+; CHECK: %[[LDA1:.+]] = load <16 x i32>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB0:.+]] = load <16 x i32>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB1:.+]] = load <16 x i32>, ptr %{{.+}}, align 16
; CHECK: %[[XA0:.+]] = zext <16 x i32> %[[LDA0]] to <16 x i64>
; CHECK: %[[XA1:.+]] = zext <16 x i32> %[[LDA1]] to <16 x i64>
; CHECK: %[[XB0:.+]] = zext <16 x i32> %[[LDB0]] to <16 x i64>
@@ -69,8 +69,8 @@ entry:
; CHECK: %a = getelementptr <2 x float>, ptr %pa, i64 %idx
; CHECK: %b = getelementptr <2 x float>, ptr %pb, i64 %idx
; CHECK: %c = getelementptr <2 x float>, ptr %pc, i64 %idx
-; CHECK: [[T0:%.*]] = load <8 x float>, ptr %a, align 4
-; CHECK: [[T1:%.*]] = load <8 x float>, ptr %b, align 4
+; CHECK: [[T0:%.*]] = load <8 x float>, ptr %a, align 8
+; CHECK: [[T1:%.*]] = load <8 x float>, ptr %b, align 8
; CHECK: %res1 = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[T0]], <8 x float> [[T1]])
; CHECK: store <8 x float> %res1, ptr %c, align 8
; CHECK: ret void
12 changes: 6 additions & 6 deletions modules/compiler/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
@@ -43,12 +43,12 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)

; It checks that the fma intrinsic of <4 x float> gets widened by a factor of 8,
; to produce a PAIR of <16 x float>s.
-; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %[[LDA0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]])
; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %[[LDA1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]])
; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16
@@ -43,12 +43,12 @@ declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)

; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8,
; to produce a PAIR of <16 x float>s.
-; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[LDA0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]])
; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[LDA1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]])
; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16
@@ -53,12 +53,12 @@ declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)

; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8,
; to produce a PAIR of <16 x float>s.
-; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16

; CHECK: loop:
; CHECK: %[[ACC0:.+]] = phi <16 x float> [ %[[FMA0:.+]], %loop ], [ %[[LDA0]], %entry ]
42 changes: 42 additions & 0 deletions modules/compiler/vecz/test/lit/llvm/simplify-masked-memops.ll
@@ -0,0 +1,42 @@
; Copyright (C) Codeplay Software Limited
;
; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
; Exceptions; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
; License for the specific language governing permissions and limitations
; under the License.
;
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

; RUN: veczc -k foo -vecz-passes=mask-memops -S < %s | FileCheck %s

define void @foo(i16 %x, i32 %y, ptr addrspace(1) %p) {
entry:
call void @__vecz_b_masked_store2_tu3ptrU3AS1b(i16 %x, ptr addrspace(1) %p, i1 true)
call void @__vecz_b_masked_store2_ju3ptrU3AS1b(i32 %y, ptr addrspace(1) %p, i1 true)
%f = call float @__vecz_b_masked_load2_fu3ptrU3AS1b(ptr addrspace(1) %p, i1 true)
%v4f = call <4 x float> @__vecz_b_masked_load2_Dv4_fu3ptrU3AS1Dv4_b(ptr addrspace(1) %p, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
ret void
}

; Check we correctly set the alignment on the optimized loads and stores. The
; alignment must come from the builtin, not from the natural/preferred
; alignment for that type.
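; (The trailing "2" in masked_load2/masked_store2 appears to encode the
; builtin's 2-byte alignment, which is what the CHECK lines below expect.)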
; CHECK: define void @__vecz_v4_foo(i16 %x, i32 %y, ptr addrspace(1) %p)
; CHECK: entry:
; CHECK: store i16 %x, ptr addrspace(1) %p, align 2
; CHECK-NEXT: store i32 %y, ptr addrspace(1) %p, align 2
; CHECK-NEXT: %f = load float, ptr addrspace(1) %p, align 2
; CHECK-NEXT: %v4f = load <4 x float>, ptr addrspace(1) %p, align 2
; CHECK-NEXT: ret void

declare void @__vecz_b_masked_store2_tu3ptrU3AS1b(i16, ptr addrspace(1), i1)
declare void @__vecz_b_masked_store2_ju3ptrU3AS1b(i32, ptr addrspace(1), i1)
declare float @__vecz_b_masked_load2_fu3ptrU3AS1b(ptr addrspace(1), i1)
declare <4 x float> @__vecz_b_masked_load2_Dv4_fu3ptrU3AS1Dv4_b(ptr addrspace(1), <4 x i1>)
