From 64d0ec5edebd82b1a4e0db8a53e76a31de5742b7 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Thu, 2 Nov 2023 19:16:54 +0000 Subject: [PATCH] [vecz] Support vector-predicated reductions natively The initial vecz support for vector-predication was implemented around LLVM 12, before there were intrinsics available for reduction operations. This meant that we had to work around the lack of intrinsics by using regular reduction intrinsics and 'sanitizing' the input by masking out the unwanted vector elements with the neutral value. Vector-predicated reduction intrinsics have been around since LLVM 14, so it's high time we accommodated them natively. This should lead to better code generation when vector-predicating kernels. --- .../include/transform/packetization_helpers.h | 26 +++--- .../transform/packetization_helpers.cpp | 78 ++++++++++++++--- .../vecz/source/transform/packetizer.cpp | 52 ++---------- .../llvm/VectorPredication/boscc_reduction.ll | 7 +- .../packetize_mask_varying.ll | 7 +- .../VectorPredication/subgroup_reductions.ll | 83 +++++-------------- ...ions_spv_khr_uniform_group_instructions.ll | 72 ++++++---------- 7 files changed, 143 insertions(+), 182 deletions(-) diff --git a/modules/compiler/vecz/source/include/transform/packetization_helpers.h b/modules/compiler/vecz/source/include/transform/packetization_helpers.h index 6366145fd..c5da96058 100644 --- a/modules/compiler/vecz/source/include/transform/packetization_helpers.h +++ b/modules/compiler/vecz/source/include/transform/packetization_helpers.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -87,18 +88,21 @@ bool createSubSplats(const vecz::TargetInfo &TI, llvm::IRBuilder<> &B, llvm::SmallVectorImpl &srcs, unsigned subWidth); -/// @brief Utility function for sanitizing the input to a reduction when -/// vector-predicating. Since VP reduction intrinsics didn't land in LLVM 13, -/// reductions must ensure that elements past VL don't affect the result. +/// @brief Utility function for creating a reduction operation. /// -/// Only works on RecurKind::And, Or, Add, SMin, SMax, UMin, UMax, FAdd. +/// The value must be a vector. /// -/// @param[in] B IRBuilder to build any new instructions created -/// @param[in] Val The value to sanitize -/// @param[in] VL The vector length -/// @param[in] Kind The kind of reduction to sanitize for -llvm::Value *sanitizeVPReductionInput(llvm::IRBuilder<> &B, llvm::Value *Val, - llvm::Value *VL, llvm::RecurKind Kind); +/// If VL is passed and is non-null, it is assumed to be the i32 value +/// representing the active vector length. The reduction will be +/// vector-predicated according to this length. +/// +/// Only works on RecurKind::And, Or, Xor, Add, Mul, FAdd, FMul, {S,U,F}Min, +/// {S,U,F}Max. +llvm::Value *createMaybeVPTargetReduction(llvm::IRBuilderBase &B, + const llvm::TargetTransformInfo &TTI, + llvm::Value *Val, + llvm::RecurKind Kind, + llvm::Value *VL = nullptr); /// @brief Utility function to obtain an indices vector to be used in a gather /// operation. @@ -119,7 +123,7 @@ llvm::Value *getGatherIndicesVector(llvm::IRBuilder<> &B, llvm::Value *Indices, const llvm::Twine &N = ""); /// @brief Returns a boolean vector with all elements set to 'true'. -llvm::Value *createAllTrueMask(llvm::IRBuilder<> &B, llvm::ElementCount EC); +llvm::Value *createAllTrueMask(llvm::IRBuilderBase &B, llvm::ElementCount EC); /// @brief Returns an integer step vector, representing the sequence 0 ... N-1.
llvm::Value *createIndexSequence(llvm::IRBuilder<> &Builder, diff --git a/modules/compiler/vecz/source/transform/packetization_helpers.cpp b/modules/compiler/vecz/source/transform/packetization_helpers.cpp index a76651a64..9c5c5c853 100644 --- a/modules/compiler/vecz/source/transform/packetization_helpers.cpp +++ b/modules/compiler/vecz/source/transform/packetization_helpers.cpp @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include #include @@ -247,16 +249,70 @@ bool createSubSplats(const vecz::TargetInfo &TI, IRBuilder<> &B, return true; } -Value *sanitizeVPReductionInput(IRBuilder<> &B, Value *Val, Value *VL, - RecurKind Kind) { - Type *const ValTy = Val->getType(); - ElementCount const EC = multi_llvm::getVectorElementCount(ValTy); - Value *const VLSplat = B.CreateVectorSplat(EC, VL); - Value *const IdxVec = - createIndexSequence(B, VectorType::get(VL->getType(), EC)); - Value *const ActiveMask = B.CreateICmp(CmpInst::ICMP_ULT, IdxVec, VLSplat); - auto *const NeutralVal = compiler::utils::getNeutralVal(Kind, ValTy); - return B.CreateSelect(ActiveMask, Val, NeutralVal); +Value *createMaybeVPTargetReduction(IRBuilderBase &B, + const TargetTransformInfo &TTI, Value *Val, + RecurKind Kind, Value *VL) { + assert(isa<VectorType>(Val->getType()) && "Must be vector type"); + // If VL is null, it's not a vector-predicated reduction. + if (!VL) { + return createSimpleTargetReduction(B, &TTI, Val, Kind); + } + auto IntrinsicOp = Intrinsic::not_intrinsic; + switch (Kind) { + default: + break; + case RecurKind::None: + return nullptr; + case RecurKind::Add: + IntrinsicOp = Intrinsic::vp_reduce_add; + break; + case RecurKind::Mul: + IntrinsicOp = Intrinsic::vp_reduce_mul; + break; + case RecurKind::Or: + IntrinsicOp = Intrinsic::vp_reduce_or; + break; + case RecurKind::And: + IntrinsicOp = Intrinsic::vp_reduce_and; + break; + case RecurKind::Xor: + IntrinsicOp = Intrinsic::vp_reduce_xor; + break; + case RecurKind::FAdd: + IntrinsicOp = Intrinsic::vp_reduce_fadd; + break; + case RecurKind::FMul: + IntrinsicOp = Intrinsic::vp_reduce_fmul; + break; + case RecurKind::SMin: + IntrinsicOp = Intrinsic::vp_reduce_smin; + break; + case RecurKind::SMax: + IntrinsicOp = Intrinsic::vp_reduce_smax; + break; + case RecurKind::UMin: + IntrinsicOp = Intrinsic::vp_reduce_umin; + break; + case RecurKind::UMax: + IntrinsicOp = Intrinsic::vp_reduce_umax; + break; + case RecurKind::FMin: + IntrinsicOp = Intrinsic::vp_reduce_fmin; + break; + case RecurKind::FMax: + IntrinsicOp = Intrinsic::vp_reduce_fmax; + break; + } + + auto *const F = Intrinsic::getDeclaration(B.GetInsertBlock()->getModule(), + IntrinsicOp, Val->getType()); + assert(F && "Could not declare vector-predicated reduction intrinsic"); + + auto *const VecTy = cast<VectorType>(Val->getType()); + auto *const NeutralVal = + compiler::utils::getNeutralVal(Kind, VecTy->getElementType()); + auto *const Mask = createAllTrueMask(B, VecTy->getElementCount()); + return B.CreateCall(F, {NeutralVal, Val, Mask, VL}); } Value *getGatherIndicesVector(IRBuilder<> &B, Value *Indices, Type *Ty, @@ -272,7 +328,7 @@ Value *getGatherIndicesVector(IRBuilder<> &B, Value *Indices, Type *Ty, return B.CreateAdd(StepsMul, Indices, N); } -Value *createAllTrueMask(IRBuilder<> &B, ElementCount EC) { +Value *createAllTrueMask(IRBuilderBase &B, ElementCount EC) { return ConstantInt::getTrue(VectorType::get(B.getInt1Ty(), EC)); } diff --git a/modules/compiler/vecz/source/transform/packetizer.cpp b/modules/compiler/vecz/source/transform/packetizer.cpp index 7600f96e6..f1deff2a2 100644 ---
a/modules/compiler/vecz/source/transform/packetizer.cpp +++ b/modules/compiler/vecz/source/transform/packetizer.cpp @@ -839,12 +839,7 @@ Value *Packetizer::Impl::reduceBranchCond(Value *cond, Instruction *terminator, // value. Value *&f = conds.front(); - if (VL) { - f = sanitizeVPReductionInput(B, f, VL, kind); - VECZ_FAIL_IF(!f); - } - - return createSimpleTargetReduction(B, &TTI, f, kind); + return createMaybeVPTargetReduction(B, TTI, f, kind, VL); } Packetizer::Result Packetizer::Impl::assign(Value *Scalar, Value *Vectorized) { @@ -899,14 +894,7 @@ Packetizer::Result Packetizer::Impl::packetize(Value *V) { if (newCond->getType()->isVectorTy()) { IRBuilder<> B(Branch); RecurKind kind = RecurKind::Or; - // Sanitize VP reduction inputs, if required. - if (VL) { - newCond = sanitizeVPReductionInput(B, newCond, VL, kind); - if (!newCond) { - return Packetizer::Result(*this); - } - } - newCond = createSimpleTargetReduction(B, &TTI, newCond, kind); + newCond = createMaybeVPTargetReduction(B, TTI, newCond, kind, VL); } Branch->setCondition(newCond); @@ -1183,19 +1171,8 @@ Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) { // them of ordering? See CA-3969. op.getPacketValues(packetWidth, opPackets); - // When in VP mode, pre-sanitize the reduction input (before VP reduction - // intrinsics, introduced in LLVM 14) - if (VL) { - assert(opPackets.size() == 1 && - "Should have bailed if dealing with more than one packet"); - Value *&val = opPackets.front(); - val = sanitizeVPReductionInput(B, val, VL, Info->Recurrence); - if (!val) { - emitVeczRemarkMissed( - &F, CI, "Can not vector-predicate workgroup/subgroup reduction"); - return nullptr; - } - } + assert((!VL || packetWidth == 1) && + "Should have bailed if dealing with more than one VP packet"); // According to the OpenCL Spec, we are allowed to rearrange the operation // order of a workgroup/subgroup reduction any way we like (even though @@ -1216,8 +1193,8 @@ } // Reduce to a scalar. - Value *v = - createSimpleTargetReduction(B, &TTI, opPackets.front(), Info->Recurrence); + Value *v = createMaybeVPTargetReduction(B, TTI, opPackets.front(), + Info->Recurrence, VL); // We leave the original reduction function and divert the vectorized // reduction through it, giving us a reduction over the full apparent @@ -1624,14 +1601,8 @@ Value *Packetizer::Impl::packetizeMaskVarying(Instruction *I) { auto *maskInst = dyn_cast<Instruction>(vecMask); IRBuilder<> B(maskInst ? buildAfter(maskInst, F) : I); - // Sanitize any vector-predicated inputs. - if (VL) { - vecMask = sanitizeVPReductionInput(B, vecMask, VL, RecurKind::Or); - VECZ_FAIL_IF(!vecMask); - } - Value *anyOfMask = - createSimpleTargetReduction(B, &TTI, vecMask, RecurKind::Or); + createMaybeVPTargetReduction(B, TTI, vecMask, RecurKind::Or, VL); anyOfMask->setName("any_of_mask"); if (isVector) { @@ -2072,13 -2043,8 @@ ValuePacket Packetizer::Impl::packetizeGroupScan( // Thus we essentially keep the original group scan, but change it to be an // exclusive one. auto *Reduction = Ops.front(); - if (VL) { - Reduction = sanitizeVPReductionInput(B, Reduction, VL, Scan.Recurrence); - if (!Reduction) { - return results; - } - } - Reduction = createSimpleTargetReduction(B, &TTI, Reduction, Scan.Recurrence); + Reduction = + createMaybeVPTargetReduction(B, TTI, Reduction, Scan.Recurrence, VL); // Now we defer to an *exclusive* scan over the group.
auto ExclScan = Scan; diff --git a/modules/compiler/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll b/modules/compiler/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll index c8393af17..d23ac4380 100644 --- a/modules/compiler/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll +++ b/modules/compiler/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll @@ -43,9 +43,4 @@ if.end: ; preds = %if.then, %entry ; CHECK: define spir_kernel void @__vecz_nxv2_vp_foo(ptr addrspace(1) nocapture readonly %a, ptr addrspace(1) nocapture %out) ; CHECK: [[CMP:%.*]] = fcmp oeq %{{.*}}, zeroinitializer -; CHECK: [[INS:%.*]] = insertelement poison, i32 [[VL:%.*]], {{(i32|i64)}} 0 -; CHECK: [[SPLAT:%.*]] = shufflevector [[INS]], poison, zeroinitializer -; CHECK: [[IDX:%.*]] = call @llvm.experimental.stepvector.nxv2i32() -; CHECK: [[MASK:%.*]] = icmp ult [[IDX]], [[SPLAT]] -; CHECK: [[INP:%.*]] = select [[MASK]], [[CMP]], zeroinitializer -; CHECK: %{{.*}} = call i1 @llvm.vector.reduce.or.nxv2i1( [[INP]]) +; CHECK: %{{.*}} = call i1 @llvm.vp.reduce.or.nxv2i1(i1 false, [[CMP]], {{.*}}, i32 {{.*}}) diff --git a/modules/compiler/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll b/modules/compiler/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll index 4e7ba7db7..6fc5db369 100644 --- a/modules/compiler/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll +++ b/modules/compiler/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll @@ -39,12 +39,7 @@ if.end: ret void ; CHECK: define spir_kernel void @__vecz_nxv4_vp_mask_varying ; CHECK: [[CMP:%.*]] = icmp slt %{{.*}}, -; CHECK: [[INS:%.*]] = insertelement poison, i32 [[VL:%.*]], {{(i32|i64)}} 0 -; CHECK: [[SPLAT:%.*]] = shufflevector [[INS]], poison, zeroinitializer -; CHECK: [[IDX:%.*]] = call @llvm.experimental.stepvector.nxv4i32() -; CHECK: [[MASK:%.*]] = icmp ult [[IDX]], [[SPLAT]] -; CHECK: [[INP:%.*]] = select [[MASK]], [[CMP]], zeroinitializer -; CHECK: [[RED:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[INP]]) +; CHECK: [[RED:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, [[CMP]], {{.*}}, i32 {{.*}}) ; CHECK: [[REINS:%.*]] = insertelement <4 x i1> poison, i1 [[RED]], {{(i32|i64)}} 0 ; CHECK: [[RESPLAT:%.*]] = shufflevector <4 x i1> [[REINS]], <4 x i1> poison, <4 x i32> zeroinitializer } diff --git a/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll b/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll index e4f95885c..9e35021ec 100644 --- a/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll +++ b/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll @@ -49,13 +49,8 @@ entry: store i32 %2, i32 addrspace(1)* %arrayidx3, align 4 ret void ; CHECK-LABEL: @__vecz_v4_vp_reduce_all_i32( -; CHECK: [[T2:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ult <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i1> , <4 x i1> [[T2]] -; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[I]] to i4 -; CHECK: [[R:%.*]] = icmp eq i4 [[T3]], -1 +; CHECK: [[C:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.and.v4i1(i1 true, <4 x i1> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 [[R]]) ; CHECK: [[EXT:%.*]] = sext i1 
%call2 to i32 ; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4 @@ -75,13 +70,8 @@ entry: store i32 %2, i32 addrspace(1)* %arrayidx3, align 4 ret void ; CHECK-LABEL: @__vecz_v4_vp_reduce_any_i32( -; CHECK: [[T2:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i1> [[T2]], <4 x i1> zeroinitializer -; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[I]] to i4 -; CHECK: [[R:%.*]] = icmp ne i4 [[T3]], 0 +; CHECK: [[C:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.or.v4i1(i1 false, <4 x i1> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 [[R]]) ; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32 ; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4 @@ -99,11 +89,8 @@ entry: store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 ret void ; CHECK-LABEL: @__vecz_v4_vp_reduce_add_i32( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer -; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[I]]) +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.add.v4i32(i32 0, <4 x i32> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 [[R]]) ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 } @@ -120,11 +107,8 @@ entry: store i64 %call2, i64 addrspace(1)* %arrayidx3, align 4 ret void ; CHECK-LABEL: @__vecz_v4_vp_reduce_add_i64( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64> zeroinitializer -; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[I]]) +; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1( +; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.add.v4i64(i64 0, <4 x i64> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 [[R]]) ; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4 } @@ -141,11 +125,8 @@ entry: store float %call2, float addrspace(1)* %arrayidx3, align 4 ret void ; CHECK-LABEL: @__vecz_v4_vp_reduce_add_f32( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> [[I]]) +; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1( +; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float [[R]]) ; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 } @@ -162,11 +143,8 @@ entry: store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 
ret void ; CHECK-LABEL: @__vecz_v4_vp_reduce_smin_i32( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> [[I]]) +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.smin.v4i32(i32 2147483647, <4 x i32> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 [[R]]) ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 } @@ -183,11 +161,8 @@ entry: store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 ret void ; CHECK-LABEL: @__vecz_v4_vp_reduce_umin_i32( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> -; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[I]]) +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.umin.v4i32(i32 -1, <4 x i32> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 [[R]]) ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 } @@ -204,11 +179,8 @@ entry: store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 ret void ; CHECK-LABEL: @__vecz_v4_vp_reduce_smax_i32( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> [[I]]) +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.smax.v4i32(i32 -2147483648, <4 x i32> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 [[R]]) ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 } @@ -225,11 +197,8 @@ entry: store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 ret void ; CHECK-LABEL: @__vecz_v4_vp_reduce_umax_i32( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer -; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[I]]) +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.umax.v4i32(i32 0, <4 x i32> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 [[R]]) ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 } @@ -246,12 +215,9 @@ entry: store float %call2, float addrspace(1)* %arrayidx3, align 4 ret void ; CHECK-LABEL: @__vecz_v4_vp_reduce_fmin_f32( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> [[I]]) 
-; CHEKC: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float [[R]]) +; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1( +; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fmin.v4f32(float 0x7FF8000000000000, <4 x float> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float [[R]]) ; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 } @@ -267,11 +233,8 @@ entry: store float %call2, float addrspace(1)* %arrayidx3, align 4 ret void ; CHECK-LABEL: @__vecz_v4_vp_reduce_fmax_f32( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> [[I]]) +; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1( +; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fmax.v4f32(float 0xFFF8000000000000, <4 x float> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float [[R]]) ; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 } diff --git a/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll index 357d6bd0d..5ee579906 100644 --- a/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll +++ b/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll @@ -35,11 +35,8 @@ declare spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1) declare spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1) ; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_i32( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> -; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[I]]) +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.mul.v4i32(i32 1, <4 x i32> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 [[R]]) ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 define spir_kernel void @reduce_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -56,11 +53,8 @@ entry: } ; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_i64( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64> -; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> [[I]]) +; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1( +; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.mul.v4i64(i64 1, <4 x i64> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 [[R]]) ; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4 define spir_kernel void @reduce_mul_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -77,11 +71,8 @@ entry: } ; 
CHECK-LABEL: @__vecz_v4_vp_reduce_mul_f32( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> -; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[I]]) +; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1( +; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float [[R]]) ; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 define spir_kernel void @reduce_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -98,11 +89,8 @@ entry: } ; CHECK-LABEL: @__vecz_v4_vp_reduce_and_i32( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> -; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[I]]) +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.and.v4i32(i32 -1, <4 x i32> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 [[R]]) ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 define spir_kernel void @reduce_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -119,11 +107,8 @@ entry: } ; CHECK-LABEL: @__vecz_v4_vp_reduce_or_i32( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer -; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[I]]) +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.or.v4i32(i32 0, <4 x i32> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_or_i32(i32 [[R]]) ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 define spir_kernel void @reduce_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -139,31 +124,28 @@ entry: ret void } -; CHECK-LABEL: @__vecz_v4_vp_reduce_xor_i32( -; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0 -; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], -; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64> zeroinitializer -; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[I]]) +; CHECK-LABEL: @__vecz_v4_vp_reduce_xor_i64( +; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1( +; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.xor.v4i64(i64 0, <4 x i64> [[C]], {{.*}}) ; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 [[R]]) -; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4 -define spir_kernel void @reduce_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 8 +define spir_kernel 
void @reduce_xor_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %call = tail call spir_func i64 @__mux_get_global_id(i32 0) %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 %conv = zext i32 %call1 to i64 - %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call - %0 = load i64, ptr addrspace(1) %arrayidx, align 4 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call + %0 = load i64, ptr addrspace(1) %arrayidx, align 8 %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 %0) %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv - store i64 %call2, ptr addrspace(1) %arrayidx3, align 4 + store i64 %call2, ptr addrspace(1) %arrayidx3, align 8 ret void } ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_and( -; This doesn't generate a reduction intrinsic... -; CHECK: [[T:%.*]] = icmp eq i4 {{%.*}}, -1 -; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 [[T]]) +; CHECK: [[T:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.and.v4i1(i1 true, <4 x i1> [[T]], {{.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 [[R]]) ; CHECK: [[R:%.*]] = zext i1 %call2 to i32 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4 define spir_kernel void @reduce_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -182,8 +164,9 @@ entry: } ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_or( -; CHECK: [[T:%.*]] = icmp ne i4 {{%.*}}, 0 -; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 [[T]]) +; CHECK: [[T:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.or.v4i1(i1 false, <4 x i1> [[T]], {{.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 [[R]]) ; CHECK: [[R:%.*]] = zext i1 %call2 to i32 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4 define spir_kernel void @reduce_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -202,10 +185,9 @@ entry: } ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_xor( -; CHECK: [[X:%.*]] = call i4 @llvm.ctpop.i4(i4 {{%.*}}) -; CHECK: [[T:%.*]] = and i4 [[X]], 1 -; CHECK: [[C:%.*]] = icmp ne i4 [[T]], 0 -; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[C]]) +; CHECK: [[T:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.xor.v4i1(i1 false, <4 x i1> [[T]], {{.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[R]]) ; CHECK: [[R:%.*]] = zext i1 %call2 to i32 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4 define spir_kernel void @reduce_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {
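
For illustration only (not part of the patch): a minimal LLVM IR sketch, under assumed names and an assumed <vscale x 2 x i32> shape, of what this change amounts to for a single scalable integer add reduction. The intrinsics are the real LLVM ones exercised by the tests above; everything else is hypothetical.

; Before: lanes at and beyond VL are forced to the neutral value (0 for add)
; so that a plain reduction intrinsic ignores them.
declare <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
declare i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32>)

define i32 @reduce_add_sanitized(<vscale x 2 x i32> %val, i32 %vl) {
  %vl.ins = insertelement <vscale x 2 x i32> poison, i32 %vl, i32 0
  %vl.splat = shufflevector <vscale x 2 x i32> %vl.ins, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
  %idx = call <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
  %active = icmp ult <vscale x 2 x i32> %idx, %vl.splat
  %inp = select <vscale x 2 x i1> %active, <vscale x 2 x i32> %val, <vscale x 2 x i32> zeroinitializer
  %r = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %inp)
  ret i32 %r
}

; After: the vector length is passed straight to the VP reduction intrinsic;
; the neutral value survives only as the start operand and the mask is
; all-true, matching how createMaybeVPTargetReduction builds the call.
declare i32 @llvm.vp.reduce.add.nxv2i32(i32, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)

define i32 @reduce_add_vp(<vscale x 2 x i32> %val, i32 %vl) {
  %m.ins = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
  %mask = shufflevector <vscale x 2 x i1> %m.ins, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
  %r = call i32 @llvm.vp.reduce.add.nxv2i32(i32 0, <vscale x 2 x i32> %val, <vscale x 2 x i1> %mask, i32 %vl)
  ret i32 %r
}

The VP form folds the stepvector/splat/select scaffolding into the intrinsic's explicit vector-length operand, which is the code-generation improvement the commit message describes.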