diff --git a/modules/compiler/vecz/source/include/transform/packetization_helpers.h b/modules/compiler/vecz/source/include/transform/packetization_helpers.h
index 6366145fd..c5da96058 100644
--- a/modules/compiler/vecz/source/include/transform/packetization_helpers.h
+++ b/modules/compiler/vecz/source/include/transform/packetization_helpers.h
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -87,18 +88,21 @@ bool createSubSplats(const vecz::TargetInfo &TI, llvm::IRBuilder<> &B,
                      llvm::SmallVectorImpl<llvm::Value *> &srcs,
                      unsigned subWidth);
 
-/// @brief Utility function for sanitizing the input to a reduction when
-/// vector-predicating. Since VP reduction intrinsics didn't land in LLVM 13,
-/// reductions must ensure that elements past VL don't affect the result.
+/// @brief Utility function for creating a reduction operation.
 ///
-/// Only works on RecurKind::And, Or, Add, SMin, SMax, UMin, UMax, FAdd.
+/// The value must be a vector.
 ///
-/// @param[in] B IRBuilder to build any new instructions created
-/// @param[in] Val The value to sanitize
-/// @param[in] VL The vector length
-/// @param[in] Kind The kind of reduction to sanitize for
-llvm::Value *sanitizeVPReductionInput(llvm::IRBuilder<> &B, llvm::Value *Val,
-                                      llvm::Value *VL, llvm::RecurKind Kind);
+/// If VL is passed and is non-null, it is assumed to be the i32 value
+/// representing the active vector length. The reduction will be
+/// vector-predicated according to this length.
+///
+/// Only works on RecurKind::And, Or, Xor, Add, Mul, FAdd, FMul, {S,U,F}Min,
+/// {S,U,F}Max.
+llvm::Value *createMaybeVPTargetReduction(llvm::IRBuilderBase &B,
+                                          const llvm::TargetTransformInfo &TTI,
+                                          llvm::Value *Val,
+                                          llvm::RecurKind Kind,
+                                          llvm::Value *VL = nullptr);
 
 /// @brief Utility function to obtain an indices vector to be used in a gather
 /// operation.
@@ -119,7 +123,7 @@ llvm::Value *getGatherIndicesVector(llvm::IRBuilder<> &B, llvm::Value *Indices,
                                     const llvm::Twine &N = "");
 
 /// @brief Returns a boolean vector with all elements set to 'true'.
-llvm::Value *createAllTrueMask(llvm::IRBuilder<> &B, llvm::ElementCount EC);
+llvm::Value *createAllTrueMask(llvm::IRBuilderBase &B, llvm::ElementCount EC);
 
 /// @brief Returns an integer step vector, representing the sequence 0 ... N-1.
 llvm::Value *createIndexSequence(llvm::IRBuilder<> &Builder,
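Reviewer note: a minimal usage sketch of the new helper follows. It is illustrative only; the wrapper function and variable names are placeholders rather than code from this patch, and it assumes the vecz packetization headers above are available.

    // Hypothetical call site: reduce the lanes of `Vec` with an integer add.
    // When `VL` is a non-null i32 value the helper emits @llvm.vp.reduce.add.*
    // limited to the first VL lanes; when it is nullptr it falls back to an
    // ordinary @llvm.vector.reduce.add.* via createSimpleTargetReduction.
    llvm::Value *reduceAddLanes(llvm::IRBuilderBase &B,
                                const llvm::TargetTransformInfo &TTI,
                                llvm::Value *Vec,
                                llvm::Value *VL /* i32 or nullptr */) {
      return createMaybeVPTargetReduction(B, TTI, Vec, llvm::RecurKind::Add, VL);
    }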
diff --git a/modules/compiler/vecz/source/transform/packetization_helpers.cpp b/modules/compiler/vecz/source/transform/packetization_helpers.cpp
index a76651a64..9c5c5c853 100644
--- a/modules/compiler/vecz/source/transform/packetization_helpers.cpp
+++ b/modules/compiler/vecz/source/transform/packetization_helpers.cpp
@@ -27,6 +27,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 
@@ -247,16 +249,70 @@ bool createSubSplats(const vecz::TargetInfo &TI, IRBuilder<> &B,
   return true;
 }
 
-Value *sanitizeVPReductionInput(IRBuilder<> &B, Value *Val, Value *VL,
-                                RecurKind Kind) {
-  Type *const ValTy = Val->getType();
-  ElementCount const EC = multi_llvm::getVectorElementCount(ValTy);
-  Value *const VLSplat = B.CreateVectorSplat(EC, VL);
-  Value *const IdxVec =
-      createIndexSequence(B, VectorType::get(VL->getType(), EC));
-  Value *const ActiveMask = B.CreateICmp(CmpInst::ICMP_ULT, IdxVec, VLSplat);
-  auto *const NeutralVal = compiler::utils::getNeutralVal(Kind, ValTy);
-  return B.CreateSelect(ActiveMask, Val, NeutralVal);
+Value *createMaybeVPTargetReduction(IRBuilderBase &B,
+                                    const TargetTransformInfo &TTI, Value *Val,
+                                    RecurKind Kind, Value *VL) {
+  assert(isa<VectorType>(Val->getType()) && "Must be vector type");
+  // If VL is null, it's not a vector-predicated reduction.
+  if (!VL) {
+    return createSimpleTargetReduction(B, &TTI, Val, Kind);
+  }
+  auto IntrinsicOp = Intrinsic::not_intrinsic;
+  switch (Kind) {
+    default:
+      break;
+    case RecurKind::None:
+      return nullptr;
+    case RecurKind::Add:
+      IntrinsicOp = Intrinsic::vp_reduce_add;
+      break;
+    case RecurKind::Mul:
+      IntrinsicOp = Intrinsic::vp_reduce_mul;
+      break;
+    case RecurKind::Or:
+      IntrinsicOp = Intrinsic::vp_reduce_or;
+      break;
+    case RecurKind::And:
+      IntrinsicOp = Intrinsic::vp_reduce_and;
+      break;
+    case RecurKind::Xor:
+      IntrinsicOp = Intrinsic::vp_reduce_xor;
+      break;
+    case RecurKind::FAdd:
+      IntrinsicOp = Intrinsic::vp_reduce_fadd;
+      break;
+    case RecurKind::FMul:
+      IntrinsicOp = Intrinsic::vp_reduce_fmul;
+      break;
+    case RecurKind::SMin:
+      IntrinsicOp = Intrinsic::vp_reduce_smin;
+      break;
+    case RecurKind::SMax:
+      IntrinsicOp = Intrinsic::vp_reduce_smax;
+      break;
+    case RecurKind::UMin:
+      IntrinsicOp = Intrinsic::vp_reduce_umin;
+      break;
+    case RecurKind::UMax:
+      IntrinsicOp = Intrinsic::vp_reduce_umax;
+      break;
+    case RecurKind::FMin:
+      IntrinsicOp = Intrinsic::vp_reduce_fmin;
+      break;
+    case RecurKind::FMax:
+      IntrinsicOp = Intrinsic::vp_reduce_fmax;
+      break;
+  }
+
+  auto *const F = Intrinsic::getDeclaration(B.GetInsertBlock()->getModule(),
+                                            IntrinsicOp, Val->getType());
+  assert(F && "Could not declare vector-predicated reduction intrinsic");
+
+  auto *const VecTy = cast<VectorType>(Val->getType());
+  auto *const NeutralVal =
+      compiler::utils::getNeutralVal(Kind, VecTy->getElementType());
+  auto *const Mask = createAllTrueMask(B, VecTy->getElementCount());
+  return B.CreateCall(F, {NeutralVal, Val, Mask, VL});
 }
 
 Value *getGatherIndicesVector(IRBuilder<> &B, Value *Indices, Type *Ty,
@@ -272,7 +328,7 @@ Value *getGatherIndicesVector(IRBuilder<> &B, Value *Indices, Type *Ty,
   return B.CreateAdd(StepsMul, Indices, N);
 }
 
-Value *createAllTrueMask(IRBuilder<> &B, ElementCount EC) {
+Value *createAllTrueMask(IRBuilderBase &B, ElementCount EC) {
   return ConstantInt::getTrue(VectorType::get(B.getInt1Ty(), EC));
 }
 
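Why the explicit lane sanitization can be dropped: a vp.reduce intrinsic with an all-true mask only reads the first VL lanes and folds the scalar start value into the result, which is exactly what selecting the neutral value into the inactive lanes achieved before. A scalar model of the two lowerings for RecurKind::Add, four lanes wide (plain C++, illustrative only, not part of the patch):

    #include <array>
    #include <cstdint>

    // Old lowering: inactive lanes are overwritten with the neutral value
    // (0 for add) and the whole vector is then reduced unconditionally.
    int32_t sanitizedReduceAdd(const std::array<int32_t, 4> &v, uint32_t vl) {
      int32_t sum = 0;
      for (uint32_t i = 0; i < 4; ++i) sum += (i < vl) ? v[i] : 0;
      return sum;
    }

    // New lowering: llvm.vp.reduce.add(start=0, v, all-true mask, vl) only
    // accumulates lanes below VL, so no pre-sanitization is needed.
    int32_t vpReduceAdd(const std::array<int32_t, 4> &v, uint32_t vl) {
      int32_t sum = 0;
      for (uint32_t i = 0; i < vl; ++i) sum += v[i];
      return sum;  // equals sanitizedReduceAdd(v, vl) for any vl <= 4
    }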
diff --git a/modules/compiler/vecz/source/transform/packetizer.cpp b/modules/compiler/vecz/source/transform/packetizer.cpp
index 7600f96e6..f1deff2a2 100644
--- a/modules/compiler/vecz/source/transform/packetizer.cpp
+++ b/modules/compiler/vecz/source/transform/packetizer.cpp
@@ -839,12 +839,7 @@ Value *Packetizer::Impl::reduceBranchCond(Value *cond, Instruction *terminator,
   // value.
   Value *&f = conds.front();
 
-  if (VL) {
-    f = sanitizeVPReductionInput(B, f, VL, kind);
-    VECZ_FAIL_IF(!f);
-  }
-
-  return createSimpleTargetReduction(B, &TTI, f, kind);
+  return createMaybeVPTargetReduction(B, TTI, f, kind, VL);
 }
 
 Packetizer::Result Packetizer::Impl::assign(Value *Scalar, Value *Vectorized) {
@@ -899,14 +894,7 @@ Packetizer::Result Packetizer::Impl::packetize(Value *V) {
     if (newCond->getType()->isVectorTy()) {
       IRBuilder<> B(Branch);
       RecurKind kind = RecurKind::Or;
-      // Sanitize VP reduction inputs, if required.
-      if (VL) {
-        newCond = sanitizeVPReductionInput(B, newCond, VL, kind);
-        if (!newCond) {
-          return Packetizer::Result(*this);
-        }
-      }
-      newCond = createSimpleTargetReduction(B, &TTI, newCond, kind);
+      newCond = createMaybeVPTargetReduction(B, TTI, newCond, kind, VL);
     }
 
     Branch->setCondition(newCond);
@@ -1183,19 +1171,8 @@ Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
   // them of ordering? See CA-3969.
   op.getPacketValues(packetWidth, opPackets);
 
-  // When in VP mode, pre-sanitize the reduction input (before VP reduction
-  // intrinsics, introduced in LLVM 14)
-  if (VL) {
-    assert(opPackets.size() == 1 &&
-           "Should have bailed if dealing with more than one packet");
-    Value *&val = opPackets.front();
-    val = sanitizeVPReductionInput(B, val, VL, Info->Recurrence);
-    if (!val) {
-      emitVeczRemarkMissed(
-          &F, CI, "Can not vector-predicate workgroup/subgroup reduction");
-      return nullptr;
-    }
-  }
+  assert((!VL || packetWidth == 1) &&
+         "Should have bailed if dealing with more than one VP packet");
 
   // According to the OpenCL Spec, we are allowed to rearrange the operation
   // order of a workgroup/subgroup reduction any way we like (even though
@@ -1216,8 +1193,8 @@ Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
   }
 
   // Reduce to a scalar.
-  Value *v =
-      createSimpleTargetReduction(B, &TTI, opPackets.front(), Info->Recurrence);
+  Value *v = createMaybeVPTargetReduction(B, TTI, opPackets.front(),
+                                          Info->Recurrence, VL);
 
   // We leave the original reduction function and divert the vectorized
   // reduction through it, giving us a reduction over the full apparent
@@ -1624,14 +1601,8 @@ Value *Packetizer::Impl::packetizeMaskVarying(Instruction *I) {
     auto *maskInst = dyn_cast<Instruction>(vecMask);
     IRBuilder<> B(maskInst ? buildAfter(maskInst, F) : I);
 
-    // Sanitize any vector-predicated inputs.
-    if (VL) {
-      vecMask = sanitizeVPReductionInput(B, vecMask, VL, RecurKind::Or);
-      VECZ_FAIL_IF(!vecMask);
-    }
-
     Value *anyOfMask =
-        createSimpleTargetReduction(B, &TTI, vecMask, RecurKind::Or);
+        createMaybeVPTargetReduction(B, TTI, vecMask, RecurKind::Or, VL);
     anyOfMask->setName("any_of_mask");
 
     if (isVector) {
@@ -2072,13 +2043,8 @@ ValuePacket Packetizer::Impl::packetizeGroupScan(
   // Thus we essentially keep the original group scan, but change it to be an
   // exclusive one.
   auto *Reduction = Ops.front();
-  if (VL) {
-    Reduction = sanitizeVPReductionInput(B, Reduction, VL, Scan.Recurrence);
-    if (!Reduction) {
-      return results;
-    }
-  }
-  Reduction = createSimpleTargetReduction(B, &TTI, Reduction, Scan.Recurrence);
+  Reduction =
+      createMaybeVPTargetReduction(B, TTI, Reduction, Scan.Recurrence, VL);
 
   // Now we defer to an *exclusive* scan over the group.
   auto ExclScan = Scan;
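All of the packetizer.cpp call sites above follow the same mechanical substitution; condensed, the pattern is (sketch only, identifiers abbreviated from the surrounding code):

    // Before (pattern removed by this patch): callers neutralized inactive
    // lanes themselves when vector-predicating, then emitted an ordinary
    // reduction:
    //   if (VL) {
    //     Val = sanitizeVPReductionInput(B, Val, VL, Kind);
    //     VECZ_FAIL_IF(!Val);
    //   }
    //   Value *Red = createSimpleTargetReduction(B, &TTI, Val, Kind);
    // After: a single call; VL may be nullptr, in which case a plain
    // (non-predicated) vector reduction is built instead.
    Value *Red = createMaybeVPTargetReduction(B, TTI, Val, Kind, VL);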
diff --git a/modules/compiler/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll b/modules/compiler/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
index c8393af17..d23ac4380 100644
--- a/modules/compiler/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
+++ b/modules/compiler/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
@@ -43,9 +43,4 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: define spir_kernel void @__vecz_nxv2_vp_foo(ptr addrspace(1) nocapture readonly %a, ptr addrspace(1) nocapture %out)
 ; CHECK: [[CMP:%.*]] = fcmp oeq %{{.*}}, zeroinitializer
-; CHECK: [[INS:%.*]] = insertelement poison, i32 [[VL:%.*]], {{(i32|i64)}} 0
-; CHECK: [[SPLAT:%.*]] = shufflevector [[INS]], poison, zeroinitializer
-; CHECK: [[IDX:%.*]] = call @llvm.experimental.stepvector.nxv2i32()
-; CHECK: [[MASK:%.*]] = icmp ult [[IDX]], [[SPLAT]]
-; CHECK: [[INP:%.*]] = select [[MASK]], [[CMP]], zeroinitializer
-; CHECK: %{{.*}} = call i1 @llvm.vector.reduce.or.nxv2i1( [[INP]])
+; CHECK: %{{.*}} = call i1 @llvm.vp.reduce.or.nxv2i1(i1 false, [[CMP]], {{.*}}, i32 {{.*}})
diff --git a/modules/compiler/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll b/modules/compiler/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
index 4e7ba7db7..6fc5db369 100644
--- a/modules/compiler/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
+++ b/modules/compiler/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
@@ -39,12 +39,7 @@ if.end:
   ret void
 ; CHECK: define spir_kernel void @__vecz_nxv4_vp_mask_varying
 ; CHECK: [[CMP:%.*]] = icmp slt %{{.*}},
-; CHECK: [[INS:%.*]] = insertelement poison, i32 [[VL:%.*]], {{(i32|i64)}} 0
-; CHECK: [[SPLAT:%.*]] = shufflevector [[INS]], poison, zeroinitializer
-; CHECK: [[IDX:%.*]] = call @llvm.experimental.stepvector.nxv4i32()
-; CHECK: [[MASK:%.*]] = icmp ult [[IDX]], [[SPLAT]]
-; CHECK: [[INP:%.*]] = select [[MASK]], [[CMP]], zeroinitializer
-; CHECK: [[RED:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[INP]])
+; CHECK: [[RED:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, [[CMP]], {{.*}}, i32 {{.*}})
 ; CHECK: [[REINS:%.*]] = insertelement <4 x i1> poison, i1 [[RED]], {{(i32|i64)}} 0
 ; CHECK: [[RESPLAT:%.*]] = shufflevector <4 x i1> [[REINS]], <4 x i1> poison, <4 x i32> zeroinitializer
 }
diff --git a/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll b/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
index e4f95885c..9e35021ec 100644
--- a/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
+++ b/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
@@ -49,13 +49,8 @@ entry:
   store i32 %2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_all_i32(
-; CHECK: [[T2:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ult <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i1> , <4 x i1> [[T2]]
-; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[I]] to i4
-; CHECK: [[R:%.*]] = icmp eq i4 [[T3]], -1
+; CHECK: [[C:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
+; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.and.v4i1(i1 true, <4 x i1> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 [[R]])
 ; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32
 ; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4
@@ -75,13 +70,8 @@ entry:
   store i32 %2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_any_i32(
-; CHECK: [[T2:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i1> [[T2]], <4 x i1> zeroinitializer
-; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[I]] to i4
-; CHECK: [[R:%.*]] = icmp ne i4 [[T3]], 0
+; CHECK: [[C:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
+; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.or.v4i1(i1 false, <4 x i1> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 [[R]])
 ; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32
 ; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4
@@ -99,11 +89,8 @@ entry:
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_add_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.add.v4i32(i32 0, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -120,11 +107,8 @@ entry:
   store i64 %call2, i64 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_add_i64(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64> zeroinitializer
-; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1(
+; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.add.v4i64(i64 0, <4 x i64> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 [[R]])
 ; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -141,11 +125,8 @@ entry:
   store float %call2, float addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_add_f32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> [[I]])
+; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1(
+; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float [[R]])
 ; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -162,11 +143,8 @@ entry:
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_smin_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.smin.v4i32(i32 2147483647, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -183,11 +161,8 @@ entry:
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_umin_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32>
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.umin.v4i32(i32 -1, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -204,11 +179,8 @@ entry:
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_smax_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.smax.v4i32(i32 -2147483648, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -225,11 +197,8 @@ entry:
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_umax_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.umax.v4i32(i32 0, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -246,12 +215,9 @@ entry:
   store float %call2, float addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_fmin_f32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> [[I]])
-; CHEKC: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float [[R]])
+; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1(
+; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fmin.v4f32(float 0x7FF8000000000000, <4 x float> [[C]], {{.*}})
+; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float [[R]])
 ; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -267,11 +233,8 @@ entry:
   store float %call2, float addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_fmax_f32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> [[I]])
+; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1(
+; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fmax.v4f32(float 0xFFF8000000000000, <4 x float> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float [[R]])
 ; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 }
diff --git a/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
index 357d6bd0d..5ee579906 100644
--- a/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
+++ b/modules/compiler/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -35,11 +35,8 @@ declare spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1)
 declare spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1)
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32>
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.mul.v4i32(i32 1, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
@@ -56,11 +53,8 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_i64(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64>
-; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1(
+; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.mul.v4i64(i64 1, <4 x i64> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 [[R]])
 ; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
@@ -77,11 +71,8 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_f32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float>
-; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[I]])
+; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1(
+; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float [[R]])
 ; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
@@ -98,11 +89,8 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_and_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32>
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.and.v4i32(i32 -1, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
@@ -119,11 +107,8 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_or_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.or.v4i32(i32 0, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_or_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
@@ -139,31 +124,28 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @__vecz_v4_vp_reduce_xor_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]],
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64> zeroinitializer
-; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[I]])
+; CHECK-LABEL: @__vecz_v4_vp_reduce_xor_i64(
+; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1(
+; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.xor.v4i64(i64 0, <4 x i64> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 [[R]])
-; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4
-define spir_kernel void @reduce_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 8
+define spir_kernel void @reduce_xor_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
-  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
-  %0 = load i64, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call
+  %0 = load i64, ptr addrspace(1) %arrayidx, align 8
   %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 %0)
   %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv
-  store i64 %call2, ptr addrspace(1) %arrayidx3, align 4
+  store i64 %call2, ptr addrspace(1) %arrayidx3, align 8
   ret void
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_and(
-; This doesn't generate a reduction intrinsic...
-; CHECK: [[T:%.*]] = icmp eq i4 {{%.*}}, -1
-; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 [[T]])
+; CHECK: [[T:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
+; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.and.v4i1(i1 true, <4 x i1> [[T]], {{.*}})
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 [[R]])
 ; CHECK: [[R:%.*]] = zext i1 %call2 to i32
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) {
@@ -182,8 +164,9 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_or(
-; CHECK: [[T:%.*]] = icmp ne i4 {{%.*}}, 0
-; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 [[T]])
+; CHECK: [[T:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
+; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.or.v4i1(i1 false, <4 x i1> [[T]], {{.*}})
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 [[R]])
 ; CHECK: [[R:%.*]] = zext i1 %call2 to i32
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) {
@@ -202,10 +185,9 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_xor(
-; CHECK: [[X:%.*]] = call i4 @llvm.ctpop.i4(i4 {{%.*}})
-; CHECK: [[T:%.*]] = and i4 [[X]], 1
-; CHECK: [[C:%.*]] = icmp ne i4 [[T]], 0
-; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[C]])
+; CHECK: [[T:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
+; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.xor.v4i1(i1 false, <4 x i1> [[T]], {{.*}})
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[R]])
 ; CHECK: [[R:%.*]] = zext i1 %call2 to i32
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {