From 2715ea1d066cf1269ca07841d4e942c227cd1a4c Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 20 Dec 2023 08:52:53 +0000 Subject: [PATCH] [compiler] Handle scalable structs as barrier live variables The work-item loops pass would crash when faced with a live variable whose type was a struct containing scalable vectors. We aren't legally allowed to store a struct type containing a mixture of scalable and fixed types (`{ <vscale x 1 x i16>, i8 }`) so we decompose such types into their constituent elements and store each individually. Note that scalable elements are stored in the scalable part of the live variables struct, and fixed elements are stored in the fixed part; in that way they are treated as if they were never a struct to begin with. Note that there may be a future optimization possible here where we store all-scalable structs "whole", but this isn't currently a priority. Note that this problem doesn't currently surface in the default pipeline in the main branch, because we only end up with scalable vectors when we vectorize as such, and we don't currently scalably vectorize any IR that's known to contain struct types, at least not in a way that creates a struct containing scalable vectors; see the new negative scalable vecz test as an example. The plan is to start allowing this when we improve the vectorization of `cmpxchg` instructions. This should also improve the codegen for these structures; see the new fixed-length vecz test for an example of the poor codegen currently emitted. 
--- .../barriers-live-vars-literal-structs.ll | 119 +++++++++++++ .../include/compiler/utils/barrier_regions.h | 32 +++- .../compiler/utils/source/barrier_regions.cpp | 145 +++++++++++----- .../utils/source/work_item_loops_pass.cpp | 39 ++--- .../test/lit/llvm/ScalableVectors/cmpxchg.ll | 66 +++++++ .../ScalableVectors/store_literal_struct.ll | 38 +++++ .../compiler/vecz/test/lit/llvm/cmpxchg.ll | 161 ++++++++++++++++++ 7 files changed, 532 insertions(+), 68 deletions(-) create mode 100644 modules/compiler/test/lit/passes/barriers-live-vars-literal-structs.ll create mode 100644 modules/compiler/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll create mode 100644 modules/compiler/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll create mode 100644 modules/compiler/vecz/test/lit/llvm/cmpxchg.ll diff --git a/modules/compiler/test/lit/passes/barriers-live-vars-literal-structs.ll b/modules/compiler/test/lit/passes/barriers-live-vars-literal-structs.ll new file mode 100644 index 000000000..1fcf4fff7 --- /dev/null +++ b/modules/compiler/test/lit/passes/barriers-live-vars-literal-structs.ll @@ -0,0 +1,119 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Run the verifier first, as working with scalable vectors in literal structs +; can be fraught with illegality (as discovered when writing this test). 
+; RUN: muxc --passes verify,work-item-loops,verify -S %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK: %foo_live_mem_info = type { i8, [7 x i8], { <4 x i8>, i32, <8 x i1> }, [12 x i8], [0 x i8] } + +; Check that we can successfully save/reload fixed and scalable struct literals +; across barriers. Scalable struct literals must be decomposed as it's invalid +; to store them whole - they're "unsized". This might change in future versions +; of LLVM. +; CHECK: @foo.mux-barrier-region(ptr [[D:%.*]], ptr [[A:%.*]], ptr [[MEM:%.*]]) +; CHECK: [[FIXED_ADDR:%.*]] = getelementptr inbounds %foo_live_mem_info, ptr [[MEM]], i32 0, i32 2 +; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() +; CHECK: [[NXV1I16_OFFS:%.*]] = mul i64 [[VSCALE]], 32 +; CHECK: [[NXV1I16_ADDR:%.*]] = getelementptr inbounds %foo_live_mem_info, ptr [[MEM]], i32 0, i32 4, i64 [[NXV1I16_OFFS]] +; CHECK: [[NXV8I32_ADDR:%.*]] = getelementptr inbounds %foo_live_mem_info, ptr [[MEM]], i32 0, i32 4, i32 0 +; CHECK: [[I8_ADDR:%.*]] = getelementptr inbounds %foo_live_mem_info, ptr [[MEM]], i32 0, i32 0 +; CHECK: [[IDX:%.*]] = tail call i64 @__mux_get_global_id(i32 0) +; We can store the fixed struct whole +; CHECK: [[FIXED_STRUCT:%.*]] = call { <4 x i8>, i32, <8 x i1> } @ext_fixed_vec() +; CHECK: store { <4 x i8>, i32, <8 x i1> } [[FIXED_STRUCT]], ptr [[FIXED_ADDR]], align 4 +; We must break down the scalable struct into pieces +; CHECK: [[SCALABLE_STRUCT:%.*]] = call { , , i8 } @ext_scalable_vec() +; CHECK: [[EXT0:%.*]] = extractvalue { , , i8 } [[SCALABLE_STRUCT]], 0 +; CHECK: store [[EXT0]], ptr [[NXV1I16_ADDR:%.*]], align 2 +; CHECK: [[EXT1:%.*]] = extractvalue { , , i8 } [[SCALABLE_STRUCT]], 1 +; CHECK: store [[EXT1]], ptr [[NXV8I32_ADDR]], align 32 +; CHECK: [[EXT2:%.*]] = extractvalue { , , i8 } [[SCALABLE_STRUCT]], 2 +; CHECK: store i8 [[EXT2]], ptr [[I8_ADDR]], align 1 +; CHECK: ret i32 2 + + +; CHECK: 
@foo.mux-barrier-region.1(ptr [[D:%.*]], ptr [[A:%.*]], ptr [[MEM:%.*]]) +; CHECK: [[FIXED_ADDR:%.*]] = getelementptr inbounds %foo_live_mem_info, ptr [[MEM]], i32 0, i32 2 +; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() +; CHECK: [[NXV1I16_OFFS:%.*]] = mul i64 [[VSCALE]], 32 +; CHECK: [[NXV1I16_ADDR:%.*]] = getelementptr inbounds %foo_live_mem_info, ptr [[MEM]], i32 0, i32 4, i64 [[NXV1I16_OFFS]] +; CHECK: [[NXV8I32_ADDR:%.*]] = getelementptr inbounds %foo_live_mem_info, ptr [[MEM]], i32 0, i32 4, i32 0 +; CHECK: [[I8_ADDR:%.*]] = getelementptr inbounds %foo_live_mem_info, ptr [[MEM]], i32 0, i32 0 +; CHECK: [[IDX:%.*]] = {{(tail )?}}call i64 @__mux_get_global_id(i32 0) +; We can reload the fixed struct directly +; CHECK: [[FIXED_LD:%.*]] = load { <4 x i8>, i32, <8 x i1> }, ptr [[FIXED_ADDR]], align 4 +; Load and insert the first scalable element +; CHECK: [[NXV1I16_LD:%.*]] = load , ptr [[NXV1I16_ADDR]], align 2 +; CHECK: [[INS0:%.*]] = insertvalue { , , i8 } poison, [[NXV1I16_LD]], 0 +; Load and insert the second scalable element +; CHECK: [[NXV8I32_LD:%.*]] = load , ptr [[NXV8I32_ADDR]], align 32 +; CHECK: [[INS1:%.*]] = insertvalue { , , i8 } [[INS0]], [[NXV8I32_LD]], 1 +; Load and insert the third and last scalable element +; CHECK: [[I8_LD:%.*]] = load i8, ptr [[I8_ADDR:%.*]], align 1 +; CHECK: [[SCALABLE_LD:%.*]] = insertvalue { , , i8 } [[INS1]], i8 [[I8_LD]], 2 + +; All the original code from after the barrier +; CHECK: %arrayidx1 = getelementptr inbounds i8, ptr %0, i64 [[IDX]] +; CHECK: store { <4 x i8>, i32, <8 x i1> } [[FIXED_LD]], ptr %arrayidx1, align 4 +; CHECK: %elt0 = extractvalue { , , i8 } [[SCALABLE_LD]], 0 +; CHECK: %elt1 = extractvalue { , , i8 } [[SCALABLE_LD]], 1 +; CHECK: %elt2 = extractvalue { , , i8 } [[SCALABLE_LD]], 2 +; CHECK: %arrayidx2 = getelementptr inbounds i8, ptr [[A]], i64 [[IDX]] +; CHECK: store %elt0, ptr %arrayidx2, align 1 +; CHECK: store %elt1, ptr %arrayidx2, align 4 +; CHECK: store i8 %elt2, ptr %arrayidx2, 
align 1 + +; CHECK: define void @foo.mux-barrier-wrapper(ptr %d, ptr %a) +; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() +; CHECK: [[SCALABLE_SIZE:%.*]] = mul i64 [[VSCALE:%.*]], 64 +; CHECK: [[PER_WI_SIZE:%.*]] = add i64 [[SCALABLE_SIZE]], 32 +; CHECK: [[TOTAL_WG_SIZE:%.*]] = mul i64 [[PER_WI_SIZE]], {{%.*}} +; CHECK: %live_variables = alloca i8, i64 [[TOTAL_WG_SIZE]], align 32 +define internal void @foo(ptr %d, ptr %a) #0 { +entry: + %idx = tail call i64 @__mux_get_global_id(i32 0) + %fixed.struct.literal = call { <4 x i8>, i32, <8 x i1> } @ext_fixed_vec() + %scalable.struct.literal = call { , , i8 } @ext_scalable_vec() + + tail call void @__mux_work_group_barrier(i32 0, i32 1, i32 272) + + ; Do something with the value on the other side of the barrier. + %arrayidx1 = getelementptr inbounds i8, ptr %d, i64 %idx + store { <4 x i8>, i32, <8 x i1> } %fixed.struct.literal, ptr %arrayidx1, align 4 + + ; We can't store "unsized types", so manually extract values and store those. + %elt0 = extractvalue { , , i8 } %scalable.struct.literal, 0 + %elt1 = extractvalue { , , i8 } %scalable.struct.literal, 1 + %elt2 = extractvalue { , , i8 } %scalable.struct.literal, 2 + + %arrayidx2 = getelementptr inbounds i8, ptr %a, i64 %idx + store %elt0, ptr %arrayidx2, align 1 + store %elt1, ptr %arrayidx2, align 4 + store i8 %elt2, ptr %arrayidx2, align 1 + + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare void @__mux_work_group_barrier(i32, i32, i32) +declare { <4 x i8>, i32, <8 x i1> } @ext_fixed_vec() +declare { , , i8 } @ext_scalable_vec() + +attributes #0 = { "mux-kernel"="entry-point" } diff --git a/modules/compiler/utils/include/compiler/utils/barrier_regions.h b/modules/compiler/utils/include/compiler/utils/barrier_regions.h index 362a5051b..b85decf37 100644 --- a/modules/compiler/utils/include/compiler/utils/barrier_regions.h +++ b/modules/compiler/utils/include/compiler/utils/barrier_regions.h @@ -161,7 +161,10 @@ class Barrier { /// @brief struct to help 
retrieval of values from the barrier struct struct LiveValuesHelper { Barrier const &barrier; - llvm::DenseMap live_GEPs; + /// @brief A cache of queried live-values addresses (inside the live + /// variables struct), stored by the pair (value, member_idx). + llvm::DenseMap, llvm::Value *> + live_GEPs; llvm::DenseMap reloads; llvm::IRBuilder<> gepBuilder; llvm::Value *barrier_struct = nullptr; @@ -173,9 +176,19 @@ class Barrier { LiveValuesHelper(Barrier const &b, llvm::BasicBlock *bb, llvm::Value *s) : barrier(b), gepBuilder(bb), barrier_struct(s) {} - /// @brief get a GEP instruction pointing to the given value in the barrier - /// struct. - llvm::Value *getGEP(const llvm::Value *live); + /// @brief Return a GEP instruction pointing to the given value/idx pair in + /// the barrier struct. + /// + /// @return The GEP corresponding to the address of the value in the + /// struct, or nullptr if the value could not be found in the struct. + llvm::Value *getGEP(const llvm::Value *live, unsigned member_idx = 0); + + /// @brief Return a GEP instruction corresponding to the address of + /// the given ExtractValueInst in the barriers struct. + /// + /// @return The GEP corresponding to the address of the value in the + /// struct, or nullptr if the value is not an ExtractValueInst. + llvm::Value *getExtractValueGEP(const llvm::Value *live); /// @brief get a value reloaded from the barrier struct. 
 /// @@ -194,10 +207,13 @@ class Barrier { std::pair, llvm::DenseSet>; /// @brief Type for memory allocation of live variables at all of barriers using live_variable_mem_t = OrderedSet; /// @brief Type for index of live variabless on live variable inforamtion - using live_variable_index_map_t = llvm::DenseMap<const llvm::Value *, unsigned>; - /// @brief Type for index of live variabless on live variable inforamtion - using live_variable_scalables_map_t = llvm::DenseMap<const llvm::Value *, unsigned>; + /// @brief Type for index of live variables on live variable information + /// Indexed by the pair (value, member_idx) + using live_variable_index_map_t = + llvm::DenseMap<std::pair<const llvm::Value *, unsigned>, unsigned>; + /// @brief Type for index of live variables on live variable information + /// Indexed by the pair (value, member_idx) + using live_variable_scalables_map_t = live_variable_index_map_t; /// @brief Type for ids of barriers using barrier_id_map_t = llvm::DenseMap; /// @brief Type for ids of new kernel functions diff --git a/modules/compiler/utils/source/barrier_regions.cpp b/modules/compiler/utils/source/barrier_regions.cpp index 1b6c13429..3ed5ad5fc 100644 --- a/modules/compiler/utils/source/barrier_regions.cpp +++ b/modules/compiler/utils/source/barrier_regions.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -279,11 +280,33 @@ void UpdateAndTrimPHINodeEdges(BasicBlock *BB, ValueToValueMapTy &vmap) { } } +/// @brief Returns true if the type is a struct type containing any scalable +/// vectors in its list of elements +bool isStructWithScalables(Type *ty) { + if (auto *const struct_ty = dyn_cast<StructType>(ty)) { + return any_of(struct_ty->elements(), + [](Type *ty) { return isa<ScalableVectorType>(ty); }); + } + return false; +} + } // namespace -Value *compiler::utils::Barrier::LiveValuesHelper::getGEP(const Value *live) { - auto gep_it = live_GEPs.find(live); - if (gep_it != live_GEPs.end()) { +Value *compiler::utils::Barrier::LiveValuesHelper::getExtractValueGEP( + const Value *live) { + if (auto *const extract = dyn_cast<ExtractValueInst>(live)) { + 
// We can't handle extracts with multiple indices + if (extract->getIndices().size() == 1) { + return getGEP(extract->getAggregateOperand(), extract->getIndices()[0]); + } + } + return nullptr; +} + +Value *compiler::utils::Barrier::LiveValuesHelper::getGEP(const Value *live, + unsigned member_idx) { + auto key = std::make_pair(live, member_idx); + if (auto gep_it = live_GEPs.find(key); gep_it != live_GEPs.end()) { return gep_it->second; } @@ -293,12 +316,8 @@ Value *compiler::utils::Barrier::LiveValuesHelper::getGEP(const Value *live) { data_ty = AI->getAllocatedType(); } - if (!isa(data_ty)) { - auto field_it = barrier.live_variable_index_map_.find(live); - if (field_it == barrier.live_variable_index_map_.end()) { - return nullptr; - } - + if (auto field_it = barrier.live_variable_index_map_.find(key); + field_it != barrier.live_variable_index_map_.end()) { LLVMContext &context = barrier.module_.getContext(); unsigned field_index = field_it->second; Value *live_variable_info_idxs[2] = { @@ -308,11 +327,8 @@ Value *compiler::utils::Barrier::LiveValuesHelper::getGEP(const Value *live) { gep = gepBuilder.CreateInBoundsGEP(barrier.live_var_mem_ty_, barrier_struct, live_variable_info_idxs, Twine("live_gep_") + live->getName()); - } else { - auto field_it = barrier.live_variable_scalables_map_.find(live); - if (field_it == barrier.live_variable_scalables_map_.end()) { - return nullptr; - } + } else if (auto field_it = barrier.live_variable_scalables_map_.find(key); + field_it != barrier.live_variable_scalables_map_.end()) { unsigned const field_offset = field_it->second; Value *scaled_offset = nullptr; @@ -346,9 +362,15 @@ Value *compiler::utils::Barrier::LiveValuesHelper::getGEP(const Value *live) { PointerType::get( data_ty, cast(barrier_struct->getType())->getAddressSpace())); + } else { + // Fall back and see if this live variable is actually a decomposed + // structure type. 
+ return getExtractValueGEP(live); } - live_GEPs.insert(std::make_pair(live, gep)); + // Cache this GEP for later + live_GEPs[key] = gep; + return gep; } @@ -364,7 +386,22 @@ Value *compiler::utils::Barrier::LiveValuesHelper::getReload(Value *live, if (Value *v = getGEP(live)) { if (!isa<AllocaInst>(live)) { // If live variable is not allocainst, insert load. - v = ir.CreateLoad(live->getType(), v, Twine(live->getName(), name)); + if (!isStructWithScalables(live->getType())) { + v = ir.CreateLoad(live->getType(), v, Twine(live->getName(), name)); + } else { + auto *const struct_ty = cast<StructType>(live->getType()); + // Start off with a poison value, and build the struct up member by + // member, reloading each member at a time from their respective + // offsets. + v = PoisonValue::get(struct_ty); + for (auto [idx, ty] : enumerate(struct_ty->elements())) { + auto *const elt_addr = getGEP(live, idx); + assert(elt_addr && "Could not get address of struct element"); + auto *const reload = + ir.CreateLoad(ty, elt_addr, Twine(live->getName(), name)); + v = ir.CreateInsertValue(v, reload, idx); + } + } } mapped = v; return v; @@ -855,47 +892,64 @@ void compiler::utils::Barrier::MakeLiveVariableMemType() { const auto &dl = module_.getDataLayout(); struct member_info { + /// @brief The root `value` being stored. Value *value; + /// @brief The member index of this member inside `value`, if `value` is a + /// decomposed structure type. Zero otherwise. + unsigned member_idx; + /// @brief The type of `value`, or of the specific member of `value`. 
Type *type; + /// @brief The alignment of the value being stored unsigned alignment; + /// @brief The size of the value being stored unsigned size; }; SmallVector barrier_members; barrier_members.reserve(whole_live_variables_set_.size()); - for (Value *i : whole_live_variables_set_) { - LLVM_DEBUG(dbgs() << "whole live set:" << *i << '\n'; - dbgs() << "type:" << *(i->getType()) << '\n';); - Type *field_ty = i->getType(); + for (Value *live_var : whole_live_variables_set_) { + LLVM_DEBUG(dbgs() << "whole live set:" << *live_var << '\n'; + dbgs() << "type:" << *(live_var->getType()) << '\n';); + Type *field_ty = live_var->getType(); Type *member_ty = nullptr; unsigned alignment = 0; // If allocainst is live variable, get element type of pointer type // from field_ty and remember alignment - if (const auto *AI = dyn_cast(i)) { + if (const auto *AI = dyn_cast(live_var)) { member_ty = AI->getAllocatedType(); alignment = AI->getAlign().value(); } else { member_ty = field_ty; } - // For a scalable vector, we need the size of the equivalent fixed vector - // based on its known minimum size. - auto member_ty_fixed = member_ty; - if (isa(member_ty)) { - auto *const eltTy = multi_llvm::getVectorElementType(member_ty); - auto n = multi_llvm::getVectorElementCount(member_ty).getKnownMinValue(); - member_ty_fixed = VectorType::get(eltTy, ElementCount::getFixed(n)); + std::vector member_tys = {member_ty}; + // If this is a struct type containing any scalable members, we must + // decompose the value into its individual components. + if (isStructWithScalables(member_ty)) { + member_tys = cast(member_ty)->elements().vec(); } - // Need to ensure that alloc alignment or preferred alignment is kept - // in the new struct so pad as necessary. 
- unsigned size = dl.getTypeAllocSize(member_ty_fixed); - alignment = std::max(dl.getPrefTypeAlign(member_ty).value(), - static_cast(alignment)); - max_live_var_alignment = std::max(alignment, max_live_var_alignment); + for (auto [idx, ty] : enumerate(member_tys)) { + // For a scalable vector, we need the size of the equivalent fixed vector + // based on its known minimum size. + auto member_ty_fixed = ty; + if (isa(ty)) { + auto *const eltTy = multi_llvm::getVectorElementType(ty); + auto n = multi_llvm::getVectorElementCount(ty).getKnownMinValue(); + member_ty_fixed = VectorType::get(eltTy, ElementCount::getFixed(n)); + } - barrier_members.push_back({i, member_ty, alignment, size}); + // Need to ensure that alloc alignment or preferred alignment is kept + // in the new struct so pad as necessary. + unsigned size = dl.getTypeAllocSize(member_ty_fixed); + alignment = std::max(dl.getPrefTypeAlign(ty).value(), + static_cast(alignment)); + max_live_var_alignment = std::max(alignment, max_live_var_alignment); + + barrier_members.push_back( + {live_var, static_cast(idx), ty, alignment, size}); + } } // sort the barrier members by decreasing alignment to minimise the amount @@ -930,7 +984,8 @@ void compiler::utils::Barrier::MakeLiveVariableMemType() { } } offset += member.size; - live_variable_index_map_[member.value] = field_tys.size(); + live_variable_index_map_[std::make_pair(member.value, member.member_idx)] = + field_tys.size(); field_tys.push_back(member.type); } // Pad the end of the struct to the max alignment as we are creating an @@ -950,7 +1005,8 @@ void compiler::utils::Barrier::MakeLiveVariableMemType() { offset = PadTypeToAlignment(field_tys_scalable, offset, member.alignment); - live_variable_scalables_map_[member.value] = offset; + live_variable_scalables_map_[std::make_pair(member.value, + member.member_idx)] = offset; offset += member.size; field_tys_scalable.push_back(member.type); } @@ -1217,7 +1273,20 @@ Function 
*compiler::utils::Barrier::GenerateNewKernel(BarrierRegion &region) { while (isa(insert_point)) { insert_point = insert_point->getNextNonDebugInstruction(); } - new StoreInst(live_var, live_values.getGEP(live_var), insert_point); + IRBuilder<> B(insert_point); + if (!isStructWithScalables(live_var->getType())) { + auto *addr = live_values.getGEP(live_var); + B.CreateStore(live_var, addr); + } else { + // Store this struct containing scalable members piece-wise + auto member_tys = cast<StructType>(live_var->getType())->elements(); + for (auto [idx, ty] : enumerate(member_tys)) { + auto *extract = B.CreateExtractValue(live_var, idx); + auto *extract_addr = live_values.getGEP(extract); + assert(extract_addr); + B.CreateStore(extract, extract_addr); + } + } } } diff --git a/modules/compiler/utils/source/work_item_loops_pass.cpp b/modules/compiler/utils/source/work_item_loops_pass.cpp index 3806592f8..d7cb5b35a 100644 --- a/modules/compiler/utils/source/work_item_loops_pass.cpp +++ b/modules/compiler/utils/source/work_item_loops_pass.cpp @@ -161,15 +161,10 @@ Instruction *IRPrintf(const std::string format, Module &module, Value *v, Value *materializeVF(IRBuilder<> &builder, compiler::utils::VectorizationFactor vf) { - auto sizeTyBytes = - compiler::utils::getSizeTypeBytes(*builder.GetInsertBlock()->getModule()); - Value *multiple = builder.getIntN(8 * sizeTyBytes, vf.getKnownMin()); - if (!vf.isScalable()) { - return multiple; - } - Type *size_type = builder.getIntNTy(sizeTyBytes * 8); - Value *vscale = builder.CreateIntrinsic(Intrinsic::vscale, size_type, {}); - return builder.CreateMul(vscale, multiple); + auto &m = *builder.GetInsertBlock()->getModule(); + Constant *multiple = + ConstantInt::get(compiler::utils::getSizeType(m), vf.getKnownMin()); + return !vf.isScalable() ? 
multiple : builder.CreateVScale(multiple); } struct ScheduleGenerator { @@ -1309,14 +1304,14 @@ struct ScheduleGenerator { // here corresponds to the current outermost to innermost vectorized // dimensions, rather than in their absolutist sense. void setUpLiveVarsAlloca(compiler::utils::BarrierWithLiveVars &barrier, - IRBuilder<> &B, Value *const VF, unsigned sizeTyBytes, - Value *const sizeZ, Value *const sizeY, + IRBuilder<> &B, Value *const sizeZ, Value *const sizeY, Value *const sizeX, StringRef name, bool isDebug) { barrier.setSize0(sizeX); Value *const live_var_size = B.CreateMul(sizeX, B.CreateMul(sizeY, sizeZ)); barrier.setTotalSize(live_var_size); - AllocaInst *live_var_mem_space; + auto &m = *B.GetInsertBlock()->getModule(); + auto *const size_ty = compiler::utils::getSizeType(m); auto const scalablesSize = barrier.getLiveVarMemSizeScalable(); if (scalablesSize == 0) { live_var_mem_space = @@ -1327,10 +1322,10 @@ void setUpLiveVarsAlloca(compiler::utils::BarrierWithLiveVars &barrier, } else { auto const fixedSize = barrier.getLiveVarMemSizeFixed(); // We ensure that the VFs are the same between the main and tail. 
- auto *const vscale = cast(VF)->getOperand(0); - auto *const structSize = B.CreateAdd( - B.CreateMul(vscale, B.getIntN(8 * sizeTyBytes, scalablesSize)), - B.getIntN(8 * sizeTyBytes, fixedSize)); + auto *const vscale = + B.CreateVScale(ConstantInt::get(size_ty, scalablesSize)); + auto *const structSize = + B.CreateAdd(vscale, ConstantInt::get(size_ty, fixedSize)); auto *const buffer_size = B.CreateMul(structSize, live_var_size); live_var_mem_space = B.CreateAlloca(B.getInt8Ty(), buffer_size, name); @@ -1502,9 +1497,9 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction( } Value *const size0 = entryIR.CreateUDiv(numerator, VF); - setUpLiveVarsAlloca(barrierMain, entryIR, VF, sizeTyBytes, - localSizeDim[workItemDim2], localSizeDim[workItemDim1], - size0, "live_variables", IsDebug); + setUpLiveVarsAlloca(barrierMain, entryIR, localSizeDim[workItemDim2], + localSizeDim[workItemDim1], size0, "live_variables", + IsDebug); } // Amazingly, it's possible for the tail kernel to have live vars in its @@ -1519,9 +1514,9 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction( "tail.has.vp"); size0 = entryIR.CreateZExt(hasLeftover, peel->getType()); } - setUpLiveVarsAlloca(*barrierTail, entryIR, VF, sizeTyBytes, - localSizeDim[workItemDim2], localSizeDim[workItemDim1], - size0, "live_variables_peel", IsDebug); + setUpLiveVarsAlloca(*barrierTail, entryIR, localSizeDim[workItemDim2], + localSizeDim[workItemDim1], size0, + "live_variables_peel", IsDebug); } // next means next barrier id. 
This variable is uninitialized to begin with, diff --git a/modules/compiler/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll b/modules/compiler/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll new file mode 100644 index 000000000..7558c2907 --- /dev/null +++ b/modules/compiler/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll @@ -0,0 +1,66 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-scalable -vecz-passes=packetizer,verify \ +; RUN: --pass-remarks-missed=vecz -S < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Note: we can't currently scalably packetize this kernel, due to the struct +; type. 
+; CHECK: Vecz: Could not packetize %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 +define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + + %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic + %val0 = extractvalue { i32, i1 } %old0, 0 + %success0 = extractvalue { i32, i1 } %old0, 1 + + %out = getelementptr i32, ptr %q, i64 %call + store i32 %val0, ptr %out, align 4 + + %outsuccess = getelementptr i8, ptr %r, i64 %call + %outbyte = zext i1 %success0 to i8 + store i8 %outbyte, ptr %outsuccess, align 1 + + ; Test a couple of insert/extract patterns + + ; Test inserting a uniform value into a varying literal struct + %testinsertconst = insertvalue { i32, i1 } %old0, i1 false, 1 + %testextract0 = extractvalue { i32, i1 } %testinsertconst, 1 + %outbyte0 = zext i1 %testextract0 to i8 + store i8 %outbyte0, ptr %outsuccess, align 1 + + ; Test inserting a varying value into a varying literal struct + %byte1 = load i8, ptr %outsuccess, align 1 + %bool1 = trunc i8 %byte1 to i1 + %testinsertvarying0 = insertvalue { i32, i1 } %old0, i1 %bool1, 1 + %testextract1 = extractvalue { i32, i1 } %testinsertvarying0, 1 + %outbyte1 = zext i1 %testextract1 to i8 + store i8 %outbyte1, ptr %outsuccess, align 1 + + ; Test inserting a varying value into a uniform literal struct + %testinsertvarying1 = insertvalue { i32, i1 } poison, i1 %bool1, 1 + %testextract2 = extractvalue { i32, i1 } %testinsertvarying1, 1 + %outbyte2 = zext i1 %testextract2 to i8 + store i8 %outbyte2, ptr %outsuccess, align 1 + + ret void +} + +declare i64 @__mux_get_global_id(i32) diff --git a/modules/compiler/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll b/modules/compiler/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll new file mode 100644 index 000000000..ad8599ba5 --- /dev/null +++ b/modules/compiler/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software 
Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check that we do something correct when scalably packetizing struct literals. +; Right now we fail to packetize, but if we could packetize this we'd have to +; be careful as storing a struct literal containing scalable vectors is invalid +; IR. +; RUN: veczc -w 4 -vecz-scalable -vecz-passes=verify,packetizer,verify \ +; RUN: --pass-remarks-missed=vecz -S < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: Vecz: Could not packetize %v = load { i32, i32 }, ptr %arrayidx.p, align 4 +define spir_kernel void @test_fn(ptr %p, ptr %q) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidx.p = getelementptr { i32, i32 }, ptr %p, i64 %idx + %v = load { i32, i32 }, ptr %arrayidx.p, align 4 + %arrayidx.q = getelementptr { i32, i32 }, ptr %q, i64 %idx + store { i32, i32 } %v, ptr %arrayidx.q, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) diff --git a/modules/compiler/vecz/test/lit/llvm/cmpxchg.ll b/modules/compiler/vecz/test/lit/llvm/cmpxchg.ll new file mode 100644 index 000000000..d62486409 --- /dev/null +++ b/modules/compiler/vecz/test/lit/llvm/cmpxchg.ll @@ -0,0 +1,161 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the 
"License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p, ptr %q, ptr %r) +define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + +; Test that this cmpxchg is scalarized. Not ideal, but hey. +; CHECK: [[A0:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 +; CHECK: [[A1:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 +; CHECK: [[A2:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 +; CHECK: [[A3:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4 + +; Then we insert the values into a strange struct +; CHECK: [[INS0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[A0]], 0 +; CHECK: [[INS1:%.*]] = insertvalue [4 x { i32, i1 }] [[INS0]], { i32, i1 } [[A1]], 1 +; CHECK: [[INS2:%.*]] = insertvalue [4 x { i32, i1 }] [[INS1]], { i32, i1 } [[A2]], 2 +; CHECK: [[INS3:%.*]] = insertvalue [4 x { i32, i1 }] [[INS2]], { i32, i1 } [[A3]], 3 + %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic + +; To extract from this result, we extract each element individually then insert +; each into a vector. 
+; CHECK: [[ELT0_0_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 0, 0 +; CHECK: [[ELT0_0_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 1, 0 +; CHECK: [[ELT0_0_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 2, 0 +; CHECK: [[ELT0_0_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 3, 0 +; CHECK: [[INS0V0_0:%.*]] = insertelement <4 x i32> undef, i32 [[ELT0_0_0]], i32 0 +; CHECK: [[INS0V0_1:%.*]] = insertelement <4 x i32> [[INS0V0_0]], i32 [[ELT0_0_1]], i32 1 +; CHECK: [[INS0V0_2:%.*]] = insertelement <4 x i32> [[INS0V0_1]], i32 [[ELT0_0_2]], i32 2 +; CHECK: [[INS0V0_3:%.*]] = insertelement <4 x i32> [[INS0V0_2]], i32 [[ELT0_0_3]], i32 3 + %val0 = extractvalue { i32, i1 } %old0, 0 +; Same again here +; CHECK: [[ELT1_0_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 0, 1 +; CHECK: [[ELT1_0_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 1, 1 +; CHECK: [[ELT1_0_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 2, 1 +; CHECK: [[ELT1_0_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 3, 1 +; CHECK: [[INS1V0_0:%.*]] = insertelement <4 x i1> undef, i1 [[ELT1_0_0]], i32 0 +; CHECK: [[INS1V0_1:%.*]] = insertelement <4 x i1> [[INS1V0_0]], i1 [[ELT1_0_1]], i32 1 +; CHECK: [[INS1V0_2:%.*]] = insertelement <4 x i1> [[INS1V0_1]], i1 [[ELT1_0_2]], i32 2 +; CHECK: [[INS1V0_3:%.*]] = insertelement <4 x i1> [[INS1V0_2]], i1 [[ELT1_0_3]], i32 3 + %success0 = extractvalue { i32, i1 } %old0, 1 + + %out = getelementptr i32, ptr %q, i64 %call +; Stored as a vector +; CHECK: store <4 x i32> [[INS0V0_3]], ptr + store i32 %val0, ptr %out, align 4 + +; CHECK: [[PTR:%.*]] = getelementptr i8, ptr %r, i64 %call + %outsuccess = getelementptr i8, ptr %r, i64 %call +; CHECK: [[ZEXT0:%.*]] = zext <4 x i1> [[INS1V0_3]] to <4 x i8> + %outbyte = zext i1 %success0 to i8 +; Stored as a vector +; CHECK: store <4 x i8> [[ZEXT0]], ptr [[PTR]], align 1 + store i8 %outbyte, ptr %outsuccess, align 1 + + ; Test a couple of insert/extract patterns + +; Test inserting a uniform value into a 
varying literal struct +; This is very inefficient +; CHECK: [[INSS0_0:%.*]] = insertvalue { i32, i1 } [[A0]], i1 false, 1 +; CHECK: [[INSS0_1:%.*]] = insertvalue { i32, i1 } [[A1]], i1 false, 1 +; CHECK: [[INSS0_2:%.*]] = insertvalue { i32, i1 } [[A2]], i1 false, 1 +; CHECK: [[INSS0_3:%.*]] = insertvalue { i32, i1 } [[A3]], i1 false, 1 +; CHECK: [[INSS1_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS0_0]], 0 +; CHECK: [[INSS1_1:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_0]], { i32, i1 } [[INSS0_1]], 1 +; CHECK: [[INSS1_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_1]], { i32, i1 } [[INSS0_2]], 2 +; CHECK: [[INSS1_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_2]], { i32, i1 } [[INSS0_3]], 3 +; CHECK: [[EXTS1_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 0, 1 +; CHECK: [[EXTS1_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 1, 1 +; CHECK: [[EXTS1_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 2, 1 +; CHECK: [[EXTS1_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 3, 1 +; CHECK: [[INS1V1_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS1_0]], i32 0 +; CHECK: [[INS1V1_1:%.*]] = insertelement <4 x i1> [[INS1V1_0]], i1 [[EXTS1_1]], i32 1 +; CHECK: [[INS1V1_2:%.*]] = insertelement <4 x i1> [[INS1V1_1]], i1 [[EXTS1_2]], i32 2 +; CHECK: [[INS1V1_3:%.*]] = insertelement <4 x i1> [[INS1V1_2]], i1 [[EXTS1_3]], i32 3 +; CHECK: [[ZEXT1:%.*]] = zext <4 x i1> [[INS1V1_3]] to <4 x i8> +; CHECK: store <4 x i8> [[ZEXT1]], ptr [[PTR]], align 1 + %testinsertconst = insertvalue { i32, i1 } %old0, i1 false, 1 + %testextract0 = extractvalue { i32, i1 } %testinsertconst, 1 + %outbyte0 = zext i1 %testextract0 to i8 + store i8 %outbyte0, ptr %outsuccess, align 1 + + ; Test inserting a varying value into a varying literal struct +; CHECK: [[V4I8_LD:%.*]] = load <4 x i8>, ptr %outsuccess, align 1 +; CHECK: [[TRUNC:%.*]] = trunc <4 x i8> [[V4I8_LD]] to <4 x i1> +; CHECK: [[EXTV0_0:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 0 +; CHECK: 
[[EXTV0_1:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 1 +; CHECK: [[EXTV0_2:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 2 +; CHECK: [[EXTV0_3:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 3 +; CHECK: [[INSS2_0:%.*]] = insertvalue { i32, i1 } [[A0]], i1 [[EXTV0_0]], 1 +; CHECK: [[INSS2_1:%.*]] = insertvalue { i32, i1 } [[A1]], i1 [[EXTV0_1]], 1 +; CHECK: [[INSS2_2:%.*]] = insertvalue { i32, i1 } [[A2]], i1 [[EXTV0_2]], 1 +; CHECK: [[INSS2_3:%.*]] = insertvalue { i32, i1 } [[A3]], i1 [[EXTV0_3]], 1 +; CHECK: [[INSS3_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS2_0]], 0 +; CHECK: [[INSS3_1:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_0]], { i32, i1 } [[INSS2_1]], 1 +; CHECK: [[INSS3_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_1]], { i32, i1 } [[INSS2_2]], 2 +; CHECK: [[INSS3_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_2]], { i32, i1 } [[INSS2_3]], 3 +; CHECK: [[EXTS3_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 0, 1 +; CHECK: [[EXTS3_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 1, 1 +; CHECK: [[EXTS3_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 2, 1 +; CHECK: [[EXTS3_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 3, 1 +; CHECK: [[INS1V2_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS3_0]], i32 0 +; CHECK: [[INS1V2_1:%.*]] = insertelement <4 x i1> [[INS1V2_0]], i1 [[EXTS3_1]], i32 1 +; CHECK: [[INS1V2_2:%.*]] = insertelement <4 x i1> [[INS1V2_1]], i1 [[EXTS3_2]], i32 2 +; CHECK: [[INS1V2_3:%.*]] = insertelement <4 x i1> [[INS1V2_2]], i1 [[EXTS3_3]], i32 3 +; CHECK: [[ZEXT2:%.*]] = zext <4 x i1> [[INS1V2_3]] to <4 x i8> +; CHECK: store <4 x i8> [[ZEXT2]], ptr [[PTR]], align 1 + %byte1 = load i8, ptr %outsuccess, align 1 + %bool1 = trunc i8 %byte1 to i1 + %testinsertvarying0 = insertvalue { i32, i1 } %old0, i1 %bool1, 1 + %testextract1 = extractvalue { i32, i1 } %testinsertvarying0, 1 + %outbyte1 = zext i1 %testextract1 to i8 + store i8 %outbyte1, ptr %outsuccess, align 1 + + ; Test inserting a 
varying value into a uniform literal struct +; CHECK: [[INSS4_0:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_0]], 1 +; CHECK: [[INSS4_1:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_1]], 1 +; CHECK: [[INSS4_2:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_2]], 1 +; CHECK: [[INSS4_3:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_3]], 1 +; CHECK: [[INSS5_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS4_0]], 0 +; CHECK: [[INSS5_1:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS5_0]], { i32, i1 } [[INSS4_1]], 1 +; CHECK: [[INSS5_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS5_1]], { i32, i1 } [[INSS4_2]], 2 +; CHECK: [[INSS5_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS5_2]], { i32, i1 } [[INSS4_3]], 3 +; CHECK: [[EXTS5_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 0, 1 +; CHECK: [[EXTS5_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 1, 1 +; CHECK: [[EXTS5_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 2, 1 +; CHECK: [[EXTS5_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 3, 1 +; CHECK: [[INS2V3_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS5_0]], i32 0 +; CHECK: [[INS2V3_1:%.*]] = insertelement <4 x i1> [[INS2V3_0]], i1 [[EXTS5_1]], i32 1 +; CHECK: [[INS2V3_2:%.*]] = insertelement <4 x i1> [[INS2V3_1]], i1 [[EXTS5_2]], i32 2 +; CHECK: [[INS2V3_3:%.*]] = insertelement <4 x i1> [[INS2V3_2]], i1 [[EXTS5_3]], i32 3 +; CHECK: [[ZEXT3:%.*]] = zext <4 x i1> [[INS2V3_3]] to <4 x i8> +; CHECK: store <4 x i8> [[ZEXT3]], ptr [[PTR]], align 1 + %testinsertvarying1 = insertvalue { i32, i1 } poison, i1 %bool1, 1 + %testextract2 = extractvalue { i32, i1 } %testinsertvarying1, 1 + %outbyte2 = zext i1 %testextract2 to i8 + store i8 %outbyte2, ptr %outsuccess, align 1 + + ret void +} + +declare i64 @__mux_get_global_id(i32)