From 6620c870675fc40e7c838e123cf2aae34ae5683d Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 9 Jan 2024 12:36:40 +0000 Subject: [PATCH] [vecz] Ensure inactive lanes don't contribute to branch conditions In divergent blocks, we must ensure that masked-out - or inactive - work-items don't contribute a 'true' value towards the branch condition. Masked-out values may be poison or undef, such as those coming from a masked-load operation. Note that this can happen even with ostensibly 'uniform' values, as our uniform/divergent analysis isn't capable of discerning an unconditionally uniform value from one that's uniform but produces poison/garbage when masked out. To work around this, we ensure that branch conditions in divergent blocks are masked with the block's entry mask. This should ensure that only lanes that are meant to be active at that point contribute their values. This is likely not a 100% complete fix, as we don't perform this on uniform blocks or loops. There are tests with 'uniform' loops which are unconditionally entered despite no work-items being active. As such, if we mask the loop exit condition with the entry mask, it will never be true, and loop will never exit. This should be good enough to fix some known regressions - the more correct fix would likely involve a lot more work. Note also that it is pessimistic in many of the vecz test changes. Some (unmasked) uniform conditions don't need this applied, as they truly always produce the same value even on inactive lanes. We don't have to tools to distinguish these values from the ones that do need masking. --- .../control_flow_conversion_pass.cpp | 23 ++- .../vecz/test/lit/llvm/Boscc/boscc_merge.ll | 5 +- .../vecz/test/lit/llvm/Boscc/boscc_merge3.ll | 4 +- .../lit/llvm/Boscc/partial_linearization12.ll | 8 +- .../lit/llvm/Boscc/partial_linearization13.ll | 4 +- .../lit/llvm/Boscc/partial_linearization17.ll | 4 +- .../lit/llvm/Boscc/partial_linearization18.ll | 4 +- .../lit/llvm/Boscc/partial_linearization19.ll | 4 +- .../lit/llvm/Boscc/partial_linearization5.ll | 4 +- .../lit/llvm/Boscc/partial_linearization6.ll | 4 +- .../lit/llvm/Boscc/partial_linearization7.ll | 4 +- .../vecz/test/lit/llvm/divergent_loop_bug.ll | 186 ++++++++++++++++++ .../test/lit/llvm/partial_linearization12.ll | 8 +- .../test/lit/llvm/partial_linearization13.ll | 4 +- .../test/lit/llvm/partial_linearization17.ll | 4 +- .../test/lit/llvm/partial_linearization18.ll | 4 +- .../test/lit/llvm/partial_linearization19.ll | 4 +- .../test/lit/llvm/partial_linearization5.ll | 4 +- .../test/lit/llvm/partial_linearization7.ll | 4 +- 19 files changed, 266 insertions(+), 20 deletions(-) create mode 100644 modules/compiler/vecz/test/lit/llvm/divergent_loop_bug.ll diff --git a/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp b/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp index f7649b5b4..f3b0ef34a 100644 --- a/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp +++ b/modules/compiler/vecz/source/transform/control_flow_conversion_pass.cpp @@ -1503,11 +1503,32 @@ bool ControlFlowConversionState::Impl::createBranchReductions() { auto *TI = BB.getTerminator(); if (BranchInst *Branch = dyn_cast(TI)) { if (Branch->isConditional()) { - auto *const cond = Branch->getCondition(); + auto *cond = Branch->getCondition(); if (isa(cond)) { continue; } + // On divergent paths, ensure that only active lanes contribute to a + // branch condition; merge the branch condition with the active lane + // mask. This ensures that disabled lanes don't spuriously contribute a + // 'true' value into the reduced branch condition. + // Note that the distinction between 'uniform' and 'divergent' isn't + // 100% sufficient for our purposes here, because even uniform values + // may read undefined/poison values when masked out. + // Don't perform this on uniform loops as those may be unconditionally + // entered even when no work-items are active. Masking the loop exit + // with the entry mask would mean that the loop never exits. + // FIXME: Is this missing incorrect branches in uniform blocks/loops? + // FIXME: This is pessimistic - some branch condition values are truly + // uniform even on masked-out lanes - but we don't have sophisticated + // enough analysis to discern true positives from false positives. + if (auto *LTag = DR->getTag(&BB).loop; + DR->isDivergent(BB) && (!LTag || LTag->isLoopDivergent())) { + cond = BinaryOperator::Create(Instruction::BinaryOps::And, cond, + MaskInfos[&BB].entryMask, + cond->getName() + "_active", Branch); + } + const auto &name = needsAllOfMask ? nameAll : nameAny; Function *const F = Ctx.getOrCreateInternalBuiltin( Twine(baseName).concat(name).str(), FT); diff --git a/modules/compiler/vecz/test/lit/llvm/Boscc/boscc_merge.ll b/modules/compiler/vecz/test/lit/llvm/Boscc/boscc_merge.ll index e8e6062d8..469725b7c 100644 --- a/modules/compiler/vecz/test/lit/llvm/Boscc/boscc_merge.ll +++ b/modules/compiler/vecz/test/lit/llvm/Boscc/boscc_merge.ll @@ -122,6 +122,7 @@ if.else6: ; preds = %if.then6, %if.e ; CHECK: br i1 %[[CMP1]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]] ; CHECK: [[IFTHEN]]: +; CHECK: %[[CMP2:.+]] = icmp ; CHECK: br i1 %{{.+}}, label %[[IFTHEN2UNIFORM:.+]], label %[[IFTHENBOSCCINDIR:.+]] ; CHECK: [[IFELSE2PREHEADERUNIFORM:.+]]: @@ -220,7 +221,9 @@ if.else6: ; preds = %if.then6, %if.e ; CHECK: [[IFTHEN2:.+]]: ; CHECK: %[[CMP3:.+]] = icmp -; CHECK: br i1 %[[CMP3]], label %[[IFTHEN3PREHEADER:.+]], label %[[IFELSE3PREHEADER:.+]] +; CHECK: %[[CMP3_ACTIVE:.+]] = and i1 %[[CMP3]], %[[CMP2]] +; CHECK: %[[CMP3_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP3_ACTIVE]]) +; CHECK: br i1 %[[CMP3_ACTIVE_ANY]], label %[[IFTHEN3PREHEADER:.+]], label %[[IFELSE3PREHEADER:.+]] ; CHECK: [[IFELSE3PREHEADER]]: ; CHECK: br label %[[IFELSE3]] diff --git a/modules/compiler/vecz/test/lit/llvm/Boscc/boscc_merge3.ll b/modules/compiler/vecz/test/lit/llvm/Boscc/boscc_merge3.ll index ccd79ca20..3986db936 100644 --- a/modules/compiler/vecz/test/lit/llvm/Boscc/boscc_merge3.ll +++ b/modules/compiler/vecz/test/lit/llvm/Boscc/boscc_merge3.ll @@ -96,7 +96,9 @@ attributes #0 = { nounwind readnone } ; CHECK: if.then1: ; CHECK: %gep1 = -; CHECK: br i1 %cmp2, label %if.then2, label %if.end1 +; CHECK: %[[CMP2_ACTIVE:.+]] = and i1 %cmp2, %cmp1 +; CHECK: %[[CMP2_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP2_ACTIVE]]) +; CHECK: br i1 %[[CMP2_ACTIVE_ANY]], label %if.then2, label %if.end1 ; Generalizing the expected %cmp3 value because the 'icmp' could go off ; by one BB between LLVM versions. Therefore we can get %cmp3.not. diff --git a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization12.ll b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization12.ll index eba24fbbd..763c8e960 100644 --- a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization12.ll +++ b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization12.ll @@ -667,7 +667,9 @@ attributes #2 = { nobuiltin nounwind readonly } ; CHECK: [[IFTHEN38]]: ; CHECK: %[[CMP42:cmp.+]] = icmp -; CHECK: br i1 %[[CMP42]], label %[[IFTHEN44:.+]], label %[[IFELSE46:.+]] +; CHECK: %[[CMP42_ACTIVE:.+]] = and i1 %[[CMP42]], {{%.*}} +; CHECK: %[[CMP42_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP42_ACTIVE]]) +; CHECK: br i1 %[[CMP42_ACTIVE_ANY]], label %[[IFTHEN44:.+]], label %[[IFELSE46:.+]] ; CHECK: [[IFTHEN44]]: ; CHECK: br label %[[IFELSE48:.+]] @@ -677,7 +679,9 @@ attributes #2 = { nobuiltin nounwind readonly } ; CHECK: [[IFELSE48]]: ; CHECK: %[[CMP49:.+]] = icmp -; CHECK: br i1 %[[CMP49]], label %[[IFTHEN51:.+]], label %[[IFELSE53:.+]] +; CHECK: %[[CMP49_ACTIVE:.+]] = and i1 %[[CMP49]], {{%.*}} +; CHECK: %[[CMP49_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP49_ACTIVE]]) +; CHECK: br i1 %[[CMP49_ACTIVE_ANY]], label %[[IFTHEN51:.+]], label %[[IFELSE53:.+]] ; CHECK: [[IFTHEN51]]: ; CHECK: br label %[[N58:.+]] diff --git a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization13.ll b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization13.ll index f7937ad89..7f61d8736 100644 --- a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization13.ll +++ b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization13.ll @@ -228,7 +228,9 @@ attributes #2 = { nobuiltin nounwind readonly } ; CHECK: [[IFTHEN4]]: ; CHECK: %[[TRUNC:.+]] = icmp -; CHECK: br i1 %[[TRUNC]], label %[[SWBB8:.+]], label %[[SWBB:.+]] +; CHECK: %[[TRUNC_ACTIVE:.+]] = and i1 %[[TRUNC]], {{%.*}} +; CHECK: %[[TRUNC_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[TRUNC_ACTIVE]]) +; CHECK: br i1 %[[TRUNC_ACTIVE_ANY]], label %[[SWBB8:.+]], label %[[SWBB:.+]] ; CHECK: [[SWBB]]: ; CHECK: br label %[[SWBB8]] diff --git a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization17.ll b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization17.ll index 5e7b83b78..1c2450f3a 100644 --- a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization17.ll +++ b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization17.ll @@ -277,7 +277,9 @@ attributes #2 = { convergent nobuiltin nounwind readonly } ; CHECK: [[WHILEBODY]]: ; CHECK: %[[CMP:.+]] = icmp -; CHECK: br i1 %[[CMP]], label %[[FORCOND28PREHEADER:.+]], label %[[IFELSE:.+]] +; CHECK: %[[CMP_ACTIVE:.+]] = and i1 %[[CMP]], {{%.*}} +; CHECK: %[[CMP_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP_ACTIVE]]) +; CHECK: br i1 %[[CMP_ACTIVE_ANY]], label %[[FORCOND28PREHEADER:.+]], label %[[IFELSE:.+]] ; CHECK: [[FORCOND28PREHEADER]]: ; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] diff --git a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization18.ll b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization18.ll index 571c0a48f..fffcb6dbe 100644 --- a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization18.ll +++ b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization18.ll @@ -220,7 +220,9 @@ attributes #2 = { convergent nobuiltin nounwind readonly } ; CHECK: [[WHILEBODY]]: ; CHECK: %[[CMP:.+]] = icmp -; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] +; CHECK: %[[CMP_ACTIVE:.+]] = and i1 %[[CMP]], {{%.*}} +; CHECK: %[[CMP_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP_ACTIVE]]) +; CHECK: br i1 %[[CMP_ACTIVE_ANY]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] ; CHECK: [[IFTHEN]]: ; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] diff --git a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization19.ll b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization19.ll index 7b67cbd48..7faea031b 100644 --- a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization19.ll +++ b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization19.ll @@ -233,7 +233,9 @@ attributes #2 = { convergent nobuiltin nounwind readonly } ; CHECK: [[WHILEBODY]]: ; CHECK: %[[CMP:.+]] = icmp -; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] +; CHECK: %[[CMP_ACTIVE:.+]] = and i1 %[[CMP]], {{%.*}} +; CHECK: %[[CMP_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP_ACTIVE]]) +; CHECK: br i1 %[[CMP_ACTIVE_ANY]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] ; CHECK: [[IFTHEN]]: ; CHECK: %[[CMP2:.+]] = icmp diff --git a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization5.ll b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization5.ll index cb6acb6c7..ca2b671e2 100644 --- a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization5.ll +++ b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization5.ll @@ -227,7 +227,9 @@ attributes #2 = { nobuiltin nounwind readonly } ; CHECK: [[IFELSE5]]: ; CHECK: %[[CMP7:.+]] = icmp -; CHECK: br i1 %[[CMP7]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]] +; CHECK: %[[CMP7_ACTIVE:.+]] = and i1 %[[CMP7]], {{%.*}} +; CHECK: %[[CMP7_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP7_ACTIVE]]) +; CHECK: br i1 %[[CMP7_ACTIVE_ANY]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]] ; CHECK: [[FORCOND14PREHEADER]]: ; CHECK: br label %[[FORCOND14:.+]] diff --git a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization6.ll b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization6.ll index c5a8af2b8..bd2027413 100644 --- a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization6.ll +++ b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization6.ll @@ -168,7 +168,9 @@ attributes #2 = { nobuiltin nounwind readonly } ; CHECK: [[WHILEBODY]]: ; CHECK: %[[CMP:.+]] = icmp -; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]] +; CHECK: %[[CMP_ACTIVE:.+]] = and i1 %[[CMP]], {{%.*}} +; CHECK: %[[CMP_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP_ACTIVE]]) +; CHECK: br i1 %[[CMP_ACTIVE_ANY]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]] ; CHECK: [[IFTHEN]]: ; CHECK: %[[CMP2:.+]] = icmp diff --git a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization7.ll b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization7.ll index 1bbf53c65..4024f9bf5 100644 --- a/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization7.ll +++ b/modules/compiler/vecz/test/lit/llvm/Boscc/partial_linearization7.ll @@ -238,7 +238,9 @@ attributes #2 = { nobuiltin nounwind readonly } ; CHECK: [[E]]: ; CHECK: %[[CMP13:.+]] = icmp -; CHECK: br i1 %[[CMP13]], label %[[G:.+]], label %[[H:.+]] +; CHECK: %[[CMP13_ACTIVE:.+]] = and i1 %[[CMP13]], {{%.*}} +; CHECK: %[[CMP13_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP13_ACTIVE]]) +; CHECK: br i1 %[[CMP13_ACTIVE_ANY]], label %[[G:.+]], label %[[H:.+]] ; CHECK: [[G]]: ; CHECK: br label %[[FORCOND19:.+]] diff --git a/modules/compiler/vecz/test/lit/llvm/divergent_loop_bug.ll b/modules/compiler/vecz/test/lit/llvm/divergent_loop_bug.ll new file mode 100644 index 000000000..9eacfe58c --- /dev/null +++ b/modules/compiler/vecz/test/lit/llvm/divergent_loop_bug.ll @@ -0,0 +1,186 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=cfg-convert -S < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: define spir_kernel void @__vecz_v4_uniform_if_then_in_divergent_block( +; CHECK-SAME: ptr addrspace(1) %accum_ptr, i32 %threshold, ptr addrspace(1) %out) +define spir_kernel void @uniform_if_then_in_divergent_block(ptr addrspace(1) %accum_ptr, i32 %threshold, ptr addrspace(1) %out) #4 !reqd_work_group_size !10 { +; CHECK: entry: +; CHECK: [[CMP_NOT:%.*]] = icmp slt i32 %0, %threshold +; CHECK: %cmp.not.ROSCC = icmp eq i1 [[CMP_NOT]], false +; CHECK: %cmp.not.ROSCC_any = call i1 @__vecz_b_divergence_any(i1 %cmp.not.ROSCC) +; CHECK: br i1 %cmp.not.ROSCC_any, label %entry.ROSCC, label %entry.if.end17_crit_edge +entry: + %cosa = alloca float, align 4 + %call = tail call i64 @__mux_get_global_id(i32 0) #5 + %sext = mul i64 %call, 51539607552 + %idx.ext = ashr exact i64 %sext, 32 + %add.ptr = getelementptr inbounds i32, ptr addrspace(1) %accum_ptr, i64 %idx.ext + %0 = load i32, ptr addrspace(1) %add.ptr, align 4 + %cmp.not = icmp slt i32 %0, %threshold + br i1 %cmp.not, label %entry.if.end17_crit_edge, label %if.then + +; CHECK: entry.ROSCC: +; CHECK: [[CMP_NOT_NOT:%.*]] = xor i1 [[CMP_NOT]], true +; CHECK: br label %if.then + +entry.if.end17_crit_edge: ; preds = %entry + br label %if.end17 + +; Ensure that only active lanes (masked by %cmp.not.not) contribute towards the +; %or.cond branch. +; CHECK: if.then: +; CHECK: call void @__vecz_b_masked_store4_fu3ptrb(float 0.000000e+00, ptr %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %1 = call spir_func float @__vecz_b_masked__Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa, i1 [[CMP_NOT_NOT]]) #9 +; CHECK: %2 = call float @__vecz_b_masked_load4_fu3ptrb(ptr %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %mul7 = fmul float %2, -2.950000e+01 +; CHECK: %cmp11 = fcmp uge float %mul7, 0.000000e+00 +; CHECK: %cmp14 = fcmp ult float %mul7, 6.400000e+01 +; CHECK: %or.cond = and i1 %cmp11, %cmp14 +; CHECK: %or.cond_active = and i1 %or.cond, [[CMP_NOT_NOT]] +; CHECK: %or.cond_active_any = call i1 @__vecz_b_divergence_any(i1 %or.cond_active) +; CHECK: br i1 %or.cond_active_any, label %if.then.if.end_crit_edge, label %if.then16 +if.then: ; preds = %entry + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cosa) #6 + store float 0.000000e+00, ptr %cosa, align 4 + %call4 = call spir_func float @_Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa) #7 + %1 = load float, ptr %cosa, align 4 + %mul7 = fmul float %1, -2.950000e+01 + %cmp11 = fcmp uge float %mul7, 0.000000e+00 + %cmp14 = fcmp ult float %mul7, 6.400000e+01 + %or.cond = and i1 %cmp11, %cmp14 + br i1 %or.cond, label %if.then.if.end_crit_edge, label %if.then16 + +if.then.if.end_crit_edge: ; preds = %if.then + br label %if.end + +if.then16: ; preds = %if.then + %sext2 = shl i64 %call, 32 + %idxprom = ashr exact i64 %sext2, 32 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %out, i64 %idxprom + store float %mul7, ptr addrspace(1) %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then.if.end_crit_edge, %if.then16 + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cosa) #6 + br label %if.end17 + +if.end17: ; preds = %entry.if.end17_crit_edge, %if.end + ret void +} + +define spir_kernel void @uniform_if_else_in_divergent_block(ptr addrspace(1) %accum_ptr, i32 %threshold, ptr addrspace(1) %out) #4 !reqd_work_group_size !10 { +; CHECK: entry: +; CHECK: [[CMP_NOT:%.*]] = icmp slt i32 %0, %threshold +; CHECK: %cmp.not.ROSCC = icmp eq i1 [[CMP_NOT]], false +; CHECK: %cmp.not.ROSCC_any = call i1 @__vecz_b_divergence_any(i1 %cmp.not.ROSCC) +; CHECK: br i1 %cmp.not.ROSCC_any, label %entry.ROSCC, label %entry.if.end17_crit_edge +entry: + %cosa = alloca float, align 4 + %call = tail call i64 @__mux_get_global_id(i32 0) #5 + %sext = mul i64 %call, 51539607552 + %idx.ext = ashr exact i64 %sext, 32 + %add.ptr = getelementptr inbounds i32, ptr addrspace(1) %accum_ptr, i64 %idx.ext + %0 = load i32, ptr addrspace(1) %add.ptr, align 4 + %cmp.not = icmp slt i32 %0, %threshold + br i1 %cmp.not, label %entry.if.end17_crit_edge, label %if.then + +; CHECK: entry.ROSCC: +; CHECK: [[CMP_NOT_NOT:%.*]] = xor i1 [[CMP_NOT]], true +; CHECK: br label %if.then + +entry.if.end17_crit_edge: ; preds = %entry + br label %if.end17 + +; Ensure that only active lanes (masked by %cmp.not.not) contribute towards the +; %or.cond branch. +; CHECK: if.then: +; CHECK: call void @__vecz_b_masked_store4_fu3ptrb(float 0.000000e+00, ptr %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %1 = call spir_func float @__vecz_b_masked__Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa, i1 [[CMP_NOT_NOT]]) #9 +; CHECK: %2 = call float @__vecz_b_masked_load4_fu3ptrb(ptr %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %mul7 = fmul float %2, -2.950000e+01 +; CHECK: %cmp11 = fcmp uge float %mul7, 0.000000e+00 +; CHECK: %cmp14 = fcmp ult float %mul7, 6.400000e+01 +; CHECK: %or.cond = and i1 %cmp11, %cmp14 +; CHECK: %or.cond_active = and i1 %or.cond, [[CMP_NOT_NOT]] +; CHECK: %or.cond_active_any = call i1 @__vecz_b_divergence_any(i1 %or.cond_active) +; CHECK: br i1 %or.cond_active_any, label %if.else.crit_edge, label %if.then16 +if.then: ; preds = %entry + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cosa) #6 + store float 0.000000e+00, ptr %cosa, align 4 + %call4 = call spir_func float @_Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa) #7 + %1 = load float, ptr %cosa, align 4 + %mul7 = fmul float %1, -2.950000e+01 + %cmp11 = fcmp uge float %mul7, 0.000000e+00 + %cmp14 = fcmp ult float %mul7, 6.400000e+01 + %or.cond = and i1 %cmp11, %cmp14 + br i1 %or.cond, label %if.else.crit_edge, label %if.then16 + +if.else.crit_edge: ; preds = %if.then + br label %if.else + +if.then16: ; preds = %if.then + %sext2 = shl i64 %call, 32 + %idxprom = ashr exact i64 %sext2, 32 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %out, i64 %idxprom + store float %mul7, ptr addrspace(1) %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %idxprom + store float 1.0, ptr addrspace(1) %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then16 + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cosa) #6 + br label %if.end17 + +if.end17: ; preds = %entry.if.end17_crit_edge, %if.end + ret void +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind +declare spir_func float @_Z6sincosfPf(float, ptr) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: alwaysinline norecurse nounwind memory(read) +declare i64 @__mux_get_global_id(i32) #3 + +attributes #0 = { norecurse nounwind "mux-kernel"="entry-point" "mux-local-mem-usage"="0" "mux-no-subgroups" "mux-orig-fn"="get_lines" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "vecz-mode"="auto" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) "vecz-mode"="auto" } +attributes #2 = { nounwind "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="0" "stackrealign" "vecz-mode"="auto" } +attributes #3 = { alwaysinline norecurse nounwind memory(read) "vecz-mode"="auto" } +attributes #4 = { norecurse nounwind "mux-base-fn-name"="get_lines" "mux-kernel"="entry-point" "mux-local-mem-usage"="0" "mux-no-subgroups" "mux-orig-fn"="get_lines" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "vecz-mode"="auto" } +attributes #5 = { alwaysinline norecurse nounwind memory(read) } +attributes #6 = { nounwind } +attributes #7 = { nobuiltin nounwind "no-builtins" } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!2} +!opencl.spir.version = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 1, i32 2} +!10 = !{i32 2, i32 1, i32 1} diff --git a/modules/compiler/vecz/test/lit/llvm/partial_linearization12.ll b/modules/compiler/vecz/test/lit/llvm/partial_linearization12.ll index f226c3eb5..e338a7352 100644 --- a/modules/compiler/vecz/test/lit/llvm/partial_linearization12.ll +++ b/modules/compiler/vecz/test/lit/llvm/partial_linearization12.ll @@ -512,7 +512,9 @@ attributes #2 = { nobuiltin nounwind readonly } ; CHECK: [[IFTHEN38]]: ; CHECK: %[[CMP42:.+]] = icmp slt i32 -; CHECK: br i1 %[[CMP42]], label %[[IFTHEN44:.+]], label %[[IFELSE46:.+]] +; CHECK: %[[CMP42_ACTIVE:.+]] = and i1 %[[CMP42]], {{%.*}} +; CHECK: %[[CMP42_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP42_ACTIVE]]) +; CHECK: br i1 %[[CMP42_ACTIVE_ANY]], label %[[IFTHEN44:.+]], label %[[IFELSE46:.+]] ; CHECK: [[IFTHEN44]]: ; CHECK: br label %[[IFELSE48:.+]] @@ -522,7 +524,9 @@ attributes #2 = { nobuiltin nounwind readonly } ; CHECK: [[IFELSE48]]: ; CHECK: %[[CMP49:.+]] = icmp -; CHECK: br i1 %[[CMP49]], label %[[IFTHEN51:.+]], label %[[IFELSE53:.+]] +; CHECK: %[[CMP49_ACTIVE:.+]] = and i1 %[[CMP49]], {{%.*}} +; CHECK: %[[CMP49_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP49_ACTIVE]]) +; CHECK: br i1 %[[CMP49_ACTIVE_ANY]], label %[[IFTHEN51:.+]], label %[[IFELSE53:.+]] ; CHECK: [[IFTHEN51]]: ; CHECK: br label %[[N58:.+]] diff --git a/modules/compiler/vecz/test/lit/llvm/partial_linearization13.ll b/modules/compiler/vecz/test/lit/llvm/partial_linearization13.ll index 5385a5ab9..a5150be43 100644 --- a/modules/compiler/vecz/test/lit/llvm/partial_linearization13.ll +++ b/modules/compiler/vecz/test/lit/llvm/partial_linearization13.ll @@ -198,7 +198,9 @@ attributes #2 = { nobuiltin nounwind readonly } ; CHECK: [[IFTHEN4]]: ; CHECK: %[[TMP:.+]] = and i64 %call1, 1 ; CHECK: %[[TRUNC:.+]] = icmp eq i64 %[[TMP]], 0 -; CHECK: br i1 %[[TRUNC]], label %[[SWBB8:.+]], label %[[SWBB:.+]] +; CHECK: %[[TRUNC_ACTIVE:.+]] = and i1 %[[TRUNC]], {{%.*}} +; CHECK: %[[TRUNC_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[TRUNC_ACTIVE]]) +; CHECK: br i1 %[[TRUNC_ACTIVE_ANY]], label %[[SWBB8:.+]], label %[[SWBB:.+]] ; CHECK: [[SWBB]]: ; CHECK: br label %[[SWBB8]] diff --git a/modules/compiler/vecz/test/lit/llvm/partial_linearization17.ll b/modules/compiler/vecz/test/lit/llvm/partial_linearization17.ll index 4dd7317c2..fa8995233 100644 --- a/modules/compiler/vecz/test/lit/llvm/partial_linearization17.ll +++ b/modules/compiler/vecz/test/lit/llvm/partial_linearization17.ll @@ -275,7 +275,9 @@ attributes #2 = { convergent nobuiltin nounwind readonly } ; CHECK: [[WHILEBODY]]: ; CHECK: %[[CMP:.+]] = icmp -; CHECK: br i1 %[[CMP]], label %[[FORCOND28PREHEADER:.+]], label %[[IFELSE:.+]] +; CHECK: %[[CMP_ACTIVE:.+]] = and i1 %[[CMP]], {{%.*}} +; CHECK: %[[CMP_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP_ACTIVE]]) +; CHECK: br i1 %[[CMP_ACTIVE_ANY]], label %[[FORCOND28PREHEADER:.+]], label %[[IFELSE:.+]] ; CHECK: [[FORCOND28PREHEADER]]: ; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] diff --git a/modules/compiler/vecz/test/lit/llvm/partial_linearization18.ll b/modules/compiler/vecz/test/lit/llvm/partial_linearization18.ll index 3fc928055..42ef4ead7 100644 --- a/modules/compiler/vecz/test/lit/llvm/partial_linearization18.ll +++ b/modules/compiler/vecz/test/lit/llvm/partial_linearization18.ll @@ -218,7 +218,9 @@ attributes #2 = { convergent nobuiltin nounwind readonly } ; CHECK: [[WHILEBODY]]: ; CHECK: %[[CMP:.+]] = icmp -; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] +; CHECK: %[[CMP_ACTIVE:.+]] = and i1 %[[CMP]], {{%.*}} +; CHECK: %[[CMP_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP_ACTIVE]]) +; CHECK: br i1 %[[CMP_ACTIVE_ANY]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] ; CHECK: [[IFTHEN]]: ; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] diff --git a/modules/compiler/vecz/test/lit/llvm/partial_linearization19.ll b/modules/compiler/vecz/test/lit/llvm/partial_linearization19.ll index b5c52ad8c..171e67318 100644 --- a/modules/compiler/vecz/test/lit/llvm/partial_linearization19.ll +++ b/modules/compiler/vecz/test/lit/llvm/partial_linearization19.ll @@ -231,7 +231,9 @@ attributes #2 = { convergent nobuiltin nounwind readonly } ; CHECK: [[WHILEBODY]]: ; CHECK: %[[CMP:.+]] = icmp -; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] +; CHECK: %[[CMP_ACTIVE:.+]] = and i1 %[[CMP]], {{%.*}} +; CHECK: %[[CMP_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP_ACTIVE]]) +; CHECK: br i1 %[[CMP_ACTIVE_ANY]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] ; CHECK: [[IFTHEN]]: ; CHECK: %[[CMP2:.+]] = icmp diff --git a/modules/compiler/vecz/test/lit/llvm/partial_linearization5.ll b/modules/compiler/vecz/test/lit/llvm/partial_linearization5.ll index 520b069f5..9db40f122 100644 --- a/modules/compiler/vecz/test/lit/llvm/partial_linearization5.ll +++ b/modules/compiler/vecz/test/lit/llvm/partial_linearization5.ll @@ -187,7 +187,9 @@ attributes #2 = { nobuiltin nounwind readonly } ; CHECK: [[IFELSE5]]: ; CHECK: %[[CMP7:.+]] = icmp -; CHECK: br i1 %[[CMP7]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]] +; CHECK: %[[CMP7_ACTIVE:.+]] = and i1 %[[CMP7]], {{%.*}} +; CHECK: %[[CMP7_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP7_ACTIVE]]) +; CHECK: br i1 %[[CMP7_ACTIVE_ANY]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]] ; CHECK: [[FORCOND14PREHEADER]]: ; CHECK: br label %[[FORCOND14:.+]] diff --git a/modules/compiler/vecz/test/lit/llvm/partial_linearization7.ll b/modules/compiler/vecz/test/lit/llvm/partial_linearization7.ll index 13bbb4131..4a84dbf04 100644 --- a/modules/compiler/vecz/test/lit/llvm/partial_linearization7.ll +++ b/modules/compiler/vecz/test/lit/llvm/partial_linearization7.ll @@ -204,7 +204,9 @@ attributes #2 = { nobuiltin nounwind readonly } ; CHECK: [[E]]: ; CHECK: %[[CMP13:.+]] = icmp -; CHECK: br i1 %[[CMP13]], label %[[G:.+]], label %[[H:.+]] +; CHECK: %[[CMP13_ACTIVE:.+]] = and i1 %[[CMP13]], {{%.*}} +; CHECK: %[[CMP13_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP13_ACTIVE]]) +; CHECK: br i1 %[[CMP13_ACTIVE_ANY]], label %[[G:.+]], label %[[H:.+]] ; CHECK: [[G]]: ; CHECK: br label %[[FORCOND19:.+]]