Skip to content

Commit

Permalink
[Generic][AIE2] Combiner for shufflevectors that use build vector
Browse files Browse the repository at this point in the history
Transforms a shufflevector whose sources are build vectors or undefined
into just a build vector. This can be done because a shuffle
vector lowering is an unmerge followed by a merge. Since a build vector is a
merge, the merge and unmerge cancel each other out and we can just
merge the vector directly.

Example:
```
    %1:_(s32) = COPY $r0
    %3:_(<8 x s32>) = G_IMPLICIT_DEF
    %5:_(s32) = G_IMPLICIT_DEF
    %2:_(<8 x s32>) = G_BUILD_VECTOR %1(s32), %5(s32), %5(s32), %5(s32), %5(s32), %5(s32), %5(s32), %5(s32)
    %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %2(<8 x s32>), %3, shufflemask(0, 0, 0, 0, 0, 0, 0, 0)
    ===>
    %2:_(<8 x s32>) = G_BUILD_VECTOR %1(s32), %1(s32), %1(s32), %1(s32), %1(s32), %1(s32), %1(s32), %1(s32)
```
  • Loading branch information
ValentijnvdBeek committed Aug 5, 2024
1 parent 84f3995 commit fee2d99
Show file tree
Hide file tree
Showing 11 changed files with 591 additions and 58 deletions.
9 changes: 9 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,15 @@ class CombinerHelper {
applyCombineUnmergeMergeToPlainValues(MachineInstr &MI,
SmallVectorImpl<Register> &Operands);

/// Transform a G_SHUFFLE_VECTOR whose two sources are each defined by a
/// G_BUILD_VECTOR or a G_IMPLICIT_DEF into a single G_BUILD_VECTOR of the
/// selected scalars (or a G_IMPLICIT_DEF when both sources are undefined):
///   <ty, ...> G_SHUFFLE_VECTOR(G_BUILD_VECTOR X Y Z, undef), mask
///     -> G_BUILD_VECTOR of the elements picked by mask
/// On a successful match, \p Operands receives the scalar elements of both
/// sources, first source followed by second source.
bool
matchCombineShuffleVectorBuildVector(MachineInstr &MI,
SmallVectorImpl<Register> &Operands);

/// Rewrite the matched G_SHUFFLE_VECTOR \p MI using the \p Operands collected
/// by matchCombineShuffleVectorBuildVector, then erase \p MI.
void
applyCombineShuffleVectorBuildVector(MachineInstr &MI,
SmallVectorImpl<Register> &Operands);

/// Transform G_UNMERGE Constant -> Constant1, Constant2, ...
bool matchCombineUnmergeConstant(MachineInstr &MI,
SmallVectorImpl<APInt> &Csts);
Expand Down
19 changes: 19 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. or its
// affiliates
//
//===----------------------------------------------------------------------===//
/// \file
/// Declares convenience wrapper classes for interpreting MachineInstr instances
Expand Down Expand Up @@ -240,6 +243,22 @@ class GUnmerge : public GenericMachineInstr {
}
};

/// Convenience wrapper for a G_SHUFFLE_VECTOR instruction.
class GShuffleVector : public GenericMachineInstr {
public:
  /// Number of source registers, computed as the operand count minus two
  /// (presumably the result and the shuffle mask operands).
  unsigned getNumSources() const {
    const unsigned OperandCount = getNumOperands();
    return OperandCount - 2;
  }

  /// Source register number \p I (zero-based). Operand 0 is skipped, so source
  /// \p I is read from operand \p I + 1.
  Register getSourceReg(unsigned I) const {
    const unsigned OperandIdx = I + 1;
    assert(OperandIdx <= getNumSources());
    return getReg(OperandIdx);
  }

  /// LLVM-style RTTI hook: true only for G_SHUFFLE_VECTOR instructions.
  static bool classof(const MachineInstr *MI) {
    return TargetOpcode::G_SHUFFLE_VECTOR == MI->getOpcode();
  }
};

/// Represents G_BUILD_VECTOR, G_CONCAT_VECTORS or G_MERGE_VALUES.
/// All these have the common property of generating a single value from
/// multiple sources.
Expand Down
12 changes: 11 additions & 1 deletion llvm/include/llvm/Target/GlobalISel/Combine.td
Original file line number Diff line number Diff line change
Expand Up @@ -756,7 +756,7 @@ def fneg_fneg_fold: GICombineRule <
(apply (GIReplaceReg $dst, $src))
>;

// Fold (unmerge(merge x, y, z)) -> z, y, z.
// Fold (unmerge(merge x, y, z)) -> x, y, z.
def unmerge_merge_matchinfo : GIDefMatchData<"SmallVector<Register, 8>">;
def unmerge_merge : GICombineRule<
(defs root:$d, unmerge_merge_matchinfo:$info),
Expand All @@ -765,6 +765,16 @@ def unmerge_merge : GICombineRule<
(apply [{ Helper.applyCombineUnmergeMergeToPlainValues(*${d}, ${info}); }])
>;

// Fold shuffle_vector(build_vector|undef, build_vector|undef) into a single
// build_vector of the selected elements (or undef if both sources are undef).
def shufflevector_merge_matchinfo : GIDefMatchData<"SmallVector<Register, 8>">;
def shufflevector_merge : GICombineRule<
(defs root:$d, shufflevector_merge_matchinfo:$info),
(match (wip_match_opcode G_SHUFFLE_VECTOR): $d,
[{ return Helper.matchCombineShuffleVectorBuildVector(*${d}, ${info}); }]),
(apply [{ Helper.applyCombineShuffleVectorBuildVector(*${d}, ${info}); }])
>;


// Fold merge(unmerge).
def merge_unmerge : GICombineRule<
(defs root:$d, register_matchinfo:$matchinfo),
Expand Down
88 changes: 88 additions & 0 deletions llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
Expand All @@ -27,6 +28,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterBankInfo.h"
Expand Down Expand Up @@ -2269,6 +2271,92 @@ static Register peekThroughBitcast(Register Reg,
return Reg;
}

/// Match a G_SHUFFLE_VECTOR whose two sources are each defined (possibly
/// behind bitcasts) by either a G_BUILD_VECTOR or a G_IMPLICIT_DEF.
///
/// On success \p Operands holds the scalar elements of the concatenation of
/// both sources, so a non-negative shuffle mask index can be used directly as
/// an index into it. Elements that come from an undefined source are recorded
/// as an invalid Register; the apply step materializes the undef value. No
/// elements are appended when both sources are undefined.
///
/// The match step neither creates registers nor builds instructions.
///
/// \returns true if \p MI can be rewritten by
/// applyCombineShuffleVectorBuildVector.
bool CombinerHelper::matchCombineShuffleVectorBuildVector(
    MachineInstr &MI, SmallVectorImpl<Register> &Operands) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
         "Expected a shuffle vector");
  auto &ShuffleVector = cast<GShuffleVector>(MI);
  const Register OrigSrcReg1 = ShuffleVector.getSourceReg(0);
  const Register OrigSrcReg2 = ShuffleVector.getSourceReg(1);
  const Register SrcReg1 = peekThroughBitcast(OrigSrcReg1, MRI);
  const Register SrcReg2 = peekThroughBitcast(OrigSrcReg2, MRI);

  // Check if the source registers are either build vectors or implicit
  // definitions.
  const auto *SrcInstr1 = getOpcodeDef<GBuildVector>(SrcReg1, MRI);
  const auto *SrcInstr2 = getOpcodeDef<GBuildVector>(SrcReg2, MRI);
  const MachineInstr *IsUndef1 =
      getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, SrcReg1, MRI);
  const MachineInstr *IsUndef2 =
      getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, SrcReg2, MRI);

  // Our inputs need to be either build vectors or undefined; plain register
  // inputs break this optimization. You could maybe do something clever where
  // you concatenate vectors to save half a build vector.
  if ((!SrcInstr1 && !IsUndef1) || (!SrcInstr2 && !IsUndef2))
    return false;

  // Looking through bitcasts may land on a build vector whose element type or
  // element count differs from the shuffle operand. The shuffle mask indexes
  // elements of the shuffle operand type, so such a build vector cannot be
  // indexed with it. (An undef stays undef whatever its type.)
  if (SrcInstr1 && MRI.getType(SrcReg1) != MRI.getType(OrigSrcReg1))
    return false;
  if (SrcInstr2 && MRI.getType(SrcReg2) != MRI.getType(OrigSrcReg2))
    return false;

  // Both undefined: the result is undefined too and no elements are needed.
  if (IsUndef1 && IsUndef2)
    return true;

  // A single-element mask yields a scalar result, which cannot be defined by
  // a G_BUILD_VECTOR.
  if (!MRI.getType(ShuffleVector.getReg(0)).isVector())
    return false;

  // Since the inputs to a shuffle vector must be of the same size, we can
  // reuse the element count of whichever source is a build vector.
  const unsigned NumElements = SrcInstr1 ? SrcInstr1->getNumSources()
                                         : SrcInstr2->getNumSources();
  assert((!SrcInstr1 || !SrcInstr2 ||
          SrcInstr1->getNumSources() == SrcInstr2->getNumSources()) &&
         "Shuffle vector sources must have the same number of elements");

  // Append the elements of one source; an undefined source contributes
  // invalid registers as placeholders.
  const auto AppendSource = [&](const GBuildVector *Src) {
    for (unsigned Idx = 0; Idx < NumElements; ++Idx)
      Operands.push_back(Src ? Src->getSourceReg(Idx) : Register());
  };
  AppendSource(SrcInstr1);
  AppendSource(SrcInstr2);

  return true;
}

/// Rewrite a G_SHUFFLE_VECTOR matched by matchCombineShuffleVectorBuildVector.
///
/// If both sources are undefined the result becomes a G_IMPLICIT_DEF.
/// Otherwise a G_BUILD_VECTOR is emitted whose operands are picked from
/// \p Operands by the shuffle mask. Negative mask entries, and entries that
/// select an element of an undefined source, use a single scalar
/// G_IMPLICIT_DEF that is only created when actually required. \p MI is erased
/// in both cases.
void CombinerHelper::applyCombineShuffleVectorBuildVector(
    MachineInstr &MI, SmallVectorImpl<Register> &Operands) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
         "Expected a shuffle vector");
  auto &ShuffleVector = cast<GShuffleVector>(MI);
  const Register DstReg = ShuffleVector.getReg(0);
  const Register SrcReg1 =
      peekThroughBitcast(ShuffleVector.getSourceReg(0), MRI);
  const Register SrcReg2 =
      peekThroughBitcast(ShuffleVector.getSourceReg(1), MRI);

  // Check if the source registers are implicit definitions.
  const MachineInstr *IsUndef1 =
      getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, SrcReg1, MRI);
  const MachineInstr *IsUndef2 =
      getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, SrcReg2, MRI);

  // If they're both undefined, we will just return an undefined as well.
  if (IsUndef1 && IsUndef2) {
    Builder.buildUndef(DstReg);
    MI.eraseFromParent();
    return;
  }

  // Take the element type from the destination: the peeked-through sources
  // may be undefs of an unrelated type.
  const LLT EltTy = MRI.getType(DstReg).getScalarType();
  const ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();

  // Lazily create one scalar undef shared by every undefined lane.
  Register UndefReg;
  const auto GetUndef = [&]() {
    if (!UndefReg.isValid())
      UndefReg = Builder.buildUndef(EltTy).getReg(0);
    return UndefReg;
  };

  SmallVector<Register, 8> Arguments;
  Arguments.reserve(ShuffleMask.size());
  for (const int Index : ShuffleMask) {
    Register Element;
    if (Index >= 0) {
      assert(static_cast<size_t>(Index) < Operands.size() &&
             "Shuffle mask index out of range");
      Element = Operands[Index];
    }
    // An invalid register means "undefined": either a negative mask entry or
    // a lane selected from an undefined source.
    Arguments.push_back(Element.isValid() ? Element : GetUndef());
  }

  Builder.buildBuildVector(DstReg, Arguments);
  MI.eraseFromParent();
}

bool CombinerHelper::matchCombineUnmergeMergeToPlainValues(
MachineInstr &MI, SmallVectorImpl<Register> &Operands) {
assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AArch64/AArch64Combine.td
Original file line number Diff line number Diff line change
Expand Up @@ -295,5 +295,5 @@ def AArch64PostLegalizerCombiner
ptr_add_immed_chain, overlapping_and,
split_store_zero_128, undef_combines,
select_to_minmax, or_to_bsp, combine_concat_vector,
commute_constant_to_rhs]> {
commute_constant_to_rhs, shufflevector_merge]> {
}
3 changes: 2 additions & 1 deletion llvm/lib/Target/AIE/AIECombine.td
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ def AIE2PreLegalizerCombiner
all_combines, combine_S20NarrowingOpt,
combine_globalval_offset,
combine_extract_vector_elt_and_zsa_ext,
combine_splat_vector ]> {
combine_splat_vector,
shufflevector_merge ]> {
let CombineAllMethodName = "tryCombineAllImpl";
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUCombine.td
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>;

def AMDGPUPreLegalizerCombiner: GICombiner<
"AMDGPUPreLegalizerCombinerImpl",
[all_combines, clamp_i64_to_i16, foldable_fneg]> {
[all_combines, clamp_i64_to_i16, foldable_fneg, shufflevector_merge]> {
let CombineAllMethodName = "tryCombineAllImpl";
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/RISCV/RISCVCombine.td
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
include "llvm/Target/GlobalISel/Combine.td"

def RISCVPreLegalizerCombiner: GICombiner<
"RISCVPreLegalizerCombinerImpl", [all_combines]> {
"RISCVPreLegalizerCombinerImpl", [all_combines, shufflevector_merge]> {
}

def RISCVO0PreLegalizerCombiner: GICombiner<
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,18 @@ declare i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32>) #0
define i32 @bar() {
; CHECK-LABEL: bar:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: mov b1, v0[1]
; CHECK-NEXT: mov b2, v0[2]
; CHECK-NEXT: mov b3, v0[3]
; CHECK-NEXT: mov.h v0[1], v1[0]
; CHECK-NEXT: mov w8, #0 ; =0x0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov.16b v1, v0
; CHECK-NEXT: mov.b v1[1], v0[0]
; CHECK-NEXT: mov.b v1[2], v0[0]
; CHECK-NEXT: mov.b v1[3], v0[0]
; CHECK-NEXT: mov b0, v1[1]
; CHECK-NEXT: mov b2, v1[2]
; CHECK-NEXT: mov b3, v1[3]
; CHECK-NEXT: mov.h v1[1], v0[0]
; CHECK-NEXT: mov.h v2[1], v3[0]
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: ushll.4s v0, v1, #0
; CHECK-NEXT: ushll.4s v1, v2, #0
; CHECK-NEXT: mov.d v0[1], v1[0]
; CHECK-NEXT: movi.4s v1, #1
Expand Down
75 changes: 28 additions & 47 deletions llvm/test/CodeGen/AArch64/shufflevector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -210,24 +210,14 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
; CHECK-GI-LABEL: shufflevector_v4i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov h2, v0.h[1]
; CHECK-GI-NEXT: mov h3, v1.h[1]
; CHECK-GI-NEXT: adrp x8, .LCPI15_0
; CHECK-GI-NEXT: mov h4, v0.h[2]
; CHECK-GI-NEXT: mov h5, v0.h[3]
; CHECK-GI-NEXT: mov h6, v1.h[3]
; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
; CHECK-GI-NEXT: mov h2, v1.h[2]
; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI15_0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov h0, v0.h[2]
; CHECK-GI-NEXT: mov v2.b[1], v0.b[0]
; CHECK-GI-NEXT: mov h0, v1.h[3]
; CHECK-GI-NEXT: mov v2.b[2], v1.b[0]
; CHECK-GI-NEXT: mov v2.b[3], v0.b[0]
; CHECK-GI-NEXT: fmov w0, s2
; CHECK-GI-NEXT: ret
%c = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 7>
%d = bitcast <4 x i8> %c to i32
Expand Down Expand Up @@ -280,14 +270,8 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v1.s[1]
; CHECK-GI-NEXT: adrp x8, .LCPI17_0
; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI17_0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
; CHECK-GI-NEXT: mov s0, v0.s[1]
; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
%c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> <i32 1, i32 2>
Expand Down Expand Up @@ -397,9 +381,12 @@ define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){
;
; CHECK-GI-LABEL: shufflevector_v4i8_zeroes:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: dup v0.8b, w8
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov v1.16b, v0.16b
; CHECK-GI-NEXT: mov v1.b[1], v0.b[0]
; CHECK-GI-NEXT: mov v1.b[2], v0.b[0]
; CHECK-GI-NEXT: mov v1.b[3], v0.b[0]
; CHECK-GI-NEXT: fmov w0, s1
; CHECK-GI-NEXT: ret
%c = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
%d = bitcast <4 x i8> %c to i32
Expand Down Expand Up @@ -433,8 +420,8 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){
;
; CHECK-GI-LABEL: shufflevector_v2i16_zeroes:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: dup v0.4h, w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov v0.h[1], v0.h[0]
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
%c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> <i32 0, i32 0>
Expand Down Expand Up @@ -492,20 +479,11 @@ define <3 x i8> @shufflevector_v3i8(<3 x i8> %a, <3 x i8> %b) {
;
; CHECK-GI-LABEL: shufflevector_v3i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: fmov s1, w1
; CHECK-GI-NEXT: adrp x8, .LCPI30_0
; CHECK-GI-NEXT: fmov s2, w3
; CHECK-GI-NEXT: fmov s3, w4
; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
; CHECK-GI-NEXT: fmov s0, w1
; CHECK-GI-NEXT: fmov s1, w2
; CHECK-GI-NEXT: mov v2.b[1], v3.b[0]
; CHECK-GI-NEXT: fmov s3, w5
; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
; CHECK-GI-NEXT: fmov s1, w4
; CHECK-GI-NEXT: mov v0.b[2], v1.b[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI30_0]
; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
; CHECK-GI-NEXT: mov v0.d[1], v2.d[0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
; CHECK-GI-NEXT: mov b1, v0.b[1]
; CHECK-GI-NEXT: mov b2, v0.b[2]
; CHECK-GI-NEXT: fmov w0, s0
Expand Down Expand Up @@ -614,11 +592,14 @@ define <3 x i8> @shufflevector_v3i8_zeroes(<3 x i8> %a, <3 x i8> %b) {
;
; CHECK-GI-LABEL: shufflevector_v3i8_zeroes:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: dup v0.8b, w0
; CHECK-GI-NEXT: mov b1, v0.b[1]
; CHECK-GI-NEXT: mov b2, v0.b[2]
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: fmov w1, s1
; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: mov v1.16b, v0.16b
; CHECK-GI-NEXT: mov v1.b[1], v0.b[0]
; CHECK-GI-NEXT: mov v1.b[2], v0.b[0]
; CHECK-GI-NEXT: mov b0, v1.b[1]
; CHECK-GI-NEXT: mov b2, v1.b[2]
; CHECK-GI-NEXT: fmov w0, s1
; CHECK-GI-NEXT: fmov w1, s0
; CHECK-GI-NEXT: fmov w2, s2
; CHECK-GI-NEXT: ret
%c = shufflevector <3 x i8> %a, <3 x i8> %b, <3 x i32> <i32 0, i32 0, i32 0>
Expand Down
Loading

0 comments on commit fee2d99

Please sign in to comment.