Skip to content

Commit

Permalink
[AArch64][SVE] optimisation for unary SVE store intrinsics with no ac…
Browse files Browse the repository at this point in the history
…tive lanes (llvm#95793)

This patch extends llvm#73964 and
adds optimisation of store SVE intrinsics when predicate is zero.
  • Loading branch information
Lukacma committed Jul 2, 2024
1 parent d7da0ae commit 9ceb45c
Show file tree
Hide file tree
Showing 2 changed files with 346 additions and 0 deletions.
36 changes: 36 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -985,6 +985,16 @@ static bool isAllActivePredicate(Value *Pred) {
m_ConstantInt<AArch64SVEPredPattern::all>()));
}

// Erase unary operation where predicate has all inactive lanes
static std::optional<Instruction *>
instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
int PredPos) {
if (match(II.getOperand(PredPos), m_ZeroInt())) {
return IC.eraseInstFromFunction(II);
}
return std::nullopt;
}

// Simplify unary operation where predicate has all inactive lanes by replacing
// instruction with zeroed object
static std::optional<Instruction *>
Expand Down Expand Up @@ -2007,6 +2017,32 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
default:
break;

case Intrinsic::aarch64_sve_st1_scatter:
case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
case Intrinsic::aarch64_sve_st1_scatter_sxtw:
case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
case Intrinsic::aarch64_sve_st1_scatter_uxtw:
case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
case Intrinsic::aarch64_sve_st1dq:
case Intrinsic::aarch64_sve_st1q_scatter_index:
case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
case Intrinsic::aarch64_sve_st1wq:
case Intrinsic::aarch64_sve_stnt1:
case Intrinsic::aarch64_sve_stnt1_scatter:
case Intrinsic::aarch64_sve_stnt1_scatter_index:
case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
return instCombineSVENoActiveUnaryErase(IC, II, 1);
case Intrinsic::aarch64_sve_st2:
case Intrinsic::aarch64_sve_st2q:
return instCombineSVENoActiveUnaryErase(IC, II, 2);
case Intrinsic::aarch64_sve_st3:
case Intrinsic::aarch64_sve_st3q:
return instCombineSVENoActiveUnaryErase(IC, II, 3);
case Intrinsic::aarch64_sve_st4:
case Intrinsic::aarch64_sve_st4q:
return instCombineSVENoActiveUnaryErase(IC, II, 4);
case Intrinsic::aarch64_sve_ld1_gather:
case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
case Intrinsic::aarch64_sve_ld1_gather_sxtw:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,310 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
;RUN: opt -S -passes=instcombine < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"

define void @test_st1(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: define void @test_st1(
; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
call void @llvm.aarch64.sve.st1.nxv16i8(<vscale x 16 x i8> %b, <vscale x 16 x i1> zeroinitializer, ptr %a)
ret void
}

define void @test_st1_scatter(<vscale x 2 x i16> %data_trunc, ptr %base, <vscale x 2 x i64> %b) {
; CHECK-LABEL: define void @test_st1_scatter(
; CHECK-SAME: <vscale x 2 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
call void @llvm.aarch64.sve.st1.scatter.nxv2i16(<vscale x 2 x i16> %data_trunc,
<vscale x 2 x i1> zeroinitializer,
ptr %base,
<vscale x 2 x i64> %b)
ret void
}

define void @test_st1_scatter_index(<vscale x 2 x i32> %data_trunc, ptr %base, <vscale x 2 x i64> %offsets) {
; CHECK-LABEL: define void @test_st1_scatter_index(
; CHECK-SAME: <vscale x 2 x i32> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFFSETS:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: call void @llvm.aarch64.sve.st1.scatter.index.nxv2i32(<vscale x 2 x i32> [[DATA_TRUNC]], <vscale x 2 x i1> zeroinitializer, ptr [[BASE]], <vscale x 2 x i64> [[OFFSETS]])
; CHECK-NEXT: ret void
;
entry:
call void @llvm.aarch64.sve.st1.scatter.index.nxv2i32(<vscale x 2 x i32> %data_trunc,
<vscale x 2 x i1> zeroinitializer,
ptr %base,
<vscale x 2 x i64> %offsets)
ret void
}

define void @test_st1_scatter_scalar_offset(<vscale x 4 x i8> %data_trunc, <vscale x 4 x i32> %base) {
; CHECK-LABEL: define void @test_st1_scatter_scalar_offset(
; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], <vscale x 4 x i32> [[BASE:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
call void @llvm.aarch64.sve.st1.scatter.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i8> %data_trunc,
<vscale x 4 x i1> zeroinitializer,
<vscale x 4 x i32> %base,
i64 16)
ret void
}

define void @test_st1_scatter_sxtw(<vscale x 4 x i8> %data_trunc, ptr %base, <vscale x 4 x i32> %offsets) {
; CHECK-LABEL: define void @test_st1_scatter_sxtw(
; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[OFFSETS:%.*]]) {
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
<vscale x 4 x i1> zeroinitializer,
ptr %base,
<vscale x 4 x i32> %offsets)
ret void
}

define void @test_st1_scatter_sxtw_index(<vscale x 4 x i16> %data_trunc, ptr %base, <vscale x 4 x i32> %indices) {
; CHECK-LABEL: define void @test_st1_scatter_sxtw_index(
; CHECK-SAME: <vscale x 4 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[INDICES:%.*]]) {
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i16(<vscale x 4 x i16> %data_trunc,
<vscale x 4 x i1> zeroinitializer,
ptr %base,
<vscale x 4 x i32> %indices)
ret void
}

define void @test_st1_scatter_uxtw(<vscale x 4 x i8> %data_trunc, ptr %base, <vscale x 4 x i32> %offsets) {
; CHECK-LABEL: define void @test_st1_scatter_uxtw(
; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[OFFSETS:%.*]]) {
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
<vscale x 4 x i1> zeroinitializer,
ptr %base,
<vscale x 4 x i32> %offsets)
ret void
}

define void @test_st1_scatter_uxtw_index(<vscale x 4 x i16> %data_trunc, ptr %base, <vscale x 4 x i32> %indices) {
; CHECK-LABEL: define void @test_st1_scatter_uxtw_index(
; CHECK-SAME: <vscale x 4 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[INDICES:%.*]]) {
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i16(<vscale x 4 x i16> %data_trunc,
<vscale x 4 x i1> zeroinitializer,
ptr %base,
<vscale x 4 x i32> %indices)
ret void
}

define void @test_st1dq(<vscale x 2 x i64> %zt, ptr %gep1) {
; CHECK-LABEL: define void @test_st1dq(
; CHECK-SAME: <vscale x 2 x i64> [[ZT:%.*]], ptr [[GEP1:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> %zt, <vscale x 1 x i1> zeroinitializer, ptr %gep1)
ret void
}

define void @test_st1q_scatter_index(<vscale x 8 x i16> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx) {
; CHECK-LABEL: define void @test_st1q_scatter_index(
; CHECK-SAME: <vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[PG:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
call void @llvm.aarch64.sve.st1q.scatter.index.nxv8i16(<vscale x 8 x i16> %data, <vscale x 1 x i1> zeroinitializer, ptr %base, <vscale x 2 x i64> %idx)
ret void
}

define void @test_st1q_scatter_scalar_offset(<vscale x 2 x i64> %data, <vscale x 2 x i64> %base) {
; CHECK-LABEL: define void @test_st1q_scatter_scalar_offset(
; CHECK-SAME: <vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i64> [[BASE:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> %data, <vscale x 1 x i1> zeroinitializer, <vscale x 2 x i64> %base, i64 0)
ret void
}

define void @test_st1q_scatter_vector_offset(<vscale x 8 x i16> %data, ptr %base, <vscale x 2 x i64> %off) {
; CHECK-LABEL: define void @test_st1q_scatter_vector_offset(
; CHECK-SAME: <vscale x 8 x i16> [[DATA:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8i16(<vscale x 8 x i16> %data, <vscale x 1 x i1> zeroinitializer, ptr %base, <vscale x 2 x i64> %off)
ret void
}

define void @test_st1wq(ptr %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: define void @test_st1wq(
; CHECK-SAME: ptr [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> %b, <vscale x 1 x i1> zeroinitializer, ptr %a)
ret void
}


define void @test_st2(ptr %a, <vscale x 8 x i32> %b) {
; CHECK-LABEL: define void @test_st2(
; CHECK-SAME: ptr [[A:%.*]], <vscale x 8 x i32> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
%0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 0)
%1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 4)
tail call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> zeroinitializer, ptr %a)
ret void
}

define void @test_st2q(ptr %a, <vscale x 8 x i32> %b) {
; CHECK-LABEL: define void @test_st2q(
; CHECK-SAME: ptr [[A:%.*]], <vscale x 8 x i32> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
%0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 0)
%1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 4)
tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> zeroinitializer, ptr %a)
ret void
}

define void @test_st3(ptr %a, <vscale x 12 x i32> %b) {
; CHECK-LABEL: define void @test_st3(
; CHECK-SAME: ptr [[A:%.*]], <vscale x 12 x i32> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
%0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 0)
%1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 4)
%2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 8)
tail call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> zeroinitializer, ptr %a)
ret void
}

define void @test_st3q(ptr %a, <vscale x 12 x i32> %b) {
; CHECK-LABEL: define void @test_st3q(
; CHECK-SAME: ptr [[A:%.*]], <vscale x 12 x i32> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
%0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 0)
%1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 4)
%2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 8)
tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> zeroinitializer, ptr %a)
ret void
}

define void @test_st4(ptr %a, <vscale x 16 x i32> %b) {
; CHECK-LABEL: define void @test_st4(
; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i32> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
%0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 0)
%1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 4)
%2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 8)
%3 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 12)
tail call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, <vscale x 4 x i1> zeroinitializer, ptr %a)
ret void
}

define void @test_st4q(ptr %a, <vscale x 16 x i32> %b) {
; CHECK-LABEL: define void @test_st4q(
; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i32> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
%0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 0)
%1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 4)
%2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 8)
%3 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 12)
tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, <vscale x 4 x i1> zeroinitializer, ptr %a)
ret void
}

define void @test_stnt1(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: define void @test_stnt1(
; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
call void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8> %b, <vscale x 16 x i1> zeroinitializer, ptr %a)
ret void
}

define void @test_stnt1_scatter(<vscale x 2 x i16> %data_trunc, ptr %base, <vscale x 2 x i64> %b) {
; CHECK-LABEL: define void @test_stnt1_scatter(
; CHECK-SAME: <vscale x 2 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
call void @llvm.aarch64.sve.stnt1.scatter.nxv2i16(<vscale x 2 x i16> %data_trunc,
<vscale x 2 x i1> zeroinitializer,
ptr %base,
<vscale x 2 x i64> %b)
ret void
}

define void @test_stnt1_scatter_index(<vscale x 2 x i32> %data_trunc, ptr %base, <vscale x 2 x i64> %offsets) {
; CHECK-LABEL: define void @test_stnt1_scatter_index(
; CHECK-SAME: <vscale x 2 x i32> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFFSETS:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i32(<vscale x 2 x i32> %data_trunc,
<vscale x 2 x i1> zeroinitializer,
ptr %base,
<vscale x 2 x i64> %offsets)
ret void
}

define void @test_stnt1_scatter_scalar_offset(<vscale x 4 x i8> %data_trunc, <vscale x 4 x i32> %base) {
; CHECK-LABEL: define void @test_stnt1_scatter_scalar_offset(
; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], <vscale x 4 x i32> [[BASE:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret void
;
entry:
call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i8> %data_trunc,
<vscale x 4 x i1> zeroinitializer,
<vscale x 4 x i32> %base,
i64 16)
ret void
}

define void @test_stnt1_scatter_uxtw(<vscale x 4 x i8> %data_trunc, ptr %base, <vscale x 4 x i32> %offsets) {
; CHECK-LABEL: define void @test_stnt1_scatter_uxtw(
; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[OFFSETS:%.*]]) {
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
<vscale x 4 x i1> zeroinitializer,
ptr %base,
<vscale x 4 x i32> %offsets)
ret void
}

0 comments on commit 9ceb45c

Please sign in to comment.