Skip to content

Commit 6725e2e

Browse files
authored
Merge branch 'release/rocm-rel-7.1' into revert-4632-revert-4555-amd/dev/macurtis/release/rocm-rel-7.1/builtin-global-load-store
2 parents 707b41b + c82f577 commit 6725e2e

File tree

9 files changed

+682
-62
lines changed

9 files changed

+682
-62
lines changed

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 4 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
6464
}
6565

6666
MayNeedAGPRs = ST.hasMAIInsts();
67+
if (ST.hasGFX90AInsts() &&
68+
ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
69+
!mayUseAGPRs(F))
70+
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
6771

6872
if (AMDGPU::isChainCC(CC)) {
6973
// Chain functions don't receive an SP from their caller, but are free to
@@ -100,11 +104,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
100104
ImplicitArgPtr = false;
101105
MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
102106
MaxKernArgAlign);
103-
104-
if (ST.hasGFX90AInsts() &&
105-
ST.getMaxNumVGPRs(F) <= ST.getAddressableNumArchVGPRs() &&
106-
!mayUseAGPRs(F))
107-
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
108107
}
109108

110109
if (!AMDGPU::isGraphics(CC) ||
@@ -787,44 +786,3 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
787786
bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
788787
return !F.hasFnAttribute("amdgpu-no-agpr");
789788
}
790-
791-
bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
792-
if (UsesAGPRs)
793-
return *UsesAGPRs;
794-
795-
if (!mayNeedAGPRs()) {
796-
UsesAGPRs = false;
797-
return false;
798-
}
799-
800-
if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) ||
801-
MF.getFrameInfo().hasCalls()) {
802-
UsesAGPRs = true;
803-
return true;
804-
}
805-
806-
const MachineRegisterInfo &MRI = MF.getRegInfo();
807-
808-
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
809-
const Register Reg = Register::index2VirtReg(I);
810-
const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
811-
if (RC && SIRegisterInfo::isAGPRClass(RC)) {
812-
UsesAGPRs = true;
813-
return true;
814-
}
815-
if (!RC && !MRI.use_empty(Reg) && MRI.getType(Reg).isValid()) {
816-
// Defer caching UsesAGPRs; the function might not have been regbank selected yet.
817-
return true;
818-
}
819-
}
820-
821-
for (MCRegister Reg : AMDGPU::AGPR_32RegClass) {
822-
if (MRI.isPhysRegUsed(Reg)) {
823-
UsesAGPRs = true;
824-
return true;
825-
}
826-
}
827-
828-
UsesAGPRs = false;
829-
return false;
830-
}

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -494,8 +494,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
494494
// scheduler stage.
495495
unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
496496

497-
mutable std::optional<bool> UsesAGPRs;
498-
499497
MCPhysReg getNextUserSGPR() const;
500498

501499
MCPhysReg getNextSystemSGPR() const;
@@ -1136,9 +1134,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
11361134
// has a call which may use it.
11371135
bool mayUseAGPRs(const Function &F) const;
11381136

1139-
// \returns true if a function needs or may need AGPRs.
1140-
bool usesAGPRs(const MachineFunction &MF) const;
1141-
11421137
/// \returns Default/requested number of work groups for this function.
11431138
SmallVector<unsigned> getMaxNumWorkGroups() const { return MaxNumWorkGroups; }
11441139

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,7 @@ SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
594594
// TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
595595
// register file accordingly.
596596
if (ST.hasGFX90AInsts()) {
597-
if (MFI->usesAGPRs(MF)) {
597+
if (MFI->mayNeedAGPRs()) {
598598
MaxNumVGPRs /= 2;
599599
MaxNumAGPRs = MaxNumVGPRs;
600600
} else {

llvm/lib/Transforms/Scalar/SROA.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@
8383
#include "llvm/Transforms/Scalar.h"
8484
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
8585
#include "llvm/Transforms/Utils/Local.h"
86+
#include "llvm/TargetParser/Triple.h"
8687
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
8788
#include "llvm/Transforms/Utils/SSAUpdater.h"
8889
#include <algorithm>
@@ -4905,6 +4906,34 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
49054906
// FIXME: We might want to defer PHI speculation until after here.
49064907
// FIXME: return nullptr;
49074908
} else {
4909+
// AMDGPU: If the target is AMDGPU and the chosen SliceTy is a HIP vector
4910+
// struct of 2 or 4 identical elements, canonicalize it to an IR vector.
4911+
// This helps SROA treat it as a single value and unlock vector loads/stores.
4912+
// We pattern-match struct names starting with "struct.HIP_vector".
4913+
if (Function *F = AI.getFunction()) {
4914+
Triple TT(F->getParent()->getTargetTriple());
4915+
if (TT.isAMDGPU()) {
4916+
if (auto *STy = dyn_cast<StructType>(SliceTy)) {
4917+
StringRef Name = STy->hasName() ? STy->getName() : StringRef();
4918+
if (Name.starts_with("struct.HIP_vector")) {
4919+
unsigned NumElts = STy->getNumElements();
4920+
if ((NumElts == 2 || NumElts == 4) && NumElts > 0) {
4921+
Type *EltTy = STy->getElementType(0);
4922+
bool AllSame = true;
4923+
for (unsigned I = 1; I < NumElts; ++I)
4924+
if (STy->getElementType(I) != EltTy) {
4925+
AllSame = false;
4926+
break;
4927+
}
4928+
if (AllSame && VectorType::isValidElementType(EltTy)) {
4929+
SliceTy = FixedVectorType::get(EltTy, NumElts);
4930+
}
4931+
}
4932+
}
4933+
}
4934+
}
4935+
}
4936+
49084937
// Make sure the alignment is compatible with P.beginOffset().
49094938
const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
49104939
// If we will get at least this much alignment from the type alone, leave
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
; REQUIRES: asserts
2+
; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %s 2>&1 | FileCheck -check-prefix=CRASH %s
3+
4+
; CRASH: error: <unknown>:0:0: no registers from class available to allocate in function 'no_free_vgprs_at_agpr_to_agpr_copy'
5+
; CRASH: Cannot access invalid iterator
6+
7+
define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
8+
%asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1", "=${v[0:31]},=${a[0:15]}"()
9+
%vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0
10+
%agpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 1
11+
%mfma = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr0, i32 0, i32 0, i32 0)
12+
%agpr1 = call i32 asm sideeffect "; copy ", "={a1},a,~{a[0:15]}"(<16 x float> %agpr0)
13+
%agpr2 = call i32 asm sideeffect "; copy ", "={a2},a,{a[0:15]}"(i32 %agpr1, <16 x float> %mfma)
14+
call void asm sideeffect "; use $0 $1", "{a3},{v[0:31]}"(i32 %agpr2, <32 x i32> %vgpr0)
15+
ret void
16+
}
17+
18+
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #1
19+
declare noundef i32 @llvm.amdgcn.workitem.id.x() #2
20+
21+
attributes #0 = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="6,6" }
22+
attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
23+
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
240240
}
241241

242242
; Check that we do make use of v32 if there are no AGPRs present in the function
243-
define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 {
243+
define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #5 {
244244
; GFX908-LABEL: no_agpr_no_reserve:
245245
; GFX908: ; %bb.0:
246246
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -1145,5 +1145,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2
11451145
attributes #0 = { "amdgpu-waves-per-eu"="6,6" }
11461146
attributes #1 = { convergent nounwind readnone willreturn }
11471147
attributes #2 = { nounwind readnone willreturn }
1148-
attributes #3 = { "amdgpu-waves-per-eu"="7,7" }
1148+
attributes #3 = { "amdgpu-waves-per-eu"="7,7" "amdgpu-no-agpr" }
11491149
attributes #4 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="1024,1024" }
1150+
attributes #5 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-no-agpr" }

llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,20 @@ bb3:
9494
ret void
9595
}
9696

97-
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry:
97+
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry_noagpr:
98+
; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
99+
; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
100+
define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 {
101+
bb:
102+
%in.1 = load <32 x float>, ptr addrspace(1) %arg
103+
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
104+
store <32 x float> %mai.1, ptr addrspace(1) %arg
105+
ret void
106+
}
107+
108+
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry_with_agpr:
98109
; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
99-
define void @test_mfma_f32_32x32x1f32_nonentry(ptr addrspace(1) %arg) #0 {
110+
define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) #3 {
100111
bb:
101112
%in.1 = load <32 x float>, ptr addrspace(1) %arg
102113
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -109,3 +120,4 @@ declare void @foo()
109120
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" "amdgpu-no-agpr" }
110121
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
111122
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" }
123+
attributes #3 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }

llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
ret void
77
}
88

9-
attributes #0 = { "amdgpu-waves-per-eu"="8,8" }
9+
attributes #0 = { "amdgpu-waves-per-eu"="8,8" "amdgpu-no-agpr" }
1010
...
1111

1212
---

0 commit comments

Comments
 (0)