Skip to content

Commit 6935430

Browse files
arsenmssahasra
authored and committed
AMDGPU: Reapply ABI attribute patch series reverted in merge
The merge in c0ed73f reverted these 3 commits:

- 0197cd0 AMDGPU: Optimize amdgpu-no-* attributes
- db4963d AMDGPU: Use attributor to propagate uniform-work-group-size
- 722b8e0 AMDGPU: Invert ABI attribute handling

This reapplies them.

Change-Id: I8425c010f41ea319119cdf76853c86e59aa848e9
1 parent 235b688 commit 6935430

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+5025
-2606
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp

Lines changed: 5 additions & 288 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@
66
//
77
//===----------------------------------------------------------------------===//
88
//
9-
/// \file This pass adds target attributes to functions which use intrinsics
10-
/// which will impact calling convention lowering.
9+
/// \file This pass propagates the uniform-work-group-size attribute from
10+
/// kernels to leaf functions when possible. It also adds additional attributes
11+
/// to hint ABI lowering optimizations later.
1112
//
1213
//===----------------------------------------------------------------------===//
1314

@@ -25,22 +26,11 @@
2526
using namespace llvm;
2627

2728
namespace {
28-
static constexpr StringLiteral ImplicitAttrNames[] = {
29-
// X ids unnecessarily propagated to kernels.
30-
"amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
31-
"amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
32-
"amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
33-
"amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
34-
"amdgpu-queue-ptr", "amdgpu-implicitarg-ptr"};
35-
3629
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
3730
private:
3831
const TargetMachine *TM = nullptr;
39-
SmallVector<CallGraphNode*, 8> NodeList;
4032

4133
bool addFeatureAttributes(Function &F);
42-
bool processUniformWorkGroupAttribute();
43-
bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
4434

4535
public:
4636
static char ID;
@@ -58,12 +48,6 @@ class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
5848
AU.setPreservesAll();
5949
CallGraphSCCPass::getAnalysisUsage(AU);
6050
}
61-
62-
static bool visitConstantExpr(const ConstantExpr *CE);
63-
static bool visitConstantExprsRecursively(
64-
const Constant *EntryC,
65-
SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
66-
bool HasApertureRegs);
6751
};
6852

6953
} // end anonymous namespace
@@ -75,210 +59,11 @@ char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
7559
INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
7660
"Add AMDGPU function attributes", false, false)
7761

78-
79-
// The queue ptr is only needed when casting to flat, not from it.
80-
static bool castRequiresQueuePtr(unsigned SrcAS) {
81-
return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
82-
}
83-
84-
static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
85-
return castRequiresQueuePtr(ASC->getSrcAddressSpace());
86-
}
87-
88-
static bool isDSAddress(const Constant *C) {
89-
const GlobalValue *GV = dyn_cast<GlobalValue>(C);
90-
if (!GV)
91-
return false;
92-
unsigned AS = GV->getAddressSpace();
93-
return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
94-
}
95-
96-
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
97-
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
98-
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
99-
return castRequiresQueuePtr(SrcAS);
100-
}
101-
102-
return false;
103-
}
104-
105-
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
106-
const Constant *EntryC,
107-
SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
108-
bool IsFunc, bool HasApertureRegs) {
109-
110-
if (!ConstantExprVisited.insert(EntryC).second)
111-
return false;
112-
113-
SmallVector<const Constant *, 16> Stack;
114-
Stack.push_back(EntryC);
115-
116-
while (!Stack.empty()) {
117-
const Constant *C = Stack.pop_back_val();
118-
119-
// We need to trap on DS globals in non-entry functions.
120-
if (IsFunc && isDSAddress(C))
121-
return true;
122-
123-
// Check this constant expression.
124-
if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
125-
if (!HasApertureRegs && visitConstantExpr(CE))
126-
return true;
127-
}
128-
129-
// Visit all sub-expressions.
130-
for (const Use &U : C->operands()) {
131-
const auto *OpC = dyn_cast<Constant>(U);
132-
if (!OpC)
133-
continue;
134-
135-
if (!ConstantExprVisited.insert(OpC).second)
136-
continue;
137-
138-
Stack.push_back(OpC);
139-
}
140-
}
141-
142-
return false;
143-
}
144-
145-
// We do not need to note the x workitem or workgroup id because they are always
146-
// initialized.
147-
//
148-
// TODO: We should not add the attributes if the known compile time workgroup
149-
// size is 1 for y/z.
150-
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
151-
bool &NonKernelOnly,
152-
bool &IsQueuePtr) {
153-
switch (ID) {
154-
case Intrinsic::amdgcn_workitem_id_x:
155-
NonKernelOnly = true;
156-
return "amdgpu-work-item-id-x";
157-
case Intrinsic::amdgcn_workgroup_id_x:
158-
NonKernelOnly = true;
159-
return "amdgpu-work-group-id-x";
160-
case Intrinsic::amdgcn_workitem_id_y:
161-
case Intrinsic::r600_read_tidig_y:
162-
return "amdgpu-work-item-id-y";
163-
case Intrinsic::amdgcn_workitem_id_z:
164-
case Intrinsic::r600_read_tidig_z:
165-
return "amdgpu-work-item-id-z";
166-
case Intrinsic::amdgcn_workgroup_id_y:
167-
case Intrinsic::r600_read_tgid_y:
168-
return "amdgpu-work-group-id-y";
169-
case Intrinsic::amdgcn_workgroup_id_z:
170-
case Intrinsic::r600_read_tgid_z:
171-
return "amdgpu-work-group-id-z";
172-
case Intrinsic::amdgcn_dispatch_ptr:
173-
return "amdgpu-dispatch-ptr";
174-
case Intrinsic::amdgcn_dispatch_id:
175-
return "amdgpu-dispatch-id";
176-
case Intrinsic::amdgcn_implicitarg_ptr:
177-
return "amdgpu-implicitarg-ptr";
178-
case Intrinsic::amdgcn_queue_ptr:
179-
case Intrinsic::amdgcn_is_shared:
180-
case Intrinsic::amdgcn_is_private:
181-
// TODO: Does not require queue ptr on gfx9+
182-
case Intrinsic::trap:
183-
case Intrinsic::debugtrap:
184-
IsQueuePtr = true;
185-
return "amdgpu-queue-ptr";
186-
default:
187-
return "";
188-
}
189-
}
190-
191-
static bool handleAttr(Function &Parent, const Function &Callee,
192-
StringRef Name) {
193-
if (Callee.hasFnAttribute(Name)) {
194-
Parent.addFnAttr(Name);
195-
return true;
196-
}
197-
return false;
198-
}
199-
200-
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
201-
bool &NeedQueuePtr) {
202-
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
203-
NeedQueuePtr = true;
204-
205-
for (StringRef AttrName : ImplicitAttrNames)
206-
handleAttr(Parent, Callee, AttrName);
207-
}
208-
209-
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
210-
bool Changed = false;
211-
212-
for (auto *Node : reverse(NodeList)) {
213-
Function *Caller = Node->getFunction();
214-
215-
for (auto I : *Node) {
216-
Function *Callee = std::get<1>(I)->getFunction();
217-
if (Callee)
218-
Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
219-
}
220-
}
221-
222-
return Changed;
223-
}
224-
225-
bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
226-
Function &Caller, Function &Callee) {
227-
228-
// Check for externally defined function
229-
if (!Callee.hasExactDefinition()) {
230-
Callee.addFnAttr("uniform-work-group-size", "false");
231-
if (!Caller.hasFnAttribute("uniform-work-group-size"))
232-
Caller.addFnAttr("uniform-work-group-size", "false");
233-
234-
return true;
235-
}
236-
// Check if the Caller has the attribute
237-
if (Caller.hasFnAttribute("uniform-work-group-size")) {
238-
// Check if the value of the attribute is true
239-
if (Caller.getFnAttribute("uniform-work-group-size")
240-
.getValueAsString().equals("true")) {
241-
// Propagate the attribute to the Callee, if it does not have it
242-
if (!Callee.hasFnAttribute("uniform-work-group-size")) {
243-
Callee.addFnAttr("uniform-work-group-size", "true");
244-
return true;
245-
}
246-
} else {
247-
Callee.addFnAttr("uniform-work-group-size", "false");
248-
return true;
249-
}
250-
} else {
251-
// If the attribute is absent, set it as false
252-
Caller.addFnAttr("uniform-work-group-size", "false");
253-
Callee.addFnAttr("uniform-work-group-size", "false");
254-
return true;
255-
}
256-
return false;
257-
}
258-
25962
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
260-
const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
261-
bool HasApertureRegs = ST.hasApertureRegs();
262-
SmallPtrSet<const Constant *, 8> ConstantExprVisited;
263-
26463
bool HaveStackObjects = false;
26564
bool Changed = false;
266-
bool NeedQueuePtr = false;
26765
bool HaveCall = false;
268-
bool HasIndirectCall = false;
26966
bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
270-
CallingConv::ID CC = F.getCallingConv();
271-
bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
272-
273-
// If this function hasAddressTaken() = true
274-
// then add all attributes corresponding to the implicit args.
275-
if (CallingConvSupportsAllImplicits &&
276-
F.hasAddressTaken(nullptr, true, true, true)) {
277-
for (StringRef AttrName : ImplicitAttrNames) {
278-
F.addFnAttr(AttrName);
279-
}
280-
Changed = true;
281-
}
28267

28368
for (BasicBlock &BB : F) {
28469
for (Instruction &I : BB) {
@@ -293,59 +78,21 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
29378

29479
// Note the occurrence of indirect call.
29580
if (!Callee) {
296-
if (!CB->isInlineAsm()) {
297-
HasIndirectCall = true;
81+
if (!CB->isInlineAsm())
29882
HaveCall = true;
299-
}
83+
30084
continue;
30185
}
30286

30387
Intrinsic::ID IID = Callee->getIntrinsicID();
30488
if (IID == Intrinsic::not_intrinsic) {
30589
HaveCall = true;
306-
copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
30790
Changed = true;
308-
} else {
309-
bool NonKernelOnly = false;
310-
311-
StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
312-
NeedQueuePtr);
313-
if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
314-
F.addFnAttr(AttrName);
315-
Changed = true;
316-
}
317-
}
318-
}
319-
320-
if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
321-
continue;
322-
323-
if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
324-
if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
325-
NeedQueuePtr = true;
326-
continue;
327-
}
328-
}
329-
330-
for (const Use &U : I.operands()) {
331-
const auto *OpC = dyn_cast<Constant>(U);
332-
if (!OpC)
333-
continue;
334-
335-
if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
336-
HasApertureRegs)) {
337-
NeedQueuePtr = true;
338-
break;
33991
}
34092
}
34193
}
34294
}
34395

344-
if (NeedQueuePtr) {
345-
F.addFnAttr("amdgpu-queue-ptr");
346-
Changed = true;
347-
}
348-
34996
// TODO: We could refine this to captured pointers that could possibly be
35097
// accessed by flat instructions. For now this is mostly a poor way of
35198
// estimating whether there are calls before argument lowering.
@@ -359,43 +106,13 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
359106
Changed = true;
360107
}
361108

362-
// This pass cannot copy attributes from callees to callers
363-
// if there is an indirect call and in thus such cases,
364-
// hasAddressTaken() would be false for kernels and functions
365-
// making an indirect call (if they are themselves not indirectly called).
366-
// We must tag all such kernels/functions with all implicits attributes
367-
// for correctness.
368-
// e.g.
369-
// 1. Kernel K1 makes an indirect call to function F1.
370-
// Without detecting an indirect call in K1, this pass will not
371-
// add all implicit args to K1 (which is incorrect).
372-
// 2. Kernel K1 makes direct call to F1 which makes indirect call to function
373-
// F2.
374-
// Without detecting an indirect call in F1 (whose hasAddressTaken() is
375-
// false), the pass will not add all implicit args to F1 (which is
376-
// essential for correctness).
377-
if (CallingConvSupportsAllImplicits && HasIndirectCall) {
378-
for (StringRef AttrName : ImplicitAttrNames) {
379-
F.addFnAttr(AttrName);
380-
}
381-
Changed = true;
382-
}
383-
384109
return Changed;
385110
}
386111

387112
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
388113
bool Changed = false;
389114

390115
for (CallGraphNode *I : SCC) {
391-
// Build a list of CallGraphNodes from most number of uses to least
392-
if (I->getNumReferences())
393-
NodeList.push_back(I);
394-
else {
395-
processUniformWorkGroupAttribute();
396-
NodeList.clear();
397-
}
398-
399116
Function *F = I->getFunction();
400117
// Ignore functions with graphics calling conventions, these are currently
401118
// not allowed to have kernel arguments.

0 commit comments

Comments (0)