66//
77// ===----------------------------------------------------------------------===//
88//
9- // / \file This pass adds target attributes to functions which use intrinsics
10- // / which will impact calling convention lowering.
9+ // / \file This pass propagates the uniform-work-group-size attribute from
10+ // / kernels to leaf functions when possible. It also adds additional attributes
11+ // / to hint ABI lowering optimizations later.
1112//
1213// ===----------------------------------------------------------------------===//
1314
2526using namespace llvm ;
2627
2728namespace {
28- static constexpr StringLiteral ImplicitAttrNames[] = {
29- // X ids unnecessarily propagated to kernels.
30- " amdgpu-work-item-id-x" , " amdgpu-work-item-id-y" ,
31- " amdgpu-work-item-id-z" , " amdgpu-work-group-id-x" ,
32- " amdgpu-work-group-id-y" , " amdgpu-work-group-id-z" ,
33- " amdgpu-dispatch-ptr" , " amdgpu-dispatch-id" ,
34- " amdgpu-queue-ptr" , " amdgpu-implicitarg-ptr" };
35-
3629class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
3730private:
3831 const TargetMachine *TM = nullptr ;
39- SmallVector<CallGraphNode*, 8 > NodeList;
4032
4133 bool addFeatureAttributes (Function &F);
42- bool processUniformWorkGroupAttribute ();
43- bool propagateUniformWorkGroupAttribute (Function &Caller, Function &Callee);
4434
4535public:
4636 static char ID;
@@ -58,12 +48,6 @@ class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
5848 AU.setPreservesAll ();
5949 CallGraphSCCPass::getAnalysisUsage (AU);
6050 }
61-
62- static bool visitConstantExpr (const ConstantExpr *CE);
63- static bool visitConstantExprsRecursively (
64- const Constant *EntryC,
65- SmallPtrSet<const Constant *, 8 > &ConstantExprVisited, bool IsFunc,
66- bool HasApertureRegs);
6751};
6852
6953} // end anonymous namespace
@@ -75,210 +59,11 @@ char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
7559INITIALIZE_PASS (AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
7660 " Add AMDGPU function attributes" , false , false )
7761
78-
79- // The queue ptr is only needed when casting to flat, not from it.
80- static bool castRequiresQueuePtr(unsigned SrcAS) {
81- return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
82- }
83-
84- static bool castRequiresQueuePtr (const AddrSpaceCastInst *ASC) {
85- return castRequiresQueuePtr (ASC->getSrcAddressSpace ());
86- }
87-
88- static bool isDSAddress (const Constant *C) {
89- const GlobalValue *GV = dyn_cast<GlobalValue>(C);
90- if (!GV)
91- return false ;
92- unsigned AS = GV->getAddressSpace ();
93- return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
94- }
95-
96- bool AMDGPUAnnotateKernelFeatures::visitConstantExpr (const ConstantExpr *CE) {
97- if (CE->getOpcode () == Instruction::AddrSpaceCast) {
98- unsigned SrcAS = CE->getOperand (0 )->getType ()->getPointerAddressSpace ();
99- return castRequiresQueuePtr (SrcAS);
100- }
101-
102- return false ;
103- }
104-
105- bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively (
106- const Constant *EntryC,
107- SmallPtrSet<const Constant *, 8 > &ConstantExprVisited,
108- bool IsFunc, bool HasApertureRegs) {
109-
110- if (!ConstantExprVisited.insert (EntryC).second )
111- return false ;
112-
113- SmallVector<const Constant *, 16 > Stack;
114- Stack.push_back (EntryC);
115-
116- while (!Stack.empty ()) {
117- const Constant *C = Stack.pop_back_val ();
118-
119- // We need to trap on DS globals in non-entry functions.
120- if (IsFunc && isDSAddress (C))
121- return true ;
122-
123- // Check this constant expression.
124- if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
125- if (!HasApertureRegs && visitConstantExpr (CE))
126- return true ;
127- }
128-
129- // Visit all sub-expressions.
130- for (const Use &U : C->operands ()) {
131- const auto *OpC = dyn_cast<Constant>(U);
132- if (!OpC)
133- continue ;
134-
135- if (!ConstantExprVisited.insert (OpC).second )
136- continue ;
137-
138- Stack.push_back (OpC);
139- }
140- }
141-
142- return false ;
143- }
144-
145- // We do not need to note the x workitem or workgroup id because they are always
146- // initialized.
147- //
148- // TODO: We should not add the attributes if the known compile time workgroup
149- // size is 1 for y/z.
150- static StringRef intrinsicToAttrName (Intrinsic::ID ID,
151- bool &NonKernelOnly,
152- bool &IsQueuePtr) {
153- switch (ID) {
154- case Intrinsic::amdgcn_workitem_id_x:
155- NonKernelOnly = true ;
156- return " amdgpu-work-item-id-x" ;
157- case Intrinsic::amdgcn_workgroup_id_x:
158- NonKernelOnly = true ;
159- return " amdgpu-work-group-id-x" ;
160- case Intrinsic::amdgcn_workitem_id_y:
161- case Intrinsic::r600_read_tidig_y:
162- return " amdgpu-work-item-id-y" ;
163- case Intrinsic::amdgcn_workitem_id_z:
164- case Intrinsic::r600_read_tidig_z:
165- return " amdgpu-work-item-id-z" ;
166- case Intrinsic::amdgcn_workgroup_id_y:
167- case Intrinsic::r600_read_tgid_y:
168- return " amdgpu-work-group-id-y" ;
169- case Intrinsic::amdgcn_workgroup_id_z:
170- case Intrinsic::r600_read_tgid_z:
171- return " amdgpu-work-group-id-z" ;
172- case Intrinsic::amdgcn_dispatch_ptr:
173- return " amdgpu-dispatch-ptr" ;
174- case Intrinsic::amdgcn_dispatch_id:
175- return " amdgpu-dispatch-id" ;
176- case Intrinsic::amdgcn_implicitarg_ptr:
177- return " amdgpu-implicitarg-ptr" ;
178- case Intrinsic::amdgcn_queue_ptr:
179- case Intrinsic::amdgcn_is_shared:
180- case Intrinsic::amdgcn_is_private:
181- // TODO: Does not require queue ptr on gfx9+
182- case Intrinsic::trap:
183- case Intrinsic::debugtrap:
184- IsQueuePtr = true ;
185- return " amdgpu-queue-ptr" ;
186- default :
187- return " " ;
188- }
189- }
190-
191- static bool handleAttr (Function &Parent, const Function &Callee,
192- StringRef Name) {
193- if (Callee.hasFnAttribute (Name)) {
194- Parent.addFnAttr (Name);
195- return true ;
196- }
197- return false ;
198- }
199-
200- static void copyFeaturesToFunction (Function &Parent, const Function &Callee,
201- bool &NeedQueuePtr) {
202- if (handleAttr (Parent, Callee, " amdgpu-queue-ptr" ))
203- NeedQueuePtr = true ;
204-
205- for (StringRef AttrName : ImplicitAttrNames)
206- handleAttr (Parent, Callee, AttrName);
207- }
208-
209- bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute () {
210- bool Changed = false ;
211-
212- for (auto *Node : reverse (NodeList)) {
213- Function *Caller = Node->getFunction ();
214-
215- for (auto I : *Node) {
216- Function *Callee = std::get<1 >(I)->getFunction ();
217- if (Callee)
218- Changed = propagateUniformWorkGroupAttribute (*Caller, *Callee);
219- }
220- }
221-
222- return Changed;
223- }
224-
225- bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute (
226- Function &Caller, Function &Callee) {
227-
228- // Check for externally defined function
229- if (!Callee.hasExactDefinition ()) {
230- Callee.addFnAttr (" uniform-work-group-size" , " false" );
231- if (!Caller.hasFnAttribute (" uniform-work-group-size" ))
232- Caller.addFnAttr (" uniform-work-group-size" , " false" );
233-
234- return true ;
235- }
236- // Check if the Caller has the attribute
237- if (Caller.hasFnAttribute (" uniform-work-group-size" )) {
238- // Check if the value of the attribute is true
239- if (Caller.getFnAttribute (" uniform-work-group-size" )
240- .getValueAsString ().equals (" true" )) {
241- // Propagate the attribute to the Callee, if it does not have it
242- if (!Callee.hasFnAttribute (" uniform-work-group-size" )) {
243- Callee.addFnAttr (" uniform-work-group-size" , " true" );
244- return true ;
245- }
246- } else {
247- Callee.addFnAttr (" uniform-work-group-size" , " false" );
248- return true ;
249- }
250- } else {
251- // If the attribute is absent, set it as false
252- Caller.addFnAttr (" uniform-work-group-size" , " false" );
253- Callee.addFnAttr (" uniform-work-group-size" , " false" );
254- return true ;
255- }
256- return false ;
257- }
258-
25962bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
260- const GCNSubtarget &ST = TM->getSubtarget <GCNSubtarget>(F);
261- bool HasApertureRegs = ST.hasApertureRegs ();
262- SmallPtrSet<const Constant *, 8 > ConstantExprVisited;
263-
26463 bool HaveStackObjects = false ;
26564 bool Changed = false ;
266- bool NeedQueuePtr = false ;
26765 bool HaveCall = false ;
268- bool HasIndirectCall = false ;
26966 bool IsFunc = !AMDGPU::isEntryFunctionCC (F.getCallingConv ());
270- CallingConv::ID CC = F.getCallingConv ();
271- bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
272-
273- // If this function hasAddressTaken() = true
274- // then add all attributes corresponding to the implicit args.
275- if (CallingConvSupportsAllImplicits &&
276- F.hasAddressTaken (nullptr , true , true , true )) {
277- for (StringRef AttrName : ImplicitAttrNames) {
278- F.addFnAttr (AttrName);
279- }
280- Changed = true ;
281- }
28267
28368 for (BasicBlock &BB : F) {
28469 for (Instruction &I : BB) {
@@ -293,59 +78,21 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
29378
29479 // Note the occurrence of indirect call.
29580 if (!Callee) {
296- if (!CB->isInlineAsm ()) {
297- HasIndirectCall = true ;
81+ if (!CB->isInlineAsm ())
29882 HaveCall = true ;
299- }
83+
30084 continue ;
30185 }
30286
30387 Intrinsic::ID IID = Callee->getIntrinsicID ();
30488 if (IID == Intrinsic::not_intrinsic) {
30589 HaveCall = true ;
306- copyFeaturesToFunction (F, *Callee, NeedQueuePtr);
30790 Changed = true ;
308- } else {
309- bool NonKernelOnly = false ;
310-
311- StringRef AttrName = intrinsicToAttrName (IID, NonKernelOnly,
312- NeedQueuePtr);
313- if (!AttrName.empty () && (IsFunc || !NonKernelOnly)) {
314- F.addFnAttr (AttrName);
315- Changed = true ;
316- }
317- }
318- }
319-
320- if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
321- continue ;
322-
323- if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
324- if (!HasApertureRegs && castRequiresQueuePtr (ASC)) {
325- NeedQueuePtr = true ;
326- continue ;
327- }
328- }
329-
330- for (const Use &U : I.operands ()) {
331- const auto *OpC = dyn_cast<Constant>(U);
332- if (!OpC)
333- continue ;
334-
335- if (visitConstantExprsRecursively (OpC, ConstantExprVisited, IsFunc,
336- HasApertureRegs)) {
337- NeedQueuePtr = true ;
338- break ;
33991 }
34092 }
34193 }
34294 }
34395
344- if (NeedQueuePtr) {
345- F.addFnAttr (" amdgpu-queue-ptr" );
346- Changed = true ;
347- }
348-
34996 // TODO: We could refine this to captured pointers that could possibly be
35097 // accessed by flat instructions. For now this is mostly a poor way of
35198 // estimating whether there are calls before argument lowering.
@@ -359,43 +106,13 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
359106 Changed = true ;
360107 }
361108
362- // This pass cannot copy attributes from callees to callers
363- // if there is an indirect call and in thus such cases,
364- // hasAddressTaken() would be false for kernels and functions
365- // making an indirect call (if they are themselves not indirectly called).
366- // We must tag all such kernels/functions with all implicits attributes
367- // for correctness.
368- // e.g.
369- // 1. Kernel K1 makes an indirect call to function F1.
370- // Without detecting an indirect call in K1, this pass will not
371- // add all implicit args to K1 (which is incorrect).
372- // 2. Kernel K1 makes direct call to F1 which makes indirect call to function
373- // F2.
374- // Without detecting an indirect call in F1 (whose hasAddressTaken() is
375- // false), the pass will not add all implicit args to F1 (which is
376- // essential for correctness).
377- if (CallingConvSupportsAllImplicits && HasIndirectCall) {
378- for (StringRef AttrName : ImplicitAttrNames) {
379- F.addFnAttr (AttrName);
380- }
381- Changed = true ;
382- }
383-
384109 return Changed;
385110}
386111
387112bool AMDGPUAnnotateKernelFeatures::runOnSCC (CallGraphSCC &SCC) {
388113 bool Changed = false ;
389114
390115 for (CallGraphNode *I : SCC) {
391- // Build a list of CallGraphNodes from most number of uses to least
392- if (I->getNumReferences ())
393- NodeList.push_back (I);
394- else {
395- processUniformWorkGroupAttribute ();
396- NodeList.clear ();
397- }
398-
399116 Function *F = I->getFunction ();
400117 // Ignore functions with graphics calling conventions, these are currently
401118 // not allowed to have kernel arguments.
0 commit comments