Skip to content

Commit da70f2c

Browse files
authored
[flang][cuda] Lower ALLOCATE for device variable (#88980)
Replace the runtime call to `AllocatableAllocate` for CUDA device variables with the newly added `fir.cuda_allocate` operation.
1 parent a88ea8f commit da70f2c

File tree

2 files changed

+154
-10
lines changed

2 files changed

+154
-10
lines changed

flang/lib/Lower/Allocatable.cpp

+47-10
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "flang/Evaluate/tools.h"
1515
#include "flang/Lower/AbstractConverter.h"
1616
#include "flang/Lower/ConvertType.h"
17+
#include "flang/Lower/ConvertVariable.h"
1718
#include "flang/Lower/IterationSpace.h"
1819
#include "flang/Lower/Mangler.h"
1920
#include "flang/Lower/OpenACC.h"
@@ -368,20 +369,17 @@ class AllocateStmtHelper {
368369
[&](const Fortran::parser::AllocOpt::Mold &mold) {
369370
moldExpr = Fortran::semantics::GetExpr(mold.v.value());
370371
},
371-
[&](const Fortran::parser::AllocOpt::Stream &) {
372-
TODO(loc, "CUDA ALLOCATE(STREAM=)");
372+
[&](const Fortran::parser::AllocOpt::Stream &stream) {
373+
streamExpr = Fortran::semantics::GetExpr(stream.v.value());
373374
},
374-
[&](const Fortran::parser::AllocOpt::Pinned &) {
375-
TODO(loc, "CUDA ALLOCATE(PINNED=)");
375+
[&](const Fortran::parser::AllocOpt::Pinned &pinned) {
376+
pinnedExpr = Fortran::semantics::GetExpr(pinned.v.value());
376377
},
377378
},
378379
allocOption.u);
379380
}
380381

381382
void lowerAllocation(const Allocation &alloc) {
382-
if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
383-
TODO(loc, "Allocation of variable with CUDA attributes");
384-
385383
fir::MutableBoxValue boxAddr =
386384
genMutableBoxValue(converter, loc, alloc.getAllocObj());
387385

@@ -456,7 +454,8 @@ class AllocateStmtHelper {
456454
const fir::MutableBoxValue &box) {
457455
if (!box.isDerived() && !errorManager.hasStatSpec() &&
458456
!alloc.type.IsPolymorphic() && !alloc.hasCoarraySpec() &&
459-
!useAllocateRuntime && !box.isPointer()) {
457+
!useAllocateRuntime && !box.isPointer() &&
458+
!Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) {
460459
// Pointers must use PointerAllocate so that their deallocations
461460
// can be validated.
462461
genInlinedAllocation(alloc, box);
@@ -472,7 +471,12 @@ class AllocateStmtHelper {
472471
genSetType(alloc, box, loc);
473472
genSetDeferredLengthParameters(alloc, box);
474473
genAllocateObjectBounds(alloc, box);
475-
mlir::Value stat = genRuntimeAllocate(builder, loc, box, errorManager);
474+
mlir::Value stat;
475+
if (!Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
476+
stat = genRuntimeAllocate(builder, loc, box, errorManager);
477+
else
478+
stat =
479+
genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
476480
fir::factory::syncMutableBoxFromIRBox(builder, loc, box);
477481
postAllocationAction(alloc);
478482
errorManager.assignStat(builder, loc, stat);
@@ -602,7 +606,10 @@ class AllocateStmtHelper {
602606
genSetDeferredLengthParameters(alloc, box);
603607
genAllocateObjectBounds(alloc, box);
604608
mlir::Value stat;
605-
if (isSource)
609+
if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
610+
stat =
611+
genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
612+
else if (isSource)
606613
stat = genRuntimeAllocateSource(builder, loc, box, exv, errorManager);
607614
else
608615
stat = genRuntimeAllocate(builder, loc, box, errorManager);
@@ -717,13 +724,43 @@ class AllocateStmtHelper {
717724
return nullptr;
718725
}
719726

727+
/// Generate a fir::CUDAAllocateOp for \p box in place of a runtime allocate
/// call, and return the op's result.
///
/// The CUDA data attribute attached to the op is translated from \p sym.
/// The optional operands are null unless the matching expression was
/// recorded on this helper:
///  - errmsg: errorManager.errMsgAddr, when errMsgExpr is set.
///  - stream: streamExpr lowered as a value.
///  - pinned: pinnedExpr lowered as an address.
///  - source: the base of sourceExv, when sourceExpr is set.
/// The final (unit attribute) argument is set only when \p errorManager
/// reports a STAT= specifier.
mlir::Value genCudaAllocate(fir::FirOpBuilder &builder, mlir::Location loc,
728+
const fir::MutableBoxValue &box,
729+
ErrorManager &errorManager,
730+
const Fortran::semantics::Symbol &sym) {
731+
Fortran::lower::StatementContext stmtCtx;
732+
fir::CUDADataAttributeAttr cudaAttr =
733+
Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
734+
sym);
735+
mlir::Value errmsg = errMsgExpr ? errorManager.errMsgAddr : nullptr;
736+
mlir::Value stream =
737+
streamExpr
738+
? fir::getBase(converter.genExprValue(loc, *streamExpr, stmtCtx))
739+
: nullptr;
740+
mlir::Value pinned =
741+
pinnedExpr
742+
? fir::getBase(converter.genExprAddr(loc, *pinnedExpr, stmtCtx))
743+
: nullptr;
744+
mlir::Value source = sourceExpr ? fir::getBase(sourceExv) : nullptr;
745+
746+
// Keep return type the same as a standard AllocatableAllocate call.
747+
mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
748+
return builder
749+
.create<fir::CUDAAllocateOp>(
750+
loc, retTy, box.getAddr(), errmsg, stream, pinned, source, cudaAttr,
751+
errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr)
752+
.getResult();
753+
}
754+
720755
Fortran::lower::AbstractConverter &converter;
721756
fir::FirOpBuilder &builder;
722757
const Fortran::parser::AllocateStmt &stmt;
723758
const Fortran::lower::SomeExpr *sourceExpr{nullptr};
724759
const Fortran::lower::SomeExpr *moldExpr{nullptr};
725760
const Fortran::lower::SomeExpr *statExpr{nullptr};
726761
const Fortran::lower::SomeExpr *errMsgExpr{nullptr};
762+
const Fortran::lower::SomeExpr *pinnedExpr{nullptr};
763+
const Fortran::lower::SomeExpr *streamExpr{nullptr};
727764
// If the allocate has a type spec, lenParams contains the
728765
// value of the length parameters that were specified inside.
729766
llvm::SmallVector<mlir::Value> lenParams;
+107
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
2+
3+
! Test lowering of CUDA allocatable allocate/deallocate statements.
4+
5+
subroutine sub1()
6+
real, allocatable, device :: a(:)
7+
allocate(a(10))
8+
end subroutine
9+
10+
! CHECK-LABEL: func.func @_QPsub1()
11+
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
12+
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
13+
! CHECK: fir.call @_FortranAAllocatableSetBounds
14+
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
15+
16+
subroutine sub2()
17+
real, allocatable, managed :: a(:)
18+
integer :: istat
19+
allocate(a(10), stat=istat)
20+
end subroutine
21+
22+
! CHECK-LABEL: func.func @_QPsub2()
23+
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub2Ea"}
24+
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub2Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
25+
! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub2Eistat"}
26+
! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub2Eistat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
27+
! CHECK: fir.call @_FortranAAllocatableSetBounds
28+
! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<managed>, hasStat} -> i32
29+
! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
30+
31+
subroutine sub3()
32+
integer, allocatable, pinned :: a(:,:)
33+
logical :: plog
34+
allocate(a(20,30), pinned = plog)
35+
end subroutine
36+
37+
! CHECK-LABEL: func.func @_QPsub3()
38+
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xi32>>> {bindc_name = "a", uniq_name = "_QFsub3Ea"}
39+
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub3Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
40+
! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsub3Eplog"}
41+
! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %[[PLOG]] {uniq_name = "_QFsub3Eplog"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
42+
! CHECK-COUNT-2: fir.call @_FortranAAllocatableSetBounds
43+
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> pinned(%[[PLOG_DECL]]#1 : !fir.ref<!fir.logical<4>>) {cuda_attr = #fir.cuda<pinned>} -> i32
44+
45+
subroutine sub4()
46+
real, allocatable, unified :: a(:)
47+
integer :: istream
48+
allocate(a(10), stream=istream)
49+
end subroutine
50+
51+
! CHECK-LABEL: func.func @_QPsub4()
52+
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub4Ea"}
53+
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<unified>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub4Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
54+
! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"}
55+
! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
56+
! CHECK: fir.call @_FortranAAllocatableSetBounds
57+
! CHECK: %[[STREAM:.*]] = fir.load %[[ISTREAM_DECL]]#0 : !fir.ref<i32>
58+
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> stream(%[[STREAM]] : i32) {cuda_attr = #fir.cuda<unified>} -> i32
59+
60+
subroutine sub5()
61+
real, allocatable, device :: a(:)
62+
real, allocatable :: b(:)
63+
allocate(a, source=b)
64+
end subroutine
65+
66+
! CHECK-LABEL: func.func @_QPsub5()
67+
! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub5Ea"}
68+
! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX_A]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
69+
! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub5Eb"}
70+
! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
71+
! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
72+
! CHECK: fir.call @_FortranAAllocatableSetBounds
73+
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> source(%[[LOAD_B]] : !fir.box<!fir.heap<!fir.array<?xf32>>>) {cuda_attr = #fir.cuda<device>} -> i32
74+
75+
subroutine sub6()
76+
real, allocatable, device :: a(:)
77+
real, allocatable :: b(:)
78+
allocate(a, mold=b)
79+
end subroutine
80+
81+
! CHECK-LABEL: func.func @_QPsub6()
82+
! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub6Ea"}
83+
! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX_A]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
84+
! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub6Eb"}
85+
! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
86+
! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
87+
! CHECK: fir.call @_FortranAAllocatableApplyMold
88+
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
89+
90+
subroutine sub7()
91+
real, allocatable, device :: a(:)
92+
integer :: istat
93+
character(50) :: err
94+
allocate(a(100), stat=istat, errmsg=err)
95+
end subroutine
96+
97+
! CHECK-LABEL: func.func @_QPsub7()
98+
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub7Ea"}
99+
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
100+
! CHECK: %[[ERR:.*]] = fir.alloca !fir.char<1,50> {bindc_name = "err", uniq_name = "_QFsub7Eerr"}
101+
! CHECK: %[[ERR_DECL:.*]]:2 = hlfir.declare %[[ERR]] typeparams %{{.*}} {uniq_name = "_QFsub7Eerr"} : (!fir.ref<!fir.char<1,50>>, index) -> (!fir.ref<!fir.char<1,50>>, !fir.ref<!fir.char<1,50>>)
102+
! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub7Eistat"}
103+
! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub7Eistat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
104+
! CHECK: %[[ERR_BOX:.*]] = fir.embox %[[ERR_DECL]]#1 : (!fir.ref<!fir.char<1,50>>) -> !fir.box<!fir.char<1,50>>
105+
! CHECK: fir.call @_FortranAAllocatableSetBounds
106+
! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> errmsg(%[[ERR_BOX]] : !fir.box<!fir.char<1,50>>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
107+
! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>

0 commit comments

Comments
 (0)