
Commit d05af69

Create a CUDA entry point into OpenMP libomptarget and use it
1 parent 16a816a commit d05af69

15 files changed, +327 -137 lines changed

clang/lib/Driver/ToolChains/Cuda.cpp (+3)

@@ -301,6 +301,9 @@ void CudaInstallationDetector::AddCudaIncludeArgs(
 
   CC1Args.push_back("-include");
   CC1Args.push_back("__clang_cuda_runtime_wrapper.h");
+
+  CC1Args.push_back("-include");
+  CC1Args.push_back("__openmp_cuda_host_wrapper.h");
 }
 
 void CudaInstallationDetector::CheckCudaVersionSupportsArch(
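
The effect of the two forced includes is that every CUDA compilation that goes through this driver path now sees the new wrapper right after the regular CUDA runtime wrapper. Illustratively (the driver injects these via -include rather than by editing sources), each CUDA translation unit behaves roughly as if it began with:

    #include <__clang_cuda_runtime_wrapper.h>  // existing forced include
    #include <__openmp_cuda_host_wrapper.h>    // new forced include added above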

clang/lib/Headers/CMakeLists.txt (+1)

@@ -151,6 +151,7 @@ set(cuda_wrapper_files
   cuda_wrappers/algorithm
   cuda_wrappers/complex
   cuda_wrappers/new
+  cuda_wrappers/__openmp_cuda_host_wrapper.h
 )
 
 set(ppc_wrapper_files
clang/lib/Headers/cuda_wrappers/__openmp_cuda_host_wrapper.h (new file, +71)

@@ -0,0 +1,71 @@
+/*===---- __openmp_cuda_host_wrapper.h - CUDA host support for OpenMP ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __OPENMP_CUDA_HOST_WRAPPER_H__
+#define __OPENMP_CUDA_HOST_WRAPPER_H__
+
+#include "cuda.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <omp.h>
+
+extern "C" {
+int __tgt_kernel(int64_t device_id, const void *host_ptr, void **args,
+                 int32_t grid_dim_x, int32_t grid_dim_y, int32_t grid_dim_z,
+                 int32_t block_dim_x, int32_t block_dim_y, int32_t block_dim_z,
+                 size_t shared_mem, void *stream);
+
+struct __omp_kernel_t {
+  dim3 __grid_size;
+  dim3 __block_size;
+  size_t __shared_memory;
+
+  void *__stream;
+};
+
+static __omp_kernel_t __current_kernel;
+#pragma omp threadprivate(__current_kernel);
+
+inline unsigned __cudaPushCallConfiguration(dim3 __grid_size, dim3 __block_size,
+                                            size_t __shared_memory,
+                                            void *__stream_ptr) {
+  __omp_kernel_t __kernel = __current_kernel;
+  __kernel.__stream = __stream_ptr;
+  __kernel.__grid_size = __grid_size;
+  __kernel.__block_size = __block_size;
+  __kernel.__shared_memory = __shared_memory;
+  return 0;
+}
+
+inline unsigned __cudaPopCallConfiguration(dim3 *__grid_size,
+                                           dim3 *__block_size,
+                                           size_t *__shared_memory,
+                                           void *__stream) {
+  __omp_kernel_t &__kernel = __current_kernel;
+  *__grid_size = __kernel.__grid_size;
+  *__block_size = __kernel.__block_size;
+  *__shared_memory = __kernel.__shared_memory;
+  *((void **)__stream) = __kernel.__stream;
+  return 0;
+}
+
+inline cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+                                    dim3 blockDim, void **args,
+                                    size_t sharedMem, cudaStream_t stream) {
+  __omp_kernel_t &__kernel = __current_kernel;
+
+  int rv = __tgt_kernel(omp_get_default_device(), func, args, gridDim.x,
+                        gridDim.y, gridDim.z, blockDim.x, blockDim.y,
+                        blockDim.z, sharedMem, stream);
+  return cudaError_t(rv);
+}
+}
+
+#endif
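
With this header force-included into CUDA host compilations, the usual Clang CUDA launch machinery is redirected: the <<<...>>> syntax still goes through __cudaPushCallConfiguration, the kernel's host stub still calls __cudaPopCallConfiguration and cudaLaunchKernel, but the cudaLaunchKernel defined here hands the launch to __tgt_kernel in libomptarget instead of the CUDA runtime. Roughly (an illustrative expansion, not the literal compiler-generated stub; the kernel name is made up):

    __global__ void vecadd(float *a, float *b, float *c);

    // Approximately what `vecadd<<<blocks, threads>>>(a, b, c)` lowers to on the host.
    void launch_vecadd(dim3 blocks, dim3 threads, float *a, float *b, float *c) {
      // 1. <<<...>>> records the configuration in the thread-private slot.
      __cudaPushCallConfiguration(blocks, threads, /*sharedMem=*/0, /*stream=*/nullptr);

      // 2. The kernel's host stub pops it back out ...
      dim3 grid, block;
      size_t shmem;
      void *stream;
      __cudaPopCallConfiguration(&grid, &block, &shmem, &stream);

      // 3. ... and launches via cudaLaunchKernel, which this wrapper forwards to
      //    __tgt_kernel, i.e. into libomptarget.
      void *args[] = {&a, &b, &c};
      cudaLaunchKernel((const void *)vecadd, grid, block, args, shmem,
                       (cudaStream_t)stream);
    }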

openmp/libomptarget/include/omptarget.h (+9 -2)

@@ -164,11 +164,12 @@ class AsyncInfoTy {
   /// as long as this AsyncInfoTy object.
   std::deque<void *> BufferLocations;
 
-  __tgt_async_info AsyncInfo;
   DeviceTy &Device;
+  __tgt_async_info AsyncInfo;
 
 public:
-  AsyncInfoTy(DeviceTy &Device) : Device(Device) {}
+  AsyncInfoTy(DeviceTy &Device, void *Stream = nullptr)
+      : Device(Device), AsyncInfo{Stream} {}
   ~AsyncInfoTy() { synchronize(); }
 
   /// Implicit conversion to the __tgt_async_info which is used in the
@@ -341,6 +342,12 @@ int __tgt_target_teams_mapper(ident_t *loc, int64_t device_id, void *host_ptr,
                               int64_t *arg_sizes, int64_t *arg_types,
                               map_var_info_t *arg_names, void **arg_mappers,
                               int32_t num_teams, int32_t thread_limit);
+
+int __tgt_kernel(int64_t device_id, const void *host_ptr, void **args,
+                 int32_t grid_dim_x, int32_t grid_dim_y, int32_t grid_dim_z,
+                 int32_t block_dim_x, int32_t block_dim_y, int32_t block_dim_z,
+                 size_t SharedMem, void *Stream);
+
 int __tgt_target_teams_nowait_mapper(
     ident_t *loc, int64_t device_id, void *host_ptr, int32_t arg_num,
     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
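
The definition of __tgt_kernel lives in one of the libomptarget source files that is not part of this excerpt. Based on the new AsyncInfoTy constructor above and the plugin entry point added below, a minimal sketch of what such an entry has to do could look like the following; the device lookup, the host-to-device entry translation, and DeviceTy::runKernelAsync are assumed helpers, named here only for illustration:

    // Hypothetical sketch, not the commit's actual implementation.
    int __tgt_kernel(int64_t device_id, const void *host_ptr, void **args,
                     int32_t grid_dim_x, int32_t grid_dim_y, int32_t grid_dim_z,
                     int32_t block_dim_x, int32_t block_dim_y, int32_t block_dim_z,
                     size_t SharedMem, void *Stream) {
      DeviceTy &Device = PM->Devices[device_id];              // assumed device lookup
      void *TgtEntryPtr = lookupDeviceEntry(Device, host_ptr); // assumed translation

      // The caller's CUDA stream is threaded straight into the async info via the
      // new AsyncInfoTy(DeviceTy &, void *Stream) constructor added above; note
      // that the plugin entry shown below does not forward SharedMem.
      AsyncInfoTy AsyncInfo(Device, Stream);
      int Rc = Device.runKernelAsync(TgtEntryPtr, args, grid_dim_x, grid_dim_y,
                                     grid_dim_z, block_dim_x, block_dim_y,
                                     block_dim_z, AsyncInfo);

      // ~AsyncInfoTy() calls synchronize(), so a launch without a caller-managed
      // stream keeps the usual blocking semantics.
      return Rc;
    }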

openmp/libomptarget/include/omptargetplugin.h (+8)

@@ -135,6 +135,14 @@ int32_t __tgt_rtl_run_target_team_region_async(
     int32_t NumTeams, int32_t ThreadLimit, uint64_t loop_tripcount,
     __tgt_async_info *AsyncInfo);
 
+// Entry point for non-OpenMP kernels
+int32_t __tgt_rtl_run_kernel_async(int32_t device_id, void *tgt_entry_ptr,
+                                   void **tgt_args, int32_t grid_dim_x,
+                                   int32_t grid_dim_y, int32_t grid_dim_z,
+                                   int32_t block_dim_x, int32_t block_dim_y,
+                                   int32_t block_dim_z,
+                                   __tgt_async_info *async_info_ptr);
+
 // Device synchronization. In case of success, return zero. Otherwise, return an
 // error code.
 int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfo);
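
Declaring __tgt_rtl_run_kernel_async here is only half of the plumbing: libomptarget resolves plugin entry points by name when it loads an RTL, so the loader has to be taught about the new symbol as well (those changes are in files not shown in this excerpt). A sketch of what that lookup typically looks like, with the field name and loader object assumed for illustration:

    // Hypothetical: mirrors how the existing __tgt_rtl_* entry points are resolved.
    RTL.run_kernel_async =
        reinterpret_cast<decltype(RTL.run_kernel_async)>(
            DynLibrary->getAddressOfSymbol("__tgt_rtl_run_kernel_async"));
    // A plugin that does not export the symbol simply leaves the pointer null,
    // so the new entry can remain optional.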

openmp/libomptarget/plugins/cuda/src/rtl.cpp (+129 -100)

@@ -1056,129 +1056,144 @@ class DeviceRTLTy {
                           ptrdiff_t *TgtOffsets, const int ArgNum,
                           const int TeamNum, const int ThreadLimit,
                           const unsigned int LoopTripCount,
-                          __tgt_async_info *AsyncInfo) const {
+                          __tgt_async_info *AsyncInfo, const int GridDimY = 1,
+                          const int GridDimZ = 1, const int BlockDimY = 1,
+                          const int BlockDimZ = 1) const {
     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
       return OFFLOAD_FAIL;
 
-    // All args are references.
-    std::vector<void *> Args(ArgNum);
-    std::vector<void *> Ptrs(ArgNum);
-
-    for (int I = 0; I < ArgNum; ++I) {
-      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
-      Args[I] = &Ptrs[I];
-    }
-
     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
 
-    const bool IsSPMDGenericMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
-    const bool IsSPMDMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
-    const bool IsGenericMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
-
-    int CudaThreadsPerBlock;
-    if (ThreadLimit > 0) {
-      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
-      CudaThreadsPerBlock = ThreadLimit;
-      // Add master warp if necessary
-      if (IsGenericMode) {
-        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
-        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+    bool OpenMPMode = TgtOffsets != nullptr;
+    bool IsSPMDMode = !OpenMPMode;
+    bool IsGenericMode = !IsSPMDMode;
+    bool IsSPMDGenericMode = false;
+    unsigned CudaBlocksPerGrid = TeamNum;
+    unsigned CudaThreadsPerBlock = ThreadLimit;
+    CUstream Stream = (CUstream)AsyncInfo;
+
+    if (OpenMPMode) {
+      // All args are references.
+      std::vector<void *> Args(ArgNum);
+      std::vector<void *> Ptrs(ArgNum);
+
+      for (int I = 0; I < ArgNum; ++I) {
+        Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
+        Args[I] = &Ptrs[I];
+      }
+      TgtArgs = &Args[0];
+
+      IsSPMDGenericMode = KernelInfo->ExecutionMode ==
+                          llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
+      IsSPMDMode =
+          KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
+      IsGenericMode =
+          KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
+
+      if (ThreadLimit > 0) {
+        DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
+        CudaThreadsPerBlock = ThreadLimit;
+        // Add master warp if necessary
+        if (IsGenericMode) {
+          DP("Adding master warp: +%d threads\n",
+             DeviceData[DeviceId].WarpSize);
+          CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+        }
+      } else {
+        DP("Setting CUDA threads per block to default %d\n",
+           DeviceData[DeviceId].NumThreads);
+        CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
       }
-    } else {
-      DP("Setting CUDA threads per block to default %d\n",
-         DeviceData[DeviceId].NumThreads);
-      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
-    }
 
-    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
-      DP("Threads per block capped at device limit %d\n",
-         DeviceData[DeviceId].ThreadsPerBlock);
-      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
-    }
+      if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
+        DP("Threads per block capped at device limit %d\n",
+           DeviceData[DeviceId].ThreadsPerBlock);
+        CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
+      }
 
-    if (!KernelInfo->MaxThreadsPerBlock) {
-      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
-                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                               KernelInfo->Func);
-      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
-        return OFFLOAD_FAIL;
-    }
+      if (!KernelInfo->MaxThreadsPerBlock) {
+        Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
+                                 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                                 KernelInfo->Func);
+        if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
+          return OFFLOAD_FAIL;
+      }
 
-    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
-      DP("Threads per block capped at kernel limit %d\n",
-         KernelInfo->MaxThreadsPerBlock);
-      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
-    }
+      if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
+        DP("Threads per block capped at kernel limit %d\n",
+           KernelInfo->MaxThreadsPerBlock);
+        CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
+      }
 
-    unsigned int CudaBlocksPerGrid;
-    if (TeamNum <= 0) {
-      if (LoopTripCount > 0 && EnvNumTeams < 0) {
-        if (IsSPMDGenericMode) {
-          // If we reach this point, then we are executing a kernel that was
-          // transformed from Generic-mode to SPMD-mode. This kernel has
-          // SPMD-mode execution, but needs its blocks to be scheduled
-          // differently because the current loop trip count only applies to the
-          // `teams distribute` region and will create var too few blocks using
-          // the regular SPMD-mode method.
-          CudaBlocksPerGrid = LoopTripCount;
-        } else if (IsSPMDMode) {
-          // We have a combined construct, i.e. `target teams distribute
-          // parallel for [simd]`. We launch so many teams so that each thread
-          // will execute one iteration of the loop. round up to the nearest
-          // integer
-          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
-        } else if (IsGenericMode) {
-          // If we reach this point, then we have a non-combined construct, i.e.
-          // `teams distribute` with a nested `parallel for` and each team is
-          // assigned one iteration of the `distribute` loop. E.g.:
-          //
-          // #pragma omp target teams distribute
-          // for(...loop_tripcount...) {
-          //   #pragma omp parallel for
-          //   for(...) {}
-          // }
-          //
-          // Threads within a team will execute the iterations of the `parallel`
-          // loop.
-          CudaBlocksPerGrid = LoopTripCount;
+      if (TeamNum <= 0) {
+        if (LoopTripCount > 0 && EnvNumTeams < 0) {
+          if (IsSPMDGenericMode) {
+            // If we reach this point, then we are executing a kernel that was
+            // transformed from Generic-mode to SPMD-mode. This kernel has
+            // SPMD-mode execution, but needs its blocks to be scheduled
+            // differently because the current loop trip count only applies to
+            // the `teams distribute` region and will create var too few blocks
+            // using the regular SPMD-mode method.
+            CudaBlocksPerGrid = LoopTripCount;
+          } else if (IsSPMDMode) {
+            // We have a combined construct, i.e. `target teams distribute
+            // parallel for [simd]`. We launch so many teams so that each thread
+            // will execute one iteration of the loop. round up to the nearest
+            // integer
+            CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
+          } else if (IsGenericMode) {
+            // If we reach this point, then we have a non-combined construct,
+            // i.e. `teams distribute` with a nested `parallel for` and each
+            // team is assigned one iteration of the `distribute` loop. E.g.:
+            //
+            // #pragma omp target teams distribute
+            // for(...loop_tripcount...) {
+            //   #pragma omp parallel for
+            //   for(...) {}
+            // }
+            //
+            // Threads within a team will execute the iterations of the
+            // `parallel` loop.
+            CudaBlocksPerGrid = LoopTripCount;
+          } else {
+            REPORT("Unknown execution mode: %d\n",
+                   static_cast<int8_t>(KernelInfo->ExecutionMode));
+            return OFFLOAD_FAIL;
+          }
+          DP("Using %d teams due to loop trip count %" PRIu32
+             " and number of threads per block %d\n",
+             CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
         } else {
-          REPORT("Unknown execution mode: %d\n",
-                 static_cast<int8_t>(KernelInfo->ExecutionMode));
-          return OFFLOAD_FAIL;
+          DP("Using default number of teams %d\n",
+             DeviceData[DeviceId].NumTeams);
+          CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
         }
-        DP("Using %d teams due to loop trip count %" PRIu32
-           " and number of threads per block %d\n",
-           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
+      } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
+        DP("Capping number of teams to team limit %d\n",
+           DeviceData[DeviceId].BlocksPerGrid);
+        CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
       } else {
-        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
-        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
+        DP("Using requested number of teams %d\n", TeamNum);
+        CudaBlocksPerGrid = TeamNum;
       }
-    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
-      DP("Capping number of teams to team limit %d\n",
-         DeviceData[DeviceId].BlocksPerGrid);
-      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
-    } else {
-      DP("Using requested number of teams %d\n", TeamNum);
-      CudaBlocksPerGrid = TeamNum;
+
+      Stream = getStream(DeviceId, AsyncInfo);
     }
 
     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
+         "Launching kernel %s with [%d,%d,%d] blocks and [%d,%d,%d] threads in "
+         "%s mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
-         CudaBlocksPerGrid, CudaThreadsPerBlock,
+         CudaBlocksPerGrid, GridDimY, GridDimZ, CudaThreadsPerBlock, BlockDimY,
+         BlockDimZ,
         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
 
-    CUstream Stream = getStream(DeviceId, AsyncInfo);
-    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
-                         /* gridDimZ */ 1, CudaThreadsPerBlock,
-                         /* blockDimY */ 1, /* blockDimZ */ 1,
-                         DynamicMemorySize, Stream, &Args[0], nullptr);
+    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, GridDimY,
+                         GridDimZ, CudaThreadsPerBlock, BlockDimY, BlockDimZ,
+                         DynamicMemorySize, Stream, TgtArgs, nullptr);
     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
       return OFFLOAD_FAIL;
 
@@ -1559,6 +1574,20 @@ int32_t __tgt_rtl_run_target_team_region_async(
       thread_limit, loop_tripcount, async_info_ptr);
 }
 
+int32_t __tgt_rtl_run_kernel_async(int32_t device_id, void *tgt_entry_ptr,
+                                   void **tgt_args, int32_t grid_dim_x,
+                                   int32_t grid_dim_y, int32_t grid_dim_z,
+                                   int32_t block_dim_x, int32_t block_dim_y,
+                                   int32_t block_dim_z,
+                                   __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  return DeviceRTL.runTargetTeamRegion(
+      device_id, tgt_entry_ptr, tgt_args, /* tgt_offsets */ nullptr,
+      /* arg_num */ 0, grid_dim_x, block_dim_x, /* loop_tripcount */ 0,
+      async_info_ptr, grid_dim_y, grid_dim_z, block_dim_y, block_dim_z);
+}
+
 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                     void **tgt_args, ptrdiff_t *tgt_offsets,
                                     int32_t arg_num) {
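
Taken together: the wrapper header turns a host-side <<<...>>> launch into a __tgt_kernel call, libomptarget forwards it to the plugin's __tgt_rtl_run_kernel_async, and runTargetTeamRegion sees the null tgt_offsets, skips the OpenMP-specific team/thread heuristics, and launches the kernel with the caller's grid and block dimensions verbatim. A minimal program of the kind this path is meant to carry (illustrative; how the prototype toolchain is invoked is not part of this excerpt):

    __global__ void noop() {}

    int main() {
      dim3 grid(8, 2, 1);   // arrives in __tgt_rtl_run_kernel_async as grid_dim_{x,y,z}
      dim3 block(64, 2, 1); // arrives as block_dim_{x,y,z}
      // With __openmp_cuda_host_wrapper.h force-included, this launch is routed
      // through __tgt_kernel into libomptarget instead of the CUDA runtime.
      noop<<<grid, block>>>();
      return 0;
    }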
