@@ -1056,129 +1056,144 @@ class DeviceRTLTy {
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
-                         __tgt_async_info *AsyncInfo) const {
+                         __tgt_async_info *AsyncInfo, const int GridDimY = 1,
+                         const int GridDimZ = 1, const int BlockDimY = 1,
+                         const int BlockDimZ = 1) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

-    // All args are references.
-    std::vector<void *> Args(ArgNum);
-    std::vector<void *> Ptrs(ArgNum);
-
-    for (int I = 0; I < ArgNum; ++I) {
-      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
-      Args[I] = &Ptrs[I];
-    }
-
    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

-    const bool IsSPMDGenericMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
-    const bool IsSPMDMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
-    const bool IsGenericMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
-
-    int CudaThreadsPerBlock;
-    if (ThreadLimit > 0) {
-      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
-      CudaThreadsPerBlock = ThreadLimit;
-      // Add master warp if necessary
-      if (IsGenericMode) {
-        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
-        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+    bool OpenMPMode = TgtOffsets != nullptr;
+    bool IsSPMDMode = !OpenMPMode;
+    bool IsGenericMode = !IsSPMDMode;
+    bool IsSPMDGenericMode = false;
+    unsigned CudaBlocksPerGrid = TeamNum;
+    unsigned CudaThreadsPerBlock = ThreadLimit;
+    CUstream Stream = (CUstream)AsyncInfo;
+
+    if (OpenMPMode) {
+      // All args are references.
+      std::vector<void *> Args(ArgNum);
+      std::vector<void *> Ptrs(ArgNum);
+
+      for (int I = 0; I < ArgNum; ++I) {
+        Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
+        Args[I] = &Ptrs[I];
+      }
+      TgtArgs = &Args[0];
+
+      IsSPMDGenericMode = KernelInfo->ExecutionMode ==
+                          llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
+      IsSPMDMode =
+          KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
+      IsGenericMode =
+          KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
+
+      if (ThreadLimit > 0) {
+        DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
+        CudaThreadsPerBlock = ThreadLimit;
+        // Add master warp if necessary
+        if (IsGenericMode) {
+          DP("Adding master warp: +%d threads\n",
+             DeviceData[DeviceId].WarpSize);
+          CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+        }
+      } else {
+        DP("Setting CUDA threads per block to default %d\n",
+           DeviceData[DeviceId].NumThreads);
+        CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
      }
-    } else {
-      DP("Setting CUDA threads per block to default %d\n",
-         DeviceData[DeviceId].NumThreads);
-      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
-    }

-    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
-      DP("Threads per block capped at device limit %d\n",
-         DeviceData[DeviceId].ThreadsPerBlock);
-      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
-    }
+      if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
+        DP("Threads per block capped at device limit %d\n",
+           DeviceData[DeviceId].ThreadsPerBlock);
+        CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
+      }

-    if (!KernelInfo->MaxThreadsPerBlock) {
-      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
-                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                               KernelInfo->Func);
-      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
-        return OFFLOAD_FAIL;
-    }
+      if (!KernelInfo->MaxThreadsPerBlock) {
+        Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
+                                 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                                 KernelInfo->Func);
+        if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
+          return OFFLOAD_FAIL;
+      }

-    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
-      DP("Threads per block capped at kernel limit %d\n",
-         KernelInfo->MaxThreadsPerBlock);
-      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
-    }
+      if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
+        DP("Threads per block capped at kernel limit %d\n",
+           KernelInfo->MaxThreadsPerBlock);
+        CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
+      }

-    unsigned int CudaBlocksPerGrid;
-    if (TeamNum <= 0) {
-      if (LoopTripCount > 0 && EnvNumTeams < 0) {
-        if (IsSPMDGenericMode) {
-          // If we reach this point, then we are executing a kernel that was
-          // transformed from Generic-mode to SPMD-mode. This kernel has
-          // SPMD-mode execution, but needs its blocks to be scheduled
-          // differently because the current loop trip count only applies to the
-          // `teams distribute` region and will create var too few blocks using
-          // the regular SPMD-mode method.
-          CudaBlocksPerGrid = LoopTripCount;
-        } else if (IsSPMDMode) {
-          // We have a combined construct, i.e. `target teams distribute
-          // parallel for [simd]`. We launch so many teams so that each thread
-          // will execute one iteration of the loop. round up to the nearest
-          // integer
-          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
-        } else if (IsGenericMode) {
-          // If we reach this point, then we have a non-combined construct, i.e.
-          // `teams distribute` with a nested `parallel for` and each team is
-          // assigned one iteration of the `distribute` loop. E.g.:
-          //
-          // #pragma omp target teams distribute
-          // for(...loop_tripcount...) {
-          //   #pragma omp parallel for
-          //   for(...) {}
-          // }
-          //
-          // Threads within a team will execute the iterations of the `parallel`
-          // loop.
-          CudaBlocksPerGrid = LoopTripCount;
+      if (TeamNum <= 0) {
+        if (LoopTripCount > 0 && EnvNumTeams < 0) {
+          if (IsSPMDGenericMode) {
+            // If we reach this point, then we are executing a kernel that was
+            // transformed from Generic-mode to SPMD-mode. This kernel has
+            // SPMD-mode execution, but needs its blocks to be scheduled
+            // differently because the current loop trip count only applies to
+            // the `teams distribute` region and will create var too few blocks
+            // using the regular SPMD-mode method.
+            CudaBlocksPerGrid = LoopTripCount;
+          } else if (IsSPMDMode) {
+            // We have a combined construct, i.e. `target teams distribute
+            // parallel for [simd]`. We launch so many teams so that each thread
+            // will execute one iteration of the loop. round up to the nearest
+            // integer
+            CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
+          } else if (IsGenericMode) {
+            // If we reach this point, then we have a non-combined construct,
+            // i.e. `teams distribute` with a nested `parallel for` and each
+            // team is assigned one iteration of the `distribute` loop. E.g.:
+            //
+            // #pragma omp target teams distribute
+            // for(...loop_tripcount...) {
+            //   #pragma omp parallel for
+            //   for(...) {}
+            // }
+            //
+            // Threads within a team will execute the iterations of the
+            // `parallel` loop.
+            CudaBlocksPerGrid = LoopTripCount;
+          } else {
+            REPORT("Unknown execution mode: %d\n",
+                   static_cast<int8_t>(KernelInfo->ExecutionMode));
+            return OFFLOAD_FAIL;
+          }
+          DP("Using %d teams due to loop trip count %" PRIu32
+             " and number of threads per block %d\n",
+             CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
        } else {
-          REPORT("Unknown execution mode: %d\n",
-                 static_cast<int8_t>(KernelInfo->ExecutionMode));
-          return OFFLOAD_FAIL;
+          DP("Using default number of teams %d\n",
+             DeviceData[DeviceId].NumTeams);
+          CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
        }
-        DP("Using %d teams due to loop trip count %" PRIu32
-           " and number of threads per block %d\n",
-           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
+      } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
+        DP("Capping number of teams to team limit %d\n",
+           DeviceData[DeviceId].BlocksPerGrid);
+        CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
      } else {
-        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
-        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
+        DP("Using requested number of teams %d\n", TeamNum);
+        CudaBlocksPerGrid = TeamNum;
      }
-    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
-      DP("Capping number of teams to team limit %d\n",
-         DeviceData[DeviceId].BlocksPerGrid);
-      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
-    } else {
-      DP("Using requested number of teams %d\n", TeamNum);
-      CudaBlocksPerGrid = TeamNum;
+
+      Stream = getStream(DeviceId, AsyncInfo);
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
+         "Launching kernel %s with [%d,%d,%d] blocks and [%d,%d,%d] threads in "
+         "%s mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
-         CudaBlocksPerGrid, CudaThreadsPerBlock,
+         CudaBlocksPerGrid, GridDimY, GridDimZ, CudaThreadsPerBlock, BlockDimY,
+         BlockDimZ,
         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));

-    CUstream Stream = getStream(DeviceId, AsyncInfo);
-    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
-                         /* gridDimZ */ 1, CudaThreadsPerBlock,
-                         /* blockDimY */ 1, /* blockDimZ */ 1,
-                         DynamicMemorySize, Stream, &Args[0], nullptr);
+    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, GridDimY,
+                         GridDimZ, CudaThreadsPerBlock, BlockDimY, BlockDimZ,
+                         DynamicMemorySize, Stream, TgtArgs, nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

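With this hunk, runTargetTeamRegion serves two kinds of callers: the existing OpenMP offload path (TgtOffsets non-null), which marshals every target argument into an offset-adjusted pointer array and derives the launch geometry from the kernel's execution mode, and a raw launch path (TgtOffsets == nullptr), which takes TeamNum/ThreadLimit plus the new Y/Z dimensions verbatim and forwards tgt_args straight to cuLaunchKernel. One thing to watch: as written, Args and Ptrs are local to the if (OpenMPMode) block, so the TgtArgs = &Args[0] assignment appears to leave TgtArgs dangling by the time cuLaunchKernel is reached. A minimal sketch of that marshalling with the backing storage hoisted into the caller's scope (the helper name is hypothetical, not part of the patch):

#include <cstddef> // ptrdiff_t
#include <cstdint> // intptr_t
#include <vector>

// Hypothetical helper: builds the kernelParams array that cuLaunchKernel
// expects when the plugin is driven by libomptarget. Each Args[i] holds the
// address of an offset-adjusted device pointer, so Ptrs (the backing store)
// must stay alive until the launch has been issued -- which is why it is
// owned by the caller rather than by this function.
static std::vector<void *> buildOpenMPKernelParams(void **TgtArgs,
                                                   ptrdiff_t *TgtOffsets,
                                                   int ArgNum,
                                                   std::vector<void *> &Ptrs) {
  std::vector<void *> Args(ArgNum);
  Ptrs.resize(ArgNum);
  for (int I = 0; I < ArgNum; ++I) {
    // Apply the per-argument offset to the device pointer...
    Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
    // ...and hand the launch the *address* of that adjusted pointer.
    Args[I] = &Ptrs[I];
  }
  return Args;
}

In raw launch mode no such translation happens: the caller's tgt_args array is assumed to already be laid out the way the CUDA driver wants it.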
@@ -1559,6 +1574,20 @@ int32_t __tgt_rtl_run_target_team_region_async(
                                       thread_limit, loop_tripcount, async_info_ptr);
}

+int32_t __tgt_rtl_run_kernel_async(int32_t device_id, void *tgt_entry_ptr,
+                                   void **tgt_args, int32_t grid_dim_x,
+                                   int32_t grid_dim_y, int32_t grid_dim_z,
+                                   int32_t block_dim_x, int32_t block_dim_y,
+                                   int32_t block_dim_z,
+                                   __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  return DeviceRTL.runTargetTeamRegion(
+      device_id, tgt_entry_ptr, tgt_args, /* tgt_offsets */ nullptr,
+      /* arg_num */ 0, grid_dim_x, block_dim_x, /* loop_tripcount */ 0,
+      async_info_ptr, grid_dim_y, grid_dim_z, block_dim_y, block_dim_z);
+}
+
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
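The second hunk exposes the raw path as a plugin entry point: grid_dim_x feeds the TeamNum parameter, block_dim_x feeds ThreadLimit, and tgt_offsets, arg_num, and loop_tripcount are pinned to nullptr/0 so that runTargetTeamRegion takes the non-OpenMP branch. A hedged, caller-side sketch follows; the kernel, its parameters, and the device id are made up for illustration, and it assumes __tgt_async_info is the struct declared in libomptarget's omptarget.h.

#include <cstdint>
#include "omptarget.h" // __tgt_async_info (assumed location of the declaration)

// Re-declared here only for the sketch; in-tree the prototype would
// presumably live next to the other __tgt_rtl_* entry points.
extern "C" int32_t __tgt_rtl_run_kernel_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    int32_t grid_dim_x, int32_t grid_dim_y, int32_t grid_dim_z,
    int32_t block_dim_x, int32_t block_dim_y, int32_t block_dim_z,
    __tgt_async_info *async_info_ptr);

// Hypothetical host-runtime caller launching a 1-D saxpy-style kernel whose
// entry pointer was obtained through the usual offload-table lookup.
int32_t launchSaxpy(void *EntryPtr, float *DevX, float *DevY, float A, int N,
                    __tgt_async_info &AsyncInfo) {
  // Raw-launch mode: tgt_offsets is implicitly nullptr, so each element of
  // KernelArgs must already be the address of a kernel parameter, exactly as
  // cuLaunchKernel expects.
  void *KernelArgs[] = {&DevX, &DevY, &A, &N};
  const int32_t Threads = 256;
  const int32_t Blocks = (N + Threads - 1) / Threads; // round up, 1-D grid
  return __tgt_rtl_run_kernel_async(/*device_id=*/0, EntryPtr, KernelArgs,
                                    Blocks, /*grid_dim_y=*/1, /*grid_dim_z=*/1,
                                    Threads, /*block_dim_y=*/1,
                                    /*block_dim_z=*/1, &AsyncInfo);
}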