
Commit d05af69

Create a CUDA entry point into OpenMP libomptarget and use it
1 parent 16a816a commit d05af69

15 files changed, +327 -137 lines changed

clang/lib/Driver/ToolChains/Cuda.cpp (+3)

@@ -301,6 +301,9 @@ void CudaInstallationDetector::AddCudaIncludeArgs(
 
   CC1Args.push_back("-include");
   CC1Args.push_back("__clang_cuda_runtime_wrapper.h");
+
+  CC1Args.push_back("-include");
+  CC1Args.push_back("__openmp_cuda_host_wrapper.h");
 }
 
 void CudaInstallationDetector::CheckCudaVersionSupportsArch(
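
The effect of the two forced includes is that every CUDA compilation that goes through this driver path now sees the new wrapper right after the regular CUDA runtime wrapper. Illustratively (the driver injects these via -include rather than by editing sources), each CUDA translation unit behaves roughly as if it began with:

    #include <__clang_cuda_runtime_wrapper.h>  // existing forced include
    #include <__openmp_cuda_host_wrapper.h>    // new forced include added above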

clang/lib/Headers/CMakeLists.txt (+1)

@@ -151,6 +151,7 @@ set(cuda_wrapper_files
   cuda_wrappers/algorithm
   cuda_wrappers/complex
   cuda_wrappers/new
+  cuda_wrappers/__openmp_cuda_host_wrapper.h
 )
 
 set(ppc_wrapper_files
clang/lib/Headers/cuda_wrappers/__openmp_cuda_host_wrapper.h (new file, +71)

@@ -0,0 +1,71 @@
+/*===---- __openmp_cuda_host_wrapper.h - CUDA host support for OpenMP ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __OPENMP_CUDA_HOST_WRAPPER_H__
+#define __OPENMP_CUDA_HOST_WRAPPER_H__
+
+#include "cuda.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <omp.h>
+
+extern "C" {
+int __tgt_kernel(int64_t device_id, const void *host_ptr, void **args,
+                 int32_t grid_dim_x, int32_t grid_dim_y, int32_t grid_dim_z,
+                 int32_t block_dim_x, int32_t block_dim_y, int32_t block_dim_z,
+                 size_t shared_mem, void *stream);
+
+struct __omp_kernel_t {
+  dim3 __grid_size;
+  dim3 __block_size;
+  size_t __shared_memory;
+
+  void *__stream;
+};
+
+static __omp_kernel_t __current_kernel;
+#pragma omp threadprivate(__current_kernel);
+
+inline unsigned __cudaPushCallConfiguration(dim3 __grid_size, dim3 __block_size,
+                                            size_t __shared_memory,
+                                            void *__stream_ptr) {
+  __omp_kernel_t __kernel = __current_kernel;
+  __kernel.__stream = __stream_ptr;
+  __kernel.__grid_size = __grid_size;
+  __kernel.__block_size = __block_size;
+  __kernel.__shared_memory = __shared_memory;
+  return 0;
+}
+
+inline unsigned __cudaPopCallConfiguration(dim3 *__grid_size,
+                                           dim3 *__block_size,
+                                           size_t *__shared_memory,
+                                           void *__stream) {
+  __omp_kernel_t &__kernel = __current_kernel;
+  *__grid_size = __kernel.__grid_size;
+  *__block_size = __kernel.__block_size;
+  *__shared_memory = __kernel.__shared_memory;
+  *((void **)__stream) = __kernel.__stream;
+  return 0;
+}
+
+inline cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+                                    dim3 blockDim, void **args,
+                                    size_t sharedMem, cudaStream_t stream) {
+  __omp_kernel_t &__kernel = __current_kernel;
+
+  int rv = __tgt_kernel(omp_get_default_device(), func, args, gridDim.x,
+                        gridDim.y, gridDim.z, blockDim.x, blockDim.y,
+                        blockDim.z, sharedMem, stream);
+  return cudaError_t(rv);
+}
+}
+
+#endif
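
With this header force-included into CUDA host compilations, the usual Clang CUDA launch machinery is redirected: the <<<...>>> syntax still goes through __cudaPushCallConfiguration, the kernel's host stub still calls __cudaPopCallConfiguration and cudaLaunchKernel, but the cudaLaunchKernel defined here hands the launch to __tgt_kernel in libomptarget instead of the CUDA runtime. Roughly (an illustrative expansion, not the literal compiler-generated stub; the kernel name is made up):

    __global__ void vecadd(float *a, float *b, float *c);

    // Approximately what `vecadd<<<blocks, threads>>>(a, b, c)` lowers to on the host.
    void launch_vecadd(dim3 blocks, dim3 threads, float *a, float *b, float *c) {
      // 1. <<<...>>> records the configuration in the thread-private slot.
      __cudaPushCallConfiguration(blocks, threads, /*sharedMem=*/0, /*stream=*/nullptr);

      // 2. The kernel's host stub pops it back out ...
      dim3 grid, block;
      size_t shmem;
      void *stream;
      __cudaPopCallConfiguration(&grid, &block, &shmem, &stream);

      // 3. ... and launches via cudaLaunchKernel, which this wrapper forwards to
      //    __tgt_kernel, i.e. into libomptarget.
      void *args[] = {&a, &b, &c};
      cudaLaunchKernel((const void *)vecadd, grid, block, args, shmem,
                       (cudaStream_t)stream);
    }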

openmp/libomptarget/include/omptarget.h (+9 -2)

@@ -164,11 +164,12 @@ class AsyncInfoTy {
   /// as long as this AsyncInfoTy object.
   std::deque<void *> BufferLocations;
 
-  __tgt_async_info AsyncInfo;
   DeviceTy &Device;
+  __tgt_async_info AsyncInfo;
 
 public:
-  AsyncInfoTy(DeviceTy &Device) : Device(Device) {}
+  AsyncInfoTy(DeviceTy &Device, void *Stream = nullptr)
+      : Device(Device), AsyncInfo{Stream} {}
   ~AsyncInfoTy() { synchronize(); }
 
   /// Implicit conversion to the __tgt_async_info which is used in the
@@ -341,6 +342,12 @@ int __tgt_target_teams_mapper(ident_t *loc, int64_t device_id, void *host_ptr,
                               int64_t *arg_sizes, int64_t *arg_types,
                               map_var_info_t *arg_names, void **arg_mappers,
                               int32_t num_teams, int32_t thread_limit);
+
+int __tgt_kernel(int64_t device_id, const void *host_ptr, void **args,
+                 int32_t grid_dim_x, int32_t grid_dim_y, int32_t grid_dim_z,
+                 int32_t block_dim_x, int32_t block_dim_y, int32_t block_dim_z,
+                 size_t SharedMem, void *Stream);
+
 int __tgt_target_teams_nowait_mapper(
     ident_t *loc, int64_t device_id, void *host_ptr, int32_t arg_num,
     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
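
The definition of __tgt_kernel lives in one of the libomptarget source files that is not part of this excerpt. Based on the new AsyncInfoTy constructor above and the plugin entry point added below, a minimal sketch of what such an entry has to do could look like the following; the device lookup, the host-to-device entry translation, and DeviceTy::runKernelAsync are assumed helpers, named here only for illustration:

    // Hypothetical sketch, not the commit's actual implementation.
    int __tgt_kernel(int64_t device_id, const void *host_ptr, void **args,
                     int32_t grid_dim_x, int32_t grid_dim_y, int32_t grid_dim_z,
                     int32_t block_dim_x, int32_t block_dim_y, int32_t block_dim_z,
                     size_t SharedMem, void *Stream) {
      DeviceTy &Device = PM->Devices[device_id];              // assumed device lookup
      void *TgtEntryPtr = lookupDeviceEntry(Device, host_ptr); // assumed translation

      // The caller's CUDA stream is threaded straight into the async info via the
      // new AsyncInfoTy(DeviceTy &, void *Stream) constructor added above; note
      // that the plugin entry shown below does not forward SharedMem.
      AsyncInfoTy AsyncInfo(Device, Stream);
      int Rc = Device.runKernelAsync(TgtEntryPtr, args, grid_dim_x, grid_dim_y,
                                     grid_dim_z, block_dim_x, block_dim_y,
                                     block_dim_z, AsyncInfo);

      // ~AsyncInfoTy() calls synchronize(), so a launch without a caller-managed
      // stream keeps the usual blocking semantics.
      return Rc;
    }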

openmp/libomptarget/include/omptargetplugin.h (+8)

@@ -135,6 +135,14 @@ int32_t __tgt_rtl_run_target_team_region_async(
     int32_t NumTeams, int32_t ThreadLimit, uint64_t loop_tripcount,
     __tgt_async_info *AsyncInfo);
 
+// Entry point for non-OpenMP kernels
+int32_t __tgt_rtl_run_kernel_async(int32_t device_id, void *tgt_entry_ptr,
+                                   void **tgt_args, int32_t grid_dim_x,
+                                   int32_t grid_dim_y, int32_t grid_dim_z,
+                                   int32_t block_dim_x, int32_t block_dim_y,
+                                   int32_t block_dim_z,
+                                   __tgt_async_info *async_info_ptr);
+
 // Device synchronization. In case of success, return zero. Otherwise, return an
 // error code.
 int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfo);
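
Declaring __tgt_rtl_run_kernel_async here is only half of the plumbing: libomptarget resolves plugin entry points by name when it loads an RTL, so the loader has to be taught about the new symbol as well (those changes are in files not shown in this excerpt). A sketch of what that lookup typically looks like, with the field name and loader object assumed for illustration:

    // Hypothetical: mirrors how the existing __tgt_rtl_* entry points are resolved.
    RTL.run_kernel_async =
        reinterpret_cast<decltype(RTL.run_kernel_async)>(
            DynLibrary->getAddressOfSymbol("__tgt_rtl_run_kernel_async"));
    // A plugin that does not export the symbol simply leaves the pointer null,
    // so the new entry can remain optional.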

openmp/libomptarget/plugins/cuda/src/rtl.cpp (+129 -100)

@@ -1056,129 +1056,144 @@ class DeviceRTLTy {
                           ptrdiff_t *TgtOffsets, const int ArgNum,
                           const int TeamNum, const int ThreadLimit,
                           const unsigned int LoopTripCount,
-                          __tgt_async_info *AsyncInfo) const {
+                          __tgt_async_info *AsyncInfo, const int GridDimY = 1,
+                          const int GridDimZ = 1, const int BlockDimY = 1,
+                          const int BlockDimZ = 1) const {
     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
       return OFFLOAD_FAIL;
 
-    // All args are references.
-    std::vector<void *> Args(ArgNum);
-    std::vector<void *> Ptrs(ArgNum);
-
-    for (int I = 0; I < ArgNum; ++I) {
-      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
-      Args[I] = &Ptrs[I];
-    }
-
     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
 
-    const bool IsSPMDGenericMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
-    const bool IsSPMDMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
-    const bool IsGenericMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
-
-    int CudaThreadsPerBlock;
-    if (ThreadLimit > 0) {
-      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
-      CudaThreadsPerBlock = ThreadLimit;
-      // Add master warp if necessary
-      if (IsGenericMode) {
-        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
-        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+    bool OpenMPMode = TgtOffsets != nullptr;
+    bool IsSPMDMode = !OpenMPMode;
+    bool IsGenericMode = !IsSPMDMode;
+    bool IsSPMDGenericMode = false;
+    unsigned CudaBlocksPerGrid = TeamNum;
+    unsigned CudaThreadsPerBlock = ThreadLimit;
+    CUstream Stream = (CUstream)AsyncInfo;
+
+    if (OpenMPMode) {
+      // All args are references.
+      std::vector<void *> Args(ArgNum);
+      std::vector<void *> Ptrs(ArgNum);
+
+      for (int I = 0; I < ArgNum; ++I) {
+        Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
+        Args[I] = &Ptrs[I];
+      }
+      TgtArgs = &Args[0];
+
+      IsSPMDGenericMode = KernelInfo->ExecutionMode ==
+                          llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
+      IsSPMDMode =
+          KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
+      IsGenericMode =
+          KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
+
+      if (ThreadLimit > 0) {
+        DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
+        CudaThreadsPerBlock = ThreadLimit;
+        // Add master warp if necessary
+        if (IsGenericMode) {
+          DP("Adding master warp: +%d threads\n",
+             DeviceData[DeviceId].WarpSize);
+          CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+        }
+      } else {
+        DP("Setting CUDA threads per block to default %d\n",
+           DeviceData[DeviceId].NumThreads);
+        CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
       }
-    } else {
-      DP("Setting CUDA threads per block to default %d\n",
-         DeviceData[DeviceId].NumThreads);
-      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
-    }
 
-    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
-      DP("Threads per block capped at device limit %d\n",
-         DeviceData[DeviceId].ThreadsPerBlock);
-      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
-    }
+      if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
+        DP("Threads per block capped at device limit %d\n",
+           DeviceData[DeviceId].ThreadsPerBlock);
+        CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
+      }
 
-    if (!KernelInfo->MaxThreadsPerBlock) {
-      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
-                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                               KernelInfo->Func);
-      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
-        return OFFLOAD_FAIL;
-    }
+      if (!KernelInfo->MaxThreadsPerBlock) {
+        Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
+                                 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                                 KernelInfo->Func);
+        if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
+          return OFFLOAD_FAIL;
+      }
 
-    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
-      DP("Threads per block capped at kernel limit %d\n",
-         KernelInfo->MaxThreadsPerBlock);
-      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
-    }
+      if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
+        DP("Threads per block capped at kernel limit %d\n",
+           KernelInfo->MaxThreadsPerBlock);
+        CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
+      }
 
-    unsigned int CudaBlocksPerGrid;
-    if (TeamNum <= 0) {
-      if (LoopTripCount > 0 && EnvNumTeams < 0) {
-        if (IsSPMDGenericMode) {
-          // If we reach this point, then we are executing a kernel that was
-          // transformed from Generic-mode to SPMD-mode. This kernel has
-          // SPMD-mode execution, but needs its blocks to be scheduled
-          // differently because the current loop trip count only applies to the
-          // `teams distribute` region and will create var too few blocks using
-          // the regular SPMD-mode method.
-          CudaBlocksPerGrid = LoopTripCount;
-        } else if (IsSPMDMode) {
-          // We have a combined construct, i.e. `target teams distribute
-          // parallel for [simd]`. We launch so many teams so that each thread
-          // will execute one iteration of the loop. round up to the nearest
-          // integer
-          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
-        } else if (IsGenericMode) {
-          // If we reach this point, then we have a non-combined construct, i.e.
-          // `teams distribute` with a nested `parallel for` and each team is
-          // assigned one iteration of the `distribute` loop. E.g.:
-          //
-          // #pragma omp target teams distribute
-          // for(...loop_tripcount...) {
-          //   #pragma omp parallel for
-          //   for(...) {}
-          // }
-          //
-          // Threads within a team will execute the iterations of the `parallel`
-          // loop.
-          CudaBlocksPerGrid = LoopTripCount;
+      if (TeamNum <= 0) {
+        if (LoopTripCount > 0 && EnvNumTeams < 0) {
+          if (IsSPMDGenericMode) {
+            // If we reach this point, then we are executing a kernel that was
+            // transformed from Generic-mode to SPMD-mode. This kernel has
+            // SPMD-mode execution, but needs its blocks to be scheduled
+            // differently because the current loop trip count only applies to
+            // the `teams distribute` region and will create var too few blocks
+            // using the regular SPMD-mode method.
+            CudaBlocksPerGrid = LoopTripCount;
+          } else if (IsSPMDMode) {
+            // We have a combined construct, i.e. `target teams distribute
+            // parallel for [simd]`. We launch so many teams so that each thread
+            // will execute one iteration of the loop. round up to the nearest
+            // integer
+            CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
+          } else if (IsGenericMode) {
+            // If we reach this point, then we have a non-combined construct,
+            // i.e. `teams distribute` with a nested `parallel for` and each
+            // team is assigned one iteration of the `distribute` loop. E.g.:
+            //
+            // #pragma omp target teams distribute
+            // for(...loop_tripcount...) {
+            //   #pragma omp parallel for
+            //   for(...) {}
+            // }
+            //
+            // Threads within a team will execute the iterations of the
+            // `parallel` loop.
+            CudaBlocksPerGrid = LoopTripCount;
+          } else {
+            REPORT("Unknown execution mode: %d\n",
+                   static_cast<int8_t>(KernelInfo->ExecutionMode));
+            return OFFLOAD_FAIL;
+          }
+          DP("Using %d teams due to loop trip count %" PRIu32
+             " and number of threads per block %d\n",
+             CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
         } else {
-          REPORT("Unknown execution mode: %d\n",
-                 static_cast<int8_t>(KernelInfo->ExecutionMode));
-          return OFFLOAD_FAIL;
+          DP("Using default number of teams %d\n",
+             DeviceData[DeviceId].NumTeams);
+          CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
         }
-        DP("Using %d teams due to loop trip count %" PRIu32
-           " and number of threads per block %d\n",
-           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
+      } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
+        DP("Capping number of teams to team limit %d\n",
+           DeviceData[DeviceId].BlocksPerGrid);
+        CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
       } else {
-        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
-        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
+        DP("Using requested number of teams %d\n", TeamNum);
+        CudaBlocksPerGrid = TeamNum;
       }
-    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
-      DP("Capping number of teams to team limit %d\n",
-         DeviceData[DeviceId].BlocksPerGrid);
-      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
-    } else {
-      DP("Using requested number of teams %d\n", TeamNum);
-      CudaBlocksPerGrid = TeamNum;
+
+      Stream = getStream(DeviceId, AsyncInfo);
     }
 
     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
+         "Launching kernel %s with [%d,%d,%d] blocks and [%d,%d,%d] threads in "
+         "%s mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
-         CudaBlocksPerGrid, CudaThreadsPerBlock,
+         CudaBlocksPerGrid, GridDimY, GridDimZ, CudaThreadsPerBlock, BlockDimY,
+         BlockDimZ,
         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
 
-    CUstream Stream = getStream(DeviceId, AsyncInfo);
-    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
-                         /* gridDimZ */ 1, CudaThreadsPerBlock,
-                         /* blockDimY */ 1, /* blockDimZ */ 1,
-                         DynamicMemorySize, Stream, &Args[0], nullptr);
+    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, GridDimY,
+                         GridDimZ, CudaThreadsPerBlock, BlockDimY, BlockDimZ,
+                         DynamicMemorySize, Stream, TgtArgs, nullptr);
     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
       return OFFLOAD_FAIL;
 
@@ -1559,6 +1574,20 @@ int32_t __tgt_rtl_run_target_team_region_async(
       thread_limit, loop_tripcount, async_info_ptr);
 }
 
+int32_t __tgt_rtl_run_kernel_async(int32_t device_id, void *tgt_entry_ptr,
+                                   void **tgt_args, int32_t grid_dim_x,
+                                   int32_t grid_dim_y, int32_t grid_dim_z,
+                                   int32_t block_dim_x, int32_t block_dim_y,
+                                   int32_t block_dim_z,
+                                   __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  return DeviceRTL.runTargetTeamRegion(
+      device_id, tgt_entry_ptr, tgt_args, /* tgt_offsets */ nullptr,
+      /* arg_num */ 0, grid_dim_x, block_dim_x, /* loop_tripcount */ 0,
+      async_info_ptr, grid_dim_y, grid_dim_z, block_dim_y, block_dim_z);
+}
+
 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                     void **tgt_args, ptrdiff_t *tgt_offsets,
                                     int32_t arg_num) {
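
Taken together: the wrapper header turns a host-side <<<...>>> launch into a __tgt_kernel call, libomptarget forwards it to the plugin's __tgt_rtl_run_kernel_async, and runTargetTeamRegion sees the null tgt_offsets, skips the OpenMP-specific team/thread heuristics, and launches the kernel with the caller's grid and block dimensions verbatim. A minimal program of the kind this path is meant to carry (illustrative; how the prototype toolchain is invoked is not part of this excerpt):

    __global__ void noop() {}

    int main() {
      dim3 grid(8, 2, 1);   // arrives in __tgt_rtl_run_kernel_async as grid_dim_{x,y,z}
      dim3 block(64, 2, 1); // arrives as block_dim_{x,y,z}
      // With __openmp_cuda_host_wrapper.h force-included, this launch is routed
      // through __tgt_kernel into libomptarget instead of the CUDA runtime.
      noop<<<grid, block>>>();
      return 0;
    }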
