[UR][Offload] Fixes for enqueue UR CTS tests (#19926)

RossBrunton · web-flow · commit 36419ab5444c · 2025-09-02T08:02:44.000-07:00
A small selection of fixes to increase the pass rate of the enqueue CTS
unit tests:
* Blocking memory reads/writes now properly wait on the queue.
* `urKernelSetArgMemObj` added to the function table.
* Debug print removed.
* Layout of kernel arguments now matches the HIP target if Offload is on
  an AMD device.
* `urEnqueueEventsWaitWithBarrierExt` has been implemented (it just
  calls to the non-ext version).
* `UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP` set to false.
diff --git a/unified-runtime/source/adapters/offload/device.cpp b/unified-runtime/source/adapters/offload/device.cpp
@@ -194,6 +194,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_IMAGE_SRGB:
   case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY:
   case UR_DEVICE_INFO_LINKER_AVAILABLE:
+  case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP:
     return ReturnValue(false);
   case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT:
   case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT:
diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp
@@ -141,6 +141,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
   return doWait<true>(hQueue, numEventsInWaitList, phEventWaitList, phEvent);
 }
 
+// This function only makes sense for level_zero, the flag in properties is
+// ignored
+UR_APIEXPORT ur_result_t urEnqueueEventsWaitWithBarrierExt(
+    ur_queue_handle_t hQueue, const ur_exp_enqueue_ext_properties_t *,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
+                                        phEventWaitList, phEvent);
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -235,6 +245,8 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
   OL_RETURN_ON_ERR(waitOnEvents(Queue, phEventWaitList, numEventsInWaitList));
 
   if (blocking) {
+    // Ensure all work in the queue is complete
+    OL_RETURN_ON_ERR(olSyncQueue(Queue));
     OL_RETURN_ON_ERR(
         olMemcpy(nullptr, DestPtr, DestDevice, SrcPtr, SrcDevice, size));
     if (phEvent) {
diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp
@@ -118,7 +118,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnRelease = urKernelRelease;
   pDdiTable->pfnRetain = urKernelRetain;
   pDdiTable->pfnSetArgLocal = nullptr;
-  pDdiTable->pfnSetArgMemObj = nullptr;
+  pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj;
   pDdiTable->pfnSetArgPointer = urKernelSetArgPointer;
   pDdiTable->pfnSetArgSampler = nullptr;
   pDdiTable->pfnSetArgValue = urKernelSetArgValue;
@@ -172,6 +172,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
   pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite;
   pDdiTable->pfnEventsWait = urEnqueueEventsWait;
   pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier;
+  pDdiTable->pfnEventsWaitWithBarrierExt = urEnqueueEventsWaitWithBarrierExt;
   pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch;
   pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy;
   pDdiTable->pfnMemBufferCopyRect = nullptr;
diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp
@@ -88,7 +88,6 @@ struct urEnqueueEventsWaitWithBarrierOrderingTest : uur::urProgramTest {
 
     auto entry_points =
         uur::KernelsEnvironment::instance->GetEntryPointNames(program_name);
-    std::cout << entry_points[0];
 
     ASSERT_SUCCESS(urKernelCreate(program, "_ZTS3Add", &add_kernel));
     ASSERT_SUCCESS(urKernelCreate(program, "_ZTS3Mul", &mul_kernel));
diff --git a/unified-runtime/test/conformance/source/environment.cpp b/unified-runtime/test/conformance/source/environment.cpp
@@ -191,56 +191,6 @@ KernelsEnvironment::parseKernelOptions(int argc, char **argv,
   return options;
 }
 
-std::string
-KernelsEnvironment::getDefaultTargetName(ur_platform_handle_t platform) {
-  if (instance->GetDevices().size() == 0) {
-    error = "no devices available on the platform";
-    return {};
-  }
-
-  ur_backend_t backend;
-  if (urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, sizeof(backend),
-                        &backend, nullptr)) {
-    error = "failed to get backend from platform.";
-    return {};
-  }
-
-  switch (backend) {
-  case UR_BACKEND_OPENCL:
-  case UR_BACKEND_LEVEL_ZERO:
-    return "spir64";
-  case UR_BACKEND_CUDA:
-    return "nvptx64-nvidia-cuda";
-  case UR_BACKEND_HIP:
-    return "amdgcn-amd-amdhsa";
-  case UR_BACKEND_OFFLOAD: {
-    // All Offload platforms report this backend, use the platform name to select
-    // the actual underlying backend.
-    std::vector<char> PlatformName;
-    size_t PlatformNameSize = 0;
-    urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, 0, nullptr,
-                      &PlatformNameSize);
-    PlatformName.resize(PlatformNameSize);
-    urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, PlatformNameSize,
-                      PlatformName.data(), nullptr);
-    if (std::strcmp(PlatformName.data(), "CUDA") == 0) {
-      return "nvptx64-nvidia-cuda";
-    } else if (std::strcmp(PlatformName.data(), "AMDGPU") == 0) {
-      return "amdgcn-amd-amdhsa";
-    } else {
-      error = "Could not detect target for Offload platform";
-      return {};
-    }
-  }
-  case UR_BACKEND_NATIVE_CPU:
-    error = "native_cpu doesn't support kernel tests yet";
-    return {};
-  default:
-    error = "unknown target.";
-    return {};
-  }
-}
-
 std::string
 KernelsEnvironment::getKernelSourcePath(const std::string &kernel_name,
                                         const std::string &target_name) {
@@ -256,12 +206,17 @@ void KernelsEnvironment::LoadSource(
   // We don't have a way to build device code for native cpu yet.
   UUR_KNOWN_FAILURE_ON_PARAM(platform, uur::NativeCPU{});
 
-  std::string target_name = getDefaultTargetName(platform);
-  if (target_name.empty()) {
-    FAIL() << error;
+  if (instance->GetDevices().size() == 0) {
+    FAIL() << "no devices available on the platform";
+  }
+
+  std::string triple_name;
+  auto Err = GetPlatformTriple(platform, triple_name);
+  if (Err) {
+    FAIL() << "GetPlatformTriple failed with error " << Err << "\n";
   }
 
-  return LoadSource(kernel_name, target_name, binary_out);
+  return LoadSource(kernel_name, triple_name, binary_out);
 }
 
 void KernelsEnvironment::LoadSource(
diff --git a/unified-runtime/test/conformance/testing/include/uur/environment.h b/unified-runtime/test/conformance/testing/include/uur/environment.h
@@ -90,7 +90,6 @@ struct KernelsEnvironment : DevicesEnvironment {
                                    const std::string &kernels_default_dir);
   std::string getKernelSourcePath(const std::string &kernel_name,
                                   const std::string &target_name);
-  std::string getDefaultTargetName(ur_platform_handle_t platform);
 
   KernelOptions kernel_options;
   // mapping between kernels (full_path + kernel_name) and their saved source.
diff --git a/unified-runtime/test/conformance/testing/include/uur/fixtures.h b/unified-runtime/test/conformance/testing/include/uur/fixtures.h
@@ -1472,7 +1472,9 @@ struct KernelLaunchHelper {
     ur_backend_t backend;
     ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND,
                                      sizeof(backend), &backend, nullptr));
-    if (backend == UR_BACKEND_HIP) {
+    std::string target_name;
+    ASSERT_SUCCESS(GetPlatformTriple(platform, target_name));
+    if (target_name == "amdgcn-amd-amdhsa") {
       // this emulates the three offset params for buffer accessor on AMD.
       size_t val = 0;
       ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1,
diff --git a/unified-runtime/test/conformance/testing/include/uur/utils.h b/unified-runtime/test/conformance/testing/include/uur/utils.h
@@ -416,6 +416,8 @@ ur_result_t GetTimestampRecordingSupport(ur_device_handle_t device,
                                          bool &support);
 ur_result_t GetUSMContextMemcpyExpSupport(ur_device_handle_t device,
                                           bool &support);
+ur_result_t GetPlatformTriple(ur_platform_handle_t platform,
+                              std::string &Triple);
 
 ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count);
 
diff --git a/unified-runtime/test/conformance/testing/source/utils.cpp b/unified-runtime/test/conformance/testing/source/utils.cpp
@@ -649,6 +649,53 @@ ur_result_t GetUSMContextMemcpyExpSupport(ur_device_handle_t device,
       device, UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP, support);
 }
 
+ur_result_t GetPlatformTriple(ur_platform_handle_t platform,
+                              std::string &triple) {
+  ur_backend_t backend;
+  if (auto Err = urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND,
+                                   sizeof(backend), &backend, nullptr)) {
+    return Err;
+  }
+
+  switch (backend) {
+  case UR_BACKEND_OPENCL:
+  case UR_BACKEND_LEVEL_ZERO:
+    triple = "spir64";
+    break;
+  case UR_BACKEND_CUDA:
+    triple = "nvptx64-nvidia-cuda";
+    break;
+  case UR_BACKEND_HIP:
+    triple = "amdgcn-amd-amdhsa";
+    break;
+  case UR_BACKEND_OFFLOAD: {
+    // All Offload platforms report this backend, use the platform name to select
+    // the actual underlying backend.
+    std::vector<char> PlatformName;
+    size_t PlatformNameSize = 0;
+    urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, 0, nullptr,
+                      &PlatformNameSize);
+    PlatformName.resize(PlatformNameSize);
+    urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, PlatformNameSize,
+                      PlatformName.data(), nullptr);
+    if (strcmp(PlatformName.data(), "CUDA") == 0) {
+      triple = "nvptx64-nvidia-cuda";
+    } else if (strcmp(PlatformName.data(), "AMDGPU") == 0) {
+      triple = "amdgcn-amd-amdhsa";
+    } else {
+      return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+    }
+    break;
+  }
+  case UR_BACKEND_NATIVE_CPU:
+    return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  default:
+    return UR_RESULT_ERROR_INVALID_ENUMERATION;
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
 ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count) {
   ur_device_partition_property_t desc;
   desc.type = UR_DEVICE_PARTITION_BY_COUNTS;