initial HIP support

rabauke · Oct 18, 2023 · b2fedc0 · b2fedc0
1 parent 9b06232
commit b2fedc0
Show file tree

Hide file tree

Showing 40 changed files with 281 additions and 97 deletions.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -7,6 +7,7 @@ find_package(TBB)
 
 include(CheckLanguage)
 check_language(CUDA)
+check_language(HIP)
 
 include_directories(..)
 link_libraries(trng4::trng4)
@@ -61,3 +62,8 @@ if(CMAKE_CUDA_COMPILER)
   add_executable_and_copy_dlls(pi_leap_cuda pi_leap_cuda.cu)
   set_property(TARGET pi_leap_cuda PROPERTY CUDA_STANDARD 11)
 endif()
+# build the HIP example only if check_language(HIP) found a HIP compiler
+if(CMAKE_HIP_COMPILER)
+  enable_language(HIP)
+  add_executable_and_copy_dlls(pi_block_hip pi_block_hip.hip)
+  # compile the example source explicitly as HIP
+  set_property(SOURCE pi_block_hip.hip PROPERTY LANGUAGE HIP)
+endif()
diff --git a/examples/pi_block_hip.hip b/examples/pi_block_hip.hip
@@ -0,0 +1,72 @@
+// Copyright (c) 2000-2022, Heiko Bauke
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//   * Redistributions of source code must retain the above copyright
+//     notice, this list of conditions and the following disclaimer.
+//
+//   * Redistributions in binary form must reproduce the above
+//     copyright notice, this list of conditions and the following
+//     disclaimer in the documentation and/or other materials provided
+//     with the distribution.
+//
+//   * Neither the name of the copyright holder nor the names of its
+//     contributors may be used to endorse or promote products derived
+//     from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+// COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+// OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+#include <trng/yarn5s.hpp>
+#include <trng/uniform01_dist.hpp>
+#include <hip/hip_runtime.h>
+
+
+// Kernel: every thread counts how many points of its own share of the
+// samples fall into the circle.
+//   samples  total number of points, summed over all threads
+//   in       output array with one entry per thread, in[rank] receives the
+//            number of points of thread `rank` that are in the circle
+//   r        random number engine, passed by value so that every thread
+//            operates on its own copy
+// Only the thread index within the block is evaluated, i.e., the kernel
+// expects to be launched with a single block.
+__global__ void parallel_pi(long samples, long *in, trng::yarn5s r) {
+  const long rank = hipThreadIdx_x;
+  const long size = hipBlockDim_x;
+  const long first = rank * samples / size;       // first sample of this thread
+  const long last = (rank + 1) * samples / size;  // one past its last sample
+  r.jump(2 * first);               // jump ahead, u(r) is called twice per sample
+  trng::uniform01_dist<float> u;   // random number distribution
+  long hits = 0;                   // thread-local number of points in circle
+  for (long i = first; i < last; ++i) {
+    const float x = u(r), y = u(r);  // choose random x- and y-coordinates
+    if (x * x + y * y <= 1)          // is point in circle?
+      ++hits;                        // increase thread-local counter
+  }
+  in[rank] = hits;  // write the result once instead of updating in[rank] per hit
+}
+
+// Estimates pi by a Monte Carlo simulation on the GPU: launches parallel_pi
+// with one block of `size` threads, sums up the per-thread counters on the
+// host and prints the estimate.  Returns EXIT_FAILURE if a HIP runtime call
+// reports an error, EXIT_SUCCESS otherwise.
+int main(int argc, char *argv[]) {
+  (void)argc;  // command line arguments are not used
+  (void)argv;
+  const long samples{1000000l};  // total number of points in square
+  const int size{128};           // number of threads
+  // report a failed HIP runtime call on stderr, yields true on success
+  const auto succeeded = [](hipError_t status, const char *what) -> bool {
+    if (status == hipSuccess)
+      return true;
+    std::cerr << what << " failed: " << hipGetErrorString(status) << '\n';
+    return false;
+  };
+  long *in_device{nullptr};
+  if (!succeeded(hipMalloc(&in_device, size * sizeof(*in_device)), "hipMalloc"))
+    return EXIT_FAILURE;
+  trng::yarn5s r;
+  // start parallel Monte Carlo
+  parallel_pi<<<1, size>>>(samples, in_device, r);
+  const bool launched{succeeded(hipGetLastError(), "kernel launch")};
+  // gather results, the copy is skipped if the kernel could not be launched
+  std::vector<long> in(size);
+  const bool copied{launched &&
+                    succeeded(hipMemcpy(in.data(), in_device, size * sizeof(*in_device),
+                                        hipMemcpyDeviceToHost),
+                              "hipMemcpy")};
+  // release the device buffer on the success and on the failure path
+  const bool freed{succeeded(hipFree(in_device), "hipFree")};
+  if (!(copied && freed))
+    return EXIT_FAILURE;
+  long sum{0};
+  for (int rank{0}; rank < size; ++rank)
+    sum += in[rank];
+  // print result
+  std::cout << "pi = " << 4.0 * sum / samples << std::endl;
+  return EXIT_SUCCESS;
+}
diff --git a/trng/beta_dist.hpp b/trng/beta_dist.hpp
@@ -165,7 +165,7 @@ namespace trng {
       if (x < 0 or x > 1)
         return 0;
       if ((x == 0 and P.alpha() - 1 < 0) or (x == 1 and P.beta() - 1 < 0)) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
         errno = EDOM;
 #endif
         return math::numeric_limits<result_type>::quiet_NaN();
@@ -185,7 +185,7 @@ namespace trng {
     TRNG_CUDA_ENABLE
     result_type icdf(result_type x) const {
       if (x < 0 or x > 1) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
         errno = EDOM;
 #endif
         return math::numeric_limits<result_type>::quiet_NaN();

diff --git a/trng/cauchy_dist.hpp b/trng/cauchy_dist.hpp
@@ -175,7 +175,7 @@ namespace trng {
     TRNG_CUDA_ENABLE
     result_type icdf(result_type x) const {
       if (x <= 0 or x >= 1) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
         errno = EDOM;
 #endif
         return math::numeric_limits<result_type>::quiet_NaN();

diff --git a/trng/chi_square_dist.hpp b/trng/chi_square_dist.hpp
@@ -187,7 +187,7 @@ namespace trng {
     TRNG_CUDA_ENABLE
     result_type icdf(result_type x) const {
       if (x <= 0 or x >= 1) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
         errno = EDOM;
 #endif
         return math::numeric_limits<result_type>::quiet_NaN();

diff --git a/trng/cuda.hpp b/trng/cuda.hpp
@@ -34,13 +34,18 @@
 
 #define TRNG_CUDA_HPP
 
-#if defined __CUDACC__
+#if defined __CUDACC__ && !(defined __HIPCC__)
 
 #define TRNG_CUDA 1
 #define TRNG_CUDA_ENABLE __device__ __host__
 
 #include <cuda.h>
 
+#elif defined __HIPCC__
+
+#define TRNG_CUDA 1
+#define TRNG_CUDA_ENABLE __device__ __host__
+
 #else
 
 #define TRNG_CUDA_ENABLE

diff --git a/trng/exponential_dist.hpp b/trng/exponential_dist.hpp
@@ -150,7 +150,7 @@ namespace trng {
     TRNG_CUDA_ENABLE
     result_type icdf(result_type x) const {
       if (x < 0 or x > 1) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
         errno = EDOM;
 #endif
         return math::numeric_limits<result_type>::quiet_NaN();

diff --git a/trng/extreme_value_dist.hpp b/trng/extreme_value_dist.hpp
@@ -166,7 +166,7 @@ namespace trng {
     TRNG_CUDA_ENABLE
     result_type icdf(result_type x) const {
       if (x <= 0 or x >= 1) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
         errno = EDOM;
 #endif
         return math::numeric_limits<result_type>::quiet_NaN();

diff --git a/trng/gamma_dist.hpp b/trng/gamma_dist.hpp
@@ -193,7 +193,7 @@ namespace trng {
     TRNG_CUDA_ENABLE
     result_type icdf(result_type x) const {
       if (x <= 0 or x >= 1) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
         errno = EDOM;
 #endif
         return math::numeric_limits<result_type>::quiet_NaN();

diff --git a/trng/int_math.hpp b/trng/int_math.hpp
@@ -115,7 +115,7 @@ namespace trng {
 
     TRNG_CUDA_ENABLE
     inline int32_t modulo_inverse(int32_t a, int32_t m) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
       if (a <= 0 or m <= 1)
         utility::throw_this(
             std::invalid_argument("invalid argument in trng::int_math::modulo_inverse"));
@@ -130,7 +130,7 @@ namespace trng {
         f = flast - q * f;
         flast = temp;
       }
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
       if (a == 0)
         utility::throw_this(std::runtime_error("no inverse in trng::int_math::modulo_inverse"));
 #endif
@@ -189,7 +189,7 @@ namespace trng {
         }
       }
       // test if a solution exists
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
       for (int i{rank}; i < n; ++i)
         if (b[p[i]] != 0)
           utility::throw_this(

diff --git a/trng/lcg64.hpp b/trng/lcg64.hpp
@@ -317,7 +317,7 @@ namespace trng {
 
   TRNG_CUDA_ENABLE
   inline void lcg64::split(unsigned int s, unsigned int n) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
     if (s < 1 or n >= s)
       utility::throw_this(std::invalid_argument("invalid argument for trng::lcg64::split"));
 #endif

diff --git a/trng/lcg64_count_shift.hpp b/trng/lcg64_count_shift.hpp
@@ -364,7 +364,7 @@ namespace trng {
 
   TRNG_CUDA_ENABLE
   inline void lcg64_count_shift::split(unsigned int s, unsigned int n) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
     if (s < 1 or n >= s)
       utility::throw_this(
           std::invalid_argument("invalid argument for trng::lcg64_count_shift::split"));

diff --git a/trng/lcg64_shift.hpp b/trng/lcg64_shift.hpp
@@ -323,7 +323,7 @@ namespace trng {
 
   TRNG_CUDA_ENABLE
   inline void lcg64_shift::split(unsigned int s, unsigned int n) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
     if (s < 1 or n >= s)
       utility::throw_this(
           std::invalid_argument("invalid argument for trng::lcg64_shift::split"));

diff --git a/trng/limits.hpp b/trng/limits.hpp
@@ -38,7 +38,7 @@
 #include <cfloat>
 #include <trng/cuda.hpp>
 
-#if defined TRNG_CUDA
+#if defined __CUDACC__ && !(defined __HIPCC__)
 #include <math_constants.h>
 #include <cuda/std/limits>
 #endif
@@ -47,8 +47,51 @@ namespace trng {
 
   namespace math {
 
-#if defined TRNG_CUDA
+#if defined __CUDACC__ && !(defined __HIPCC__)
     using cuda::std::numeric_limits;
+#elif defined __HIPCC__
+    // Replacement for std::numeric_limits in HIP builds.  Every member
+    // forwards to the member of the same name of ::std::numeric_limits<T>.
+    template<typename T>
+    class numeric_limits {
+      using std_limits = ::std::numeric_limits<T>;
+
+    public:
+      // general properties
+      static constexpr bool is_specialized = std_limits::is_specialized;
+      static constexpr bool is_signed = std_limits::is_signed;
+      static constexpr bool is_integer = std_limits::is_integer;
+      static constexpr bool is_exact = std_limits::is_exact;
+      static constexpr bool is_iec559 = std_limits::is_iec559;
+      static constexpr bool is_bounded = std_limits::is_bounded;
+      static constexpr bool is_modulo = std_limits::is_modulo;
+      static constexpr bool traps = std_limits::traps;
+      static constexpr bool tinyness_before = std_limits::tinyness_before;
+      static constexpr ::std::float_round_style round_style = std_limits::round_style;
+
+      // representation
+      static constexpr int radix = std_limits::radix;
+      static constexpr int digits = std_limits::digits;
+      static constexpr int digits10 = std_limits::digits10;
+      static constexpr int min_exponent = std_limits::min_exponent;
+      static constexpr int min_exponent10 = std_limits::min_exponent10;
+      static constexpr int max_exponent = std_limits::max_exponent;
+      static constexpr int max_exponent10 = std_limits::max_exponent10;
+
+      // special values supported by T
+      static constexpr bool has_infinity = std_limits::has_infinity;
+      static constexpr bool has_quiet_NaN = std_limits::has_quiet_NaN;
+      static constexpr bool has_signaling_NaN = std_limits::has_signaling_NaN;
+      static constexpr ::std::float_denorm_style has_denorm = std_limits::has_denorm;
+      static constexpr bool has_denorm_loss = std_limits::has_denorm_loss;
+
+      // characteristic values
+      static constexpr T min() noexcept {
+        return std_limits::min();
+      }
+      static constexpr T max() noexcept {
+        return std_limits::max();
+      }
+      static constexpr T epsilon() noexcept {
+        return std_limits::epsilon();
+      }
+      static constexpr T round_error() noexcept {
+        return std_limits::round_error();
+      }
+      static constexpr T infinity() noexcept {
+        return std_limits::infinity();
+      }
+      static constexpr T quiet_NaN() noexcept {
+        return std_limits::quiet_NaN();
+      }
+      static constexpr T signaling_NaN() noexcept {
+        return std_limits::signaling_NaN();
+      }
+      static constexpr T denorm_min() noexcept {
+        return std_limits::denorm_min();
+      }
+    };
 #else
     using std::numeric_limits;
 #endif

diff --git a/trng/logistic_dist.hpp b/trng/logistic_dist.hpp
@@ -171,7 +171,7 @@ namespace trng {
     TRNG_CUDA_ENABLE
     result_type icdf(result_type x) const {
       if (x < 0 or x > 1) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
         errno = EDOM;
 #endif
         return math::numeric_limits<result_type>::quiet_NaN();

diff --git a/trng/lognormal_dist.hpp b/trng/lognormal_dist.hpp
@@ -172,7 +172,7 @@ namespace trng {
     TRNG_CUDA_ENABLE
     result_type icdf(result_type x) const {
       if (x < 0 or x > 1) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
         errno = EDOM;
 #endif
         return math::numeric_limits<result_type>::quiet_NaN();

diff --git a/trng/maxwell_dist.hpp b/trng/maxwell_dist.hpp
@@ -157,7 +157,7 @@ namespace trng {
     TRNG_CUDA_ENABLE
     result_type icdf(result_type x) const {
       if (x < 0 or x > 1) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
         errno = EDOM;
 #endif
         return math::numeric_limits<result_type>::quiet_NaN();

diff --git a/trng/mrg2.hpp b/trng/mrg2.hpp
@@ -175,7 +175,7 @@ namespace trng {
   // Parallel random number generator concept
   TRNG_CUDA_ENABLE
   inline void mrg2::split(unsigned int s, unsigned int n) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
     if (s < 1 or n >= s)
       utility::throw_this(std::invalid_argument("invalid argument for trng::mrg2::split"));
 #endif

diff --git a/trng/mrg3.hpp b/trng/mrg3.hpp
@@ -180,7 +180,7 @@ namespace trng {
   // Parallel random number generator concept
   TRNG_CUDA_ENABLE
   inline void mrg3::split(unsigned int s, unsigned int n) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
     if (s < 1 or n >= s)
       utility::throw_this(std::invalid_argument("invalid argument for trng::mrg3::split"));
 #endif

diff --git a/trng/mrg3s.hpp b/trng/mrg3s.hpp
@@ -179,7 +179,7 @@ namespace trng {
   // Parallel random number generator concept
   TRNG_CUDA_ENABLE
   inline void mrg3s::split(unsigned int s, unsigned int n) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
     if (s < 1 or n >= s)
       utility::throw_this(std::invalid_argument("invalid argument for trng::mrg3s::split"));
 #endif

diff --git a/trng/mrg4.hpp b/trng/mrg4.hpp
@@ -185,7 +185,7 @@ namespace trng {
   // Parallel random number generator concept
   TRNG_CUDA_ENABLE
   inline void mrg4::split(unsigned int s, unsigned int n) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
     if (s < 1 or n >= s)
       utility::throw_this(std::invalid_argument("invalid argument for trng::mrg4::split"));
 #endif

diff --git a/trng/mrg5.hpp b/trng/mrg5.hpp
@@ -190,7 +190,7 @@ namespace trng {
   // Parallel random number generator concept
   TRNG_CUDA_ENABLE
   inline void mrg5::split(unsigned int s, unsigned int n) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
     if (s < 1 or n >= s)
       utility::throw_this(std::invalid_argument("invalid argument for trng::mrg5::split"));
 #endif

diff --git a/trng/mrg5s.hpp b/trng/mrg5s.hpp
@@ -191,7 +191,7 @@ namespace trng {
   // Parallel random number generator concept
   TRNG_CUDA_ENABLE
   inline void mrg5s::split(unsigned int s, unsigned int n) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
     if (s < 1 or n >= s)
       utility::throw_this(std::invalid_argument("invalid argument for trng::mrg5s::split"));
 #endif

diff --git a/trng/pareto_dist.hpp b/trng/pareto_dist.hpp
@@ -166,7 +166,7 @@ namespace trng {
     // inverse cumulative density function
     result_type icdf(result_type x) const {
       if (x <= 0 or x >= 1) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
         errno = EDOM;
 #endif
         return math::numeric_limits<result_type>::quiet_NaN();

diff --git a/trng/powerlaw_dist.hpp b/trng/powerlaw_dist.hpp
@@ -168,7 +168,7 @@ namespace trng {
     TRNG_CUDA_ENABLE
     result_type icdf(result_type x) const {
       if (x <= 0 or x >= 1) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
         errno = EDOM;
 #endif
         return math::numeric_limits<result_type>::quiet_NaN();

diff --git a/trng/rayleigh_dist.hpp b/trng/rayleigh_dist.hpp
@@ -159,7 +159,7 @@ namespace trng {
     TRNG_CUDA_ENABLE
     result_type icdf(result_type x) const {
       if (x < 0 or x > 1) {
-#if !(defined __CUDA_ARCH__)
+#if !(defined TRNG_CUDA)
+        // errno is only set if TRNG_CUDA is not defined
         errno = EDOM;
 #endif
         return math::numeric_limits<result_type>::quiet_NaN();