From 085e2f7bddc45f859fcdb786926d60d709b2daa0 Mon Sep 17 00:00:00 2001
From: Pruthvi Madugundu
Date: Wed, 29 Sep 2021 09:53:51 -0700
Subject: [PATCH] [ROCm] Changes not to rely on CUDA_VERSION or HIP_VERSION (#65610)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/65610

- Replace HIP_PLATFORM_HCC with USE_ROCM.
- Don't rely on CUDA_VERSION or HIP_VERSION; use USE_ROCM and ROCM_VERSION instead.
- In the next PR:
  - Remove the mapping from CUDA_VERSION to HIP_VERSION and from CUDA to HIP in hipify.
  - HIP_PLATFORM_HCC is deprecated, so add HIP_PLATFORM_AMD to support HIP host code compilation with gcc.

cc jeffdaily sunway513 jithunnair-amd ROCmSupport amathews-amd

Reviewed By: jbschlosser

Differential Revision: D30909053

Pulled By: ezyang

fbshipit-source-id: 224a966ebf1aaec79beccbbd686fdf3d49267e06
---
aten/src/ATen/Dispatch.h | 4 +- aten/src/ATen/core/Array.h | 2 +- aten/src/ATen/cuda/Atomic.cuh | 10 ++--- aten/src/ATen/cuda/CUDAApplyUtils.cuh | 4 +- aten/src/ATen/cuda/CUDABlas.cpp | 44 +++++++++---------- aten/src/ATen/cuda/CUDABlas.h | 12 ++--- aten/src/ATen/cuda/CUDAEvent.h | 4 +- aten/src/ATen/cuda/CUDAGraph.cpp | 28 ++++++------ aten/src/ATen/cuda/CUDAGraph.h | 2 +- aten/src/ATen/cuda/CublasHandlePool.cpp | 4 +- aten/src/ATen/cuda/DeviceUtils.cuh | 20 ++++----- aten/src/ATen/cuda/Exceptions.h | 2 +- aten/src/ATen/cuda/cub.cuh | 4 +- aten/src/ATen/cuda/detail/CUDAHooks.cpp | 12 ++--- aten/src/ATen/cuda/detail/LazyNVRTC.cpp | 2 +- .../src/ATen/cuda/detail/OffsetCalculator.cuh | 2 +- aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h | 4 +- aten/src/ATen/native/ForeachUtils.h | 4 +- aten/src/ATen/native/cuda/Activation.cu | 2 +- aten/src/ATen/native/cuda/CuFFTPlanCache.h | 14 +++--- aten/src/ATen/native/cuda/CuFFTUtils.h | 2 +- aten/src/ATen/native/cuda/DepthwiseConv2d.cu | 10 ++--- aten/src/ATen/native/cuda/DeviceSqrt.cuh | 2 +- aten/src/ATen/native/cuda/DilatedMaxPool2d.cu | 2 +- aten/src/ATen/native/cuda/Distributions.cu | 2 +- aten/src/ATen/native/cuda/Dropout.cu | 4 +- aten/src/ATen/native/cuda/Embedding.cu | 6 +-- aten/src/ATen/native/cuda/EmbeddingBag.cu | 4 +- aten/src/ATen/native/cuda/Indexing.cu | 2 +- aten/src/ATen/native/cuda/KernelUtils.cuh | 5 ++- aten/src/ATen/native/cuda/Loops.cuh | 2 +- aten/src/ATen/native/cuda/LossCTC.cu | 8 ++-- aten/src/ATen/native/cuda/MiscUtils.h | 6 +-- .../native/cuda/NaiveDilatedConvolution.cu | 2 +- aten/src/ATen/native/cuda/Normalization.cuh | 4 +- aten/src/ATen/native/cuda/RNN.cu | 8 ++-- aten/src/ATen/native/cuda/Shape.cu | 4 +- aten/src/ATen/native/cuda/SoftMax.cu | 4 +- aten/src/ATen/native/cuda/Sort.cu | 2 +- aten/src/ATen/native/cuda/SortUtils.cuh | 6 +-- aten/src/ATen/native/cuda/SortingCommon.cuh | 2 +- .../ATen/native/cuda/SortingRadixSelect.cuh | 6 +-- aten/src/ATen/native/cuda/SpectralOps.cpp | 2 +- aten/src/ATen/native/cuda/SummaryOps.cu | 2 +- aten/src/ATen/native/cuda/TensorFactories.cu | 2 +- .../src/ATen/native/cuda/TensorModeKernel.cuh | 6 +-- aten/src/ATen/native/cuda/TensorTopK.cu | 2 +- .../ATen/native/cuda/TensorTransformations.cu | 2 +- aten/src/ATen/native/cuda/UnaryOpsKernel.cu | 2 +- aten/src/ATen/native/cuda/WeightNorm.cu | 4 +- aten/src/ATen/native/cudnn/RNN.cpp | 6 +-- aten/src/ATen/native/sparse/cuda/SoftMax.cu | 2 +- .../sparse/cuda/SparseCUDAApplyUtils.cuh | 12 ++--- .../sparse/cuda/SparseCUDATensorMath.cu | 6 +-- aten/src/ATen/test/cuda_vectorized_test.cu | 2 +- aten/src/THC/THCAsmUtils.cuh | 18 ++++---- aten/src/THC/THCDeviceTensor-inl.cuh | 8 ++-- aten/src/THC/THCGeneral.cpp | 2 +-
aten/src/THC/THCGeneral.h.in | 2 +- aten/src/THC/THCScanUtils.cuh | 2 +- aten/src/THC/THCStorage.cu | 2 +- aten/src/THC/generic/THCStorage.cu | 2 +- c10/cuda/CUDAGraphsC10Utils.h | 2 +- c10/macros/Macros.h | 6 +-- c10/test/util/complex_test_common.h | 4 +- c10/test/util/exception_test.cpp | 2 +- c10/util/BFloat16.h | 8 ++-- c10/util/C++17.h | 2 +- c10/util/Half.h | 2 +- c10/util/complex.h | 2 +- caffe2/contrib/aten/aten_op_template.h | 4 +- caffe2/core/common_gpu.cc | 10 ++--- caffe2/core/common_gpu.h | 25 ++++++----- caffe2/core/macros.h.in | 2 + .../distributed/file_store_handler_op_gpu.cc | 4 +- .../distributed/redis_store_handler_op_gpu.cc | 4 +- caffe2/operators/batch_matmul_op.cu | 2 +- caffe2/operators/channel_stats_op.cu | 4 +- caffe2/operators/fully_connected_op_gpu.cc | 4 +- caffe2/operators/generate_proposals_op.cu | 2 +- .../generate_proposals_op_util_nms_gpu.cu | 2 +- caffe2/operators/group_norm_op.cu | 20 ++++----- caffe2/operators/instance_norm_op.cu | 12 ++--- caffe2/operators/minmax_ops.cu | 2 +- caffe2/operators/reduce_front_back_max_ops.cu | 2 +- caffe2/operators/rmac_regions_op.cu | 4 +- caffe2/operators/segment_reduction_op_gpu.cu | 14 +++--- caffe2/operators/segment_reduction_op_gpu.cuh | 4 +- caffe2/operators/tile_op.cu | 2 +- caffe2/operators/top_k_heap_selection.cuh | 10 ++--- caffe2/operators/top_k_radix_selection.cuh | 4 +- caffe2/python/pybind_state.cc | 8 ++-- caffe2/sgd/adagrad_fused_op_gpu.cu | 14 +++--- caffe2/sgd/adagrad_fused_op_gpu.cuh | 11 ++--- caffe2/sgd/fp16_momentum_sgd_op.cu | 4 +- caffe2/utils/GpuAtomics.cuh | 2 +- caffe2/utils/GpuDefs.cuh | 26 +++++------ caffe2/utils/GpuScanUtils.cuh | 8 ++-- caffe2/utils/fixed_divisor.h | 22 +++++----- caffe2/utils/fixed_divisor_test.cc | 2 +- caffe2/utils/math/broadcast.cu | 4 +- caffe2/utils/math/elementwise.cu | 20 ++++----- caffe2/utils/math/reduce.cu | 16 +++---- caffe2/utils/math/transpose.cu | 4 +- caffe2/utils/math_gpu.cu | 44 +++++++++---------- cmake/Summary.cmake | 3 ++ torch/csrc/CudaIPCTypes.cpp | 4 +- torch/csrc/Module.cpp | 4 +- torch/csrc/autograd/engine.cpp | 2 +- torch/csrc/autograd/profiler_legacy.cpp | 6 +-- torch/csrc/cuda/Module.cpp | 8 +++- torch/csrc/cuda/nccl.cpp | 2 +- torch/csrc/cuda/shared/cudart.cpp | 6 +-- torch/csrc/cuda/shared/cudnn.cpp | 4 +- torch/csrc/distributed/c10d/NCCLUtils.hpp | 2 +- .../csrc/distributed/rpc/tensorpipe_cuda.cpp | 2 +- torch/csrc/generic/StorageSharing.cpp | 4 +- torch/csrc/jit/codegen/cuda/codegen.cpp | 2 +- torch/csrc/jit/codegen/cuda/executor.cpp | 2 +- .../csrc/jit/codegen/cuda/executor_utils.cpp | 12 ++--- torch/csrc/jit/codegen/fuser/codegen.cpp | 2 +- .../jit/codegen/fuser/cuda/fused_kernel.cpp | 8 ++-- .../jit/codegen/fuser/cuda/resource_strings.h | 6 +-- torch/csrc/jit/ir/ir.cpp | 2 +- torch/csrc/jit/ir/ir.h | 2 +- torch/csrc/jit/python/init.cpp | 4 +- .../csrc/jit/python/python_sugared_value.cpp | 2 +- torch/csrc/jit/python/python_sugared_value.h | 2 +- torch/csrc/jit/runtime/register_cuda_ops.cpp | 2 +- torch/csrc/jit/tensorexpr/cuda_codegen.cpp | 10 ++--- torch/utils/cpp_extension.py | 1 + 131 files changed, 415 insertions(+), 398 deletions(-) diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index cd9dd8b8cdf10e..8b292e0d9c2e09 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -76,11 +76,11 @@ TORCH_API void record_kernel_function_dtype(std::string name); // Workaround for C10_UNUSED because CUDA 10.1 and below fails to handle unused // attribute in the type aliasing context. 
Keep name long and verbose to avoid // macro collisions. -#if defined(__CUDACC__) && CUDA_VERSION <= 10100 +#if defined(__CUDACC__) && defined(CUDA_VERSION) && CUDA_VERSION <= 10100 #define C10_UNUSED_DISPATCH_CUDA_WORKAROUND #else #define C10_UNUSED_DISPATCH_CUDA_WORKAROUND C10_UNUSED -#endif // defined(__CUDACC__) && CUDA_VERSION <= 10100 +#endif // defined(__CUDACC__) && defined(CUDA_VERSION) && CUDA_VERSION <= 10100 #if defined __cpp_if_constexpr #define AT_QINT_PRIVATE_CASE_TYPE( \ diff --git a/aten/src/ATen/core/Array.h b/aten/src/ATen/core/Array.h index 479ed4ba09a1a7..6e0fce606efc80 100644 --- a/aten/src/ATen/core/Array.h +++ b/aten/src/ATen/core/Array.h @@ -17,7 +17,7 @@ struct Array { C10_HOST_DEVICE T& operator[](int i) { return data[i]; } -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_HOST_DEVICE Array() = default; C10_HOST_DEVICE Array(const Array&) = default; C10_HOST_DEVICE Array& operator=(const Array&) = default; diff --git a/aten/src/ATen/cuda/Atomic.cuh b/aten/src/ATen/cuda/Atomic.cuh index c93ba63f1d4129..cd002414687a34 100644 --- a/aten/src/ATen/cuda/Atomic.cuh +++ b/aten/src/ATen/cuda/Atomic.cuh @@ -167,7 +167,7 @@ static inline __device__ int32_t gpuAtomicAdd(int32_t *address, int32_t val) { } static inline __device__ void gpuAtomicAdd(int64_t *address, int64_t val) { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) __atomic_fetch_add(address, val, __ATOMIC_RELAXED); #else AtomicAddIntegerImpl()(address, val); @@ -179,7 +179,7 @@ static inline __device__ void gpuAtomicAdd(bool *address, bool val) { } static inline __device__ at::Half gpuAtomicAdd(at::Half *address, at::Half val) { -#if ((CUDA_VERSION < 10000) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) +#if defined(USE_ROCM) || ((defined(CUDA_VERSION) && CUDA_VERSION < 10000) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) return AtomicFPOp()(address, val, [](at::Half hsum, at::Half val) { return hsum + val; @@ -196,7 +196,7 @@ static inline __device__ at::BFloat16 gpuAtomicAdd(at::BFloat16 *address, at::BF }); } -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000) +#if defined(CUDA_VERSION) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000) // from CUDA C Programmic Guide static inline __device__ double atomicAdd(double* address, double val) #if defined(__clang__) && defined(__CUDA__) @@ -212,7 +212,7 @@ static inline __device__ double atomicAdd(double* address, double val) return __double_as_longlong(val + __longlong_as_double(assumed)); }); } -#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) || defined(__HIP_PLATFORM_HCC__) +#elif defined(USE_ROCM) || !(defined(__CUDA_ARCH__) && (defined(CUDA_VERSION) && CUDA_VERSION < 8000)) /* Note [hip-clang differences to hcc] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -298,7 +298,7 @@ static inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BF static inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); } /* Special case fp32 atomic. 
*/ -#if defined(__HIP_PLATFORM_HCC__) && defined(__gfx908__) +#if defined(USE_ROCM) && defined(__gfx908__) static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { atomicAddNoRet(address, val); } #else static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); } diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index a89d81f757cc3e..4e6efbf8564853 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -274,7 +274,7 @@ template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) #endif __global__ void kernelPointwiseApply1(detail::TensorInfo a, @@ -360,7 +360,7 @@ template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) #endif __global__ void diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 70c3dda6f3401f..ab542cb3bdab04 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -133,7 +133,7 @@ const char* _cublasGetErrorEnum(cublasStatus_t error) { /* LEVEL 3 BLAS FUNCTIONS */ -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM #if defined(CUDA_VERSION) && CUDA_VERSION >= 11200 #define cublasGemmStridedBatchedExFix cublasGemmStridedBatchedEx #else @@ -271,7 +271,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { BGEMM_CHECK_ARGVALUES(at::Half); float falpha = alpha; float fbeta = beta; -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM TORCH_CUDABLAS_CHECK(rocblas_gemm_strided_batched_ex(handle, opa, opb, (int)m, (int)n, (int)k, (void*)&falpha, a, rocblas_datatype_f16_r, (int)lda, stridea, b, rocblas_datatype_f16_r, (int)ldb, strideb, @@ -284,7 +284,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { // On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH // manually to be able to use tensor cores for FP16. On CUDA 11, this is no longer required. TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); - #endif // CUDA_VERSION < 11000 + #endif // defined(CUDA_VERSION) && CUDA_VERSION < 11000 cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major >= 5){ @@ -308,11 +308,11 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { // On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH // manually to be able to use tensor cores for FP16. On CUDA 11, this is no longer required. 
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); - #endif // CUDA_VERSION < 11000 -#endif // __HIP_PLATFORM_HCC__ + #endif // defined(CUDA_VERSION) && CUDA_VERSION < 11000 +#endif // USE_ROCM } -#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +#if defined(USE_ROCM) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { // See Note [Writing Nondeterministic Operations] @@ -332,7 +332,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { b, CUDA_R_16BF, (int)ldb, strideb, (void*)&fbeta, c, CUDA_R_16BF, (int)ldc, stridec, (int)num_batches, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - #elif defined(__HIP_PLATFORM_HCC__) + #elif defined(USE_ROCM) TORCH_CUDABLAS_CHECK(rocblas_gemm_strided_batched_ex(handle, opa, opb, (int)m, (int)n, (int)k, (void*)&falpha, a, rocblas_datatype_bf16_r, (int)lda, stridea, b, rocblas_datatype_bf16_r, (int)ldb, strideb, @@ -344,7 +344,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { TORCH_CHECK(false, "CUDA BFloat16 bgemm requires CUDA 11 or later"); #endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000 } -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM template <> void gemm(CUDABLAS_GEMM_ARGTYPES(double)) { @@ -372,7 +372,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)); } -#if !defined(__HIP_PLATFORM_HCC__) || (defined(__HIP_PLATFORM_HCC__) && TORCH_HIP_VERSION >= 210) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 21000) template <> void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] @@ -389,7 +389,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { } #endif -#if !defined(__HIP_PLATFORM_HCC__) || (defined(__HIP_PLATFORM_HCC__) && TORCH_HIP_VERSION >= 210) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 21000) template <> void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] @@ -417,7 +417,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { float fbeta = beta; _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); GEMM_CHECK_ARGVALUES(at::Half); -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM TORCH_CUDABLAS_CHECK(rocblas_gemm_ex( handle, opa, @@ -450,7 +450,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { // On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH // manually to be able to use tensor cores for FP16. On CUDA 11, this is no longer required. TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); -#endif // CUDA_VERSION < 11000 +#endif // defined(CUDA_VERSION) && CUDA_VERSION < 11000 TORCH_CUDABLAS_CHECK(cublasGemmEx( handle, opa, @@ -475,7 +475,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { // On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH // manually to be able to use tensor cores for FP16. On CUDA 11, this is no longer required. 
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); -#endif // CUDA_VERSION < 11000 +#endif // defined(CUDA_VERSION) && CUDA_VERSION < 11000 } else { TORCH_CUDABLAS_CHECK(cublasSgemmEx( handle, @@ -499,7 +499,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { #endif } -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -569,7 +569,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { CUDA_R_32F, CUBLAS_GEMM_DFALT_TENSOR_OP)); } -#endif +#endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000 template <> void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { @@ -702,7 +702,7 @@ void trsmBatched>( CUDABLAS_POSINT_CHECK(gemv, incy); \ } while (0) -#if !defined(__HIP_PLATFORM_HCC__) || (defined(__HIP_PLATFORM_HCC__) && TORCH_HIP_VERSION >= 210) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 21000) template <> void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] @@ -718,7 +718,7 @@ void trsmBatched>( } #endif -#if !defined(__HIP_PLATFORM_HCC__) || (defined(__HIP_PLATFORM_HCC__) && TORCH_HIP_VERSION >= 210) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 21000) template <> void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { // gemv is bw bound, and does not benefit from TF32. But the precision @@ -797,7 +797,7 @@ void gemv(CUDABLAS_GEMV_ARGTYPES(at::Half)) { 'n', trans_flipped, 1, m, n, alpha, x, incx, a, lda, beta, y, incy); } -#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +#if defined(USE_ROCM) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 template <> void gemv(CUDABLAS_GEMV_ARGTYPES(at::BFloat16)) { bool trans_bool = (_cublasOpFromChar(trans) != CUBLAS_OP_N); @@ -838,7 +838,7 @@ void dot>(CUDABLAS_DOT_ARGTYPES(c10::complex)) { template <> void dot(CUDABLAS_DOT_ARGTYPES(at::Half)) { -#if CUDA_VERSION >= 8000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 8000 TORCH_CUDABLAS_CHECK(cublasDotEx( handle, n, @@ -851,7 +851,7 @@ void dot(CUDABLAS_DOT_ARGTYPES(at::Half)) { result, CUDA_R_16F, CUDA_R_32F)); -#elif TORCH_HIP_VERSION >= 210 +#elif defined(ROCM_VERSION) && ROCM_VERSION >= 21000 TORCH_CUDABLAS_CHECK(rocblas_hdot( handle, n, @@ -867,7 +867,7 @@ void dot(CUDABLAS_DOT_ARGTYPES(at::Half)) { template <> void dot(CUDABLAS_DOT_ARGTYPES(at::BFloat16)) { -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 TORCH_CUDABLAS_CHECK(cublasDotEx( handle, n, @@ -880,7 +880,7 @@ void dot(CUDABLAS_DOT_ARGTYPES(at::BFloat16)) { result, CUDA_R_16BF, CUDA_R_32F)); -#elif TORCH_HIP_VERSION >= 210 +#elif defined(ROCM_VERSION) && ROCM_VERSION >= 21000 TORCH_CUDABLAS_CHECK(rocblas_bfdot( handle, n, diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index 77450eaf9d26c3..225ea9f296e114 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -54,17 +54,17 @@ template <> void gemm(CUDABLAS_GEMM_ARGTYPES(double)); template <> void gemm(CUDABLAS_GEMM_ARGTYPES(float)); -#if !defined(__HIP_PLATFORM_HCC__) || (defined(__HIP_PLATFORM_HCC__) && TORCH_HIP_VERSION >= 210) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 21000) template <> void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)); #endif -#if !defined(__HIP_PLATFORM_HCC__) || (defined(__HIP_PLATFORM_HCC__) && TORCH_HIP_VERSION >= 210) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 21000) template <> void 
gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)); #endif template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)); -#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +#if defined(USE_ROCM) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); #endif @@ -90,7 +90,7 @@ template <> void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)); template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)); -#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +#if defined(USE_ROCM) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); #endif @@ -152,7 +152,7 @@ template <> void gemv(CUDABLAS_GEMV_ARGTYPES(double)); template <> void gemv(CUDABLAS_GEMV_ARGTYPES(float)); -#if !defined(__HIP_PLATFORM_HCC__) || (defined(__HIP_PLATFORM_HCC__) && TORCH_HIP_VERSION >= 210) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 21000) template <> void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)); template <> @@ -160,7 +160,7 @@ void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)); #endif template <> void gemv(CUDABLAS_GEMV_ARGTYPES(at::Half)); -#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +#if defined(USE_ROCM) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 template <> void gemv(CUDABLAS_GEMV_ARGTYPES(at::BFloat16)); #endif diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h index 1fb97dd82f4b3a..deaebd3583d670 100644 --- a/aten/src/ATen/cuda/CUDAEvent.h +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -32,7 +32,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent { CUDAEvent( DeviceIndex device_index, const cudaIpcEventHandle_t* handle) { - #ifndef __HIP_PLATFORM_HCC__ + #if !defined(USE_ROCM) device_index_ = device_index; CUDAGuard guard(device_index_); @@ -148,7 +148,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent { // Note: cudaIpcGetEventHandle must be called on the same device as the event void ipc_handle(cudaIpcEventHandle_t * handle) { - #ifndef __HIP_PLATFORM_HCC__ + #if !defined(USE_ROCM) if (!is_created_) { // this CUDAEvent object was initially constructed from flags but event_ // is not created yet. diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index 5c64dc1fd05ebd..8a25f3841356f6 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -9,14 +9,14 @@ namespace at { namespace cuda { MempoolId_t graph_pool_handle() { -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 // uuid count starts at 1. 0 is reserved to mean "wasn't set by graph_pool_handle". static std::atomic uuid{1}; // Sets just the second value, to distinguish it from MempoolId_ts created from // cudaStreamGetCaptureInfo id_s in capture_begin. return {0, uuid++}; #else - TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0"); + TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM"); return {0, 0}; #endif } @@ -45,13 +45,13 @@ MempoolId_t graph_pool_handle() { CUDAGraph::CUDAGraph() // CUDAStreams may not be default-constructed. 
: capture_stream_(at::cuda::getCurrentCUDAStream()) { -#if CUDA_VERSION < 11000 - TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0"); +#if (defined(CUDA_VERSION) && CUDA_VERSION < 11000) || defined(USE_ROCM) + TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM"); #endif } void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/) { -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 TORCH_CHECK(!has_graph_exec_, "This CUDAGraph instance already owns a captured graph. " "To capture a new graph, create a new instance."); @@ -120,12 +120,12 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/) { // kernel will end up as part of the capture or not. c10::cuda::CUDACachingAllocator::notifyCaptureBegin(capture_dev_, id_, mempool_id_); #else - TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0"); + TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM"); #endif } void CUDAGraph::capture_end() { -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 auto stream = at::cuda::getCurrentCUDAStream(); TORCH_CHECK(stream == capture_stream_, @@ -156,12 +156,12 @@ void CUDAGraph::capture_end() { AT_CUDA_CHECK(cudaGraphDestroy(graph_)); has_graph_ = false; #else - TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0"); + TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM"); #endif } void CUDAGraph::replay() { -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 TORCH_CHECK(has_graph_exec_, "Called CUDAGraph::replay without a preceding successful capture."); @@ -190,12 +190,12 @@ void CUDAGraph::replay() { cudaDeviceSynchronize(); } #else - TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0"); + TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM"); #endif } void CUDAGraph::reset() { -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 // I'd prefer these checks throw exceptions, not print warnings, // but the destructor calls reset(), and at least one CI build // refuses to compile with a throwing destructor. @@ -226,17 +226,17 @@ void CUDAGraph::reset() { C10_CUDA_CHECK_WARN(cudaGraphExecDestroy(graph_exec_)); } #else - TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0"); + TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM"); #endif } // Returns an id another graph's capture_begin can use to share the same memory pool as this graph. 
MempoolId_t CUDAGraph::pool() { -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 TORCH_CHECK(has_graph_exec_, "Called CUDAGraph::pool() without a preceding successful capture."); #else - TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0"); + TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM"); #endif return mempool_id_; } diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h index d8295833b269b7..09b0b7b5d8004d 100644 --- a/aten/src/ATen/cuda/CUDAGraph.h +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -26,7 +26,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph { MempoolId_t pool(); protected: -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 cudaGraph_t graph_ = NULL; cudaGraphExec_t graph_exec_ = NULL; #endif diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 202e97fbeed245..08fa4e4904c904 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -47,7 +47,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { auto handle = myPoolWindow->reserve(device); auto stream = c10::cuda::getCurrentCUDAStream(); TORCH_CUDABLAS_CHECK(cublasSetStream(handle, stream)); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup // FP32 data type calculations based on the value of the allow_tf32 flag. // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH. @@ -57,7 +57,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); } #endif -#if defined(__HIP_PLATFORM_HCC__) && TORCH_HIP_VERSION >= 308 +#if defined(USE_ROCM) && ROCM_VERSION >= 30800 rocblas_atomics_mode rocblas_mode; if (at::globalContext().deterministicAlgorithms()) { rocblas_mode = rocblas_atomics_not_allowed; diff --git a/aten/src/ATen/cuda/DeviceUtils.cuh b/aten/src/ATen/cuda/DeviceUtils.cuh index 1286ce32b3eab0..dc17aa80ca84b1 100644 --- a/aten/src/ATen/cuda/DeviceUtils.cuh +++ b/aten/src/ATen/cuda/DeviceUtils.cuh @@ -6,7 +6,7 @@ __device__ __forceinline__ unsigned int ACTIVE_MASK() { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) return __activemask(); #else // will be ignored anyway @@ -14,7 +14,7 @@ __device__ __forceinline__ unsigned int ACTIVE_MASK() #endif } -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) __device__ __forceinline__ unsigned long long int WARP_BALLOT(int predicate) { return __ballot(predicate); @@ -22,7 +22,7 @@ return __ballot(predicate); #else __device__ __forceinline__ unsigned int WARP_BALLOT(int predicate, unsigned int mask = 0xffffffff) { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) return __ballot_sync(mask, predicate); #else return __ballot(predicate); @@ -33,7 +33,7 @@ __device__ __forceinline__ unsigned int WARP_BALLOT(int predicate, unsigned int template __device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) return __shfl_xor_sync(mask, value, laneMask, width); #else return __shfl_xor(value, laneMask, width); @@ -43,7 +43,7 @@ __device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = wa template __device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = warpSize, unsigned int mask = 0xffffffff) { 
-#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) return __shfl_sync(mask, value, srcLane, width); #else return __shfl(value, srcLane, width); @@ -53,7 +53,7 @@ __device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = warpSiz template __device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) return __shfl_up_sync(mask, value, delta, width); #else return __shfl_up(value, delta, width); @@ -63,14 +63,14 @@ __device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width template __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) return __shfl_down_sync(mask, value, delta, width); #else return __shfl_down(value, delta, width); #endif } -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) template<> __device__ __forceinline__ int64_t WARP_SHFL_DOWN(int64_t value, unsigned int delta, int width , unsigned int mask) { @@ -91,7 +91,7 @@ __device__ __forceinline__ c10::Half WARP_SHFL_DOWN(c10::Half value, template __device__ __forceinline__ c10::complex WARP_SHFL_DOWN(c10::complex value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) return c10::complex( __shfl_down_sync(mask, value.real_, delta, width), __shfl_down_sync(mask, value.imag_, delta, width)); @@ -107,7 +107,7 @@ __device__ __forceinline__ c10::complex WARP_SHFL_DOWN(c10::complex value, */ template __device__ __forceinline__ T doLdg(const T* p) { -#if __CUDA_ARCH__ >= 350 && !defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 && !defined(USE_ROCM) return __ldg(p); #else return *p; diff --git a/aten/src/ATen/cuda/Exceptions.h b/aten/src/ATen/cuda/Exceptions.h index 1414e319656bd3..781829dc61ce37 100644 --- a/aten/src/ATen/cuda/Exceptions.h +++ b/aten/src/ATen/cuda/Exceptions.h @@ -89,7 +89,7 @@ const char* cusolverGetErrorMessage(cusolverStatus_t status); // This is here instead of in c10 because NVRTC is loaded dynamically via a stub // in ATen, and we need to use its nvrtcGetErrorString. // See NOTE [ USE OF NVRTC AND DRIVER API ]. 
-#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #define AT_CUDA_DRIVER_CHECK(EXPR) \ do { \ diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 26f804768e42c1..5d8ae777ebef71 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -29,7 +29,7 @@ AT_CUDA_CHECK(cudaGetLastError()); \ } while (false) -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #define NO_ROCM(x) #else #define NO_ROCM(x) x @@ -67,7 +67,7 @@ struct cuda_type { using type = __nv_bfloat16; }; -#elif !defined(__HIP_PLATFORM_HCC__) +#elif !defined(USE_ROCM) // backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16 diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 3c58a0fa85884a..e61b1f5e9cdecf 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -25,7 +25,7 @@ #include #endif -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) #include #endif @@ -93,7 +93,7 @@ bool CUDAHooks::isPinnedPtr(void* data) const { } cudaPointerAttributes attr; cudaError_t err = cudaPointerGetAttributes(&attr, data); -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) if (err == cudaErrorInvalidValue) { cudaGetLastError(); return false; @@ -106,7 +106,7 @@ bool CUDAHooks::isPinnedPtr(void* data) const { return false; } #endif -#if CUDA_VERSION >= 10000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000 return attr.type == cudaMemoryTypeHost; #else return attr.memoryType == cudaMemoryTypeHost; @@ -287,7 +287,7 @@ std::string CUDAHooks::showConfig() const { } }; -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) oss << " - CUDA Runtime "; #else oss << " - HIP Runtime "; @@ -296,7 +296,7 @@ std::string CUDAHooks::showConfig() const { oss << "\n"; // TODO: Make HIPIFY understand CUDART_VERSION macro -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) if (runtimeVersion != CUDART_VERSION) { oss << " - Built with CUDA Runtime "; printCudaStyleVersion(CUDART_VERSION); @@ -305,7 +305,7 @@ std::string CUDAHooks::showConfig() const { oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << "\n"; #endif -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #if AT_CUDNN_ENABLED() diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp index efdca848386d05..704001200d227d 100644 --- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp +++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp @@ -147,7 +147,7 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *); NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *); NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *); -#if CUDA_VERSION >= 11010 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010 NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *); NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *); #endif diff --git a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh index f3065da8a59922..08f8d14d18b73c 100644 --- a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh +++ b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh @@ -13,7 +13,7 @@ // Operands that share the same shape, but may have different strides. 
// OffsetCalculator iterates the tensor in a column-major order -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) constexpr int MAX_DIMS = 16; #else constexpr int MAX_DIMS = 25; diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index 6c28c298008eb4..c1e64ebb3baadc 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -29,7 +29,7 @@ namespace at { namespace cuda { // and edit ATen/cuda/detail/LazyNVRTC.cpp accordingly (e.g., via one of the stub // macros). -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #define AT_FORALL_NVRTC_BASE(_) \ _(nvrtcVersion) \ @@ -56,7 +56,7 @@ namespace at { namespace cuda { _(cuLinkAddData) \ _(cuLinkComplete) -#if CUDA_VERSION >= 11010 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010 #define AT_FORALL_NVRTC(_) \ AT_FORALL_NVRTC_BASE(_) \ _(nvrtcGetCUBINSize) \ diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index 08be237c8fc7c3..98b398e7c659f9 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -120,7 +120,7 @@ bool check_fast_path_restrictions( bool can_use_fast_route(ArrayRef tensorLists, ArrayRef scalarList = {}, bool does_op_promote_integer_inputs_to_float = false) { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) return false; #else return check_fast_path_restrictions(tensorLists, scalarList, does_op_promote_integer_inputs_to_float); @@ -128,7 +128,7 @@ bool can_use_fast_route(ArrayRef tensorLists, } bool can_use_fast_route(TensorList tensors1, TensorList tensors2, bool does_op_promote_integer_inputs_to_float = false) { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) return false; #else return can_use_fast_route({tensors1, tensors2}, {}, does_op_promote_integer_inputs_to_float); diff --git a/aten/src/ATen/native/cuda/Activation.cu b/aten/src/ATen/native/cuda/Activation.cu index 6dd4e3c2401ecb..2a399b39c9676e 100644 --- a/aten/src/ATen/native/cuda/Activation.cu +++ b/aten/src/ATen/native/cuda/Activation.cu @@ -432,7 +432,7 @@ std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Te // rrelu // ----------------------------------- template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(256, 4) #endif __global__ void rrelu_with_noise_cuda_kernel( diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index cd6ca9d370f7cd..6337e130d66a2c 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -112,7 +112,7 @@ class CuFFTHandle { ~CuFFTHandle() { // Not using fftDestroy() for rocFFT to work around double freeing of handles -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) cufftDestroy(handle_); #endif } @@ -123,7 +123,7 @@ static bool is_pow_of_two(int64_t x) { return (x & (x - 1)) == 0; } -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) using cufft_size_type = int; #else using cufft_size_type = long long int; @@ -258,7 +258,7 @@ class CuFFTConfig { // use a flag to keep track throughout this function to see if we need to // input = input.clone(); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) // clone input to avoid issues with hipfft clobering the input and failing tests clone_input = true; #else @@ -300,7 +300,7 @@ class CuFFTConfig { const bool simple_layout = in_layout.simple && out_layout.simple; -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) hipfftType 
exec_type = [&]{ if (dtype == kFloat) { switch (fft_type) { @@ -350,7 +350,7 @@ class CuFFTConfig { // by assuming istride = ostride = 1. // // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) CUFFT_CHECK(hipfftMakePlanMany(plan(), signal_ndim, signal_sizes.data(), /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, @@ -362,7 +362,7 @@ class CuFFTConfig { batch, &ws_size_t, exec_type)); #endif } else { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) CUFFT_CHECK(hipfftMakePlanMany(plan(), signal_ndim, signal_sizes.data(), in_layout.embed.data(), in_layout.stride, in_layout.dist, out_layout.embed.data(), out_layout.stride, out_layout.dist, @@ -392,7 +392,7 @@ class CuFFTConfig { ScalarType value_type_; }; -#if CUDA_VERSION < 10000 +#if (defined(CUDA_VERSION) && CUDA_VERSION < 10000) || defined(USE_ROCM) // Note that the max plan number for CUDA version < 10 has to be 1023 // due to a bug that fails on the 1024th plan constexpr int64_t CUFFT_MAX_PLAN_NUM = 1023; diff --git a/aten/src/ATen/native/cuda/CuFFTUtils.h b/aten/src/ATen/native/cuda/CuFFTUtils.h index 3d3d6efc4b5485..09d561736472f2 100644 --- a/aten/src/ATen/native/cuda/CuFFTUtils.h +++ b/aten/src/ATen/native/cuda/CuFFTUtils.h @@ -49,7 +49,7 @@ static inline std::string _cudaGetErrorEnum(cufftResult error) return "CUFFT_NO_WORKSPACE"; case CUFFT_NOT_IMPLEMENTED: return "CUFFT_NOT_IMPLEMENTED"; -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) case CUFFT_LICENSE_ERROR: return "CUFFT_LICENSE_ERROR"; #endif diff --git a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu index b849391e0ffdc2..61c3754a97395e 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu @@ -69,11 +69,11 @@ __global__ void conv_depthwise2d_forward_kernel( acc_t value = biasEnabled ? 
static_cast(bias.data()[c]) : acc_t(0); const index_t offset0 = (n * inputChannels + inputChannel) * inputHeight * inputWidth; -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for (int kH = 0; kH < KH_LIMIT; ++kH) { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for (int kW = 0; kW < KW_LIMIT; ++kW) { @@ -125,17 +125,17 @@ __global__ void conv_depthwise2d_backward_kernel( acc_t value(0); -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for (int multiplier = 0; multiplier < depthwiseMultiplier; ++multiplier) { int och = (c * depthwiseMultiplier) + multiplier; int weightOffset = och * kernelHeight * kernelWidth; -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for (int kh = 0; kh < KH_LIMIT; ++kh) { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) #pragma unroll #endif for (int kw = 0; kw < KW_LIMIT; ++kw) { diff --git a/aten/src/ATen/native/cuda/DeviceSqrt.cuh b/aten/src/ATen/native/cuda/DeviceSqrt.cuh index 29711a06cb3e77..38a7804015be18 100644 --- a/aten/src/ATen/native/cuda/DeviceSqrt.cuh +++ b/aten/src/ATen/native/cuda/DeviceSqrt.cuh @@ -1,7 +1,7 @@ #pragma once namespace at { namespace native { -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) // take these out when ROCm implements std:: math functions #include template diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index 52b0142bf00d3c..e31ce1f30dcf55 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -160,7 +160,7 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba static const int BLOCK_THREADS = 256; template -#if defined (__HIP_PLATFORM_HCC__) +#if defined (USE_ROCM) C10_LAUNCH_BOUNDS_2(BLOCK_THREADS, 4) #else C10_LAUNCH_BOUNDS_2(BLOCK_THREADS, 8) diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index e930a4635199bc..df1ea64301a12a 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -87,7 +87,7 @@ void binomial_cuda_kernel( at::native::distribution_binary_kernel(iter, philox_args, [philox_args] GPU_LAMBDA (curandStatePhilox4_32_10_t& state, scalar_t count, scalar_t prob) { - #if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) + #if defined(__CUDA_ARCH__) || defined(USE_ROCM) auto uniform_lambda = curand_uniform_wrapper(state); BaseSampler standard_uniform(uniform_lambda); auto sample = sample_binomial(count, prob, standard_uniform); diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index 853ab4d2f329e0..171b26903cb575 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -29,7 +29,7 @@ template < typename IndexType, int ADims, int VEC> -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(256, 4) #endif __global__ void fused_dropout_kernel_vec( @@ -118,7 +118,7 @@ template < typename IndexType, int ADims, int BDims = ADims> -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(256, 4) #endif __global__ void fused_dropout_kernel( diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index 002b19504df009..596a1d139968e3 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ 
b/aten/src/ATen/native/cuda/Embedding.cu @@ -16,7 +16,7 @@ namespace at { namespace native { namespace { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) static const int BLOCKDIMY = 16; #else static const int BLOCKDIMY = 32; @@ -83,7 +83,7 @@ __global__ void embedding_backward_feature_kernel (dst_row == indices_batch[chunk_start - batch_start + threadIdx.x]); if(threadIdx.x >= n_this_chunk) match_found_this_thread = 0; -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) unsigned long long int matchmask = WARP_BALLOT(match_found_this_thread); int first_remaining_peer = __ffsll(matchmask) - 1; #else @@ -96,7 +96,7 @@ __global__ void embedding_backward_feature_kernel matchmask ^= (1 << first_remaining_peer); while(matchmask) { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) first_remaining_peer = __ffsll(matchmask) - 1; #else first_remaining_peer = __ffs(matchmask) - 1; diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index dbd02145431fc7..2e5985a37d56d1 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -237,7 +237,7 @@ Tensor embedding_bag_backward_cuda_max(const Tensor &grad, cudaStream_t stream = at::cuda::getCurrentCUDAStream(); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) dim3 block = dim3(64, 4); #else dim3 block = dim3(32, 8); @@ -335,7 +335,7 @@ _embedding_bag_cuda(const Tensor &weight, const Tensor &indices_, max_indices = at::empty({0}, indices.options()); } -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) dim3 block = dim3(64, 4); #else dim3 block = dim3(32, 8); diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 1d242db6e3ff12..ba4615b56ff6b6 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -229,7 +229,7 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List(tensor) + index, diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index 8849293e20210e..24afee867f90b3 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -81,7 +81,7 @@ __device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) { // Because for some reason trying to enable vectorized // memory access introduce regression on ROCm. -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #include #else #include diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index b9e2a8ed459fee..093eefe9047334 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -57,7 +57,7 @@ __device__ static inline int64_t get_target_prime( // computed when we start a new block_s. This is why we have our own for loop here. template __global__ void -#if defined (__HIP_PLATFORM_HCC__) +#if defined (USE_ROCM) C10_LAUNCH_BOUNDS_2((std::is_same::value ? 1024 : 896), 1) #endif ctc_loss_log_alpha_gpu_kernel(scalar_t* __restrict__ log_alpha_data, @@ -413,7 +413,7 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data, // alphabets the inplace nature is a considerable advantage. template __global__ void -#if defined (__HIP_PLATFORM_HCC__) +#if defined (USE_ROCM) C10_LAUNCH_BOUNDS_2((std::is_same::value ? 
1024 : 896), 1) #endif ctc_loss_backward_collect_nonblank_gpu_kernel(scalar_t* __restrict__ gradient_data, @@ -465,7 +465,7 @@ ctc_loss_backward_collect_nonblank_gpu_kernel(scalar_t* __restrict__ gradient_da // It appears to be faster than the above method for small batch sizes. template __global__ void -#if defined (__HIP_PLATFORM_HCC__) +#if defined (USE_ROCM) C10_LAUNCH_BOUNDS_2((std::is_same::value ? 1024 : 896), 1) #endif ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ gradient_data, @@ -537,7 +537,7 @@ ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ gradient_data, // elements are padded template __global__ void -#if defined (__HIP_PLATFORM_HCC__) +#if defined (USE_ROCM) C10_LAUNCH_BOUNDS_2((std::is_same::value ? 1024 : 896), 1) #endif ctc_loss_zero_padded_gradients( diff --git a/aten/src/ATen/native/cuda/MiscUtils.h b/aten/src/ATen/native/cuda/MiscUtils.h index b28f77c701744c..a8ab8104229790 100644 --- a/aten/src/ATen/native/cuda/MiscUtils.h +++ b/aten/src/ATen/native/cuda/MiscUtils.h @@ -25,7 +25,7 @@ struct MAGMAQueue { // Constructor explicit MAGMAQueue(int64_t device_id) { cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 // Magma operations is numerically sensitive, so TF32 should be off // regardless of the global flag. TORCH_CUDABLAS_CHECK(cublasGetMathMode(handle, &original_math_mode)); @@ -44,7 +44,7 @@ struct MAGMAQueue { // Destructor ~MAGMAQueue() { -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 // We've manually set the math mode to CUBLAS_DEFAULT_MATH, now we // should restore the original math mode back cublasHandle_t handle = magma_queue_get_cublas_handle(magma_queue_); @@ -55,7 +55,7 @@ struct MAGMAQueue { private: magma_queue_t magma_queue_; -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 cublasMath_t original_math_mode; #endif }; diff --git a/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu b/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu index 2b3f8dda34e60c..6f342e48c5499e 100644 --- a/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu +++ b/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu @@ -206,7 +206,7 @@ void slow_conv_dilated_all_cuda_template( output.zero_(); } -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) /* When using ROCm, the sum evaluation is inaccurate for double tensors. The reason is currently unknown. 
Hence, we use gemv for computing `grad_output_n.sum(dims)` until the ROCm-sum issue is diff --git a/aten/src/ATen/native/cuda/Normalization.cuh b/aten/src/ATen/native/cuda/Normalization.cuh index 8f26bc60ba2ed0..266d5f19206d6c 100644 --- a/aten/src/ATen/native/cuda/Normalization.cuh +++ b/aten/src/ATen/native/cuda/Normalization.cuh @@ -12,7 +12,7 @@ namespace at { namespace native { // The maximum number of threads in a block -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) constexpr int MAX_BLOCK_SIZE = 256; #else constexpr int MAX_BLOCK_SIZE = 512; @@ -22,7 +22,7 @@ constexpr unsigned MAX_GRID_SIZE = 65535u; // Number of threads in a block given an input size up to MAX_BLOCK_SIZE static int getNumThreads(int nElem) { -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) int threadSizes[5] = { 16, 32, 64, 128, MAX_BLOCK_SIZE }; #else int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE }; diff --git a/aten/src/ATen/native/cuda/RNN.cu b/aten/src/ATen/native/cuda/RNN.cu index e873c3b3192441..ba5117e1e5695b 100644 --- a/aten/src/ATen/native/cuda/RNN.cu +++ b/aten/src/ATen/native/cuda/RNN.cu @@ -81,7 +81,7 @@ T sigmoid(T in) { namespace kernel { template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(512, 4) #endif __global__ void lstm_cell_forward( @@ -168,7 +168,7 @@ __global__ void lstm_cell_forward( } template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(512, 4) #endif __global__ void lstm_cell_backward( @@ -233,7 +233,7 @@ __global__ void lstm_cell_backward( } template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(512, 4) #endif __global__ void gru_cell_forward( @@ -303,7 +303,7 @@ __global__ void gru_cell_forward( } template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(512, 4) #endif __global__ void gru_cell_backward( diff --git a/aten/src/ATen/native/cuda/Shape.cu b/aten/src/ATen/native/cuda/Shape.cu index 1fd151c1402495..61cfe36b73da36 100644 --- a/aten/src/ATen/native/cuda/Shape.cu +++ b/aten/src/ATen/native/cuda/Shape.cu @@ -14,7 +14,7 @@ namespace at { namespace native { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) constexpr int CAT_ARRAY_BATCH_SIZE = 1024; #else constexpr int CAT_ARRAY_BATCH_SIZE = 128; @@ -546,7 +546,7 @@ Tensor& cat_out_cuda(TensorList inputs, int64_t dimension, Tensor& out) { }); allSameType = allSameType && (out.scalar_type() == firstType); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) if (inputs.size() > 1 && out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && at::cuda::detail::canUse32BitIndexMath(out) && diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index 849c02712f7cb6..8fd1c530ba57f8 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -125,7 +125,7 @@ void SpatialSoftMax_getLaunchSizes( uint32_t block_threads = block.x * block.y; smem_size = block.x == 1 ? 0 : block_threads * sizeof(accscalar_t); int max_active_blocks; -#if defined(__HIP_PLATFORM_HCC__) && TORCH_HIP_VERSION < 305 +#if defined(USE_ROCM) && TORCH_HIP_VERSION < 305 // HIP function signature is not compatible yet. 
uint32_t max_blocks; cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, @@ -358,7 +358,7 @@ blockReduce(AccumT* smem, AccumT val, for (int i = 0; i < C10_WARP_SIZE; ++i) { warpVal = r(warpVal, smem[lane * C10_WARP_SIZE + i]); } -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) __syncwarp(mask); #endif smem[lane] = warpVal; diff --git a/aten/src/ATen/native/cuda/Sort.cu b/aten/src/ATen/native/cuda/Sort.cu index 68a9fbad52bb1d..d5657ef8e71353 100644 --- a/aten/src/ATen/native/cuda/Sort.cu +++ b/aten/src/ATen/native/cuda/Sort.cu @@ -366,7 +366,7 @@ void sort_cuda_kernel( int64_t numel_or_intmax = std::min(numel, static_cast(std::numeric_limits::max())); int64_t nbatch = (numel_or_intmax / nsort) * nsort; -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) constexpr bool is_rocm = true; #else constexpr bool is_rocm = false; diff --git a/aten/src/ATen/native/cuda/SortUtils.cuh b/aten/src/ATen/native/cuda/SortUtils.cuh index 220bba7afaa206..08787b3b95ef3e 100644 --- a/aten/src/ATen/native/cuda/SortUtils.cuh +++ b/aten/src/ATen/native/cuda/SortUtils.cuh @@ -37,13 +37,13 @@ __device__ inline void bitonicSort(K keys[Power2SortSize], V values[Power2SortSize], bool valid[Power2SortSize], const Comparator& comp) { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for (unsigned int size = 2; size < Power2SortSize; size *= 2) { bool flag = ((threadIdx.x & (size / 2)) != 0); -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for (unsigned int stride = size / 2; stride > 0; stride /= 2) { @@ -58,7 +58,7 @@ __device__ inline void bitonicSort(K keys[Power2SortSize], } } -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) { diff --git a/aten/src/ATen/native/cuda/SortingCommon.cuh b/aten/src/ATen/native/cuda/SortingCommon.cuh index 00809e5a48e1a0..441fccafb9151d 100644 --- a/aten/src/ATen/native/cuda/SortingCommon.cuh +++ b/aten/src/ATen/native/cuda/SortingCommon.cuh @@ -13,7 +13,7 @@ namespace at { namespace native { // Is this questionable namespace pollution? -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) constexpr int MAX_BLOCK_SIZE = 256; #else diff --git a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh index a9410852daf82e..f4cb50de7137bd 100644 --- a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh +++ b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh @@ -127,7 +127,7 @@ struct TopKTypeConfig { typedef uint32_t RadixType; static inline __device__ RadixType convert(at::Half v) { -#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) +#if defined(__CUDA_ARCH__) || defined(USE_ROCM) RadixType x = __half_as_ushort(v); RadixType mask = (x & 0x00008000) ? 0x0000ffff : 0x00008000; return (v == v) ? (x ^ mask) : 0xffff; @@ -138,7 +138,7 @@ struct TopKTypeConfig { } static inline __device__ at::Half deconvert(RadixType v) { -#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) +#if defined(__CUDA_ARCH__) || defined(USE_ROCM) RadixType mask = (v & 0x00008000) ? 
0x00008000 : 0x0000ffff; return __ushort_as_half(v ^ mask); #else @@ -211,7 +211,7 @@ __device__ void countRadixUsingMask( #pragma unroll for (uint32_t j = 0; j < RadixSize; ++j) { bool vote = hasVal && (digitInRadix == j); -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) counts[j] += __popcll(WARP_BALLOT(vote)); #else counts[j] += __popc(WARP_BALLOT(vote, ACTIVE_MASK())); diff --git a/aten/src/ATen/native/cuda/SpectralOps.cpp b/aten/src/ATen/native/cuda/SpectralOps.cpp index 15fd875dab9eb7..941513b885249a 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cpp +++ b/aten/src/ATen/native/cuda/SpectralOps.cpp @@ -28,7 +28,7 @@ using namespace at::native::detail; static void exec_cufft_plan( const CuFFTConfig &config, void* in_data, void* out_data, bool forward) { auto& plan = config.plan(); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) auto value_type = config.data_type(); if (value_type == kFloat) { switch (config.transform_type()) { diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index 6d0ea7a53391f1..1a1ab8026480d2 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -353,7 +353,7 @@ Tensor _histc_cuda_template( maxvalue = maxvalue + 1; } -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) TORCH_CHECK( !(THCNumerics::isinf(minvalue) || THCNumerics::isinf(maxvalue) || diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index da08bbe94377b4..222107838dcb54 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -224,7 +224,7 @@ inline void get_coordinate_in_triu_trapezoid( template __global__ -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_1(512) #endif void tril_indices_kernel(scalar_t * tensor, diff --git a/aten/src/ATen/native/cuda/TensorModeKernel.cuh b/aten/src/ATen/native/cuda/TensorModeKernel.cuh index d29454bb9d4b19..93412ca36d6d1e 100644 --- a/aten/src/ATen/native/cuda/TensorModeKernel.cuh +++ b/aten/src/ATen/native/cuda/TensorModeKernel.cuh @@ -143,13 +143,13 @@ __device__ inline void bitonicSortKeys( K keys[Power2SortSize], bool valid[Power2SortSize], const Comparator& comp) { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for (unsigned int size = 2; size < Power2SortSize; size *= 2) { bool flag = ((threadIdx.x & (size / 2)) != 0); -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for (unsigned int stride = size / 2; stride > 0; stride /= 2) { @@ -166,7 +166,7 @@ __device__ inline void bitonicSortKeys( } } -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) { diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu index 3e51f16ccb0408..407f957a2b12fc 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cu +++ b/aten/src/ATen/native/cuda/TensorTopK.cu @@ -33,7 +33,7 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo input, IndexType indicesWithinSliceStride) { // Indices are limited to integer fp precision, so counts can fit in // int32, regardless of IndexType -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) __shared__ int smem[64]; #else __shared__ int smem[32]; // one per each warp, up to warp limit diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index 
b61f7f95d88656..d46a5613df78cb 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -13,7 +13,7 @@ namespace at { namespace native { template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(cuda::getApplyBlockSize(), cuda::getApplyBlocksPerSM()) #endif __global__ void kernel_pointwise_flip_apply2( diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index 6f9c96d46a5883..aedf691193b1a6 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -155,7 +155,7 @@ void nan_to_num_kernel_cuda( } void frexp_kernel_cuda(TensorIteratorBase& iter) { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) // Reference: https://rocmdocs.amd.com/en/latest/ROCm_API_References/HIP-MATH.html // https://github.com/ROCm-Developer-Tools/HIP/issues/2169 // ROCm does not support frexp function yet diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu index 3ec523cc3276b9..12d3b572bdb06b 100644 --- a/aten/src/ATen/native/cuda/WeightNorm.cu +++ b/aten/src/ATen/native/cuda/WeightNorm.cu @@ -45,7 +45,7 @@ __device__ __forceinline__ void reduce_block_into_lanes __syncthreads(); } -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for(int i = (blockSize >> 1); i >= 64; i >>= 1) @@ -64,7 +64,7 @@ __device__ __forceinline__ void reduce_block_into_lanes final = val; // __SYNCWARP(); -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for(int i = 16; i >= lanes; i >>= 1) diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index f81de80cf5ec22..8eceed0212ec31 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -1406,7 +1406,7 @@ struct DropoutState { at::Tensor buffer; c10::optional event; std::mutex mutex; -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 // cudaStreamGetCaptureInfo will never give back a capture id of 0, so 0 can serve // as a sentinel value that capture was not underway. cuda::CaptureId_t capture_id_last_lock = 0; @@ -1424,7 +1424,7 @@ struct DropoutState { // could then define it before we get to unlock(). 
mutex.lock(); if (event) { -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 // See Note [DropoutState and CUDA graph capture] cudaStreamCaptureStatus status; AT_CUDA_CHECK(cudaStreamGetCaptureInfo(cuda::getCurrentCUDAStream(), @@ -1445,7 +1445,7 @@ struct DropoutState { void unlock() { if (event) { event->record(); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 // See Note [DropoutState and CUDA graph capture] cudaStreamCaptureStatus status; AT_CUDA_CHECK(cudaStreamGetCaptureInfo(cuda::getCurrentCUDAStream(), diff --git a/aten/src/ATen/native/sparse/cuda/SoftMax.cu b/aten/src/ATen/native/sparse/cuda/SoftMax.cu index 831b28982c6940..f5e4d98050503b 100644 --- a/aten/src/ATen/native/sparse/cuda/SoftMax.cu +++ b/aten/src/ATen/native/sparse/cuda/SoftMax.cu @@ -51,7 +51,7 @@ namespace { // Number of threads in a block given an input size up to MAX_BLOCK_SIZE static int getNumThreads(int nElem) { -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) int threadSizes[5] = {16, 32, 64, 128, 256}; #else int threadSizes[5] = {32, 64, 128, 256, 512}; diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh index 52c1cb1b87d907..c83592335511f2 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh @@ -41,7 +41,7 @@ __device__ void applyOp3( // Assume both dense and values are contiguous. // Currently only used in add_out_dense_sparse_cuda: add(dense, sparse, scalar). template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(cuda::getApplyBlockSize(), cuda::getApplyBlocksPerSM()) #endif __global__ void sparseElementwiseKernel( @@ -71,7 +71,7 @@ __global__ void sparseElementwiseKernel( // Assume dense is contiguous. // Currently only used in add_out_dense_sparse_cuda: add(dense, sparse, scalar). template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(cuda::getApplyBlockSize(), cuda::getApplyBlocksPerSM()) #endif __global__ void sparseElementwiseKernelScalar( @@ -95,7 +95,7 @@ __global__ void sparseElementwiseKernelScalar( } template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(cuda::getApplyBlockSize(), cuda::getApplyBlocksPerSM()) #endif __global__ void valueSparseUnionKernel( @@ -142,7 +142,7 @@ __global__ void valueSparseUnionKernel( // TODO find a way to parallelize this... template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(cuda::getApplyBlockSize(), cuda::getApplyBlocksPerSM()) #endif __global__ void indexSparseUnionKernel( @@ -192,7 +192,7 @@ __global__ void indexSparseUnionKernel( } template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(cuda::getApplyBlockSize(), cuda::getApplyBlocksPerSM()) #endif __global__ void valueSparseIntersectionKernel( @@ -231,7 +231,7 @@ __global__ void valueSparseIntersectionKernel( // TODO find a way to parallelize this... 
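Every hunk in SparseCUDAApplyUtils.cuh makes the same one-line change: the launch-bounds annotation stays guarded for CUDA devices with __CUDA_ARCH__ >= 350 and for all ROCm builds, but the ROCm side is now detected via the build flag USE_ROCM instead of the deprecated __HIP_PLATFORM_HCC__ compiler macro. A minimal standalone sketch of that guard follows; the kernel body and the 256/4 bounds are illustrative, not taken from the tree.

    #if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
    #define EXAMPLE_LAUNCH_BOUNDS __launch_bounds__(256, 4)
    #else
    #define EXAMPLE_LAUNCH_BOUNDS
    #endif

    // Trivial elementwise kernel; only the conditional launch-bounds macro
    // above mirrors the guard pattern the patch rewrites.
    template <typename T>
    __global__ void EXAMPLE_LAUNCH_BOUNDS exampleScaleKernel(T* x, T alpha, int n) {
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        x[i] *= alpha;
      }
    }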
template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(cuda::getApplyBlockSize(), cuda::getApplyBlocksPerSM()) #endif __global__ void indexSparseIntersectionKernel( diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index b3734ad4484e65..5565750e6f49bd 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -517,7 +517,7 @@ SparseTensor& mul_out_sparse_cuda(const SparseTensor& t_, const SparseTensor& sr // see NOTE [ sparse.sum() backward ] // -------------------------------------------------------------------- template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(cuda::getApplyBlockSize(), cuda::getApplyBlocksPerSM()) #endif __global__ void _sparse_sum_backward_cuda_kernel( @@ -683,7 +683,7 @@ Tensor bmm_sparse_cuda(const SparseTensor& self, const Tensor& mat2) { return bmm_out_sparse_cuda(self, mat2, result); } -#if !(defined(__HIP_PLATFORM_HCC__) || (defined(_MSC_VER) && CUSPARSE_VERSION < 11000)) +#if !(defined(USE_ROCM) || (defined(_MSC_VER) && CUSPARSE_VERSION < 11000)) __global__ void search_end_matrix_indices_cuda_kernel( int64_t* mat_el_end_indices, int64_t num_matrices, @@ -764,7 +764,7 @@ cudaDataType getTensorCudaDataType(Tensor self) { #endif Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor& result) { -#if defined __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) TORCH_CHECK(false, "bmm sparse-dense is not supported on HIP"); #elif defined(_MSC_VER) && (CUSPARSE_VERSION < 11000) TORCH_CHECK(false, "bmm sparse-dense CUDA is not supported on Windows with cuda before 11.0"); diff --git a/aten/src/ATen/test/cuda_vectorized_test.cu b/aten/src/ATen/test/cuda_vectorized_test.cu index a3b04333a52807..5af233b4102fcd 100644 --- a/aten/src/ATen/test/cuda_vectorized_test.cu +++ b/aten/src/ATen/test/cuda_vectorized_test.cu @@ -25,7 +25,7 @@ void reset_buffers() { } } -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) TEST(TestLoops, HasSameArgTypes) { // This is a compile-time unit test. If this file compiles without error, // then the test passes and during runtime, we just need to return. 
diff --git a/aten/src/THC/THCAsmUtils.cuh b/aten/src/THC/THCAsmUtils.cuh index be0bf6ffa1ba15..daa6a13c569d29 100644 --- a/aten/src/THC/THCAsmUtils.cuh +++ b/aten/src/THC/THCAsmUtils.cuh @@ -10,7 +10,7 @@ template <> struct Bitfield { static __device__ __forceinline__ unsigned int getBitfield(unsigned int val, int pos, int len) { -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) pos &= 0xff; len &= 0xff; @@ -25,7 +25,7 @@ struct Bitfield { static __device__ __forceinline__ unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) { -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) pos &= 0xff; len &= 0xff; @@ -48,7 +48,7 @@ template <> struct Bitfield { static __device__ __forceinline__ uint64_t getBitfield(uint64_t val, int pos, int len) { -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) pos &= 0xff; len &= 0xff; @@ -63,7 +63,7 @@ struct Bitfield { static __device__ __forceinline__ uint64_t setBitfield(uint64_t val, uint64_t toInsert, int pos, int len) { -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) pos &= 0xff; len &= 0xff; @@ -83,7 +83,7 @@ struct Bitfield { }; __device__ __forceinline__ int getLaneId() { -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) return __lane_id(); #else int laneId; @@ -92,7 +92,7 @@ __device__ __forceinline__ int getLaneId() { #endif } -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) __device__ __forceinline__ unsigned long long int getLaneMaskLt() { const std::uint64_t m = (1ull << getLaneId()) - 1ull; return m; @@ -105,7 +105,7 @@ __device__ __forceinline__ unsigned getLaneMaskLt() { } #endif -#if defined (__HIP_PLATFORM_HCC__) +#if defined (USE_ROCM) __device__ __forceinline__ unsigned long long int getLaneMaskLe() { std::uint64_t m = UINT64_MAX >> (sizeof(std::uint64_t) * CHAR_BIT - (getLaneId() + 1)); return m; @@ -118,7 +118,7 @@ __device__ __forceinline__ unsigned getLaneMaskLe() { } #endif -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) __device__ __forceinline__ unsigned long long int getLaneMaskGt() { const std::uint64_t m = getLaneMaskLe(); return m ? ~m : m; @@ -131,7 +131,7 @@ __device__ __forceinline__ unsigned getLaneMaskGt() { } #endif -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) __device__ __forceinline__ unsigned long long int getLaneMaskGe() { const std::uint64_t m = getLaneMaskLt(); return ~m; diff --git a/aten/src/THC/THCDeviceTensor-inl.cuh b/aten/src/THC/THCDeviceTensor-inl.cuh index 16e1f94e476916..25b1b878f7474e 100644 --- a/aten/src/THC/THCDeviceTensor-inl.cuh +++ b/aten/src/THC/THCDeviceTensor-inl.cuh @@ -182,7 +182,7 @@ template THCDeviceTensor::transpose(int dim1, int dim2) const { -#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) +#if defined(__CUDA_ARCH__) || defined(USE_ROCM) // Device code assert(dim1 >= 0 && dim1 < Dim); assert(dim1 >= 0 && dim2 < Dim); @@ -285,7 +285,7 @@ THCDeviceTensor::downcastOuter() { // in all of the dimensions we are collapsing (no padding in // them). bool cont = isContiguousRange(0, Dim - NewDim); -#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) +#if defined(__CUDA_ARCH__) || defined(USE_ROCM) // Device code assert(cont); #else @@ -336,7 +336,7 @@ THCDeviceTensor::downcastInner() { // in all of the dimensions we are collapsing (no padding in // them). 
bool cont = isContiguousRange(NewDim, Dim); -#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) +#if defined(__CUDA_ARCH__) || defined(USE_ROCM) // Device code assert(cont); #else @@ -404,7 +404,7 @@ template class PtrTraits> void THCDeviceTensor::zero(cudaStream_t stream) { -#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) +#if defined(__CUDA_ARCH__) || defined(USE_ROCM) assert(isContiguous()); #else if (!isContiguous()) { diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp index c422cc150d0529..126a1db704a1e2 100644 --- a/aten/src/THC/THCGeneral.cpp +++ b/aten/src/THC/THCGeneral.cpp @@ -219,7 +219,7 @@ void __THCublasCheck(cublasStatus_t status, const char *file, const int line) errmsg = "an absent device architectural feature is required"; break; -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) case CUBLAS_STATUS_MAPPING_ERROR: errmsg = "an access to GPU memory space failed"; break; diff --git a/aten/src/THC/THCGeneral.h.in b/aten/src/THC/THCGeneral.h.in index 65e64f4650fc28..b04b7b34fc7795 100644 --- a/aten/src/THC/THCGeneral.h.in +++ b/aten/src/THC/THCGeneral.h.in @@ -14,7 +14,7 @@ #cmakedefine USE_MAGMA /* Needed for hipMAGMA to correctly identify implementation */ -#if defined(USE_MAGMA) && defined(__HIP_PLATFORM_HCC__) +#if defined(USE_MAGMA) && defined(USE_ROCM) #define HAVE_HIP 1 #endif diff --git a/aten/src/THC/THCScanUtils.cuh b/aten/src/THC/THCScanUtils.cuh index 12268db5b4924d..9f03c5b306d764 100644 --- a/aten/src/THC/THCScanUtils.cuh +++ b/aten/src/THC/THCScanUtils.cuh @@ -96,7 +96,7 @@ __device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunct template __device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) { // Within-warp, we use warp voting. -#if defined (__HIP_PLATFORM_HCC__) +#if defined (USE_ROCM) unsigned long long int vote = WARP_BALLOT(in); T index = __popcll(getLaneMaskLe() & vote); T carry = __popcll(vote); diff --git a/aten/src/THC/THCStorage.cu b/aten/src/THC/THCStorage.cu index d186c7e53ca8d6..1bf47902b84486 100644 --- a/aten/src/THC/THCStorage.cu +++ b/aten/src/THC/THCStorage.cu @@ -3,7 +3,7 @@ #include #include #include -#if CUDA_VERSION >= 7000 || defined(__HIP_PLATFORM_HCC__) +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 7000) || defined(USE_ROCM) #include #endif diff --git a/aten/src/THC/generic/THCStorage.cu b/aten/src/THC/generic/THCStorage.cu index 43a17f375280c8..19316d38334b7c 100644 --- a/aten/src/THC/generic/THCStorage.cu +++ b/aten/src/THC/generic/THCStorage.cu @@ -7,7 +7,7 @@ void THCStorage_(fill)(THCState *state, THCStorage *self, scalar_t value) at::cuda::ThrustAllocator thrustAlloc; thrust::device_ptr self_data(THCStorage_(data)(state, self)); thrust::fill( -#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__ +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 7000) || defined(USE_ROCM) thrust::cuda::par(thrustAlloc).on(c10::cuda::getCurrentCUDAStream()), #endif self_data, diff --git a/c10/cuda/CUDAGraphsC10Utils.h b/c10/cuda/CUDAGraphsC10Utils.h index 79d727feeb1601..ba8031d3e61102 100644 --- a/c10/cuda/CUDAGraphsC10Utils.h +++ b/c10/cuda/CUDAGraphsC10Utils.h @@ -17,7 +17,7 @@ using MempoolId_t = std::pair; // RAII guard for "cudaStreamCaptureMode", a thread-local value // that controls the error-checking strictness of a capture. 
-#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 struct C10_CUDA_API CUDAStreamCaptureModeGuard { CUDAStreamCaptureModeGuard(cudaStreamCaptureMode desired) { strictness_ = desired; diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 8706181f3021db..123d7a4717d1cf 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -295,13 +295,13 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; #define C10_DEVICE #endif -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) #define C10_HIP_HOST_DEVICE __host__ __device__ #else #define C10_HIP_HOST_DEVICE #endif -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) #define C10_WARP_SIZE warpSize // = 64 or 32 (Defined in hip_runtime.h) #else #define C10_WARP_SIZE 32 @@ -315,7 +315,7 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; // even when NDEBUG is defined. This is useful for important assertions in CUDA // code that would otherwise be suppressed when building Release. #if defined(__ANDROID__) || defined(__APPLE__) || \ - (defined(__HIP_PLATFORM_HCC__) && ROCM_VERSION < 40100) + (defined(USE_ROCM) && ROCM_VERSION < 40100) // Those platforms do not support assert() #define CUDA_KERNEL_ASSERT(cond) #elif defined(_MSC_VER) diff --git a/c10/test/util/complex_test_common.h b/c10/test/util/complex_test_common.h index f7ab797e776c39..5ad57fd337c8a0 100644 --- a/c10/test/util/complex_test_common.h +++ b/c10/test/util/complex_test_common.h @@ -433,7 +433,7 @@ C10_HOST_DEVICE void test_arithmetic_assign_complex() { // this test is skipped due to a bug in constexpr evaluation // in nvcc. This bug has already been fixed since CUDA 11.2 -#if !defined(__CUDACC__) || CUDA_VERSION >= 11020 +#if !defined(__CUDACC__) || (defined(CUDA_VERSION) && CUDA_VERSION >= 11020) static_assert(x3.imag() == scalar_t(3), ""); #endif @@ -445,7 +445,7 @@ C10_HOST_DEVICE void test_arithmetic_assign_complex() { // this test is skipped due to a bug in constexpr evaluation // in nvcc. 
This bug has already been fixed since CUDA 11.2 -#if !defined(__CUDACC__) || CUDA_VERSION >= 11020 +#if !defined(__CUDACC__) || (defined(CUDA_VERSION) && CUDA_VERSION >= 11020) static_assert(y3.imag() == scalar_t(1), ""); #endif diff --git a/c10/test/util/exception_test.cpp b/c10/test/util/exception_test.cpp index 9c0a533de992de..af06b4cb90a33e 100644 --- a/c10/test/util/exception_test.cpp +++ b/c10/test/util/exception_test.cpp @@ -37,7 +37,7 @@ TEST(ExceptionTest, TORCH_INTERNAL_ASSERT_DEBUG_ONLY) { // On these platforms there's no assert #if !defined(__ANDROID__) && !defined(__APPLE__) && \ - !(defined(__HIP_PLATFORM_HCC__) && ROCM_VERSION < 40100) + !(defined(USE_ROCM) && ROCM_VERSION < 40100) TEST(ExceptionTest, CUDA_KERNEL_ASSERT) { // This function always throws even in NDEBUG mode ASSERT_DEATH_IF_SUPPORTED({ CUDA_KERNEL_ASSERT(false); }, "Assert"); diff --git a/c10/util/BFloat16.h b/c10/util/BFloat16.h index 5446eb9419253d..9bcdcf5c61e8d4 100644 --- a/c10/util/BFloat16.h +++ b/c10/util/BFloat16.h @@ -19,7 +19,7 @@ inline C10_HOST_DEVICE float f32_from_bits(uint16_t src) { uint32_t tmp = src; tmp <<= 16; -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) float* tempRes; // We should be using memcpy in order to respect the strict aliasing rule @@ -36,7 +36,7 @@ inline C10_HOST_DEVICE float f32_from_bits(uint16_t src) { inline C10_HOST_DEVICE uint16_t bits_from_f32(float src) { uint32_t res = 0; -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) // We should be using memcpy in order to respect the strict aliasing rule // but it fails in the HIP environment. uint32_t* tempRes = reinterpret_cast(&src); @@ -49,7 +49,7 @@ inline C10_HOST_DEVICE uint16_t bits_from_f32(float src) { } inline C10_HOST_DEVICE uint16_t round_to_nearest_even(float src) { -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) if (src != src) { #elif defined(_MSC_VER) if (isnan(src)) { @@ -74,7 +74,7 @@ struct alignas(2) BFloat16 { uint16_t x; // HIP wants __host__ __device__ tag, CUDA does not -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_HOST_DEVICE BFloat16() = default; #else BFloat16() = default; diff --git a/c10/util/C++17.h b/c10/util/C++17.h index 90b04d896c9daf..13ba70ddab122e 100644 --- a/c10/util/C++17.h +++ b/c10/util/C++17.h @@ -107,7 +107,7 @@ using void_t = typename make_void::type; #endif -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) // rocm doesn't like the C10_HOST_DEVICE #define CUDA_HOST_DEVICE #else diff --git a/c10/util/Half.h b/c10/util/Half.h index c22db1fab2487d..72ad276f51c02f 100644 --- a/c10/util/Half.h +++ b/c10/util/Half.h @@ -372,7 +372,7 @@ struct alignas(2) Half { } // HIP wants __host__ __device__ tag, CUDA does not -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_HOST_DEVICE Half() = default; #else Half() = default; diff --git a/c10/util/complex.h b/c10/util/complex.h index 67ed463febd942..3ada79f5f3a63d 100644 --- a/c10/util/complex.h +++ b/c10/util/complex.h @@ -541,7 +541,7 @@ C10_HOST_DEVICE T abs(const c10::complex& z) { #endif } -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) #define ROCm_Bug(x) #else #define ROCm_Bug(x) x diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index 7f2ef7270e2fbd..68b1feda93b7dc 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -63,7 +63,7 @@ class ATenOp : public Operator { at::TensorOptions optionsFor(const Tensor& ten) { at::Device device = ten.GetDevice(); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) if 
(backend() == at::Backend::HIP) { device = at::Device(kCUDA, device.index()); } @@ -107,7 +107,7 @@ class ATenOp : public Operator { auto at_sizes = src.sizes(); caffe2::TypeMeta type_meta = typeMetaFor(src); at::Device device = src.device(); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) if (device.is_cuda()) { device = at::Device(at::DeviceType::HIP, device.index()); } diff --git a/caffe2/core/common_gpu.cc b/caffe2/core/common_gpu.cc index e9ca9927aa8b51..cb7fa3c8722cfd 100644 --- a/caffe2/core/common_gpu.cc +++ b/caffe2/core/common_gpu.cc @@ -117,7 +117,7 @@ void DeviceQuery(const int device) { << std::endl; ss << "Total registers per block: " << prop.regsPerBlock << std::endl; ss << "Warp size: " << prop.warpSize << std::endl; -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) ss << "Maximum memory pitch: " << prop.memPitch << std::endl; #endif ss << "Maximum threads per block: " << prop.maxThreadsPerBlock @@ -130,14 +130,14 @@ void DeviceQuery(const int device) { << prop.maxGridSize[2] << std::endl; ss << "Clock rate: " << prop.clockRate << std::endl; ss << "Total constant memory: " << prop.totalConstMem << std::endl; -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) ss << "Texture alignment: " << prop.textureAlignment << std::endl; ss << "Concurrent copy and execution: " << (prop.deviceOverlap ? "Yes" : "No") << std::endl; #endif ss << "Number of multiprocessors: " << prop.multiProcessorCount << std::endl; -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) ss << "Kernel execution timeout: " << (prop.kernelExecTimeoutEnabled ? "Yes" : "No") << std::endl; #endif @@ -186,7 +186,7 @@ const char* cublasGetErrorString(cublasStatus_t error) { return "CUBLAS_STATUS_ARCH_MISMATCH"; case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; case CUBLAS_STATUS_EXECUTION_FAILED: @@ -240,7 +240,7 @@ const char* curandGetErrorString(curandStatus_t error) { return "CURAND_STATUS_ARCH_MISMATCH"; case CURAND_STATUS_INTERNAL_ERROR: return "CURAND_STATUS_INTERNAL_ERROR"; -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) case HIPRAND_STATUS_NOT_IMPLEMENTED: return "HIPRAND_STATUS_NOT_IMPLEMENTED"; #endif diff --git a/caffe2/core/common_gpu.h b/caffe2/core/common_gpu.h index b9b43dea83bd12..0fe790bd246f5f 100644 --- a/caffe2/core/common_gpu.h +++ b/caffe2/core/common_gpu.h @@ -5,14 +5,14 @@ #include #include -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #ifdef __GNUC__ #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) #pragma GCC diagnostic push #endif #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif // __GNUC__ -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM #include #include @@ -30,10 +30,11 @@ // CAFFE2_CUDA_API gets translated to CAFFE2_HIP_API in hipify script, which // causes a marco redefinition issue with the later definition of // CAFFE2_HIP_API, so we exclude this definition when HIP is specified -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #define CAFFE2_CUDA_API TORCH_CUDA_CPP_API -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM +//TODO: [ROCm] Need to remove this after CUDA->HIP mapping is updated. 
#define CAFFE2_HIP_EXPORT C10_EXPORT #define CAFFE2_HIP_API TORCH_HIP_API @@ -52,20 +53,20 @@ #endif // cuda major revision number below which fp16 compute is not supoorted -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) constexpr int kFp16CUDADevicePropMajor = 6; #else constexpr int kFp16CUDADevicePropMajor = 3; #endif // Re-enable strict aliasing diagnostic if it was disabled. -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #ifdef __GNUC__ #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) #pragma GCC diagnostic pop #endif #endif // __GNUC__ -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM /** * The maximum number of peers that each gpu can have when doing p2p setup. @@ -78,14 +79,14 @@ constexpr int kFp16CUDADevicePropMajor = 3; namespace caffe2 { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) /** * Empty class to identify TensorCore-based math */ class TensorCoreEngine {}; -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM -#if CUDA_VERSION >= 10000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000 #define CAFFE2_CUDA_PTRATTR_MEMTYPE type #else #define CAFFE2_CUDA_PTRATTR_MEMTYPE memoryType @@ -95,7 +96,11 @@ class TensorCoreEngine {}; * A runtime function to report the cuda version that Caffe2 is built with. */ inline int CudaVersion() { +#if defined(USE_ROCM) + return ROCM_VERSION; +#else return CUDA_VERSION; +#endif } /** diff --git a/caffe2/core/macros.h.in b/caffe2/core/macros.h.in index bd9a447b879d65..11fd739b209080 100644 --- a/caffe2/core/macros.h.in +++ b/caffe2/core/macros.h.in @@ -57,7 +57,9 @@ static_assert( {"BLAS_INFO", "${BLAS_INFO}"}, \ {"LAPACK_INFO", "${LAPACK_INFO}"}, \ {"USE_CUDA", "${USE_CUDA}"}, \ + {"USE_ROCM", "${USE_ROCM}"}, \ {"CUDA_VERSION", "${CUDA_VERSION}"}, \ + {"ROCM_VERSION", "${ROCM_VERSION}"}, \ {"USE_CUDNN", "${USE_CUDNN}"}, \ {"CUDNN_VERSION", "${CUDNN_VERSION}"}, \ {"USE_NCCL", "${USE_NCCL}"}, \ diff --git a/caffe2/distributed/file_store_handler_op_gpu.cc b/caffe2/distributed/file_store_handler_op_gpu.cc index b60b6502604f75..2263b443cf23d8 100644 --- a/caffe2/distributed/file_store_handler_op_gpu.cc +++ b/caffe2/distributed/file_store_handler_op_gpu.cc @@ -1,6 +1,6 @@ #include "caffe2/distributed/file_store_handler_op.h" -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #include #else #include @@ -8,7 +8,7 @@ namespace caffe2 { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) REGISTER_CUDA_OPERATOR( FileStoreHandlerCreate, FileStoreHandlerCreateOp); diff --git a/caffe2/distributed/redis_store_handler_op_gpu.cc b/caffe2/distributed/redis_store_handler_op_gpu.cc index 6ad3a5375ca88c..7403d25e35d82a 100644 --- a/caffe2/distributed/redis_store_handler_op_gpu.cc +++ b/caffe2/distributed/redis_store_handler_op_gpu.cc @@ -1,6 +1,6 @@ #include "caffe2/distributed/redis_store_handler_op.h" -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #include #else #include @@ -8,7 +8,7 @@ namespace caffe2 { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) REGISTER_CUDA_OPERATOR( RedisStoreHandlerCreate, RedisStoreHandlerCreateOp); diff --git a/caffe2/operators/batch_matmul_op.cu b/caffe2/operators/batch_matmul_op.cu index 7bcaac97a15b16..be607d459761e1 100644 --- a/caffe2/operators/batch_matmul_op.cu +++ b/caffe2/operators/batch_matmul_op.cu @@ -12,7 +12,7 @@ bool BatchMatMulOp::RunOnDevice() { REGISTER_CUDA_OPERATOR(BatchMatMul, BatchMatMulOp); -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) template <> bool BatchMatMulOp::RunOnDevice() { diff --git a/caffe2/operators/channel_stats_op.cu 
b/caffe2/operators/channel_stats_op.cu index 424137d1f5c386..d498f183eae246 100644 --- a/caffe2/operators/channel_stats_op.cu +++ b/caffe2/operators/channel_stats_op.cu @@ -25,7 +25,7 @@ __global__ void ChannelStatsNCHWCUDAKernel( for (int n = threadIdx.x; n < N; n += blockDim.x) { for (int hw = threadIdx.y; hw < HxW; hw += blockDim.y) { const int index = (n * C + c) * HxW + hw; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) m_val += __ldg(X + index); v_val += __ldg(X + index) * __ldg(X + index); #else @@ -58,7 +58,7 @@ __global__ void ChannelStatsNHWCCUDAKernel( T v_val = 0; for (int i = threadIdx.x; i < inner_size; i += blockDim.x) { const int index = i * C + c; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) m_val += __ldg(X + index); v_val += __ldg(X + index) * __ldg(X + index); #else diff --git a/caffe2/operators/fully_connected_op_gpu.cc b/caffe2/operators/fully_connected_op_gpu.cc index 096e302737a22c..94232b9143cca2 100644 --- a/caffe2/operators/fully_connected_op_gpu.cc +++ b/caffe2/operators/fully_connected_op_gpu.cc @@ -139,7 +139,7 @@ bool FullyConnectedGradientOp< } -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) // Require these to be defined otherwise TensorCore FC ops will end // up calling the default FC implementation which doesn't have @@ -191,7 +191,7 @@ REGISTER_CUDA_OPERATOR( DefaultEngine, false /* don't transpose weight */>); -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) REGISTER_CUDA_OPERATOR_WITH_ENGINE( FC, diff --git a/caffe2/operators/generate_proposals_op.cu b/caffe2/operators/generate_proposals_op.cu index 64518538b6b0aa..cab0ad3d0b88f2 100644 --- a/caffe2/operators/generate_proposals_op.cu +++ b/caffe2/operators/generate_proposals_op.cu @@ -6,7 +6,7 @@ #include "caffe2/operators/generate_proposals_op_util_nms.h" #include "caffe2/operators/generate_proposals_op_util_nms_gpu.h" -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) #include #endif diff --git a/caffe2/operators/generate_proposals_op_util_nms_gpu.cu b/caffe2/operators/generate_proposals_op_util_nms_gpu.cu index 7851d6f3dad5e9..9776266154cf3f 100644 --- a/caffe2/operators/generate_proposals_op_util_nms_gpu.cu +++ b/caffe2/operators/generate_proposals_op_util_nms_gpu.cu @@ -7,7 +7,7 @@ namespace utils { namespace { // Helper data structure used locally struct -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) __align__(16) #endif Box { diff --git a/caffe2/operators/group_norm_op.cu b/caffe2/operators/group_norm_op.cu index 52becd364f9c70..535b7bad993336 100644 --- a/caffe2/operators/group_norm_op.cu +++ b/caffe2/operators/group_norm_op.cu @@ -44,7 +44,7 @@ __global__ void ComputeFusedParamsCUDAKernel( if (index < N * C) { const int ng = index / K; const int c = index % C; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) const float scale_val = __ldg(gamma + c) * __ldg(rsig + ng); scale[index] = scale_val; bias[index] = fmaf(-scale_val, __ldg(mu + ng), __ldg(beta + c)); @@ -78,7 +78,7 @@ __global__ void GroupNormForwardCUDAKernel( const int index = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; if (index < N * C * HxW) { const int nc = index / HxW; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) Y[index] = fmaf(__ldg(X + index), __ldg(scale + nc), __ldg(bias + nc)); #else Y[index] = fmaf(X[index], scale[nc], bias[nc]); @@ -98,7 +98,7 @@ __global__ void 
GroupNormForwardCUDAKernel( const int index = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; if (index < N * C * HxW) { const int nc = index / (HxW * C) * C + index % C; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) Y[index] = fmaf(__ldg(X + index), __ldg(scale + nc), __ldg(bias + nc)); #else Y[index] = fmaf(X[index], scale[nc], bias[nc]); @@ -120,7 +120,7 @@ __global__ void ComputeInternalGradientsNCHWCUDAKernel( T db_sum = 0; for (int i = threadIdx.x; i < HxW; i += blockDim.x) { const int index = nc * HxW + i; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) ds_sum += __ldg(dY + index) * __ldg(X + index); db_sum += __ldg(dY + index); #else @@ -160,7 +160,7 @@ __global__ void ComputeYGradientScaleCUDAKernel( if (index < N * C) { const int ng = index / K; const int c = index % C; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) dY_scale[index] = __ldg(gamma + c) * __ldg(rsig + ng); #else dY_scale[index] = gamma[c] * rsig[ng]; @@ -203,7 +203,7 @@ __global__ void ComputeXScaleAndBiasCUDAKernel( for (int i = threadIdx.x; i < K; i += blockDim.x) { const int index = ng * K + i; const int c = g * K + i; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) ds_sum += __ldg(ds + index) * __ldg(gamma + c); db_sum += __ldg(db + index) * __ldg(gamma + c); #else @@ -214,7 +214,7 @@ __global__ void ComputeXScaleAndBiasCUDAKernel( ds_sum = BlockReduce(ds_storage).Sum(ds_sum); db_sum = BlockReduce(db_storage).Sum(db_sum); if (threadIdx.x == 0) { -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) const float x = fmaf(db_sum, __ldg(mu + ng), -ds_sum) * math::utils::Cube(__ldg(rsig + ng)) * alpha; X_scale[ng] = x; @@ -258,7 +258,7 @@ __global__ void GroupNormBackwardCUDAKernel( if (index < N * C * HxW) { const int nc = index / HxW; const int ng = nc / K; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) dX[index] = fmaf( __ldg(dY_scale + nc), __ldg(dY + index), @@ -287,7 +287,7 @@ __global__ void GroupNormBackwardCUDAKernel( if (index < N * C * HxW) { const int nc = index / (HxW * C) * C + index % C; const int ng = nc / K; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) dX[index] = fmaf( __ldg(dY_scale + nc), __ldg(dY + index), @@ -333,7 +333,7 @@ __global__ void GammaBetaBackwardCUDAKernel( for (int i = threadIdx.x; i < N; i += blockDim.x) { const int nc = i * C + c; const int ng = i * G + g; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) dg_sum += fmaf(-__ldg(db + nc), __ldg(mu + ng), __ldg(ds + nc)) * __ldg(rsig + ng); db_sum += __ldg(db + nc); diff --git a/caffe2/operators/instance_norm_op.cu b/caffe2/operators/instance_norm_op.cu index 7fe46c813a11c4..c711db038108b1 100644 --- a/caffe2/operators/instance_norm_op.cu +++ b/caffe2/operators/instance_norm_op.cu @@ -21,7 +21,7 @@ __global__ void ComputeFusedParamsCUDAKernel( const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index < N * C) { const int64_t c = index % C; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) const T scale_val = __ldg(gamma + c) * __ldg(rstd + index); scale[index] = scale_val; bias[index] = __ldg(beta + c) - 
scale_val * __ldg(mean + index); @@ -47,7 +47,7 @@ __global__ void InstanceNormForwardCUDAKernel( const int64_t nc = kOrder == StorageOrder::NCHW ? (index / HxW) : (index / (HxW * C) * C + index % C); -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) Y[index] = __ldg(scale + nc) * __ldg(X + index) + __ldg(bias + nc); #else Y[index] = scale[nc] * X[index] + bias[nc]; @@ -69,7 +69,7 @@ __global__ void ComputeInternalGradientsNCHWCUDAKernel( T db_sum = 0; for (int64_t j = threadIdx.x; j < HxW; j += blockDim.x) { const int64_t index = i * HxW + j; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) ds_sum += __ldg(dY + index) * __ldg(X + index); db_sum += __ldg(dY + index); #else @@ -101,7 +101,7 @@ __global__ void ComputeFusedParams( const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index < N * C) { const int64_t c = index % C; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) T x = __ldg(ds + index) * __ldg(gamma + c); T y = __ldg(db + index) * __ldg(gamma + c); x = (y * __ldg(mean + index) - x) * @@ -136,7 +136,7 @@ __global__ void InstanceNormBackwardCUDAKernel( const int64_t c = kOrder == StorageOrder::NCHW ? (index / HxW) : (index / (HxW * C) * C + index % C); -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) dX[index] = __ldg(c1 + c) * __ldg(dY + index) + __ldg(c2 + c) * __ldg(X + index) + __ldg(c3 + c); #else @@ -162,7 +162,7 @@ __global__ void GammaBetaBackwardCUDAKernel( T sum2 = 0; for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { const int64_t index = i * C + c; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) sum1 += (__ldg(ds + index) - __ldg(db + index) * __ldg(mean + index)) * __ldg(rstd + index); sum2 += __ldg(db + index); diff --git a/caffe2/operators/minmax_ops.cu b/caffe2/operators/minmax_ops.cu index ca5da5b7b2a601..e0eb52ba2d8fcd 100644 --- a/caffe2/operators/minmax_ops.cu +++ b/caffe2/operators/minmax_ops.cu @@ -16,7 +16,7 @@ __global__ void SelectGradientCUDAKernel( T* dX) { const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; if (i < N) { -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) dX[i] = __ldg(X + i) == __ldg(Y + i) ? __ldg(dY + i) : T(0); #else dX[i] = X[i] == Y[i] ? 
dY[i] : T(0); diff --git a/caffe2/operators/reduce_front_back_max_ops.cu b/caffe2/operators/reduce_front_back_max_ops.cu index 3c6ee7f0ae2ffc..ae91f8a6da727d 100644 --- a/caffe2/operators/reduce_front_back_max_ops.cu +++ b/caffe2/operators/reduce_front_back_max_ops.cu @@ -2,7 +2,7 @@ #include "caffe2/core/context_gpu.h" #include "caffe2/operators/reduce_front_back_max_ops.h" -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) #include #endif diff --git a/caffe2/operators/rmac_regions_op.cu b/caffe2/operators/rmac_regions_op.cu index 0ec2dd351a3d50..76c4d012d71a55 100644 --- a/caffe2/operators/rmac_regions_op.cu +++ b/caffe2/operators/rmac_regions_op.cu @@ -3,11 +3,11 @@ #include "caffe2/core/context_gpu.h" #include "caffe2/operators/rmac_regions_op.h" -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) #include #endif -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) namespace rocprim { #else namespace cub { diff --git a/caffe2/operators/segment_reduction_op_gpu.cu b/caffe2/operators/segment_reduction_op_gpu.cu index 33befacc43ab70..44d5732b637224 100644 --- a/caffe2/operators/segment_reduction_op_gpu.cu +++ b/caffe2/operators/segment_reduction_op_gpu.cu @@ -41,7 +41,7 @@ void inclusive_scan_wrapper( } template -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS) #endif __global__ void length_sum_kernel( @@ -85,7 +85,7 @@ __global__ void length_sum_kernel( } template -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS) #endif __global__ void length_sum_gradient_kernel( @@ -126,7 +126,7 @@ __global__ void length_sum_gradient_kernel( } template -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS) #endif __global__ void length_max_kernel( @@ -172,7 +172,7 @@ __global__ void length_max_kernel( } template -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS) #endif __global__ void length_weighted_sum_gradient_kernel( @@ -209,7 +209,7 @@ __global__ void length_weighted_sum_gradient_kernel( } template -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS) #endif __global__ void length_weighted_sum_with_main_input_gradient_kernel( @@ -252,7 +252,7 @@ __global__ void length_weighted_sum_with_main_input_gradient_kernel( } template -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS) #endif __global__ void sparse_length_max_kernel( @@ -313,7 +313,7 @@ __global__ void sparse_length_max_kernel( } template -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS) #endif __global__ void sparse_length_weighted_sum_kernel( diff --git a/caffe2/operators/segment_reduction_op_gpu.cuh b/caffe2/operators/segment_reduction_op_gpu.cuh index ffe834e886ef96..8d51196ee13884 100644 --- a/caffe2/operators/segment_reduction_op_gpu.cuh +++ b/caffe2/operators/segment_reduction_op_gpu.cuh @@ -4,7 +4,7 @@ #include "caffe2/core/context_gpu.h" -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) #define SEGREDUCE_MINBLOCKS 8 #else #define SEGREDUCE_MINBLOCKS 16 @@ -56,7 +56,7 @@ template < typename IndexType, bool ExactBlock = false, bool Average = false> -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024,SEGREDUCE_MINBLOCKS) #endif __global__ void sparse_length_sum_kernel( diff --git a/caffe2/operators/tile_op.cu b/caffe2/operators/tile_op.cu index 16151c280860e9..ae0610713f1435 100644 --- 
a/caffe2/operators/tile_op.cu +++ b/caffe2/operators/tile_op.cu @@ -20,7 +20,7 @@ __global__ void TileCopyCUDAKernel( if (x < total_size) { const int r = x / inner_size / tiles; const int c = x % inner_size; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) Y[x] = __ldg(X + r * inner_size + c); #else Y[x] = X[r * inner_size + c]; diff --git a/caffe2/operators/top_k_heap_selection.cuh b/caffe2/operators/top_k_heap_selection.cuh index e6bb1226d581a1..ec9d1edfb30c50 100644 --- a/caffe2/operators/top_k_heap_selection.cuh +++ b/caffe2/operators/top_k_heap_selection.cuh @@ -71,7 +71,7 @@ __device__ inline void warpHeapInsert(K k, V v, K* keyHeap, V* valueHeap) { // (0 12 3456) // log2(8 / 2) = 2 levels of interior nodes for heap size 8 (0 and 12) int i = 0; -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for (int levels = 0; levels < math::IntegerLog2(HeapSize / 2); ++levels) { @@ -114,7 +114,7 @@ warpHeap(K k, V v, K& keyHeapHead, K* keyHeap, V* valueHeap) { bool wantInsert = Dir ? (k > keyHeapHead) : (k < keyHeapHead); // Find out all the lanes that have elements to add to the heap -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) unsigned long long int vote = __ballot(wantInsert); if (!vote) { @@ -138,7 +138,7 @@ warpHeap(K k, V v, K& keyHeapHead, K* keyHeap, V* valueHeap) { // that have elements int index = __popc(getLaneMaskLt() & vote); int total = __popc(vote); -#endif // __HIP_PLATFORM_HCC__ +#endif // _USE_ROCM // FIXME: try switch statement and explicitly handle cases // FIXME: how do cases work? @@ -261,14 +261,14 @@ __global__ void selectRowsViaHeap( V vals[Unroll]; for (int i = threadIdx.x; i < n; i += blockDim.x * Unroll) { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for (int j = 0; j < Unroll; ++j) { vals[j] = inputStart[i + j * blockDim.x]; } -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #pragma unroll #endif for (int j = 0; j < Unroll; ++j) { diff --git a/caffe2/operators/top_k_radix_selection.cuh b/caffe2/operators/top_k_radix_selection.cuh index 250a075360775d..5405e121cc0f5f 100644 --- a/caffe2/operators/top_k_radix_selection.cuh +++ b/caffe2/operators/top_k_radix_selection.cuh @@ -170,11 +170,11 @@ __device__ void countRadixUsingMask(CountType counts[RadixSize], #pragma unroll for (unsigned int j = 0; j < RadixSize; ++j) { bool vote = hasVal && (digitInRadix == j); -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) counts[j] += __popcll(__ballot(vote)); #else counts[j] += __popc(__ballot_sync(__activemask(), vote)); -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM } } diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 071db4c280d607..ad04cab82d5aa0 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -1064,15 +1064,15 @@ void addGlobalMethods(py::module& m) { #endif // CAFFE2_USE_MKLDNN ); - // if the binary is built with __HIP_PLATFORM_HCC__, this is a ROCm build + // if the binary is built with USE_ROCM, this is a ROCm build // and therefore we need to ignore dyndep failures (because the the module // may not have a ROCm equivalent yet e.g. 
nccl) m.attr("use_rocm") = py::bool_( -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) true -#else // __HIP_PLATFORM_HCC__ +#else // USE_ROCM false -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM ); m.attr("use_trt") = py::bool_( diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu index e2bf91c880395e..2c2ad2cf76aea3 100644 --- a/caffe2/sgd/adagrad_fused_op_gpu.cu +++ b/caffe2/sgd/adagrad_fused_op_gpu.cu @@ -86,7 +86,7 @@ void sort_pairs_wrapper( } template -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS) #endif __global__ void gradient_mean_kernel( @@ -104,7 +104,7 @@ __global__ void gradient_mean_kernel( } template -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS) #endif __global__ void sparse_adagrad_fused_length_sum_gradient_kernel( @@ -171,7 +171,7 @@ __global__ void sparse_adagrad_fused_length_sum_gradient_kernel( } template -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS) #endif __global__ void sparse_adagrad_fused_length_weighted_sum_gradient_kernel( @@ -252,7 +252,7 @@ __global__ void sparse_adagrad_fused_length_weighted_sum_gradient_kernel( // Construct a reverse map of offset_of_idx -> segment_id. template -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS) #endif __global__ void linear_index_weight_offsets_dedup_kernel( @@ -279,7 +279,7 @@ template < typename T, bool ExactBlock = false, roundOption roundOpt = NEAREST> -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS) #endif __global__ void rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel( @@ -343,7 +343,7 @@ __global__ void rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel( sorted_linear_ind_data[sorted_linear_indice_id + num_dup + threadIdx.x] == index; } -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) int32_t num_dup_incr = __popc(__ballot_sync(0xFFFFFFFF, segment_continue)); #else int32_t num_dup_incr = __popc(__ballot(segment_continue)); @@ -438,7 +438,7 @@ __global__ void rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel( } template -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS) #endif __global__ diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cuh b/caffe2/sgd/adagrad_fused_op_gpu.cuh index ee4b0d0255da27..e33cf36e2d6e36 100644 --- a/caffe2/sgd/adagrad_fused_op_gpu.cuh +++ b/caffe2/sgd/adagrad_fused_op_gpu.cuh @@ -10,7 +10,7 @@ #include "caffe2/core/operator.h" #include "caffe2/utils/GpuAtomics.cuh" -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) #define SEGREDUCE_MINBLOCKS 8 #else #define SEGREDUCE_MINBLOCKS 16 @@ -31,7 +31,7 @@ constexpr int kWarpSize = 32; template inline __device__ T shfl_xor(const T val, int laneMask, int width = kWarpSize) { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) return __shfl_xor_sync(0xffffffff, val, laneMask, width); #else return __shfl_xor(val, laneMask, width); @@ -108,8 +108,9 @@ static inline __device__ void gpuAtomicAdd(float* address, float val) { } static inline __device__ void gpuAtomicAdd(c10::Half* address, c10::Half val) { -#if ( \ - (CUDA_VERSION < 10000) || \ +#if ( \ + (defined(USE_ROCM)) || \ + (defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \ (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) unsigned int* address_as_ui = (unsigned int*)((char*)address - ((size_t)address & 2)); @@ -136,7 
+137,7 @@ template < typename T, bool ExactBlock = false, roundOption roundOpt = NEAREST> -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS) #endif __global__ void rowwise_sparse_adagrad_fused_length_sum_gradient_kernel( diff --git a/caffe2/sgd/fp16_momentum_sgd_op.cu b/caffe2/sgd/fp16_momentum_sgd_op.cu index 985f4f2864d15c..f565bf1abe2639 100644 --- a/caffe2/sgd/fp16_momentum_sgd_op.cu +++ b/caffe2/sgd/fp16_momentum_sgd_op.cu @@ -22,7 +22,7 @@ __global__ void FP16MomentumSGDKernel( bool nesterov, const float wd, half2* param) { -#if __CUDA_ARCH__ >= 530 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 530 || defined(USE_ROCM) const float lr2 = lr[0]; const half2 LR = __float2half2_rn(lr2); const half2 momentum = __float2half2_rn(mom); @@ -109,7 +109,7 @@ __global__ void FP16MomentumSGDFP32Kernel( bool nesterov, const float wd, half2* param) { -#if __CUDA_ARCH__ >= 530 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 530 || defined(USE_ROCM) const float lr2 = lr[0]; const float LR = lr2; const float momentum = mom; diff --git a/caffe2/utils/GpuAtomics.cuh b/caffe2/utils/GpuAtomics.cuh index ad5d8725e0c42b..2bbcc14fa7dadc 100644 --- a/caffe2/utils/GpuAtomics.cuh +++ b/caffe2/utils/GpuAtomics.cuh @@ -14,7 +14,7 @@ inline __device__ void gpu_atomic_add(T* address, const T val) { template <> inline __device__ void gpu_atomic_add(float* address, const float val) { -#if defined(__HIP_PLATFORM_HCC__) && defined(__gfx908__) +#if defined(USE_ROCM) && defined(__gfx908__) atomicAddNoRet(address, val); #else atomicAdd(address, val); diff --git a/caffe2/utils/GpuDefs.cuh b/caffe2/utils/GpuDefs.cuh index be591cc95b92e0..fcf2c64ddcb1a9 100644 --- a/caffe2/utils/GpuDefs.cuh +++ b/caffe2/utils/GpuDefs.cuh @@ -7,7 +7,7 @@ namespace caffe2 { // Static definition of GPU warp size for unrolling and code generation -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) constexpr int kWarpSize = warpSize; // = 64 (Defined in hip_runtime.h) #else constexpr int kWarpSize = 32; @@ -25,7 +25,7 @@ template <> struct Bitfield { static __device__ __forceinline__ unsigned int getBitfield(unsigned int val, int pos, int len) { -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) pos &= 0xff; len &= 0xff; @@ -35,12 +35,12 @@ struct Bitfield { unsigned int ret; asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len)); return ret; -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM } static __device__ __forceinline__ unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) { -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) pos &= 0xff; len &= 0xff; @@ -55,7 +55,7 @@ struct Bitfield { asm("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len)); return ret; -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM } }; @@ -63,7 +63,7 @@ template <> struct Bitfield { static __device__ __forceinline__ unsigned long long int getBitfield(unsigned long long int val, int pos, int len) { -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) pos &= 0xff; len &= 0xff; @@ -73,12 +73,12 @@ struct Bitfield { unsigned long long int ret; asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); return ret; -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM } static __device__ __forceinline__ unsigned long long int setBitfield(unsigned long long int val, unsigned long long int toInsert, int pos, int len) { -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) pos &= 
0xff; len &= 0xff; @@ -93,21 +93,21 @@ struct Bitfield { asm("bfi.b64 %0, %1, %2, %3, %4;" : "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len)); return ret; -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM } }; __device__ __forceinline__ int getLaneId() { -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) return __lane_id(); #else int laneId; asm("mov.s32 %0, %%laneid;" : "=r"(laneId) ); return laneId; -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM } -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) __device__ __forceinline__ unsigned long long int getLaneMaskLt() { unsigned long long int m = (1ull << getLaneId()) - 1ull; return m; @@ -151,7 +151,7 @@ __device__ __forceinline__ unsigned getLaneMaskGe() { asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask)); return mask; } -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM } // namespace caffe2 diff --git a/caffe2/utils/GpuScanUtils.cuh b/caffe2/utils/GpuScanUtils.cuh index af2577651973c7..0f6823d8e85efd 100644 --- a/caffe2/utils/GpuScanUtils.cuh +++ b/caffe2/utils/GpuScanUtils.cuh @@ -62,7 +62,7 @@ __device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunct template __device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) { // Within-warp, we use warp voting. -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) unsigned long long int vote = __ballot(in); T index = __popcll(getLaneMaskLe() & vote); @@ -71,7 +71,7 @@ __device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFuncti T vote = __ballot_sync(__activemask(), in); T index = __popc(getLaneMaskLe() & vote); T carry = __popc(vote); -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM int warp = threadIdx.x / kWarpSize; @@ -117,11 +117,11 @@ __device__ void exclusiveBinaryPrefixScan(T* smem, bool in, T* out, T* carry, Bi *out -= (T) in; // The outgoing carry for all threads is the last warp's sum -#if defined(__HIP_PLATFORM_HCC__) +#if defined(USE_ROCM) *carry = smem[math::DivUp(blockDim.x, kWarpSize) - 1]; #else *carry = smem[(blockDim.x / kWarpSize) - 1]; -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM if (KillWARDependency) { __syncthreads(); diff --git a/caffe2/utils/fixed_divisor.h b/caffe2/utils/fixed_divisor.h index 82aa843521275c..c8607c12c76dff 100644 --- a/caffe2/utils/fixed_divisor.h +++ b/caffe2/utils/fixed_divisor.h @@ -30,16 +30,16 @@ class FixedDivisor { FixedDivisor() = default; explicit FixedDivisor(const std::int32_t d) : d_(d) { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) CalcSignedMagic(); -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM } FIXED_DIVISOR_DECL std::int32_t d() const { return d_; } -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) FIXED_DIVISOR_DECL std::uint64_t magic() const { return magic_; } @@ -47,17 +47,17 @@ class FixedDivisor { FIXED_DIVISOR_DECL int shift() const { return shift_; } -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM /// Calculates `q = n / d`. FIXED_DIVISOR_DECL std::int32_t Div(const std::int32_t n) const { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) return n / d_; -#else // __HIP_PLATFORM_HCC__ +#else // USE_ROCM // In lieu of a mulhi instruction being available, perform the // work in uint64 return (int32_t)((magic_ * (uint64_t)n) >> shift_); -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM } /// Calculates `r = n % d`. 
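FixedDivisor::Div() above keeps two implementations: the ROCm path is a plain n / d_, while the CUDA path multiplies by a precomputed 64-bit magic constant and shifts (Hacker's Delight, section 10), so no integer divide is issued in the kernel. Below is a small host-side check of that identity; the (magic, shift) pair for d = 3 is a textbook choice, not necessarily the exact values CalcSignedMagic() produces.

    #include <cassert>
    #include <cstdint>

    // Same shape as the CUDA branch of FixedDivisor::Div():
    // q = (int32_t)((magic * (uint64_t)n) >> shift)
    static std::int32_t DivByMagic(std::int32_t n, std::uint64_t magic, int shift) {
      return static_cast<std::int32_t>((magic * static_cast<std::uint64_t>(n)) >> shift);
    }

    int main() {
      // 0xAAAAAAAB is roughly 2^33 / 3, so (n * magic) >> 33 == n / 3
      // for every non-negative 32-bit n.
      const std::uint64_t magic = 0xAAAAAAABull;
      const int shift = 33;
      assert(DivByMagic(10, magic, shift) == 10 / 3);
      assert(DivByMagic(999, magic, shift) == 999 / 3);
      assert(DivByMagic(2147483647, magic, shift) == 2147483647 / 3);
      return 0;
    }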
@@ -73,7 +73,7 @@ class FixedDivisor { } private: -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) // Calculates magic multiplicative value and shift amount for calculating `q = // n / d` for signed 32-bit integers. // Implementation taken from Hacker's Delight section 10. @@ -117,14 +117,14 @@ class FixedDivisor { shift_ = p; magic_ = (std::uint64_t)(std::uint32_t)magic; } -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM std::int32_t d_ = 1; -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) std::uint64_t magic_; int shift_; -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM }; } // namespace caffe2 diff --git a/caffe2/utils/fixed_divisor_test.cc b/caffe2/utils/fixed_divisor_test.cc index fc5e74a90625b0..6093bc764c3980 100644 --- a/caffe2/utils/fixed_divisor_test.cc +++ b/caffe2/utils/fixed_divisor_test.cc @@ -17,7 +17,7 @@ void CompareDivMod(int32_t v, int32_t divisor) { int fixed_q = fixed.Div(v); int fixed_r = fixed.Mod(v); -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) EXPECT_EQ(native_q, fixed_q) << v << " / " << divisor << " magic " << fixed.magic() << " shift " << fixed.shift() << " quot " << fixed_q << " " << native_q; diff --git a/caffe2/utils/math/broadcast.cu b/caffe2/utils/math/broadcast.cu index 7d7a2535743c6a..8c0c5795192671 100644 --- a/caffe2/utils/math/broadcast.cu +++ b/caffe2/utils/math/broadcast.cu @@ -32,7 +32,7 @@ __global__ void AffineChannelNCHWCUDAKernel( const int w = blockIdx.x % M * CAFFE_CUDA_NUM_THREADS + threadIdx.x; if (w < HxW) { const int index = nc * HxW + w; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) Y[index] = fmaf(__ldg(X + index), __ldg(scale + c), __ldg(bias + c)); #else Y[index] = fmaf(X[index], scale[c], bias[c]); @@ -58,7 +58,7 @@ __global__ void AffineChannelNHWCCUDAKernel( const int c = blockIdx.y * CAFFE_CUDA_NUM_THREADS + threadIdx.x; if (c < C) { const int index = blockIdx.x * C + c; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) Y[index] = fmaf(__ldg(X + index), __ldg(scale + c), __ldg(bias + c)); #else Y[index] = fmaf(X[index], scale[c], bias[c]); diff --git a/caffe2/utils/math/elementwise.cu b/caffe2/utils/math/elementwise.cu index 778147a7b9eb93..b41d2590e9296b 100644 --- a/caffe2/utils/math/elementwise.cu +++ b/caffe2/utils/math/elementwise.cu @@ -21,7 +21,7 @@ template __global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) { const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; if (i < N) { -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) c10::cuda::compat::sincos(__ldg(X + i), S + i, C + i); #else c10::cuda::compat::sincos(X[i], S + i, C + i); @@ -29,7 +29,7 @@ __global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) { } } -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) template __global__ void AxpyCUDAKernel( @@ -105,7 +105,7 @@ __global__ void AxpyCUDAKernel( DELEGATE_HALF_AXPY_CUDA_KERNEL(float, fmaf) #undef DELEGATE_HALF_AXPY_CUDA_KERNEL -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM template __global__ void AxpbyCUDAKernel( @@ -473,7 +473,7 @@ DELEGATE_CUDA_SCALE(float, cublasSscal) DELEGATE_CUDA_SCALE(double, cublasDscal) #undef DELEGATE_CUDA_SCALE -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #define DELEGATE_CUDA_SCALE_EX( \ TAlpha, TData, kAlphaType, kDataType, kExecutionType) \ @@ -541,7 +541,7 @@ DELEGATE_CUDA_SCALE_EX(float, double, CUDA_R_32F, CUDA_R_64F, CUDA_R_64F) 
DELEGATE_CUDA_SCALE_EX(float, at::Half, CUDA_R_32F, CUDA_R_16F, CUDA_R_32F) #undef DELEGATE_CUDA_SCALE_EX -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM #define CAFFE2_SPECIALIZED_CUDA_SCALE(TAlpha, TData) \ template <> \ @@ -577,10 +577,10 @@ DELEGATE_CUDA_SCALE_EX(float, at::Half, CUDA_R_32F, CUDA_R_16F, CUDA_R_32F) CAFFE2_SPECIALIZED_CUDA_SCALE(std::int32_t, std::int32_t) CAFFE2_SPECIALIZED_CUDA_SCALE(std::int64_t, std::int64_t) -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) CAFFE2_SPECIALIZED_CUDA_SCALE(float, double) CAFFE2_SPECIALIZED_CUDA_SCALE(float, at::Half) -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM #undef CAFFE2_SPECIALIZED_CUDA_SCALE #define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Func, DeviceFunc) \ @@ -793,7 +793,7 @@ DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION( DELEGATE_CUDA_AXPY(float, cublasSaxpy) #undef DELEGATE_CUDA_AXPY -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #define DELEGATE_CUDA_AXPY_EX( \ TAlpha, TData, kAlphaType, kDataType, kExecutionType) \ @@ -845,7 +845,7 @@ DELEGATE_CUDA_AXPY_EX(float, double, CUDA_R_32F, CUDA_R_64F, CUDA_R_64F) DELEGATE_CUDA_AXPY_EX(float, at::Half, CUDA_R_32F, CUDA_R_16F, CUDA_R_32F) #undef DELEGATE_CUDA_AXPY_EX -#else // __HIP_PLATFORM_HCC__ +#else // USE_ROCM #define CAFFE2_SPECIALIZED_CUDA_AXPY(TAlpha, TData) \ template <> \ @@ -878,7 +878,7 @@ CAFFE2_SPECIALIZED_CUDA_AXPY(float, double) CAFFE2_SPECIALIZED_CUDA_AXPY(float, at::Half) #undef CAFFE2_SPECIALIZED_CUDA_AXPY -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM #define CAFFE2_SPECIALIZED_CUDA_AXPBY(TAlpha, TData) \ template <> \ diff --git a/caffe2/utils/math/reduce.cu b/caffe2/utils/math/reduce.cu index 8c40c5d2b0ba74..fc3e476b288ba7 100644 --- a/caffe2/utils/math/reduce.cu +++ b/caffe2/utils/math/reduce.cu @@ -35,7 +35,7 @@ __global__ void RowwiseReduceCUDAKernel( const int r = blockIdx.x; T val = init; for (int c = threadIdx.x; c < cols; c += blockDim.x) { -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) val = reducer(val, __ldg(X + r * cols + c)); #else val = reducer(val, X[r * cols + c]); @@ -60,7 +60,7 @@ __global__ void ColwiseReduceCUDAKernel( const int c = blockIdx.x; T val = init; for (int r = threadIdx.x; r < rows; r += blockDim.x) { -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) val = reducer(val, __ldg(X + r * cols + c)); #else val = reducer(val, X[r * cols + c]); @@ -88,7 +88,7 @@ __global__ void BothEndsReduceCUDAKernel( T val = init; for (int m = threadIdx.x; m < M; m += blockDim.x) { for (int k = threadIdx.y; k < K; k += blockDim.y) { -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) val = reducer(val, __ldg(X + (m * N + n) * K + k)); #else val = reducer(val, X[(m * N + n) * K + k]); @@ -123,7 +123,7 @@ __global__ void ReduceTensorCUDAKernel( X_index += Y_index % Y_dims.data[d] * X_strides.data[d]; Y_index /= Y_dims.data[d]; } -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) val = reducer(val, __ldg(X + X_index)); #else val = reducer(val, X[X_index]); @@ -252,7 +252,7 @@ RowwiseMomentsCUDAKernel(const int cols, const T* X, T* mean, T* var) { T v_val = 0; for (int c = threadIdx.x; c < cols; c += blockDim.x) { const int X_index = r * cols + c; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) m_val += __ldg(X + X_index); v_val += __ldg(X + X_index) * __ldg(X + 
X_index); #else @@ -284,7 +284,7 @@ __global__ void ColwiseMomentsCUDAKernel( T v_val = 0; for (int r = threadIdx.x; r < rows; r += blockDim.x) { const int X_index = r * cols + c; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) m_val += __ldg(X + X_index); v_val += __ldg(X + X_index) * __ldg(X + X_index); #else @@ -320,7 +320,7 @@ __global__ void BothEndsMomentsCUDAKernel( for (int m = threadIdx.x; m < M; m += blockDim.x) { for (int k = threadIdx.y; k < K; k += blockDim.y) { const int X_index = (m * N + n) * K + k; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) m_val += __ldg(X + X_index); v_val += __ldg(X + X_index) * __ldg(X + X_index); #else @@ -360,7 +360,7 @@ __global__ void MomentsCUDAKernel( X_index += Y_index % Y_dims.data[d] * X_strides.data[d]; Y_index /= Y_dims.data[d]; } -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) m_val += __ldg(X + X_index); v_val += __ldg(X + X_index) * __ldg(X + X_index); #else diff --git a/caffe2/utils/math/transpose.cu b/caffe2/utils/math/transpose.cu index 4474d38311ad28..07a3c311780e37 100644 --- a/caffe2/utils/math/transpose.cu +++ b/caffe2/utils/math/transpose.cu @@ -38,7 +38,7 @@ __global__ void BatchTranspose2DCUDAKernel( int y = r * kTileDim + threadIdx.y; if (x < W) { for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) { -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) tile[threadIdx.y + i][threadIdx.x] = __ldg(X + offset + (y + i) * W + x); #else tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x]; @@ -132,7 +132,7 @@ __global__ void TransposeCUDAKernel( X_index += v % Y_dims.data[i] * X_strides.data[i]; v /= Y_dims.data[i]; } -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) Y[Y_index] = __ldg(X + X_index); #else Y[Y_index] = X[X_index]; diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index 7f3bb8eea6f56b..a37d4b744d73c2 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -20,7 +20,7 @@ #include "caffe2/utils/fixed_divisor.h" // TODO: Move this to fixed_divisor.h -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) #define FIXED_DIVISOR int32_t #define FIXED_DIVISOR_DIV(d, n) (n / d) #define FIXED_DIVISOR_MOD(d, n) (n % d) @@ -30,14 +30,14 @@ *q = n_copy / d; \ *r = n_copy % d; \ } while (0) -#else // __HIP_PLATFORM_HCC__ +#else // USE_ROCM #define FIXED_DIVISOR FixedDivisor #define FIXED_DIVISOR_DIV(d, n) (d.Div(n)) #define FIXED_DIVISOR_MOD(d, n) (d.Mod(n)) #define FIXED_DIVISOR_DIV_MOD(d, n, q, r) (d.DivMod(n, q, r)) -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) using CUBLAS_HALF_TYPE = rocblas_half; #else // __HIP_PLATFORM_HCC using CUBLAS_HALF_TYPE = __half; @@ -606,7 +606,7 @@ CAFFE2_CUDA_EXPORT void Gemm( if (math_type == TensorProto_DataType_FLOAT) { CUBLAS_ENFORCE(cublasSetPointerMode( context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) // rocblas doesn't support cublasSgemmEx type API yet. 
// It has more general rocblas_gemm_ex API which is more close to // cublasGemmEx rocblas_gemm_ex does D = alpha*op( A )*op( B ) + beta*C, @@ -655,7 +655,7 @@ CAFFE2_CUDA_EXPORT void Gemm( C, CUDA_R_16F, N)); -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM } else if (math_type == TensorProto_DataType_FLOAT16) { // convert alpha, beta from float -> __half const __half alpha_fp16 = at::Half(alpha); @@ -721,7 +721,7 @@ CAFFE2_CUDA_EXPORT void GemmBatched( float** C, CUDAContext* context, TensorProto::DataType math_type) { -#if __CUDACC_VER_MAJOR__ < 8 || defined(__HIP_PLATFORM_HCC__) +#if __CUDACC_VER_MAJOR__ < 8 || defined(USE_ROCM) // loop over matrices in the batch for (int i = 0; i < batch_size; ++i) { Gemm( @@ -790,7 +790,7 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched( const int C_stride, CUDAContext* context, TensorProto::DataType math_type) { -#if __CUDACC_VER_MAJOR__ < 8 && !defined(__HIP_PLATFORM_HCC__) +#if __CUDACC_VER_MAJOR__ < 8 && !defined(USE_ROCM) // loop over matrices in the batch for (int i = 0; i < batch_size; ++i) { Gemm( @@ -961,7 +961,7 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched( const int C_stride, CUDAContext* context, TensorProto::DataType math_type) { -#if __CUDACC_VER_MAJOR__ < 8 && !defined(__HIP_PLATFORM_HCC__) +#if __CUDACC_VER_MAJOR__ < 8 && !defined(USE_ROCM) // loop over matrices in the batch for (int i = 0; i < batch_size; ++i) { Gemm( @@ -983,7 +983,7 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched( if (math_type == TensorProto_DataType_FLOAT) { CUBLAS_ENFORCE(cublasSetPointerMode( context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) // D[i*stride_d] = alpha*op(A[i*stride_a])*op(B[i*stride_b]) + // beta*C[i*stride_c], for i in [0,batch_count-1] ROCBLAS_ENFORCE(rocblas_gemm_strided_batched_ex( @@ -1041,7 +1041,7 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched( batch_size, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM } else if (math_type == TensorProto_DataType_FLOAT16) { // Convert alpha, beta from float -> __half const __half alpha_fp16 = at::Half(alpha); @@ -1128,7 +1128,7 @@ CAFFE2_CUDA_EXPORT void Gemv( if (math_type == TensorProto_DataType_FLOAT) { CUBLAS_ENFORCE(cublasSetPointerMode( context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) // rocblas doesn't support cublasSgemmEx type API yet. // It has more general rocblas_gemm_ex API which is more close to // cublasGemmEx rocblas_gemm_ex does D = alpha*op( A )*op( B ) + beta*C, @@ -1177,7 +1177,7 @@ CAFFE2_CUDA_EXPORT void Gemv( y, CUDA_R_16F, ldc)); -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM } else if (math_type == TensorProto_DataType_FLOAT16) { const __half alpha_fp16 = at::Half(alpha); const __half beta_fp16 = at::Half(beta); @@ -1204,7 +1204,7 @@ CAFFE2_CUDA_EXPORT void Gemv( } } -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) // No change, but required. 
Defer to default CUDA engine template <> @@ -1675,9 +1675,9 @@ CAFFE2_CUDA_EXPORT void Dot( const at::Half* b, at::Half* y, CUDAContext* context) { -#if defined __HIP_PLATFORM_HCC__ && TORCH_HIP_VERSION < 210 +#if defined(USE_ROCM) && (TORCH_HIP_VERSION < 210) CAFFE_THROW("HIP currently does not support FP16 completely yet."); -#elif defined __HIP_PLATFORM_HCC__ && TORCH_HIP_VERSION >= 210 +#elif defined(USE_ROCM) && (TORCH_HIP_VERSION >= 210) CUBLAS_ENFORCE(cublasSetPointerMode( context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); CUBLAS_ENFORCE(rocblas_hdot( @@ -1997,7 +1997,7 @@ __global__ void Im2ColNCHWCUDAKernel( for (int j = 0; j < kernel_w; ++j) { const int h = h_in + dh; const int w = w_in + dw; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) *col_data_ptr = utils::IsAGeZeroAndALtB(h, input_h) && utils::IsAGeZeroAndALtB(w, input_w) ? __ldg(img_data_ptr + dh * input_w + dw) @@ -2048,7 +2048,7 @@ __global__ void Im2ColNHWCCUDAKernel( for (int j = 0; j < kernel_w; ++j) { const int h = h_in + dh; const int w = w_in + dw; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) *col_data_ptr = utils::IsAGeZeroAndALtB(h, input_h) && utils::IsAGeZeroAndALtB(w, input_w) ? __ldg(img_data + (h * input_w + w) * channels + channel_in) @@ -2110,7 +2110,7 @@ __global__ void Col2ImNCHWCUDAKernel( (((c * patch_h + h_k) * patch_w + w_k) * output_h + h_col) * output_w + w_col; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) val += __ldg(col_data + col_data_index); #else val += col_data[col_data_index]; @@ -2162,7 +2162,7 @@ __global__ void Col2ImNHWCCUDAKernel( h_k /= dilation_h; w_k /= dilation_w; const int c_col = (h_k * patch_w + w_k) * channels + c; -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) val += __ldg( col_data + (h_col * output_w + w_col) * channels_col + c_col); #else @@ -2214,7 +2214,7 @@ __global__ void Im2ColNdNCHWCUDAKernel( is_padding |= !utils::IsAGeZeroAndALtB(d_img, img_shape.data[d_i + 1]); img_index = img_index * img_shape.data[d_i + 1] + d_img; } -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) if (!kCol2Im) { Y_data[col_index] = is_padding ? 
0 : __ldg(X_data + img_index); } else if (!is_padding) { @@ -2835,7 +2835,7 @@ __global__ void BroadcastCUDAKernel( FIXED_DIVISOR_DIV_MOD(Y_dims.data[i], Y_index_val, &Y_index_val, &d); X_index += d * X_strides.data[i]; } -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) Y[Y_index] = __ldg(X + X_index) * alpha; #else Y[Y_index] = X[X_index] * alpha; diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 848a4320a6184e..c905196b9ca3f8 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -113,6 +113,9 @@ function(caffe2_print_configuration_summary) endif() endif() message(STATUS " USE_ROCM : ${USE_ROCM}") + if(${USE_ROCM}) + message(STATUS " ROCM_VERSION : ${ROCM_VERSION}") + endif() message(STATUS " USE_EIGEN_FOR_BLAS : ${CAFFE2_USE_EIGEN_FOR_BLAS}") message(STATUS " USE_FBGEMM : ${USE_FBGEMM}") message(STATUS " USE_FAKELOWP : ${USE_FAKELOWP}") diff --git a/torch/csrc/CudaIPCTypes.cpp b/torch/csrc/CudaIPCTypes.cpp index 9033d445081ea9..a4673f023ffbd4 100644 --- a/torch/csrc/CudaIPCTypes.cpp +++ b/torch/csrc/CudaIPCTypes.cpp @@ -144,7 +144,7 @@ CudaIPCSentData::CudaIPCSentData( counter_ptr_(counter_ptr), original_ptr_(), device_(device) { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) // CUDA have the unofficial limit on the number of recorded blocking interprocess // events, to prevent using of all events, we are switching to StreamSync // before limit reached. @@ -186,7 +186,7 @@ CudaIPCSentData::CudaIPCSentData( CudaIPCSentData::~CudaIPCSentData() { ReturnRefCounter(handle_, offset_); -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) try { if (event_sync_required_) { at::cuda::CUDAGuard device_guard(device_.index()); diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 6dd9f077a5b675..7b0835d917f83d 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -508,7 +508,7 @@ PyObject *THPModule_setBenchmarkCuDNN(PyObject *_unused, PyObject *arg) { THPUtils_assert(PyBool_Check(arg), "set_benchmark_cudnn expects a bool, " "but got %s", THPUtils_typename(arg)); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) if (arg == Py_False) { TORCH_WARN_ONCE("Disabling benchmark mode for MIOpen is NOT supported. Overriding value to True"); arg = Py_True; @@ -918,7 +918,7 @@ PyObject* initModule() { return PyModule_AddObject(module, name, v) == 0; }; -#if defined(USE_CUDNN) || defined(__HIP_PLATFORM_HCC__) +#if defined(USE_CUDNN) || defined(USE_ROCM) PyObject *has_cudnn = Py_True; #else PyObject *has_cudnn = Py_False; diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 2e7592b2550478..92f05da540a7d9 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -1261,7 +1261,7 @@ void GraphTask::stash_current_streams() { caller_current_streams_.resize(num_gpus); if (num_gpus > 0) { for (c10::DeviceIndex idx = 0; idx < num_gpus; idx++) { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) // If the build targets ROCM, stash streams for all visible devices unconditionally, to work around // https://github.com/pytorch/pytorch/issues/59750. // TODO: Remove ROCM-specific behavior when https://github.com/pytorch/pytorch/issues/59750 is fixed. 
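To summarize the convention before the next file: ROCm-only code is now gated on the build-level USE_ROCM define instead of the compiler-injected __HIP_PLATFORM_HCC__, version checks on that path use ROCM_VERSION (which Summary.cmake above now reports and which the compiled-version hook below returns on ROCm), and CUDA_VERSION is only compared after confirming it is defined, as the NVRTC hunks further down do. A minimal host-side sketch of that pattern follows; the function name is hypothetical and not part of the patch.

#include <cstdio>

// Hypothetical helper illustrating the macro convention this patch adopts.
void print_toolkit_version() {
#if defined(USE_ROCM)
  // ROCm build: USE_ROCM and ROCM_VERSION come from the build system.
  std::printf("ROCm %d\n", ROCM_VERSION);
#elif defined(CUDA_VERSION)
  // CUDA build: guard the macro before comparing it, e.g. for NVRTC features.
#if CUDA_VERSION >= 11010
  std::printf("CUDA %d (NVRTC can emit SASS directly)\n", CUDA_VERSION);
#else
  std::printf("CUDA %d\n", CUDA_VERSION);
#endif
#else
  std::printf("CPU-only build\n");
#endif
}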
diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp index ed96209a106aeb..b1ddecc755eb57 100644 --- a/torch/csrc/autograd/profiler_legacy.cpp +++ b/torch/csrc/autograd/profiler_legacy.cpp @@ -302,17 +302,17 @@ std::string getNvtxStr( const std::vector>& shapes) { if (sequence_nr >= -1 || shapes.size() > 0) { std::stringstream s; -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) s << name.str(); #endif if (sequence_nr >= 0) { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) s << ", seq = " << sequence_nr; #else s << name.str() << ", seq = " << sequence_nr; #endif } else if (sequence_nr == -1) { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) s << name.str(); #endif } diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index ce828e9b21c279..3692d3ee9806cf 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -181,7 +181,11 @@ PyObject * THCPModule_setStream_wrap(PyObject *self, PyObject *obj) PyObject * THCPModule_getCompiledVersion(PyObject *self, PyObject *noargs) { +#if defined(USE_ROCM) + return THPUtils_packInt64((int64_t) ROCM_VERSION); +#else return THPUtils_packInt64((int64_t) CUDA_VERSION); +#endif } PyObject * THCPModule_cudaHostAllocator(PyObject *_unused, PyObject *noargs) @@ -632,7 +636,7 @@ namespace shared { void initCudartBindings(PyObject* module); void initNvtxBindings(PyObject* module); -#if defined(USE_CUDNN) || defined(__HIP_PLATFORM_HCC__) +#if defined(USE_CUDNN) || defined(USE_ROCM) void initCudnnBindings(PyObject* module); #endif @@ -644,7 +648,7 @@ void initModule(PyObject *module) { // so this condition might not always be true... shared::initCudartBindings(module); shared::initNvtxBindings(module); -#if defined(USE_CUDNN) || defined(__HIP_PLATFORM_HCC__) +#if defined(USE_CUDNN) || defined(USE_ROCM) shared::initCudnnBindings(module); #endif registerCudaDeviceProperties(module); diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index daae91e06a1346..74a7dba5b6eb5f 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -90,7 +90,7 @@ ncclDataType_t to_nccl_data_type(c10::ScalarType type) { return ncclDataType_t::ncclUint8; case at::kBool: return ncclDataType_t::ncclUint8; -#if defined(__HIP_PLATFORM_HCC__) && TORCH_HIP_VERSION >= 301 +#if defined(USE_ROCM) && TORCH_HIP_VERSION >= 301 case at::kBFloat16: return ncclDataType_t::ncclBfloat16; #endif diff --git a/torch/csrc/cuda/shared/cudart.cpp b/torch/csrc/cuda/shared/cudart.cpp index 9dfa57357f046b..be6010f71abad1 100644 --- a/torch/csrc/cuda/shared/cudart.cpp +++ b/torch/csrc/cuda/shared/cudart.cpp @@ -1,7 +1,7 @@ #include #include #include -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #include #else #include @@ -18,7 +18,7 @@ void initCudartBindings(PyObject* module) { // By splitting the names of these objects into two literals we prevent the // HIP rewrite rules from changing these names when building with HIP. 
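The comment closing the previous chunk explains the two-literal trick used throughout initCudartBindings: hipify rewrites whole identifiers such as cudaMemGetInfo textually, so the "cuda" prefix is kept in a separate string literal and the pieces are only joined by compile-time concatenation, leaving the Python-visible names unchanged on ROCm builds. A standalone host-only illustration of that concatenation, not taken from the patch:

#include <iostream>
#include <string>

int main() {
  // Adjacent string literals are concatenated at compile time, but the token
  // "cudaMemGetInfo" never appears in the source, so hipify's textual rewrite
  // rules have nothing to match.
  const std::string name = "cuda" "MemGetInfo";
  std::cout << name << '\n';  // prints: cudaMemGetInfo
  return 0;
}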
-#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) py::enum_(cudart, "cuda" "OutputMode") .value("KeyValuePair", cudaKeyValuePair) .value("CSV", cudaCSV); @@ -42,7 +42,7 @@ void initCudartBindings(PyObject* module) { cudart.def("cuda" "StreamDestroy", [](uintptr_t ptr) -> cudaError_t { return cudaStreamDestroy((cudaStream_t)ptr); }); -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) cudart.def("cuda" "ProfilerInitialize", cudaProfilerInitialize); #endif cudart.def("cuda" "MemGetInfo", [](int device) -> std::pair { diff --git a/torch/csrc/cuda/shared/cudnn.cpp b/torch/csrc/cuda/shared/cudnn.cpp index d5342a30d6838e..594a4a19c0c7c2 100644 --- a/torch/csrc/cuda/shared/cudnn.cpp +++ b/torch/csrc/cuda/shared/cudnn.cpp @@ -1,6 +1,6 @@ // The clang-tidy job seems to complain that it can't find cudnn.h without this. // This file should only be compiled if this condition holds, so it should be safe. -#if defined(USE_CUDNN) || defined(__HIP_PLATFORM_HCC__) +#if defined(USE_CUDNN) || defined(USE_ROCM) #include #include @@ -40,7 +40,7 @@ size_t getVersionInt() { } } -#elif defined(__HIP_PLATFORM_HCC__) +#elif defined(USE_ROCM) #include #include diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index c5050170dd7e72..5995612f600844 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -61,7 +61,7 @@ const inline char* getNcclErrorDetailStr(ncclResult_t error, c10::optional 9) || \ ((NCCL_MINOR == 9) && defined(NCCL_PATCH) && (NCCL_PATCH >= 7))))) || \ - (defined(__HIP_PLATFORM_HCC__) && (TORCH_HIP_VERSION >= 301)) + (defined(USE_ROCM) && (TORCH_HIP_VERSION >= 301)) #define ENABLE_NCCL_BF16_DATATYPE #endif diff --git a/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp b/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp index c244513fb85caf..28d7e18e4639bf 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp @@ -1,7 +1,7 @@ #include #include -#if defined(USE_TENSORPIPE) && !defined(__HIP_PLATFORM_HCC__) +#if defined(USE_TENSORPIPE) && !defined(USE_ROCM) #include #include diff --git a/torch/csrc/generic/StorageSharing.cpp b/torch/csrc/generic/StorageSharing.cpp index 30511534253d88..c168a8644e7557 100644 --- a/torch/csrc/generic/StorageSharing.cpp +++ b/torch/csrc/generic/StorageSharing.cpp @@ -263,7 +263,7 @@ static PyObject * THPStorage_(shareCuda)(PyObject *_self, PyObject *noargs) // NOLINTNEXTLINE(cppcoreguidelines-init-variables) cudaIpcEventHandle_t ipc_event_handle; -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) if (sent_data->event_sync_required_) { THCudaCheck(cudaIpcGetEventHandle(&ipc_event_handle, sent_data->event_)); } @@ -381,7 +381,7 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args) int64_t device = THPUtils_unpackLong(_device); at::cuda::CUDAGuard device_guard(device); -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) if (PyObject_IsTrue(_event_sync_required)) { // Ensure that producer prepared all tensor's data std::string s_ipc_event_handle = diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp index 00d7ce0fe53510..661c850c644100 100644 --- a/torch/csrc/jit/codegen/cuda/codegen.cpp +++ b/torch/csrc/jit/codegen/cuda/codegen.cpp @@ -128,7 +128,7 @@ class CudaKernelGenerator : private kir::IrVisitor { // Shared memory if (has_dynamic_smem || has_reductions || has_parallel_welford) { indent() << "alignas(" -#ifndef __HIP_PLATFORM_HCC__ +#if 
!defined(USE_ROCM) << dataTypeSize(kernel_summary.largest_smem_data_type) #else << 8 // for HIP, we want 8-aligned even for smaller datatypes diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index 5e671afb3c5e19..595b7a3ff926ff 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -59,7 +59,7 @@ typedef unsigned long long int uint64_t; std::string FusionExecutor::getStructuredCode(const std::string& kernel) { // generating cuda code; std::string code = ""; -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) #if ROCM_VERSION < 40200 code += std::string("#include \n") + std::string("#include \n"); diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp index 7efe7cb7bf253c..fe58896b55f90e 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp @@ -40,7 +40,7 @@ namespace executor_utils { std::string kernelPreamble() { std::stringstream ss; -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) ss << nvfuser_resources::fp16_support_cu; #else ss << R"( @@ -665,13 +665,13 @@ NvrtcFunction nvrtcCompile( at::globalContext().getNVRTC().nvrtcDestroyProgram(&program)); }); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) std::vector args = {"--std=c++14"}; #if ROCM_VERSION >= 40200 args.push_back("-hip-pch"); #endif #else -#if CUDA_VERSION < 11010 +#if defined(CUDA_VERSION) && CUDA_VERSION < 11010 // compile to sass is not allowed prior to CUDA 11.1 compile_to_sass = false; #endif @@ -701,7 +701,7 @@ NvrtcFunction nvrtcCompile( const char* disable_fma = getenv("PYTORCH_NVFUSER_DISABLE_FMA"); // int disable_fma_flag = disable_fma ? atoi(disable_fma) : 0; if (disable_fma && atoi(disable_fma)) { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) TORCH_WARN_ONCE( "PYTORCH_CUDA_FUSER_DISABLE_FMA is not supported on ROCm, ignoring"); #else @@ -843,7 +843,7 @@ NvrtcFunction nvrtcCompile( { FUSER_PERF_SCOPE("executor_utils::Nvrtc::GetPTX"); -#if CUDA_VERSION >= 11010 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010 // compile_to_sass determines whether we are generating SASS or PTX, hence // the different API. const auto getSize = compile_to_sass @@ -865,7 +865,7 @@ NvrtcFunction nvrtcCompile( // TODO: We do go through different code path, should investigate whether this // has an impact on generated binary. 
-#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) const char* prefix_env = getenv("PYTORCH_NVFUSER_CUBIN"); if (prefix_env) { FUSER_PERF_SCOPE("executor_utils::Nvrtc::LoadCUBIN"); diff --git a/torch/csrc/jit/codegen/fuser/codegen.cpp b/torch/csrc/jit/codegen/fuser/codegen.cpp index 9df86e83db8e4b..3c02386b065b80 100644 --- a/torch/csrc/jit/codegen/fuser/codegen.cpp +++ b/torch/csrc/jit/codegen/fuser/codegen.cpp @@ -661,7 +661,7 @@ std::string generateKernel( // HIP headers must be included until precompiled header feature is available // clang-format off -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) #if ROCM_VERSION < 40200 if (use_cuda && has_half_tensor) { env.s("RuntimeHeader", R"( diff --git a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp index ad90a28117cca8..b32ef68375fe3e 100644 --- a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp +++ b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp @@ -118,14 +118,14 @@ FusedKernelCUDA::FusedKernelCUDA( AT_CUDA_NVRTC_CHECK(nvrtc().nvrtcCreateProgram( &program, code_.c_str(), nullptr, 0, nullptr, nullptr)); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) std::vector args = {"--std=c++14"}; #if ROCM_VERSION >= 40200 args.push_back("-hip-pch"); #endif #else const std::string compute = std::string("--gpu-architecture=") + -#if CUDA_VERSION >= 11010 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010 // CUDA 11.1 allows going directly to SASS (sm_) instead of PTX (compute_) // which gives better backwards compatibility to work on older driver, // (since older driver doesn't necessrily recognize PTX emitted by new @@ -160,7 +160,7 @@ FusedKernelCUDA::FusedKernelCUDA( AT_CUDA_NVRTC_CHECK(result); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) size_t ptx_size; -#if CUDA_VERSION >= 11010 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010 // compile_to_sass determines whether we are generating SASS or PTX, hence // the different API. const auto getSize = compile_to_sass @@ -182,7 +182,7 @@ FusedKernelCUDA::FusedKernelCUDA( nvrtc().cuModuleGetFunction(&function_, module_, name_.c_str())); // Computes max blocks -#if defined(__HIP_PLATFORM_HCC__) && TORCH_HIP_VERSION < 305 +#if defined(USE_ROCM) && ROCM_VERSION < 30500 // HIP function signature is not compatible yet uint32_t max_blocks; AT_CUDA_DRIVER_CHECK(nvrtc().hipOccupancyMaxActiveBlocksPerMultiprocessor( diff --git a/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h b/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h index 38a4e06372e32a..8a89c6952b92ea 100644 --- a/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h +++ b/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h @@ -13,7 +13,7 @@ tensor as input. Correct code for this case is generated, however, nvrtc does not know how to handle int*_t integer types, so typedefs help it handle those cases*/ -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) static auto type_declarations_template = CodeTemplate(R"( ${RuntimeHeader} ${HalfHeader} @@ -212,7 +212,7 @@ void ${kernelName}(IndexType totalElements, ${formals} ${RandParam}) { // with __half2float(). All mathematical operations are done on float // values, and if needed the intermediate float representation is // converted to half with __float2half() when writing to a half tensor. 
-#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) constexpr auto half_support_literal = R"( typedef __half half; @@ -262,7 +262,7 @@ typedef __half half; )"; #endif -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) constexpr auto bfloat16_support_literal = R"( #ifndef __align__ diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp index e62ef93b573794..b9f9833a4088e2 100644 --- a/torch/csrc/jit/ir/ir.cpp +++ b/torch/csrc/jit/ir/ir.cpp @@ -1162,7 +1162,7 @@ bool Node::hasSideEffects() const { case prim::rpc_sync: // It represents RPC message sent. case prim::rpc_remote: // It represents RPC message sent. case aten::wait: // It can represent RPC message received. -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) case cuda::set_stream: case cuda::_set_device: case cuda::_current_device: diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h index 99f6a6ce5c57b8..85ffb377605869 100644 --- a/torch/csrc/jit/ir/ir.h +++ b/torch/csrc/jit/ir/ir.h @@ -76,7 +76,7 @@ namespace aten { using namespace ::c10::aten; } namespace cuda { -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) using namespace ::c10::cuda; #endif } // namespace cuda diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index c4120d1254331d..fb80d735d300b6 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -139,7 +139,7 @@ bool loadPythonClasses() { } } // anonymous namespace -#if !defined(__HIP_PLATFORM_HCC__) +#if !defined(USE_ROCM) TORCH_API void runJITCPPTests(); #endif @@ -548,7 +548,7 @@ void initJITBindings(PyObject* module) { [](const std::shared_ptr& graph) { CreateAutodiffSubgraphs(graph); }) -#if defined(BUILDING_TESTS) && !defined(__HIP_PLATFORM_HCC__) +#if defined(BUILDING_TESTS) && !defined(USE_ROCM) .def( "_jit_run_cpp_tests", []() { diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index 130c6d2b264dda..f07b5750492494 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -217,7 +217,7 @@ std::shared_ptr PythonModuleValue::attr( return toSugaredValue(member, m, loc, /*is_constant=*/true); } -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) std::shared_ptr CUDAPythonModuleValue::attr( const SourceRange& loc, Function& m, diff --git a/torch/csrc/jit/python/python_sugared_value.h b/torch/csrc/jit/python/python_sugared_value.h index 8f339f2024d952..6f1b90d603a63b 100644 --- a/torch/csrc/jit/python/python_sugared_value.h +++ b/torch/csrc/jit/python/python_sugared_value.h @@ -96,7 +96,7 @@ struct VISIBILITY_HIDDEN PythonModuleValue : public PythonValue { // Used for desugaring uses of the torch.cuda module. All the CUDA APIs with // torch.cuda.* are resolved using CUDAPythonModuleValue. -#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) struct VISIBILITY_HIDDEN CUDAPythonModuleValue : public PythonValue { explicit CUDAPythonModuleValue(py::object mod) : PythonValue(std::move(mod)) {} diff --git a/torch/csrc/jit/runtime/register_cuda_ops.cpp b/torch/csrc/jit/runtime/register_cuda_ops.cpp index 599fd5398c3707..bd74c2d5e646ac 100644 --- a/torch/csrc/jit/runtime/register_cuda_ops.cpp +++ b/torch/csrc/jit/runtime/register_cuda_ops.cpp @@ -1,6 +1,6 @@ // This file registers special JIT operators used to implement the PyTorch CUDA // API in TorchScript. 
-#ifndef __HIP_PLATFORM_HCC__ +#if !defined(USE_ROCM) #include #include #include diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp index cd2e0fdedb880f..0dccee06f2451c 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp @@ -925,7 +925,7 @@ void CudaCodeGen::Initialize() { HalfChecker halfChecker(buffer_args()); stmt_v->accept(&halfChecker); -#if __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) #if ROCM_VERSION < 40200 os() << "#include " << std::endl; if (halfChecker.hasHalf()) { @@ -948,7 +948,7 @@ void CudaCodeGen::Initialize() { std::string func_name = GetUniqueFuncName(kernel_func_name()); os() << "extern \"C\" __global__" << std::endl; -#ifdef USE_ROCM +#if defined(USE_ROCM) // CUDA has a default limit of threads per block (=flat work group size) // of 1024, but ROCm uses 256 by default. At the time of writing // (#45506), I am unaware of a stricter limit that TensorExpr imposes @@ -1218,14 +1218,14 @@ void CudaCodeGen::CompileToNVRTC( AT_CUDA_NVRTC_CHECK(nvrtc().nvrtcCreateProgram( &program, code.c_str(), nullptr, 0, nullptr, nullptr)); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(USE_ROCM) std::vector args = {"--std=c++14"}; #if ROCM_VERSION >= 40200 args.push_back("-hip-pch"); #endif #else const std::string compute = std::string("--gpu-architecture=") + -#if CUDA_VERSION >= 11010 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010 // CUDA 11.1 allows going directly to SASS (sm_) instead of PTX (compute_) // which gives better backwards compatibility to work on older driver, // (since older driver doesn't necessrily recognize PTX emitted by new @@ -1264,7 +1264,7 @@ void CudaCodeGen::CompileToNVRTC( size_t ptx_size; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector ptx; -#if CUDA_VERSION >= 11010 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010 // compile_to_sass determines whether we are generating SASS or PTX, hence // the different API. auto getSize = compile_to_sass diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index bb0a85982c665c..8dc83ec6020788 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -198,6 +198,7 @@ def _join_rocm_home(*paths) -> str: COMMON_HIP_FLAGS = [ '-fPIC', '-D__HIP_PLATFORM_HCC__=1', + '-DUSE_ROCM=1', ] COMMON_HIPCC_FLAGS = [
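The patch ends with torch/utils/cpp_extension.py appending -DUSE_ROCM=1 to COMMON_HIP_FLAGS, so out-of-tree extensions built through torch.utils.cpp_extension see the same define as the core sources changed above and no longer need the deprecated __HIP_PLATFORM_HCC__ spelling. A hedged sketch of what that enables in extension device code, mirroring the getLaneId() pattern from caffe2/utils/GpuDefs.cuh earlier in this diff; the helper itself is illustrative, not part of the patch.

// Device helper an extension might ship; it now compiles for both backends
// because hipcc invocations from cpp_extension define USE_ROCM as well.
__device__ __forceinline__ int getLaneId() {
#if defined(USE_ROCM)
  return __lane_id();                          // HIP device intrinsic
#else
  int laneId;
  asm("mov.s32 %0, %%laneid;" : "=r"(laneId)); // inline PTX on CUDA
  return laneId;
#endif
}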