From 59a6abf3c99ee4fed5312d357f6ecbf857f24433 Mon Sep 17 00:00:00 2001 From: Matt Wong <156021403+mawong-amd@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:31:02 -0700 Subject: [PATCH] [Hotfix][CI/Build][Kernel] CUDA 11.8 does not support layernorm optimizations (#3782) --- cmake/utils.cmake | 2 ++ csrc/layernorm_kernels.cu | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 4cb8a69f93de0..7c71673e36f29 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -100,6 +100,8 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG) if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2") + endif() + if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0) list(REMOVE_ITEM GPU_FLAGS "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__" diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index ea30fa2747838..e56b4d2204005 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -59,6 +59,8 @@ __global__ void rms_norm_kernel( template struct _typeConvert { static constexpr bool exists = false; }; +#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)) +// CUDA < 12.0 runs into issues with packed type conversion template<> struct _typeConvert { static constexpr bool exists = true; @@ -85,8 +87,8 @@ struct _typeConvert { __device__ static inline hip_type convert(float x) { return __float2bfloat16(x); } __device__ static inline packed_hip_type convert(float2 x) { return __float22bfloat162_rn(x); } }; -#endif - +#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)) /* Vector POD struct to generate vectorized and packed FP16/BF16 ops for appropriate specializations of fused_add_rms_norm_kernel.