diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 4cb8a69f93de0..7c71673e36f29 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -100,6 +100,8 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
 
     if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
       list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2")
+    endif()
+    if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0)
       list(REMOVE_ITEM GPU_FLAGS
           "-D__CUDA_NO_HALF_OPERATORS__"
           "-D__CUDA_NO_HALF_CONVERSIONS__"
diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu
index ea30fa2747838..e56b4d2204005 100644
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@@ -59,6 +59,8 @@ __global__ void rms_norm_kernel(
 template<typename torch_type>
 struct _typeConvert { static constexpr bool exists = false; };
 
+#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000))
+// CUDA < 12.0 runs into issues with packed type conversion
 template<>
 struct _typeConvert<c10::Half> {
   static constexpr bool exists = true;
@@ -85,8 +87,8 @@ struct _typeConvert<c10::BFloat16> {
   __device__ static inline hip_type convert(float x) { return __float2bfloat16(x); }
   __device__ static inline packed_hip_type convert(float2 x) { return __float22bfloat162_rn(x); }
 };
-#endif
-
+#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000))
 /* Vector POD struct to generate vectorized and packed FP16/BF16 ops
    for appropriate specializations of fused_add_rms_norm_kernel.
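
Note on why the guard is safe: the primary template's `exists = false` means that compiling the specializations out (as the new `#if defined(USE_ROCM) || ... CUDA_VERSION >= 12000` block does on CUDA < 12.0) should quietly route those types to an unvectorized path rather than breaking the build. Below is a minimal host-side sketch of that compile-time dispatch, not the real kernel code; `FakeHalf`, `FakeFloat`, and the free function `fused_add_rms_norm` are hypothetical stand-ins, while `_typeConvert` and `exists` are taken from the diff above.

    // Sketch of the exists-flag dispatch pattern (hypothetical stand-ins,
    // compile with -std=c++17). Only _typeConvert / exists come from the patch.
    #include <cstdio>

    struct FakeHalf {};   // stand-in for c10::Half (hypothetical)
    struct FakeFloat {};  // stand-in for a type with no specialization

    // Primary template: no conversion support, so exists == false.
    template <typename torch_type>
    struct _typeConvert { static constexpr bool exists = false; };

    // In the patch, this specialization only exists under ROCm or CUDA >= 12.0;
    // on older toolchains the compiler never sees it.
    template <>
    struct _typeConvert<FakeHalf> { static constexpr bool exists = true; };

    template <typename T>
    void fused_add_rms_norm() {          // hypothetical stand-in for the kernel
      if constexpr (_typeConvert<T>::exists) {
        std::puts("packed/vectorized path");   // __half2-style packed ops
      } else {
        std::puts("scalar fallback path");     // safe on CUDA < 12.0
      }
    }

    int main() {
      fused_add_rms_norm<FakeHalf>();   // prints: packed/vectorized path
      fused_add_rms_norm<FakeFloat>();  // prints: scalar fallback path
    }

Guarding the specializations rather than every call site keeps the fallback automatic: when the `#if` compiles them out, `exists == false` is all the compiler sees, so the scalar path is selected with no further `#if` churn in the kernels.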