From 6f513d95c0e708cf086b7910f6c8c42841befa60 Mon Sep 17 00:00:00 2001
From: Taiki Endo
Date: Fri, 21 Oct 2022 00:18:18 +0900
Subject: [PATCH] Optimize atomic float on NVPTX

---
 build.rs         |   2 +-
 src/imp/mod.rs   |  10 ++
 src/imp/nvptx.rs | 402 +++++++++++++++++++++++++++++++++++++++++++++++
 src/lib.rs       |   3 +-
 tools/build.sh   |   2 +
 5 files changed, 417 insertions(+), 2 deletions(-)
 create mode 100644 src/imp/nvptx.rs

diff --git a/build.rs b/build.rs
index 8226d1983..6b3a36199 100644
--- a/build.rs
+++ b/build.rs
@@ -154,7 +154,7 @@ fn main() {
         println!("cargo:rustc-cfg=portable_atomic_llvm15");
     }
     if !no_asm
-        && (target_arch == "powerpc64" || target_arch == "s390x")
+        && (target_arch == "powerpc64" || target_arch == "s390x" || target_arch == "nvptx64")
         && is_allowed_feature("asm_experimental_arch")
     {
         println!("cargo:rustc-cfg=portable_atomic_asm_experimental_arch");
diff --git a/src/imp/mod.rs b/src/imp/mod.rs
index f48d07579..11f042621 100644
--- a/src/imp/mod.rs
+++ b/src/imp/mod.rs
@@ -69,6 +69,11 @@ mod s390x;
 #[cfg(target_arch = "msp430")]
 pub(crate) mod msp430;
 
+#[cfg(portable_atomic_asm_experimental_arch)]
+#[cfg(feature = "float")]
+#[cfg(target_arch = "nvptx64")]
+pub(crate) mod nvptx;
+
 #[cfg_attr(portable_atomic_no_cfg_target_has_atomic, cfg(any(test, portable_atomic_no_atomic_cas)))]
 #[cfg_attr(
     not(portable_atomic_no_cfg_target_has_atomic),
@@ -147,8 +152,13 @@ mod interrupt;
 
 // Atomic float implementations
 #[cfg(feature = "float")]
+#[cfg(not(all(target_arch = "nvptx64", portable_atomic_asm_experimental_arch)))]
 pub(crate) mod float;
 
+#[cfg(feature = "float")]
+#[cfg(all(target_arch = "nvptx64", portable_atomic_asm_experimental_arch))]
+pub(crate) use nvptx as float;
+
 // -----------------------------------------------------------------------------
 // Atomic{Isize,Usize,Bool,Ptr}, Atomic{I,U}{8,16}
 
diff --git a/src/imp/nvptx.rs b/src/imp/nvptx.rs
new file mode 100644
index 000000000..801fa61a0
--- /dev/null
+++ b/src/imp/nvptx.rs
@@ -0,0 +1,402 @@
+// Atomic float implementation on NVPTX.
+//
+// Refs:
+// - https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld
+// - https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
+// - https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
+// - User Guide for NVPTX Back-end (LLVM documentation) https://llvm.org/docs/NVPTXUsage.html
+// - https://github.com/NVIDIA/libcudacxx/blob/1.9.0-rc1/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h
+
+// TODO: handle pre-sm_70
+
+use core::{arch::asm, sync::atomic::Ordering};
+
+// NVPTX's seqcst atomic op is preceding seqcst fence + acquire op.
+// (`fence` takes a scope such as `.gpu`; `.gl` is only valid as a `membar` level.)
+macro_rules! fence_sc {
+    () => {
+        "fence.sc.gpu;"
+    };
+}
+
+// Expands `$op!(sem, fence)` for the given ordering, where `sem` is the PTX `.sem`
+// qualifier of the instruction and `fence` is the fence to emit in front of it.
+macro_rules! atomic_rmw {
+    ($op:ident, $order:ident) => {
+        match $order {
+            Ordering::Relaxed => $op!("relaxed", ""),
+            Ordering::Acquire => $op!("acquire", ""),
+            Ordering::Release => $op!("release", ""),
+            Ordering::AcqRel => $op!("acq_rel", ""),
+            Ordering::SeqCst => $op!("acquire", fence_sc!()),
+            _ => unreachable!("{:?}", $order),
+        }
+    };
+}
+
+// Defines `$atomic_type`, a lock-free atomic `$float_type`. `$bits_type` is the PTX
+// bit-size type (`b32`/`b64`) used by bitwise `atom` ops; `$val_reg` holds the value.
+macro_rules! atomic_float {
+    (
+        $atomic_type:ident, $float_type:ident, $atomic_int_type:ident, $int_type:ident,
+        $bits_type:ident, $val_reg:ident, $align:expr
+    ) => {
+        #[repr(C, align($align))]
+        pub(crate) struct $atomic_type {
+            v: core::cell::UnsafeCell<$float_type>,
+        }
+
+        // Send is implicitly implemented.
+        // SAFETY: any data races are prevented by atomic operations.
+        unsafe impl Sync for $atomic_type {}
+
+        impl $atomic_type {
+            #[inline]
+            pub(crate) const fn new(v: $float_type) -> Self {
+                Self { v: core::cell::UnsafeCell::new(v) }
+            }
+
+            #[inline]
+            pub(crate) fn is_lock_free() -> bool {
+                true
+            }
+            #[inline]
+            pub(crate) const fn is_always_lock_free() -> bool {
+                true
+            }
+
+            #[inline]
+            pub(crate) fn get_mut(&mut self) -> &mut $float_type {
+                // SAFETY: the mutable reference guarantees unique ownership.
+                // (UnsafeCell::get_mut requires Rust 1.50)
+                unsafe { &mut *self.v.get() }
+            }
+
+            #[inline]
+            pub(crate) fn into_inner(self) -> $float_type {
+                self.v.into_inner()
+            }
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn load(&self, order: Ordering) -> $float_type {
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe { $float_type::atomic_load(self.v.get(), order) }
+            }
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn store(&self, val: $float_type, order: Ordering) {
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe { $float_type::atomic_store(self.v.get(), val, order) }
+            }
+
+            #[inline]
+            pub(crate) fn swap(&self, val: $float_type, order: Ordering) -> $float_type {
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe { $float_type::atomic_swap(self.v.get(), val, order) }
+            }
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn compare_exchange(
+                &self,
+                current: $float_type,
+                new: $float_type,
+                success: Ordering,
+                failure: Ordering,
+            ) -> Result<$float_type, $float_type> {
+                let order = crate::utils::upgrade_success_ordering(success, failure);
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                let res = unsafe {
+                    $float_type::atomic_compare_exchange(self.v.get(), current, new, order)
+                };
+                if res.to_bits() == current.to_bits() {
+                    Ok(res)
+                } else {
+                    Err(res)
+                }
+            }
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn compare_exchange_weak(
+                &self,
+                current: $float_type,
+                new: $float_type,
+                success: Ordering,
+                failure: Ordering,
+            ) -> Result<$float_type, $float_type> {
+                self.compare_exchange(current, new, success, failure)
+            }
+
+            #[inline]
+            pub(crate) fn fetch_add(&self, val: $float_type, order: Ordering) -> $float_type {
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe { $float_type::atomic_add(self.v.get(), val, order) }
+            }
+
+            #[inline]
+            pub(crate) fn fetch_sub(&self, val: $float_type, order: Ordering) -> $float_type {
+                // There is no atom.sub, so add `-val`.
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe { $float_type::atomic_add(self.v.get(), -val, order) }
+            }
+
+            #[inline]
+            pub(crate) fn fetch_max(&self, val: $float_type, order: Ordering) -> $float_type {
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe { $float_type::atomic_max(self.v.get(), val, order) }
+            }
+
+            #[inline]
+            pub(crate) fn fetch_min(&self, val: $float_type, order: Ordering) -> $float_type {
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe { $float_type::atomic_min(self.v.get(), val, order) }
+            }
+
+            #[inline]
+            pub(crate) fn fetch_abs(&self, order: Ordering) -> $float_type {
+                const ABS_MASK: $int_type = !0 / 2;
+                // TODO: use $float_type::atomic_and
+                $float_type::from_bits(self.as_bits().fetch_and(ABS_MASK, order))
+            }
+
+            #[inline]
+            pub(crate) fn as_bits(&self) -> &crate::$atomic_int_type {
+                // SAFETY: $atomic_type and $atomic_int_type have the same layout,
+                // and there is no concurrent access to the value that does not go through this method.
+                unsafe { &*(self as *const $atomic_type as *const crate::$atomic_int_type) }
+            }
+        }
+
+        // NOTE: exch/cas/and/or/xor are bit-size operations in PTX and only accept `.b32`/`.b64`.
+        impl AtomicOperations for $float_type {
+            unsafe fn atomic_load(src: *mut Self, order: Ordering) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_load`.
+                unsafe {
+                    macro_rules! atomic_load {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("ld.", $sem, ".gpu.", stringify!($float_type), " {out}, [{src}];"),
+                                src = in(reg64) src,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    match order {
+                        Ordering::Relaxed => atomic_load!("relaxed", ""),
+                        Ordering::Acquire => atomic_load!("acquire", ""),
+                        Ordering::SeqCst => atomic_load!("acquire", fence_sc!()),
+                        _ => unreachable!("{:?}", order),
+                    }
+                }
+                out
+            }
+            unsafe fn atomic_store(dst: *mut Self, val: Self, order: Ordering) {
+                // SAFETY: the caller must uphold the safety contract for `atomic_store`.
+                unsafe {
+                    macro_rules! atomic_store {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("st.", $sem, ".gpu.", stringify!($float_type), " [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                            )
+                        };
+                    }
+                    match order {
+                        Ordering::Relaxed => atomic_store!("relaxed", ""),
+                        Ordering::Release => atomic_store!("release", ""),
+                        Ordering::SeqCst => atomic_store!("relaxed", fence_sc!()),
+                        _ => unreachable!("{:?}", order),
+                    }
+                }
+            }
+            unsafe fn atomic_swap(dst: *mut Self, val: Self, order: Ordering) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_swap`.
+                unsafe {
+                    macro_rules! swap {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.exch.", stringify!($bits_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    atomic_rmw!(swap, order);
+                }
+                out
+            }
+            unsafe fn atomic_compare_exchange(
+                dst: *mut Self,
+                old: Self,
+                new: Self,
+                order: Ordering,
+            ) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_compare_exchange`.
+                unsafe {
+                    macro_rules! cmpxchg {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.cas.", stringify!($bits_type), " {out}, [{dst}], {old}, {new};"),
+                                dst = in(reg64) dst,
+                                old = in($val_reg) old,
+                                new = in($val_reg) new,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    atomic_rmw!(cmpxchg, order);
+                }
+                out
+            }
+            unsafe fn atomic_and(dst: *mut Self, val: Self, order: Ordering) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_and`.
+                unsafe {
+                    macro_rules! and {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.and.", stringify!($bits_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    atomic_rmw!(and, order);
+                }
+                out
+            }
+            unsafe fn atomic_or(dst: *mut Self, val: Self, order: Ordering) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_or`.
+                unsafe {
+                    macro_rules! or {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.or.", stringify!($bits_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    atomic_rmw!(or, order);
+                }
+                out
+            }
+            unsafe fn atomic_xor(dst: *mut Self, val: Self, order: Ordering) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_xor`.
+                unsafe {
+                    macro_rules! xor {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.xor.", stringify!($bits_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    atomic_rmw!(xor, order);
+                }
+                out
+            }
+            unsafe fn atomic_add(dst: *mut Self, val: Self, order: Ordering) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_add`.
+                unsafe {
+                    macro_rules! add {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.add.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    atomic_rmw!(add, order);
+                }
+                out
+            }
+            unsafe fn atomic_min(dst: *mut Self, val: Self, order: Ordering) -> Self {
+                // PTX has no scalar floating-point `atom.min`, so emulate it with a CAS loop.
+                // SAFETY: the caller must uphold the safety contract for `atomic_min`.
+                unsafe {
+                    let mut prev = Self::atomic_load(dst, Ordering::Relaxed);
+                    loop {
+                        let next = prev.min(val);
+                        let found = Self::atomic_compare_exchange(dst, prev, next, order);
+                        if found.to_bits() == prev.to_bits() {
+                            return found;
+                        }
+                        prev = found;
+                    }
+                }
+            }
+            unsafe fn atomic_max(dst: *mut Self, val: Self, order: Ordering) -> Self {
+                // PTX has no scalar floating-point `atom.max`, so emulate it with a CAS loop.
+                // SAFETY: the caller must uphold the safety contract for `atomic_max`.
+                unsafe {
+                    let mut prev = Self::atomic_load(dst, Ordering::Relaxed);
+                    loop {
+                        let next = prev.max(val);
+                        let found = Self::atomic_compare_exchange(dst, prev, next, order);
+                        if found.to_bits() == prev.to_bits() {
+                            return found;
+                        }
+                        prev = found;
+                    }
+                }
+            }
+        }
+    };
+}
+
+// Low-level atomic operations on a raw pointer, implemented with PTX inline assembly.
+// `atomic_compare_exchange` returns the value found at `dst`; the caller checks success.
+trait AtomicOperations: Sized {
+    unsafe fn atomic_load(src: *mut Self, order: Ordering) -> Self;
+    unsafe fn atomic_store(dst: *mut Self, val: Self, order: Ordering);
+    unsafe fn atomic_swap(dst: *mut Self, val: Self, order: Ordering) -> Self;
+    unsafe fn atomic_compare_exchange(
+        dst: *mut Self,
+        old: Self,
+        new: Self,
+        order: Ordering,
+    ) -> Self;
+    unsafe fn atomic_add(dst: *mut Self, val: Self, order: Ordering) -> Self;
+    unsafe fn atomic_and(dst: *mut Self, val: Self, order: Ordering) -> Self;
+    unsafe fn atomic_or(dst: *mut Self, val: Self, order: Ordering) -> Self;
+    unsafe fn atomic_xor(dst: *mut Self, val: Self, order: Ordering) -> Self;
+    unsafe fn atomic_min(dst: *mut Self, val: Self, order: Ordering) -> Self;
+    unsafe fn atomic_max(dst: *mut Self, val: Self, order: Ordering) -> Self;
+}
+
+atomic_float!(AtomicF32, f32, AtomicU32, u32, b32, reg32, 4);
+atomic_float!(AtomicF64, f64, AtomicU64, u64, b64, reg64, 8);
diff --git a/src/lib.rs b/src/lib.rs
index ae7fbf340..3e7eca889 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -186,7 +186,7 @@ See [this list](https://github.com/taiki-e/portable-atomic/issues/10#issuecommen
 )]
 // asm_experimental_arch
 // AVR and MSP430 are tier 3 platforms and require nightly anyway.
-// On tier 2 platforms (powerpc64 and s390x), we use cfg set by build script to
+// On tier 2 platforms (powerpc64, s390x, nvptx64), we use cfg set by build script to
 // determine whether this feature is available or not.
 #![cfg_attr(
     all(
@@ -204,6 +204,7 @@ See [this list](https://github.com/taiki-e/portable-atomic/issues/10#issuecommen
                 )
             ),
             all(portable_atomic_asm_experimental_arch, target_arch = "s390x"),
+            all(portable_atomic_asm_experimental_arch, target_arch = "nvptx64"),
         ),
     ),
     feature(asm_experimental_arch)
diff --git a/tools/build.sh b/tools/build.sh
index 930f04c0f..0cd4c4217 100755
--- a/tools/build.sh
+++ b/tools/build.sh
@@ -62,6 +62,8 @@ default_targets=(
     # riscv32 with atomic
     riscv32imac-unknown-none-elf
     riscv32imc-esp-espidf
+    # nvptx64
+    nvptx64-nvidia-cuda
     # other tier 1 targets
     i686-pc-windows-gnu