From 1daf33537a59d35b3963fae3a7a4d7002ed18316 Mon Sep 17 00:00:00 2001
From: Taiki Endo
Date: Mon, 13 Jan 2025 00:16:35 +0900
Subject: [PATCH] Optimize atomic float on NVPTX

---
 build.rs               |  39 ++++-
 src/imp/float/int.rs   |  17 +-
 src/imp/float/mod.rs   |  27 +++
 src/imp/float/nvptx.rs | 380 +++++++++++++++++++++++++++++++++++++++++
 src/lib.rs             |   3 +-
 tools/build.sh         |   8 +
 6 files changed, 469 insertions(+), 5 deletions(-)
 create mode 100644 src/imp/float/nvptx.rs

diff --git a/build.rs b/build.rs
index 4a0b8500..9b319bdf 100644
--- a/build.rs
+++ b/build.rs
@@ -47,7 +47,7 @@ fn main() {
 
     if version.minor >= 80 {
         println!(
-            r#"cargo:rustc-check-cfg=cfg(target_feature,values("experimental-zacas","fast-serialization","load-store-on-cond","distinct-ops","miscellaneous-extensions-3"))"#
+            r#"cargo:rustc-check-cfg=cfg(target_feature,values("experimental-zacas","sm_70","fast-serialization","load-store-on-cond","distinct-ops","miscellaneous-extensions-3"))"#
         );
 
         // Custom cfgs set by build script. Not public API.
@@ -58,7 +58,7 @@ fn main() {
         // TODO: handle multi-line target_feature_fallback
        // grep -F 'target_feature_fallback("' build.rs | grep -Ev '^ *//' | sed -E 's/^.*target_feature_fallback\(//; s/",.*$/"/' | LC_ALL=C sort -u | tr '\n' ',' | sed -E 's/,$/\n/'
         println!(
-            r#"cargo:rustc-check-cfg=cfg(portable_atomic_target_feature,values("cmpxchg16b","distinct-ops","experimental-zacas","fast-serialization","load-store-on-cond","lse","lse128","lse2","mclass","miscellaneous-extensions-3","quadword-atomics","rcpc3","v6","zaamo","zabha"))"#
+            r#"cargo:rustc-check-cfg=cfg(portable_atomic_target_feature,values("cmpxchg16b","distinct-ops","experimental-zacas","fast-serialization","load-store-on-cond","lse","lse128","lse2","mclass","miscellaneous-extensions-3","quadword-atomics","rcpc3","sm_70","v6","zaamo","zabha"))"#
         );
     }
@@ -175,6 +175,11 @@ fn main() {
                 println!("cargo:rustc-cfg=portable_atomic_unstable_asm_experimental_arch");
             }
         }
+        "nvptx64" => {
+            if version.nightly && is_allowed_feature("asm_experimental_arch") {
+                println!("cargo:rustc-cfg=portable_atomic_unstable_asm_experimental_arch");
+            }
+        }
         _ => {}
     }
 }
@@ -435,6 +440,36 @@ fn main() {
             // nand (nnr{,g}k), select (sel{,g}r), etc.
             target_feature_fallback("miscellaneous-extensions-3", arch13_features);
         }
+        "nvptx64" => {
+            let mut sm_70 = false;
+            if let Some(rustflags) = env::var_os("CARGO_ENCODED_RUSTFLAGS") {
+                for mut flag in rustflags.to_string_lossy().split('\x1f') {
+                    flag = strip_prefix(flag, "-C").unwrap_or(flag);
+                    if let Some(flag) = strip_prefix(flag, "target-feature=") {
+                        for s in flag.split(',') {
+                            // TODO: Handle cases where a specific target feature
+                            // implicitly enables another target feature.
+                            match (s.as_bytes().first(), s.get(1..)) {
+                                (Some(b'+'), Some(f)) => {
+                                    if let Some(sm) = strip_prefix(f, "sm_") {
+                                        if let Ok(sm) = sm.parse::<u32>() {
+                                            if sm >= 70 {
+                                                sm_70 = true;
+                                            }
+                                        }
+                                    }
+                                }
+                                (Some(b'-'), Some(_f)) => {
+                                    // TODO
+                                }
+                                _ => {}
+                            }
+                        }
+                    }
+                }
+            }
+            target_feature_fallback("sm_70", sm_70);
+        }
         _ => {}
     }
 }
diff --git a/src/imp/float/int.rs b/src/imp/float/int.rs
index bda88364..8652db43 100644
--- a/src/imp/float/int.rs
+++ b/src/imp/float/int.rs
@@ -9,8 +9,9 @@
 Note that most of `fetch_*` operations of atomic floats are implemented using
 CAS loops, which can be slower than equivalent operations of atomic integers.
 AArch64 with FEAT_LSFE and GPU targets have atomic instructions for float.
-Both will use architecture-specific implementations instead of this implementation in the
-future: https://github.com/taiki-e/portable-atomic/issues/34 / https://github.com/taiki-e/portable-atomic/pull/45
+See nvptx.rs for NVPTX.
+AArch64 with FEAT_LSFE will also use architecture-specific implementations instead of this implementation in the
+future: https://github.com/taiki-e/portable-atomic/pull/201
 */
 
 // TODO: fetch_{minimum,maximum}* https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2024/p3008r2.html / https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2024/p0493r5.pdf
@@ -203,9 +204,21 @@ macro_rules! atomic_float {
 cfg_has_atomic_16! {
     atomic_float!(AtomicF16, f16, AtomicU16, u16, 2);
 }
+#[cfg(not(all(
+    target_arch = "nvptx64",
+    any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"),
+    not(any(miri, portable_atomic_sanitize_thread)),
+    portable_atomic_unstable_asm_experimental_arch,
+)))]
 cfg_has_atomic_32! {
     atomic_float!(AtomicF32, f32, AtomicU32, u32, 4);
 }
+#[cfg(not(all(
+    target_arch = "nvptx64",
+    any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"),
+    not(any(miri, portable_atomic_sanitize_thread)),
+    portable_atomic_unstable_asm_experimental_arch,
+)))]
 cfg_has_atomic_64! {
     atomic_float!(AtomicF64, f64, AtomicU64, u64, 8);
 }
diff --git a/src/imp/float/mod.rs b/src/imp/float/mod.rs
index a36f0983..8f49938d 100644
--- a/src/imp/float/mod.rs
+++ b/src/imp/float/mod.rs
@@ -8,13 +8,40 @@ Atomic float implementations
 
 mod int;
 
+#[cfg(all(
+    target_arch = "nvptx64",
+    any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"),
+    not(any(miri, portable_atomic_sanitize_thread)),
+    portable_atomic_unstable_asm_experimental_arch,
+))]
+mod nvptx;
+
 #[cfg(portable_atomic_unstable_f16)]
 cfg_has_atomic_16! {
     pub(crate) use self::int::AtomicF16;
 }
+#[cfg(all(
+    target_arch = "nvptx64",
+    any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"),
+    not(any(miri, portable_atomic_sanitize_thread)),
+    portable_atomic_unstable_asm_experimental_arch,
+))]
+pub(crate) use self::nvptx::{AtomicF32, AtomicF64};
+#[cfg(not(all(
+    target_arch = "nvptx64",
+    any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"),
+    not(any(miri, portable_atomic_sanitize_thread)),
+    portable_atomic_unstable_asm_experimental_arch,
+)))]
 cfg_has_atomic_32! {
     pub(crate) use self::int::AtomicF32;
 }
+#[cfg(not(all(
+    target_arch = "nvptx64",
+    any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"),
+    not(any(miri, portable_atomic_sanitize_thread)),
+    portable_atomic_unstable_asm_experimental_arch,
+)))]
 cfg_has_atomic_64! {
     pub(crate) use self::int::AtomicF64;
 }
diff --git a/src/imp/float/nvptx.rs b/src/imp/float/nvptx.rs
new file mode 100644
index 00000000..596f6ca7
--- /dev/null
+++ b/src/imp/float/nvptx.rs
@@ -0,0 +1,380 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+/*
+Atomic float implementation on NVPTX.
+
+Refs:
+- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld
+- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
+- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
+- User Guide for NVPTX Back-end (LLVM documentation) https://llvm.org/docs/NVPTXUsage.html
+- https://github.com/NVIDIA/cccl/blob/cc7c1bb7e888dcfc8665ca4936d8e99c7476a847/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h
+*/
+
+// This module is currently enabled on sm_70+.
+// TODO: Support pre-sm_70
+
+use core::{arch::asm, cell::UnsafeCell, sync::atomic::Ordering};
+
+// On NVPTX, a seqcst atomic op is lowered as a preceding seqcst fence followed
+// by the acquire variant of the op.
+macro_rules! fence_sc {
+    () => {
+        "fence.sc.gpu;"
+    };
+}
+
+macro_rules! atomic_rmw {
+    ($op:ident, $order:ident) => {
+        match $order {
+            Ordering::Relaxed => $op!("relaxed", ""),
+            Ordering::Acquire => $op!("acquire", ""),
+            Ordering::Release => $op!("release", ""),
+            Ordering::AcqRel => $op!("acqrel", ""),
+            Ordering::SeqCst => $op!("acquire", fence_sc!()),
+            _ => unreachable!(),
+        }
+    };
+}
+
+macro_rules! atomic_float {
+    (
+        $atomic_type:ident, $float_type:ident, $atomic_int_type:ident, $int_type:ident,
+        $val_reg:ident, $align:expr
+    ) => {
+        #[repr(C, align($align))]
+        pub(crate) struct $atomic_type {
+            v: UnsafeCell<$float_type>,
+        }
+
+        // Send is implicitly implemented.
+        // SAFETY: any data races are prevented by atomic operations.
+        unsafe impl Sync for $atomic_type {}
+
+        impl $atomic_type {
+            #[inline]
+            pub(crate) const fn new(v: $float_type) -> Self {
+                Self { v: UnsafeCell::new(v) }
+            }
+
+            #[inline]
+            pub(crate) fn is_lock_free() -> bool {
+                true
+            }
+            pub(crate) const IS_ALWAYS_LOCK_FREE: bool = true;
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn load(&self, order: Ordering) -> $float_type {
+                let src = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! atomic_load {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("ld.", $sem, ".gpu.", stringify!($float_type), " {out}, [{src}];"),
+                                src = in(reg64) src,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    match order {
+                        Ordering::Relaxed => atomic_load!("relaxed", ""),
+                        Ordering::Acquire => atomic_load!("acquire", ""),
+                        Ordering::SeqCst => atomic_load!("acquire", fence_sc!()),
+                        _ => unreachable!(),
+                    }
+                }
+                out
+            }
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn store(&self, val: $float_type, order: Ordering) {
+                let dst = self.v.get();
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! atomic_store {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("st.", $sem, ".gpu.", stringify!($float_type), " [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    match order {
+                        Ordering::Relaxed => atomic_store!("relaxed", ""),
+                        Ordering::Release => atomic_store!("release", ""),
+                        Ordering::SeqCst => atomic_store!("relaxed", fence_sc!()),
+                        _ => unreachable!(),
+                    }
+                }
+            }
+
+            #[inline]
+            pub(crate) fn swap(&self, val: $float_type, order: Ordering) -> $float_type {
+                let dst = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! swap {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.exch.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(swap, order);
+                }
+                out
+            }
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn compare_exchange(
+                &self,
+                old: $float_type,
+                new: $float_type,
+                success: Ordering,
+                failure: Ordering,
+            ) -> Result<$float_type, $float_type> {
+                let order = crate::utils::upgrade_success_ordering(success, failure);
+                let dst = self.v.get();
+                let out: $float_type;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! cmpxchg {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.cas.", stringify!($float_type), " {out}, [{dst}], {old}, {new};"),
+                                dst = in(reg64) dst,
+                                old = in($val_reg) old,
+                                new = in($val_reg) new,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(cmpxchg, order);
+                }
+                if out.to_bits() == old.to_bits() {
+                    Ok(out)
+                } else {
+                    Err(out)
+                }
+            }
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn compare_exchange_weak(
+                &self,
+                current: $float_type,
+                new: $float_type,
+                success: Ordering,
+                failure: Ordering,
+            ) -> Result<$float_type, $float_type> {
+                self.compare_exchange(current, new, success, failure)
+            }
+
+            #[inline]
+            pub(crate) fn fetch_add(&self, val: $float_type, order: Ordering) -> $float_type {
+                let dst = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! add {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.add.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(add, order);
+                }
+                out
+            }
+
+            #[inline]
+            pub(crate) fn fetch_sub(&self, val: $float_type, order: Ordering) -> $float_type {
+                // There is no atomic sub instruction, so add `-val`.
+                self.fetch_add(-val, order)
+            }
+
+            #[allow(dead_code)] // TODO
+            #[inline]
+            pub(crate) fn fetch_and(&self, val: $float_type, order: Ordering) -> $float_type {
+                let dst = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! and {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.and.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(and, order);
+                }
+                out
+            }
+
+            #[allow(dead_code)] // TODO
+            #[inline]
+            pub(crate) fn fetch_or(&self, val: $float_type, order: Ordering) -> $float_type {
+                let dst = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! or {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.or.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(or, order);
+                }
+                out
+            }
+
+            #[allow(dead_code)] // TODO
+            #[inline]
+            pub(crate) fn fetch_xor(&self, val: $float_type, order: Ordering) -> $float_type {
+                let dst = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! xor {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.xor.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(xor, order);
+                }
+                out
+            }
+
+            #[inline]
+            pub(crate) fn fetch_max(&self, val: $float_type, order: Ordering) -> $float_type {
+                let dst = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! max {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.max.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(max, order);
+                }
+                out
+            }
+
+            #[inline]
+            pub(crate) fn fetch_min(&self, val: $float_type, order: Ordering) -> $float_type {
+                let dst = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! min {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.min.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(min, order);
+                }
+                out
+            }
+
+            #[inline]
+            pub(crate) fn fetch_neg(&self, order: Ordering) -> $float_type {
+                const NEG_MASK: $int_type = !0 / 2 + 1;
+                // TODO: use self.fetch_xor
+                $float_type::from_bits(self.as_bits().fetch_xor(NEG_MASK, order))
+            }
+
+            #[inline]
+            pub(crate) fn fetch_abs(&self, order: Ordering) -> $float_type {
+                const ABS_MASK: $int_type = !0 / 2;
+                // TODO: use self.fetch_and
+                $float_type::from_bits(self.as_bits().fetch_and(ABS_MASK, order))
+            }
+
+            const_fn! {
+                const_if: #[cfg(not(portable_atomic_no_const_raw_ptr_deref))];
+                #[inline]
+                pub(crate) const fn as_bits(&self) -> &crate::$atomic_int_type {
+                    // SAFETY: $atomic_type and $atomic_int_type have the same layout,
+                    // and there is no concurrent access to the value that does not go through this method.
+                    unsafe { &*(self as *const $atomic_type as *const crate::$atomic_int_type) }
+                }
+            }
+
+            #[inline]
+            pub(crate) const fn as_ptr(&self) -> *mut $float_type {
+                self.v.get()
+            }
+        }
+    };
+}
+
+atomic_float!(AtomicF32, f32, AtomicU32, u32, reg32, 4);
+atomic_float!(AtomicF64, f64, AtomicU64, u64, reg64, 8);
diff --git a/src/lib.rs b/src/lib.rs
index 26e28e1b..48340d99 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -222,7 +222,7 @@ RUSTFLAGS="--cfg portable_atomic_no_outline_atomics" cargo ...
 #![allow(clippy::inline_always, clippy::used_underscore_items)]
 // asm_experimental_arch
 // AVR, MSP430, and Xtensa are tier 3 platforms and require nightly anyway.
-// On tier 2 platforms (powerpc64), we use cfg set by build script to
+// On tier 2 platforms (powerpc64 and nvptx64), we use cfg set by build script to
 // determine whether this feature is available or not.
 #![cfg_attr(
     all(
@@ -232,6 +232,7 @@ RUSTFLAGS="--cfg portable_atomic_no_outline_atomics" cargo ...
             target_arch = "msp430",
             all(target_arch = "xtensa", portable_atomic_unsafe_assume_single_core),
             all(target_arch = "powerpc64", portable_atomic_unstable_asm_experimental_arch),
+            all(target_arch = "nvptx64", portable_atomic_unstable_asm_experimental_arch),
         ),
     ),
     feature(asm_experimental_arch)
diff --git a/tools/build.sh b/tools/build.sh
index 669d1103..a62fd383 100755
--- a/tools/build.sh
+++ b/tools/build.sh
@@ -123,6 +123,9 @@ default_targets=(
     # s390x
    # rustc --print target-list | grep -E '^s390x'
    s390x-unknown-linux-gnu
+
+    # nvptx64
+    nvptx64-nvidia-cuda
 )
 # NB: sync with:
 # - docs.rs metadata in Cargo.toml
@@ -655,6 +658,11 @@ build() {
            RUSTFLAGS="${target_rustflags} -C target-cpu=z15" \
                x_cargo "${args[@]}" "$@"
            ;;
+        nvptx64-*)
+            CARGO_TARGET_DIR="${target_dir}/sm_70" \
+                RUSTFLAGS="${target_rustflags} -C target-feature=+sm_70" \
+                x_cargo "${args[@]}" "$@"
+            ;;
     esac
 }
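
Usage sketch (illustrative only, not part of the patch proper): call sites go
through portable-atomic's existing public float API unchanged, so no user code
needs to be updated. Assuming a nightly toolchain targeting
nvptx64-nvidia-cuda with -C target-feature=+sm_70 and the crate's `float`
feature enabled (kernel attributes and launch glue omitted), the fetch_add
below lowers to a single `atom.relaxed.gpu.add.f32` instead of the CAS loop in
float/int.rs:

    use core::sync::atomic::Ordering;
    use portable_atomic::AtomicF32;

    // Accumulator shared between GPU threads.
    static SUM: AtomicF32 = AtomicF32::new(0.0);

    fn accumulate(x: f32) -> f32 {
        // Relaxed suffices for a pure accumulation; on sm_70+ this maps to
        // the asm-based implementation added in src/imp/float/nvptx.rs and
        // returns the previous value.
        SUM.fetch_add(x, Ordering::Relaxed)
    }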