From 1daf33537a59d35b3963fae3a7a4d7002ed18316 Mon Sep 17 00:00:00 2001
From: Taiki Endo
Date: Mon, 13 Jan 2025 00:16:35 +0900
Subject: [PATCH] Optimize atomic float on NVPTX

---
 build.rs               |  39 ++++-
 src/imp/float/int.rs   |  17 +-
 src/imp/float/mod.rs   |  27 +++
 src/imp/float/nvptx.rs | 380 +++++++++++++++++++++++++++++++++++++++++
 src/lib.rs             |   3 +-
 tools/build.sh         |   8 +
 6 files changed, 469 insertions(+), 5 deletions(-)
 create mode 100644 src/imp/float/nvptx.rs

diff --git a/build.rs b/build.rs
index 4a0b8500..9b319bdf 100644
--- a/build.rs
+++ b/build.rs
@@ -47,7 +47,7 @@ fn main() {
 
     if version.minor >= 80 {
         println!(
-            r#"cargo:rustc-check-cfg=cfg(target_feature,values("experimental-zacas","fast-serialization","load-store-on-cond","distinct-ops","miscellaneous-extensions-3"))"#
+            r#"cargo:rustc-check-cfg=cfg(target_feature,values("experimental-zacas","sm_70","fast-serialization","load-store-on-cond","distinct-ops","miscellaneous-extensions-3"))"#
         );
 
         // Custom cfgs set by build script. Not public API.
@@ -58,7 +58,7 @@ fn main() {
         // TODO: handle multi-line target_feature_fallback
        // grep -F 'target_feature_fallback("' build.rs | grep -Ev '^ *//' | sed -E 's/^.*target_feature_fallback\(//; s/",.*$/"/' | LC_ALL=C sort -u | tr '\n' ',' | sed -E 's/,$/\n/'
         println!(
-            r#"cargo:rustc-check-cfg=cfg(portable_atomic_target_feature,values("cmpxchg16b","distinct-ops","experimental-zacas","fast-serialization","load-store-on-cond","lse","lse128","lse2","mclass","miscellaneous-extensions-3","quadword-atomics","rcpc3","v6","zaamo","zabha"))"#
+            r#"cargo:rustc-check-cfg=cfg(portable_atomic_target_feature,values("cmpxchg16b","distinct-ops","experimental-zacas","fast-serialization","load-store-on-cond","lse","lse128","lse2","mclass","miscellaneous-extensions-3","quadword-atomics","rcpc3","sm_70","v6","zaamo","zabha"))"#
         );
     }
@@ -175,6 +175,11 @@ fn main() {
                 println!("cargo:rustc-cfg=portable_atomic_unstable_asm_experimental_arch");
             }
         }
+        "nvptx64" => {
+            if version.nightly && is_allowed_feature("asm_experimental_arch") {
+                println!("cargo:rustc-cfg=portable_atomic_unstable_asm_experimental_arch");
+            }
+        }
         _ => {}
     }
 }
@@ -435,6 +440,36 @@ fn main() {
             // nand (nnr{,g}k), select (sel{,g}r), etc.
             target_feature_fallback("miscellaneous-extensions-3", arch13_features);
         }
+        "nvptx64" => {
+            let mut sm_70 = false;
+            if let Some(rustflags) = env::var_os("CARGO_ENCODED_RUSTFLAGS") {
+                for mut flag in rustflags.to_string_lossy().split('\x1f') {
+                    flag = strip_prefix(flag, "-C").unwrap_or(flag);
+                    if let Some(flag) = strip_prefix(flag, "target-feature=") {
+                        for s in flag.split(',') {
+                            // TODO: Handle cases where a specific target feature
+                            // implicitly enables another target feature.
+                            match (s.as_bytes().first(), s.get(1..)) {
+                                (Some(b'+'), Some(f)) => {
+                                    if let Some(sm) = strip_prefix(f, "sm_") {
+                                        if let Ok(sm) = sm.parse::<u32>() {
+                                            if sm >= 70 {
+                                                sm_70 = true;
+                                            }
+                                        }
+                                    }
+                                }
+                                (Some(b'-'), Some(_f)) => {
+                                    // TODO
+                                }
+                                _ => {}
+                            }
+                        }
+                    }
+                }
+            }
+            target_feature_fallback("sm_70", sm_70);
+        }
         _ => {}
     }
 }
diff --git a/src/imp/float/int.rs b/src/imp/float/int.rs
index bda88364..8652db43 100644
--- a/src/imp/float/int.rs
+++ b/src/imp/float/int.rs
@@ -9,8 +9,9 @@
 Note that most of `fetch_*` operations of atomic floats are implemented using
 CAS loops, which can be slower than equivalent operations of atomic integers.
 AArch64 with FEAT_LSFE and GPU targets have atomic instructions for float.
-Both will use architecture-specific implementations instead of this implementation in the
-future: https://github.com/taiki-e/portable-atomic/issues/34 / https://github.com/taiki-e/portable-atomic/pull/45
+See nvptx.rs for NVPTX.
+AArch64 with FEAT_LSFE will also use architecture-specific implementations instead of this implementation in the
+future: https://github.com/taiki-e/portable-atomic/pull/201
 */
 
 // TODO: fetch_{minimum,maximum}* https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2024/p3008r2.html / https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2024/p0493r5.pdf
@@ -203,9 +204,21 @@ macro_rules! atomic_float {
 cfg_has_atomic_16! {
     atomic_float!(AtomicF16, f16, AtomicU16, u16, 2);
 }
+#[cfg(not(all(
+    target_arch = "nvptx64",
+    any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"),
+    not(any(miri, portable_atomic_sanitize_thread)),
+    portable_atomic_unstable_asm_experimental_arch,
+)))]
 cfg_has_atomic_32! {
     atomic_float!(AtomicF32, f32, AtomicU32, u32, 4);
 }
+#[cfg(not(all(
+    target_arch = "nvptx64",
+    any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"),
+    not(any(miri, portable_atomic_sanitize_thread)),
+    portable_atomic_unstable_asm_experimental_arch,
+)))]
 cfg_has_atomic_64! {
     atomic_float!(AtomicF64, f64, AtomicU64, u64, 8);
 }
diff --git a/src/imp/float/mod.rs b/src/imp/float/mod.rs
index a36f0983..8f49938d 100644
--- a/src/imp/float/mod.rs
+++ b/src/imp/float/mod.rs
@@ -8,13 +8,40 @@ Atomic float implementations
 
 mod int;
 
+#[cfg(all(
+    target_arch = "nvptx64",
+    any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"),
+    not(any(miri, portable_atomic_sanitize_thread)),
+    portable_atomic_unstable_asm_experimental_arch,
+))]
+mod nvptx;
+
 #[cfg(portable_atomic_unstable_f16)]
 cfg_has_atomic_16! {
     pub(crate) use self::int::AtomicF16;
 }
+#[cfg(all(
+    target_arch = "nvptx64",
+    any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"),
+    not(any(miri, portable_atomic_sanitize_thread)),
+    portable_atomic_unstable_asm_experimental_arch,
+))]
+pub(crate) use self::nvptx::{AtomicF32, AtomicF64};
+#[cfg(not(all(
+    target_arch = "nvptx64",
+    any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"),
+    not(any(miri, portable_atomic_sanitize_thread)),
+    portable_atomic_unstable_asm_experimental_arch,
+)))]
 cfg_has_atomic_32! {
     pub(crate) use self::int::AtomicF32;
 }
+#[cfg(not(all(
+    target_arch = "nvptx64",
+    any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"),
+    not(any(miri, portable_atomic_sanitize_thread)),
+    portable_atomic_unstable_asm_experimental_arch,
+)))]
 cfg_has_atomic_64! {
     pub(crate) use self::int::AtomicF64;
 }
diff --git a/src/imp/float/nvptx.rs b/src/imp/float/nvptx.rs
new file mode 100644
index 00000000..596f6ca7
--- /dev/null
+++ b/src/imp/float/nvptx.rs
@@ -0,0 +1,380 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+/*
+Atomic float implementation on NVPTX.
+
+Refs:
+- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld
+- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
+- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
+- User Guide for NVPTX Back-end (LLVM documentation) https://llvm.org/docs/NVPTXUsage.html
+- https://github.com/NVIDIA/cccl/blob/cc7c1bb7e888dcfc8665ca4936d8e99c7476a847/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h
+*/
+
+// This module is currently enabled on sm_70+.
+// TODO: Support pre-sm_70
+
+use core::{arch::asm, cell::UnsafeCell, sync::atomic::Ordering};
+
+// On NVPTX, a seqcst atomic op is lowered as a preceding seqcst fence followed
+// by the acquire variant of the op.
+macro_rules! fence_sc {
+    () => {
+        "fence.sc.gpu;"
+    };
+}
+
+macro_rules! atomic_rmw {
+    ($op:ident, $order:ident) => {
+        match $order {
+            Ordering::Relaxed => $op!("relaxed", ""),
+            Ordering::Acquire => $op!("acquire", ""),
+            Ordering::Release => $op!("release", ""),
+            Ordering::AcqRel => $op!("acqrel", ""),
+            Ordering::SeqCst => $op!("acquire", fence_sc!()),
+            _ => unreachable!(),
+        }
+    };
+}
+
+macro_rules! atomic_float {
+    (
+        $atomic_type:ident, $float_type:ident, $atomic_int_type:ident, $int_type:ident,
+        $val_reg:ident, $align:expr
+    ) => {
+        #[repr(C, align($align))]
+        pub(crate) struct $atomic_type {
+            v: UnsafeCell<$float_type>,
+        }
+
+        // Send is implicitly implemented.
+        // SAFETY: any data races are prevented by atomic operations.
+        unsafe impl Sync for $atomic_type {}
+
+        impl $atomic_type {
+            #[inline]
+            pub(crate) const fn new(v: $float_type) -> Self {
+                Self { v: UnsafeCell::new(v) }
+            }
+
+            #[inline]
+            pub(crate) fn is_lock_free() -> bool {
+                true
+            }
+            pub(crate) const IS_ALWAYS_LOCK_FREE: bool = true;
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn load(&self, order: Ordering) -> $float_type {
+                let src = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! atomic_load {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("ld.", $sem, ".gpu.", stringify!($float_type), " {out}, [{src}];"),
+                                src = in(reg64) src,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    match order {
+                        Ordering::Relaxed => atomic_load!("relaxed", ""),
+                        Ordering::Acquire => atomic_load!("acquire", ""),
+                        Ordering::SeqCst => atomic_load!("acquire", fence_sc!()),
+                        _ => unreachable!(),
+                    }
+                }
+                out
+            }
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn store(&self, val: $float_type, order: Ordering) {
+                let dst = self.v.get();
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! atomic_store {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("st.", $sem, ".gpu.", stringify!($float_type), " [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    match order {
+                        Ordering::Relaxed => atomic_store!("relaxed", ""),
+                        Ordering::Release => atomic_store!("release", ""),
+                        Ordering::SeqCst => atomic_store!("relaxed", fence_sc!()),
+                        _ => unreachable!(),
+                    }
+                }
+            }
+
+            #[inline]
+            pub(crate) fn swap(&self, val: $float_type, order: Ordering) -> $float_type {
+                let dst = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! swap {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.exch.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(swap, order);
+                }
+                out
+            }
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn compare_exchange(
+                &self,
+                old: $float_type,
+                new: $float_type,
+                success: Ordering,
+                failure: Ordering,
+            ) -> Result<$float_type, $float_type> {
+                let order = crate::utils::upgrade_success_ordering(success, failure);
+                let dst = self.v.get();
+                let out: $float_type;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! cmpxchg {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.cas.", stringify!($float_type), " {out}, [{dst}], {old}, {new};"),
+                                dst = in(reg64) dst,
+                                old = in($val_reg) old,
+                                new = in($val_reg) new,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(cmpxchg, order);
+                }
+                if out.to_bits() == old.to_bits() {
+                    Ok(out)
+                } else {
+                    Err(out)
+                }
+            }
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn compare_exchange_weak(
+                &self,
+                current: $float_type,
+                new: $float_type,
+                success: Ordering,
+                failure: Ordering,
+            ) -> Result<$float_type, $float_type> {
+                self.compare_exchange(current, new, success, failure)
+            }
+
+            #[inline]
+            pub(crate) fn fetch_add(&self, val: $float_type, order: Ordering) -> $float_type {
+                let dst = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! add {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.add.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(add, order);
+                }
+                out
+            }
+
+            #[inline]
+            pub(crate) fn fetch_sub(&self, val: $float_type, order: Ordering) -> $float_type {
+                // There is no atomic sub instruction, so add `-val`.
+                self.fetch_add(-val, order)
+            }
+
+            #[allow(dead_code)] // TODO
+            #[inline]
+            pub(crate) fn fetch_and(&self, val: $float_type, order: Ordering) -> $float_type {
+                let dst = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! and {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.and.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(and, order);
+                }
+                out
+            }
+
+            #[allow(dead_code)] // TODO
+            #[inline]
+            pub(crate) fn fetch_or(&self, val: $float_type, order: Ordering) -> $float_type {
+                let dst = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! or {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.or.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(or, order);
+                }
+                out
+            }
+
+            #[allow(dead_code)] // TODO
+            #[inline]
+            pub(crate) fn fetch_xor(&self, val: $float_type, order: Ordering) -> $float_type {
+                let dst = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! xor {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.xor.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(xor, order);
+                }
+                out
+            }
+
+            #[inline]
+            pub(crate) fn fetch_max(&self, val: $float_type, order: Ordering) -> $float_type {
+                let dst = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! max {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.max.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(max, order);
+                }
+                out
+            }
+
+            #[inline]
+            pub(crate) fn fetch_min(&self, val: $float_type, order: Ordering) -> $float_type {
+                let dst = self.v.get();
+                let out;
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe {
+                    macro_rules! min {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.min.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                                options(nostack),
+                            )
+                        };
+                    }
+                    atomic_rmw!(min, order);
+                }
+                out
+            }
+
+            #[inline]
+            pub(crate) fn fetch_neg(&self, order: Ordering) -> $float_type {
+                const NEG_MASK: $int_type = !0 / 2 + 1;
+                // TODO: use self.fetch_xor
+                $float_type::from_bits(self.as_bits().fetch_xor(NEG_MASK, order))
+            }
+
+            #[inline]
+            pub(crate) fn fetch_abs(&self, order: Ordering) -> $float_type {
+                const ABS_MASK: $int_type = !0 / 2;
+                // TODO: use self.fetch_and
+                $float_type::from_bits(self.as_bits().fetch_and(ABS_MASK, order))
+            }
+
+            const_fn! {
+                const_if: #[cfg(not(portable_atomic_no_const_raw_ptr_deref))];
+                #[inline]
+                pub(crate) const fn as_bits(&self) -> &crate::$atomic_int_type {
+                    // SAFETY: $atomic_type and $atomic_int_type have the same layout,
+                    // and there is no concurrent access to the value that does not go through this method.
+                    unsafe { &*(self as *const $atomic_type as *const crate::$atomic_int_type) }
+                }
+            }
+
+            #[inline]
+            pub(crate) const fn as_ptr(&self) -> *mut $float_type {
+                self.v.get()
+            }
+        }
+    };
+}
+
+atomic_float!(AtomicF32, f32, AtomicU32, u32, reg32, 4);
+atomic_float!(AtomicF64, f64, AtomicU64, u64, reg64, 8);
diff --git a/src/lib.rs b/src/lib.rs
index 26e28e1b..48340d99 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -222,7 +222,7 @@ RUSTFLAGS="--cfg portable_atomic_no_outline_atomics" cargo ...
 #![allow(clippy::inline_always, clippy::used_underscore_items)]
 // asm_experimental_arch
 // AVR, MSP430, and Xtensa are tier 3 platforms and require nightly anyway.
-// On tier 2 platforms (powerpc64), we use cfg set by build script to
+// On tier 2 platforms (powerpc64 and nvptx64), we use cfg set by build script to
 // determine whether this feature is available or not.
 #![cfg_attr(
     all(
@@ -232,6 +232,7 @@ RUSTFLAGS="--cfg portable_atomic_no_outline_atomics" cargo ...
             target_arch = "msp430",
             all(target_arch = "xtensa", portable_atomic_unsafe_assume_single_core),
             all(target_arch = "powerpc64", portable_atomic_unstable_asm_experimental_arch),
+            all(target_arch = "nvptx64", portable_atomic_unstable_asm_experimental_arch),
         ),
     ),
     feature(asm_experimental_arch)
diff --git a/tools/build.sh b/tools/build.sh
index 669d1103..a62fd383 100755
--- a/tools/build.sh
+++ b/tools/build.sh
@@ -123,6 +123,9 @@ default_targets=(
     # s390x
    # rustc --print target-list | grep -E '^s390x'
    s390x-unknown-linux-gnu
+
+    # nvptx64
+    nvptx64-nvidia-cuda
 )
 # NB: sync with:
 # - docs.rs metadata in Cargo.toml
@@ -655,6 +658,11 @@ build() {
            RUSTFLAGS="${target_rustflags} -C target-cpu=z15" \
                x_cargo "${args[@]}" "$@"
            ;;
+        nvptx64-*)
+            CARGO_TARGET_DIR="${target_dir}/sm_70" \
+                RUSTFLAGS="${target_rustflags} -C target-feature=+sm_70" \
+                x_cargo "${args[@]}" "$@"
+            ;;
     esac
 }
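
Usage sketch (illustrative only, not part of the patch proper): call sites go
through portable-atomic's existing public float API unchanged, so no user code
needs to be updated. Assuming a nightly toolchain targeting
nvptx64-nvidia-cuda with -C target-feature=+sm_70 and the crate's `float`
feature enabled (kernel attributes and launch glue omitted), the fetch_add
below lowers to a single `atom.relaxed.gpu.add.f32` instead of the CAS loop in
float/int.rs:

    use core::sync::atomic::Ordering;
    use portable_atomic::AtomicF32;

    // Accumulator shared between GPU threads.
    static SUM: AtomicF32 = AtomicF32::new(0.0);

    fn accumulate(x: f32) -> f32 {
        // Relaxed suffices for a pure accumulation; on sm_70+ this maps to
        // the asm-based implementation added in src/imp/float/nvptx.rs and
        // returns the previous value.
        SUM.fetch_add(x, Ordering::Relaxed)
    }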