#ifndef AMREX_GPU_ATOMIC_H_
#define AMREX_GPU_ATOMIC_H_
#include <AMReX_Config.H>
#include <AMReX_GpuQualifiers.H>
#include <AMReX_Extension.H>
#include <AMReX_Functional.H>
#include <AMReX_INT.H>

namespace amrex {

namespace Gpu::Atomic {
// For Add, Min and Max, we support int, unsigned int, long, unsigned long long, float and double.
// For Multiply and Divide, we support generic types as long as they are the same size as int or
// unsigned long long and have *= and /= operators.
// For LogicalOr and LogicalAnd, the data type is int.
// For Exch and CAS, the data type is generic.
// All these functions are non-atomic in host code.  If atomicity is needed in host code, use
// HostDevice::Atomic::*.  Currently only HostDevice::Atomic::Add is supported.

namespace detail {

#if defined(AMREX_USE_SYCL)

    template <typename R, typename I, typename F>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    R atomic_op (R* const address, R const val, F const f) noexcept
    {
#if defined(__SYCL_DEVICE_ONLY__)
        constexpr auto mo = sycl::memory_order::relaxed;
        constexpr auto ms = sycl::memory_scope::device;
        constexpr auto as = sycl::access::address_space::global_space;
        static_assert(sizeof(R) == sizeof(I), "sizeof R != sizeof I");
        I* const add_as_I = reinterpret_cast<I*>(address);
        sycl::atomic_ref<I,mo,ms,as> a{*add_as_I};
        I old_I = a.load(), new_I;
        do {
            R const new_R = f(*(reinterpret_cast<R const*>(&old_I)), val);
            new_I = *(reinterpret_cast<I const*>(&new_R));
        } while (! a.compare_exchange_strong(old_I, new_I));
        return *(reinterpret_cast<R const*>(&old_I));
#else
        R const old = *address;
        *address = f(*address, val);
        return old;
#endif
    }
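
    // Usage sketch (illustrative; atomic_mul is hypothetical): atomic_op turns
    // any binary functor into a read-modify-write atomic by punning R to a
    // same-sized integer I and retrying compare-exchange until no other thread
    // has intervened in between.
    //
    //     AMREX_GPU_DEVICE double atomic_mul (double* p, double v) noexcept {
    //         return atomic_op<double, unsigned long long>(
    //             p, v, [] (double a, double b) { return a * b; });
    //     }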
    template <typename R, typename I, typename Op, typename Cond>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    bool atomic_op_if (R* const address, R const val, Op&& op, Cond&& cond) noexcept
    {
#if defined(__SYCL_DEVICE_ONLY__)
        constexpr auto mo = sycl::memory_order::relaxed;
        constexpr auto ms = sycl::memory_scope::device;
        constexpr auto as = sycl::access::address_space::global_space;
        static_assert(sizeof(R) == sizeof(I), "sizeof R != sizeof I");
        I* const add_as_I = reinterpret_cast<I*>(address);
        sycl::atomic_ref<I,mo,ms,as> a{*add_as_I};
        I old_I = a.load(), new_I;
        bool test_success;
        do {
            R const tmp = op(*(reinterpret_cast<R const*>(&old_I)), val);
            new_I = *(reinterpret_cast<I const*>(&tmp));
            test_success = cond(tmp);
        } while (test_success && ! a.compare_exchange_strong(old_I, new_I));
        return test_success;
#else
        R const old = *address;
        R const tmp = op(*(reinterpret_cast<R const*>(&old)), val);
        if (cond(tmp)) {
            *address = tmp;
            return true;
        } else {
            return false;
        }
#endif
    }
#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
    template <typename R, typename I, typename F>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    R atomic_op (R* const address, R const val, F const f) noexcept
    {
        static_assert(sizeof(R) == sizeof(I), "sizeof R != sizeof I");
        I* const add_as_I = reinterpret_cast<I*>(address);
        I old_I = *add_as_I, assumed_I;
        do {
            assumed_I = old_I;
            R const new_R = f(*(reinterpret_cast<R const*>(&assumed_I)), val);
            old_I = atomicCAS(add_as_I, assumed_I, *(reinterpret_cast<I const*>(&new_R)));
        } while (assumed_I != old_I);
        return *(reinterpret_cast<R const*>(&old_I));
    }
    template <typename R, typename I, typename Op, typename Cond>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    bool atomic_op_if (R* const address, R const val, Op&& op, Cond&& cond) noexcept
    {
        static_assert(sizeof(R) == sizeof(I), "sizeof R != sizeof I");
        I* const add_as_I = reinterpret_cast<I*>(address);
        I old_I = *add_as_I, assumed_I;
        bool test_success;
        do {
            assumed_I = old_I;
            R const new_R = op(*(reinterpret_cast<R const*>(&assumed_I)), val);
            test_success = cond(new_R);
            if (test_success) {
                old_I = atomicCAS(add_as_I, assumed_I, *(reinterpret_cast<I const*>(&new_R)));
            }
        } while (test_success && assumed_I != old_I);
        return test_success;
    }

#endif

} // namespace detail
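
// Design note: both back-ends of detail::atomic_op_if above share an early-exit
// contract.  The loop gives up as soon as cond rejects the would-be result, so
// the boolean return reports whether the store was actually committed instead
// of spinning unconditionally.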
#ifdef AMREX_USE_SYCL
    template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T Add_device (T* const sum, T const value) noexcept
    {
#if defined(__SYCL_DEVICE_ONLY__)
        constexpr auto mo = sycl::memory_order::relaxed;
        constexpr auto ms = sycl::memory_scope::device;
        sycl::atomic_ref<T,mo,ms,AS> a{*sum};
        return a.fetch_add(value);
#else
        amrex::ignore_unused(sum, value);
        return T(); // only reached in the host pass of SYCL compilation
#endif
    }
#endif
#if defined(AMREX_USE_HIP) && defined(__gfx90a__)
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    float Add_device (float* const sum, float const value) noexcept
    {
        return unsafeAtomicAdd(sum, value);
    }
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    double Add_device (double* const sum, double const value) noexcept
    {
        return unsafeAtomicAdd(sum, value);
    }
#endif
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    Long Add_device (Long* const sum, Long const value) noexcept
    {
        return detail::atomic_op<Long, unsigned long long>(sum, value, amrex::Plus<Long>());
    }
#endif
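
    // Neither CUDA nor HIP exposes a native atomicAdd for signed Long, which is
    // why the overload above routes through the unsigned long long CAS loop in
    // detail::atomic_op.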
#if defined(AMREX_USE_CUDA) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    double Add_device (double* const sum, double const value) noexcept
    {
        return detail::atomic_op<double, unsigned long long>(sum, value, amrex::Plus<double>());
    }
#endif
    template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    T Add (T* sum, T value) noexcept
    {
        AMREX_IF_ON_DEVICE((return Add_device(sum, value);))
        AMREX_IF_ON_HOST((
            auto const old = *sum;
            *sum += value;
            return old;
        ))
    }
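
    // Usage sketch (illustrative; n, p_total and p_contrib are hypothetical):
    // accumulating per-thread contributions into one location from a kernel.
    //
    //     amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept {
    //         Gpu::Atomic::Add(p_total, p_contrib[i]); // returns the old value
    //     });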
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) || defined(AMREX_USE_SYCL)
    template <typename T, typename Op, typename Cond,
              std::enable_if_t<sizeof(T) == sizeof(unsigned int), int> foo = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    bool If_device (T* const sum, T const value, Op&& op, Cond&& cond) noexcept
    {
        return detail::atomic_op_if<T, unsigned int>(sum, value,
                   std::forward<Op>(op), std::forward<Cond>(cond));
    }
    template <typename T, typename Op, typename Cond,
              std::enable_if_t<sizeof(T) == sizeof(unsigned long long), int> foo = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    bool If_device (T* const sum, T const value, Op&& op, Cond&& cond) noexcept
    {
        return detail::atomic_op_if<T, unsigned long long>(sum, value,
                   std::forward<Op>(op), std::forward<Cond>(cond));
    }
#endif
    /**
     * \brief Conditionally perform an atomic operation.
     */
    template<class T, class Op, class Cond>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    bool If (T* const add, T const value, Op&& op, Cond&& cond) noexcept
    {
        AMREX_IF_ON_DEVICE((
            return If_device(add, value, std::forward<Op>(op), std::forward<Cond>(cond));
        ))
        AMREX_IF_ON_HOST((
            auto const old = *add;
            T const tmp = std::forward<Op>(op)(old, value);
            if (std::forward<Cond>(cond)(tmp)) {
                *add = tmp;
                return true;
            } else {
                return false;
            }
        ))
    }
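
    // Usage sketch (illustrative; p_counter and cap are hypothetical): take a
    // ticket only while the count stays within a cap; the return value tells
    // the thread whether its update was committed.
    //
    //     bool const claimed = Gpu::Atomic::If(p_counter, 1,
    //         [] AMREX_GPU_DEVICE (int a, int b) noexcept { return a + b; },
    //         [=] AMREX_GPU_DEVICE (int r) noexcept { return r <= cap; });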
    template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void AddNoRet (T* sum, T value) noexcept
    {
#if defined(__SYCL_DEVICE_ONLY__)
        Add_device<T,AS>(sum, value);
#else
        AMREX_IF_ON_DEVICE((Add_device(sum, value);))
        AMREX_IF_ON_HOST((*sum += value;))
#endif
    }
#if defined(AMREX_USE_HIP) && defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 5)
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void AddNoRet (float* const sum, float const value) noexcept
    {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
        atomicAddNoRet(sum, value);
#pragma clang diagnostic pop
    }
#endif
    template<class T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T Min_device (T* const m, T const value) noexcept
    {
#if defined(__SYCL_DEVICE_ONLY__)
        constexpr auto mo = sycl::memory_order::relaxed;
        constexpr auto ms = sycl::memory_scope::device;
        constexpr auto as = sycl::access::address_space::global_space;
        sycl::atomic_ref<T,mo,ms,as> a{*m};
        return a.fetch_min(value);
#else
        return atomicMin(m, value);
#endif
    }
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    float Min_device (float* const m, float const value) noexcept
    {
        return detail::atomic_op<float, int>(m, value, amrex::Minimum<float>());
    }
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    double Min_device (double* const m, double const value) noexcept
    {
        return detail::atomic_op<double, unsigned long long>(m, value, amrex::Minimum<double>());
    }
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    Long Min_device (Long* const m, Long const value) noexcept
    {
        return detail::atomic_op<Long, unsigned long long>(m, value, amrex::Minimum<Long>());
    }
#endif
    template<class T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    T Min (T* const m, T const value) noexcept
    {
        AMREX_IF_ON_DEVICE((return Min_device(m, value);))
        AMREX_IF_ON_HOST((
            auto const old = *m;
            *m = (*m) < value ? (*m) : value;
            return old;
        ))
    }
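
    // Usage sketch (illustrative; p_min and dist are hypothetical): p_min is
    // pre-initialized to a large sentinel, then reduced across threads.
    //
    //     amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept {
    //         Gpu::Atomic::Min(p_min, dist[i]);
    //     });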
    template<class T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T Max_device (T* const m, T const value) noexcept
    {
#if defined(__SYCL_DEVICE_ONLY__)
        constexpr auto mo = sycl::memory_order::relaxed;
        constexpr auto ms = sycl::memory_scope::device;
        constexpr auto as = sycl::access::address_space::global_space;
        sycl::atomic_ref<T,mo,ms,as> a{*m};
        return a.fetch_max(value);
#else
        return atomicMax(m, value);
#endif
    }
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    float Max_device (float* const m, float const value) noexcept
    {
        return detail::atomic_op<float, int>(m, value, amrex::Maximum<float>());
    }
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    double Max_device (double* const m, double const value) noexcept
    {
        return detail::atomic_op<double, unsigned long long>(m, value, amrex::Maximum<double>());
    }
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    Long Max_device (Long* const m, Long const value) noexcept
    {
        return detail::atomic_op<Long, unsigned long long>(m, value, amrex::Maximum<Long>());
    }
#endif
    template<class T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    T Max (T* const m, T const value) noexcept
    {
        AMREX_IF_ON_DEVICE((return Max_device(m, value);))
        AMREX_IF_ON_HOST((
            auto const old = *m;
            *m = (*m) > value ? (*m) : value;
            return old;
        ))
    }
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    int LogicalOr (int* const m, int const value) noexcept
    {
#if defined(__SYCL_DEVICE_ONLY__)
        constexpr auto mo = sycl::memory_order::relaxed;
        constexpr auto ms = sycl::memory_scope::device;
        constexpr auto as = sycl::access::address_space::global_space;
        sycl::atomic_ref<int,mo,ms,as> a{*m};
        return a.fetch_or(value);
#else
        AMREX_IF_ON_DEVICE((
            return atomicOr(m, value);
        ))
        AMREX_IF_ON_HOST((
            int const old = *m;
            *m = (*m) || value;
            return old;
        ))
#endif
    }
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    int LogicalAnd (int* const m, int const value) noexcept
    {
#if defined(__SYCL_DEVICE_ONLY__)
        constexpr auto mo = sycl::memory_order::relaxed;
        constexpr auto ms = sycl::memory_scope::device;
        constexpr auto as = sycl::access::address_space::global_space;
        sycl::atomic_ref<int,mo,ms,as> a{*m};
        return a.fetch_and(value ? ~0x0 : 0);
#else
        AMREX_IF_ON_DEVICE((
            return atomicAnd(m, value ? ~0x0 : 0);
        ))
        AMREX_IF_ON_HOST((
            int const old = *m;
            *m = (*m) && value;
            return old;
        ))
#endif
    }
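
    // The (value ? ~0x0 : 0) normalization above maps every nonzero input to
    // all ones, so the bitwise fetch_and/atomicAnd composes like a logical AND:
    // the result stays nonzero only if every participating thread passed true.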
    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    T Exch (T* address, T val) noexcept
    {
#if defined(__SYCL_DEVICE_ONLY__)
        constexpr auto mo = sycl::memory_order::relaxed;
        constexpr auto ms = sycl::memory_scope::device;
        constexpr auto as = sycl::access::address_space::global_space;
        sycl::atomic_ref<T,mo,ms,as> a{*address};
        return a.exchange(val);
#else
        AMREX_IF_ON_DEVICE((
            return atomicExch(address, val);
        ))
        AMREX_IF_ON_HOST((
            auto const old = *address;
            *address = val;
            return old;
        ))
#endif
    }
    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    T CAS (T* const address, T compare, T const val) noexcept
    {
#if defined(__SYCL_DEVICE_ONLY__)
        constexpr auto mo = sycl::memory_order::relaxed;
        constexpr auto ms = sycl::memory_scope::device;
        constexpr auto as = sycl::access::address_space::global_space;
        sycl::atomic_ref<T,mo,ms,as> a{*address};
        a.compare_exchange_strong(compare, val);
        return compare;
#else
        AMREX_IF_ON_DEVICE((
            return atomicCAS(address, compare, val);
        ))
        AMREX_IF_ON_HOST((
            auto const old = *address;
            *address = (old == compare ? val : old);
            return old;
        ))
#endif
    }
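
    // Usage sketch (illustrative; p_owner and tid are hypothetical): CAS
    // returns the value it found, so seeing `compare` back signals success.
    // Claiming a slot initialized to 0 exactly once:
    //
    //     int const prev = Gpu::Atomic::CAS(p_owner, 0, tid);
    //     if (prev == 0) { /* this thread owns the slot */ }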
    template <typename T, std::enable_if_t<sizeof(T) == sizeof(int), int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T Multiply_device (T* const prod, T const value) noexcept
    {
        return detail::atomic_op<T, int>(prod, value,
                   [] (T a, T const b) { a *= b; return a; });
    }
    template <typename T, std::enable_if_t<sizeof(T) == sizeof(unsigned long long), int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T Multiply_device (T* const prod, T const value) noexcept
    {
        return detail::atomic_op<T, unsigned long long>(prod, value,
                   [] (T a, T const b) { a *= b; return a; });
    }
    template<class T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    T Multiply (T* const prod, T const value) noexcept
    {
        AMREX_IF_ON_DEVICE((return Multiply_device(prod, value);))
        AMREX_IF_ON_HOST((
            auto const old = *prod;
            *prod *= value;
            return old;
        ))
    }
    template <typename T, std::enable_if_t<sizeof(T) == sizeof(int), int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T Divide_device (T* const quot, T const value) noexcept
    {
        return detail::atomic_op<T, int>(quot, value,
                   [] (T a, T const b) { a /= b; return a; });
    }
    template <typename T, std::enable_if_t<sizeof(T) == sizeof(unsigned long long), int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T Divide_device (T* const quot, T const value) noexcept
    {
        return detail::atomic_op<T, unsigned long long>(quot, value,
                   [] (T a, T const b) { a /= b; return a; });
    }
    template<class T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    T Divide (T* const quot, T const value) noexcept
    {
        AMREX_IF_ON_DEVICE((return Divide_device(quot, value);))
        AMREX_IF_ON_HOST((
            auto const old = *quot;
            *quot /= value;
            return old;
        ))
    }
} // namespace Gpu::Atomic

namespace HostDevice::Atomic {
    template<class T>
    AMREX_FORCE_INLINE
    void Add_Host (T* const sum, T const value) noexcept
    {
#ifdef AMREX_USE_OMP
#pragma omp atomic update
#endif
        *sum += value;
    }
    template<class T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void Add (T* const sum, T const value) noexcept
    {
        AMREX_IF_ON_DEVICE((Gpu::Atomic::AddNoRet(sum, value);))
        AMREX_IF_ON_HOST((Add_Host(sum, value);))
    }
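
    // Usage sketch (illustrative; p_sum and x are hypothetical): the same call
    // site is valid in host and device code; it dispatches to Add_Host (OpenMP
    // atomic) on the host and to Gpu::Atomic::AddNoRet in device kernels.
    //
    //     HostDevice::Atomic::Add(p_sum, x);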
} // namespace HostDevice::Atomic

template <typename T>
struct AtomicAdd
{
    AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
        Gpu::Atomic::AddNoRet(dest, source);
    }
};

template <typename T>
struct AtomicMin
{
    AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
        Gpu::Atomic::Min(dest, source);
    }
};

template <typename T>
struct AtomicMax
{
    AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
        Gpu::Atomic::Max(dest, source);
    }
};

template <typename T>
struct AtomicLogicalAnd
{
    AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
        Gpu::Atomic::LogicalAnd(dest, source);
    }
};

template <typename T>
struct AtomicLogicalOr
{
    AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
        Gpu::Atomic::LogicalOr(dest, source);
    }
};

} // namespace amrex

#endif // AMREX_GPU_ATOMIC_H_