1#ifndef AMREX_GPU_ATOMIC_H_
2#define AMREX_GPU_ATOMIC_H_
3#include <AMReX_Config.H>
13namespace Gpu::Atomic {
30 template <
typename R,
typename I,
typename F>
32 R atomic_op (R*
const address, R
const val,
F const f)
noexcept
34#if defined(__SYCL_DEVICE_ONLY__)
35 constexpr auto mo = sycl::memory_order::relaxed;
36 constexpr auto ms = sycl::memory_scope::device;
37 constexpr auto as = sycl::access::address_space::global_space;
38 static_assert(
sizeof(R) ==
sizeof(I),
"sizeof R != sizeof I");
39 I*
const add_as_I =
reinterpret_cast<I*
>(address);
40 sycl::atomic_ref<I,mo,ms,as> a{*add_as_I};
41 I old_I = a.load(), new_I;
43 R
const new_R = f(*(
reinterpret_cast<R const*
>(&old_I)), val);
44 new_I = *(
reinterpret_cast<I const*
>(&new_R));
45 }
while (! a.compare_exchange_strong(old_I, new_I));
46 return *(
reinterpret_cast<R const*
>(&old_I));
49 *address = f(*address, val);
54 template <
typename R,
typename I,
typename Op,
typename Cond>
56 bool atomic_op_if (R*
const address, R
const val, Op&& op, Cond&& cond)
noexcept
58#if defined(__SYCL_DEVICE_ONLY__)
59 constexpr auto mo = sycl::memory_order::relaxed;
60 constexpr auto ms = sycl::memory_scope::device;
61 constexpr auto as = sycl::access::address_space::global_space;
62 static_assert(
sizeof(R) ==
sizeof(I),
"sizeof R != sizeof I");
63 I*
const add_as_I =
reinterpret_cast<I*
>(address);
64 sycl::atomic_ref<I,mo,ms,as> a{*add_as_I};
65 I old_I = a.load(), new_I;
68 R
const tmp = op(*(
reinterpret_cast<R const*
>(&old_I)), val);
69 new_I = *(
reinterpret_cast<I const*
>(&tmp));
70 test_success = cond(tmp);
71 }
while (test_success && ! a.compare_exchange_strong(old_I, new_I));
75 R
const tmp = op(*(
reinterpret_cast<R const*
>(&old)), val);
85#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
87 template <
typename R,
typename I,
typename F>
89 R atomic_op (R*
const address, R
const val,
F const f)
noexcept
91 static_assert(
sizeof(R) ==
sizeof(I),
"sizeof R != sizeof I");
92 I*
const add_as_I =
reinterpret_cast<I*
>(address);
93 I old_I = *add_as_I, assumed_I;
96 R
const new_R = f(*(
reinterpret_cast<R const*
>(&assumed_I)), val);
97 old_I = atomicCAS(add_as_I, assumed_I, *(
reinterpret_cast<I const*
>(&new_R)));
98 }
while (assumed_I != old_I);
99 return *(
reinterpret_cast<R const*
>(&old_I));
102 template <
typename R,
typename I,
typename Op,
typename Cond>
104 bool atomic_op_if (R*
const address, R
const val, Op&& op, Cond&& cond)
noexcept
106 static_assert(
sizeof(R) ==
sizeof(I),
"sizeof R != sizeof I");
107 I*
const add_as_I =
reinterpret_cast<I*
>(address);
108 I old_I = *add_as_I, assumed_I;
112 R
const new_R = op(*(
reinterpret_cast<R const*
>(&assumed_I)), val);
113 test_success = cond(new_R);
115 old_I = atomicCAS(add_as_I, assumed_I, *(
reinterpret_cast<I const*
>(&new_R)));
117 }
while (test_success && assumed_I != old_I);
133 template<
class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
140#if defined(__SYCL_DEVICE_ONLY__)
141 constexpr auto mo = sycl::memory_order::relaxed;
142 constexpr auto ms = sycl::memory_scope::device;
143 sycl::atomic_ref<T,mo,ms,AS> a{*sum};
144 return a.fetch_add(value);
154#if defined(AMREX_USE_HIP) && defined(__gfx90a__)
157 float Add_device (
float*
const sum,
float const value)
noexcept
159 return unsafeAtomicAdd(sum, value);
163 double Add_device (
double*
const sum,
double const value)
noexcept
165 return unsafeAtomicAdd(sum, value);
169#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
178 return detail::atomic_op<Long, unsigned long long>(sum, value,
amrex::Plus<Long>());
183#if defined(AMREX_USE_CUDA) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
186 double Add_device (
double*
const sum,
double const value)
noexcept
196 template<
class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
201 T
Add (T* sum, T value)
noexcept
219#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) || defined(AMREX_USE_SYCL)
220 template <
typename T,
typename Op,
typename Cond,
221 std::enable_if_t<
sizeof(T) ==
sizeof(
unsigned int),
int> foo = 0>
223 bool If_device (T*
const sum, T
const value, Op&& op, Cond&& cond)
noexcept
225 return detail::atomic_op_if<T, unsigned int>(sum, value,
226 std::forward<Op>(op), std::forward<Cond>(cond));
229 template <
typename T,
typename Op,
typename Cond,
230 std::enable_if_t<
sizeof(T) ==
sizeof(
unsigned long long),
int> foo = 0>
232 bool If_device (T*
const sum, T
const value, Op&& op, Cond&& cond)
noexcept
234 return detail::atomic_op_if<T, unsigned long long>(sum, value,
235 std::forward<Op>(op), std::forward<Cond>(cond));
255 template<
class T,
class Op,
class Cond>
257 bool If (T*
const add, T
const value, Op&& op, Cond&& cond)
noexcept
260 return If_device(add, value, std::forward<Op>(op), std::forward<Cond>(cond));
264 T
const tmp = std::forward<Op>(op)(old, value);
265 if (std::forward<Cond>(cond)(tmp)) {
279 template<
class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
286#if defined(__SYCL_DEVICE_ONLY__)
287 Add_device<T,AS>(sum, value);
294#if defined(AMREX_USE_HIP) && defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 5)
296 void AddNoRet (
float*
const sum,
float const value)
noexcept
298#pragma clang diagnostic push
299#pragma clang diagnostic ignored "-Wdeprecated-declarations"
301#pragma clang diagnostic pop
316#if defined(__SYCL_DEVICE_ONLY__)
317 constexpr auto mo = sycl::memory_order::relaxed;
318 constexpr auto ms = sycl::memory_scope::device;
319 constexpr auto as = sycl::access::address_space::global_space;
320 sycl::atomic_ref<T,mo,ms,as> a{*m};
321 return a.fetch_min(value);
332 float Min_device (
float*
const m,
float const value)
noexcept
338 double Min_device (
double*
const m,
double const value)
noexcept
343#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
357 T
Min (T*
const m, T
const value)
noexcept
364 *m = (*m) < value ? (*m) : value;
379#if defined(__SYCL_DEVICE_ONLY__)
380 constexpr auto mo = sycl::memory_order::relaxed;
381 constexpr auto ms = sycl::memory_scope::device;
382 constexpr auto as = sycl::access::address_space::global_space;
383 sycl::atomic_ref<T,mo,ms,as> a{*m};
384 return a.fetch_max(value);
395 float Max_device (
float*
const m,
float const value)
noexcept
401 double Max_device (
double*
const m,
double const value)
noexcept
406#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
420 T
Max (T*
const m, T
const value)
noexcept
427 *m = (*m) > value ? (*m) : value;
439#if defined(__SYCL_DEVICE_ONLY__)
440 constexpr auto mo = sycl::memory_order::relaxed;
441 constexpr auto ms = sycl::memory_scope::device;
442 constexpr auto as = sycl::access::address_space::global_space;
443 sycl::atomic_ref<int,mo,ms,as> a{*m};
444 return a.fetch_or(value);
447 return atomicOr(m, value);
464#if defined(__SYCL_DEVICE_ONLY__)
465 constexpr auto mo = sycl::memory_order::relaxed;
466 constexpr auto ms = sycl::memory_scope::device;
467 constexpr auto as = sycl::access::address_space::global_space;
468 sycl::atomic_ref<int,mo,ms,as> a{*m};
469 return a.fetch_and(value ? ~0x0 : 0);
472 return atomicAnd(m, value ? ~0x0 : 0);
486 template <
typename T>
488 T
Exch (T* address, T val)
noexcept
490#if defined(__SYCL_DEVICE_ONLY__)
491 constexpr auto mo = sycl::memory_order::relaxed;
492 constexpr auto ms = sycl::memory_scope::device;
493 constexpr auto as = sycl::access::address_space::global_space;
494 sycl::atomic_ref<T,mo,ms,as> a{*address};
495 return a.exchange(val);
498 return atomicExch(address, val);
501 auto const old = *address;
512 template <
typename T>
514 T
CAS (T*
const address, T compare, T
const val)
noexcept
516#if defined(__SYCL_DEVICE_ONLY__)
517 constexpr auto mo = sycl::memory_order::relaxed;
518 constexpr auto ms = sycl::memory_scope::device;
519 constexpr auto as = sycl::access::address_space::global_space;
520 sycl::atomic_ref<T,mo,ms,as> a{*address};
521 a.compare_exchange_strong(compare, val);
525 return atomicCAS(address, compare, val);
528 auto const old = *address;
529 *address = (old == compare ? val : old);
541 template <
typename T, std::enable_if_t<sizeof(T) == sizeof(
int),
int> = 0>
548 template <
typename T, std::enable_if_t<sizeof(T) == sizeof(
unsigned long long),
int> = 0>
565 auto const old = *prod;
577 template <
typename T, std::enable_if_t<sizeof(T) == sizeof(
int),
int> = 0>
584 template <
typename T, std::enable_if_t<sizeof(T) == sizeof(
unsigned long long),
int> = 0>
595 T
Divide (T*
const quot, T
const value)
noexcept
601 auto const old = *quot;
608namespace HostDevice::Atomic {
612 void Add_Host (T*
const sum, T
const value)
noexcept
615#pragma omp atomic update
622 void Add (T*
const sum, T
const value)
noexcept
633 template <
typename T>
641 template <
typename T>
649 template <
typename T>
657 template <
typename T>
665 template <
typename T>
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_IF_ON_DEVICE(CODE)
Definition AMReX_GpuQualifiers.H:56
#define AMREX_IF_ON_HOST(CODE)
Definition AMReX_GpuQualifiers.H:58
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
amrex_long Long
Definition AMReX_INT.H:30
__host__ __device__ AMREX_FORCE_INLINE T Exch(T *address, T val) noexcept
Definition AMReX_GpuAtomic.H:488
__device__ AMREX_FORCE_INLINE bool If_device(T *const sum, T const value, Op &&op, Cond &&cond) noexcept
Definition AMReX_GpuAtomic.H:223
__device__ AMREX_FORCE_INLINE T Divide_device(T *const quot, T const value) noexcept
Definition AMReX_GpuAtomic.H:579
__host__ __device__ AMREX_FORCE_INLINE int LogicalAnd(int *const m, int const value) noexcept
Definition AMReX_GpuAtomic.H:462
__host__ __device__ AMREX_FORCE_INLINE T CAS(T *const address, T compare, T const val) noexcept
Definition AMReX_GpuAtomic.H:514
__host__ __device__ AMREX_FORCE_INLINE T Multiply(T *const prod, T const value) noexcept
Definition AMReX_GpuAtomic.H:559
__host__ __device__ AMREX_FORCE_INLINE int LogicalOr(int *const m, int const value) noexcept
Definition AMReX_GpuAtomic.H:437
__device__ AMREX_FORCE_INLINE T Multiply_device(T *const prod, T const value) noexcept
Definition AMReX_GpuAtomic.H:543
__host__ __device__ AMREX_FORCE_INLINE bool If(T *const add, T const value, Op &&op, Cond &&cond) noexcept
Conditionally perform an atomic operation.
Definition AMReX_GpuAtomic.H:257
__device__ AMREX_FORCE_INLINE T Max_device(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:377
__host__ __device__ AMREX_FORCE_INLINE void AddNoRet(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:284
__device__ AMREX_FORCE_INLINE T Min_device(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:314
__host__ __device__ AMREX_FORCE_INLINE T Add(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:201
__device__ AMREX_FORCE_INLINE T Add_device(T *const sum, T const value) noexcept
Definition AMReX_GpuAtomic.H:138
__host__ __device__ AMREX_FORCE_INLINE T Max(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:420
__host__ __device__ AMREX_FORCE_INLINE T Min(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:357
__host__ __device__ AMREX_FORCE_INLINE T Divide(T *const quot, T const value) noexcept
Definition AMReX_GpuAtomic.H:595
__host__ __device__ AMREX_FORCE_INLINE void Add(T *const sum, T const value) noexcept
Definition AMReX_GpuAtomic.H:622
AMREX_FORCE_INLINE void Add_Host(T *const sum, T const value) noexcept
Definition AMReX_GpuAtomic.H:612
Definition AMReX_Amr.cpp:49
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
Definition AMReX_Functional.H:77
Definition AMReX_GpuAtomic.H:635
__device__ void operator()(T *const dest, T const source) noexcept
Definition AMReX_GpuAtomic.H:636
Definition AMReX_GpuAtomic.H:659
__device__ void operator()(T *const dest, T const source) noexcept
Definition AMReX_GpuAtomic.H:660
Definition AMReX_GpuAtomic.H:667
__device__ void operator()(T *const dest, T const source) noexcept
Definition AMReX_GpuAtomic.H:668
Definition AMReX_GpuAtomic.H:651
__device__ void operator()(T *const dest, T const source) noexcept
Definition AMReX_GpuAtomic.H:652
Definition AMReX_GpuAtomic.H:643
__device__ void operator()(T *const dest, T const source) noexcept
Definition AMReX_GpuAtomic.H:644
Definition AMReX_Functional.H:50
Definition AMReX_Functional.H:59
Definition AMReX_Functional.H:41
Definition AMReX_Functional.H:32
Definition AMReX_Functional.H:68
Definition AMReX_Functional.H:14