Block-Structured AMR Software Framework
 
AMReX_GpuAtomic.H
1#ifndef AMREX_GPU_ATOMIC_H_
2#define AMREX_GPU_ATOMIC_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_GpuQualifiers.H>
6#include <AMReX_Functional.H>
7#include <AMReX_INT.H>
8
9#include <utility>
10
11namespace amrex {
12
13namespace Gpu::Atomic {
14
15// For Add, Min and Max, we support int, unsigned int, long, unsigned long long, float and double.
16// For Multiply and Divide, we support generic types provided they are the same size as int or unsigned long long
17// and have *= and /= operators.
18// For LogicalOr and LogicalAnd, the data type is int.
19// For Exch and CAS, the data type is generic.
20// All these functions are non-atomic in host code!!!
21// If one needs them to be atomic in host code, use HostDevice::Atomic::*. Currently only
22// HostDevice::Atomic::Add is supported. We could certainly add more.
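// Usage sketch (illustrative, not part of the original header): these atomics
// are normally called from device code, e.g. inside an amrex::ParallelFor
// lambda. Here `data`, `dsum`, `dmin`, and `dmax` are hypothetical device
// pointers, with *dmin and *dmax suitably initialized beforehand:
//
//   amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
//   {
//       amrex::Gpu::Atomic::AddNoRet(dsum, data[i]); // accumulate a sum
//       amrex::Gpu::Atomic::Min(dmin, data[i]);      // track the minimum
//       amrex::Gpu::Atomic::Max(dmax, data[i]);      // track the maximum
//   });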
23
25namespace detail {
26
27#ifdef AMREX_USE_SYCL
28
29 template <typename R, typename I, typename F>
30 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
31 R atomic_op (R* const address, R const val, F const f) noexcept
32 {
33#if defined(__SYCL_DEVICE_ONLY__)
34 constexpr auto mo = sycl::memory_order::relaxed;
35 constexpr auto ms = sycl::memory_scope::device;
36 constexpr auto as = sycl::access::address_space::global_space;
37 static_assert(sizeof(R) == sizeof(I), "sizeof R != sizeof I");
38 I* const add_as_I = reinterpret_cast<I*>(address);
39 sycl::atomic_ref<I,mo,ms,as> a{*add_as_I};
40 I old_I = a.load(), new_I;
41 do {
42 R const new_R = f(*(reinterpret_cast<R const*>(&old_I)), val);
43 new_I = *(reinterpret_cast<I const*>(&new_R));
44 } while (! a.compare_exchange_strong(old_I, new_I));
45 return *(reinterpret_cast<R const*>(&old_I));
46#else
47 R old = *address;
48 *address = f(*address, val);
49 return old;
50#endif
51 }
52
53 template <typename R, typename I, typename Op, typename Cond>
54 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
55 bool atomic_op_if (R* const address, R const val, Op&& op, Cond&& cond) noexcept
56 {
57#if defined(__SYCL_DEVICE_ONLY__)
58 constexpr auto mo = sycl::memory_order::relaxed;
59 constexpr auto ms = sycl::memory_scope::device;
60 constexpr auto as = sycl::access::address_space::global_space;
61 static_assert(sizeof(R) == sizeof(I), "sizeof R != sizeof I");
62 I* const add_as_I = reinterpret_cast<I*>(address);
63 sycl::atomic_ref<I,mo,ms,as> a{*add_as_I};
64 I old_I = a.load(), new_I;
65 bool test_success;
66 do {
67 R const tmp = op(*(reinterpret_cast<R const*>(&old_I)), val);
68 new_I = *(reinterpret_cast<I const*>(&tmp));
69 test_success = cond(tmp);
70 } while (test_success && ! a.compare_exchange_strong(old_I, new_I));
71 return test_success;
72#else
73 R old = *address;
74 R const tmp = op(*(reinterpret_cast<R const*>(&old)), val);
75 if (cond(tmp)) {
76 *address = tmp;
77 return true;
78 } else {
79 return false;
80 }
81#endif
82 }
83
84#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
85
86 template <typename R, typename I, typename F>
87 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
88 R atomic_op (R* const address, R const val, F const f) noexcept
89 {
90 static_assert(sizeof(R) == sizeof(I), "sizeof R != sizeof I");
91 I* const add_as_I = reinterpret_cast<I*>(address);
92 I old_I = *add_as_I, assumed_I;
93 do {
94 assumed_I = old_I;
95 R const new_R = f(*(reinterpret_cast<R const*>(&assumed_I)), val);
96 old_I = atomicCAS(add_as_I, assumed_I, *(reinterpret_cast<I const*>(&new_R)));
97 } while (assumed_I != old_I);
98 return *(reinterpret_cast<R const*>(&old_I));
99 }
100
101 template <typename R, typename I, typename Op, typename Cond>
102 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
103 bool atomic_op_if (R* const address, R const val, Op&& op, Cond&& cond) noexcept
104 {
105 static_assert(sizeof(R) == sizeof(I), "sizeof R != sizeof I");
106 I* const add_as_I = reinterpret_cast<I*>(address);
107 I old_I = *add_as_I, assumed_I;
108 bool test_success;
109 do {
110 assumed_I = old_I;
111 R const new_R = op(*(reinterpret_cast<R const*>(&assumed_I)), val);
112 test_success = cond(new_R);
113 if (test_success) {
114 old_I = atomicCAS(add_as_I, assumed_I, *(reinterpret_cast<I const*>(&new_R)));
115 }
116 } while (test_success && assumed_I != old_I);
117 return test_success;
118 }
119
120#endif
121
122}
124
126// Add
128
129#ifdef AMREX_USE_GPU
130
131#ifdef AMREX_USE_SYCL
132 template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
133#else
134 template<class T>
135#endif
136 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
137 T Add_device (T* const sum, T const value) noexcept
138 {
139#if defined(__SYCL_DEVICE_ONLY__)
140 constexpr auto mo = sycl::memory_order::relaxed;
141 constexpr auto ms = sycl::memory_scope::device;
142 sycl::atomic_ref<T,mo,ms,AS> a{*sum};
143 return a.fetch_add(value);
144#else
145 AMREX_IF_ON_DEVICE(( return atomicAdd(sum, value); ))
146 AMREX_IF_ON_HOST((
147 amrex::ignore_unused(sum, value);
148 return T(); // should never get here, but have to return something
149 ))
150#endif
151 }
152
153#if defined(AMREX_USE_HIP) && defined(__gfx90a__)
154 // https://github.com/ROCm-Developer-Tools/hipamd/blob/rocm-4.5.x/include/hip/amd_detail/amd_hip_unsafe_atomics.h
155 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
156 float Add_device (float* const sum, float const value) noexcept
157 {
158 return unsafeAtomicAdd(sum, value);
159 }
160
161 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
162 double Add_device (double* const sum, double const value) noexcept
163 {
164 return unsafeAtomicAdd(sum, value);
165 }
166#endif
167
168#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
169
170 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd
171 // https://rocmdocs.amd.com/en/latest/Programming_Guides/Kernel_language.html?#atomic-functions
172 // CUDA and HIP support int, unsigned int, and unsigned long long.
173
174 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
175 Long Add_device (Long* const sum, Long const value) noexcept
176 {
177 return detail::atomic_op<Long, unsigned long long>(sum, value, amrex::Plus<Long>());
178 }
179
180#endif
181
182#if defined(AMREX_USE_CUDA) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
183
184 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
185 double Add_device (double* const sum, double const value) noexcept
186 {
187 return detail::atomic_op<double, unsigned long long>(sum, value, amrex::Plus<double>());
188 }
189
190#endif
191
192#endif
193
194#ifdef AMREX_USE_SYCL
195 template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
196#else
197 template<class T>
198#endif
199 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
200 T Add (T* sum, T value) noexcept
201 {
202#ifdef AMREX_USE_SYCL
203 AMREX_IF_ON_DEVICE((return Add_device<T,AS>(sum, value);))
204#else
205 AMREX_IF_ON_DEVICE((return Add_device(sum, value);))
206#endif
207 AMREX_IF_ON_HOST((
208 auto old = *sum;
209 *sum += value;
210 return old;
211 ))
212 }
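// Usage sketch (illustrative, not part of the original header): Add returns
// the value held before the update, so a fetch-add on a counter can hand out
// unique slots in an output buffer. `counter` (an unsigned int device
// counter), `out`, and `keep` are hypothetical device data:
//
//   amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
//   {
//       if (keep[i]) {
//           unsigned int slot = amrex::Gpu::Atomic::Add(counter, 1u);
//           out[slot] = i;  // each thread writes to the slot it reserved
//       }
//   });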
213
215// If
217
218#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) || defined(AMREX_USE_SYCL)
219 template <typename T, typename Op, typename Cond,
220 std::enable_if_t<sizeof(T) == sizeof(unsigned int), int> foo = 0>
221 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
222 bool If_device (T* const sum, T const value, Op&& op, Cond&& cond) noexcept
223 {
224 return detail::atomic_op_if<T, unsigned int>(sum, value,
225 std::forward<Op>(op), std::forward<Cond>(cond));
226 }
227
228 template <typename T, typename Op, typename Cond,
229 std::enable_if_t<sizeof(T) == sizeof(unsigned long long), int> foo = 0>
230 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
231 bool If_device (T* const sum, T const value, Op&& op, Cond&& cond) noexcept
232 {
233 return detail::atomic_op_if<T, unsigned long long>(sum, value,
234 std::forward<Op>(op), std::forward<Cond>(cond));
235 }
236#endif
237
254 template<class T, class Op, class Cond>
255 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
256 bool If (T* const add, T const value, Op&& op, Cond&& cond) noexcept
257 {
258 AMREX_IF_ON_DEVICE((
259 return If_device(add, value, std::forward<Op>(op), std::forward<Cond>(cond));
260 ))
261 AMREX_IF_ON_HOST((
262 T old = *add;
263 T const tmp = std::forward<Op>(op)(old, value);
264 if (std::forward<Cond>(cond)(tmp)) {
265 *add = tmp;
266 return true;
267 } else {
268 return false;
269 }
270 ))
271 }
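// Usage sketch (illustrative, not part of the original header): If applies
// `op` to the current value and `value`, commits the result only when `cond`
// accepts it, and reports whether the update happened. For example, an
// increment that must not push a hypothetical `counter` past `cap`:
//
//   amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
//   {
//       bool took = amrex::Gpu::Atomic::If(counter, 1, amrex::Plus<int>(),
//                       [=] (int result) { return result <= cap; });
//       if (!took) { /* counter already at cap; skip this item */ }
//   });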
272
274// AddNoRet
276
277#ifdef AMREX_USE_SYCL
278 template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
279#else
280 template<class T>
281#endif
282 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
283 void AddNoRet (T* sum, T value) noexcept
284 {
285#if defined(__SYCL_DEVICE_ONLY__)
286 Add_device<T,AS>(sum, value);
287#else
288 AMREX_IF_ON_DEVICE((Add_device(sum, value);))
289 AMREX_IF_ON_HOST((*sum += value;))
290#endif
291 }
292
293#if defined(AMREX_USE_HIP) && defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 5)
294 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
295 void AddNoRet (float* const sum, float const value) noexcept
296 {
297#pragma clang diagnostic push
298#pragma clang diagnostic ignored "-Wdeprecated-declarations"
299 AMREX_IF_ON_DEVICE((atomicAddNoRet(sum, value);))
300#pragma clang diagnostic pop
301 AMREX_IF_ON_HOST((*sum += value;))
302 }
303#endif
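// Usage sketch (illustrative, not part of the original header): AddNoRet is
// the usual choice when the previous value is not needed, e.g. several threads
// depositing into the same cell of an amrex::Array4. `a` (an Array4<Real>),
// `w`, and the box `bx` are hypothetical:
//
//   amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
//   {
//       amrex::Gpu::Atomic::AddNoRet(&a(i,j,k), w(i,j,k));
//   });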
304
306// Min
308
309#ifdef AMREX_USE_GPU
310
311 template<class T>
312 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
313 T Min_device (T* const m, T const value) noexcept
314 {
315#if defined(__SYCL_DEVICE_ONLY__)
316 constexpr auto mo = sycl::memory_order::relaxed;
317 constexpr auto ms = sycl::memory_scope::device;
318 constexpr auto as = sycl::access::address_space::global_space;
319 sycl::atomic_ref<T,mo,ms,as> a{*m};
320 return a.fetch_min(value);
321#else
322 AMREX_IF_ON_DEVICE(( return atomicMin(m, value); ))
323 AMREX_IF_ON_HOST((
324 amrex::ignore_unused(m,value);
325 return T(); // should never get here, but have to return something
326 ))
327#endif
328 }
329
330 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
331 float Min_device (float* const m, float const value) noexcept
332 {
333 return detail::atomic_op<float,int>(m,value,amrex::Minimum<float>());
334 }
335
336 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
337 double Min_device (double* const m, double const value) noexcept
338 {
339 return detail::atomic_op<double, unsigned long long int>(m,value,amrex::Minimum<double>());
340 }
341
342#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
343
344 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
345 Long Min_device (Long* const m, Long const value) noexcept
346 {
347 return detail::atomic_op<Long, unsigned long long int>(m,value,amrex::Minimum<Long>());
348 }
349
350#endif
351
352#endif
353
354 template<class T>
355 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
356 T Min (T* const m, T const value) noexcept
357 {
358 AMREX_IF_ON_DEVICE((
359 return Min_device(m, value);
360 ))
361 AMREX_IF_ON_HOST((
362 auto const old = *m;
363 *m = (*m) < value ? (*m) : value;
364 return old;
365 ))
366 }
367
369// Max
371
372#ifdef AMREX_USE_GPU
373
374 template<class T>
375 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
376 T Max_device (T* const m, T const value) noexcept
377 {
378#if defined(__SYCL_DEVICE_ONLY__)
379 constexpr auto mo = sycl::memory_order::relaxed;
380 constexpr auto ms = sycl::memory_scope::device;
381 constexpr auto as = sycl::access::address_space::global_space;
382 sycl::atomic_ref<T,mo,ms,as> a{*m};
383 return a.fetch_max(value);
384#else
385 AMREX_IF_ON_DEVICE(( return atomicMax(m, value); ))
386 AMREX_IF_ON_HOST((
387 amrex::ignore_unused(m,value);
388 return T(); // should never get here, but have to return something
389 ))
390#endif
391 }
392
393 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
394 float Max_device (float* const m, float const value) noexcept
395 {
396 return detail::atomic_op<float,int>(m,value,amrex::Maximum<float>());
397 }
398
399 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
400 double Max_device (double* const m, double const value) noexcept
401 {
402 return detail::atomic_op<double, unsigned long long int>(m,value,amrex::Maximum<double>());
403 }
404
405#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
406
407 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
408 Long Max_device (Long* const m, Long const value) noexcept
409 {
410 return detail::atomic_op<Long, unsigned long long int>(m,value,amrex::Maximum<Long>());
411 }
412
413#endif
414
415#endif
416
417 template<class T>
418 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
419 T Max (T* const m, T const value) noexcept
420 {
421 AMREX_IF_ON_DEVICE((
422 return Max_device(m, value);
423 ))
424 AMREX_IF_ON_HOST((
425 auto const old = *m;
426 *m = (*m) > value ? (*m) : value;
427 return old;
428 ))
429 }
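// Usage sketch (illustrative, not part of the original header): Min/Max keep a
// running extremum in a single location. `dmax` is a hypothetical device
// pointer initialized to std::numeric_limits<amrex::Real>::lowest(). For large
// arrays AMReX's reduction facilities (e.g. ReduceOps) usually scale better;
// the atomic form is convenient for occasional updates:
//
//   amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
//   {
//       amrex::Gpu::Atomic::Max(dmax, data[i]);
//   });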
430
432// LogicalOr
434
435 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
436 int LogicalOr (int* const m, int const value) noexcept
437 {
438#if defined(__SYCL_DEVICE_ONLY__)
439 constexpr auto mo = sycl::memory_order::relaxed;
440 constexpr auto ms = sycl::memory_scope::device;
441 constexpr auto as = sycl::access::address_space::global_space;
442 sycl::atomic_ref<int,mo,ms,as> a{*m};
443 return a.fetch_or(value);
444#else
445 AMREX_IF_ON_DEVICE((
446 return atomicOr(m, value);
447 ))
448 AMREX_IF_ON_HOST((
449 int const old = *m;
450 *m = (*m) || value;
451 return old;
452 ))
453#endif
454 }
455
457// LogicalAnd
459
460 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
461 int LogicalAnd (int* const m, int const value) noexcept
462 {
463#if defined(__SYCL_DEVICE_ONLY__)
464 constexpr auto mo = sycl::memory_order::relaxed;
465 constexpr auto ms = sycl::memory_scope::device;
466 constexpr auto as = sycl::access::address_space::global_space;
467 sycl::atomic_ref<int,mo,ms,as> a{*m};
468 return a.fetch_and(value ? ~0x0 : 0);
469#else
470 AMREX_IF_ON_DEVICE((
471 return atomicAnd(m, value ? ~0x0 : 0);
472 ))
473 AMREX_IF_ON_HOST((
474 int const old = *m;
475 *m = (*m) && value;
476 return old;
477 ))
478#endif
479 }
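// Usage sketch (illustrative, not part of the original header): LogicalOr and
// LogicalAnd operate on an int used as a boolean flag, e.g. recording whether
// any thread encountered a bad value. `flag` is a hypothetical device pointer
// initialized to 0:
//
//   amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
//   {
//       if (data[i] < 0.0) { amrex::Gpu::Atomic::LogicalOr(flag, 1); }
//   });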
480
482// Exch
484
485 template <typename T>
486 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
487 T Exch (T* address, T val) noexcept
488 {
489#if defined(__SYCL_DEVICE_ONLY__)
490 constexpr auto mo = sycl::memory_order::relaxed;
491 constexpr auto ms = sycl::memory_scope::device;
492 constexpr auto as = sycl::access::address_space::global_space;
493 sycl::atomic_ref<T,mo,ms,as> a{*address};
494 return a.exchange(val);
495#else
496 AMREX_IF_ON_DEVICE((
497 return atomicExch(address, val);
498 ))
499 AMREX_IF_ON_HOST((
500 auto const old = *address;
501 *address = val;
502 return old;
503 ))
504#endif
505 }
506
508// CAS
510
511 template <typename T>
512 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
513 T CAS (T* const address, T compare, T const val) noexcept
514 { // cannot be T const compare because of compare_exchange_strong
515#if defined(__SYCL_DEVICE_ONLY__)
516 constexpr auto mo = sycl::memory_order::relaxed;
517 constexpr auto ms = sycl::memory_scope::device;
518 constexpr auto as = sycl::access::address_space::global_space;
519 sycl::atomic_ref<T,mo,ms,as> a{*address};
520 a.compare_exchange_strong(compare, val);
521 return compare;
522#else
523 AMREX_IF_ON_DEVICE((
524 return atomicCAS(address, compare, val);
525 ))
526 AMREX_IF_ON_HOST((
527 auto const old = *address;
528 *address = (old == compare ? val : old);
529 return old;
530 ))
531#endif
532 }
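// Usage sketch (illustrative, not part of the original header): CAS installs
// `val` only if the current value equals `compare`, returning the old value in
// either case, so a thread can claim an entry exactly once. `owner` is a
// hypothetical device array initialized to -1 and `cell` a hypothetical index
// map:
//
//   amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
//   {
//       int prev = amrex::Gpu::Atomic::CAS(&owner[cell[i]], -1, i);
//       if (prev == -1) { /* this thread owns the cell */ }
//   });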
533
535// Multiply
537
538#ifdef AMREX_USE_GPU
539
540 template <typename T, std::enable_if_t<sizeof(T) == sizeof(int), int> = 0>
541 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
542 T Multiply_device (T* const prod, T const value) noexcept
543 {
544 return detail::atomic_op<T, int>(prod,value,amrex::Multiplies<T>());
545 }
546
547 template <typename T, std::enable_if_t<sizeof(T) == sizeof(unsigned long long), int> = 0>
548 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
549 T Multiply_device (T* const prod, T const value) noexcept
550 {
551 return detail::atomic_op<T, unsigned long long>(prod,value,amrex::Multiplies<T>());
552 }
553
554#endif
555
556 template<class T>
557 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
558 T Multiply (T* const prod, T const value) noexcept
559 {
560 AMREX_IF_ON_DEVICE((
561 return Multiply_device(prod, value);
562 ))
563 AMREX_IF_ON_HOST((
564 auto const old = *prod;
565 *prod *= value;
566 return old;
567 ))
568 }
569
571// Divide
573
574#ifdef AMREX_USE_GPU
575
576 template <typename T, std::enable_if_t<sizeof(T) == sizeof(int), int> = 0>
577 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
578 T Divide_device (T* const quot, T const value) noexcept
579 {
580 return detail::atomic_op<T, int>(quot,value,amrex::Divides<T>());
581 }
582
583 template <typename T, std::enable_if_t<sizeof(T) == sizeof(unsigned long long), int> = 0>
584 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
585 T Divide_device (T* const quot, T const value) noexcept
586 {
587 return detail::atomic_op<T, unsigned long long>(quot,value,amrex::Divides<T>());
588 }
589
590#endif
591
592 template<class T>
593 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
594 T Divide (T* const quot, T const value) noexcept
595 {
596 AMREX_IF_ON_DEVICE((
597 return Divide_device(quot, value);
598 ))
599 AMREX_IF_ON_HOST((
600 auto const old = *quot;
601 *quot /= value;
602 return old;
603 ))
604 }
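// Usage sketch (illustrative, not part of the original header): Multiply and
// Divide are built on the compare-and-swap loop in detail::atomic_op, so they
// work for any type with *= and /= whose size matches int or unsigned long
// long, e.g. a running product. `dprod` is a hypothetical device pointer
// initialized to 1.0:
//
//   amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
//   {
//       amrex::Gpu::Atomic::Multiply(dprod, data[i]);
//   });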
605}
606
607namespace HostDevice::Atomic {
608
609 template <class T>
610 AMREX_FORCE_INLINE
611 void Add_Host (T* const sum, T const value) noexcept
612 {
613#ifdef AMREX_USE_OMP
614#pragma omp atomic update
615#endif
616 *sum += value;
617 }
618
619 template <class T>
620 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
621 void Add (T* const sum, T const value) noexcept
622 {
623 AMREX_IF_ON_DEVICE((Gpu::Atomic::AddNoRet(sum,value);))
624 AMREX_IF_ON_HOST((Add_Host(sum,value);))
625 }
626
627}
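// Usage sketch (illustrative, not part of the original header): unlike the
// Gpu::Atomic functions, HostDevice::Atomic::Add is also atomic on the host
// (via an OpenMP atomic update when AMREX_USE_OMP is defined), so it is safe
// in code compiled for both sides. `total` and `contribution` are hypothetical:
//
//   AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
//   void accumulate (amrex::Real* total, amrex::Real contribution) noexcept
//   {
//       amrex::HostDevice::Atomic::Add(total, contribution);
//   }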
628
629#ifdef AMREX_USE_GPU
630// functors
631namespace Gpu {
632 template <typename T>
633 struct AtomicAdd
634 {
635 AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
636 Gpu::Atomic::AddNoRet(dest, source);
637 }
638 };
639
640 template <typename T>
641 struct AtomicMin
642 {
643 AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
644 Gpu::Atomic::Min(dest, source);
645 }
646 };
647
648 template <typename T>
649 struct AtomicMax
650 {
651 AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
652 Gpu::Atomic::Max(dest, source);
653 }
654 };
655
656 template <typename T>
657 struct AtomicLogicalAnd
658 {
659 AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
660 Gpu::Atomic::LogicalAnd(dest, source);
661 }
662 };
663
664 template <typename T>
665 struct AtomicLogicalOr
666 {
667 AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
668 Gpu::Atomic::LogicalOr(dest, source);
669 }
670 };
671}
672#endif
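// Usage sketch (illustrative, not part of the original header): the functors
// above let the atomic combine operation be passed as a template parameter,
// so one generic kernel can serve several update rules. `dst`, `src`, `idx`,
// and the functor choice are hypothetical:
//
//   template <typename T, typename Op>
//   void scatter (T* dst, T const* src, int const* idx, int n, Op op)
//   {
//       amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
//       {
//           op(&dst[idx[i]], src[i]);  // e.g. Op = amrex::Gpu::AtomicAdd<T>
//       });
//   }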
673
674}
675#endif