Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_GpuAtomic.H
Go to the documentation of this file.
1#ifndef AMREX_GPU_ATOMIC_H_
2#define AMREX_GPU_ATOMIC_H_
3#include <AMReX_Config.H>
4
6#include <AMReX_Functional.H>
7#include <AMReX_INT.H>
8
9#include <utility>
10
11namespace amrex {
12
13namespace Gpu::Atomic {
14
15// For Add, Min and Max, we support int, unsigned int, long, unsigned long long, float and double.
16// For Multiply and Divide, we support generic types provided they are the same size as int or unsigned long long
17// and have *= and /= operators.
18// For LogicalOr and LogicalAnd, the data type is int.
19// For Exch and CAS, the data type is generic.
20// All these functions are non-atomic in host code!!!
21// If one needs them to be atomic in host code, use HostDevice::Atomic::*. Currently only
22// HostDevice::Atomic::Add is supported. We could certainly add more.
23// If we add more types for atomicAdd, we also need to update HasAtomicAdd in AMReX_TypeTraits.H.
24
26namespace detail {
27
28#ifdef AMREX_USE_SYCL
29
30 template <typename R, typename I, typename F>
32 R atomic_op (R* const address, R const val, F const f) noexcept
33 {
34#if defined(__SYCL_DEVICE_ONLY__)
35 constexpr auto mo = sycl::memory_order::relaxed;
36 constexpr auto ms = sycl::memory_scope::device;
37 constexpr auto as = sycl::access::address_space::global_space;
38 static_assert(sizeof(R) == sizeof(I), "sizeof R != sizeof I");
39 I* const add_as_I = reinterpret_cast<I*>(address);
40 sycl::atomic_ref<I,mo,ms,as> a{*add_as_I};
41 I old_I = a.load(), new_I;
42 do {
43 R const new_R = f(*(reinterpret_cast<R const*>(&old_I)), val);
44 new_I = *(reinterpret_cast<I const*>(&new_R));
45 } while (! a.compare_exchange_strong(old_I, new_I));
46 return *(reinterpret_cast<R const*>(&old_I));
47#else
48 R old = *address;
49 *address = f(*address, val);
50 return old;
51#endif
52 }
53
54 template <typename R, typename I, typename Op, typename Cond>
56 bool atomic_op_if (R* const address, R const val, Op&& op, Cond&& cond) noexcept
57 {
58#if defined(__SYCL_DEVICE_ONLY__)
59 constexpr auto mo = sycl::memory_order::relaxed;
60 constexpr auto ms = sycl::memory_scope::device;
61 constexpr auto as = sycl::access::address_space::global_space;
62 static_assert(sizeof(R) == sizeof(I), "sizeof R != sizeof I");
63 I* const add_as_I = reinterpret_cast<I*>(address);
64 sycl::atomic_ref<I,mo,ms,as> a{*add_as_I};
65 I old_I = a.load(), new_I;
66 bool test_success;
67 do {
68 R const tmp = op(*(reinterpret_cast<R const*>(&old_I)), val);
69 new_I = *(reinterpret_cast<I const*>(&tmp));
70 test_success = cond(tmp);
71 } while (test_success && ! a.compare_exchange_strong(old_I, new_I));
72 return test_success;
73#else
74 R old = *address;
75 R const tmp = op(*(reinterpret_cast<R const*>(&old)), val);
76 if (cond(tmp)) {
77 *address = tmp;
78 return true;
79 } else {
80 return false;
81 }
82#endif
83 }
84
85#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
86
87 template <typename R, typename I, typename F>
89 R atomic_op (R* const address, R const val, F const f) noexcept
90 {
91 static_assert(sizeof(R) == sizeof(I), "sizeof R != sizeof I");
92 I* const add_as_I = reinterpret_cast<I*>(address);
93 I old_I = *add_as_I, assumed_I;
94 do {
95 assumed_I = old_I;
96 R const new_R = f(*(reinterpret_cast<R const*>(&assumed_I)), val);
97 old_I = atomicCAS(add_as_I, assumed_I, *(reinterpret_cast<I const*>(&new_R)));
98 } while (assumed_I != old_I);
99 return *(reinterpret_cast<R const*>(&old_I));
100 }
101
102 template <typename R, typename I, typename Op, typename Cond>
104 bool atomic_op_if (R* const address, R const val, Op&& op, Cond&& cond) noexcept
105 {
106 static_assert(sizeof(R) == sizeof(I), "sizeof R != sizeof I");
107 I* const add_as_I = reinterpret_cast<I*>(address);
108 I old_I = *add_as_I, assumed_I;
109 bool test_success;
110 do {
111 assumed_I = old_I;
112 R const new_R = op(*(reinterpret_cast<R const*>(&assumed_I)), val);
113 test_success = cond(new_R);
114 if (test_success) {
115 old_I = atomicCAS(add_as_I, assumed_I, *(reinterpret_cast<I const*>(&new_R)));
116 }
117 } while (test_success && assumed_I != old_I);
118 return test_success;
119 }
120
121#endif
122
123}
125
127// Add
129
130#ifdef AMREX_USE_GPU
131
#ifdef AMREX_USE_SYCL
    template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
#else
    template<class T>
#endif
    // Device-only atomic add; returns the old value of *sum.
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T Add_device (T* const sum, T const value) noexcept
    {
#if defined(__SYCL_DEVICE_ONLY__)
        constexpr auto mo = sycl::memory_order::relaxed;
        constexpr auto ms = sycl::memory_scope::device;
        sycl::atomic_ref<T,mo,ms,AS> a{*sum};
        return a.fetch_add(value);
#else
        AMREX_IF_ON_DEVICE(( return atomicAdd(sum, value); ))
        AMREX_IF_ON_HOST((
            amrex::ignore_unused(sum, value);
            return T(); // should never get here, but have to return something
        ))
#endif
    }
153
154#if defined(AMREX_USE_HIP) && defined(__gfx90a__)
155 // https://github.com/ROCm-Developer-Tools/hipamd/blob/rocm-4.5.x/include/hip/amd_detail/amd_hip_unsafe_atomics.h
157 float Add_device (float* const sum, float const value) noexcept
158 {
159 return unsafeAtomicAdd(sum, value);
160 }
161
163 double Add_device (double* const sum, double const value) noexcept
164 {
165 return unsafeAtomicAdd(sum, value);
166 }
167#endif
168
169#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
170
171 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd
172 // https://rocmdocs.amd.com/en/latest/Programming_Guides/Kernel_language.html?#atomic-functions
173 // CUDA and HIP support int, unsigned int, and unsigned long long.
174
176 Long Add_device (Long* const sum, Long const value) noexcept
177 {
178 return detail::atomic_op<Long, unsigned long long>(sum, value, amrex::Plus<Long>());
179 }
180
181#endif
182
183#if defined(AMREX_USE_CUDA) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
184
186 double Add_device (double* const sum, double const value) noexcept
187 {
188 return detail::atomic_op<double, unsigned long long>(sum, value, amrex::Plus<double>());
189 }
190
191#endif
192
193#endif
194
#ifdef AMREX_USE_SYCL
    template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
#else
    template<class T>
#endif
    // Atomic add on device; plain (non-atomic) add on host. Returns the old value.
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    T Add (T* sum, T value) noexcept
    {
#ifdef AMREX_USE_SYCL
        AMREX_IF_ON_DEVICE((return Add_device<T,AS>(sum, value);))
#else
        AMREX_IF_ON_DEVICE((return Add_device(sum, value);))
#endif
        AMREX_IF_ON_HOST((
            auto old = *sum;
            *sum += value;
            return old;
        ))
    }
214
216// If
218
219#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) || defined(AMREX_USE_SYCL)
220 template <typename T, typename Op, typename Cond,
221 std::enable_if_t<sizeof(T) == sizeof(unsigned int), int> foo = 0>
223 bool If_device (T* const sum, T const value, Op&& op, Cond&& cond) noexcept
224 {
225 return detail::atomic_op_if<T, unsigned int>(sum, value,
226 std::forward<Op>(op), std::forward<Cond>(cond));
227 }
228
229 template <typename T, typename Op, typename Cond,
230 std::enable_if_t<sizeof(T) == sizeof(unsigned long long), int> foo = 0>
232 bool If_device (T* const sum, T const value, Op&& op, Cond&& cond) noexcept
233 {
234 return detail::atomic_op_if<T, unsigned long long>(sum, value,
235 std::forward<Op>(op), std::forward<Cond>(cond));
236 }
237#endif
238
255 template<class T, class Op, class Cond>
257 bool If (T* const add, T const value, Op&& op, Cond&& cond) noexcept
258 {
260 return If_device(add, value, std::forward<Op>(op), std::forward<Cond>(cond));
261 ))
263 T old = *add;
264 T const tmp = std::forward<Op>(op)(old, value);
265 if (std::forward<Cond>(cond)(tmp)) {
266 *add = tmp;
267 return true;
268 } else {
269 return false;
270 }
271 ))
272 }
273
275// AddNoRet
277
#ifdef AMREX_USE_SYCL
    template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
#else
    template<class T>
#endif
    // Atomic add that discards the old value (lets the compiler pick cheaper
    // no-return atomics where available). Non-atomic on the host.
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void AddNoRet (T* sum, T value) noexcept
    {
#if defined(__SYCL_DEVICE_ONLY__)
        Add_device<T,AS>(sum, value);
#else
        AMREX_IF_ON_DEVICE((Add_device(sum, value);))
        AMREX_IF_ON_HOST((*sum += value;))
#endif
    }
293
294#if defined(AMREX_USE_HIP) && defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 5)
296 void AddNoRet (float* const sum, float const value) noexcept
297 {
298#pragma clang diagnostic push
299#pragma clang diagnostic ignored "-Wdeprecated-declarations"
300 AMREX_IF_ON_DEVICE((atomicAddNoRet(sum, value);))
301#pragma clang diagnostic pop
302 AMREX_IF_ON_HOST((*sum += value;))
303 }
304#endif
305
307// Min
309
310#ifdef AMREX_USE_GPU
311
312 template<class T>
314 T Min_device (T* const m, T const value) noexcept
315 {
316#if defined(__SYCL_DEVICE_ONLY__)
317 constexpr auto mo = sycl::memory_order::relaxed;
318 constexpr auto ms = sycl::memory_scope::device;
319 constexpr auto as = sycl::access::address_space::global_space;
320 sycl::atomic_ref<T,mo,ms,as> a{*m};
321 return a.fetch_min(value);
322#else
323 AMREX_IF_ON_DEVICE(( return atomicMin(m, value); ))
325 amrex::ignore_unused(m,value);
326 return T(); // should never get here, but have to return something
327 ))
328#endif
329 }
330
332 float Min_device (float* const m, float const value) noexcept
333 {
334 return detail::atomic_op<float,int>(m,value,amrex::Minimum<float>());
335 }
336
338 double Min_device (double* const m, double const value) noexcept
339 {
340 return detail::atomic_op<double, unsigned long long int>(m,value,amrex::Minimum<double>());
341 }
342
343#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
344
346 Long Min_device (Long* const m, Long const value) noexcept
347 {
348 return detail::atomic_op<Long, unsigned long long int>(m,value,amrex::Minimum<Long>());
349 }
350
351#endif
352
353#endif
354
355 template<class T>
357 T Min (T* const m, T const value) noexcept
358 {
360 return Min_device(m, value);
361 ))
363 auto const old = *m;
364 *m = (*m) < value ? (*m) : value;
365 return old;
366 ))
367 }
368
370// Max
372
373#ifdef AMREX_USE_GPU
374
375 template<class T>
377 T Max_device (T* const m, T const value) noexcept
378 {
379#if defined(__SYCL_DEVICE_ONLY__)
380 constexpr auto mo = sycl::memory_order::relaxed;
381 constexpr auto ms = sycl::memory_scope::device;
382 constexpr auto as = sycl::access::address_space::global_space;
383 sycl::atomic_ref<T,mo,ms,as> a{*m};
384 return a.fetch_max(value);
385#else
386 AMREX_IF_ON_DEVICE(( return atomicMax(m, value); ))
388 amrex::ignore_unused(m,value);
389 return T(); // should never get here, but have to return something
390 ))
391#endif
392 }
393
395 float Max_device (float* const m, float const value) noexcept
396 {
397 return detail::atomic_op<float,int>(m,value,amrex::Maximum<float>());
398 }
399
401 double Max_device (double* const m, double const value) noexcept
402 {
403 return detail::atomic_op<double, unsigned long long int>(m,value,amrex::Maximum<double>());
404 }
405
406#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
407
409 Long Max_device (Long* const m, Long const value) noexcept
410 {
411 return detail::atomic_op<Long, unsigned long long int>(m,value,amrex::Maximum<Long>());
412 }
413
414#endif
415
416#endif
417
418 template<class T>
420 T Max (T* const m, T const value) noexcept
421 {
423 return Max_device(m, value);
424 ))
426 auto const old = *m;
427 *m = (*m) > value ? (*m) : value;
428 return old;
429 ))
430 }
431
433// LogicalOr
435
437 int LogicalOr (int* const m, int const value) noexcept
438 {
439#if defined(__SYCL_DEVICE_ONLY__)
440 constexpr auto mo = sycl::memory_order::relaxed;
441 constexpr auto ms = sycl::memory_scope::device;
442 constexpr auto as = sycl::access::address_space::global_space;
443 sycl::atomic_ref<int,mo,ms,as> a{*m};
444 return a.fetch_or(value);
445#else
447 return atomicOr(m, value);
448 ))
450 int const old = *m;
451 *m = (*m) || value;
452 return old;
453 ))
454#endif
455 }
456
458// LogicalAnd
460
462 int LogicalAnd (int* const m, int const value) noexcept
463 {
464#if defined(__SYCL_DEVICE_ONLY__)
465 constexpr auto mo = sycl::memory_order::relaxed;
466 constexpr auto ms = sycl::memory_scope::device;
467 constexpr auto as = sycl::access::address_space::global_space;
468 sycl::atomic_ref<int,mo,ms,as> a{*m};
469 return a.fetch_and(value ? ~0x0 : 0);
470#else
472 return atomicAnd(m, value ? ~0x0 : 0);
473 ))
475 int const old = *m;
476 *m = (*m) && value;
477 return old;
478 ))
479#endif
480 }
481
483// Exch
485
486 template <typename T>
488 T Exch (T* address, T val) noexcept
489 {
490#if defined(__SYCL_DEVICE_ONLY__)
491 constexpr auto mo = sycl::memory_order::relaxed;
492 constexpr auto ms = sycl::memory_scope::device;
493 constexpr auto as = sycl::access::address_space::global_space;
494 sycl::atomic_ref<T,mo,ms,as> a{*address};
495 return a.exchange(val);
496#else
498 return atomicExch(address, val);
499 ))
501 auto const old = *address;
502 *address = val;
503 return old;
504 ))
505#endif
506 }
507
509// CAS
511
512 template <typename T>
514 T CAS (T* const address, T compare, T const val) noexcept
515 { // cannot be T const compare because of compare_exchange_strong
516#if defined(__SYCL_DEVICE_ONLY__)
517 constexpr auto mo = sycl::memory_order::relaxed;
518 constexpr auto ms = sycl::memory_scope::device;
519 constexpr auto as = sycl::access::address_space::global_space;
520 sycl::atomic_ref<T,mo,ms,as> a{*address};
521 a.compare_exchange_strong(compare, val);
522 return compare;
523#else
525 return atomicCAS(address, compare, val);
526 ))
528 auto const old = *address;
529 *address = (old == compare ? val : old);
530 return old;
531 ))
532#endif
533 }
534
536// Multiply
538
539#ifdef AMREX_USE_GPU
540
541 template <typename T, std::enable_if_t<sizeof(T) == sizeof(int), int> = 0>
543 T Multiply_device (T* const prod, T const value) noexcept
544 {
545 return detail::atomic_op<T, int>(prod,value,amrex::Multiplies<T>());
546 }
547
548 template <typename T, std::enable_if_t<sizeof(T) == sizeof(unsigned long long), int> = 0>
550 T Multiply_device (T* const prod, T const value) noexcept
551 {
552 return detail::atomic_op<T, unsigned long long>(prod,value,amrex::Multiplies<T>());
553 }
554
555#endif
556
557 template<class T>
559 T Multiply (T* const prod, T const value) noexcept
560 {
562 return Multiply_device(prod, value);
563 ))
565 auto const old = *prod;
566 *prod *= value;
567 return old;
568 ))
569 }
570
572// Divide
574
575#ifdef AMREX_USE_GPU
576
577 template <typename T, std::enable_if_t<sizeof(T) == sizeof(int), int> = 0>
579 T Divide_device (T* const quot, T const value) noexcept
580 {
581 return detail::atomic_op<T, int>(quot,value,amrex::Divides<T>());
582 }
583
584 template <typename T, std::enable_if_t<sizeof(T) == sizeof(unsigned long long), int> = 0>
586 T Divide_device (T* const quot, T const value) noexcept
587 {
588 return detail::atomic_op<T, unsigned long long>(quot,value,amrex::Divides<T>());
589 }
590
591#endif
592
593 template<class T>
595 T Divide (T* const quot, T const value) noexcept
596 {
598 return Divide_device(quot, value);
599 ))
601 auto const old = *quot;
602 *quot /= value;
603 return old;
604 ))
605 }
606}
607
608namespace HostDevice::Atomic {
609
610 template <class T>
612 void Add_Host (T* const sum, T const value) noexcept
613 {
614#ifdef AMREX_USE_OMP
615#pragma omp atomic update
616#endif
617 *sum += value;
618 }
619
620 template <class T>
622 void Add (T* const sum, T const value) noexcept
623 {
625 AMREX_IF_ON_HOST((Add_Host(sum,value);))
626 }
627
628}
629
630#ifdef AMREX_USE_GPU
631// functors
632namespace Gpu {
633 template <typename T>
635 {
636 AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
637 Gpu::Atomic::AddNoRet(dest, source);
638 }
639 };
640
641 template <typename T>
643 {
644 AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
645 Gpu::Atomic::Min(dest, source);
646 }
647 };
648
649 template <typename T>
651 {
652 AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
653 Gpu::Atomic::Max(dest, source);
654 }
655 };
656
657 template <typename T>
659 {
660 AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
661 Gpu::Atomic::LogicalAnd(dest, source);
662 }
663 };
664
665 template <typename T>
667 {
668 AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
669 Gpu::Atomic::LogicalOr(dest, source);
670 }
671 };
672}
673#endif
674
675}
676#endif
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_IF_ON_DEVICE(CODE)
Definition AMReX_GpuQualifiers.H:56
#define AMREX_IF_ON_HOST(CODE)
Definition AMReX_GpuQualifiers.H:58
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
amrex_long Long
Definition AMReX_INT.H:30
__host__ __device__ AMREX_FORCE_INLINE T Exch(T *address, T val) noexcept
Definition AMReX_GpuAtomic.H:488
__device__ AMREX_FORCE_INLINE bool If_device(T *const sum, T const value, Op &&op, Cond &&cond) noexcept
Definition AMReX_GpuAtomic.H:223
__device__ AMREX_FORCE_INLINE T Divide_device(T *const quot, T const value) noexcept
Definition AMReX_GpuAtomic.H:579
__host__ __device__ AMREX_FORCE_INLINE int LogicalAnd(int *const m, int const value) noexcept
Definition AMReX_GpuAtomic.H:462
__host__ __device__ AMREX_FORCE_INLINE T CAS(T *const address, T compare, T const val) noexcept
Definition AMReX_GpuAtomic.H:514
__host__ __device__ AMREX_FORCE_INLINE T Multiply(T *const prod, T const value) noexcept
Definition AMReX_GpuAtomic.H:559
__host__ __device__ AMREX_FORCE_INLINE int LogicalOr(int *const m, int const value) noexcept
Definition AMReX_GpuAtomic.H:437
__device__ AMREX_FORCE_INLINE T Multiply_device(T *const prod, T const value) noexcept
Definition AMReX_GpuAtomic.H:543
__host__ __device__ AMREX_FORCE_INLINE bool If(T *const add, T const value, Op &&op, Cond &&cond) noexcept
Conditionally perform an atomic operation.
Definition AMReX_GpuAtomic.H:257
__device__ AMREX_FORCE_INLINE T Max_device(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:377
__host__ __device__ AMREX_FORCE_INLINE void AddNoRet(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:284
__device__ AMREX_FORCE_INLINE T Min_device(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:314
__host__ __device__ AMREX_FORCE_INLINE T Add(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:201
__device__ AMREX_FORCE_INLINE T Add_device(T *const sum, T const value) noexcept
Definition AMReX_GpuAtomic.H:138
__host__ __device__ AMREX_FORCE_INLINE T Max(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:420
__host__ __device__ AMREX_FORCE_INLINE T Min(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:357
__host__ __device__ AMREX_FORCE_INLINE T Divide(T *const quot, T const value) noexcept
Definition AMReX_GpuAtomic.H:595
__host__ __device__ AMREX_FORCE_INLINE void Add(T *const sum, T const value) noexcept
Definition AMReX_GpuAtomic.H:622
AMREX_FORCE_INLINE void Add_Host(T *const sum, T const value) noexcept
Definition AMReX_GpuAtomic.H:612
Definition AMReX_Amr.cpp:49
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
Definition AMReX_Functional.H:77
Definition AMReX_GpuAtomic.H:635
__device__ void operator()(T *const dest, T const source) noexcept
Definition AMReX_GpuAtomic.H:636
Definition AMReX_GpuAtomic.H:659
__device__ void operator()(T *const dest, T const source) noexcept
Definition AMReX_GpuAtomic.H:660
Definition AMReX_GpuAtomic.H:667
__device__ void operator()(T *const dest, T const source) noexcept
Definition AMReX_GpuAtomic.H:668
Definition AMReX_GpuAtomic.H:651
__device__ void operator()(T *const dest, T const source) noexcept
Definition AMReX_GpuAtomic.H:652
Definition AMReX_GpuAtomic.H:643
__device__ void operator()(T *const dest, T const source) noexcept
Definition AMReX_GpuAtomic.H:644
Definition AMReX_Functional.H:50
Definition AMReX_Functional.H:59
Definition AMReX_Functional.H:41
Definition AMReX_Functional.H:32
Definition AMReX_Functional.H:68
Definition AMReX_Functional.H:14