| /* Macros for atomic functionality for tile. |
| Copyright (C) 2011-2021 Free Software Foundation, Inc. |
| Contributed by Walter Lee (walt@tilera.com) |
| |
| This file is free software; you can redistribute it and/or modify it |
| under the terms of the GNU General Public License as published by the |
| Free Software Foundation; either version 3, or (at your option) any |
| later version. |
| |
| This file is distributed in the hope that it will be useful, but |
| WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| General Public License for more details. |
| |
| Under Section 7 of GPL version 3, you are granted additional |
| permissions described in the GCC Runtime Library Exception, version |
| 3.1, as published by the Free Software Foundation. |
| |
| You should have received a copy of the GNU General Public License and |
| a copy of the GCC Runtime Library Exception along with this program; |
| see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| <http://www.gnu.org/licenses/>. */ |
| |
| |
| /* Provides macros for common atomic functionality. */ |
| |
| #ifndef _ATOMIC_H_ |
| #define _ATOMIC_H_ |
| |
| #ifdef __tilegx__ |
| /* Atomic instruction macros |
| |
| The macros provided by atomic.h simplify access to the TILE-Gx |
| architecture's atomic instructions. The architecture provides a |
| variety of atomic instructions, including "exchange", "compare and |
| exchange", "fetch and ADD", "fetch and AND", "fetch and OR", and |
| "fetch and ADD if greater than or equal to zero". |
| |
| No barrier or fence semantics are implied by any of the atomic |
| instructions for manipulating memory; you must explicitly specify |
| any barriers you need, using the provided macros. |
| |
| Any integral 32- or 64-bit value can be used as the argument |
| to these macros, such as "int", "long long", "unsigned long", etc. |
| The pointers must be 4-byte aligned for 32-bit data and 8-byte |
| aligned for 64-bit data. |
| The "exchange" and "compare and exchange" macros may also take |
| pointer values. We use the pseudo-type "VAL" in the documentation |
| to indicate the use of an appropriate type. */ |
| #else |
| /* Atomic instruction macros |
| |
| The macros provided by atomic.h simplify access to the Tile |
| architecture's atomic instructions. Since the architecture |
| supports test-and-set as its only in-silicon atomic operation, many |
| of the operations provided by this header are implemented as |
| fast-path calls to Linux emulation routines. |
| |
| Using the kernel for atomic operations allows userspace to take |
| advantage of the kernel's existing atomic-integer support (managed |
| by a distributed array of locks). The kernel provides proper |
| ordering among simultaneous atomic operations on different cores, |
| and guarantees a process cannot be context-switched part way |
| through an atomic operation. By virtue of sharing the kernel |
| atomic implementation, the userspace atomic operations |
| are compatible with the atomic methods provided by the kernel's |
| futex() syscall API. Note that these operations never cause Linux |
| kernel scheduling, and are in fact invisible to the kernel; they |
| simply act as regular function calls but with an elevated privilege |
| level. Note that the kernel's distributed lock array is hashed by |
| using only VA bits from the atomic value's address (to avoid the |
| performance hit of page table locking and multiple page-table |
| lookups to get the PA) and only the VA bits that are below page |
| granularity (to properly lock simultaneous accesses to the same |
| page mapped at different VAs). As a result, simultaneous atomic |
| operations on values whose addresses are at the same offset on a |
| page will contend in the kernel for the same lock array element. |
| |
| No barrier or fence semantics are implied by any of the atomic |
| instructions for manipulating memory; you must explicitly specify |
| any barriers you need, using the provided macros. |
| |
| Any integral 32- or 64-bit value can be used as the argument |
| to these macros, such as "int", "long long", "unsigned long", etc. |
| The pointers must be 4-byte aligned for 32-bit data and 8-byte |
| aligned for 64-bit data. |
| The "exchange" and "compare and exchange" macros may also take |
| pointer values. We use the pseudo-type "VAL" in the documentation |
| to indicate the use of an appropriate type. |
| |
| The 32-bit routines are implemented using a single kernel fast |
| syscall, as is the 64-bit compare-and-exchange. The other 64-bit |
| routines are implemented by looping over the 64-bit |
| compare-and-exchange routine, and so may be less efficient. */ |
| #endif |
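| |
| /* As an illustrative example (hypothetical, for exposition only): |
| given a variable declared as "long long counter", one might write, |
| using the arch_atomic_add macro defined below, |
| |
| long long prev = arch_atomic_add (&counter, 10); |
| |
| where the result takes the type of the object being operated on |
| (here "long long"), i.e. the "VAL" pseudo-type described above. */ |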
| |
| #ifdef __tilegx__ |
| #define SPR_CMPEXCH_VALUE 0x2780 |
| #else |
| #define __NR_FAST_cmpxchg -1 |
| #define __NR_FAST_atomic_update -2 |
| #define __NR_FAST_cmpxchg64 -3 |
| #endif |
| |
| |
| /* 32-bit integer compare-and-exchange. */ |
| static __inline __attribute__ ((always_inline)) |
| int arch_atomic_val_compare_and_exchange_4 (volatile int *mem, |
| int oldval, int newval) |
| { |
| #ifdef __tilegx__ |
| __insn_mtspr (SPR_CMPEXCH_VALUE, oldval); |
| return __insn_cmpexch4 (mem, newval); |
| #else |
| int result; |
| __asm__ __volatile__ ("swint1" |
| : "=R00" (result), "=m" (*mem) |
| : "R10" (__NR_FAST_cmpxchg), "R00" (mem), |
| "R01" (oldval), "R02" (newval), "m" (*mem) |
| : "r20", "r21", "r22", "r23", "r24", "r25", |
| "r26", "r27", "r28", "r29", "memory"); |
| return result; |
| #endif |
| } |
| |
| /* 64-bit integer compare-and-exchange. */ |
| static __inline __attribute__ ((always_inline)) |
| long long arch_atomic_val_compare_and_exchange_8 (volatile long long |
| *mem, long long oldval, |
| long long newval) |
| { |
| #ifdef __tilegx__ |
| __insn_mtspr (SPR_CMPEXCH_VALUE, oldval); |
| return __insn_cmpexch (mem, newval); |
| #else |
| unsigned int result_lo, result_hi; |
| unsigned int oldval_lo = oldval & 0xffffffffu, oldval_hi = oldval >> 32; |
| unsigned int newval_lo = newval & 0xffffffffu, newval_hi = newval >> 32; |
| __asm__ __volatile__ ("swint1" |
| : "=R00" (result_lo), "=R01" (result_hi), "=m" (*mem) |
| : "R10" (__NR_FAST_cmpxchg64), "R00" (mem), |
| "R02" (oldval_lo), "R03" (oldval_hi), |
| "R04" (newval_lo), "R05" (newval_hi), "m" (*mem) |
| : "r20", "r21", "r22", "r23", "r24", "r25", |
| "r26", "r27", "r28", "r29", "memory"); |
| return ((long long) result_hi) << 32 | result_lo; |
| #endif |
| } |
| |
| /* This non-existent symbol is called for sizes other than "4" and "8", |
| indicating a bug in the caller. */ |
| extern int __arch_atomic_error_bad_argument_size (void) |
| __attribute__ ((warning ("sizeof atomic argument not 4 or 8"))); |
| |
| |
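| /* Compare-and-exchange, dispatching on sizeof(*mem) to the 4- or |
| 8-byte routine above. The "val" form returns the value previously |
| held in *mem; the "bool" form returns nonzero if the exchange |
| took place (i.e. *mem was equal to "o"). */ |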
| #define arch_atomic_val_compare_and_exchange(mem, o, n) \ |
| __extension__ ({ \ |
| (__typeof(*(mem)))(__typeof(*(mem)-*(mem))) \ |
| ((sizeof(*(mem)) == 8) ? \ |
| arch_atomic_val_compare_and_exchange_8( \ |
| (volatile long long*)(mem), (__typeof((o)-(o)))(o), \ |
| (__typeof((n)-(n)))(n)) : \ |
| (sizeof(*(mem)) == 4) ? \ |
| arch_atomic_val_compare_and_exchange_4( \ |
| (volatile int*)(mem), (__typeof((o)-(o)))(o), \ |
| (__typeof((n)-(n)))(n)) : \ |
| __arch_atomic_error_bad_argument_size()); \ |
| }) |
| |
| #define arch_atomic_bool_compare_and_exchange(mem, o, n) \ |
| __extension__ ({ \ |
| __typeof(o) __o = (o); \ |
| __builtin_expect( \ |
| __o == arch_atomic_val_compare_and_exchange((mem), __o, (n)), 1); \ |
| }) |
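| |
| /* Illustrative sketch only (the helper name below is hypothetical |
| and not part of this interface): claim a word by atomically moving |
| it from zero to a new value, returning nonzero on success. */ |
| static __inline __attribute__ ((always_inline)) |
| int |
| __arch_atomic_example_set_if_zero (volatile int *mem, int newval) |
| { |
| /* The store takes effect only if *mem still holds zero. */ |
| return arch_atomic_bool_compare_and_exchange (mem, 0, newval); |
| } |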
| |
| |
| /* Loop with compare_and_exchange until we guess the correct value. |
| Normally "expr" will be an expression using __old and __value. */ |
| #define __arch_atomic_update_cmpxchg(mem, value, expr) \ |
| __extension__ ({ \ |
| __typeof(value) __value = (value); \ |
| __typeof(*(mem)) *__mem = (mem), __old = *__mem, __guess; \ |
| do { \ |
| __guess = __old; \ |
| __old = arch_atomic_val_compare_and_exchange(__mem, __old, (expr)); \ |
| } while (__builtin_expect(__old != __guess, 0)); \ |
| __old; \ |
| }) |
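| |
| /* Illustrative sketch only (hypothetical helper, not part of this |
| interface): "expr" is written in terms of __old and __value, as |
| described above; here it atomically raises *mem to at least VALUE |
| and returns the previous contents. */ |
| static __inline __attribute__ ((always_inline)) |
| int |
| __arch_atomic_example_fetch_max (volatile int *mem, int value) |
| { |
| return __arch_atomic_update_cmpxchg (mem, value, |
| __old > __value ? __old : __value); |
| } |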
| |
| #ifdef __tilegx__ |
| |
| /* Generic atomic op with 8- or 4-byte variant. |
| The _mask, _addend, and _expr arguments are ignored on tilegx. */ |
| #define __arch_atomic_update(mem, value, op, _mask, _addend, _expr) \ |
| __extension__ ({ \ |
| ((__typeof(*(mem))) \ |
| ((sizeof(*(mem)) == 8) ? (__typeof(*(mem)-*(mem)))__insn_##op( \ |
| (volatile void *)(mem), \ |
| (long long)(__typeof((value)-(value)))(value)) : \ |
| (sizeof(*(mem)) == 4) ? (int)__insn_##op##4( \ |
| (volatile void *)(mem), \ |
| (int)(__typeof((value)-(value)))(value)) : \ |
| __arch_atomic_error_bad_argument_size())); \ |
| }) |
| |
| #else |
| |
| /* This uses TILEPro's fast syscall support to atomically compute: |
| |
| int old = *ptr; |
| *ptr = (old & mask) + addend; |
| return old; |
| |
| This primitive can be used for atomic exchange, add, or, and. |
| Only 32-bit support is provided. */ |
| static __inline __attribute__ ((always_inline)) |
| int |
| __arch_atomic_update_4 (volatile int *mem, int mask, int addend) |
| { |
| int result; |
| __asm__ __volatile__ ("swint1" |
| : "=R00" (result), "=m" (*mem) |
| : "R10" (__NR_FAST_atomic_update), |
| "R00" (mem), "R01" (mask), "R02" (addend), "m" (*mem) |
| : "r20", "r21", "r22", "r23", "r24", "r25", |
| "r26", "r27", "r28", "r29", "memory"); |
| return result; |
| } |
| |
| /* Generic atomic op with 8- or 4-byte variant. |
| The _op argument is ignored on tilepro. */ |
| #define __arch_atomic_update(mem, value, _op, mask, addend, expr) \ |
| __extension__ ({ \ |
| (__typeof(*(mem)))(__typeof(*(mem)-*(mem))) \ |
| ((sizeof(*(mem)) == 8) ? \ |
| __arch_atomic_update_cmpxchg((mem), (value), (expr)) : \ |
| (sizeof(*(mem)) == 4) ? \ |
| __arch_atomic_update_4((volatile int*)(mem), \ |
| (__typeof((mask)-(mask)))(mask), \ |
| (__typeof((addend)-(addend)))(addend)) : \ |
| __arch_atomic_error_bad_argument_size()); \ |
| }) |
| |
| #endif /* __tilegx__ */ |
| |
| |
| #define arch_atomic_exchange(mem, newvalue) \ |
| __arch_atomic_update(mem, newvalue, exch, 0, newvalue, __value) |
| |
| #define arch_atomic_add(mem, value) \ |
| __arch_atomic_update(mem, value, fetchadd, -1, value, __old + __value) |
| |
| #define arch_atomic_sub(mem, value) arch_atomic_add((mem), -(value)) |
| |
| #define arch_atomic_increment(mem) arch_atomic_add((mem), 1) |
| |
| #define arch_atomic_decrement(mem) arch_atomic_add((mem), -1) |
| |
| #define arch_atomic_and(mem, mask) \ |
| __arch_atomic_update(mem, mask, fetchand, mask, 0, __old & __value) |
| |
| #define arch_atomic_or(mem, mask) \ |
| __arch_atomic_update(mem, mask, fetchor, ~mask, mask, __old | __value) |
| |
| #define arch_atomic_xor(mem, mask) \ |
| __arch_atomic_update_cmpxchg(mem, mask, __old ^ __value) |
| |
| #define arch_atomic_nand(mem, mask) \ |
| __arch_atomic_update_cmpxchg(mem, mask, ~(__old & __value)) |
| |
| #define arch_atomic_bit_set(mem, bit) \ |
| __extension__ ({ \ |
| __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit); \ |
| __mask & arch_atomic_or((mem), __mask); \ |
| }) |
| |
| #define arch_atomic_bit_clear(mem, bit) \ |
| __extension__ ({ \ |
| __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit); \ |
| __mask & arch_atomic_and((mem), ~__mask); \ |
| }) |
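| |
| /* Illustrative sketch only (hypothetical helper, not part of this |
| interface): the update macros above return the value the memory |
| held before the operation, so a reference-count release can detect |
| the final reference without a separate read. */ |
| static __inline __attribute__ ((always_inline)) |
| int |
| __arch_atomic_example_release_ref (volatile int *count) |
| { |
| /* Nonzero if this call dropped the count from 1 to 0. */ |
| return arch_atomic_add (count, -1) == 1; |
| } |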
| |
| #ifdef __tilegx__ |
| /* Atomically store a new value to memory. |
| Note that you can freely use types of any size here, unlike the |
| other atomic routines, which require 32- or 64-bit types. |
| This accessor is provided for compatibility with TILEPro, which |
| required an explicit atomic operation for stores that needed |
| to be atomic with respect to other atomic methods in this header. */ |
| #define arch_atomic_write(mem, value) ((void) (*(mem) = (value))) |
| #else |
| #define arch_atomic_write(mem, value) \ |
| do { \ |
| __typeof(mem) __aw_mem = (mem); \ |
| __typeof(value) __aw_val = (value); \ |
| unsigned int *__aw_mem32, __aw_intval, __aw_val32, __aw_off, __aw_mask; \ |
| __aw_intval = (__typeof((value) - (value)))__aw_val; \ |
| switch (sizeof(*__aw_mem)) { \ |
| case 8: \ |
| __arch_atomic_update_cmpxchg(__aw_mem, __aw_val, __value); \ |
| break; \ |
| case 4: \ |
| __arch_atomic_update_4((int *)__aw_mem, 0, __aw_intval); \ |
| break; \ |
| case 2: \ |
| __aw_off = 8 * ((long)__aw_mem & 0x2); \ |
| __aw_mask = 0xffffU << __aw_off; \ |
| __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x2); \ |
| __aw_val32 = (__aw_intval << __aw_off) & __aw_mask; \ |
| __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32, \ |
| (__old & ~__aw_mask) | __value); \ |
| break; \ |
| case 1: \ |
| __aw_off = 8 * ((long)__aw_mem & 0x3); \ |
| __aw_mask = 0xffU << __aw_off; \ |
| __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x3); \ |
| __aw_val32 = (__aw_intval << __aw_off) & __aw_mask; \ |
| __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32, \ |
| (__old & ~__aw_mask) | __value); \ |
| break; \ |
| } \ |
| } while (0) |
| #endif |
| |
| /* Compiler barrier. |
| |
| This macro prevents loads or stores from being moved by the compiler |
| across the macro. Any loaded value that was loaded before this |
| macro must then be reloaded by the compiler. */ |
| #define arch_atomic_compiler_barrier() __asm__ __volatile__("" ::: "memory") |
| |
| /* Full memory barrier. |
| |
| This macro has the semantics of arch_atomic_compiler_barrier(), but also |
| ensures that previous stores are visible to other cores, and that |
| all previous loaded values have been placed into their target |
| register on this core. */ |
| #define arch_atomic_full_barrier() __insn_mf() |
| |
| /* Read memory barrier. |
| |
| Ensure that all reads by this processor that occurred prior to the |
| read memory barrier have completed, and that no reads that occur |
| after the read memory barrier on this processor are initiated |
| before the barrier. |
| |
| On current TILE chips a read barrier is implemented as a full barrier, |
| but this may not be true in later versions of the architecture. |
| |
| See also arch_atomic_acquire_barrier() for the appropriate idiom to use |
| to ensure no reads are lifted above an atomic lock instruction. */ |
| #define arch_atomic_read_barrier() arch_atomic_full_barrier() |
| |
| /* Write memory barrier. |
| |
| Ensure that all writes by this processor that occurred prior to the |
| write memory barrier have completed, and that no writes that occur |
| after the write memory barrier on this processor are initiated |
| before the barrier. |
| |
| On current TILE chips a write barrier is implemented as a full barrier, |
| but this may not be true in later versions of the architecture. |
| |
| See also arch_atomic_release_barrier() for the appropriate idiom to use |
| to ensure all writes are complete prior to an atomic unlock instruction. */ |
| #define arch_atomic_write_barrier() arch_atomic_full_barrier() |
| |
| /* Lock acquisition barrier. |
| |
| Ensure that no load operations that follow this macro in the |
| program can issue prior to the barrier. Without such a barrier, |
| the compiler can reorder them to issue earlier, or the hardware can |
| issue them speculatively. The latter is not currently done in the |
| Tile microarchitecture, but using this operation improves |
| portability to future implementations. |
| |
| This operation is intended to be used as part of the "acquire" |
| path for locking, that is, when entering a critical section. |
| This should be done after the atomic operation that actually |
| acquires the lock, and in conjunction with a "control dependency" |
| that checks the atomic operation result to see if the lock was |
| in fact acquired. See the arch_atomic_read_barrier() macro |
| for a heavier-weight barrier to use in certain unusual constructs, |
| or arch_atomic_acquire_barrier_value() if no control dependency exists. */ |
| #define arch_atomic_acquire_barrier() arch_atomic_compiler_barrier() |
| |
| /* Lock release barrier. |
| |
| Ensure that no store operations that precede this macro in the |
| program complete subsequent to the barrier. Without such a |
| barrier, the compiler can reorder stores to issue later, or stores |
| can be still outstanding in the memory network. |
| |
| This operation is intended to be used as part of the "release" path |
| for locking, that is, when leaving a critical section. This should |
| be done before the operation (such as a store of zero) that |
| actually releases the lock. */ |
| #define arch_atomic_release_barrier() arch_atomic_write_barrier() |
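| |
| /* Illustrative lock sketch only (hypothetical helpers, not part of |
| this interface), showing where the acquire and release barriers sit |
| relative to the atomic operations, as described above. The while |
| loop provides the control dependency on the result of the atomic |
| exchange. */ |
| static __inline __attribute__ ((always_inline)) |
| void |
| __arch_atomic_example_spin_lock (volatile int *lock) |
| { |
| /* Spin until the previous value observed was 0 (unlocked). */ |
| while (arch_atomic_exchange (lock, 1) != 0) |
| ; |
| /* Keep critical-section loads from issuing before the lock is held. */ |
| arch_atomic_acquire_barrier (); |
| } |
| |
| static __inline __attribute__ ((always_inline)) |
| void |
| __arch_atomic_example_spin_unlock (volatile int *lock) |
| { |
| /* Make critical-section stores visible before releasing the lock. */ |
| arch_atomic_release_barrier (); |
| arch_atomic_write (lock, 0); |
| } |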
| |
| /* Barrier until the read of a particular value is complete. |
| |
| This is occasionally useful when constructing certain locking |
| scenarios. For example, you might write a routine that issues an |
| atomic instruction to enter a critical section, then reads one or |
| more values within the critical section without checking to see if |
| the critical section was in fact acquired, and only later checks |
| the atomic instruction result to see if the lock was acquired. If |
| so, the routine could properly release the lock and know that the |
| values that were read were valid. |
| |
| In this scenario, it is required to wait for the result of the |
| atomic instruction, even if the value itself is not checked. This |
| guarantees that if the atomic instruction succeeded in taking the lock, |
| the lock was held before any reads in the critical section issued. */ |
| #define arch_atomic_acquire_barrier_value(val) \ |
| __asm__ __volatile__("move %0, %0" :: "r"(val)) |
| |
| /* Access the given variable in memory exactly once. |
| |
| In some contexts, an algorithm may need to force access to memory, |
| since otherwise the compiler may think it can optimize away a |
| memory load or store; for example, in a loop when polling memory to |
| see if another cpu has updated it yet. Generally this is only |
| required for certain very carefully hand-tuned algorithms; using it |
| unnecessarily may result in performance losses. |
| |
| A related use of this macro is to ensure that the compiler does not |
| rematerialize the value of "x" by reloading it from memory |
| unexpectedly; the "volatile" marking will prevent the compiler from |
| being able to rematerialize. This is helpful if an algorithm needs |
| to read a variable without locking, but needs it to have the same |
| value if it ends up being used several times within the algorithm. |
| |
| Note that multiple uses of this macro are guaranteed to be ordered, |
| i.e. the compiler will not reorder stores or loads that are wrapped |
| in arch_atomic_access_once(). */ |
| #define arch_atomic_access_once(x) (*(volatile __typeof(x) *)&(x)) |
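| |
| /* Illustrative sketch only (hypothetical helper, not part of this |
| interface): poll a flag written by another core, forcing a real |
| load on each iteration, then order later reads after the observed |
| value. */ |
| static __inline __attribute__ ((always_inline)) |
| int |
| __arch_atomic_example_wait_for_flag (int *flag) |
| { |
| int observed; |
| while ((observed = arch_atomic_access_once (*flag)) == 0) |
| ; |
| arch_atomic_read_barrier (); |
| return observed; |
| } |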
| |
| |
| |
| #endif /* !_ATOMIC_H_ */ |