/* libgcc/config/tilepro/atomic.c - gcc - Git at Google */

 /* TILE atomics.
    Copyright (C) 2011-2021 Free Software Foundation, Inc.
    Contributed by Walter Lee (walt@tilera.com)

    This file is free software; you can redistribute it and/or modify it
    under the terms of the GNU General Public License as published by the
    Free Software Foundation; either version 3, or (at your option) any
    later version.

    This file is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    General Public License for more details.

    Under Section 7 of GPL version 3, you are granted additional
    permissions described in the GCC Runtime Library Exception, version
    3.1, as published by the Free Software Foundation.

    You should have received a copy of the GNU General Public License and
    a copy of the GCC Runtime Library Exception along with this program;
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */

 #include "tconfig.h"
 #include "coretypes.h"
 #include "atomic.h"

 /* Boolean results in this file are carried in an unsigned char.  */
 #define bool unsigned char

 /* This code should be inlined by the compiler, but for now support
    it as out-of-line methods in libgcc.  */

 /* Fence issued ahead of an atomic operation.  A thread fence with
    MODEL is emitted when MODEL is __ATOMIC_RELEASE, __ATOMIC_ACQ_REL
    or __ATOMIC_SEQ_CST; any other MODEL emits nothing.  */
 static inline void
 pre_atomic_barrier (int model)
 {
   if (model == __ATOMIC_RELEASE
       || model == __ATOMIC_ACQ_REL
       || model == __ATOMIC_SEQ_CST)
     __atomic_thread_fence (model);
 }

 /* Fence issued after an atomic operation.  A thread fence with MODEL
    is emitted when MODEL is __ATOMIC_ACQUIRE, __ATOMIC_ACQ_REL or
    __ATOMIC_SEQ_CST; any other MODEL emits nothing.  */
 static inline void
 post_atomic_barrier (int model)
 {
   if (model == __ATOMIC_ACQUIRE
       || model == __ATOMIC_ACQ_REL
       || model == __ATOMIC_SEQ_CST)
     __atomic_thread_fence (model);
 }

 /* Attribute for parameters that a function accepts but does not
    read.  */
 #define __unused __attribute__((unused))

 /* Emit a function with prototype PROTO.  Its body runs TOP, calls
    arch_atomic_<OPNAME> on the prototype's `p' and `i' parameters,
    runs BOTTOM, and returns whatever the arch_atomic call produced.
    SIZE is accepted but does not appear in the expansion.  */
 #define __fetch_and_do(proto, type, size, opname, top, bottom)	\
 proto								\
 {								\
   top;								\
   type result = arch_atomic_##opname (p, i);			\
   bottom;							\
   return result;						\
 }

 /* Generate __atomic_fetch_<OPNAME>_<SIZE> (p, i, model) for TYPE.
    Through __fetch_and_do the function runs pre_atomic_barrier (model),
    then arch_atomic_<OPNAME> (p, i), then post_atomic_barrier (model),
    and returns the arch_atomic result unchanged.  */
 #define __atomic_fetch_and_do(type, size, opname)			\
   __fetch_and_do (type __atomic_fetch_##opname##_##size (type *p,	\
                                                          type i,	\
                                                          int model),	\
                   type, size, opname,					\
                   pre_atomic_barrier (model),				\
                   post_atomic_barrier (model))

 /* 4-byte entry points.  */
 __atomic_fetch_and_do (int, 4, add)
 __atomic_fetch_and_do (int, 4, sub)
 __atomic_fetch_and_do (int, 4, or)
 __atomic_fetch_and_do (int, 4, and)
 __atomic_fetch_and_do (int, 4, xor)
 __atomic_fetch_and_do (int, 4, nand)
 /* 8-byte entry points.  */
 __atomic_fetch_and_do (long long, 8, add)
 __atomic_fetch_and_do (long long, 8, sub)
 __atomic_fetch_and_do (long long, 8, or)
 __atomic_fetch_and_do (long long, 8, and)
 __atomic_fetch_and_do (long long, 8, xor)
 __atomic_fetch_and_do (long long, 8, nand)

 /* Generate __sync_fetch_and_<OPNAME>_<SIZE> (p, i) for TYPE.  Through
    __fetch_and_do the function runs arch_atomic_write_barrier (), then
    arch_atomic_<OPNAME> (p, i), then arch_atomic_read_barrier (), and
    returns the arch_atomic result unchanged.  */
 #define __sync_fetch_and_do(type, size, opname)				\
   __fetch_and_do (type __sync_fetch_and_##opname##_##size (type *p,	\
                                                            type i),	\
                   type, size, opname,					\
                   arch_atomic_write_barrier (),				\
                   arch_atomic_read_barrier ())

 /* 4-byte entry points.  */
 __sync_fetch_and_do (int, 4, add)
 __sync_fetch_and_do (int, 4, sub)
 __sync_fetch_and_do (int, 4, or)
 __sync_fetch_and_do (int, 4, and)
 __sync_fetch_and_do (int, 4, xor)
 __sync_fetch_and_do (int, 4, nand)
 /* 8-byte entry points.  */
 __sync_fetch_and_do (long long, 8, add)
 __sync_fetch_and_do (long long, 8, sub)
 __sync_fetch_and_do (long long, 8, or)
 __sync_fetch_and_do (long long, 8, and)
 __sync_fetch_and_do (long long, 8, xor)
 __sync_fetch_and_do (long long, 8, nand)

 /* Emit a function with prototype PROTO.  Its body runs TOP, calls
    arch_atomic_<OPNAME> on the prototype's `p' and `i' parameters,
    combines that call's result with `i' using the binary operator OP,
    applies the (possibly empty) unary prefix OP2 to the combination,
    runs BOTTOM, and returns the combined value.  SIZE is accepted but
    does not appear in the expansion.  */
 #define __do_and_fetch(proto, type, size, opname, op, op2, top, bottom)	\
 proto									\
 {									\
   top;									\
   type result = op2 (arch_atomic_##opname (p, i) op i);			\
   bottom;								\
   return result;							\
 }

 /* Generate __atomic_<OPNAME>_fetch_<SIZE> (p, i, model) for TYPE.
    Through __do_and_fetch the function runs pre_atomic_barrier (model),
    calls arch_atomic_<OPNAME> (p, i), runs post_atomic_barrier (model),
    and returns OP2 (<arch_atomic result> OP i).

    OP therefore has to be the C operator that corresponds to OPNAME,
    since the returned value is recomputed from the arch_atomic result
    rather than re-read from memory.  For nand the pair is `&' with the
    prefix `~'.  For xor it must be `^'; passing `|' here would make
    __atomic_xor_fetch_* return an inclusive-or.  */
 #define __atomic_do_and_fetch(type, size, opname, op, op2)		\
   __do_and_fetch (type __atomic_##opname##_fetch_##size (type *p,	\
                                                          type i,	\
                                                          int model),	\
                   type, size, opname, op, op2,				\
                   pre_atomic_barrier (model),				\
                   post_atomic_barrier (model))

 /* 4-byte entry points.  */
 __atomic_do_and_fetch (int, 4, add, +, )
 __atomic_do_and_fetch (int, 4, sub, -, )
 __atomic_do_and_fetch (int, 4, or, |, )
 __atomic_do_and_fetch (int, 4, and, &, )
 __atomic_do_and_fetch (int, 4, xor, ^, )
 __atomic_do_and_fetch (int, 4, nand, &, ~)
 /* 8-byte entry points.  */
 __atomic_do_and_fetch (long long, 8, add, +, )
 __atomic_do_and_fetch (long long, 8, sub, -, )
 __atomic_do_and_fetch (long long, 8, or, |, )
 __atomic_do_and_fetch (long long, 8, and, &, )
 __atomic_do_and_fetch (long long, 8, xor, ^, )
 __atomic_do_and_fetch (long long, 8, nand, &, ~)

 /* Generate __sync_<OPNAME>_and_fetch_<SIZE> (p, i) for TYPE.  Through
    __do_and_fetch the function runs arch_atomic_write_barrier (), calls
    arch_atomic_<OPNAME> (p, i), runs arch_atomic_read_barrier (), and
    returns OP2 (<arch_atomic result> OP i).

    OP therefore has to be the C operator that corresponds to OPNAME,
    since the returned value is recomputed from the arch_atomic result
    rather than re-read from memory.  For nand the pair is `&' with the
    prefix `~'.  For xor it must be `^'; passing `|' here would make
    __sync_xor_and_fetch_* return an inclusive-or.  */
 #define __sync_do_and_fetch(type, size, opname, op, op2)		\
   __do_and_fetch (type __sync_##opname##_and_fetch_##size (type *p,	\
                                                            type i),	\
                   type, size, opname, op, op2,				\
                   arch_atomic_write_barrier (),				\
                   arch_atomic_read_barrier ())

 /* 4-byte entry points.  */
 __sync_do_and_fetch (int, 4, add, +, )
 __sync_do_and_fetch (int, 4, sub, -, )
 __sync_do_and_fetch (int, 4, or, |, )
 __sync_do_and_fetch (int, 4, and, &, )
 __sync_do_and_fetch (int, 4, xor, ^, )
 __sync_do_and_fetch (int, 4, nand, &, ~)
 /* 8-byte entry points.  */
 __sync_do_and_fetch (long long, 8, add, +, )
 __sync_do_and_fetch (long long, 8, sub, -, )
 __sync_do_and_fetch (long long, 8, or, |, )
 __sync_do_and_fetch (long long, 8, and, &, )
 __sync_do_and_fetch (long long, 8, xor, ^, )
 __sync_do_and_fetch (long long, 8, nand, &, ~)

 /* Generate the word-sized __atomic compare-exchange and exchange
    entry points for TYPE; SIZE is the suffix of the generated names.

    __atomic_compare_exchange_<SIZE> reads the expected value from
    *OLDVALP, calls arch_atomic_val_compare_and_exchange between the
    barriers selected by MODELS, stores the value that call returned
    back into *OLDVALP, and returns nonzero when that value equals the
    expected one.  WEAK and MODELF are not consulted.

    __atomic_exchange_<SIZE> returns the result of arch_atomic_exchange
    (ptr, val), called between the barriers selected by MODEL.  */
 #define __atomic_exchange_methods(type, size)				\
 bool									\
 __atomic_compare_exchange_##size (volatile type *ptr, type *oldvalp,	\
                                   type newval, bool weak __unused,	\
                                   int models, int modelf __unused)	\
 {									\
   type expected = *oldvalp;						\
   pre_atomic_barrier (models);						\
   type observed								\
     = arch_atomic_val_compare_and_exchange (ptr, expected, newval);	\
   post_atomic_barrier (models);						\
   *oldvalp = observed;							\
   return observed == expected;						\
 }									\
 									\
 type									\
 __atomic_exchange_##size (volatile type *ptr, type val, int model)	\
 {									\
   pre_atomic_barrier (model);						\
   type result = arch_atomic_exchange (ptr, val);			\
   post_atomic_barrier (model);						\
   return result;							\
 }

 __atomic_exchange_methods (int, 4)
 __atomic_exchange_methods (long long, 8)

 /* Generate the word-sized __sync compare-and-swap and
    lock-test-and-set entry points for TYPE; SIZE is the suffix of the
    generated names.

    Both compare-and-swap functions run arch_atomic_write_barrier ()
    before and arch_atomic_read_barrier () after the arch_atomic call
    and return that call's result.  __sync_lock_test_and_set_<SIZE>
    calls arch_atomic_exchange, then
    arch_atomic_acquire_barrier_value on the result, and returns the
    result.  */
 #define __sync_exchange_methods(type, size)				\
 type									\
 __sync_val_compare_and_swap_##size (type *ptr, type oldval, type newval) \
 {									\
   type observed;							\
   arch_atomic_write_barrier ();						\
   observed = arch_atomic_val_compare_and_exchange (ptr, oldval, newval); \
   arch_atomic_read_barrier ();						\
   return observed;							\
 }									\
 									\
 bool									\
 __sync_bool_compare_and_swap_##size (type *ptr, type oldval, type newval) \
 {									\
   bool swapped;								\
   arch_atomic_write_barrier ();						\
   swapped = arch_atomic_bool_compare_and_exchange (ptr, oldval, newval); \
   arch_atomic_read_barrier ();						\
   return swapped;							\
 }									\
 									\
 type									\
 __sync_lock_test_and_set_##size (type *ptr, type val)			\
 {									\
   type result = arch_atomic_exchange (ptr, val);			\
   arch_atomic_acquire_barrier_value (result);				\
   return result;							\
 }

 __sync_exchange_methods (int, 4)
 __sync_exchange_methods (long long, 8)

 /* BIT_OFFSET (n, type): shift count, within a 4-byte word, for a
    subword of TYPE that starts N bytes past the word's address.  When
    __LITTLE_ENDIAN__ is defined this is N * 8; otherwise the count is
    measured from the other end of the word,
    (4 - sizeof (type) - N) * 8.  */
 #ifdef __LITTLE_ENDIAN__
 #define BIT_OFFSET(n, type) ((n) * 8)
 #else
 #define BIT_OFFSET(n, type) ((4 - sizeof(type) - (n)) * 8)
 #endif

 /* Subword methods require the same approach for both TILEPro and
    TILE-Gx.  We load the background data for the word, insert the
    desired subword piece, then compare-and-exchange it into place.  */
 /* Shorthand for the two subword operand types handled below.  */
 #define u8 unsigned char
 #define u16 unsigned short

 /* Compare-and-exchange the subword of type TYPE at PTR: when it equals
    GUESS, replace it with VAL.  The expression evaluates to the subword
    value observed in memory; a store took place if and only if that
    value equals GUESS.  SIZE is not used.

    The update is done with arch_atomic_val_compare_and_exchange on the
    aligned 4-byte word that contains the subword.  That word-sized
    operation can fail solely because another part of the word changed.
    In that case the subword may still equal GUESS although nothing was
    stored, so the exchange is retried with the refreshed word rather
    than reporting a success that never happened.  The loop ends as
    soon as the exchange succeeds or the subword no longer equals
    GUESS.  */
 #define __subword_cmpxchg_body(type, size, ptr, guess, val)		\
   ({									\
     unsigned int *p = (unsigned int *)((unsigned long)(ptr) & ~3UL);	\
     const int shift = BIT_OFFSET((unsigned long)(ptr) & 3UL, type);	\
     const unsigned int valmask = (1U << (sizeof(type) * 8)) - 1;	\
     const unsigned int bgmask = ~(valmask << shift);			\
     unsigned int oldword = *p;						\
     type oldval = (oldword >> shift) & valmask;				\
     while (__builtin_expect((oldval == (guess)), 1)) {			\
       unsigned int word = (oldword & bgmask)				\
                           | (((val) & valmask) << shift);		\
       unsigned int xword						\
         = arch_atomic_val_compare_and_exchange(p, oldword, word);	\
       if (__builtin_expect(xword == oldword, 1))			\
         break;              /* The new word was stored.  */		\
       oldword = xword;      /* Word changed under us; look again.  */	\
       oldval = (oldword >> shift) & valmask;				\
     }									\
     oldval;								\
   })

 /* Generate __atomic_compare_exchange_<SIZE> for the subword type TYPE.
    Between the barriers selected by MODELS the function reads the
    expected value from *GUESS_PTR and evaluates __subword_cmpxchg_body;
    it then stores the value that expression yielded into *GUESS_PTR
    and returns nonzero when that value equals the expected one.  WEAK
    and MODELF are not consulted.  */
 #define __atomic_subword_cmpxchg(type, size)				\
 bool									\
 __atomic_compare_exchange_##size (volatile type *ptr, type *guess_ptr,	\
                                   type val, bool weak __unused,		\
                                   int models, int modelf __unused)	\
 {									\
   pre_atomic_barrier (models);						\
   type guess = *guess_ptr;						\
   type observed = __subword_cmpxchg_body (type, size, ptr, guess, val);	\
   post_atomic_barrier (models);						\
   *guess_ptr = observed;						\
   return observed == guess;						\
 }

 __atomic_subword_cmpxchg (u8, 1)
 __atomic_subword_cmpxchg (u16, 2)

 /* Generate the __sync compare-and-swap pair for the subword type TYPE.
    __sync_val_compare_and_swap_<SIZE> evaluates __subword_cmpxchg_body
    between arch_atomic_write_barrier () and arch_atomic_read_barrier ()
    and returns the value it yielded.
    __sync_bool_compare_and_swap_<SIZE> calls the val variant and
    returns whether its result equals GUESS.  */
 #define __sync_subword_cmpxchg(type, size)				\
 type									\
 __sync_val_compare_and_swap_##size (type *ptr, type guess, type val)	\
 {									\
   type observed;							\
   arch_atomic_write_barrier ();						\
   observed = __subword_cmpxchg_body (type, size, ptr, guess, val);	\
   arch_atomic_read_barrier ();						\
   return observed;							\
 }									\
 									\
 bool									\
 __sync_bool_compare_and_swap_##size (type *ptr, type guess, type val)	\
 {									\
   return __sync_val_compare_and_swap_##size (ptr, guess, val) == guess;	\
 }

 __sync_subword_cmpxchg (u8, 1)
 __sync_subword_cmpxchg (u16, 2)

 /* For the atomic-update subword methods, we use the same approach as
    above, but we retry until we succeed if the compare-and-exchange
    fails.

    PROTO is the prototype of the generated function and must name its
    pointer parameter `ptr'.  TOP and BOTTOM are statement fragments
    placed before and after the update loop.  EXPR computes the new
    subword value and may refer to `oldval'; BOTTOM may refer to both
    `oldval' and `val', which hold the values from the final loop
    iteration.  */
 #define __subword(type, proto, top, expr, bottom)			\
 proto									\
 {									\
   top									\
   unsigned int *p = (unsigned int *)((unsigned long)ptr & ~3UL);	\
   const int shift = BIT_OFFSET((unsigned long)ptr & 3UL, type);		\
   const unsigned int valmask = (1 << (sizeof(type) * 8)) - 1;		\
   const unsigned int bgmask = ~(valmask << shift);			\
   unsigned int oldword, curword = *p;					\
   type val, oldval;							\
   for (;;) {								\
     oldword = curword;							\
     oldval = (oldword >> shift) & valmask;				\
     val = expr;								\
     unsigned int newword						\
       = (oldword & bgmask) | ((val & valmask) << shift);		\
     curword = arch_atomic_val_compare_and_exchange(p, oldword, newword); \
     if (!__builtin_expect(curword != oldword, 0))			\
       break;								\
   }									\
   bottom								\
 }

 /* Generate __atomic_<FUNCNAME> (ptr, i, model) for the subword type
    TYPE on top of __subword.  EXPR is the new value computed from
    `oldval' and `i'; RETVAL names what to return: `oldval' for the
    fetch_<op> forms, `val' for the <op>_fetch forms.  The update loop
    runs between pre_atomic_barrier (model) and post_atomic_barrier
    (model).  */
 #define __atomic_subword_fetch(type, funcname, expr, retval)		\
   __subword (type,							\
              type __atomic_ ## funcname (volatile type *ptr, type i,	\
                                          int model),			\
              pre_atomic_barrier (model);,				\
              expr,							\
              post_atomic_barrier (model); return retval;)

 /* fetch_<op>: return the value seen before the update.  */
 __atomic_subword_fetch (u8, fetch_add_1, oldval + i, oldval)
 __atomic_subword_fetch (u8, fetch_sub_1, oldval - i, oldval)
 __atomic_subword_fetch (u8, fetch_or_1, oldval | i, oldval)
 __atomic_subword_fetch (u8, fetch_and_1, oldval & i, oldval)
 __atomic_subword_fetch (u8, fetch_xor_1, oldval ^ i, oldval)
 __atomic_subword_fetch (u8, fetch_nand_1, ~(oldval & i), oldval)

 __atomic_subword_fetch (u16, fetch_add_2, oldval + i, oldval)
 __atomic_subword_fetch (u16, fetch_sub_2, oldval - i, oldval)
 __atomic_subword_fetch (u16, fetch_or_2, oldval | i, oldval)
 __atomic_subword_fetch (u16, fetch_and_2, oldval & i, oldval)
 __atomic_subword_fetch (u16, fetch_xor_2, oldval ^ i, oldval)
 __atomic_subword_fetch (u16, fetch_nand_2, ~(oldval & i), oldval)

 /* <op>_fetch: return the value that was stored.  */
 __atomic_subword_fetch (u8, add_fetch_1, oldval + i, val)
 __atomic_subword_fetch (u8, sub_fetch_1, oldval - i, val)
 __atomic_subword_fetch (u8, or_fetch_1, oldval | i, val)
 __atomic_subword_fetch (u8, and_fetch_1, oldval & i, val)
 __atomic_subword_fetch (u8, xor_fetch_1, oldval ^ i, val)
 __atomic_subword_fetch (u8, nand_fetch_1, ~(oldval & i), val)

 __atomic_subword_fetch (u16, add_fetch_2, oldval + i, val)
 __atomic_subword_fetch (u16, sub_fetch_2, oldval - i, val)
 __atomic_subword_fetch (u16, or_fetch_2, oldval | i, val)
 __atomic_subword_fetch (u16, and_fetch_2, oldval & i, val)
 __atomic_subword_fetch (u16, xor_fetch_2, oldval ^ i, val)
 __atomic_subword_fetch (u16, nand_fetch_2, ~(oldval & i), val)

 /* Generate __sync_<FUNCNAME> (ptr, i) for the subword type TYPE on top
    of __subword.  EXPR is the new value computed from `oldval' and
    `i'; RETVAL names what to return: `oldval' for the fetch_and_<op>
    forms, `val' for the <op>_and_fetch forms.

    The write barrier is issued before the update loop and the read
    barrier after it, the same order every word-sized __sync_* function
    in this file uses.
    NOTE(review): if arch_atomic_read_barrier and
    arch_atomic_write_barrier expand to the same instruction on this
    target, the order is only a consistency matter -- confirm against
    atomic.h.  */
 #define __sync_subword_fetch(type, funcname, expr, retval)	\
   __subword (type,						\
              type __sync_ ## funcname (type *ptr, type i),	\
              arch_atomic_write_barrier ();,			\
              expr,						\
              arch_atomic_read_barrier (); return retval;)

 /* fetch_and_<op>: return the value seen before the update.  */
 __sync_subword_fetch (u8, fetch_and_add_1, oldval + i, oldval)
 __sync_subword_fetch (u8, fetch_and_sub_1, oldval - i, oldval)
 __sync_subword_fetch (u8, fetch_and_or_1, oldval | i, oldval)
 __sync_subword_fetch (u8, fetch_and_and_1, oldval & i, oldval)
 __sync_subword_fetch (u8, fetch_and_xor_1, oldval ^ i, oldval)
 __sync_subword_fetch (u8, fetch_and_nand_1, ~(oldval & i), oldval)

 __sync_subword_fetch (u16, fetch_and_add_2, oldval + i, oldval)
 __sync_subword_fetch (u16, fetch_and_sub_2, oldval - i, oldval)
 __sync_subword_fetch (u16, fetch_and_or_2, oldval | i, oldval)
 __sync_subword_fetch (u16, fetch_and_and_2, oldval & i, oldval)
 __sync_subword_fetch (u16, fetch_and_xor_2, oldval ^ i, oldval)
 __sync_subword_fetch (u16, fetch_and_nand_2, ~(oldval & i), oldval)

 /* <op>_and_fetch: return the value that was stored.  */
 __sync_subword_fetch (u8, add_and_fetch_1, oldval + i, val)
 __sync_subword_fetch (u8, sub_and_fetch_1, oldval - i, val)
 __sync_subword_fetch (u8, or_and_fetch_1, oldval | i, val)
 __sync_subword_fetch (u8, and_and_fetch_1, oldval & i, val)
 __sync_subword_fetch (u8, xor_and_fetch_1, oldval ^ i, val)
 __sync_subword_fetch (u8, nand_and_fetch_1, ~(oldval & i), val)

 __sync_subword_fetch (u16, add_and_fetch_2, oldval + i, val)
 __sync_subword_fetch (u16, sub_and_fetch_2, oldval - i, val)
 __sync_subword_fetch (u16, or_and_fetch_2, oldval | i, val)
 __sync_subword_fetch (u16, and_and_fetch_2, oldval & i, val)
 __sync_subword_fetch (u16, xor_and_fetch_2, oldval ^ i, val)
 __sync_subword_fetch (u16, nand_and_fetch_2, ~(oldval & i), val)

 /* Generate __atomic_exchange_<SIZE> (ptr, nval, model) for the subword
    type TYPE: the __subword loop installs NVAL between
    pre_atomic_barrier (model) and post_atomic_barrier (model), and the
    function returns `oldval' from the final loop iteration.  */
 #define __atomic_subword_lock(type, size)				\
   __subword (type,							\
              type __atomic_exchange_##size (volatile type *ptr,		\
                                             type nval, int model),	\
              pre_atomic_barrier (model);,				\
              nval,							\
              post_atomic_barrier (model); return oldval;)

 __atomic_subword_lock (u8, 1)
 __atomic_subword_lock (u16, 2)

 /* Generate __sync_lock_test_and_set_<SIZE> (ptr, nval) for the subword
    type TYPE: the __subword loop installs NVAL with nothing issued
    before it (the TOP argument is empty), then
    arch_atomic_acquire_barrier_value is applied to `oldval', which is
    also the return value.  */
 #define __sync_subword_lock(type, size)					\
   __subword (type,							\
              type __sync_lock_test_and_set_##size (type *ptr,		\
                                                    type nval),		\
              ,								\
              nval,							\
              arch_atomic_acquire_barrier_value (oldval); return oldval;)

 __sync_subword_lock (u8, 1)
 __sync_subword_lock (u16, 2)
	/* TILE atomics.
	Copyright (C) 2011-2021 Free Software Foundation, Inc.
	Contributed by Walter Lee (walt@tilera.com)

	This file is free software; you can redistribute it and/or modify it
	under the terms of the GNU General Public License as published by the
	Free Software Foundation; either version 3, or (at your option) any
	later version.

	This file is distributed in the hope that it will be useful, but
	WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	General Public License for more details.

	Under Section 7 of GPL version 3, you are granted additional
	permissions described in the GCC Runtime Library Exception, version
	3.1, as published by the Free Software Foundation.

	You should have received a copy of the GNU General Public License and
	a copy of the GCC Runtime Library Exception along with this program;
	see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	<http://www.gnu.org/licenses/>. */

	#include "tconfig.h"
	#include "coretypes.h"
	#include "atomic.h"

	/* Boolean results in this file are carried in an unsigned char. */
	#define bool unsigned char

	/* This code should be inlined by the compiler, but for now support
	it as out-of-line methods in libgcc. */

	/* Fence issued ahead of an atomic operation.  A thread fence with
	   MODEL is emitted when MODEL is __ATOMIC_RELEASE, __ATOMIC_ACQ_REL
	   or __ATOMIC_SEQ_CST; any other MODEL emits nothing.  */
	static inline void
	pre_atomic_barrier (int model)
	{
	  if (model == __ATOMIC_RELEASE
	      || model == __ATOMIC_ACQ_REL
	      || model == __ATOMIC_SEQ_CST)
	    __atomic_thread_fence (model);
	}

	/* Fence issued after an atomic operation.  A thread fence with MODEL
	   is emitted when MODEL is __ATOMIC_ACQUIRE, __ATOMIC_ACQ_REL or
	   __ATOMIC_SEQ_CST; any other MODEL emits nothing.  */
	static inline void
	post_atomic_barrier (int model)
	{
	  if (model == __ATOMIC_ACQUIRE
	      || model == __ATOMIC_ACQ_REL
	      || model == __ATOMIC_SEQ_CST)
	    __atomic_thread_fence (model);
	}

	/* Attribute for parameters that a function accepts but does not
	read. */
	#define __unused __attribute__((unused))

	/* Emit a function with prototype PROTO.  Its body runs TOP, calls
	   arch_atomic_<OPNAME> on the prototype's `p' and `i' parameters,
	   runs BOTTOM, and returns whatever the arch_atomic call produced.
	   SIZE is accepted but does not appear in the expansion.  */
	#define __fetch_and_do(proto, type, size, opname, top, bottom)	\
	proto								\
	{								\
	  top;								\
	  type result = arch_atomic_##opname (p, i);			\
	  bottom;							\
	  return result;						\
	}

	/* Generate __atomic_fetch_<OPNAME>_<SIZE> (p, i, model) for TYPE.
	   Through __fetch_and_do the function runs pre_atomic_barrier
	   (model), then arch_atomic_<OPNAME> (p, i), then
	   post_atomic_barrier (model), and returns the arch_atomic result
	   unchanged.  */
	#define __atomic_fetch_and_do(type, size, opname)			\
	  __fetch_and_do (type __atomic_fetch_##opname##_##size (type *p,	\
	                                                         type i,	\
	                                                         int model), \
	                  type, size, opname,				\
	                  pre_atomic_barrier (model),			\
	                  post_atomic_barrier (model))

	/* 4-byte entry points.  */
	__atomic_fetch_and_do (int, 4, add)
	__atomic_fetch_and_do (int, 4, sub)
	__atomic_fetch_and_do (int, 4, or)
	__atomic_fetch_and_do (int, 4, and)
	__atomic_fetch_and_do (int, 4, xor)
	__atomic_fetch_and_do (int, 4, nand)
	/* 8-byte entry points.  */
	__atomic_fetch_and_do (long long, 8, add)
	__atomic_fetch_and_do (long long, 8, sub)
	__atomic_fetch_and_do (long long, 8, or)
	__atomic_fetch_and_do (long long, 8, and)
	__atomic_fetch_and_do (long long, 8, xor)
	__atomic_fetch_and_do (long long, 8, nand)

	/* Generate __sync_fetch_and_<OPNAME>_<SIZE> (p, i) for TYPE.  Through
	   __fetch_and_do the function runs arch_atomic_write_barrier (),
	   then arch_atomic_<OPNAME> (p, i), then arch_atomic_read_barrier (),
	   and returns the arch_atomic result unchanged.  */
	#define __sync_fetch_and_do(type, size, opname)				\
	  __fetch_and_do (type __sync_fetch_and_##opname##_##size (type *p,	\
	                                                           type i), \
	                  type, size, opname,				\
	                  arch_atomic_write_barrier (),			\
	                  arch_atomic_read_barrier ())

	/* 4-byte entry points.  */
	__sync_fetch_and_do (int, 4, add)
	__sync_fetch_and_do (int, 4, sub)
	__sync_fetch_and_do (int, 4, or)
	__sync_fetch_and_do (int, 4, and)
	__sync_fetch_and_do (int, 4, xor)
	__sync_fetch_and_do (int, 4, nand)
	/* 8-byte entry points.  */
	__sync_fetch_and_do (long long, 8, add)
	__sync_fetch_and_do (long long, 8, sub)
	__sync_fetch_and_do (long long, 8, or)
	__sync_fetch_and_do (long long, 8, and)
	__sync_fetch_and_do (long long, 8, xor)
	__sync_fetch_and_do (long long, 8, nand)

	/* Emit a function with prototype PROTO.  Its body runs TOP, calls
	   arch_atomic_<OPNAME> on the prototype's `p' and `i' parameters,
	   combines that call's result with `i' using the binary operator OP,
	   applies the (possibly empty) unary prefix OP2 to the combination,
	   runs BOTTOM, and returns the combined value.  SIZE is accepted but
	   does not appear in the expansion.  */
	#define __do_and_fetch(proto, type, size, opname, op, op2, top, bottom)	\
	proto									\
	{									\
	  top;									\
	  type result = op2 (arch_atomic_##opname (p, i) op i);			\
	  bottom;								\
	  return result;							\
	}

	/* Generate __atomic_<OPNAME>_fetch_<SIZE> (p, i, model) for TYPE.
	   Through __do_and_fetch the function runs pre_atomic_barrier
	   (model), calls arch_atomic_<OPNAME> (p, i), runs
	   post_atomic_barrier (model), and returns
	   OP2 (<arch_atomic result> OP i).

	   OP therefore has to be the C operator that corresponds to OPNAME,
	   since the returned value is recomputed from the arch_atomic result
	   rather than re-read from memory.  For nand the pair is `&' with
	   the prefix `~'.  For xor it must be `^'; an inclusive-or operator
	   here would make __atomic_xor_fetch_* return the wrong value.  The
	   operator arguments are plain C tokens with no backslash escape.  */
	#define __atomic_do_and_fetch(type, size, opname, op, op2)		\
	  __do_and_fetch (type __atomic_##opname##_fetch_##size (type *p,	\
	                                                         type i,	\
	                                                         int model), \
	                  type, size, opname, op, op2,			\
	                  pre_atomic_barrier (model),			\
	                  post_atomic_barrier (model))

	/* 4-byte entry points.  */
	__atomic_do_and_fetch (int, 4, add, +, )
	__atomic_do_and_fetch (int, 4, sub, -, )
	__atomic_do_and_fetch (int, 4, or, |, )
	__atomic_do_and_fetch (int, 4, and, &, )
	__atomic_do_and_fetch (int, 4, xor, ^, )
	__atomic_do_and_fetch (int, 4, nand, &, ~)
	/* 8-byte entry points.  */
	__atomic_do_and_fetch (long long, 8, add, +, )
	__atomic_do_and_fetch (long long, 8, sub, -, )
	__atomic_do_and_fetch (long long, 8, or, |, )
	__atomic_do_and_fetch (long long, 8, and, &, )
	__atomic_do_and_fetch (long long, 8, xor, ^, )
	__atomic_do_and_fetch (long long, 8, nand, &, ~)

	/* Generate __sync_<OPNAME>_and_fetch_<SIZE> (p, i) for TYPE.  Through
	   __do_and_fetch the function runs arch_atomic_write_barrier (),
	   calls arch_atomic_<OPNAME> (p, i), runs arch_atomic_read_barrier (),
	   and returns OP2 (<arch_atomic result> OP i).

	   OP therefore has to be the C operator that corresponds to OPNAME,
	   since the returned value is recomputed from the arch_atomic result
	   rather than re-read from memory.  For nand the pair is `&' with
	   the prefix `~'.  For xor it must be `^'; an inclusive-or operator
	   here would make __sync_xor_and_fetch_* return the wrong value.
	   The operator arguments are plain C tokens with no backslash
	   escape.  */
	#define __sync_do_and_fetch(type, size, opname, op, op2)		\
	  __do_and_fetch (type __sync_##opname##_and_fetch_##size (type *p,	\
	                                                           type i), \
	                  type, size, opname, op, op2,			\
	                  arch_atomic_write_barrier (),			\
	                  arch_atomic_read_barrier ())

	/* 4-byte entry points.  */
	__sync_do_and_fetch (int, 4, add, +, )
	__sync_do_and_fetch (int, 4, sub, -, )
	__sync_do_and_fetch (int, 4, or, |, )
	__sync_do_and_fetch (int, 4, and, &, )
	__sync_do_and_fetch (int, 4, xor, ^, )
	__sync_do_and_fetch (int, 4, nand, &, ~)
	/* 8-byte entry points.  */
	__sync_do_and_fetch (long long, 8, add, +, )
	__sync_do_and_fetch (long long, 8, sub, -, )
	__sync_do_and_fetch (long long, 8, or, |, )
	__sync_do_and_fetch (long long, 8, and, &, )
	__sync_do_and_fetch (long long, 8, xor, ^, )
	__sync_do_and_fetch (long long, 8, nand, &, ~)

	/* Generate the word-sized __atomic compare-exchange and exchange
	   entry points for TYPE; SIZE is the suffix of the generated names.

	   __atomic_compare_exchange_<SIZE> reads the expected value from
	   *OLDVALP, calls arch_atomic_val_compare_and_exchange between the
	   barriers selected by MODELS, stores the value that call returned
	   back into *OLDVALP, and returns nonzero when that value equals the
	   expected one.  WEAK and MODELF are not consulted.

	   __atomic_exchange_<SIZE> returns the result of arch_atomic_exchange
	   (ptr, val), called between the barriers selected by MODEL.  */
	#define __atomic_exchange_methods(type, size)				\
	bool									\
	__atomic_compare_exchange_##size (volatile type *ptr, type *oldvalp,	\
	                                  type newval, bool weak __unused,	\
	                                  int models, int modelf __unused)	\
	{									\
	  type expected = *oldvalp;						\
	  pre_atomic_barrier (models);						\
	  type observed								\
	    = arch_atomic_val_compare_and_exchange (ptr, expected, newval);	\
	  post_atomic_barrier (models);						\
	  *oldvalp = observed;							\
	  return observed == expected;						\
	}									\
										\
	type									\
	__atomic_exchange_##size (volatile type *ptr, type val, int model)	\
	{									\
	  pre_atomic_barrier (model);						\
	  type result = arch_atomic_exchange (ptr, val);			\
	  post_atomic_barrier (model);						\
	  return result;							\
	}

	__atomic_exchange_methods (int, 4)
	__atomic_exchange_methods (long long, 8)

	/* Generate the word-sized __sync compare-and-swap and
	   lock-test-and-set entry points for TYPE; SIZE is the suffix of the
	   generated names.

	   Both compare-and-swap functions run arch_atomic_write_barrier ()
	   before and arch_atomic_read_barrier () after the arch_atomic call
	   and return that call's result.  __sync_lock_test_and_set_<SIZE>
	   calls arch_atomic_exchange, then
	   arch_atomic_acquire_barrier_value on the result, and returns the
	   result.  */
	#define __sync_exchange_methods(type, size)				\
	type									\
	__sync_val_compare_and_swap_##size (type *ptr, type oldval, type newval) \
	{									\
	  type observed;							\
	  arch_atomic_write_barrier ();						\
	  observed = arch_atomic_val_compare_and_exchange (ptr, oldval, newval); \
	  arch_atomic_read_barrier ();						\
	  return observed;							\
	}									\
										\
	bool									\
	__sync_bool_compare_and_swap_##size (type *ptr, type oldval, type newval) \
	{									\
	  bool swapped;								\
	  arch_atomic_write_barrier ();						\
	  swapped = arch_atomic_bool_compare_and_exchange (ptr, oldval, newval); \
	  arch_atomic_read_barrier ();						\
	  return swapped;							\
	}									\
										\
	type									\
	__sync_lock_test_and_set_##size (type *ptr, type val)			\
	{									\
	  type result = arch_atomic_exchange (ptr, val);			\
	  arch_atomic_acquire_barrier_value (result);				\
	  return result;							\
	}

	__sync_exchange_methods (int, 4)
	__sync_exchange_methods (long long, 8)

	/* BIT_OFFSET (n, type): shift count, within a 4-byte word, for a
	subword of TYPE that starts N bytes past the word's address. When
	__LITTLE_ENDIAN__ is defined this is N * 8; otherwise the count is
	measured from the other end of the word,
	(4 - sizeof (type) - N) * 8. */
	#ifdef __LITTLE_ENDIAN__
	#define BIT_OFFSET(n, type) ((n) * 8)
	#else
	#define BIT_OFFSET(n, type) ((4 - sizeof(type) - (n)) * 8)
	#endif

	/* Subword methods require the same approach for both TILEPro and
	TILE-Gx. We load the background data for the word, insert the
	desired subword piece, then compare-and-exchange it into place. */
	/* Shorthand for the two subword operand types handled below. */
	#define u8 unsigned char
	#define u16 unsigned short

	/* Compare-and-exchange the subword of type TYPE at PTR: when it
	   equals GUESS, replace it with VAL.  The expression evaluates to the
	   subword value observed in memory; a store took place if and only
	   if that value equals GUESS.  SIZE is not used.

	   `p' must be a pointer to the aligned 4-byte word containing the
	   subword, because it is dereferenced and handed to
	   arch_atomic_val_compare_and_exchange.  That word-sized operation
	   can fail solely because another part of the word changed.  In that
	   case the subword may still equal GUESS although nothing was
	   stored, so the exchange is retried with the refreshed word rather
	   than reporting a success that never happened.  The loop ends as
	   soon as the exchange succeeds or the subword no longer equals
	   GUESS.  */
	#define __subword_cmpxchg_body(type, size, ptr, guess, val)		\
	  ({									\
	    unsigned int *p = (unsigned int *)((unsigned long)(ptr) & ~3UL);	\
	    const int shift = BIT_OFFSET((unsigned long)(ptr) & 3UL, type);	\
	    const unsigned int valmask = (1U << (sizeof(type) * 8)) - 1;	\
	    const unsigned int bgmask = ~(valmask << shift);			\
	    unsigned int oldword = *p;						\
	    type oldval = (oldword >> shift) & valmask;				\
	    while (__builtin_expect((oldval == (guess)), 1)) {			\
	      unsigned int word = (oldword & bgmask)				\
	                          | (((val) & valmask) << shift);		\
	      unsigned int xword						\
	        = arch_atomic_val_compare_and_exchange(p, oldword, word);	\
	      if (__builtin_expect(xword == oldword, 1))			\
	        break;              /* The new word was stored.  */		\
	      oldword = xword;      /* Word changed under us; look again.  */	\
	      oldval = (oldword >> shift) & valmask;				\
	    }									\
	    oldval;								\
	  })

	/* Generate __atomic_compare_exchange_<SIZE> for the subword type
	   TYPE.  Between the barriers selected by MODELS the function reads
	   the expected value from *GUESS_PTR and evaluates
	   __subword_cmpxchg_body; it then stores the value that expression
	   yielded into *GUESS_PTR and returns nonzero when that value equals
	   the expected one.  WEAK and MODELF are not consulted.  */
	#define __atomic_subword_cmpxchg(type, size)				\
	bool									\
	__atomic_compare_exchange_##size (volatile type *ptr, type *guess_ptr,	\
	                                  type val, bool weak __unused,		\
	                                  int models, int modelf __unused)	\
	{									\
	  pre_atomic_barrier (models);						\
	  type guess = *guess_ptr;						\
	  type observed = __subword_cmpxchg_body (type, size, ptr, guess, val);	\
	  post_atomic_barrier (models);						\
	  *guess_ptr = observed;						\
	  return observed == guess;						\
	}

	__atomic_subword_cmpxchg (u8, 1)
	__atomic_subword_cmpxchg (u16, 2)

	/* Generate the __sync compare-and-swap pair for the subword type
	   TYPE.  __sync_val_compare_and_swap_<SIZE> evaluates
	   __subword_cmpxchg_body between arch_atomic_write_barrier () and
	   arch_atomic_read_barrier () and returns the value it yielded.
	   __sync_bool_compare_and_swap_<SIZE> calls the val variant and
	   returns whether its result equals GUESS.  */
	#define __sync_subword_cmpxchg(type, size)				\
	type									\
	__sync_val_compare_and_swap_##size (type *ptr, type guess, type val)	\
	{									\
	  type observed;							\
	  arch_atomic_write_barrier ();						\
	  observed = __subword_cmpxchg_body (type, size, ptr, guess, val);	\
	  arch_atomic_read_barrier ();						\
	  return observed;							\
	}									\
										\
	bool									\
	__sync_bool_compare_and_swap_##size (type *ptr, type guess, type val)	\
	{									\
	  return __sync_val_compare_and_swap_##size (ptr, guess, val) == guess;	\
	}

	__sync_subword_cmpxchg (u8, 1)
	__sync_subword_cmpxchg (u16, 2)

	/* For the atomic-update subword methods, we use the same approach as
	   above, but we retry until we succeed if the compare-and-exchange
	   fails.

	   PROTO is the prototype of the generated function and must name its
	   pointer parameter `ptr'.  TOP and BOTTOM are statement fragments
	   placed before and after the update loop.  EXPR computes the new
	   subword value and may refer to `oldval'; BOTTOM may refer to both
	   `oldval' and `val', which hold the values from the final loop
	   iteration.

	   `p' must be a pointer to the aligned 4-byte word containing the
	   subword: it is dereferenced for the initial load and handed to
	   arch_atomic_val_compare_and_exchange.  */
	#define __subword(type, proto, top, expr, bottom)			\
	proto									\
	{									\
	  top									\
	  unsigned int *p = (unsigned int *)((unsigned long)ptr & ~3UL);	\
	  const int shift = BIT_OFFSET((unsigned long)ptr & 3UL, type);		\
	  const unsigned int valmask = (1 << (sizeof(type) * 8)) - 1;		\
	  const unsigned int bgmask = ~(valmask << shift);			\
	  unsigned int oldword, xword = *p;					\
	  type val, oldval;							\
	  do {									\
	    oldword = xword;							\
	    oldval = (oldword >> shift) & valmask;				\
	    val = expr;								\
	    unsigned int word = (oldword & bgmask) | ((val & valmask) << shift); \
	    xword = arch_atomic_val_compare_and_exchange(p, oldword, word);	\
	  } while (__builtin_expect(xword != oldword, 0));			\
	  bottom								\
	}

	/* Generate __atomic_<FUNCNAME> (ptr, i, model) for the subword type
	   TYPE on top of __subword.  EXPR is the new value computed from
	   `oldval' and `i'; RETVAL names what to return: `oldval' for the
	   fetch_<op> forms, `val' for the <op>_fetch forms.  The update loop
	   runs between pre_atomic_barrier (model) and post_atomic_barrier
	   (model).  The or variants use the plain C `|' operator.  */
	#define __atomic_subword_fetch(type, funcname, expr, retval)		\
	  __subword (type,							\
	             type __atomic_ ## funcname (volatile type *ptr, type i,	\
	                                         int model),			\
	             pre_atomic_barrier (model);,				\
	             expr,							\
	             post_atomic_barrier (model); return retval;)

	/* fetch_<op>: return the value seen before the update.  */
	__atomic_subword_fetch (u8, fetch_add_1, oldval + i, oldval)
	__atomic_subword_fetch (u8, fetch_sub_1, oldval - i, oldval)
	__atomic_subword_fetch (u8, fetch_or_1, oldval | i, oldval)
	__atomic_subword_fetch (u8, fetch_and_1, oldval & i, oldval)
	__atomic_subword_fetch (u8, fetch_xor_1, oldval ^ i, oldval)
	__atomic_subword_fetch (u8, fetch_nand_1, ~(oldval & i), oldval)

	__atomic_subword_fetch (u16, fetch_add_2, oldval + i, oldval)
	__atomic_subword_fetch (u16, fetch_sub_2, oldval - i, oldval)
	__atomic_subword_fetch (u16, fetch_or_2, oldval | i, oldval)
	__atomic_subword_fetch (u16, fetch_and_2, oldval & i, oldval)
	__atomic_subword_fetch (u16, fetch_xor_2, oldval ^ i, oldval)
	__atomic_subword_fetch (u16, fetch_nand_2, ~(oldval & i), oldval)

	/* <op>_fetch: return the value that was stored.  */
	__atomic_subword_fetch (u8, add_fetch_1, oldval + i, val)
	__atomic_subword_fetch (u8, sub_fetch_1, oldval - i, val)
	__atomic_subword_fetch (u8, or_fetch_1, oldval | i, val)
	__atomic_subword_fetch (u8, and_fetch_1, oldval & i, val)
	__atomic_subword_fetch (u8, xor_fetch_1, oldval ^ i, val)
	__atomic_subword_fetch (u8, nand_fetch_1, ~(oldval & i), val)

	__atomic_subword_fetch (u16, add_fetch_2, oldval + i, val)
	__atomic_subword_fetch (u16, sub_fetch_2, oldval - i, val)
	__atomic_subword_fetch (u16, or_fetch_2, oldval | i, val)
	__atomic_subword_fetch (u16, and_fetch_2, oldval & i, val)
	__atomic_subword_fetch (u16, xor_fetch_2, oldval ^ i, val)
	__atomic_subword_fetch (u16, nand_fetch_2, ~(oldval & i), val)

	/* Generate __sync_<FUNCNAME> (ptr, i) for the subword type TYPE on
	   top of __subword.  EXPR is the new value computed from `oldval'
	   and `i'; RETVAL names what to return: `oldval' for the
	   fetch_and_<op> forms, `val' for the <op>_and_fetch forms.  The or
	   variants use the plain C `|' operator.

	   The write barrier is issued before the update loop and the read
	   barrier after it, the same order every word-sized __sync_* function
	   in this file uses.
	   NOTE(review): if arch_atomic_read_barrier and
	   arch_atomic_write_barrier expand to the same instruction on this
	   target, the order is only a consistency matter -- confirm against
	   atomic.h.  */
	#define __sync_subword_fetch(type, funcname, expr, retval)	\
	  __subword (type,						\
	             type __sync_ ## funcname (type *ptr, type i),	\
	             arch_atomic_write_barrier ();,			\
	             expr,						\
	             arch_atomic_read_barrier (); return retval;)

	/* fetch_and_<op>: return the value seen before the update.  */
	__sync_subword_fetch (u8, fetch_and_add_1, oldval + i, oldval)
	__sync_subword_fetch (u8, fetch_and_sub_1, oldval - i, oldval)
	__sync_subword_fetch (u8, fetch_and_or_1, oldval | i, oldval)
	__sync_subword_fetch (u8, fetch_and_and_1, oldval & i, oldval)
	__sync_subword_fetch (u8, fetch_and_xor_1, oldval ^ i, oldval)
	__sync_subword_fetch (u8, fetch_and_nand_1, ~(oldval & i), oldval)

	__sync_subword_fetch (u16, fetch_and_add_2, oldval + i, oldval)
	__sync_subword_fetch (u16, fetch_and_sub_2, oldval - i, oldval)
	__sync_subword_fetch (u16, fetch_and_or_2, oldval | i, oldval)
	__sync_subword_fetch (u16, fetch_and_and_2, oldval & i, oldval)
	__sync_subword_fetch (u16, fetch_and_xor_2, oldval ^ i, oldval)
	__sync_subword_fetch (u16, fetch_and_nand_2, ~(oldval & i), oldval)

	/* <op>_and_fetch: return the value that was stored.  */
	__sync_subword_fetch (u8, add_and_fetch_1, oldval + i, val)
	__sync_subword_fetch (u8, sub_and_fetch_1, oldval - i, val)
	__sync_subword_fetch (u8, or_and_fetch_1, oldval | i, val)
	__sync_subword_fetch (u8, and_and_fetch_1, oldval & i, val)
	__sync_subword_fetch (u8, xor_and_fetch_1, oldval ^ i, val)
	__sync_subword_fetch (u8, nand_and_fetch_1, ~(oldval & i), val)

	__sync_subword_fetch (u16, add_and_fetch_2, oldval + i, val)
	__sync_subword_fetch (u16, sub_and_fetch_2, oldval - i, val)
	__sync_subword_fetch (u16, or_and_fetch_2, oldval | i, val)
	__sync_subword_fetch (u16, and_and_fetch_2, oldval & i, val)
	__sync_subword_fetch (u16, xor_and_fetch_2, oldval ^ i, val)
	__sync_subword_fetch (u16, nand_and_fetch_2, ~(oldval & i), val)

	/* Generate __atomic_exchange_<SIZE> (ptr, nval, model) for the
	   subword type TYPE: the __subword loop installs NVAL between
	   pre_atomic_barrier (model) and post_atomic_barrier (model), and the
	   function returns `oldval' from the final loop iteration.  */
	#define __atomic_subword_lock(type, size)				\
	  __subword (type,							\
	             type __atomic_exchange_##size (volatile type *ptr,		\
	                                            type nval, int model),	\
	             pre_atomic_barrier (model);,				\
	             nval,							\
	             post_atomic_barrier (model); return oldval;)

	__atomic_subword_lock (u8, 1)
	__atomic_subword_lock (u16, 2)

	/* Generate __sync_lock_test_and_set_<SIZE> (ptr, nval) for the
	   subword type TYPE: the __subword loop installs NVAL with nothing
	   issued before it (the TOP argument is empty), then
	   arch_atomic_acquire_barrier_value is applied to `oldval', which is
	   also the return value.  */
	#define __sync_subword_lock(type, size)					\
	  __subword (type,							\
	             type __sync_lock_test_and_set_##size (type *ptr,		\
	                                                   type nval),		\
	             ,								\
	             nval,							\
	             arch_atomic_acquire_barrier_value (oldval); return oldval;)

	__sync_subword_lock (u8, 1)
	__sync_subword_lock (u16, 2)