libcilkrts/runtime/os-unix.c - gcc - Git at Google

 /* os-unix.c                  -*-C-*-
  *
  *************************************************************************
  *
  *  Copyright (C) 2009-2016, Intel Corporation
  *  All rights reserved.
  *
  *  Redistribution and use in source and binary forms, with or without
  *  modification, are permitted provided that the following conditions
  *  are met:
  *
  *    * Redistributions of source code must retain the above copyright
  *      notice, this list of conditions and the following disclaimer.
  *    * Redistributions in binary form must reproduce the above copyright
  *      notice, this list of conditions and the following disclaimer in
  *      the documentation and/or other materials provided with the
  *      distribution.
  *    * Neither the name of Intel Corporation nor the names of its
  *      contributors may be used to endorse or promote products derived
  *      from this software without specific prior written permission.
  *
  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  *  POSSIBILITY OF SUCH DAMAGE.
  *
  *  *********************************************************************
  *
  *  PLEASE NOTE: This file is a downstream copy of a file maintained in
  *  a repository at cilkplus.org. Changes made to this file that are not
  *  submitted through the contribution process detailed at
  *  http://www.cilkplus.org/submit-cilk-contribution will be lost the next
  *  time that a new version is released. Changes only submitted to the
  *  GNU compiler collection or posted to the git repository at
  *  https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are
  *  not tracked.
  *
  *  We welcome your contributions to this open source project. Thank you
  *  for your assistance in helping us improve Cilk Plus.
  **************************************************************************/

 #include "os.h"
 #include "bug.h"
 #include "cilk_malloc.h"
 #include <internal/abi.h>

 #if defined __linux__
 #   include <sys/sysinfo.h>
 #   include <sys/syscall.h>

 #elif defined __APPLE__
 #   include <sys/sysctl.h>
     // Uses sysconf(_SC_NPROCESSORS_ONLN) in verbose output

 #elif defined  __VXWORKS__
 #   include <vxWorks.h>
 #   include <vxCpuLib.h>
 #   include <taskLib.h>

 // Solaris
 #elif defined __sun__ && defined __svr4__
 #   include <sched.h>

 // OSes we know about which don't require any additional files
 #elif defined __CYGWIN__ || \
       defined __DragonFly__ || \
       defined __FreeBSD__ || \
       defined __GNU__
 // No additional include files

 #else
 #   error "Unsupported OS"
 #endif

 #include <stdarg.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <pthread.h>
 #include <sys/types.h>


 // /* Thread-local storage */
 // #ifdef _WIN32
 // typedef unsigned cilkos_tls_key_t;
 // #else
 // typedef pthread_key_t cilkos_tls_key_t;
 // #endif
 // cilkos_tls_key_t cilkos_allocate_tls_key();
 // void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr);
 // void* cilkos_get_tls_pointer(cilkos_tls_key_t key);

 #if !defined CILK_WORKER_TLS
 // Non-zero once __cilkrts_init_tls_variables() has created the pthread
 // keys declared below.
 static int cilk_keys_defined;
 // pthread keys for the per-thread worker pointer, pedigree leaf and
 // TBB interop thunk.
 static pthread_key_t worker_key, pedigree_leaf_key, tbb_interop_key;

 #if SUPPORT_GET_CURRENT_FIBER > 0
 // pthread key for the per-thread current-fiber pointer.
 static pthread_key_t fiber_key;
 #endif

 // Fallback slot read/written by __cilkrts_get_tls_worker() and
 // __cilkrts_set_tls_worker() while cilk_keys_defined is still zero.
 static void *serial_worker;


 // Destructor registered with pedigree_leaf_key: runs when a pthread dies
 // and releases the pedigree storage that was stored for that thread.
 static void __cilkrts_pedigree_leaf_destructor(void* pedigree_tls_ptr)
 {
     __cilkrts_pedigree* pedigree_tls = (__cilkrts_pedigree*)pedigree_tls_ptr;

     // Nothing to release when the pointer handed in is NULL.
     if (!pedigree_tls)
         return;

     // Only one or two nodes may be left in the pedigree chain here;
     // a longer chain means something has gone wrong.
     CILK_ASSERT(!pedigree_tls->parent || !pedigree_tls->parent->parent);
     __cilkrts_free(pedigree_tls);
 }

 /*
  * Create the pthread keys used for Cilk's thread-local data and mark
  * them as defined.  Calling it again after the keys exist is a no-op.
  */
 void __cilkrts_init_tls_variables(void)
 {
     /* Expected to run once, serially, before any Cilk parallelism, so
        cilk_keys_defined is accessed without synchronization. */
     if (!cilk_keys_defined) {
         int status;

         status = pthread_key_create(&worker_key, NULL);
         CILK_ASSERT (status == 0);

         /* The pedigree leaf is the only key with a destructor. */
         status = pthread_key_create(&pedigree_leaf_key,
                                     __cilkrts_pedigree_leaf_destructor);
         CILK_ASSERT (status == 0);

         status = pthread_key_create(&tbb_interop_key, NULL);
         CILK_ASSERT (status == 0);

 #if SUPPORT_GET_CURRENT_FIBER > 0
         status = pthread_key_create(&fiber_key, NULL);
         CILK_ASSERT (status == 0);
 #endif
         cilk_keys_defined = 1;
     }
 }

 /* Return the calling thread's pthread_self() value cast to void*. */
 COMMON_SYSDEP
 void* cilkos_get_current_thread_id(void)
 {
     pthread_t self = pthread_self();
     return (void*)self;
 }


 /*
  * Return the worker stored for the calling thread.  While the pthread
  * keys have not been created yet, the serial_worker slot is returned.
  */
 CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker()
 {
     if (!__builtin_expect(cilk_keys_defined, 1))
         return serial_worker;

     return (__cilkrts_worker *)pthread_getspecific(worker_key);
 }

 CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker_fast()
 {
   return (__cilkrts_worker *)pthread_getspecific(worker_key);
 }

 /*
  * Return the TBB interop thunk stored for the calling thread, or 0 when
  * the pthread keys have not been created yet.
  */
 COMMON_SYSDEP
 __cilk_tbb_stack_op_thunk *__cilkrts_get_tls_tbb_interop(void)
 {
     if (!__builtin_expect(cilk_keys_defined, 1))
         return 0;

     return (__cilk_tbb_stack_op_thunk *)pthread_getspecific(tbb_interop_key);
 }

 // Source of ranks for the per-thread root pedigree nodes.  It starts at
 // -1 and must only be updated atomically.
 static int __cilkrts_global_pedigree_tls_counter = -1;

 /*
  * Return the pedigree leaf stored for the calling thread.
  *
  * Returns 0 when the pthread keys have not been created.  When no leaf
  * is stored and create_new is non-zero, a fresh leaf/root pair is
  * allocated, stored in TLS and returned; otherwise the stored pointer
  * (possibly NULL) is returned.
  */
 COMMON_SYSDEP
 __cilkrts_pedigree *__cilkrts_get_tls_pedigree_leaf(int create_new)
 {
     __cilkrts_pedigree *pedigree_tls;

     if (!__builtin_expect(cilk_keys_defined, 1))
         return 0;

     pedigree_tls =
         (struct __cilkrts_pedigree *)pthread_getspecific(pedigree_leaf_key);

     // Either a leaf already exists or the caller did not ask for one.
     if (pedigree_tls || !create_new)
         return pedigree_tls;

     // One allocation holds two nodes, X and Y:
     //   X == pedigree_tls[0] is the leaf node, which gets copied in and
     //        out of a user worker w when w binds and unbinds.
     //   Y == pedigree_tls[1] is the root node, a constant node that
     //        represents the user worker thread w.
     pedigree_tls = (__cilkrts_pedigree*)
         __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree));

     // Publish the new storage in TLS before filling it in.
     __cilkrts_set_tls_pedigree_leaf(pedigree_tls);

     pedigree_tls[0].rank = 0;
     pedigree_tls[0].parent = &pedigree_tls[1];

     // Y's rank is the freshly incremented global counter value.
     pedigree_tls[1].rank =
         __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter, 1);
     pedigree_tls[1].parent = NULL;
     CILK_ASSERT(pedigree_tls[1].rank != -1);

     return pedigree_tls;
 }

 #if SUPPORT_GET_CURRENT_FIBER > 0
 /*
  * Return the fiber stored for the calling thread, or NULL when the
  * pthread keys have not been created yet.
  */
 COMMON_SYSDEP
 cilk_fiber_sysdep* cilkos_get_tls_cilk_fiber(void)
 {
     if (!__builtin_expect(cilk_keys_defined, 1))
         return NULL;

     return (cilk_fiber_sysdep *)pthread_getspecific(fiber_key);
 }
 #endif

 /*
  * Store w as the calling thread's worker.  Before the pthread keys
  * exist, w is kept in the serial_worker slot instead.
  */
 COMMON_SYSDEP
 void __cilkrts_set_tls_worker(__cilkrts_worker *w)
 {
     int status;

     if (!__builtin_expect(cilk_keys_defined, 1)) {
         serial_worker = w;
         return;
     }

     status = pthread_setspecific(worker_key, w);
     CILK_ASSERT (status == 0);
 }

 /*
  * Store t as the calling thread's TBB interop thunk.  Aborts the process
  * if the pthread keys have not been created.
  */
 COMMON_SYSDEP
 void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk *t)
 {
     int status;

     if (!__builtin_expect(cilk_keys_defined, 1))
         abort();

     status = pthread_setspecific(tbb_interop_key, t);
     CILK_ASSERT (status == 0);
 }

 /*
  * Store pedigree_leaf as the calling thread's pedigree leaf.  Aborts the
  * process if the pthread keys have not been created.
  */
 COMMON_SYSDEP
 void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree* pedigree_leaf)
 {
     int status;

     if (!__builtin_expect(cilk_keys_defined, 1))
         abort();

     status = pthread_setspecific(pedigree_leaf_key, pedigree_leaf);
     CILK_ASSERT (status == 0);
 }

 #if SUPPORT_GET_CURRENT_FIBER > 0
 /*
  * Store fiber as the calling thread's current fiber.  Aborts the process
  * if the pthread keys have not been created.
  */
 COMMON_SYSDEP
 void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep* fiber)
 {
     int status;

     if (!__builtin_expect(cilk_keys_defined, 1))
         abort();

     status = pthread_setspecific(fiber_key, fiber);
     CILK_ASSERT (status == 0);
 }
 #endif

 #else
 /*
  * Variant compiled when CILK_WORKER_TLS is defined: this file creates no
  * pthread keys in that configuration, so there is nothing to initialize.
  */
 void __cilkrts_init_tls_variables(void)
 {
 }
 #endif

 #if defined (__linux__) && ! defined(__ANDROID__)
 /*
  * Return the kernel thread id of the caller (SYS_gettid) rather than the
  * process id.  With MIC offload several threads may enter Cilk, each
  * with a different affinity.
  */
 static pid_t linux_gettid(void)
 {
     long tid = syscall(SYS_gettid);
     return tid;
 }

 /*
  * On Linux we look at the thread affinity mask and restrict ourselves to
  * one thread for each of the hardware contexts to which we are bound.
  * Therefore if the user does
  * % taskset 0-1 cilkProgram
  *       # restrict execution to hardware contexts zero and one
  * the Cilk program will only use two threads even if it is running on a
  * machine that has 32 hardware contexts.
  * This is the right thing to do, because the threads are restricted to two
  * hardware contexts by the affinity mask set by taskset, and if we were to
  * create extra threads they would simply oversubscribe the hardware
  * resources we can use.
  * This is particularly important on MIC in offload mode, where the affinity
  * mask is set by the offload library to force the offload code away from
  * cores that have offload support threads running on them.
  *
  * Returns the number of CPUs in the calling thread's affinity mask, or the
  * number of online CPUs when the mask is unavailable or empty.  The result
  * is always at least 1.
  */
 static int linux_get_affinity_count (void)
 {
     long system_cores = sysconf(_SC_NPROCESSORS_ONLN);
     int affinity_cores = 0;

     // sysconf() reports failure as -1.  Never let a non-positive count
     // reach CPU_ALLOC_SIZE below or be returned as the CPU count.
     if (system_cores < 1)
         system_cores = 1;

 #if defined HAVE_PTHREAD_AFFINITY_NP

 #if defined (CPU_ALLOC_SIZE) && ! defined(DONT_USE_CPU_ALLOC_SIZE)
     // Statically allocated cpu_set_t's max out at 1024 cores.  If
     // CPU_ALLOC_SIZE is available, use it to support large numbers of cores
     size_t cpusetsize = CPU_ALLOC_SIZE(system_cores);
     cpu_set_t *process_mask = (cpu_set_t *)__cilkrts_malloc(cpusetsize);

     // Without a buffer we cannot query the mask; affinity_cores stays 0
     // and the online-CPU fallback below is used.
     if (process_mask != NULL)
     {
         // Get the affinity mask for this thread
         int err = pthread_getaffinity_np(pthread_self(),
                                          cpusetsize,
                                          process_mask);

         // Count the available cores.
         if (0 == err)
             affinity_cores = CPU_COUNT_S(cpusetsize, process_mask);

         __cilkrts_free(process_mask);
     }

 #else
     // CPU_ALLOC_SIZE isn't available, or this is the Intel compiler build
     // and we have to support RHEL5.  Use a statically allocated cpu_set_t

     cpu_set_t process_mask;

     // Extract the thread affinity mask
     int err = pthread_getaffinity_np(pthread_self(),
                                      sizeof(process_mask),
                                      &process_mask);

     if (0 == err)
     {
         // We have extracted the mask OK, so now we can count the number of
         // threads in it.  This is linear in the maximum number of CPUs
         // available.  We could do a logarithmic version, if we assume the
         // format of the mask, but it's not really worth it. We only call
         // this at thread startup anyway.
         int i;
         for (i = 0; i < CPU_SETSIZE; i++)
         {
             if (CPU_ISSET(i, &process_mask))
             {
                 affinity_cores++;
             }
         }
     }
 #endif  // CPU_ALLOC_SIZE
 #endif  // defined HAVE_PTHREAD_AFFINITY_NP

     // If we've got a count of cores this thread is supposed to use, that's
     // the number of cores we'll use.  Otherwise, default to the number of
     // cores on the system.
     if (0 == affinity_cores)
         return (int)system_cores;
     else
         return affinity_cores;
 }
 #endif  //  defined (__linux__) && ! defined(__ANDROID__)

 /*
  * __cilkrts_hardware_cpu_count
  *
  * Returns the number of CPUs the runtime should consider available.  How
  * the count is obtained depends on the operating system selected at
  * compile time:
  *   - Android, Cygwin, DragonFly, FreeBSD, Solaris: online CPU count.
  *   - MIC: half the online CPU count, minus two (see HACK note below).
  *   - other Linux: linux_get_affinity_count().
  *   - Apple: the "hw.logicalcpu" sysctl value.
  *   - VxWorks: number of bits set in vxCpuEnabledGet().
  */

 COMMON_SYSDEP int __cilkrts_hardware_cpu_count(void)
 {
 #if defined __ANDROID__  || \
     defined __CYGWIN__   || \
     defined __DragonFly__  || \
     defined __FreeBSD__  || \
     (defined(__sun__) && defined(__svr4__))
     return (int)sysconf(_SC_NPROCESSORS_ONLN);
 #elif defined __MIC__
     /// HACK: Usually, the 3rd and 4th hyperthreads are not beneficial
     /// on KNC.  Also, ignore the last core.
     int count = (int)sysconf (_SC_NPROCESSORS_ONLN);
     return count/2 - 2;
 #elif defined __linux__
     return linux_get_affinity_count();
 #elif defined __APPLE__
     int count;
     size_t len = sizeof count;
     int status = sysctlbyname("hw.logicalcpu", &count, &len, 0, 0);
     // Use the runtime's own assertion macro: this file never includes
     // <assert.h>, and every other check here goes through CILK_ASSERT.
     CILK_ASSERT(0 == status);

     return count;
 #elif defined  __VXWORKS__
     return __builtin_popcount(vxCpuEnabledGet());
 #else
 #error "Unsupported architecture"
 #endif
 }

 /*
  * Variant of __cilkrts_yield() used when silencing workers that are not
  * stealing work.  Depending on the platform it yields the processor,
  * spins for a fixed delay (MIC), or sleeps for 10000 microseconds.
  */
 COMMON_SYSDEP void __cilkrts_idle(void)
 {
 #if (defined(__sun__) && defined(__svr4__)) || \
     defined(__VXWORKS__)                    || \
     defined(__FreeBSD__)                    || \
     defined(__ANDROID__)
     sched_yield();
 #elif defined(__MIC__)
     _mm_delay_32(1024);
 #elif defined(__CYGWIN__) || \
       defined(__APPLE__)  || \
       defined(__linux__)
     usleep(10000);
 #else
 # error "Unsupported architecture"
 #endif
 }

 /*
  * Pause the caller briefly: usleep(1) everywhere except VxWorks, which
  * uses taskDelay(1) instead.
  */
 COMMON_SYSDEP void __cilkrts_sleep(void)
 {
 #ifndef __VXWORKS__
     usleep(1);
 #else
     taskDelay(1);
 #endif
 }

 /*
  * Give up the processor so that another thread can run.  Every platform
  * except MIC calls sched_yield(); MIC spins for a short fixed delay.
  */
 COMMON_SYSDEP void __cilkrts_yield(void)
 {
 #if defined(__ANDROID__)  || \
     defined(__APPLE__)    || \
     defined(__CYGWIN__)   || \
     defined(__FreeBSD__)  || \
     defined(__VXWORKS__)  || \
     (defined(__sun__) && defined(__svr4__))
     // Call sched_yield to yield quantum.
     sched_yield();
 #elif defined(__MIC__)
     // On MIC, pthread_yield() really trashes things.  Arch's measurements
     // showed that calling _mm_delay_32() (or doing nothing) was a better
     // option.  Delaying 1024 clock cycles is a reasonable compromise between
     // giving up the processor and latency starting up when work becomes
     // available
     _mm_delay_32(1024);
 #elif defined(__linux__)
     // pthread_yield() is a non-standard, deprecated wrapper around
     // sched_yield(), so call the POSIX function directly.  This branch
     // must stay after the __MIC__ test, because MIC builds are Linux too.
     sched_yield();
 #else
 # error "Unsupported architecture"
 #endif
 }

 /*
  * cilkos_getenv
  *
  * Copy the value of environment variable varname into value, a buffer of
  * vallen bytes.  Both pointers must be non-NULL.
  *
  * Returns:
  *   0        when varname is not set (value becomes "" if vallen > 0),
  *            or when it is set to the empty string;
  *   len      the length of the value, when it fit and was copied;
  *   len + 1  the buffer size required, when the value plus its
  *            terminator does not fit; value is left untouched.
  */
 COMMON_SYSDEP __STDNS size_t cilkos_getenv(char* value, __STDNS size_t vallen,
                                            const char* varname)
 {
     CILK_ASSERT(value);
     CILK_ASSERT(varname);

     const char* envstr = getenv(varname);
     if (!envstr)
     {
         // Only terminate the buffer when it can hold the terminator.
         if (vallen > 0)
             value[0] = '\0';
         return 0;
     }

     size_t len = cilk_strlen(envstr);
     // Written as "len >= vallen" rather than "len > vallen - 1": the
     // latter wraps around for vallen == 0 and would let the copy run
     // into a zero-length buffer.
     if (len >= vallen)
         return len + 1;
     cilk_strcpy_s(value, vallen, envstr);
     return len;
 }

 /*
  * Unrecoverable error: flush all open output streams, print
  * "Cilk error: " followed by the printf-style message and "Exiting." on
  * stderr, then abort execution.  Does not return.
  */
 COMMON_SYSDEP void cilkos_error(const char *fmt, ...)
 {
     va_list args;

     fflush(NULL);
     fputs("Cilk error: ", stderr);

     va_start(args, fmt);
     vfprintf(stderr, fmt, args);
     va_end(args);

     fputs("Exiting.\n", stderr);
     fflush(stderr);

     abort();
 }

 /*
  * Flush all open output streams, print "Cilk warning: " followed by the
  * printf-style message on stderr, and return.
  */
 COMMON_SYSDEP void cilkos_warning(const char *fmt, ...)
 {
     va_list args;

     fflush(NULL);
     fputs("Cilk warning: ", stderr);

     va_start(args, fmt);
     vfprintf(stderr, fmt, args);
     va_end(args);

     fflush(stderr);
 }

 // Start-up hooks: each variant below does nothing but call
 // __cilkrts_init_tls_variables().
 #ifdef __VXWORKS__
 #ifdef _WRS_KERNEL
 // VxWorks kernel build: a plain exported function.
 // NOTE(review): nothing in this file calls cilkStart(); presumably the
 // kernel start-up code invokes it -- confirm.
 void cilkStart()
 {
     __cilkrts_init_tls_variables();
 }
 #else
 // VxWorks non-kernel build.
 // NOTE(review): _WRS_CONSTRUCTOR presumably registers cilkInit as a
 // constructor with priority 100 -- confirm against the VxWorks headers.
 _WRS_CONSTRUCTOR(cilkInit, 100)
 {
     __cilkrts_init_tls_variables();
 }
 #endif
 #else
 // All other platforms: the GCC constructor attribute makes init_once run
 // automatically before main() (or when the shared library is loaded).
 static void __attribute__((constructor)) init_once()
 {
     /*__cilkrts_debugger_notification_internal(CILK_DB_RUNTIME_LOADED);*/
     __cilkrts_init_tls_variables();
 }
 #endif


 #define PAGE 4096
 #define CILK_MIN_STACK_SIZE (4*PAGE)
 // Default size for the stacks that we create in Cilk for Unix.
 #define CILK_DEFAULT_STACK_SIZE 0x100000

 // The default is returned unmodified, so it must already be page aligned.
 // Checked at compile time because both operands are constants.
 #if (CILK_DEFAULT_STACK_SIZE % PAGE) != 0
 # error "CILK_DEFAULT_STACK_SIZE must be a multiple of PAGE"
 #endif

 /*
  * Convert the user's specified stack size into a "reasonable" value
  * for this OS.
  *
  *   - 0 selects CILK_DEFAULT_STACK_SIZE.
  *   - Anything up to CILK_MIN_STACK_SIZE becomes CILK_MIN_STACK_SIZE.
  *   - Larger values are rounded up to the next multiple of PAGE.  A value
  *     so large that rounding up would overflow size_t is clamped to the
  *     largest page-aligned size_t instead of wrapping around to 0.
  */
 size_t cilkos_validate_stack_size(size_t specified_stack_size) {
     const size_t page = PAGE;
     // Largest multiple of PAGE representable in size_t.
     const size_t max_aligned = ((size_t)-1 / page) * page;
     size_t remainder;

     // size_t is unsigned, so "unspecified" is represented by 0.
     if (specified_stack_size == 0) {
         return CILK_DEFAULT_STACK_SIZE;
     }
     // Round values in between 0 and CILK_MIN_STACK_SIZE up to
     // CILK_MIN_STACK_SIZE.
     if (specified_stack_size <= CILK_MIN_STACK_SIZE) {
         return CILK_MIN_STACK_SIZE;
     }

     remainder = specified_stack_size % page;
     if (remainder == 0) {
         return specified_stack_size;
     }
     // Unaligned and already above the last page boundary: rounding up
     // would overflow, so clamp to that boundary.
     if (specified_stack_size > max_aligned) {
         return max_aligned;
     }
     // Round the user's stack size value up to nearest page boundary.
     return specified_stack_size + (page - remainder);
 }

 /* Atomically add x to *p and return the resulting (new) value of *p. */
 long cilkos_atomic_add(volatile long* p, long x)
 {
     long updated = __sync_add_and_fetch(p, x);
     return updated;
 }

 /* End os-unix.c */
	/* os-unix.c                  -*-C-*-
	*
	*************************************************************************
	*
	* Copyright (C) 2009-2016, Intel Corporation
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	*
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	* * Neither the name of Intel Corporation nor the names of its
	* contributors may be used to endorse or promote products derived
	* from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
	* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
	* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	* POSSIBILITY OF SUCH DAMAGE.
	*
	* *********************************************************************
	*
	* PLEASE NOTE: This file is a downstream copy of a file maintained in
	* a repository at cilkplus.org. Changes made to this file that are not
	* submitted through the contribution process detailed at
	* http://www.cilkplus.org/submit-cilk-contribution will be lost the next
	* time that a new version is released. Changes only submitted to the
	* GNU compiler collection or posted to the git repository at
	* https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are
	* not tracked.
	*
	* We welcome your contributions to this open source project. Thank you
	* for your assistance in helping us improve Cilk Plus.
	**************************************************************************/

	#include "os.h"
	#include "bug.h"
	#include "cilk_malloc.h"
	#include <internal/abi.h>

	#if defined __linux__
	# include <sys/sysinfo.h>
	# include <sys/syscall.h>

	#elif defined __APPLE__
	# include <sys/sysctl.h>
	// Uses sysconf(_SC_NPROCESSORS_ONLN) in verbose output

	#elif defined __VXWORKS__
	# include <vxWorks.h>
	# include <vxCpuLib.h>
	# include <taskLib.h>

	// Solaris
	#elif defined __sun__ && defined __svr4__
	# include <sched.h>

	// OSes we know about which don't require any additional files
	#elif defined __CYGWIN__ \|\| \
	defined __DragonFly__ \|\| \
	defined __FreeBSD__ \|\| \
	defined __GNU__
	// No additional include files

	#else
	# error "Unsupported OS"
	#endif

	#include <stdarg.h>
	#include <stddef.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>
	#include <pthread.h>
	#include <sys/types.h>



	// /* Thread-local storage */
	// #ifdef _WIN32
	// typedef unsigned cilkos_tls_key_t;
	// #else
	// typedef pthread_key_t cilkos_tls_key_t;
	// #endif
	// cilkos_tls_key_t cilkos_allocate_tls_key();
	// void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr);
	// void* cilkos_get_tls_pointer(cilkos_tls_key_t key);

	#if !defined CILK_WORKER_TLS
	// Non-zero once __cilkrts_init_tls_variables() has created the pthread
	// keys declared below.
	static int cilk_keys_defined;
	// pthread keys for the per-thread worker pointer, pedigree leaf and
	// TBB interop thunk.
	static pthread_key_t worker_key, pedigree_leaf_key, tbb_interop_key;

	#if SUPPORT_GET_CURRENT_FIBER > 0
	// pthread key for the per-thread current-fiber pointer.
	static pthread_key_t fiber_key;
	#endif

	// Fallback slot read/written by __cilkrts_get_tls_worker() and
	// __cilkrts_set_tls_worker() while cilk_keys_defined is still zero.
	static void *serial_worker;


	// Destructor registered with pedigree_leaf_key: runs when a pthread dies
	// and releases the pedigree storage that was stored for that thread.
	static void __cilkrts_pedigree_leaf_destructor(void* pedigree_tls_ptr)
	{
	    __cilkrts_pedigree* pedigree_tls
	        = (__cilkrts_pedigree*)pedigree_tls_ptr;
	    if (pedigree_tls) {
	        // Assert that we have either one or two nodes
	        // left in the pedigree chain.
	        // If we have more, then something is going wrong...
	        // (The "||" here had been mangled into "\|\|", which is not C.)
	        CILK_ASSERT(!pedigree_tls->parent || !pedigree_tls->parent->parent);
	        __cilkrts_free(pedigree_tls);
	    }
	}

	/*
	 * Create the pthread keys used for Cilk's thread-local data and mark
	 * them as defined.  Calling it again after the keys exist is a no-op.
	 */
	void __cilkrts_init_tls_variables(void)
	{
	    /* Expected to run once, serially, before any Cilk parallelism, so
	       cilk_keys_defined is accessed without synchronization. */
	    if (!cilk_keys_defined) {
	        int status;

	        status = pthread_key_create(&worker_key, NULL);
	        CILK_ASSERT (status == 0);

	        /* The pedigree leaf is the only key with a destructor. */
	        status = pthread_key_create(&pedigree_leaf_key,
	                                    __cilkrts_pedigree_leaf_destructor);
	        CILK_ASSERT (status == 0);

	        status = pthread_key_create(&tbb_interop_key, NULL);
	        CILK_ASSERT (status == 0);

	#if SUPPORT_GET_CURRENT_FIBER > 0
	        status = pthread_key_create(&fiber_key, NULL);
	        CILK_ASSERT (status == 0);
	#endif
	        cilk_keys_defined = 1;
	    }
	}

	/* Return the calling thread's pthread_self() value cast to void*. */
	COMMON_SYSDEP
	void* cilkos_get_current_thread_id(void)
	{
	    pthread_t self = pthread_self();
	    return (void*)self;
	}


	/*
	 * Return the worker stored for the calling thread.  While the pthread
	 * keys have not been created yet, the serial_worker slot is returned.
	 */
	CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker()
	{
	    if (!__builtin_expect(cilk_keys_defined, 1))
	        return serial_worker;

	    return (__cilkrts_worker *)pthread_getspecific(worker_key);
	}

	CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker_fast()
	{
	return (__cilkrts_worker *)pthread_getspecific(worker_key);
	}

	/*
	 * Return the TBB interop thunk stored for the calling thread, or 0 when
	 * the pthread keys have not been created yet.
	 */
	COMMON_SYSDEP
	__cilk_tbb_stack_op_thunk *__cilkrts_get_tls_tbb_interop(void)
	{
	    if (!__builtin_expect(cilk_keys_defined, 1))
	        return 0;

	    return (__cilk_tbb_stack_op_thunk *)pthread_getspecific(tbb_interop_key);
	}

	// Source of ranks for the per-thread root pedigree nodes.  It starts at
	// -1 and must only be updated atomically.
	static int __cilkrts_global_pedigree_tls_counter = -1;

	/*
	 * Return the pedigree leaf stored for the calling thread.
	 *
	 * Returns 0 when the pthread keys have not been created.  When no leaf
	 * is stored and create_new is non-zero, a fresh leaf/root pair is
	 * allocated, stored in TLS and returned; otherwise the stored pointer
	 * (possibly NULL) is returned.
	 */
	COMMON_SYSDEP
	__cilkrts_pedigree *__cilkrts_get_tls_pedigree_leaf(int create_new)
	{
	    __cilkrts_pedigree *pedigree_tls;

	    if (!__builtin_expect(cilk_keys_defined, 1))
	        return 0;

	    pedigree_tls =
	        (struct __cilkrts_pedigree *)pthread_getspecific(pedigree_leaf_key);

	    // Either a leaf already exists or the caller did not ask for one.
	    if (pedigree_tls || !create_new)
	        return pedigree_tls;

	    // One allocation holds two nodes, X and Y:
	    //   X == pedigree_tls[0] is the leaf node, which gets copied in and
	    //        out of a user worker w when w binds and unbinds.
	    //   Y == pedigree_tls[1] is the root node, a constant node that
	    //        represents the user worker thread w.
	    pedigree_tls = (__cilkrts_pedigree*)
	        __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree));

	    // Publish the new storage in TLS before filling it in.
	    __cilkrts_set_tls_pedigree_leaf(pedigree_tls);

	    pedigree_tls[0].rank = 0;
	    pedigree_tls[0].parent = &pedigree_tls[1];

	    // Y's rank is the freshly incremented global counter value.
	    pedigree_tls[1].rank =
	        __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter, 1);
	    pedigree_tls[1].parent = NULL;
	    CILK_ASSERT(pedigree_tls[1].rank != -1);

	    return pedigree_tls;
	}

	#if SUPPORT_GET_CURRENT_FIBER > 0
	/*
	 * Return the fiber stored for the calling thread, or NULL when the
	 * pthread keys have not been created yet.
	 */
	COMMON_SYSDEP
	cilk_fiber_sysdep* cilkos_get_tls_cilk_fiber(void)
	{
	    if (!__builtin_expect(cilk_keys_defined, 1))
	        return NULL;

	    return (cilk_fiber_sysdep *)pthread_getspecific(fiber_key);
	}
	#endif

	/*
	 * Store w as the calling thread's worker.  Before the pthread keys
	 * exist, w is kept in the serial_worker slot instead.
	 */
	COMMON_SYSDEP
	void __cilkrts_set_tls_worker(__cilkrts_worker *w)
	{
	    int status;

	    if (!__builtin_expect(cilk_keys_defined, 1)) {
	        serial_worker = w;
	        return;
	    }

	    status = pthread_setspecific(worker_key, w);
	    CILK_ASSERT (status == 0);
	}

	/*
	 * Store t as the calling thread's TBB interop thunk.  Aborts the process
	 * if the pthread keys have not been created.
	 */
	COMMON_SYSDEP
	void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk *t)
	{
	    int status;

	    if (!__builtin_expect(cilk_keys_defined, 1))
	        abort();

	    status = pthread_setspecific(tbb_interop_key, t);
	    CILK_ASSERT (status == 0);
	}

	/*
	 * Store pedigree_leaf as the calling thread's pedigree leaf.  Aborts the
	 * process if the pthread keys have not been created.
	 */
	COMMON_SYSDEP
	void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree* pedigree_leaf)
	{
	    int status;

	    if (!__builtin_expect(cilk_keys_defined, 1))
	        abort();

	    status = pthread_setspecific(pedigree_leaf_key, pedigree_leaf);
	    CILK_ASSERT (status == 0);
	}

	#if SUPPORT_GET_CURRENT_FIBER > 0
	/*
	 * Store fiber as the calling thread's current fiber.  Aborts the process
	 * if the pthread keys have not been created.
	 */
	COMMON_SYSDEP
	void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep* fiber)
	{
	    int status;

	    if (!__builtin_expect(cilk_keys_defined, 1))
	        abort();

	    status = pthread_setspecific(fiber_key, fiber);
	    CILK_ASSERT (status == 0);
	}
	#endif

	#else
	/*
	 * Variant compiled when CILK_WORKER_TLS is defined: this file creates no
	 * pthread keys in that configuration, so there is nothing to initialize.
	 */
	void __cilkrts_init_tls_variables(void)
	{
	}
	#endif

	#if defined (__linux__) && ! defined(__ANDROID__)
	/*
	 * Return the kernel thread id of the caller (SYS_gettid) rather than the
	 * process id.  With MIC offload several threads may enter Cilk, each
	 * with a different affinity.
	 */
	static pid_t linux_gettid(void)
	{
	    long tid = syscall(SYS_gettid);
	    return tid;
	}

	/*
	 * On Linux we look at the thread affinity mask and restrict ourselves to
	 * one thread for each of the hardware contexts to which we are bound.
	 * Therefore if the user does
	 * % taskset 0-1 cilkProgram
	 *       # restrict execution to hardware contexts zero and one
	 * the Cilk program will only use two threads even if it is running on a
	 * machine that has 32 hardware contexts.
	 * This is the right thing to do, because the threads are restricted to two
	 * hardware contexts by the affinity mask set by taskset, and if we were to
	 * create extra threads they would simply oversubscribe the hardware
	 * resources we can use.
	 * This is particularly important on MIC in offload mode, where the affinity
	 * mask is set by the offload library to force the offload code away from
	 * cores that have offload support threads running on them.
	 *
	 * Returns the number of CPUs in the calling thread's affinity mask, or the
	 * number of online CPUs when the mask is unavailable or empty.  The result
	 * is always at least 1.
	 */
	static int linux_get_affinity_count (void)
	{
	    long system_cores = sysconf(_SC_NPROCESSORS_ONLN);
	    int affinity_cores = 0;

	    // sysconf() reports failure as -1.  Never let a non-positive count
	    // reach CPU_ALLOC_SIZE below or be returned as the CPU count.
	    if (system_cores < 1)
	        system_cores = 1;

	#if defined HAVE_PTHREAD_AFFINITY_NP

	#if defined (CPU_ALLOC_SIZE) && ! defined(DONT_USE_CPU_ALLOC_SIZE)
	    // Statically allocated cpu_set_t's max out at 1024 cores.  If
	    // CPU_ALLOC_SIZE is available, use it to support large numbers of cores
	    size_t cpusetsize = CPU_ALLOC_SIZE(system_cores);
	    // process_mask must be a pointer: it receives heap storage and is
	    // passed on to pthread_getaffinity_np()/CPU_COUNT_S().
	    cpu_set_t *process_mask = (cpu_set_t *)__cilkrts_malloc(cpusetsize);

	    // Without a buffer we cannot query the mask; affinity_cores stays 0
	    // and the online-CPU fallback below is used.
	    if (process_mask != NULL)
	    {
	        // Get the affinity mask for this thread
	        int err = pthread_getaffinity_np(pthread_self(),
	                                         cpusetsize,
	                                         process_mask);

	        // Count the available cores.
	        if (0 == err)
	            affinity_cores = CPU_COUNT_S(cpusetsize, process_mask);

	        __cilkrts_free(process_mask);
	    }

	#else
	    // CPU_ALLOC_SIZE isn't available, or this is the Intel compiler build
	    // and we have to support RHEL5.  Use a statically allocated cpu_set_t

	    cpu_set_t process_mask;

	    // Extract the thread affinity mask
	    int err = pthread_getaffinity_np(pthread_self(),
	                                     sizeof(process_mask),
	                                     &process_mask);

	    if (0 == err)
	    {
	        // We have extracted the mask OK, so now we can count the number of
	        // threads in it.  This is linear in the maximum number of CPUs
	        // available.  We could do a logarithmic version, if we assume the
	        // format of the mask, but it's not really worth it. We only call
	        // this at thread startup anyway.
	        int i;
	        for (i = 0; i < CPU_SETSIZE; i++)
	        {
	            if (CPU_ISSET(i, &process_mask))
	            {
	                affinity_cores++;
	            }
	        }
	    }
	#endif // CPU_ALLOC_SIZE
	#endif // defined HAVE_PTHREAD_AFFINITY_NP

	    // If we've got a count of cores this thread is supposed to use, that's
	    // the number of cores we'll use.  Otherwise, default to the number of
	    // cores on the system.
	    if (0 == affinity_cores)
	        return (int)system_cores;
	    else
	        return affinity_cores;
	}
	#endif // defined (__linux__) && ! defined(__ANDROID__)

	/*
	 * __cilkrts_hardware_cpu_count
	 *
	 * Returns the number of CPUs the runtime should consider available.  How
	 * the count is obtained depends on the operating system selected at
	 * compile time:
	 *   - Android, Cygwin, DragonFly, FreeBSD, Solaris: online CPU count.
	 *   - MIC: half the online CPU count, minus two (see HACK note below).
	 *   - other Linux: linux_get_affinity_count().
	 *   - Apple: the "hw.logicalcpu" sysctl value.
	 *   - VxWorks: number of bits set in vxCpuEnabledGet().
	 */

	COMMON_SYSDEP int __cilkrts_hardware_cpu_count(void)
	{
	#if defined __ANDROID__  || \
	    defined __CYGWIN__   || \
	    defined __DragonFly__  || \
	    defined __FreeBSD__  || \
	    (defined(__sun__) && defined(__svr4__))
	    return (int)sysconf(_SC_NPROCESSORS_ONLN);
	#elif defined __MIC__
	    /// HACK: Usually, the 3rd and 4th hyperthreads are not beneficial
	    /// on KNC. Also, ignore the last core.
	    int count = (int)sysconf (_SC_NPROCESSORS_ONLN);
	    return count/2 - 2;
	#elif defined __linux__
	    return linux_get_affinity_count();
	#elif defined __APPLE__
	    int count;
	    size_t len = sizeof count;
	    int status = sysctlbyname("hw.logicalcpu", &count, &len, 0, 0);
	    // Use the runtime's own assertion macro: this file never includes
	    // <assert.h>, and every other check here goes through CILK_ASSERT.
	    CILK_ASSERT(0 == status);

	    return count;
	#elif defined __VXWORKS__
	    return __builtin_popcount(vxCpuEnabledGet());
	#else
	#error "Unsupported architecture"
	#endif
	}

	/*
	 * Variant of __cilkrts_yield() used when silencing workers that are not
	 * stealing work.  Depending on the platform it yields the processor,
	 * spins for a fixed delay (MIC), or sleeps for 10000 microseconds.
	 */
	COMMON_SYSDEP void __cilkrts_idle(void)
	{
	    // The "||" operators in these conditions had been mangled into
	    // "\|\|", which the preprocessor rejects.
	#if defined(__ANDROID__)  || \
	    defined(__FreeBSD__)  || \
	    defined(__VXWORKS__)  || \
	    (defined(__sun__) && defined(__svr4__))
	    sched_yield();
	#elif defined(__MIC__)
	    _mm_delay_32(1024);
	#elif defined(__linux__) || \
	      defined(__APPLE__) || \
	      defined(__CYGWIN__)

	    usleep(10000);
	#else
	# error "Unsupported architecture"
	#endif
	}

	/*
	 * Pause the caller briefly: usleep(1) everywhere except VxWorks, which
	 * uses taskDelay(1) instead.
	 */
	COMMON_SYSDEP void __cilkrts_sleep(void)
	{
	#ifndef __VXWORKS__
	    usleep(1);
	#else
	    taskDelay(1);
	#endif
	}

	/*
	 * Give up the processor so that another thread can run.  Every platform
	 * except MIC calls sched_yield(); MIC spins for a short fixed delay.
	 */
	COMMON_SYSDEP void __cilkrts_yield(void)
	{
	    // The "||" operators in this condition had been mangled into
	    // "\|\|", which the preprocessor rejects.
	#if defined(__ANDROID__)  || \
	    defined(__APPLE__)    || \
	    defined(__CYGWIN__)   || \
	    defined(__FreeBSD__)  || \
	    defined(__VXWORKS__)  || \
	    (defined(__sun__) && defined(__svr4__))
	    // Call sched_yield to yield quantum.
	    sched_yield();
	#elif defined(__MIC__)
	    // On MIC, pthread_yield() really trashes things. Arch's measurements
	    // showed that calling _mm_delay_32() (or doing nothing) was a better
	    // option. Delaying 1024 clock cycles is a reasonable compromise between
	    // giving up the processor and latency starting up when work becomes
	    // available
	    _mm_delay_32(1024);
	#elif defined(__linux__)
	    // pthread_yield() is a non-standard, deprecated wrapper around
	    // sched_yield(), so call the POSIX function directly.  This branch
	    // must stay after the __MIC__ test, because MIC builds are Linux too.
	    sched_yield();
	#else
	# error "Unsupported architecture"
	#endif
	}

	/*
	 * cilkos_getenv
	 *
	 * Copy the value of environment variable varname into value, a buffer of
	 * vallen bytes.  Both pointers must be non-NULL.
	 *
	 * Returns:
	 *   0        when varname is not set (value becomes "" if vallen > 0),
	 *            or when it is set to the empty string;
	 *   len      the length of the value, when it fit and was copied;
	 *   len + 1  the buffer size required, when the value plus its
	 *            terminator does not fit; value is left untouched.
	 */
	COMMON_SYSDEP __STDNS size_t cilkos_getenv(char* value, __STDNS size_t vallen,
	                                           const char* varname)
	{
	    CILK_ASSERT(value);
	    CILK_ASSERT(varname);

	    const char* envstr = getenv(varname);
	    if (!envstr)
	    {
	        // Only terminate the buffer when it can hold the terminator.
	        if (vallen > 0)
	            value[0] = '\0';
	        return 0;
	    }

	    size_t len = cilk_strlen(envstr);
	    // Written as "len >= vallen" rather than "len > vallen - 1": the
	    // latter wraps around for vallen == 0 and would let the copy run
	    // into a zero-length buffer.
	    if (len >= vallen)
	        return len + 1;
	    cilk_strcpy_s(value, vallen, envstr);
	    return len;
	}

	/*
	 * Unrecoverable error: flush all open output streams, print
	 * "Cilk error: " followed by the printf-style message and "Exiting." on
	 * stderr, then abort execution.  Does not return.
	 */
	COMMON_SYSDEP void cilkos_error(const char *fmt, ...)
	{
	    va_list args;

	    fflush(NULL);
	    fputs("Cilk error: ", stderr);

	    va_start(args, fmt);
	    vfprintf(stderr, fmt, args);
	    va_end(args);

	    fputs("Exiting.\n", stderr);
	    fflush(stderr);

	    abort();
	}

	/*
	 * Flush all open output streams, print "Cilk warning: " followed by the
	 * printf-style message on stderr, and return.
	 */
	COMMON_SYSDEP void cilkos_warning(const char *fmt, ...)
	{
	    va_list args;

	    fflush(NULL);
	    fputs("Cilk warning: ", stderr);

	    va_start(args, fmt);
	    vfprintf(stderr, fmt, args);
	    va_end(args);

	    fflush(stderr);
	}

	// Start-up hooks: each variant below does nothing but call
	// __cilkrts_init_tls_variables().
	#ifdef __VXWORKS__
	#ifdef _WRS_KERNEL
	// VxWorks kernel build: a plain exported function.
	// NOTE(review): nothing in this file calls cilkStart(); presumably the
	// kernel start-up code invokes it -- confirm.
	void cilkStart()
	{
	    __cilkrts_init_tls_variables();
	}
	#else
	// VxWorks non-kernel build.
	// NOTE(review): _WRS_CONSTRUCTOR presumably registers cilkInit as a
	// constructor with priority 100 -- confirm against the VxWorks headers.
	_WRS_CONSTRUCTOR(cilkInit, 100)
	{
	    __cilkrts_init_tls_variables();
	}
	#endif
	#else
	// All other platforms: the GCC constructor attribute makes init_once run
	// automatically before main() (or when the shared library is loaded).
	static void __attribute__((constructor)) init_once()
	{
	    // The next line is a disabled call.  Its comment delimiters had been
	    // stripped, which left a syntax error, so they are restored here.
	    /*__cilkrts_debugger_notification_internal(CILK_DB_RUNTIME_LOADED);*/
	    __cilkrts_init_tls_variables();
	}
	#endif


	#define PAGE 4096
	#define CILK_MIN_STACK_SIZE (4*PAGE)
	// Default size for the stacks that we create in Cilk for Unix.
	#define CILK_DEFAULT_STACK_SIZE 0x100000

	// The default is returned unmodified, so it must already be page aligned.
	// Checked at compile time because both operands are constants.
	#if (CILK_DEFAULT_STACK_SIZE % PAGE) != 0
	# error "CILK_DEFAULT_STACK_SIZE must be a multiple of PAGE"
	#endif

	/*
	 * Convert the user's specified stack size into a "reasonable" value
	 * for this OS.
	 *
	 *   - 0 selects CILK_DEFAULT_STACK_SIZE.
	 *   - Anything up to CILK_MIN_STACK_SIZE becomes CILK_MIN_STACK_SIZE.
	 *   - Larger values are rounded up to the next multiple of PAGE.  A value
	 *     so large that rounding up would overflow size_t is clamped to the
	 *     largest page-aligned size_t instead of wrapping around to 0.
	 */
	size_t cilkos_validate_stack_size(size_t specified_stack_size) {
	    const size_t page = PAGE;
	    // Largest multiple of PAGE representable in size_t.
	    const size_t max_aligned = ((size_t)-1 / page) * page;
	    size_t remainder;

	    // size_t is unsigned, so "unspecified" is represented by 0.
	    if (specified_stack_size == 0) {
	        return CILK_DEFAULT_STACK_SIZE;
	    }
	    // Round values in between 0 and CILK_MIN_STACK_SIZE up to
	    // CILK_MIN_STACK_SIZE.
	    if (specified_stack_size <= CILK_MIN_STACK_SIZE) {
	        return CILK_MIN_STACK_SIZE;
	    }

	    remainder = specified_stack_size % page;
	    if (remainder == 0) {
	        return specified_stack_size;
	    }
	    // Unaligned and already above the last page boundary: rounding up
	    // would overflow, so clamp to that boundary.
	    if (specified_stack_size > max_aligned) {
	        return max_aligned;
	    }
	    // Round the user's stack size value up to nearest page boundary.
	    return specified_stack_size + (page - remainder);
	}

	/* Atomically add x to *p and return the resulting (new) value of *p. */
	long cilkos_atomic_add(volatile long* p, long x)
	{
	    long updated = __sync_add_and_fetch(p, x);
	    return updated;
	}

	/* End os-unix.c */