| /* os-unix.c -*-C-*- |
| * |
| ************************************************************************* |
| * |
| * Copyright (C) 2009-2016, Intel Corporation |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in |
| * the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of Intel Corporation nor the names of its |
| * contributors may be used to endorse or promote products derived |
| * from this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS |
| * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
| * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY |
| * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| * |
| * ********************************************************************* |
| * |
| * PLEASE NOTE: This file is a downstream copy of a file maintained in |
| * a repository at cilkplus.org. Changes made to this file that are not |
| * submitted through the contribution process detailed at |
| * http://www.cilkplus.org/submit-cilk-contribution will be lost the next |
| * time that a new version is released. Changes only submitted to the |
| * GNU compiler collection or posted to the git repository at |
| * https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are |
| * not tracked. |
| * |
| * We welcome your contributions to this open source project. Thank you |
| * for your assistance in helping us improve Cilk Plus. |
| **************************************************************************/ |
| |
| #include "os.h" |
| #include "bug.h" |
| #include "cilk_malloc.h" |
| #include <internal/abi.h> |
| |
| #if defined __linux__ |
| # include <sys/sysinfo.h> |
| # include <sys/syscall.h> |
| |
| #elif defined __APPLE__ |
| # include <sys/sysctl.h> |
| // Provides sysctlbyname(), which is used below to count logical CPUs |
| |
| #elif defined __VXWORKS__ |
| # include <vxWorks.h> |
| # include <vxCpuLib.h> |
| # include <taskLib.h> |
| |
| // Solaris |
| #elif defined __sun__ && defined __svr4__ |
| # include <sched.h> |
| |
| // OSes we know about which don't require any additional files |
| #elif defined __CYGWIN__ || \ |
| defined __DragonFly__ || \ |
| defined __FreeBSD__ || \ |
| defined __GNU__ |
| // No additional include files |
| |
| #else |
| # error "Unsupported OS" |
| #endif |
| |
| #include <stdarg.h> |
| #include <stddef.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <unistd.h> |
| #include <pthread.h> |
| #include <sys/types.h> |
| |
| |
| |
| // /* Thread-local storage */ |
| // #ifdef _WIN32 |
| // typedef unsigned cilkos_tls_key_t; |
| // #else |
| // typedef pthread_key_t cilkos_tls_key_t; |
| // #endif |
| // cilkos_tls_key_t cilkos_allocate_tls_key(); |
| // void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr); |
| // void* cilkos_get_tls_pointer(cilkos_tls_key_t key); |
| |
| #if !defined CILK_WORKER_TLS |
| static int cilk_keys_defined; |
| static pthread_key_t worker_key, pedigree_leaf_key, tbb_interop_key; |
| |
| #if SUPPORT_GET_CURRENT_FIBER > 0 |
| static pthread_key_t fiber_key; |
| #endif |
| |
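| // Fallback slot for the worker pointer, used while running serially |
| // before the pthread keys have been created. |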
| static void *serial_worker; |
| |
| |
| // This destructor is called when a pthread dies to deallocate the |
| // pedigree node. |
| static void __cilkrts_pedigree_leaf_destructor(void* pedigree_tls_ptr) |
| { |
| __cilkrts_pedigree* pedigree_tls |
| = (__cilkrts_pedigree*)pedigree_tls_ptr; |
| if (pedigree_tls) { |
| // Assert that we have either one or two nodes |
| // left in the pedigree chain. |
| // If we have more, then something is going wrong... |
| CILK_ASSERT(!pedigree_tls->parent || !pedigree_tls->parent->parent); |
| __cilkrts_free(pedigree_tls); |
| } |
| } |
| |
| void __cilkrts_init_tls_variables(void) |
| { |
| int status; |
| /* This will be called once in serial execution before any |
| Cilk parallelism so we do not need to worry about races |
| on cilk_keys_defined. */ |
| if (cilk_keys_defined) |
| return; |
| status = pthread_key_create(&worker_key, NULL); |
| CILK_ASSERT (status == 0); |
| status = pthread_key_create(&pedigree_leaf_key, |
| __cilkrts_pedigree_leaf_destructor); |
| CILK_ASSERT (status == 0); |
| status = pthread_key_create(&tbb_interop_key, NULL); |
| CILK_ASSERT (status == 0); |
| |
| #if SUPPORT_GET_CURRENT_FIBER > 0 |
| status = pthread_key_create(&fiber_key, NULL); |
| CILK_ASSERT (status == 0); |
| #endif |
| cilk_keys_defined = 1; |
| return; |
| } |
| |
| COMMON_SYSDEP |
| void* cilkos_get_current_thread_id(void) |
| { |
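| // pthread_t is an arithmetic or pointer type on the platforms we |
| // support, so the cast to void* below is safe even though POSIX does |
| // not guarantee it in general. |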
| return (void*)pthread_self(); |
| } |
| |
| |
| CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker() |
| { |
| if (__builtin_expect(cilk_keys_defined, 1)) |
| return (__cilkrts_worker *)pthread_getspecific(worker_key); |
| else |
| return serial_worker; |
| } |
| |
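| // "Fast" variant of __cilkrts_get_tls_worker(): it skips the |
| // cilk_keys_defined check, so it is only safe to call once the TLS keys |
| // are known to have been created. |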
| CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker_fast() |
| { |
| return (__cilkrts_worker *)pthread_getspecific(worker_key); |
| } |
| |
| COMMON_SYSDEP |
| __cilk_tbb_stack_op_thunk *__cilkrts_get_tls_tbb_interop(void) |
| { |
| if (__builtin_expect(cilk_keys_defined, 1)) |
| return (__cilk_tbb_stack_op_thunk *) |
| pthread_getspecific(tbb_interop_key); |
| else |
| return 0; |
| } |
| |
| // Counter used to assign a unique rank to each user thread's pedigree |
| // root node. It starts at -1 so that the first __sync_add_and_fetch() |
| // below yields rank 0, and it must only be updated atomically. |
| static int __cilkrts_global_pedigree_tls_counter = -1; |
| |
| COMMON_SYSDEP |
| __cilkrts_pedigree *__cilkrts_get_tls_pedigree_leaf(int create_new) |
| { |
| __cilkrts_pedigree *pedigree_tls; |
| if (__builtin_expect(cilk_keys_defined, 1)) { |
| pedigree_tls = |
| (struct __cilkrts_pedigree *)pthread_getspecific(pedigree_leaf_key); |
| } |
| else { |
| return 0; |
| } |
| |
| if (!pedigree_tls && create_new) { |
| // This call creates two nodes, X and Y. |
| // X == pedigree_tls[0] is the leaf node, which gets copied |
| // in and out of a user worker w when w binds and unbinds. |
| // Y == pedigree_tls[1] is the root node, |
| // which is a constant node that represents the user worker |
| // thread w. |
| pedigree_tls = (__cilkrts_pedigree*) |
| __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree)); |
| |
| // This call sets the TLS pointer to the new node. |
| __cilkrts_set_tls_pedigree_leaf(pedigree_tls); |
| |
| pedigree_tls[0].rank = 0; |
| pedigree_tls[0].parent = &pedigree_tls[1]; |
| |
| // Create Y, whose rank begins as the global counter value. |
| pedigree_tls[1].rank = |
| __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter, 1); |
| |
| pedigree_tls[1].parent = NULL; |
| CILK_ASSERT(pedigree_tls[1].rank != -1); |
| } |
| return pedigree_tls; |
| } |
| |
| #if SUPPORT_GET_CURRENT_FIBER > 0 |
| COMMON_SYSDEP |
| cilk_fiber_sysdep* cilkos_get_tls_cilk_fiber(void) |
| { |
| if (__builtin_expect(cilk_keys_defined, 1)) |
| return (cilk_fiber_sysdep *)pthread_getspecific(fiber_key); |
| else |
| return NULL; |
| } |
| #endif |
| |
| COMMON_SYSDEP |
| void __cilkrts_set_tls_worker(__cilkrts_worker *w) |
| { |
| if (__builtin_expect(cilk_keys_defined, 1)) { |
| int status; |
| status = pthread_setspecific(worker_key, w); |
| CILK_ASSERT (status == 0); |
| return; |
| } |
| else |
| { |
| serial_worker = w; |
| } |
| } |
| |
| COMMON_SYSDEP |
| void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk *t) |
| { |
| if (__builtin_expect(cilk_keys_defined, 1)) { |
| int status; |
| status = pthread_setspecific(tbb_interop_key, t); |
| CILK_ASSERT (status == 0); |
| return; |
| } |
| abort(); |
| } |
| |
| COMMON_SYSDEP |
| void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree* pedigree_leaf) |
| { |
| if (__builtin_expect(cilk_keys_defined, 1)) { |
| int status; |
| status = pthread_setspecific(pedigree_leaf_key, pedigree_leaf); |
| CILK_ASSERT (status == 0); |
| return; |
| } |
| abort(); |
| } |
| |
| #if SUPPORT_GET_CURRENT_FIBER > 0 |
| COMMON_SYSDEP |
| void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep* fiber) |
| { |
| if (__builtin_expect(cilk_keys_defined, 1)) { |
| int status; |
| status = pthread_setspecific(fiber_key, fiber); |
| CILK_ASSERT (status == 0); |
| return; |
| } |
| abort(); |
| } |
| #endif |
| |
| #else |
| void __cilkrts_init_tls_variables(void) |
| { |
| } |
| #endif |
| |
| #if defined (__linux__) && ! defined(__ANDROID__) |
| /* |
| * Get the thread id, rather than the pid. In the case of MIC offload, it's |
| * possible that we have multiple threads entering Cilk, and each has a |
| * different affinity. |
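| * We call syscall(SYS_gettid) directly because glibc did not provide a |
| * gettid() wrapper until glibc 2.30. |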
| */ |
| static pid_t linux_gettid(void) |
| { |
| return syscall(SYS_gettid); |
| } |
| |
| /* |
| * On Linux we look at the thread affinity mask and restrict ourselves to one |
| * thread for each of the hardware contexts to which we are bound. |
| * Therefore if the user does |
| * % taskset -c 0-1 cilkProgram |
| * # restrict execution to hardware contexts zero and one |
| * the Cilk program will only use two threads even if it is running on a |
| * machine that has 32 hardware contexts. |
| * This is the right thing to do, because the threads are restricted to two |
| * hardware contexts by the affinity mask set by taskset, and if we were to |
| * create extra threads they would simply oversubscribe the hardware resources |
| * we can use. |
| * This is particularly important on MIC in offload mode, where the affinity |
| * mask is set by the offload library to force the offload code away from |
| * cores that have offload support threads running on them. |
| */ |
| static int linux_get_affinity_count () |
| { |
| long system_cores = sysconf(_SC_NPROCESSORS_ONLN); |
| int affinity_cores = 0; |
| |
| #if defined HAVE_PTHREAD_AFFINITY_NP |
| |
| #if defined (CPU_ALLOC_SIZE) && ! defined(DONT_USE_CPU_ALLOC_SIZE) |
| // Statically allocated cpu_set_t's max out at 1024 cores. If |
| // CPU_ALLOC_SIZE is available, use it to support large numbers of cores |
| size_t cpusetsize = CPU_ALLOC_SIZE(system_cores); |
| cpu_set_t *process_mask = (cpu_set_t *)__cilkrts_malloc(cpusetsize); |
| |
| // Get the affinity mask for this thread |
| int err = pthread_getaffinity_np(pthread_self(), |
| cpusetsize, |
| process_mask); |
| |
| // Count the available cores. |
| if (0 == err) |
| affinity_cores = CPU_COUNT_S(cpusetsize, process_mask); |
| |
| __cilkrts_free(process_mask); |
| |
| #else |
| // CPU_ALLOC_SIZE isn't available, or this is the Intel compiler build |
| // and we have to support RHEL5. Use a statically allocated cpu_set_t |
| |
| cpu_set_t process_mask; |
| |
| // Extract the thread affinity mask |
| int err = pthread_getaffinity_np(pthread_self(), |
| sizeof(process_mask), |
| &process_mask); |
| |
| if (0 == err) |
| { |
| // We have extracted the mask OK, so now we can count the number of |
| // threads in it. This is linear in the maximum number of CPUs |
| // available. We could do a logarithmic version if we assumed the |
| // format of the mask, but it's not really worth it; we only call |
| // this at thread startup anyway. |
| int i; |
| for (i = 0; i < CPU_SETSIZE; i++) |
| { |
| if (CPU_ISSET(i, &process_mask)) |
| { |
| affinity_cores++; |
| } |
| } |
| } |
| #endif // CPU_ALLOC_SIZE |
| #endif // defined HAVE_PTHREAD_AFFINITY_NP |
| |
| // If we've got a count of cores this thread is supposed to use, that's |
| // the number of cores we'll use. Otherwise, default to the number of |
| // cores on the system. |
| if (0 == affinity_cores) |
| return (int)system_cores; |
| else |
| return affinity_cores; |
| } |
| #endif // defined (__linux__) && ! defined(__ANDROID__) |
| |
| /* |
| * __cilkrts_hardware_cpu_count |
| * |
| * Returns the number of available CPUs on this hardware. The method used |
| * is OS-specific. |
| */ |
| |
| COMMON_SYSDEP int __cilkrts_hardware_cpu_count(void) |
| { |
| #if defined __ANDROID__ || \ |
| defined __CYGWIN__ || \ |
| defined __DragonFly__ || \ |
| defined __FreeBSD__ || \ |
| (defined(__sun__) && defined(__svr4__)) |
| return (int)sysconf(_SC_NPROCESSORS_ONLN); |
| #elif defined __MIC__ |
| /// HACK: Usually, the 3rd and 4th hyperthreads are not beneficial |
| /// on KNC. Also, ignore the last core. |
| int count = (int)sysconf (_SC_NPROCESSORS_ONLN); |
| return count/2 - 2; |
| #elif defined __linux__ |
| return linux_get_affinity_count(); |
| #elif defined __APPLE__ |
| int count; |
| size_t len = sizeof count; |
| int status = sysctlbyname("hw.logicalcpu", &count, &len, 0, 0); |
| assert(0 == status); |
| |
| return count; |
| #elif defined __VXWORKS__ |
| return __builtin_popcount(vxCpuEnabledGet()); |
| #else |
| #error "Unsupported architecture" |
| #endif |
| } |
| |
| COMMON_SYSDEP void __cilkrts_idle(void) |
| { |
| // This is a variant of __cilkrts_yield() used to quiesce workers that |
| // are failing to steal work. |
| #if defined(__ANDROID__) || \ |
| defined(__FreeBSD__) || \ |
| defined(__VXWORKS__) || \ |
| (defined(__sun__) && defined(__svr4__)) |
| sched_yield(); |
| #elif defined(__MIC__) |
| _mm_delay_32(1024); |
| #elif defined(__linux__) || \ |
| defined(__APPLE__) || \ |
| defined(__CYGWIN__) |
| usleep(10000); |
| #else |
| # error "Unsupported architecture" |
| #endif |
| } |
| |
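| // Sleep for a minimal interval: one microsecond, or a single tick on |
| // VxWorks. Unlike __cilkrts_yield(), this briefly blocks the thread |
| // rather than merely giving up the rest of its quantum. |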
| COMMON_SYSDEP void __cilkrts_sleep(void) |
| { |
| #ifdef __VXWORKS__ |
| taskDelay(1); |
| #else |
| usleep(1); |
| #endif |
| } |
| |
| COMMON_SYSDEP void __cilkrts_yield(void) |
| { |
| #if defined(__ANDROID__) || \ |
| defined(__APPLE__) || \ |
| defined(__CYGWIN__) || \ |
| defined(__FreeBSD__) || \ |
| defined(__VXWORKS__) || \ |
| (defined(__sun__) && defined(__svr4__)) |
| // Call sched_yield to yield the quantum. I'm not sure why we |
| // don't do this on Linux also. |
| sched_yield(); |
| #elif defined(__MIC__) |
| // On MIC, pthread_yield() really trashes things. Arch's measurements |
| // showed that calling _mm_delay_32() (or doing nothing) was a better |
| // option. Delaying 1024 clock cycles is a reasonable compromise between |
| // giving up the processor and latency starting up when work becomes |
| // available |
| _mm_delay_32(1024); |
| #elif defined(__linux__) |
| // On Linux, call pthread_yield() (a non-portable GNU extension which |
| // in turn calls sched_yield()) to yield the quantum. |
| pthread_yield(); |
| #else |
| # error "Unsupported architecture" |
| #endif |
| } |
| |
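| /* |
| * Copy the value of the environment variable 'varname' into the buffer |
| * 'value' of capacity 'vallen' bytes, including the terminating NUL. |
| * Returns the length of the value on success, a value larger than |
| * vallen - 1 if the buffer is too small, and 0 (with an empty string |
| * stored) if the variable is unset. |
| * |
| * A hypothetical usage sketch (use_value is not a real runtime function): |
| * |
| * char buf[64]; |
| * size_t n = cilkos_getenv(buf, sizeof buf, "CILK_NWORKERS"); |
| * if (n > 0 && n < sizeof buf) |
| * use_value(buf); |
| */ |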
| COMMON_SYSDEP __STDNS size_t cilkos_getenv(char* value, __STDNS size_t vallen, |
| const char* varname) |
| { |
| CILK_ASSERT(value); |
| CILK_ASSERT(varname); |
| |
| const char* envstr = getenv(varname); |
| if (envstr) |
| { |
| size_t len = cilk_strlen(envstr); |
| // Compare len + 1 > vallen rather than len > vallen - 1 so that a |
| // zero-capacity buffer cannot make the unsigned subtraction wrap. |
| if (len + 1 > vallen) |
| return len + 1; |
| cilk_strcpy_s(value, vallen, envstr); |
| return len; |
| } |
| else |
| { |
| value[0] = '\0'; |
| return 0; |
| } |
| } |
| |
| /* |
| * Unrecoverable error: Print an error message and abort execution. |
| */ |
| COMMON_SYSDEP void cilkos_error(const char *fmt, ...) |
| { |
| va_list l; |
| fflush(NULL); |
| fprintf(stderr, "Cilk error: "); |
| va_start(l, fmt); |
| vfprintf(stderr, fmt, l); |
| va_end(l); |
| fprintf(stderr, "Exiting.\n"); |
| fflush(stderr); |
| |
| abort(); |
| } |
| |
| /* |
| * Print a warning message and return. |
| */ |
| COMMON_SYSDEP void cilkos_warning(const char *fmt, ...) |
| { |
| va_list l; |
| fflush(NULL); |
| fprintf(stderr, "Cilk warning: "); |
| va_start(l, fmt); |
| vfprintf(stderr, fmt, l); |
| va_end(l); |
| fflush(stderr); |
| } |
| |
| #ifdef __VXWORKS__ |
| #ifdef _WRS_KERNEL |
| void cilkStart() |
| { |
| __cilkrts_init_tls_variables(); |
| } |
| #else |
| _WRS_CONSTRUCTOR(cilkInit, 100) |
| { |
| __cilkrts_init_tls_variables(); |
| } |
| #endif |
| #else |
| static void __attribute__((constructor)) init_once() |
| { |
| /*__cilkrts_debugger_notification_internal(CILK_DB_RUNTIME_LOADED);*/ |
| __cilkrts_init_tls_variables(); |
| } |
| #endif |
| |
| |
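| // Assumed page size. Stack sizes are rounded to this granularity rather |
| // than to the value of sysconf(_SC_PAGESIZE) at runtime. |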
| #define PAGE 4096 |
| #define CILK_MIN_STACK_SIZE (4*PAGE) |
| // Default size for the stacks that we create in Cilk for Unix. |
| #define CILK_DEFAULT_STACK_SIZE 0x100000 |
| |
| /* |
| * Convert the user's specified stack size into a "reasonable" value |
| * for this OS. |
| */ |
| size_t cilkos_validate_stack_size(size_t specified_stack_size) { |
| // Convert a zero (unspecified) value to the default. |
| if (specified_stack_size == 0) { |
| CILK_ASSERT((CILK_DEFAULT_STACK_SIZE % PAGE) == 0); |
| return CILK_DEFAULT_STACK_SIZE; |
| } |
| // Round values in between 0 and CILK_MIN_STACK_SIZE up to |
| // CILK_MIN_STACK_SIZE. |
| if (specified_stack_size <= CILK_MIN_STACK_SIZE) { |
| return CILK_MIN_STACK_SIZE; |
| } |
| if ((specified_stack_size % PAGE) > 0) { |
| // Round the user's stack size value up to nearest page boundary. |
| return (PAGE * (1 + specified_stack_size / PAGE)); |
| } |
| return specified_stack_size; |
| } |
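| |
| // Examples of the rounding above: 0 becomes CILK_DEFAULT_STACK_SIZE |
| // (0x100000), 5000 becomes CILK_MIN_STACK_SIZE (16384), and 0x100001 is |
| // rounded up to the next page boundary, 0x101000. |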
| |
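| // Atomically perform *p += x and return the new value. The GCC __sync |
| // builtin implies a full memory barrier. |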
| long cilkos_atomic_add(volatile long* p, long x) |
| { |
| return __sync_add_and_fetch(p, x); |
| } |
| |
| /* End os-unix.c */ |