| /* global_state.cpp -*-C++-*- |
| * |
| ************************************************************************* |
| * |
| * @copyright |
| * Copyright (C) 2009-2013, Intel Corporation |
| * All rights reserved. |
| * |
| * @copyright |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in |
| * the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of Intel Corporation nor the names of its |
| * contributors may be used to endorse or promote products derived |
| * from this software without specific prior written permission. |
| * |
| * @copyright |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS |
| * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
| * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY |
| * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| **************************************************************************/ |
| |
| #include "global_state.h" |
| #include "os.h" |
| #include "bug.h" |
| #include "metacall_impl.h" |
| #include "stats.h" |
| #include "cilk/cilk_api.h" |
| #include "cilk_malloc.h" |
| #include "record-replay.h" |
| |
| #include <algorithm> // For max() |
| #include <cstring> |
| #include <cstdlib> |
| #include <climits> |
| #include <cerrno> |
| |
| #ifdef _WIN32 |
| # include <wchar.h> |
| #endif |
| |
| // TBD: There is a race when multiple threads try to initialize the |
| // user_settable_values?? |
| // |
| // Set to true if the user settable values portion of the global state |
| // singleton is initialized, even if the rest of the singleton is not |
| // initialized. |
| int cilkg_user_settable_values_initialized = false; |
| |
| namespace { |
| |
| // Single copy of the global state. Zero-filled until |
| // cilkg_get_user_settable_values() is called and partially-zero-filled until |
| // cilkg_init_global_state() is called. The first field is filled in with |
| // the size of a void* for the debugger and must be valid before initialization |
| global_state_t global_state_singleton = |
| { |
| sizeof(void *), // addr_size |
| }; |
| |
| |
| // Variables that need to export C-style names |
| extern "C" |
| { |
| // Pointer to the global state singleton. |
| global_state_t *cilkg_singleton_ptr = NULL; |
| |
| // __cilkrts_global_state is exported and referenced by the debugger. |
| // The debugger expects it to be valid when the module loads. |
| // CILK_EXPORT_DATA |
| global_state_t *__cilkrts_global_state = &global_state_singleton; |
| } |
| |
| // Returns true if 'a' and 'b' are equal null-terminated strings |
| inline bool strmatch(const char* a, const char* b) |
| { |
| return 0 == std::strcmp(a, b); |
| } |
| |
| // Returns the integer value represented by the null-terminated string at 's'. |
| inline long to_long(const char* s) |
| { |
| char *end; |
| |
| errno = 0; |
| return std::strtol(s, &end, 0); |
| } |
| |
| #ifdef _WIN32 |
| // Returns true if 'a' and 'b' are equal null-terminated wide-char strings |
| inline bool strmatch(const wchar_t* a, const wchar_t* b) |
| { |
| return 0 == wcscmp(a, b); |
| } |
| |
| // Returns true if the multi-byte character string at 'a' represents the same |
| // character sequence as the wide-character string at 'b'. The behavior is |
| // undefined if 'a' contains more than 30 multi-byte characters. |
| bool strmatch(const char* a, const wchar_t* b) |
| { |
| // Convert 'a' to wide-characters, then compare. |
| wchar_t wa[31]; |
| std::size_t count; |
| errno_t err = mbstowcs_s(&count, wa, a, 30); |
| CILK_ASSERT(0 == err); |
| if (err) return false; |
| return strmatch(wa, b); |
| } |
| |
| // Returns true if the wide-character string at 'a' represents the same |
| // character sequence as the multi-byte character string at 'b'. The behavior |
| // id undefined if 'b' contains more than 30 multi-byte characters. |
| inline |
| bool strmatch(const wchar_t* a, const char* b) |
| { |
| return strmatch(b, a); |
| } |
| |
| |
| // Returns the integer value represented by the null-terminated wide-char |
| // string at 's'. |
| inline long to_long(const wchar_t* s) |
| { |
| wchar_t *end; |
| |
| errno = 0; |
| return wcstol(s, &end, 0); |
| } |
| #endif |
| |
| // Check if Cilkscreen or other sequential ptool wants to force reducers. |
| bool always_force_reduce() |
| { |
| // Metacall *looks* like a no-op. volatile needed to keep compiler from |
| // optimizing away variable. |
| volatile char not_force_reduce = '\377'; |
| __cilkrts_metacall(METACALL_TOOL_SYSTEM, HYPER_ZERO_IF_FORCE_REDUCE, |
| const_cast<char*>(¬_force_reduce)); |
| return ! not_force_reduce; |
| } |
| |
| // Stores the boolean value represented by the null-terminated string at 'val' |
| // into the integer object at 'out'. Returns '__CILKRTS_SET_PARAM_SUCCESS' if |
| // 'val' is "true", "false", "0" or "1" and '__CILKRTS_SET_PARAM_INVALID' |
| // otherwise. |
| template <typename INT_T, typename CHAR_T> |
| int store_bool(INT_T *out, const CHAR_T *val) |
| { |
| static const char* const s_zero = "0"; |
| static const char* const s_one = "1"; |
| static const char* const s_true = "true"; |
| static const char* const s_false = "false"; |
| |
| if (val == 0) |
| return __CILKRTS_SET_PARAM_INVALID; |
| |
| if (strmatch(s_false, val) || strmatch(s_zero, val)) { |
| *out = 0; |
| return __CILKRTS_SET_PARAM_SUCCESS; |
| } |
| |
| if (strmatch(s_true, val) || strmatch(s_one, val)) { |
| *out = 1; |
| return __CILKRTS_SET_PARAM_SUCCESS; |
| } |
| |
| return __CILKRTS_SET_PARAM_INVALID; |
| } |
| |
| // Stores the integer value represented by the null-terminated string at 'val' |
| // into the integer object at 'out', restricting the result to the range 'min' |
| // to 'max', inclusive. Returns '__CILKRTS_SET_PARAM_SUCCESS' if the conversion |
| // succeeds and is in range, '__CILKRTS_SET_PARAM_XRANGE' if the conversion |
| // succeeds but is out of range, and '__CILKRTS_SET_PARAM_INVALID' otherwise. In |
| // the case of any error, '*out' is unchanged. |
| template <typename INT_T, typename CHAR_T> |
| int store_int(INT_T *out, const CHAR_T *val, INT_T min, INT_T max) |
| { |
| errno = 0; |
| long val_as_long = to_long(val); |
| if (val_as_long == 0 && errno != 0) |
| return __CILKRTS_SET_PARAM_INVALID; |
| if (val_as_long < min || val_as_long == LONG_MIN) |
| return __CILKRTS_SET_PARAM_XRANGE; |
| else if (val_as_long > max || val_as_long == LONG_MAX) |
| return __CILKRTS_SET_PARAM_XRANGE; |
| |
| *out = val_as_long; |
| return __CILKRTS_SET_PARAM_SUCCESS; |
| } |
| |
| // Implementaton of cilkg_set_param templatized on character type. |
| // Windows will instantiate with both char and wchar_t. |
| // Note that g must have its user settable values set, but need not be fully |
| // initialized. |
| template <class CHAR_T> |
| int set_param_imp(global_state_t* g, const CHAR_T* param, const CHAR_T* value) |
| { |
| static const char* const s_force_reduce = "force reduce"; |
| static const char* const s_nworkers = "nworkers"; |
| static const char* const s_max_user_workers = "max user workers"; |
| static const char* const s_local_stacks = "local stacks"; |
| static const char* const s_shared_stacks = "shared stacks"; |
| static const char* const s_nstacks = "nstacks"; |
| static const char* const s_stack_size = "stack size"; |
| |
| // We must have a parameter and a value |
| if (0 == param) |
| return __CILKRTS_SET_PARAM_INVALID; |
| if (0 == value) |
| return __CILKRTS_SET_PARAM_INVALID; |
| |
| if (strmatch(param, s_force_reduce)) |
| { |
| // Sets whether we force a reduce operation at every sync. Useful for |
| // debugging reducers. Off by default. Overridden by Cilkscreen |
| // |
| // Documented in cilk_api_<os>.h |
| if (always_force_reduce()) |
| // Force reduce is set by cilkscreen. User cannot change it. |
| return __CILKRTS_SET_PARAM_LATE; |
| |
| return store_bool(&g->force_reduce, value); |
| } |
| else if (strmatch(param, s_nworkers)) |
| { |
| // Set the total number of workers. Overrides count of cores we get |
| // from the OS and the setting of the CILK_NWORKERS environment |
| // variable. Setting to 0 indicates that the default worker count |
| // should be used. |
| // |
| // Documented in cilk_api_<os>.h |
| if (cilkg_singleton_ptr) |
| return __CILKRTS_SET_PARAM_LATE; |
| |
| // Fetch the number of cores. There must be at last 1, since we're |
| // executing on *something*, aren't we!? |
| int hardware_cpu_count = __cilkrts_hardware_cpu_count(); |
| CILK_ASSERT(hardware_cpu_count > 0); |
| |
| int max_cpu_count = 16 * hardware_cpu_count; |
| if (__cilkrts_running_under_sequential_ptool()) |
| { |
| hardware_cpu_count = 1; |
| max_cpu_count = 1; |
| } |
| // Allow a value of 0, which means "set to hardware thread count". |
| int ret = store_int(&g->P, value, 0, max_cpu_count); |
| if (0 == g->P) |
| g->P = hardware_cpu_count; |
| return ret; |
| } |
| else if (strmatch(param, s_max_user_workers)) |
| { |
| // ** UNDOCUMENTED ** |
| // |
| // Sets the number of slots allocated for user worker threads |
| int hardware_cpu_count = __cilkrts_hardware_cpu_count(); |
| CILK_ASSERT (hardware_cpu_count > 0); |
| |
| return store_int(&g->max_user_workers, value, 1, |
| 16 * hardware_cpu_count); |
| } |
| else if (strmatch(param, s_local_stacks)) |
| { |
| // ** UNDOCUMENTED ** |
| // |
| // Number of stacks we'll hold in the per-worker stack cache. Maximum |
| // value is 42. See __cilkrts_make_global_state for details. |
| return store_int(&g->fiber_pool_size, value, 0, 42); |
| } |
| else if (strmatch(param, s_shared_stacks)) |
| { |
| // ** UNDOCUMENTED ** |
| // |
| // Maximum number of stacks we'll hold in the global stack |
| // cache. Maximum value is 42. See __cilkrts_make_global_state for |
| // details. |
| return store_int(&g->global_fiber_pool_size, value, 0, 42); |
| } |
| else if (strmatch(param, s_nstacks)) |
| { |
| // Sets the maximum number of stacks permitted at one time. If the |
| // runtime reaches this maximum, it will cease to allocate stacks and |
| // the app will lose parallelism. 0 means unlimited. Default is |
| // unlimited. Minimum is twice the number of worker threads, though |
| // that cannot be tested at this time. |
| // |
| // Undocumented at this time, though there are plans to expose it. |
| // The current implentation is for Linux debugging only and is not |
| // robust enough for users. |
| if (cilkg_singleton_ptr) |
| return __CILKRTS_SET_PARAM_LATE; |
| return store_int<unsigned>(&g->max_stacks, value, 0, INT_MAX); |
| } |
| else if (strmatch(param, s_stack_size)) |
| { |
| // ** UNDOCUMENTED ** |
| // |
| // Sets the size (in bytes) of the stacks that Cilk creates. |
| // Can only be set before the runtime starts. |
| if (cilkg_singleton_ptr) |
| return __CILKRTS_SET_PARAM_LATE; |
| |
| // Maximum value that can be parsed is MAX_INT (32-bit). |
| int ret = store_int<size_t>(&g->stack_size, value, 0, INT_MAX); |
| |
| // Process the value the user set (or 0 if the user didn't set |
| // anything) into something nice for the current OS. This |
| // processing is done immediately and stored into |
| // g->stack_size so that a call to get stack size will return |
| // the value that the runtime will actually use. |
| g->stack_size = cilkos_validate_stack_size(g->stack_size); |
| return ret; |
| } |
| |
| |
| // If got here, then didn't match any of the strings |
| return __CILKRTS_SET_PARAM_UNIMP; |
| } |
| |
| inline |
| int calc_max_user_workers(global_state_t *g) |
| { |
| // If it's been set by the user, give back what we got |
| if (g->max_user_workers > 0) |
| return g->max_user_workers; |
| |
| // Calculate it |
| return std::max(3, g->P * 2); |
| } |
| |
| } // end unnamed namespace |
| |
| __CILKRTS_BEGIN_EXTERN_C |
| |
| /** |
| * @brief Returns the global state object. If called for the first time, |
| * initializes the user-settable values in the global state, but does not |
| * initialize the rest of the structure. |
| */ |
| global_state_t* cilkg_get_user_settable_values() |
| { |
| // Environment variable value. More than big enough for a 64-bit signed |
| // integer. |
| char envstr[24]; |
| |
| // Abbreviating &global_state_singleton as g is not only shorter, it also |
| // facilitates grepping for the string "g->", which appears ubiquitously |
| // in the runtime code. |
| global_state_t* g = &global_state_singleton; |
| |
| // TBD: We need synchronization around this loop to prevent |
| // multiple threads from initializing this data. |
| if (! cilkg_user_settable_values_initialized) |
| { |
| size_t len; |
| |
| // Preserve stealing disabled since it may have been set by the |
| // debugger |
| int stealing_disabled = g->stealing_disabled; |
| |
| // All fields will be zero until set. In particular |
| std::memset(g, 0, sizeof(global_state_t)); |
| |
| // Fetch the number of cores. There must be at last 1, since we're |
| // executing on *something*, aren't we!? |
| int hardware_cpu_count = __cilkrts_hardware_cpu_count(); |
| CILK_ASSERT(hardware_cpu_count > 0); |
| |
| bool under_ptool = __cilkrts_running_under_sequential_ptool(); |
| if (under_ptool) |
| hardware_cpu_count = 1; |
| |
| g->stealing_disabled = stealing_disabled; |
| g->under_ptool = under_ptool; |
| g->force_reduce = 0; // Default Off |
| g->P = hardware_cpu_count; // Defaults to hardware CPU count |
| g->max_user_workers = 0; // 0 unless set by user |
| g->fiber_pool_size = 7; // Arbitrary default |
| |
| g->global_fiber_pool_size = 3 * 3* g->P; // Arbitrary default |
| // 3*P was the default size of the worker array (including |
| // space for extra user workers). This parameter was chosen |
| // to match previous versions of the runtime. |
| |
| if (4 == sizeof(void *)) |
| g->max_stacks = 1200; // Only 1GB on 32-bit machines |
| else |
| g->max_stacks = 2400; // 2GB on 64-bit machines |
| |
| // If we have 2400 1MB stacks, that is 2 gb. If we reach this |
| // limit on a single-socket machine, we may have other |
| // problems. Is 2400 too small for large multicore machines? |
| |
| // TBD(jsukha, 11/27/2012): I set this limit on stacks to be a |
| // value independent of P. When running on a Xeon Phi with |
| // small values of P, I recall seeing a few microbenchmarks |
| // (e.g., fib) where a limit of 10*P seemed to be |
| // unnecessarily slowing things down. |
| // |
| // That being said, the code has changed sufficiently that |
| // this observation may no longer be true. |
| // |
| // Note: in general, the worst-case number of stacks required |
| // for a Cilk computation with spawn depth "d" on P workers is |
| // O(Pd). Code with unbalanced recursion may run into issues |
| // with this stack usage. |
| |
| g->max_steal_failures = 128; // TBD: depend on max_workers? |
| g->stack_size = 0; // 0 unless set by the user |
| |
| // Assume no record or replay log for now |
| g->record_replay_file_name = NULL; |
| g->record_or_replay = RECORD_REPLAY_NONE; // set by user |
| |
| if (always_force_reduce()) |
| g->force_reduce = true; |
| else if (cilkos_getenv(envstr, sizeof(envstr), "CILK_FORCE_REDUCE")) |
| store_bool(&g->force_reduce, envstr); |
| |
| if (under_ptool) |
| g->P = 1; // Ignore environment variable if under cilkscreen |
| else if (cilkos_getenv(envstr, sizeof(envstr), "CILK_NWORKERS")) |
| // Set P to environment variable, but limit to no less than 1 |
| // and no more than 16 times the number of hardware threads. |
| store_int(&g->P, envstr, 1, 16 * hardware_cpu_count); |
| |
| if (cilkos_getenv(envstr, sizeof(envstr), "CILK_MAX_USER_WORKERS")) |
| // Set max_user_workers to environment variable, but limit to no |
| // less than 1 and no more 16 times the number of hardware |
| // threads. If not specified, defaults (somewhat arbitrarily) to |
| // the larger of 3 and twice the number of hardware threads. |
| store_int(&g->max_user_workers, envstr, 1, 16*hardware_cpu_count); |
| |
| if (cilkos_getenv(envstr, sizeof(envstr), "CILK_STEAL_FAILURES")) |
| // Set the number of times a worker should fail to steal before |
| // it looks to see whether it should suspend itself. |
| store_int<unsigned>(&g->max_steal_failures, envstr, 1, INT_MAX); |
| |
| // Compute the total number of workers to allocate. Subtract one from |
| // nworkers and user workers so that the first user worker isn't |
| // factored in twice. |
| // |
| // total_workers must be computed now to support __cilkrts_get_total_workers |
| g->total_workers = g->P + calc_max_user_workers(g) - 1; |
| |
| #ifdef CILK_RECORD_REPLAY |
| // RecordReplay: See if we've been asked to replay a log |
| len = cilkos_getenv(envstr, 0, "CILK_REPLAY_LOG"); |
| if (len > 0) |
| { |
| len += 1; // Allow for trailing NUL |
| g->record_or_replay = REPLAY_LOG; |
| g->record_replay_file_name = (char *)__cilkrts_malloc(len); |
| cilkos_getenv(g->record_replay_file_name, len, "CILK_REPLAY_LOG"); |
| } |
| |
| // RecordReplay: See if we've been asked to record a log |
| len = cilkos_getenv(envstr, 0, "CILK_RECORD_LOG"); |
| if (len > 0) |
| { |
| if (RECORD_REPLAY_NONE != g->record_or_replay) |
| cilkos_warning("CILK_RECORD_LOG ignored since CILK_REPLAY_LOG is defined.\n"); |
| else |
| { |
| len += 1; // Allow for trailing NUL |
| g->record_or_replay = RECORD_LOG; |
| g->record_replay_file_name = (char *)__cilkrts_malloc(len); |
| cilkos_getenv(g->record_replay_file_name, len, "CILK_RECORD_LOG"); |
| } |
| } |
| #endif |
| |
| cilkg_user_settable_values_initialized = true; |
| } |
| |
| return g; |
| } |
| |
| int cilkg_calc_total_workers() |
| { |
| global_state_t* g = cilkg_get_user_settable_values(); |
| |
| // Compute the total number of workers to allocate. Subtract one from |
| // nworkers and user workers so that the first user worker isn't |
| // factored in twice. |
| return g->P + calc_max_user_workers(g) - 1; |
| } |
| |
| // Should be called while holding the global lock. |
| global_state_t* cilkg_init_global_state() |
| { |
| if (cilkg_singleton_ptr) |
| return cilkg_singleton_ptr; |
| |
| // Get partially-initialized global state. |
| global_state_t* g = cilkg_get_user_settable_values(); |
| |
| if (g->max_stacks > 0) { |
| |
| // nstacks is currently honored on non-Windows systems only. |
| |
| // Set an upper bound on the number of stacks that are allocated. If |
| // nstacks is set, each worker gets up to one stack in its cache so that |
| // no one worker can hog all of the free stacks and keep work from being |
| // stolen by the other workers. |
| |
| // nstacks corresponds to the number of stacks that will be allocated by |
| // the runtime apart from the initial stack created for each thread by |
| // the system. Therefore, if a user asks for n stacks, and there are |
| // p workers created, the total number of stacks is actually n + p. |
| |
| // This feature is primarily for MIC which has flat memory |
| // instead of virtual addresses and tends to run out really quickly. |
| // It is not implemented for Windows and it's non-intuitive |
| // interaction with the local stack cache is specifically to help out |
| // MIC. |
| |
| // About max_stacks / P stacks, except we require at least 1 |
| // per pool. |
| if (((int)g->max_stacks / g->P) < g->fiber_pool_size) |
| g->fiber_pool_size = g->max_stacks / g->P; |
| |
| if (g->fiber_pool_size <= 0) { |
| g->fiber_pool_size = 1; |
| } |
| |
| if ((int)g->max_stacks < g->P) |
| g->max_stacks = g->P; |
| |
| g->global_fiber_pool_size = g->P * (g->fiber_pool_size+1); |
| } |
| |
| // Number of bytes/address - validation for debugger integration |
| g->addr_size = sizeof(void *); |
| |
| __cilkrts_init_stats(&g->stats); |
| |
| __cilkrts_frame_malloc_global_init(g); |
| |
| g->Q = 0; |
| g->total_workers = cilkg_calc_total_workers(); |
| g->system_workers = g->P - 1; // system_workers is here for the debugger. |
| g->work_done = 0; |
| g->workers_running = 0; |
| g->ltqsize = 1024; /* FIXME */ |
| |
| g->stack_size = cilkos_validate_stack_size(g->stack_size); |
| g->failure_to_allocate_stack = 0; |
| |
| |
| return g; |
| } |
| |
| void cilkg_publish_global_state(global_state_t* g) |
| { |
| |
| // TBD: which one of these needs to be executed first? I say |
| // cilkg_singleton_ptr needs to be set last, with a mfence in |
| // between, since it is the flag that cilkg_is_published_is |
| // checking for. |
| __cilkrts_global_state = g; |
| __cilkrts_fence(); |
| cilkg_singleton_ptr = g; |
| } |
| |
| void cilkg_deinit_global_state() |
| { |
| cilkg_singleton_ptr = NULL; |
| __cilkrts_global_state = NULL; |
| } |
| |
| int cilkg_is_published(void) |
| { |
| return NULL != cilkg_singleton_ptr; |
| } |
| |
| int cilkg_set_param(const char* param, const char* value) |
| { |
| return set_param_imp(cilkg_get_user_settable_values(), param, value); |
| } |
| |
| #ifdef _WIN32 |
| int cilkg_set_param_w(const wchar_t* param, const wchar_t* value) |
| { |
| return set_param_imp(cilkg_get_user_settable_values(), param, value); |
| } |
| #endif |
| |
| extern "C++" { |
| // C++ scheduler function (that may throw exceptions) |
| typedef void cpp_scheduler_t(__cilkrts_worker *w); |
| } |
| |
| void __cilkrts_run_scheduler_with_exceptions(__cilkrts_worker *w) |
| { |
| global_state_t* g = cilkg_get_global_state(); |
| CILK_ASSERT(g->scheduler); |
| |
| cpp_scheduler_t* scheduler = (cpp_scheduler_t*) g->scheduler; |
| |
| try { |
| scheduler(w); |
| } catch (...) { |
| __cilkrts_bug("Exception escaped Cilk context"); |
| } |
| } |
| |
| __CILKRTS_END_EXTERN_C |
| |
| /* End global_state.cpp */ |