| /* scheduler.c -*-C-*- |
| * |
| ************************************************************************* |
| * |
| * @copyright |
| * Copyright (C) 2007-2013, Intel Corporation |
| * All rights reserved. |
| * |
| * @copyright |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in |
| * the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of Intel Corporation nor the names of its |
| * contributors may be used to endorse or promote products derived |
| * from this software without specific prior written permission. |
| * |
| * @copyright |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS |
| * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
| * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY |
| * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| * |
| **************************************************************************/ |
| |
| /* |
| * Cilk scheduler |
| */ |
| |
| #include "scheduler.h" |
| #include "bug.h" |
| #include "os.h" |
| #include "os_mutex.h" |
| #include "local_state.h" |
| #include "signal_node.h" |
| #include "full_frame.h" |
| #include "sysdep.h" |
| #include "except.h" |
| #include "cilk_malloc.h" |
| #include "pedigrees.h" |
| #include "record-replay.h" |
| |
| #include <limits.h> |
| #include <string.h> /* memcpy */ |
| #include <stdio.h> // sprintf |
| #include <stdlib.h> // malloc, free, abort |
| |
| #ifdef _WIN32 |
| # pragma warning(disable:1786) // disable warning: sprintf is deprecated |
| # include "sysdep-win.h" |
| # include "except-win32.h" |
| #endif // _WIN32 |
| |
| // ICL: Don't complain about conversion from pointer to same-sized integral |
| // type in __cilkrts_put_stack. That's why we're using ptrdiff_t |
| #ifdef _WIN32 |
| # pragma warning(disable: 1684) |
| #endif |
| |
| #include "cilk/cilk_api.h" |
| #include "frame_malloc.h" |
| #include "metacall_impl.h" |
| #include "reducer_impl.h" |
| #include "cilk-tbb-interop.h" |
| #include "cilk-ittnotify.h" |
| #include "stats.h" |
| |
| // ICL: Don't complain about loss of precision in myrand |
| // I tried restoring the warning after the function, but it didn't |
| // suppress it |
| #ifdef _WIN32 |
| # pragma warning(disable: 2259) |
| #endif |
| |
| #ifndef _WIN32 |
| # include <unistd.h> |
| #endif |
| |
| #ifdef __VXWORKS__ |
| // redeclare longjmp() with noreturn to stop warnings |
| extern __attribute__((noreturn)) |
| void longjmp(jmp_buf, int); |
| #endif |
| |
| //#define DEBUG_LOCKS 1 |
| #ifdef DEBUG_LOCKS |
| // The currently executing worker must own this worker's lock |
| # define ASSERT_WORKER_LOCK_OWNED(w) \ |
| { \ |
| __cilkrts_worker *tls_worker = __cilkrts_get_tls_worker(); \ |
| CILK_ASSERT((w)->l->lock.owner == tls_worker); \ |
| } |
| #else |
| # define ASSERT_WORKER_LOCK_OWNED(w) |
| #endif // DEBUG_LOCKS |
| |
| // Options for the scheduler. |
| enum schedule_t { SCHEDULE_RUN, |
| SCHEDULE_WAIT, |
| SCHEDULE_EXIT }; |
| |
| // Return values for provably_good_steal() |
| enum provably_good_steal_t |
| { |
| ABANDON_EXECUTION, // Not the last child to the sync - attempt to steal work |
| CONTINUE_EXECUTION, // Last child to the sync - continue executing on this worker |
| WAIT_FOR_CONTINUE // The replay log indicates that this was the worker |
| // which continued. Loop until we are the last worker |
| // to the sync. |
| }; |
| |
| |
| // Verify that "w" is the worker we are currently executing on. |
| // Because this check is expensive, this method is usually a no-op. |
| static inline void verify_current_wkr(__cilkrts_worker *w) |
| { |
| #if ((REDPAR_DEBUG >= 3) || (FIBER_DEBUG >= 1)) |
| // Lookup the worker from TLS and compare to w. |
| __cilkrts_worker* tmp = __cilkrts_get_tls_worker(); |
| if (w != tmp) { |
| fprintf(stderr, "Error. W=%d, actual worker =%d...\n", |
| w->self, |
| tmp->self); |
| } |
| CILK_ASSERT(w == tmp); |
| #endif |
| } |
| |
| static enum schedule_t worker_runnable(__cilkrts_worker *w); |
| |
| // Scheduling-fiber functions: |
| static void do_return_from_spawn (__cilkrts_worker *w, |
| full_frame *ff, |
| __cilkrts_stack_frame *sf); |
| static void do_sync (__cilkrts_worker *w, |
| full_frame *ff, |
| __cilkrts_stack_frame *sf); |
| |
| // max is defined on Windows and VxWorks |
| #if (! defined(_WIN32)) && (! defined(__VXWORKS__)) |
| // TBD: definition of max() for Linux. |
| # define max(a, b) ((a) < (b) ? (b) : (a)) |
| #endif |
| |
| void __cilkrts_dump_stats_to_stderr(global_state_t *g) |
| { |
| #ifdef CILK_PROFILE |
| int i; |
| for (i = 0; i < g->total_workers; ++i) { |
| // Print out statistics for each worker. We collected them, |
| // so why not print them out? |
| fprintf(stderr, "Stats for worker %d\n", i); |
| dump_stats_to_file(stderr, g->workers[i]->l->stats); |
| __cilkrts_accum_stats(&g->stats, g->workers[i]->l->stats); |
| } |
| |
| // Also print out aggregate statistics. |
| dump_stats_to_file(stderr, &g->stats); |
| #endif |
| fprintf(stderr, |
| "CILK PLUS Thread Info: P=%d, Q=%d\n", |
| g->P, |
| g->Q); |
| fprintf(stderr, |
| "CILK PLUS RUNTIME MEMORY USAGE: %lld bytes", |
| (long long)g->frame_malloc.allocated_from_os); |
| #ifdef CILK_PROFILE |
| if (g->stats.stack_hwm) |
| fprintf(stderr, ", %ld stacks", g->stats.stack_hwm); |
| #endif |
| fputc('\n', stderr); |
| } |
| |
| static void validate_worker(__cilkrts_worker *w) |
| { |
| /* check the magic numbers, for debugging purposes */ |
| if (w->l->worker_magic_0 != WORKER_MAGIC_0 || |
| w->l->worker_magic_1 != WORKER_MAGIC_1) |
| abort_because_rts_is_corrupted(); |
| } |
| |
| static void double_link(full_frame *left_ff, full_frame *right_ff) |
| { |
| if (left_ff) |
| left_ff->right_sibling = right_ff; |
| if (right_ff) |
| right_ff->left_sibling = left_ff; |
| } |
| |
| /* add CHILD to the right of all children of PARENT */ |
| static void push_child(full_frame *parent_ff, full_frame *child_ff) |
| { |
| double_link(parent_ff->rightmost_child, child_ff); |
| double_link(child_ff, 0); |
| parent_ff->rightmost_child = child_ff; |
| } |
| |
| /* unlink CHILD from the list of all children of PARENT */ |
| static void unlink_child(full_frame *parent_ff, full_frame *child_ff) |
| { |
| double_link(child_ff->left_sibling, child_ff->right_sibling); |
| |
| if (!child_ff->right_sibling) { |
| /* this is the rightmost child -- update parent link */ |
| CILK_ASSERT(parent_ff->rightmost_child == child_ff); |
| parent_ff->rightmost_child = child_ff->left_sibling; |
| } |
| child_ff->left_sibling = child_ff->right_sibling = 0; /* paranoia */ |
| } |
| |
| static void incjoin(full_frame *ff) |
| { |
| ++ff->join_counter; |
| } |
| |
| static int decjoin(full_frame *ff) |
| { |
| CILK_ASSERT(ff->join_counter > 0); |
| return (--ff->join_counter); |
| } |
| |
| static int simulate_decjoin(full_frame *ff) |
| { |
| CILK_ASSERT(ff->join_counter > 0); |
| return (ff->join_counter - 1); |
| } |
| |
| /* |
| * Pseudo-random generator defined by the congruence S' = 69070 * S |
| * mod (2^32 - 5). Marsaglia (CACM July 1993) says on page 107 that |
| * this is a ``good one''. There you go. |
| * |
| * The literature makes a big fuss about avoiding the division, but |
| * for us it is not worth the hassle. |
| */ |
| static const unsigned RNGMOD = ((1ULL << 32) - 5); |
| static const unsigned RNGMUL = 69070U; |
| |
| static unsigned myrand(__cilkrts_worker *w) |
| { |
| unsigned state = w->l->rand_seed; |
| state = (unsigned)((RNGMUL * (unsigned long long)state) % RNGMOD); |
| w->l->rand_seed = state; |
| return state; |
| } |
| |
| static void mysrand(__cilkrts_worker *w, unsigned seed) |
| { |
| seed %= RNGMOD; |
| seed += (seed == 0); /* 0 does not belong to the multiplicative |
| group. Use 1 instead */ |
| w->l->rand_seed = seed; |
| } |
| |
| /* W grabs its own lock */ |
| void __cilkrts_worker_lock(__cilkrts_worker *w) |
| { |
| validate_worker(w); |
| CILK_ASSERT(w->l->do_not_steal == 0); |
| |
| /* tell thieves to stay out of the way */ |
| w->l->do_not_steal = 1; |
| __cilkrts_fence(); /* probably redundant */ |
| |
| __cilkrts_mutex_lock(w, &w->l->lock); |
| } |
| |
| void __cilkrts_worker_unlock(__cilkrts_worker *w) |
| { |
| __cilkrts_mutex_unlock(w, &w->l->lock); |
| CILK_ASSERT(w->l->do_not_steal == 1); |
| /* The fence is probably redundant. Use a release |
       operation when supported (gcc and compatible);
| that is faster on x86 which serializes normal stores. */ |
| #if defined __GNUC__ && (__GNUC__ * 10 + __GNUC_MINOR__ > 43 || __ICC >= 1110) |
| __sync_lock_release(&w->l->do_not_steal); |
| #else |
| w->l->do_not_steal = 0; |
| __cilkrts_fence(); /* store-store barrier, redundant on x86 */ |
| #endif |
| } |
| |
| /* try to acquire the lock of some *other* worker */ |
| static int worker_trylock_other(__cilkrts_worker *w, |
| __cilkrts_worker *other) |
| { |
| int status = 0; |
| |
| validate_worker(other); |
| |
| /* This protocol guarantees that, after setting the DO_NOT_STEAL |
| flag, worker W can enter its critical section after waiting for |
| the thief currently in the critical section (if any) and at |
| most one other thief. |
| |
| This requirement is overly paranoid, but it should protect us |
| against future nonsense from OS implementors. |
| */ |
| |
| /* compete for the right to disturb OTHER */ |
| if (__cilkrts_mutex_trylock(w, &other->l->steal_lock)) { |
| if (other->l->do_not_steal) { |
| /* leave it alone */ |
| } else { |
| status = __cilkrts_mutex_trylock(w, &other->l->lock); |
| } |
| __cilkrts_mutex_unlock(w, &other->l->steal_lock); |
| } |
| |
| |
| return status; |
| } |
| |
| static void worker_unlock_other(__cilkrts_worker *w, |
| __cilkrts_worker *other) |
| { |
| __cilkrts_mutex_unlock(w, &other->l->lock); |
| } |
| |
| |
| /* Lock macro Usage: |
| BEGIN_WITH_WORKER_LOCK(w) { |
| statement; |
| statement; |
| BEGIN_WITH_FRAME_LOCK(w, ff) { |
| statement; |
| statement; |
| } END_WITH_FRAME_LOCK(w, ff); |
| } END_WITH_WORKER_LOCK(w); |
| */ |
| #define BEGIN_WITH_WORKER_LOCK(w) __cilkrts_worker_lock(w); do |
| #define END_WITH_WORKER_LOCK(w) while (__cilkrts_worker_unlock(w), 0) |
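
// A note on the shape of these macros: BEGIN_WITH_WORKER_LOCK(w) expands to
//     __cilkrts_worker_lock(w); do
// and END_WITH_WORKER_LOCK(w) expands to
//     while (__cilkrts_worker_unlock(w), 0)
// so the braced body between them executes exactly once, and the comma
// expression in the do/while condition releases the lock on the way out.
// The frame-lock macros below use the same trick, with a local _locked_ff
// so that the unlock always names the frame that was actually locked.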
| |
// TBD(jsukha): These are worker lock acquisitions on
| // a worker whose deque is empty. My conjecture is that we |
| // do not need to hold the worker lock at these points. |
| // I have left them in for now, however. |
| // |
| // #define REMOVE_POSSIBLY_OPTIONAL_LOCKS |
| #ifdef REMOVE_POSSIBLY_OPTIONAL_LOCKS |
| #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) do |
| #define END_WITH_WORKER_LOCK_OPTIONAL(w) while (0) |
| #else |
| #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) __cilkrts_worker_lock(w); do |
| #define END_WITH_WORKER_LOCK_OPTIONAL(w) while (__cilkrts_worker_unlock(w), 0) |
| #endif |
| |
| |
| #define BEGIN_WITH_FRAME_LOCK(w, ff) \ |
| do { full_frame *_locked_ff = ff; __cilkrts_frame_lock(w, _locked_ff); do |
| |
| #define END_WITH_FRAME_LOCK(w, ff) \ |
| while (__cilkrts_frame_unlock(w, _locked_ff), 0); } while (0) |
| |
| /* W becomes the owner of F and F can be stolen from W */ |
| static void make_runnable(__cilkrts_worker *w, full_frame *ff) |
| { |
| w->l->frame_ff = ff; |
| |
| /* CALL_STACK is invalid (the information is stored implicitly in W) */ |
| ff->call_stack = 0; |
| } |
| |
| /* |
| * The worker parameter is unused, except for print-debugging purposes. |
| */ |
| static void make_unrunnable(__cilkrts_worker *w, |
| full_frame *ff, |
| __cilkrts_stack_frame *sf, |
| int is_loot, |
| const char *why) |
| { |
| /* CALL_STACK becomes valid again */ |
| ff->call_stack = sf; |
| |
| if (sf) { |
| #if CILK_LIB_DEBUG |
| if (__builtin_expect(sf->flags & CILK_FRAME_EXITING, 0)) |
| __cilkrts_bug("W%d suspending exiting frame %p/%p\n", w->self, ff, sf); |
| #endif |
| sf->flags |= CILK_FRAME_STOLEN | CILK_FRAME_SUSPENDED; |
| sf->worker = 0; |
| |
| if (is_loot) |
| __cilkrts_put_stack(ff, sf); |
| |
| /* perform any system-dependent action, such as saving the |
| state of the stack */ |
| __cilkrts_make_unrunnable_sysdep(w, ff, sf, is_loot, why); |
| } |
| } |
| |
| |
| /* Push the next full frame to be made active in this worker and increment its |
| * join counter. __cilkrts_push_next_frame and pop_next_frame work on a |
| * one-element queue. This queue is used to communicate across the runtime |
| * from the code that wants to activate a frame to the code that can actually |
 * begin execution on that frame. They are asymmetrical in that push
| * increments the join counter but pop does not decrement it. Rather, a |
| * single push/pop combination makes a frame active and increments its join |
| * counter once. */ |
| void __cilkrts_push_next_frame(__cilkrts_worker *w, full_frame *ff) |
| { |
| CILK_ASSERT(ff); |
| CILK_ASSERT(!w->l->next_frame_ff); |
| incjoin(ff); |
| w->l->next_frame_ff = ff; |
| } |
| |
| /* Get the next full-frame to be made active in this worker. The join count |
| * of the full frame will have been incremented by the corresponding push |
| * event. See __cilkrts_push_next_frame, above. |
| */ |
| static full_frame *pop_next_frame(__cilkrts_worker *w) |
| { |
| full_frame *ff; |
| ff = w->l->next_frame_ff; |
| // Remove the frame from the next_frame field. |
| // |
| // If this is a user worker, then there is a chance that another worker |
| // from our team could push work into our next_frame (if it is the last |
| // worker doing work for this team). The other worker's setting of the |
| // next_frame could race with our setting of next_frame to NULL. This is |
| // the only possible race condition on next_frame. However, if next_frame |
| // has a non-NULL value, then it means the team still has work to do, and |
| // there is no chance of another team member populating next_frame. Thus, |
| // it is safe to set next_frame to NULL, if it was populated. There is no |
| // need for an atomic op. |
| if (NULL != ff) { |
| w->l->next_frame_ff = NULL; |
| } |
| return ff; |
| } |
| |
| /* |
| * Identify the single worker that is allowed to cross a sync in this frame. A |
| * thief should call this function when it is the first to steal work from a |
| * user worker. "First to steal work" may mean that there has been parallelism |
| * in the user worker before, but the whole team sync'd, and this is the first |
| * steal after that. |
| * |
| * This should happen while holding the worker and frame lock. |
| */ |
| static void set_sync_master(__cilkrts_worker *w, full_frame *ff) |
| { |
| w->l->last_full_frame = ff; |
| ff->sync_master = w; |
| } |
| |
| /* |
| * The sync that ends all parallelism for a particular user worker is about to |
| * be crossed. Decouple the worker and frame. |
| * |
| * No locks need to be held since the user worker isn't doing anything, and none |
| * of the system workers can steal from it. But unset_sync_master() should be |
 * called before the user worker knows about this work (i.e., before
 * w->l->next_frame_ff is set).
| */ |
| static void unset_sync_master(__cilkrts_worker *w, full_frame *ff) |
| { |
| CILK_ASSERT(WORKER_USER == w->l->type); |
| CILK_ASSERT(ff->sync_master == w); |
| ff->sync_master = NULL; |
| w->l->last_full_frame = NULL; |
| } |
| |
| /******************************************************************** |
| * THE protocol: |
| ********************************************************************/ |
| /* |
| * This is a protocol for work stealing that minimizes the overhead on |
| * the victim. |
| * |
| * The protocol uses three shared pointers into the worker's deque: |
| * - T - the "tail" |
| * - H - the "head" |
| * - E - the "exception" NB: In this case, "exception" has nothing to do |
| * with C++ throw-catch exceptions -- it refers only to a non-normal return, |
| * i.e., a steal or similar scheduling exception. |
| * |
| * with H <= E, H <= T. |
| * |
| * Stack frames SF, where H <= E < T, are available for stealing. |
| * |
| * The worker operates on the T end of the stack. The frame being |
 * worked on is not on the stack. To make a continuation available for
 * stealing, the worker pushes a frame onto the stack: it stores *T++ = SF.
| * To return, it pops the frame off the stack: obtains SF = *--T. |
| * |
| * After decrementing T, the condition E > T signals to the victim that |
| * it should invoke the runtime system's "THE" exception handler. The |
| * pointer E can become INFINITY, in which case the victim must invoke |
| * the THE exception handler as soon as possible. |
| * |
| * See "The implementation of the Cilk-5 multithreaded language", PLDI 1998, |
| * http://portal.acm.org/citation.cfm?doid=277652.277725, for more information |
| * on the THE protocol. |
| */ |
| |
| /* the infinity value of E */ |
| #define EXC_INFINITY ((__cilkrts_stack_frame **) (-1)) |
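
/* Illustrative sketch only (not compiled): the THE handshake described
 * above, written out in terms of the head (H), tail (T), and exception
 * (E) pointers.  The real push/pop live in the compiler ABI
 * (__cilkrts_detach and friends); this is a simplified restatement of
 * the protocol, not the actual implementation. */
#if 0
/* Victim side: publish a continuation, then try to take it back. */
static void the_push(__cilkrts_worker *w, __cilkrts_stack_frame *sf)
{
    *w->tail = sf;            /* store the frame: *T = SF            */
    __cilkrts_fence();
    ++w->tail;                /* T++ makes it visible to thieves     */
}

static void the_pop(__cilkrts_worker *w)
{
    --w->tail;                /* optimistically reclaim the frame    */
    __cilkrts_fence();
    if (w->exc > w->tail) {
        /* E > T: a thief may have taken the frame; invoke the runtime's
           THE exception handler, which resolves the race under the
           worker lock. */
    }
}

/* Thief side (see dekker_protocol() and detach_for_steal() below):
   holding the victim's lock, increment E, re-check H < T, and if the
   check succeeds take *H and advance H. */
#endif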
| |
| static void increment_E(__cilkrts_worker *victim) |
| { |
| __cilkrts_stack_frame *volatile *tmp; |
| |
| // The currently executing worker must own the worker lock to touch |
| // victim->exc |
| ASSERT_WORKER_LOCK_OWNED(victim); |
| |
| tmp = victim->exc; |
| if (tmp != EXC_INFINITY) { |
| /* On most x86 this pair of operations would be slightly faster |
| as an atomic exchange due to the implicit memory barrier in |
| an atomic instruction. */ |
| victim->exc = tmp + 1; |
| __cilkrts_fence(); |
| } |
| } |
| |
| static void decrement_E(__cilkrts_worker *victim) |
| { |
| __cilkrts_stack_frame *volatile *tmp; |
| |
| // The currently executing worker must own the worker lock to touch |
| // victim->exc |
| ASSERT_WORKER_LOCK_OWNED(victim); |
| |
| tmp = victim->exc; |
| if (tmp != EXC_INFINITY) { |
| /* On most x86 this pair of operations would be slightly faster |
| as an atomic exchange due to the implicit memory barrier in |
| an atomic instruction. */ |
| victim->exc = tmp - 1; |
| __cilkrts_fence(); /* memory fence not really necessary */ |
| } |
| } |
| |
| #if 0 |
| /* for now unused, will be necessary if we implement abort */ |
| static void signal_THE_exception(__cilkrts_worker *wparent) |
| { |
| wparent->exc = EXC_INFINITY; |
| __cilkrts_fence(); |
| } |
| #endif |
| |
| static void reset_THE_exception(__cilkrts_worker *w) |
| { |
| // The currently executing worker must own the worker lock to touch |
| // w->exc |
| ASSERT_WORKER_LOCK_OWNED(w); |
| |
| w->exc = w->head; |
| __cilkrts_fence(); |
| } |
| |
| /* conditions under which victim->head can be stolen: */ |
| static int can_steal_from(__cilkrts_worker *victim) |
| { |
| return ((victim->head < victim->tail) && |
| (victim->head < victim->protected_tail)); |
| } |
| |
| /* Return TRUE if the frame can be stolen, false otherwise */ |
| static int dekker_protocol(__cilkrts_worker *victim) |
| { |
| // increment_E and decrement_E are going to touch victim->exc. The |
| // currently executing worker must own victim's lock before they can |
| // modify it |
| ASSERT_WORKER_LOCK_OWNED(victim); |
| |
| /* ASSERT(E >= H); */ |
| |
| increment_E(victim); |
| |
| /* ASSERT(E >= H + 1); */ |
| if (can_steal_from(victim)) { |
| /* success, we can steal victim->head and set H <- H + 1 |
| in detach() */ |
| return 1; |
| } else { |
| /* failure, restore previous state */ |
| decrement_E(victim); |
| return 0; |
| } |
| } |
| |
| |
| /* Link PARENT and CHILD in the spawn tree */ |
| static full_frame *make_child(__cilkrts_worker *w, |
| full_frame *parent_ff, |
| __cilkrts_stack_frame *child_sf, |
| cilk_fiber *fiber) |
| { |
| full_frame *child_ff = __cilkrts_make_full_frame(w, child_sf); |
| |
| child_ff->parent = parent_ff; |
| push_child(parent_ff, child_ff); |
| |
| //DBGPRINTF("%d- make_child - child_frame: %p, parent_frame: %p, child_sf: %p\n" |
| // " parent - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n" |
| // " child - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n", |
| // w->self, child, parent, child_sf, |
| // parent->parent, parent->left_sibling, parent->right_sibling, parent->rightmost_child, |
| // child->parent, child->left_sibling, child->right_sibling, child->rightmost_child); |
| CILK_ASSERT(parent_ff->call_stack); |
| child_ff->is_call_child = (fiber == NULL); |
| |
    /* PLACEHOLDER_FIBER is used as a non-null marker indicating that the
| child should be treated as a spawn child even though we have not |
| yet assigned a real fiber to its parent. */ |
| if (fiber == PLACEHOLDER_FIBER) |
| fiber = NULL; /* Parent actually gets a null fiber, for now */ |
| |
| /* perform any system-dependent actions, such as capturing |
| parameter passing information */ |
| /*__cilkrts_make_child_sysdep(child, parent);*/ |
| |
| /* Child gets reducer map and stack of parent. |
| Parent gets a new map and new stack. */ |
| child_ff->fiber_self = parent_ff->fiber_self; |
| child_ff->sync_master = NULL; |
| |
| if (child_ff->is_call_child) { |
| /* Cause segfault on any attempted access. The parent gets |
| the child map and stack when the child completes. */ |
| parent_ff->fiber_self = 0; |
| } else { |
| parent_ff->fiber_self = fiber; |
| } |
| |
| incjoin(parent_ff); |
| return child_ff; |
| } |
| |
| static inline __cilkrts_stack_frame *__cilkrts_advance_frame(__cilkrts_stack_frame *sf) |
| { |
| __cilkrts_stack_frame *p = sf->call_parent; |
| sf->call_parent = 0; |
| return p; |
| } |
| |
| /* w should be the currently executing worker. |
| * loot_sf is the youngest stack frame in the call stack being |
| * unrolled (i.e., the most deeply nested stack frame.) |
| * |
| * When this method is called for a steal, loot_sf should be on a |
| * victim worker which is different from w. |
| * For CILK_FORCE_REDUCE, the victim worker will equal w. |
| * |
| * Before execution, the __cilkrts_stack_frame's have pointers from |
| * older to younger, i.e., a __cilkrts_stack_frame points to parent. |
| * |
| * This method creates a full frame for each __cilkrts_stack_frame in |
| * the call stack, with each full frame also pointing to its parent. |
| * |
| * The method returns the full frame created for loot_sf, i.e., the |
| * youngest full frame. |
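 *
 * For example (illustrative): if the chain reachable through call_parent
 * is C (youngest, loot_sf) -> B -> A (oldest), the reversal below rewrites
 * the links so that A->call_parent == B, B->call_parent == C, and
 * C->call_parent == NULL; the promotion loop then creates full frames for
 * A, B, and C in parent-to-child order.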
| */ |
| static full_frame *unroll_call_stack(__cilkrts_worker *w, |
| full_frame *ff, |
| __cilkrts_stack_frame *const loot_sf) |
| { |
| __cilkrts_stack_frame *sf = loot_sf; |
| __cilkrts_stack_frame *rev_sf = 0; |
| __cilkrts_stack_frame *t_sf; |
| |
| CILK_ASSERT(sf); |
| /*CILK_ASSERT(sf->call_parent != sf);*/ |
| |
| /* The leafmost frame is unsynched. */ |
| if (sf->worker != w) |
| sf->flags |= CILK_FRAME_UNSYNCHED; |
| |
| /* Reverse the call stack to make a linked list ordered from parent |
| to child. sf->call_parent points to the child of SF instead of |
| the parent. */ |
| do { |
| t_sf = (sf->flags & (CILK_FRAME_DETACHED|CILK_FRAME_STOLEN|CILK_FRAME_LAST))? 0 : sf->call_parent; |
| sf->call_parent = rev_sf; |
| rev_sf = sf; |
| sf = t_sf; |
| } while (sf); |
| sf = rev_sf; |
| |
| /* Promote each stack frame to a full frame in order from parent |
| to child, following the reversed list we just built. */ |
| make_unrunnable(w, ff, sf, sf == loot_sf, "steal 1"); |
| /* T is the *child* of SF, because we have reversed the list */ |
| for (t_sf = __cilkrts_advance_frame(sf); t_sf; |
| sf = t_sf, t_sf = __cilkrts_advance_frame(sf)) { |
| ff = make_child(w, ff, t_sf, NULL); |
| make_unrunnable(w, ff, t_sf, t_sf == loot_sf, "steal 2"); |
| } |
| |
| /* XXX What if the leafmost frame does not contain a sync |
| and this steal is from promote own deque? */ |
| /*sf->flags |= CILK_FRAME_UNSYNCHED;*/ |
| |
| CILK_ASSERT(!sf->call_parent); |
| return ff; |
| } |
| |
| /* detach the top of the deque frame from the VICTIM and install a new |
| CHILD frame in its place */ |
| static void detach_for_steal(__cilkrts_worker *w, |
| __cilkrts_worker *victim, |
| cilk_fiber* fiber) |
| { |
| /* ASSERT: we own victim->lock */ |
| |
| full_frame *parent_ff, *child_ff, *loot_ff; |
| __cilkrts_stack_frame *volatile *h; |
| __cilkrts_stack_frame *sf; |
| |
| w->l->team = victim->l->team; |
| |
| CILK_ASSERT(w->l->frame_ff == 0 || w == victim); |
| |
| h = victim->head; |
| |
| CILK_ASSERT(*h); |
| |
| victim->head = h + 1; |
| |
| parent_ff = victim->l->frame_ff; |
| BEGIN_WITH_FRAME_LOCK(w, parent_ff) { |
| /* parent no longer referenced by victim */ |
| decjoin(parent_ff); |
| |
| /* obtain the victim call stack */ |
| sf = *h; |
| |
| /* perform system-dependent normalizations */ |
| /*__cilkrts_normalize_call_stack_on_steal(sf);*/ |
| |
| /* unroll PARENT_FF with call stack SF, adopt the youngest |
| frame LOOT. If loot_ff == parent_ff, then we hold loot_ff->lock, |
| otherwise, loot_ff is newly created and we can modify it without |
| holding its lock. */ |
| loot_ff = unroll_call_stack(w, parent_ff, sf); |
| |
| #if REDPAR_DEBUG >= 3 |
| fprintf(stderr, "[W=%d, victim=%d, desc=detach, parent_ff=%p, loot=%p]\n", |
| w->self, victim->self, |
| parent_ff, loot_ff); |
| #endif |
| |
| if (WORKER_USER == victim->l->type && |
| NULL == victim->l->last_full_frame) { |
| // Mark this looted frame as special: only the original user worker |
| // may cross the sync. |
| // |
| // This call is a shared access to |
| // victim->l->last_full_frame. |
| set_sync_master(victim, loot_ff); |
| } |
| |
| /* LOOT is the next frame that the thief W is supposed to |
| run, unless the thief is stealing from itself, in which |
| case the thief W == VICTIM executes CHILD and nobody |
| executes LOOT. */ |
| if (w == victim) { |
| /* Pretend that frame has been stolen */ |
| loot_ff->call_stack->flags |= CILK_FRAME_UNSYNCHED; |
| loot_ff->simulated_stolen = 1; |
| } |
| else |
| __cilkrts_push_next_frame(w, loot_ff); |
| |
| // After this "push_next_frame" call, w now owns loot_ff. |
| child_ff = make_child(w, loot_ff, 0, fiber); |
| |
| BEGIN_WITH_FRAME_LOCK(w, child_ff) { |
| /* install child in the victim's work queue, taking |
| the parent_ff's place */ |
| /* child is referenced by victim */ |
| incjoin(child_ff); |
| |
| // With this call, w is bestowing ownership of the newly |
| // created frame child_ff to the victim, and victim is |
| // giving up ownership of parent_ff. |
| // |
| // Worker w will either take ownership of parent_ff |
| // if parent_ff == loot_ff, or parent_ff will be |
| // suspended. |
| // |
| // Note that this call changes the victim->frame_ff |
| // while the victim may be executing. |
| make_runnable(victim, child_ff); |
| } END_WITH_FRAME_LOCK(w, child_ff); |
| } END_WITH_FRAME_LOCK(w, parent_ff); |
| } |
| |
| /** |
| * @brief cilk_fiber_proc that resumes user code after a successful |
 * random steal.
 *
 * This function longjmps back into the user code whose state is
| * stored in cilk_fiber_get_data(fiber)->resume_sf. The stack pointer |
| * is adjusted so that the code resumes on the specified fiber stack |
| * instead of its original stack. |
| * |
| * This method gets executed only on a fiber freshly allocated from a |
| * pool. |
| * |
| * @param fiber The fiber being used to resume user code. |
| * @param arg Unused. |
| */ |
| static |
| void fiber_proc_to_resume_user_code_for_random_steal(cilk_fiber *fiber) |
| { |
| cilk_fiber_data *data = cilk_fiber_get_data(fiber); |
| __cilkrts_stack_frame* sf = data->resume_sf; |
| full_frame *ff; |
| |
| CILK_ASSERT(sf); |
| |
| // When we pull the resume_sf out of the fiber to resume it, clear |
| // the old value. |
| data->resume_sf = NULL; |
| CILK_ASSERT(sf->worker == data->owner); |
| ff = sf->worker->l->frame_ff; |
| |
| // For Win32, we need to overwrite the default exception handler |
| // in this function, so that when the OS exception handling code |
| // walks off the top of the current Cilk stack, it reaches our stub |
| // handler. |
| |
| // Also, this function needs to be wrapped into a try-catch block |
| // so the compiler generates the appropriate exception information |
| // in this frame. |
| |
| // TBD: IS THIS HANDLER IN THE WRONG PLACE? Can we longjmp out of |
| // this function (and does it matter?) |
| #if defined(_WIN32) && !defined(_WIN64) |
| install_exception_stub_handler(); |
| __try |
| #endif |
| { |
| char* new_sp = sysdep_reset_jump_buffers_for_resume(fiber, ff, sf); |
| |
| // Notify the Intel tools that we're stealing code |
| ITT_SYNC_ACQUIRED(sf->worker); |
| NOTIFY_ZC_INTRINSIC("cilk_continue", sf); |
| |
| // TBD: We'd like to move TBB-interop methods into the fiber |
| // eventually. |
| cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT); |
| |
| sf->flags &= ~CILK_FRAME_SUSPENDED; |
| |
| // longjmp to user code. Don't process exceptions here, |
| // because we are resuming a stolen frame. |
| sysdep_longjmp_to_sf(new_sp, sf, NULL); |
| /*NOTREACHED*/ |
| // Intel's C compiler respects the preceding lint pragma |
| } |
| #if defined(_WIN32) && !defined(_WIN64) |
    __except (CILK_ASSERT(!"should not execute the stub filter"),
| EXCEPTION_EXECUTE_HANDLER) |
| { |
| // If we are here, that means something very wrong |
| // has happened in our exception processing... |
| CILK_ASSERT(! "should not be here!"); |
| } |
| #endif |
| } |
| |
| static void random_steal(__cilkrts_worker *w) |
| { |
| __cilkrts_worker *victim = NULL; |
| cilk_fiber *fiber = NULL; |
| int n; |
| int success = 0; |
| int32_t victim_id; |
| |
| // Nothing's been stolen yet. When true, this will flag |
| // setup_for_execution_pedigree to increment the pedigree |
| w->l->work_stolen = 0; |
| |
| /* If the user has disabled stealing (using the debugger) we fail */ |
| if (__builtin_expect(w->g->stealing_disabled, 0)) |
| return; |
| |
| CILK_ASSERT(w->l->type == WORKER_SYSTEM || w->l->team == w); |
| |
    /* Even if there is only one processor, work can still be stolen.
       There must be only one worker to prevent stealing. */
| CILK_ASSERT(w->g->total_workers > 1); |
| |
| /* pick random *other* victim */ |
| n = myrand(w) % (w->g->total_workers - 1); |
| if (n >= w->self) |
| ++n; |
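    // For example, with total_workers == 4 and w->self == 2, n is first
    // drawn from {0, 1, 2} and then mapped to {0, 1, 3}, so (for a uniform
    // draw) each of the three other workers is equally likely to be chosen.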
| |
| // If we're replaying a log, override the victim. -1 indicates that |
| // we've exhausted the list of things this worker stole when we recorded |
| // the log so just return. If we're not replaying a log, |
| // replay_get_next_recorded_victim() just returns the victim ID passed in. |
| n = replay_get_next_recorded_victim(w, n); |
| if (-1 == n) |
| return; |
| |
| victim = w->g->workers[n]; |
| |
| START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE) { |
| /* Verify that we can get a stack. If not, no need to continue. */ |
| fiber = cilk_fiber_allocate(&w->l->fiber_pool); |
| } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE); |
| |
| |
| if (NULL == fiber) { |
| #if FIBER_DEBUG >= 2 |
| fprintf(stderr, "w=%d: failed steal because we could not get a fiber\n", |
| w->self); |
| #endif |
| return; |
| } |
| |
| /* do not steal from self */ |
| CILK_ASSERT (victim != w); |
| |
| /* Execute a quick check before engaging in the THE protocol. |
| Avoid grabbing locks if there is nothing to steal. */ |
| if (!can_steal_from(victim)) { |
| NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ); |
| START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) { |
| int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool); |
| // Fibers we use when trying to steal should not be active, |
| // and thus should not have any other references. |
| CILK_ASSERT(0 == ref_count); |
| } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE); |
| return; |
| } |
| |
| /* Attempt to steal work from the victim */ |
| if (worker_trylock_other(w, victim)) { |
| if (w->l->type == WORKER_USER && victim->l->team != w) { |
| |
| // Fail to steal if this is a user worker and the victim is not |
| // on this team. If a user worker were allowed to steal work |
| // descended from another user worker, the former might not be |
| // done with its work by the time it was needed to resume and |
| // unbind. Therefore, user workers are not permitted to change |
| // teams. |
| |
| // There is no race on the victim's team because the victim cannot |
| // change its team until it runs out of work to do, at which point |
| // it will try to take out its own lock, and this worker already |
| // holds it. |
| NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_USER_WORKER); |
| |
| } else if (victim->l->frame_ff) { |
| // A successful steal will change victim->frame_ff, even |
| // though the victim may be executing. Thus, the lock on |
| // the victim's deque is also protecting victim->frame_ff. |
| if (dekker_protocol(victim)) { |
| int proceed_with_steal = 1; // optimistic |
| |
                // If we're replaying a log, verify that this is the correct frame
| // to steal from the victim |
| if (! replay_match_victim_pedigree(w, victim)) |
| { |
| // Abort the steal attempt. decrement_E(victim) to |
| // counter the increment_E(victim) done by the |
| // dekker protocol |
| decrement_E(victim); |
| proceed_with_steal = 0; |
| } |
| |
| if (proceed_with_steal) |
| { |
| START_INTERVAL(w, INTERVAL_STEAL_SUCCESS) { |
| success = 1; |
| detach_for_steal(w, victim, fiber); |
| victim_id = victim->self; |
| |
| #if REDPAR_DEBUG >= 1 |
| fprintf(stderr, "Wkr %d stole from victim %d, fiber = %p\n", |
| w->self, victim->self, fiber); |
| #endif |
| |
| // The use of victim->self contradicts our |
| // classification of the "self" field as |
| // local. But since this code is only for |
| // debugging, it is ok. |
| DBGPRINTF ("%d-%p: Stealing work from worker %d\n" |
| " sf: %p, call parent: %p\n", |
| w->self, GetCurrentFiber(), victim->self, |
| w->l->next_frame_ff->call_stack, |
| w->l->next_frame_ff->call_stack->call_parent); |
| } STOP_INTERVAL(w, INTERVAL_STEAL_SUCCESS); |
| } // end if(proceed_with_steal) |
| } else { |
| NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_DEKKER); |
| } |
| } else { |
| NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ); |
| } |
| worker_unlock_other(w, victim); |
| } else { |
| NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_LOCK); |
| } |
| |
| // Record whether work was stolen. When true, this will flag |
| // setup_for_execution_pedigree to increment the pedigree |
| w->l->work_stolen = success; |
| |
| if (0 == success) { |
| // failed to steal work. Return the fiber to the pool. |
| START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) { |
| int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool); |
| // Fibers we use when trying to steal should not be active, |
| // and thus should not have any other references. |
| CILK_ASSERT(0 == ref_count); |
| } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE); |
| } |
| else |
| { |
| // Since our steal was successful, finish initialization of |
| // the fiber. |
| cilk_fiber_reset_state(fiber, |
| fiber_proc_to_resume_user_code_for_random_steal); |
| // Record the pedigree of the frame that w has stolen. |
| // record only if CILK_RECORD_LOG is set |
| replay_record_steal(w, victim_id); |
| } |
| } |
| |
| |
| |
| /** |
| * At a provably good steal, we need to transfer the child reducer map |
| * from ff->children_reducer_map into v->reducer_map, where v is the |
| * worker that resumes execution of ff. |
| * |
| * Normally, we have v == w, where w is the currently executing |
| * worker. In the case where we are resuming a team leader on a user |
 * worker, however, v might differ from w.
 *
 * Thus, this operation is a no-op, since we can't really move
 * ff->children_reducer_map into w here.
| * |
| * Instead, this work is done in setup_for_execution_reducers(). |
| */ |
| static inline void provably_good_steal_reducers(__cilkrts_worker *w, |
| full_frame *ff) |
| { |
| // No-op. |
| } |
| |
| /* at a provably good steal, incorporate the accumulated exceptions of |
| children into the parent's exception */ |
| static void provably_good_steal_exceptions(__cilkrts_worker *w, |
| full_frame *ff) |
| { |
| // ASSERT: we own ff->lock |
| ff->pending_exception = |
| __cilkrts_merge_pending_exceptions(w, |
| ff->child_pending_exception, |
| ff->pending_exception); |
| ff->child_pending_exception = NULL; |
| } |
| |
| /* At sync discard the frame's old stack and take the leftmost child's. */ |
| static void provably_good_steal_stacks(__cilkrts_worker *w, full_frame *ff) |
| { |
| CILK_ASSERT(NULL == ff->fiber_self); |
| ff->fiber_self = ff->fiber_child; |
| ff->fiber_child = NULL; |
| } |
| |
| static void __cilkrts_mark_synched(full_frame *ff) |
| { |
| ff->call_stack->flags &= ~CILK_FRAME_UNSYNCHED; |
| ff->simulated_stolen = 0; |
| } |
| |
| static |
| enum provably_good_steal_t provably_good_steal(__cilkrts_worker *w, |
| full_frame *ff) |
| { |
| // ASSERT: we hold w->lock and ff->lock |
| |
| enum provably_good_steal_t result = ABANDON_EXECUTION; |
| |
| // If the current replay entry is a sync record matching the worker's |
| // pedigree, AND this isn't the last child to the sync, return |
| // WAIT_FOR_CONTINUE to indicate that the caller should loop until |
| // we find the right frame to steal and CONTINUE_EXECUTION is returned. |
| int match_found = replay_match_sync_pedigree(w); |
| if (match_found && (0 != simulate_decjoin(ff))) |
| return WAIT_FOR_CONTINUE; |
| |
| START_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL) { |
| if (decjoin(ff) == 0) { |
| provably_good_steal_reducers(w, ff); |
| provably_good_steal_exceptions(w, ff); |
| provably_good_steal_stacks(w, ff); |
| __cilkrts_mark_synched(ff); |
| |
| // If the original owner wants this frame back (to resume |
| // it on its original thread) pass it back now. |
| if (NULL != ff->sync_master) { |
| // The frame wants to go back and be executed by the original |
| // user thread. We can throw caution to the wind and push the |
| // frame straight onto its queue because the only way we have |
| // gotten to this point of being able to continue execution of |
| // the frame is if the original user worker is spinning without |
| // work. |
| |
| unset_sync_master(w->l->team, ff); |
| __cilkrts_push_next_frame(w->l->team, ff); |
| |
| // If this is the team leader we're not abandoning the work |
| if (w == w->l->team) |
| result = CONTINUE_EXECUTION; |
| } else { |
| __cilkrts_push_next_frame(w, ff); |
| result = CONTINUE_EXECUTION; // Continue working on this thread |
| } |
| |
| // The __cilkrts_push_next_frame() call changes ownership |
| // of ff to the specified worker. |
| } |
| } STOP_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL); |
| |
| // Only write a SYNC record if: |
| // - We're recording a log *AND* |
| // - We're the worker continuing from this sync |
| replay_record_sync(w, result == CONTINUE_EXECUTION); |
| |
| // If we're replaying a log, and matched a sync from the log, mark the |
| // sync record seen if the sync isn't going to be abandoned. |
| replay_advance_from_sync (w, match_found, result == CONTINUE_EXECUTION); |
| |
| return result; |
| } |
| |
| static void unconditional_steal(__cilkrts_worker *w, |
| full_frame *ff) |
| { |
| // ASSERT: we hold ff->lock |
| |
| START_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL) { |
| decjoin(ff); |
| __cilkrts_push_next_frame(w, ff); |
| } STOP_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL); |
| } |
| |
| |
| /* CHILD is about to die. Give its exceptions to a sibling or to the |
| parent. */ |
| static inline void splice_exceptions_for_call(__cilkrts_worker *w, |
| full_frame *parent_ff, |
| full_frame *child_ff) |
| { |
| // ASSERT: We own parent_ff->lock |
| CILK_ASSERT(child_ff->is_call_child); |
| CILK_ASSERT(NULL == child_ff->right_pending_exception); |
| CILK_ASSERT(NULL == parent_ff->pending_exception); |
| |
| parent_ff->pending_exception = child_ff->pending_exception; |
| child_ff->pending_exception = NULL; |
| } |
| |
| /** |
| * Merge exceptions for a dying child. |
| * |
| * @param w The currently executing worker. |
| * @param ff The child frame that is dying. |
| * @param left_exception_ptr Pointer to the exception that is to our left. |
| */ |
| static inline |
| void splice_exceptions_for_spawn(__cilkrts_worker *w, |
| full_frame *ff, |
| struct pending_exception_info **left_exception_ptr) |
| { |
| // ASSERT: parent_ff == child_ff->parent. |
| // ASSERT: We own parent_ff->lock |
| |
| // Merge current exception into the slot where the left |
| // exception should go. |
| *left_exception_ptr = |
| __cilkrts_merge_pending_exceptions(w, |
| *left_exception_ptr, |
| ff->pending_exception); |
| ff->pending_exception = NULL; |
| |
| |
| // Merge right exception into the slot where the left exception |
| // should go. |
| *left_exception_ptr = |
| __cilkrts_merge_pending_exceptions(w, |
| *left_exception_ptr, |
| ff->right_pending_exception); |
| ff->right_pending_exception = NULL; |
| } |
| |
| |
| static inline void splice_stacks_for_call(__cilkrts_worker *w, |
| full_frame *parent_ff, |
| full_frame *child_ff) |
| { |
| #if CILK_LIB_DEBUG |
| if (parent_ff->call_stack) |
| CILK_ASSERT(!(parent_ff->call_stack->flags & CILK_FRAME_MBZ)); |
| #endif |
| |
| /* A synched frame does not have accumulated child reducers. */ |
| CILK_ASSERT(!child_ff->fiber_child); |
| CILK_ASSERT(child_ff->is_call_child); |
| |
| /* An attached parent has no self fiber. It may have |
| accumulated child fibers or child owners, which should be |
| ignored until sync. */ |
| CILK_ASSERT(!parent_ff->fiber_self); |
| parent_ff->fiber_self = child_ff->fiber_self; |
| child_ff->fiber_self = NULL; |
| } |
| |
| static void finalize_child_for_call(__cilkrts_worker *w, |
| full_frame *parent_ff, |
| full_frame *child_ff) |
| { |
| // ASSERT: we hold w->lock and parent_ff->lock |
| |
| START_INTERVAL(w, INTERVAL_FINALIZE_CHILD) { |
| CILK_ASSERT(child_ff->is_call_child); |
| CILK_ASSERT(child_ff->join_counter == 0); |
| CILK_ASSERT(!child_ff->rightmost_child); |
| CILK_ASSERT(child_ff == parent_ff->rightmost_child); |
| |
| // CHILD is about to die. |
| // Splicing out reducers is a no-op for a call since |
| // w->reducer_map should already store the correct |
| // reducer map. |
| |
| // ASSERT there are no maps left to reduce. |
| CILK_ASSERT(NULL == child_ff->children_reducer_map); |
| CILK_ASSERT(NULL == child_ff->right_reducer_map); |
| |
| splice_exceptions_for_call(w, parent_ff, child_ff); |
| |
| splice_stacks_for_call(w, parent_ff, child_ff); |
| |
| /* remove CHILD from list of children of PARENT */ |
| unlink_child(parent_ff, child_ff); |
| |
| /* continue with the parent. */ |
| unconditional_steal(w, parent_ff); |
| __cilkrts_destroy_full_frame(w, child_ff); |
| } STOP_INTERVAL(w, INTERVAL_FINALIZE_CHILD); |
| } |
| |
| |
| /** |
| * The invariant on ff->children_reducer_map is that when ff is |
| * synched and when we are about to resume execution of ff, at least |
| * one of ff->children_reducer_map and w->reducer_map must be NULL. |
| * |
| * Consider the two possibilities before resuming execution of ff: |
| * |
| * 1. Suppose ff is synched and suspended. Then either |
| * |
| * (a) ff->children_reducer_map stores the reducer map that w |
| * should use, where w is the worker resuming execution of ff, |
| * OR |
| * (b) w already has a user map, and ff->children_reducer_map is NULL. |
| * |
| * Case (a) happens when we are resuming execution of ff as a |
| * provably good steal. In this case, w->reducer_map should be |
| * NULL and ff->children_reducer_map is valid. To resume |
| * execution of ff on w, set w->reducer_map to |
| * ff->children_reducer_map. |
| * |
| * Case (b) occurs when we resume execution of ff because ff is a |
| * called child. Then, ff->children_reducer_map should be NULL, |
| * and w should already have a valid reducer map when resuming |
| * execution of ff. We resume execution of ff without changing |
| * w->reducer_map. |
| * |
| * 2. Suppose frame ff is not synched (i.e., it is active and might have |
| * active children). Then ff->children_reducer_map is the slot for |
| * storing the reducer map from ff's leftmost child, as in the reducer |
| * protocol. The runtime may resume execution of ff while it is not |
| * synched only because of a steal. |
| * In this case, while we are resuming ff, ff->children_reducer_map |
| * may be non-NULL (because one of ff's children has completed). |
| * We resume execution of ff without changing w->reducer_map. |
| */ |
| static void setup_for_execution_reducers(__cilkrts_worker *w, |
| full_frame *ff) |
| { |
| // We only need to move ff->children_reducer_map into |
| // w->reducer_map in case 1(a). |
| // |
| // First check whether ff is synched. |
| __cilkrts_stack_frame *sf = ff->call_stack; |
| if (!(sf->flags & CILK_FRAME_UNSYNCHED)) { |
| // In this case, ff is synched. (Case 1). |
| CILK_ASSERT(!ff->rightmost_child); |
| |
| // Test whether we are in case 1(a) and have |
| // something to do. Note that if both |
| // ff->children_reducer_map and w->reducer_map are NULL, we |
| // can't distinguish between cases 1(a) and 1(b) here. |
| if (ff->children_reducer_map) { |
| // We are in Case 1(a). |
| CILK_ASSERT(!w->reducer_map); |
| w->reducer_map = ff->children_reducer_map; |
| ff->children_reducer_map = NULL; |
| } |
| } |
| } |
| |
| static void setup_for_execution_exceptions(__cilkrts_worker *w, |
| full_frame *ff) |
| { |
| CILK_ASSERT(NULL == w->l->pending_exception); |
| w->l->pending_exception = ff->pending_exception; |
| ff->pending_exception = NULL; |
| } |
| |
| #if 0 /* unused */ |
| static void setup_for_execution_stack(__cilkrts_worker *w, |
| full_frame *ff) |
| { |
| } |
| #endif |
| |
| /* |
| * setup_for_execution_pedigree |
| * |
| * Copies the pedigree information from the frame we're resuming to the |
| * worker. Increments the pedigree if this is work that has been stolen |
| * to match the increment on a return from a spawn helper. |
| */ |
| static void setup_for_execution_pedigree(__cilkrts_worker *w) |
| { |
| int pedigree_unsynched; |
| __cilkrts_stack_frame *sf = w->current_stack_frame; |
| |
| CILK_ASSERT(NULL != sf); |
| |
| // If this isn't an ABI 1 or later frame, there's no pedigree information |
| if (0 == CILK_FRAME_VERSION_VALUE(sf->flags)) |
| return; |
| |
| // Note whether the pedigree is unsynched and clear the flag before |
| // we forget |
| pedigree_unsynched = sf->flags & CILK_FRAME_SF_PEDIGREE_UNSYNCHED; |
| sf->flags &= ~CILK_FRAME_SF_PEDIGREE_UNSYNCHED; |
| |
| // If we're just marshalling onto this worker, do not increment |
| // the rank since that wouldn't happen in a sequential execution |
| if (w->l->work_stolen || pedigree_unsynched) |
| { |
| if (w->l->work_stolen) |
| w->pedigree.rank = sf->parent_pedigree.rank + 1; |
| else |
| w->pedigree.rank = sf->parent_pedigree.rank; |
| } |
| |
| w->pedigree.parent = sf->parent_pedigree.parent; |
| w->l->work_stolen = 0; |
| } |
| |
| static void setup_for_execution(__cilkrts_worker *w, |
| full_frame *ff, |
| int is_return_from_call) |
| { |
| // ASSERT: We own w->lock and ff->lock || P == 1 |
| |
| setup_for_execution_reducers(w, ff); |
| setup_for_execution_exceptions(w, ff); |
| /*setup_for_execution_stack(w, ff);*/ |
| |
| ff->call_stack->worker = w; |
| w->current_stack_frame = ff->call_stack; |
| |
| // If this is a return from a call, leave the pedigree alone |
| if (! is_return_from_call) |
| setup_for_execution_pedigree(w); |
| |
| __cilkrts_setup_for_execution_sysdep(w, ff); |
| |
| w->head = w->tail = w->l->ltq; |
| reset_THE_exception(w); |
| |
| make_runnable(w, ff); |
| } |
| |
| |
| /* |
| * Called by the scheduling fiber, right before |
| * resuming a sf/ff for user code. |
| * |
| * This method associates the specified sf with the worker. |
| * |
| * It also asserts that w, ff, and sf all have the expected properties |
| * for resuming user code. |
| */ |
| void scheduling_fiber_prepare_to_resume_user_code(__cilkrts_worker *w, |
| full_frame *ff, |
| __cilkrts_stack_frame *sf) |
| { |
| w->current_stack_frame = sf; |
| sf->worker = w; |
| |
| // Lots of debugging checks on the state of the fiber we might be |
| // resuming. |
| #if FIBER_DEBUG >= 1 |
| # if FIBER_DEBUG >= 3 |
| { |
| fprintf(stderr, "w=%d: ff=%p, sf=%p. about to resume user code\n", |
| w->self, ff, sf); |
| } |
| # endif |
| |
| const int flags = sf->flags; |
| CILK_ASSERT(flags & CILK_FRAME_SUSPENDED); |
| CILK_ASSERT(!sf->call_parent); |
| CILK_ASSERT(w->head == w->tail); |
| |
| /* A frame can not be resumed unless it was suspended. */ |
| CILK_ASSERT(ff->sync_sp != NULL); |
| |
| /* The leftmost frame has no allocated stack */ |
| if (ff->simulated_stolen) |
| CILK_ASSERT(flags & CILK_FRAME_UNSYNCHED); |
| else if (flags & CILK_FRAME_UNSYNCHED) |
| /* XXX By coincidence sync_sp could be null. */ |
| CILK_ASSERT(ff->fiber_self != NULL); |
| else |
| /* XXX This frame could be resumed unsynched on the leftmost stack */ |
| CILK_ASSERT((ff->sync_master == 0 || ff->sync_master == w)); |
| CILK_ASSERT(w->l->frame_ff == ff); |
| #endif |
| } |
| |
| |
| /** |
| * This method is the first method that should execute after we've |
| * switched to a scheduling fiber from user code. |
| * |
| * @param fiber The scheduling fiber for the current worker. |
| */ |
| static void enter_runtime_transition_proc(cilk_fiber *fiber) |
| { |
| // We can execute this method for one of three reasons: |
| // 1. Undo-detach finds parent stolen. |
| // 2. Sync suspends frame. |
| // 3. Return from Cilk entry point. |
| // |
| // |
| // In cases 1 and 2, the frame may be truly suspended or |
| // may be immediately executed by this worker after provably_good_steal. |
| // |
| // |
| // There is a fourth case, which can, but does not need to execute |
| // this function: |
| // 4. Starting up the scheduling loop on a user or |
| // system worker. In this case, we won't have |
| // a scheduling stack function to run. |
| __cilkrts_worker* w = cilk_fiber_get_owner(fiber); |
| if (w->l->post_suspend) { |
| // Run the continuation function passed to longjmp_into_runtime |
| run_scheduling_stack_fcn(w); |
| |
| // After we have jumped into the runtime and run the |
| // scheduling function, any reducer map the worker had before entering the runtime |
| // should have already been saved into the appropriate full |
| // frame. |
| CILK_ASSERT(NULL == w->reducer_map); |
| |
| // There shouldn't be any uncaught exceptions. |
| // |
| // In Windows, the OS catches any exceptions not caught by the |
| // user code. Thus, we are omitting the check on Windows. |
| // |
| // On Android, calling std::uncaught_exception with the stlport |
| // library causes a seg fault. Since we're not supporting |
| // exceptions there at this point, just don't do the check |
| // |
| // TBD: Is this check also safe to do on Windows? |
| CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION(); |
| } |
| } |
| |
| |
| /** |
| * Method called to jump back to executing user code. |
| * |
| * A normal return from the runtime back to resuming user code calls |
| * this method. A computation executed using force_reduce also calls |
| * this method to return to user code. |
| * |
| * This function should not contain any code that depends on a fiber. |
| * In a force-reduce case, the user worker may not have a fiber. In |
| * the force-reduce case, we call this method directly instead of |
| * calling @c user_code_resume_after_switch_into_runtime. |
| */ |
| static inline NORETURN |
| cilkrts_resume(__cilkrts_stack_frame *sf, full_frame *ff) |
| { |
| // Save the sync stack pointer, and do the bookkeeping |
| char* sync_sp = ff->sync_sp; |
| __cilkrts_take_stack(ff, sync_sp); // leaves ff->sync_sp null |
| |
| sf->flags &= ~CILK_FRAME_SUSPENDED; |
| // Actually longjmp to the user code. |
| // We may have exceptions to deal with, since we are resuming |
| // a previous-suspended frame. |
| sysdep_longjmp_to_sf(sync_sp, sf, ff); |
| } |
| |
| |
| /** |
| * Called by the user-code fiber right before resuming a full frame |
| * (sf/ff). |
| * |
| * This method pulls sf/ff out of the worker, and then calls |
| * cilkrts_resume to jump to user code. |
| */ |
| static NORETURN |
| user_code_resume_after_switch_into_runtime(cilk_fiber *fiber) |
| { |
| __cilkrts_worker *w = cilk_fiber_get_owner(fiber); |
| __cilkrts_stack_frame *sf; |
| full_frame *ff; |
| sf = w->current_stack_frame; |
| ff = sf->worker->l->frame_ff; |
| |
| #if FIBER_DEBUG >= 1 |
| CILK_ASSERT(ff->fiber_self == fiber); |
| cilk_fiber_data *fdata = cilk_fiber_get_data(fiber); |
| DBGPRINTF ("%d-%p: resume_after_switch_into_runtime, fiber=%p\n", |
| w->self, w, fiber); |
| CILK_ASSERT(sf == fdata->resume_sf); |
| #endif |
| |
| // Notify the Intel tools that we're stealing code |
| ITT_SYNC_ACQUIRED(sf->worker); |
| NOTIFY_ZC_INTRINSIC("cilk_continue", sf); |
| cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT); |
| |
| // Actually jump to user code. |
| cilkrts_resume(sf, ff); |
| } |
| |
| |
| /* The current stack is about to either be suspended or destroyed. This |
| * function will switch to the stack on which the scheduler is suspended and |
| * resume running the scheduler within function do_work(). Upon waking up, |
| * the scheduler will run the 'cont' function, using the supplied worker and |
| * frame. |
| */ |
| static NORETURN |
| longjmp_into_runtime(__cilkrts_worker *w, |
| scheduling_stack_fcn_t fcn, |
| __cilkrts_stack_frame *sf) |
| { |
| full_frame *ff, *ff2; |
| |
| CILK_ASSERT(!w->l->post_suspend); |
| ff = w->l->frame_ff; |
| |
| // If we've got only one worker, stealing shouldn't be possible. |
| // Assume that this is a steal or return from spawn in a force-reduce case. |
| // We don't have a scheduling stack to switch to, so call the continuation |
| // function directly. |
| if (1 == w->g->P) { |
| fcn(w, ff, sf); |
| |
        /* The call to fcn() will have pushed ff as the next frame. If
| * this were a normal (non-forced-reduce) execution, there would have |
| * been a pop_next_frame call in a separate part of the runtime. We |
| * must call pop_next_frame here to complete the push/pop cycle. */ |
| ff2 = pop_next_frame(w); |
| |
| setup_for_execution(w, ff2, 0); |
| scheduling_fiber_prepare_to_resume_user_code(w, ff2, w->current_stack_frame); |
| cilkrts_resume(w->current_stack_frame, ff2); |
| |
| // Suppress clang warning that the expression result is unused |
| #if defined(__clang__) && (! defined(__INTEL_COMPILER)) |
| # pragma clang diagnostic push |
| # pragma clang diagnostic ignored "-Wunused-value" |
| #endif // __clang__ |
| /* no return */ |
| CILK_ASSERT(((void)"returned from __cilkrts_resume", 0)); |
| #if defined(__clang__) && (! defined(__INTEL_COMPILER)) |
| # pragma clang diagnostic pop |
| #endif // __clang__ |
| } |
| |
| w->l->post_suspend = fcn; |
| w->l->suspended_stack = sf; |
| |
| ITT_SYNC_RELEASING(w); |
| ITT_SYNC_PREPARE(w); |
| |
| #if FIBER_DEBUG >= 2 |
| fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime... w->l->frame_ff = %p, sf=%p\n", |
| cilkos_get_current_thread_id(), |
| w->self, w->l->frame_ff, |
| sf); |
| #endif |
| |
    // The current fiber is either (1) the one we are about to free,
    // or (2) one that has been passed up to the parent.
| cilk_fiber *current_fiber = ( w->l->fiber_to_free ? |
| w->l->fiber_to_free : |
| w->l->frame_ff->parent->fiber_child ); |
| cilk_fiber_data* fdata = cilk_fiber_get_data(current_fiber); |
| CILK_ASSERT(NULL == w->l->frame_ff->fiber_self); |
| |
| // Clear the sf in the current fiber for cleanliness, to prevent |
| // us from accidentally resuming a bad sf. |
| // Technically, resume_sf gets overwritten for a fiber when |
| // we are about to resume it anyway. |
| fdata->resume_sf = NULL; |
| CILK_ASSERT(fdata->owner == w); |
| |
| // Set the function to execute immediately after switching to the |
| // scheduling fiber, but before freeing any fibers. |
| cilk_fiber_set_post_switch_proc(w->l->scheduling_fiber, |
| enter_runtime_transition_proc); |
| cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_ORPHAN); |
| |
| if (w->l->fiber_to_free) { |
| // Case 1: we are freeing this fiber. We never |
| // resume this fiber again after jumping into the runtime. |
| w->l->fiber_to_free = NULL; |
| |
| // Extra check. Normally, the fiber we are about to switch to |
| // should have a NULL owner. |
| CILK_ASSERT(NULL == cilk_fiber_get_data(w->l->scheduling_fiber)->owner); |
| #if FIBER_DEBUG >= 4 |
        fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime.. current_fiber = %p, deallocate, switch to fiber %p\n",
| cilkos_get_current_thread_id(), |
| w->self, |
| current_fiber, w->l->scheduling_fiber); |
| #endif |
| cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_RELEASE); |
| NOTE_INTERVAL(w, INTERVAL_DEALLOCATE_RESUME_OTHER); |
| cilk_fiber_remove_reference_from_self_and_resume_other(current_fiber, |
| &w->l->fiber_pool, |
| w->l->scheduling_fiber); |
| // We should never come back here! |
| CILK_ASSERT(0); |
| } |
| else { |
| // Case 2: We are passing the fiber to our parent because we |
| // are leftmost. We should come back later to |
| // resume execution of user code. |
| // |
        // If we are not freeing a fiber, then we must be
| // returning from a spawn or processing an exception. The |
| // "sync" path always frees a fiber. |
| // |
| // We must be the leftmost child, and by left holder logic, we |
| // have already moved the current fiber into our parent full |
| // frame. |
| #if FIBER_DEBUG >= 2 |
        fprintf(stderr, "ThreadId=%p, W=%d: about to suspend self into runtime.. current_fiber = %p, switch to fiber %p\n",
| cilkos_get_current_thread_id(), |
| w->self, |
| current_fiber, w->l->scheduling_fiber); |
| #endif |
| |
| NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER); |
| |
| cilk_fiber_suspend_self_and_resume_other(current_fiber, |
| w->l->scheduling_fiber); |
| // Resuming this fiber returns control back to |
| // this function because our implementation uses OS fibers. |
| // |
        // On Unix, we could instead pass
        // user_code_resume_after_switch_into_runtime as an extra "resume_proc"
        // that resumes execution of user code directly, instead of
        // jumping back here and then jumping back to user code.
| #if FIBER_DEBUG >= 2 |
| CILK_ASSERT(fdata->owner == __cilkrts_get_tls_worker()); |
| #endif |
| user_code_resume_after_switch_into_runtime(current_fiber); |
| } |
| } |
| |
| /* |
| * Send a message to the children of the specified worker: run or wait. |
| */ |
| static void notify_children(__cilkrts_worker *w, unsigned int msg) |
| { |
| int child_num; |
| __cilkrts_worker *child; |
| int num_sys_workers = w->g->P - 1; |
| |
| // If worker is "n", then its children are 2n + 1, and 2n + 2. |
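    // For example, with P == 8 there are 7 system workers (0 through 6):
    // worker 0 signals workers 1 and 2, worker 1 signals 3 and 4, and
    // worker 2 signals 5 and 6; workers 3 through 6 are leaves of the tree.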
| child_num = (w->self << 1) + 1; |
| if (child_num < num_sys_workers) { |
| child = w->g->workers[child_num]; |
| CILK_ASSERT(child->l->signal_node); |
| signal_node_msg(child->l->signal_node, msg); |
| child_num++; |
| if (child_num < num_sys_workers) { |
| child = w->g->workers[child_num]; |
| CILK_ASSERT(child->l->signal_node); |
| signal_node_msg(child->l->signal_node, msg); |
| } |
| } |
| } |
| |
| /* |
| * Notify this worker's children that they need to wait. |
| */ |
| static void notify_children_wait(__cilkrts_worker *w) |
| { |
| notify_children(w, 0); |
| } |
| |
| /* |
| * Notify this worker's children to run and start trying to steal. |
| */ |
| static void notify_children_run(__cilkrts_worker *w) |
| { |
| notify_children(w, 1); |
| } |
| |
| /** |
| * A single "check" to find work, either on our queue or through a |
 * steal attempt.  This method checks our local queue once and, if the
 * queue is empty, performs one steal attempt.
| */ |
| static full_frame* check_for_work(__cilkrts_worker *w) |
| { |
| full_frame *ff = NULL; |
| ff = pop_next_frame(w); |
| // If there is no work on the queue, try to steal some. |
| if (NULL == ff) { |
| START_INTERVAL(w, INTERVAL_STEALING) { |
| if (w->l->type != WORKER_USER && w->l->team != NULL) { |
| // At this point, the worker knows for certain that it has run |
| // out of work. Therefore, it loses its team affiliation. User |
| // workers never change teams, of course. |
| __cilkrts_worker_lock(w); |
| w->l->team = NULL; |
| __cilkrts_worker_unlock(w); |
| } |
| |
| // If we are about to do a random steal, we should have no |
| // full frame... |
| CILK_ASSERT(NULL == w->l->frame_ff); |
| random_steal(w); |
| } STOP_INTERVAL(w, INTERVAL_STEALING); |
| |
| // If the steal was successful, then the worker has populated its next |
| // frame with the work to resume. |
| ff = pop_next_frame(w); |
| if (NULL == ff) { |
| // Punish the worker for failing to steal. |
| // No quantum for you! |
| __cilkrts_yield(); |
| w->l->steal_failure_count++; |
| } else { |
| // Reset steal_failure_count since there is obviously still work to |
| // be done. |
| w->l->steal_failure_count = 0; |
| } |
| } |
| return ff; |
| } |
| |
| /** |
| * Keep stealing or looking on our queue. |
| * |
 * Returns a full frame when one is found, or NULL if the
 * computation is done.
| */ |
| static full_frame* search_until_work_found_or_done(__cilkrts_worker *w) |
| { |
| full_frame *ff = NULL; |
| // Find a full frame to execute (either through random stealing, |
| // or because we pull it off w's 1-element queue). |
| while (!ff) { |
| // Check worker state to figure out our next action. |
| switch (worker_runnable(w)) |
| { |
| case SCHEDULE_RUN: // One attempt at checking for work. |
| ff = check_for_work(w); |
| break; |
| case SCHEDULE_WAIT: // go into wait-mode. |
| CILK_ASSERT(WORKER_SYSTEM == w->l->type); |
| // If we are about to wait, then we better not have |
| // a frame that we should execute... |
| CILK_ASSERT(NULL == w->l->next_frame_ff); |
| notify_children_wait(w); |
| signal_node_wait(w->l->signal_node); |
| // ... |
| // Runtime is waking up. |
| notify_children_run(w); |
| w->l->steal_failure_count = 0; |
| break; |
| case SCHEDULE_EXIT: // exit the scheduler. |
| CILK_ASSERT(WORKER_USER != w->l->type); |
| return NULL; |
| default: |
| CILK_ASSERT(0); |
| abort(); |
| } |
| } |
| return ff; |
| } |
| |
| /** |
| * The proc method for a scheduling fiber on a user worker. |
| * |
| * When a user worker jumps into the runtime, it jumps into this |
| * method by either starting it if the scheduling fiber has never run |
| * before, or resuming the fiber if it was previously suspended. |
| */ |
| COMMON_PORTABLE |
| void scheduler_fiber_proc_for_user_worker(cilk_fiber *fiber) |
| { |
| __cilkrts_worker* w = cilk_fiber_get_owner(fiber); |
| CILK_ASSERT(w); |
| |
| // This must be a user worker |
| CILK_ASSERT(WORKER_USER == w->l->type); |
| |
| // If we aren't the current worker, then something is very wrong |
| // here.. |
| verify_current_wkr(w); |
| |
| __cilkrts_run_scheduler_with_exceptions(w); |
| } |
| |
| |
| /** |
| * The body of the runtime scheduling loop. This function executes in |
| * 4 stages: |
| * |
| * 1. Transitions from the user code into the runtime by |
| * executing any scheduling-stack functions. |
| * 2. Looks for a full frame enqueued from a successful provably |
| * good steal. |
| * 3. If no full frame is found in step 2, steal until |
| * a frame is found or we are done. If we are done, finish |
| * the scheduling loop. |
| * 4. When a frame is found, setup to resume user code. |
| * In particular, suspend the current fiber and resume the |
| * user fiber to execute the frame. |
| * |
| * Returns a fiber object that we should switch to after completing |
| * the body of the loop, or NULL if we should continue executing on |
| * this fiber. |
| * |
| * @pre @c current_fiber should equal @c wptr->l->scheduling_fiber |
| * |
 * @param current_fiber The currently executing (scheduling) fiber.
 * @param wptr The currently executing worker.
 * @return The next fiber we should switch to.
| */ |
| static cilk_fiber* worker_scheduling_loop_body(cilk_fiber* current_fiber, |
| void* wptr) |
| { |
| __cilkrts_worker *w = (__cilkrts_worker*) wptr; |
| CILK_ASSERT(current_fiber == w->l->scheduling_fiber); |
| |
| // Stage 1: Transition from executing user code to the runtime code. |
| // We don't need to do this call here any more, because |
| // every switch to the scheduling fiber should make this call |
| // using a post_switch_proc on the fiber. |
| // |
| // enter_runtime_transition_proc(w->l->scheduling_fiber, wptr); |
| |
| // After Stage 1 is complete, w should no longer have |
| // an associated full frame. |
| CILK_ASSERT(NULL == w->l->frame_ff); |
| |
| // Stage 2. First do a quick check of our 1-element queue. |
| full_frame *ff = pop_next_frame(w); |
| |
| if (!ff) { |
| // Stage 3. We didn't find anything from our 1-element |
| // queue. Now go through the steal loop to find work. |
| ff = search_until_work_found_or_done(w); |
| if (!ff) { |
| CILK_ASSERT(w->g->work_done); |
| return NULL; |
| } |
| } |
| |
| // Stage 4. Now that we have found a full frame to work on, |
| // actually execute it. |
| __cilkrts_stack_frame *sf; |
| |
| // There shouldn't be any uncaught exceptions. |
| // |
| // In Windows, the OS catches any exceptions not caught by the |
| // user code. Thus, we are omitting the check on Windows. |
| // |
| // On Android, calling std::uncaught_exception with the stlport |
| // library causes a seg fault. Since we're not supporting |
| // exceptions there at this point, just don't do the check |
| CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION(); |
| |
| BEGIN_WITH_WORKER_LOCK(w) { |
| CILK_ASSERT(!w->l->frame_ff); |
| BEGIN_WITH_FRAME_LOCK(w, ff) { |
| sf = ff->call_stack; |
| CILK_ASSERT(sf && !sf->call_parent); |
| setup_for_execution(w, ff, 0); |
| } END_WITH_FRAME_LOCK(w, ff); |
| } END_WITH_WORKER_LOCK(w); |
| |
| /* run it */ |
| // |
| // Prepare to run the full frame. To do so, we need to: |
| // (a) Execute some code on this fiber (the scheduling |
| // fiber) to set up data structures, and |
| // (b) Suspend the scheduling fiber, and resume the |
| // user-code fiber. |
| |
| // Part (a). Set up data structures. |
| scheduling_fiber_prepare_to_resume_user_code(w, ff, sf); |
| |
| cilk_fiber *other = w->l->frame_ff->fiber_self; |
| cilk_fiber_data* other_data = cilk_fiber_get_data(other); |
| cilk_fiber_data* current_fiber_data = cilk_fiber_get_data(current_fiber); |
| |
| // I believe two cases are possible here, both of which |
| // should have other_data->resume_sf as NULL. |
| // |
| // 1. Resuming a fiber that was previously executing |
| // user code (i.e., a provably-good-steal). |
| // In this case, resume_sf should have been |
| // set to NULL when it was suspended. |
| // |
| // 2. Resuming code on a steal. In this case, since we |
| // grabbed a new fiber, resume_sf should be NULL. |
| CILK_ASSERT(NULL == other_data->resume_sf); |
| |
| #if FIBER_DEBUG >= 2 |
| fprintf(stderr, "W=%d: other fiber=%p, setting resume_sf to %p\n", |
            w->self, other, sf);
| #endif |
| // Update our own fiber's data. |
| current_fiber_data->resume_sf = NULL; |
| // The scheduling fiber should have the right owner from before. |
| CILK_ASSERT(current_fiber_data->owner == w); |
| other_data->resume_sf = sf; |
| |
| |
| #if FIBER_DEBUG >= 3 |
| fprintf(stderr, "ThreadId=%p (about to suspend self resume other), W=%d: current_fiber=%p, other=%p, current_fiber->resume_sf = %p, other->resume_sf = %p\n", |
| cilkos_get_current_thread_id(), |
| w->self, |
| current_fiber, other, |
| current_fiber_data->resume_sf, |
| other_data->resume_sf); |
| #endif |
| return other; |
| } |
| |
| |
| /** |
| * This function is executed once by each worker, to initialize its |
| * scheduling loop. |
| */ |
| static void worker_scheduler_init_function(__cilkrts_worker *w) |
| { |
| // First, execute the startup tasks that must happen for all |
| // worker types. |
| ITT_SYNC_PREPARE(w); |
| /* Notify tools about the new worker. Inspector needs this, but we |
| don't want to confuse Cilkscreen with system threads. User threads |
| do this notification in bind_thread */ |
| if (! w->g->under_ptool) |
| __cilkrts_cilkscreen_establish_worker(w); |
| |
| // Seed the initial random number generator. |
| // If we forget to do this, then the worker always steals from 0. |
| // Programs will still execute correctly, but |
| // you may see a subtle performance bug... |
| mysrand(w, (w->self + 1)); |
| |
| // The startup work varies, depending on the worker type. |
| switch (w->l->type) { |
| case WORKER_USER: |
| // Stop working once we've entered the scheduler. |
| // For user workers, INTERVAL_IN_SCHEDULER counts the time |
| // since we called bind_thread. |
| break; |
| |
| case WORKER_SYSTEM: |
| // If a system worker is starting, we must also be starting |
| // the runtime. |
| |
| // Runtime begins in a wait-state and is woken up by the first user |
| // worker when the runtime is ready. |
| signal_node_wait(w->l->signal_node); |
| // ... |
| // Runtime is waking up. |
| notify_children_run(w); |
| w->l->steal_failure_count = 0; |
| |
| // For system threads, count all the time this thread is |
| // alive in the scheduling loop. |
| START_INTERVAL(w, INTERVAL_IN_SCHEDULER); |
| START_INTERVAL(w, INTERVAL_WORKING); |
| break; |
| default: |
| __cilkrts_bug("Unknown worker %p of type %d entering scheduling loop\n", |
| w, w->l->type); |
| } |
| } |
| |
| /** |
| * This function is executed once by each worker, to finish its |
| * scheduling loop. |
| * |
| * @note Currently, only system workers finish their loops. User |
| * workers will jump away to user code without exiting their |
| * scheduling loop. |
| */ |
| static void worker_scheduler_terminate_function(__cilkrts_worker *w) |
| { |
| // A user worker should never finish by falling through the |
| // scheduling loop. |
| CILK_ASSERT(WORKER_USER != w->l->type); |
| STOP_INTERVAL(w, INTERVAL_IN_RUNTIME); |
| STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER); |
| } |
| |
| /** |
| * The main scheduler function executed by a worker's scheduling |
| * fiber. |
| * |
| * This method is started by either a new system worker, or a user |
| * worker that has stalled and just been imported into the runtime. |
| */ |
| static void worker_scheduler_function(__cilkrts_worker *w) |
| { |
| worker_scheduler_init_function(w); |
| |
| // The main scheduling loop body. |
| |
| while (!w->g->work_done) { |
| // Set intervals. Now we are in the runtime instead of working. |
| START_INTERVAL(w, INTERVAL_IN_RUNTIME); |
| STOP_INTERVAL(w, INTERVAL_WORKING); |
| |
| // Execute the "body" of the scheduling loop, and figure |
| // out the fiber to jump to next. |
| cilk_fiber* fiber_to_resume |
| = worker_scheduling_loop_body(w->l->scheduling_fiber, w); |
| |
| if (fiber_to_resume) { |
| // Suspend the current fiber and resume next one. |
| NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER); |
| STOP_INTERVAL(w, INTERVAL_IN_RUNTIME); |
| START_INTERVAL(w, INTERVAL_WORKING); |
| cilk_fiber_suspend_self_and_resume_other(w->l->scheduling_fiber, |
| fiber_to_resume); |
| |
| // Return here only when this (scheduling) fiber is |
| // resumed (i.e., this worker wants to reenter the runtime). |
| } |
| } |
| |
| // Finish the scheduling loop. |
| worker_scheduler_terminate_function(w); |
| } |
| |
| |
| /************************************************************* |
| Forward declarations for reduction protocol. |
| *************************************************************/ |
| |
| static __cilkrts_worker* |
| execute_reductions_for_sync(__cilkrts_worker *w, |
| full_frame *ff, |
| __cilkrts_stack_frame *sf_at_sync); |
| |
| static __cilkrts_worker* |
| execute_reductions_for_spawn_return(__cilkrts_worker *w, |
| full_frame *ff, |
| __cilkrts_stack_frame *returning_sf); |
| |
| |
| |
| /************************************************************* |
| Scheduler functions that are callable by client code |
| *************************************************************/ |
| static full_frame *disown(__cilkrts_worker *w, |
| full_frame *ff, |
| __cilkrts_stack_frame *sf, |
| const char *why) |
| { |
| CILK_ASSERT(ff); |
| make_unrunnable(w, ff, sf, sf != 0, why); |
| w->l->frame_ff = 0; |
| return ff->parent; |
| } |
| |
| /** |
| * Called when ff is returning from a spawn, and we need to execute a |
| * reduction. |
| * |
| * @param w The currently executing worker. |
| * @param ff The full frame for w. |
| * @param returning_sf The stack frame for the spawn helper that is returning. |
| * |
| * Normally, by the time we gain control in the runtime, the worker |
| * has already popped off the __cilkrts_stack_frame "returning_sf" |
| * from its call chain. |
| * |
| * When we have only serial reductions, w->current_stack_frame is not |
| * needed any more, because w is about to enter the runtime scheduling |
| * loop anyway. Similarly, the frame "ff" is slated to be destroyed |
| * after the runtime finishes the return from spawn and splices ff out |
| * of the tree of full frames. |
| * |
| * To execute a parallel reduction, however, we still want |
| * w->current_stack_frame == returning_sf, and we are going to use the |
| * frame ff for a little bit longer. |
| * |
| * This method: |
| * |
| * 1. Puts returning_sf back as w's current stack frame. |
| * 2. Makes "ff" runnable again on w. |
| */ |
| static inline |
| void restore_frame_for_spawn_return_reduction(__cilkrts_worker *w, |
| full_frame *ff, |
| __cilkrts_stack_frame *returning_sf) { |
| #if REDPAR_DEBUG >= 2 |
| CILK_ASSERT(returning_sf); |
| CILK_ASSERT(returning_sf->worker == w); |
| #endif |
| // Change w's current stack frame back to "returning_sf". |
| // |
| // Intuitively, w->current_stack_frame should be |
| // returning_sf->call_parent at this point. |
| // |
| // We can not assert this, however, because the pop of |
| // returning_sf from the call chain has already cleared |
| // returning_sf->call_parent. We don't want to restore the call |
| // parent of returning_sf, because its parent has been stolen, and |
| // the runtime assumes that steals break this link. |
| |
| // We cannot assert call_parent is NULL either, since that's not true for |
| // Win64 exception handling |
| // CILK_ASSERT(returning_sf->call_parent == NULL); |
| w->current_stack_frame = returning_sf; |
| |
| // Make the full frame "ff" runnable again, in preparation for |
| // executing the reduction. |
| make_runnable(w, ff); |
| } |
| |
| |
| NORETURN __cilkrts_c_sync(__cilkrts_worker *w, |
| __cilkrts_stack_frame *sf_at_sync) |
| { |
| full_frame *ff; |
| |
| // Claim: This read of w->l->frame_ff can occur without |
| // holding the worker lock because when w has reached a sync |
| // and entered the runtime (because it stalls), w's deque is empty |
| // and no one else can steal and change w->l->frame_ff. |
| |
| ff = w->l->frame_ff; |
| #ifdef _WIN32 |
| __cilkrts_save_exception_state(w, ff); |
| #else |
| // Move any pending exceptions into the full frame |
| CILK_ASSERT(NULL == ff->pending_exception); |
| ff->pending_exception = w->l->pending_exception; |
| w->l->pending_exception = NULL; |
| #endif |
| |
| w = execute_reductions_for_sync(w, ff, sf_at_sync); |
| |
| #if FIBER_DEBUG >= 3 |
    fprintf(stderr, "ThreadId=%p, w->self = %d. about to longjmp_into_runtime[c_sync] with ff=%p\n",
| cilkos_get_current_thread_id(), w->self, ff); |
| #endif |
| |
| longjmp_into_runtime(w, do_sync, sf_at_sync); |
| } |
| |
| static void do_sync(__cilkrts_worker *w, full_frame *ff, |
| __cilkrts_stack_frame *sf) |
| { |
| //int abandoned = 1; |
| enum provably_good_steal_t steal_result = ABANDON_EXECUTION; |
| |
| START_INTERVAL(w, INTERVAL_SYNC_CHECK) { |
| BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) { |
| |
| CILK_ASSERT(ff); |
| BEGIN_WITH_FRAME_LOCK(w, ff) { |
| CILK_ASSERT(sf->call_parent == 0); |
| CILK_ASSERT(sf->flags & CILK_FRAME_UNSYNCHED); |
| |
| // Before switching into the scheduling fiber, we should have |
| // already taken care of deallocating the current |
| // fiber. |
| CILK_ASSERT(NULL == ff->fiber_self); |
| |
| // Update the frame's pedigree information if this is an ABI 1 |
| // or later frame |
| if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1) |
| { |
| sf->parent_pedigree.rank = w->pedigree.rank; |
| sf->parent_pedigree.parent = w->pedigree.parent; |
| |
| // Note that the pedigree rank needs to be updated |
| // when setup_for_execution_pedigree runs |
| sf->flags |= CILK_FRAME_SF_PEDIGREE_UNSYNCHED; |
| } |
| |
| /* the decjoin() occurs in provably_good_steal() */ |
| steal_result = provably_good_steal(w, ff); |
| |
| } END_WITH_FRAME_LOCK(w, ff); |
| // set w->l->frame_ff = NULL after checking abandoned |
| if (WAIT_FOR_CONTINUE != steal_result) { |
| w->l->frame_ff = NULL; |
| } |
| } END_WITH_WORKER_LOCK_OPTIONAL(w); |
| } STOP_INTERVAL(w, INTERVAL_SYNC_CHECK); |
| |
| // Now, if we are in a replay situation and provably_good_steal() returned |
| // WAIT_FOR_CONTINUE, we should sleep, reacquire locks, call |
| // provably_good_steal(), and release locks until we get a value other |
| // than WAIT_FOR_CONTINUE from the function. |
| #ifdef CILK_RECORD_REPLAY |
| // We don't have to explicitly check for REPLAY_LOG below because |
| // steal_result can only be set to WAIT_FOR_CONTINUE during replay |
| while(WAIT_FOR_CONTINUE == steal_result) |
| { |
| __cilkrts_sleep(); |
| BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) |
| { |
| ff = w->l->frame_ff; |
| BEGIN_WITH_FRAME_LOCK(w, ff) |
| { |
| steal_result = provably_good_steal(w, ff); |
| } END_WITH_FRAME_LOCK(w, ff); |
| if (WAIT_FOR_CONTINUE != steal_result) |
| w->l->frame_ff = NULL; |
| } END_WITH_WORKER_LOCK_OPTIONAL(w); |
| } |
| #endif // CILK_RECORD_REPLAY |
| |
| #ifdef ENABLE_NOTIFY_ZC_INTRINSIC |
| // If we can't make any further progress on this thread, tell Inspector |
| // that we're abandoning the work and will go find something else to do. |
| if (ABANDON_EXECUTION == steal_result) |
| { |
| NOTIFY_ZC_INTRINSIC("cilk_sync_abandon", 0); |
| } |
| #endif // defined ENABLE_NOTIFY_ZC_INTRINSIC |
| |
| return; /* back to scheduler loop */ |
| } |
| |
| /* worker W completely promotes its own deque, simulating the case |
| where the whole deque is stolen. We use this mechanism to force |
| the allocation of new storage for reducers for race-detection |
| purposes. */ |
| void __cilkrts_promote_own_deque(__cilkrts_worker *w) |
| { |
| // Remember the fiber we start this method on. |
| CILK_ASSERT(w->l->frame_ff); |
| cilk_fiber* starting_fiber = w->l->frame_ff->fiber_self; |
| |
| BEGIN_WITH_WORKER_LOCK(w) { |
| while (dekker_protocol(w)) { |
| /* PLACEHOLDER_FIBER is used as non-null marker to tell detach() |
| and make_child() that this frame should be treated as a spawn |
| parent, even though we have not assigned it a stack. */ |
| detach_for_steal(w, w, PLACEHOLDER_FIBER); |
| } |
| } END_WITH_WORKER_LOCK(w); |
| |
| |
| // TBD: The management of full frames and fibers is a bit |
| // sketchy here. We are promoting stack frames into full frames, |
| // and pretending they are stolen away, but no other worker is |
| // actually working on them. Some runtime invariants |
| // may be broken here. |
| // |
    // Technically, if we are simulating a steal from w,
    // w should get a new full frame but
    // keep the same fiber.  A real thief would take the
    // loot frame away, get a new fiber, and start executing the
    // loot frame.
| // |
| // What should a fake thief do? Where does the frame go? |
| |
    // In any case, we should finish the promotion process on
    // the same fiber we started with.
| CILK_ASSERT(w->l->frame_ff); |
| CILK_ASSERT(w->l->frame_ff->fiber_self == starting_fiber); |
| } |
| |
| |
| |
| /* the client code calls this function after a spawn when the dekker |
| protocol fails. The function may either return or longjmp |
| into the rts |
| |
| This function takes in a "returning_sf" argument which corresponds |
| to the __cilkrts_stack_frame that we are finishing (i.e., the |
| argument to __cilkrts_leave_frame). |
| */ |
| void __cilkrts_c_THE_exception_check(__cilkrts_worker *w, |
| __cilkrts_stack_frame *returning_sf) |
| { |
| full_frame *ff; |
| int stolen_p; |
| __cilkrts_stack_frame *saved_sf = NULL; |
| |
| START_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK); |
| |
| BEGIN_WITH_WORKER_LOCK(w) { |
| ff = w->l->frame_ff; |
| CILK_ASSERT(ff); |
| /* This code is called only upon a normal return and never |
| upon an exceptional return. Assert that this is the |
| case. */ |
| CILK_ASSERT(!w->l->pending_exception); |
| |
| reset_THE_exception(w); |
| stolen_p = !(w->head < (w->tail + 1)); /* +1 because tail was |
| speculatively |
| decremented by the |
| compiled code */ |
| |
| if (stolen_p) { |
| /* XXX This will be charged to THE for accounting purposes */ |
| __cilkrts_save_exception_state(w, ff); |
| |
| // Save the value of the current stack frame. |
| saved_sf = w->current_stack_frame; |
| |
| // Reverse the decrement from undo_detach. |
| // This update effectively resets the deque to be |
| // empty (i.e., changes w->tail back to equal w->head). |
| // We need to reset the deque to execute parallel |
| // reductions. When we have only serial reductions, it |
| // does not matter, since serial reductions do not |
| // change the deque. |
| w->tail++; |
| #if REDPAR_DEBUG > 1 |
| // ASSERT our deque is empty. |
| CILK_ASSERT(w->head == w->tail); |
| #endif |
| } |
| } END_WITH_WORKER_LOCK(w); |
| |
| STOP_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK); |
| |
| if (stolen_p) |
| { |
| w = execute_reductions_for_spawn_return(w, ff, returning_sf); |
| |
| // "Mr. Policeman? My parent always told me that if I was in trouble |
| // I should ask a nice policeman for help. I can't find my parent |
| // anywhere..." |
| // |
| // Write a record to the replay log for an attempt to return to a stolen parent |
| replay_record_orphaned(w); |
| |
| // Update the pedigree only after we've finished the |
| // reductions. |
| update_pedigree_on_leave_frame(w, returning_sf); |
| |
| // Notify Inspector that the parent has been stolen and we're |
| // going to abandon this work and go do something else. This |
| // will match the cilk_leave_begin in the compiled code |
| NOTIFY_ZC_INTRINSIC("cilk_leave_stolen", saved_sf); |
| |
| DBGPRINTF ("%d: longjmp_into_runtime from __cilkrts_c_THE_exception_check\n", w->self); |
| longjmp_into_runtime(w, do_return_from_spawn, 0); |
| DBGPRINTF ("%d: returned from longjmp_into_runtime from __cilkrts_c_THE_exception_check?!\n", w->self); |
| } |
| else |
| { |
| NOTE_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK_USELESS); |
| return; |
| } |
| } |
| |
| /* Return an exception to a stolen parent. */ |
| NORETURN __cilkrts_exception_from_spawn(__cilkrts_worker *w, |
| __cilkrts_stack_frame *returning_sf) |
| { |
| full_frame *ff = w->l->frame_ff; |
    // This is almost the same as THE_exception_check, except that
    // the detach didn't happen, so we don't need to undo the tail
    // update.
| CILK_ASSERT(w->head == w->tail); |
| w = execute_reductions_for_spawn_return(w, ff, returning_sf); |
| |
| longjmp_into_runtime(w, do_return_from_spawn, 0); |
| CILK_ASSERT(0); |
| } |
| |
| static void do_return_from_spawn(__cilkrts_worker *w, |
| full_frame *ff, |
| __cilkrts_stack_frame *sf) |
| { |
| full_frame *parent_ff; |
| enum provably_good_steal_t steal_result = ABANDON_EXECUTION; |
| |
| BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) { |
| CILK_ASSERT(ff); |
| CILK_ASSERT(!ff->is_call_child); |
| CILK_ASSERT(sf == NULL); |
| parent_ff = ff->parent; |
| |
| BEGIN_WITH_FRAME_LOCK(w, ff) { |
| decjoin(ff); |
| } END_WITH_FRAME_LOCK(w, ff); |
| |
| BEGIN_WITH_FRAME_LOCK(w, parent_ff) { |
| if (parent_ff->simulated_stolen) |
| unconditional_steal(w, parent_ff); |
| else |
| steal_result = provably_good_steal(w, parent_ff); |
| } END_WITH_FRAME_LOCK(w, parent_ff); |
| |
| } END_WITH_WORKER_LOCK_OPTIONAL(w); |
| |
| // Loop here in replay mode |
| #ifdef CILK_RECORD_REPLAY |
| // We don't have to explicitly check for REPLAY_LOG below because |
| // steal_result can only get set to WAIT_FOR_CONTINUE during replay. |
| // We also don't have to worry about the simulated_stolen flag |
| // because steal_result can only be set to WAIT_FOR_CONTINUE by |
| // provably_good_steal(). |
| while(WAIT_FOR_CONTINUE == steal_result) |
| { |
| __cilkrts_sleep(); |
| BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) |
| { |
| BEGIN_WITH_FRAME_LOCK(w, parent_ff) |
| { |
| steal_result = provably_good_steal(w, parent_ff); |
| } END_WITH_FRAME_LOCK(w, parent_ff); |
| } END_WITH_WORKER_LOCK_OPTIONAL(w); |
| } |
| #endif // CILK_RECORD_REPLAY |
| |
| // Cleanup the child frame. |
| __cilkrts_destroy_full_frame(w, ff); |
| return; |
| } |
| |
| #ifdef _WIN32 |
| /* migrate an exception across fibers. Call this function when an exception has |
| * been thrown and has to traverse across a steal. The exception has already |
| * been wrapped up, so all that remains is to longjmp() into the continuation, |
| * sync, and re-raise it. |
| */ |
| void __cilkrts_migrate_exception(__cilkrts_stack_frame *sf) { |
| |
| __cilkrts_worker *w = sf->worker; |
| full_frame *ff; |
| |
| BEGIN_WITH_WORKER_LOCK(w) { |
| ff = w->l->frame_ff; |
| reset_THE_exception(w); |
| /* there is no need to check for a steal because we wouldn't be here if |
| there weren't a steal. */ |
| __cilkrts_save_exception_state(w, ff); |
| |
| CILK_ASSERT(w->head == w->tail); |
| } END_WITH_WORKER_LOCK(w); |
| |
| { |
        // TBD(jsukha): This function emulates
        // the "do_return_from_spawn" path.
| w = execute_reductions_for_spawn_return(w, ff, sf); |
| } |
| |
| longjmp_into_runtime(w, do_return_from_spawn, 0); /* does not return. */ |
| CILK_ASSERT(! "Shouldn't be here..."); |
| } |
| #endif |
| |
| |
| /* Pop a call stack from TAIL. Return the call stack, or NULL if the |
| queue is empty */ |
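/* (The worker pops frames from the TAIL end of its own deque, as here;
   thieves take frames from the HEAD end.  That is why the emptiness test
   below compares head against tail.) */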
| __cilkrts_stack_frame *__cilkrts_pop_tail(__cilkrts_worker *w) |
| { |
| __cilkrts_stack_frame *sf; |
| BEGIN_WITH_WORKER_LOCK(w) { |
| __cilkrts_stack_frame *volatile *tail = w->tail; |
| if (w->head < tail) { |
| --tail; |
| sf = *tail; |
| w->tail = tail; |
| } else { |
| sf = 0; |
| } |
| } END_WITH_WORKER_LOCK(w); |
| return sf; |
| } |
| |
| #ifdef CILK_RECORD_REPLAY |
| __cilkrts_stack_frame *simulate_pop_tail(__cilkrts_worker *w) |
| { |
| __cilkrts_stack_frame *sf; |
| BEGIN_WITH_WORKER_LOCK(w) { |
| if (w->head < w->tail) { |
| sf = *(w->tail-1); |
| } else { |
| sf = 0; |
| } |
| } END_WITH_WORKER_LOCK(w); |
| return sf; |
| } |
| #endif |
| |
| |
| /* Return from a call, not a spawn. */ |
| void __cilkrts_return(__cilkrts_worker *w) |
| { |
| full_frame *ff, *parent_ff; |
| START_INTERVAL(w, INTERVAL_RETURNING); |
| |
| BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) { |
| ff = w->l->frame_ff; |
| CILK_ASSERT(ff); |
| CILK_ASSERT(ff->join_counter == 1); |
| /* This path is not used to return from spawn. */ |
| CILK_ASSERT(ff->is_call_child); |
| |
| BEGIN_WITH_FRAME_LOCK(w, ff) { |
| // After this call, w->l->frame_ff != ff. |
            // Technically, however, w still "owns" ff until ff is freed,
            // because ff is a dying leaf full frame.
| parent_ff = disown(w, ff, 0, "return"); |
| decjoin(ff); |
| |
| #ifdef _WIN32 |
| __cilkrts_save_exception_state(w, ff); |
| #else |
| // Move the pending exceptions into the full frame |
| // This should always be NULL if this isn't a |
| // return with an exception |
| CILK_ASSERT(NULL == ff->pending_exception); |
| ff->pending_exception = w->l->pending_exception; |
| w->l->pending_exception = NULL; |
| #endif // _WIN32 |
| |
| } END_WITH_FRAME_LOCK(w, ff); |
| |
| __cilkrts_fence(); /* redundant */ |
| |
| CILK_ASSERT(parent_ff); |
| |
| BEGIN_WITH_FRAME_LOCK(w, parent_ff) { |
| finalize_child_for_call(w, parent_ff, ff); |
| } END_WITH_FRAME_LOCK(w, parent_ff); |
| |
| ff = pop_next_frame(w); |
| /* ff will be non-null except when the parent frame is owned |
| by another worker. |
| CILK_ASSERT(ff) |
| */ |
| CILK_ASSERT(!w->l->frame_ff); |
| if (ff) { |
| BEGIN_WITH_FRAME_LOCK(w, ff) { |
| __cilkrts_stack_frame *sf = ff->call_stack; |
| CILK_ASSERT(sf && !sf->call_parent); |
| setup_for_execution(w, ff, 1); |
| } END_WITH_FRAME_LOCK(w, ff); |
| } |
| } END_WITH_WORKER_LOCK_OPTIONAL(w); |
| |
| STOP_INTERVAL(w, INTERVAL_RETURNING); |
| } |
| |
| static void __cilkrts_unbind_thread() |
| { |
| int stop_cilkscreen = 0; |
| global_state_t *g; |
| |
| // Take out the global OS mutex to protect accesses to the table of workers |
| global_os_mutex_lock(); |
| |
| if (cilkg_is_published()) { |
| __cilkrts_worker *w = __cilkrts_get_tls_worker(); |
| if (w) { |
| g = w->g; |
| |
| // If there's only 1 worker, the counts will be stopped in |
| // __cilkrts_scheduler |
| if (g->P > 1) |
| { |
| STOP_INTERVAL(w, INTERVAL_WORKING); |
| STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER); |
| } |
| |
| __cilkrts_set_tls_worker(0); |
| |
| if (w->self == -1) { |
| // This worker is an overflow worker. I.e., it was created on- |
| // demand when the global pool ran out of workers. |
| destroy_worker(w); |
| __cilkrts_free(w); |
| } else { |
| // This is a normal user worker and needs to be counted by the |
| // global state for the purposes of throttling system workers. |
| w->l->type = WORKER_FREE; |
| __cilkrts_leave_cilk(g); |
| } |
| |
| stop_cilkscreen = (0 == g->Q); |
| } |
| } |
| global_os_mutex_unlock(); |
| |
| /* Turn off Cilkscreen. This needs to be done when we are NOT holding the |
| * os mutex. */ |
| if (stop_cilkscreen) |
| __cilkrts_cilkscreen_disable_instrumentation(); |
| } |
| |
| /* special return from the initial frame */ |
| |
| void __cilkrts_c_return_from_initial(__cilkrts_worker *w) |
| { |
| struct cilkred_map *rm; |
| |
| /* This is only called on a user thread worker. */ |
| CILK_ASSERT(w->l->type == WORKER_USER); |
| |
| #if REDPAR_DEBUG >= 3 |
| fprintf(stderr, "[W=%d, desc=cilkrts_c_return_from_initial, ff=%p]\n", |
| w->self, w->l->frame_ff); |
| #endif |
| |
| BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) { |
| full_frame *ff = w->l->frame_ff; |
| CILK_ASSERT(ff); |
| CILK_ASSERT(ff->join_counter == 1); |
| w->l->frame_ff = 0; |
| |
| CILK_ASSERT(ff->fiber_self); |
| // Save any TBB interop data for the next time this thread enters Cilk |
| cilk_fiber_tbb_interop_save_info_from_stack(ff->fiber_self); |
| |
| // Deallocate cilk_fiber that mapped to the user stack. The stack |
| // itself does not get deallocated (of course) but our data |
| // structure becomes divorced from it. |
| |
| #if FIBER_DEBUG >= 1 |
| fprintf(stderr, "ThreadId=%p: w=%d: We are about to deallocate ff->fiber_self = %p here. w->l->scheduling_fiber = %p. w->l->type = %d\n", |
| cilkos_get_current_thread_id(), |
| w->self, |
| ff->fiber_self, |
| w->l->scheduling_fiber, |
| w->l->type); |
| #endif |
| // The fiber in ff is a user-code fiber. The fiber in |
| // w->l->scheduling_fiber is a scheduling fiber. These fibers should |
| // never be equal. When a user worker returns (and will unbind), we |
| // should destroy only the fiber in ff. The scheduling fiber will be |
| // re-used. |
| |
| CILK_ASSERT(ff->fiber_self != w->l->scheduling_fiber); |
| |
| START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) { |
| // This fiber might not be deallocated here if there |
| // is a pending exception on Windows that refers |
| // to this fiber. |
| // |
| // First "suspend" the fiber, and then try to delete it. |
| cilk_fiber_deallocate_from_thread(ff->fiber_self); |
| } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE); |
| ff->fiber_self = NULL; |
| |
| /* Save reducer map into global_state object */ |
| rm = w->reducer_map; |
| w->reducer_map = NULL; |
| |
| #if REDPAR_DEBUG >= 3 |
| fprintf(stderr, "W=%d, reducer_map_to_delete=%p, was in ff=%p\n", |
| w->self, |
| rm, |
| ff); |
| #endif |
| __cilkrts_destroy_full_frame(w, ff); |
| |
| |
| /* Work is never done. w->g->work_done = 1; __cilkrts_fence(); */ |
| } END_WITH_WORKER_LOCK_OPTIONAL(w); |
| |
| |
| save_pedigree_leaf_from_user_worker(w); |
| |
| // Workers can have NULL reducer maps now. |
| if (rm) { |
| __cilkrts_destroy_reducer_map(w, rm); |
| } |
| |
| |
| #if FIBER_DEBUG >= 1 |
| __cilkrts_worker* tmp = w; |
| int tmp_id = w->self; |
    fprintf(stderr, "w=%d: We are about to unbind thread (w= %p)\n",
| w->self, |
| w); |
| #endif |
| |
| w = NULL; |
| |
| __cilkrts_unbind_thread(); |
| |
| #if FIBER_DEBUG >= 1 |
| |
| fprintf(stderr, "w=%p, %d: Finished unbind\n", |
| tmp, tmp_id); |
| #endif |
| |
| /* Other workers will stop trying to steal if this was the last worker. */ |
| |
| return; |
| } |
| |
| |
| /* |
| * __cilkrts_restore_stealing |
| * |
| * Restore the protected_tail to a previous state, possibly allowing frames |
| * to be stolen. The dekker_protocol has been extended to steal only if |
| * head+1 is < protected_tail. |
| */ |
| |
| void __cilkrts_restore_stealing( |
| __cilkrts_worker *w, |
| __cilkrts_stack_frame *volatile *saved_protected_tail) |
| { |
| /* On most x86 this pair of operations would be slightly faster |
| as an atomic exchange due to the implicit memory barrier in |
| an atomic instruction. */ |
| w->protected_tail = saved_protected_tail; |
| __cilkrts_fence(); |
| } |
| |
| /* |
| * __cilkrts_disallow_stealing |
| * |
| * Move the protected_tail to NEW_PROTECTED_TAIL, preventing any |
| * frames from being stolen. If NEW_PROTECTED_TAIL is NULL, prevent |
| * stealing from the whole queue. The dekker_protocol has been |
| * extended to only steal if head+1 is also < protected_tail. |
| */ |
| |
| __cilkrts_stack_frame *volatile *__cilkrts_disallow_stealing( |
| __cilkrts_worker *w, |
| __cilkrts_stack_frame *volatile *new_protected_tail) |
| { |
| __cilkrts_stack_frame *volatile *saved_protected_tail = w->protected_tail; |
| |
| if (!new_protected_tail) |
| new_protected_tail = w->l->ltq; |
| |
| if (w->protected_tail > new_protected_tail) { |
| w->protected_tail = new_protected_tail; |
| /* Issue a store-store barrier. The update to protected_tail |
| here must precede the update to tail in the next spawn. |
| On x86 this is probably not needed. */ |
| #if defined __GNUC__ && __ICC >= 1200 && !(__MIC__ ||__MIC2__) |
| _mm_sfence(); |
| #else |
| __cilkrts_fence(); |
| #endif |
| } |
| |
| return saved_protected_tail; |
| } |
| |
| /************************************************************* |
| Initialization and startup |
| *************************************************************/ |
| |
| __cilkrts_worker *make_worker(global_state_t *g, |
| int self, __cilkrts_worker *w) |
| { |
| w->self = self; |
| w->g = g; |
| |
| w->pedigree.rank = 0; // Initial rank is 0 |
| w->pedigree.parent = NULL; |
| |
| w->l = (local_state *)__cilkrts_malloc(sizeof(*w->l)); |
| |
| __cilkrts_frame_malloc_per_worker_init(w); |
| |
| w->reducer_map = NULL; |
| w->current_stack_frame = NULL; |
| w->reserved = NULL; |
| |
| w->l->worker_magic_0 = WORKER_MAGIC_0; |
| w->l->team = NULL; |
| w->l->type = WORKER_FREE; |
| |
| __cilkrts_mutex_init(&w->l->lock); |
| __cilkrts_mutex_init(&w->l->steal_lock); |
| w->l->do_not_steal = 0; |
| w->l->frame_ff = 0; |
| w->l->next_frame_ff = 0; |
| w->l->last_full_frame = NULL; |
| |
| w->l->ltq = (__cilkrts_stack_frame **) |
| __cilkrts_malloc(g->ltqsize * sizeof(*w->l->ltq)); |
| w->ltq_limit = w->l->ltq + g->ltqsize; |
| w->head = w->tail = w->l->ltq; |
| |
| cilk_fiber_pool_init(&w->l->fiber_pool, |
| &g->fiber_pool, |
| g->stack_size, |
| g->fiber_pool_size, |
| 0, // alloc_max is 0. We don't allocate from the heap directly without checking the parent pool. |
| 0); |
| #if FIBER_DEBUG >= 2 |
| fprintf(stderr, "ThreadId=%p: Making w=%d (%p), pool = %p\n", |
| cilkos_get_current_thread_id(), |
| w->self, w, |
| &w->l->fiber_pool); |
| #endif |
| w->l->scheduling_fiber = NULL; |
| w->l->original_pedigree_leaf = NULL; |
| w->l->rand_seed = 0; /* the scheduler will overwrite this field */ |
| |
| w->l->post_suspend = 0; |
| w->l->suspended_stack = 0; |
| w->l->fiber_to_free = NULL; |
| w->l->pending_exception = NULL; |
| |
| #if CILK_PROFILE |
| w->l->stats = __cilkrts_malloc(sizeof(statistics)); |
| __cilkrts_init_stats(w->l->stats); |
| #else |
| w->l->stats = NULL; |
| #endif |
| w->l->steal_failure_count = 0; |
| |
    // Nothing's been stolen yet
    w->l->work_stolen = 0;
| |
| // Initialize record/replay assuming we're doing neither |
| w->l->record_replay_fptr = NULL; |
| w->l->replay_list_root = NULL; |
| w->l->replay_list_entry = NULL; |
| w->l->signal_node = NULL; |
    w->l->worker_magic_1 = WORKER_MAGIC_1;
| |
| /*w->parallelism_disabled = 0;*/ |
| |
    // Allow stealing all frames.  Sets w->protected_tail.
| __cilkrts_restore_stealing(w, w->ltq_limit); |
| |
| __cilkrts_init_worker_sysdep(w); |
| |
| reset_THE_exception(w); |
| |
| return w; |
| } |
| |
| void destroy_worker(__cilkrts_worker *w) |
| { |
| CILK_ASSERT (NULL == w->l->pending_exception); |
| |
| // Deallocate the scheduling fiber |
| if (NULL != w->l->scheduling_fiber) |
| { |
| // The scheduling fiber is the main fiber for system workers and must |
| // be deallocated by the thread that created it. Thus, we can |
| // deallocate only free workers' (formerly user workers) scheduling |
| // fibers here. |
| CILK_ASSERT(WORKER_FREE == w->l->type); |
| |
| #if FIBER_DEBUG >=1 |
| fprintf(stderr, "ThreadId=%p, w=%p, %d, deallocating scheduling fiber = %p, \n", |
| cilkos_get_current_thread_id(), |
| w, |
| w->self, |
| w->l->scheduling_fiber); |
| #endif |
| int ref_count = cilk_fiber_remove_reference(w->l->scheduling_fiber, NULL); |
| // Scheduling fiber should never have extra references because of exceptions. |
| CILK_ASSERT(0 == ref_count); |
| w->l->scheduling_fiber = NULL; |
| } |
| |
| #if CILK_PROFILE |
| if (w->l->stats) { |
| __cilkrts_free(w->l->stats); |
| } |
| #else |
| CILK_ASSERT(NULL == w->l->stats); |
| #endif |
| |
| /* Free any cached fibers. */ |
| cilk_fiber_pool_destroy(&w->l->fiber_pool); |
| |
| __cilkrts_destroy_worker_sysdep(w); |
| |
| if (w->l->signal_node) { |
| CILK_ASSERT(WORKER_SYSTEM == w->l->type); |
| signal_node_destroy(w->l->signal_node); |
| } |
| |
| __cilkrts_free(w->l->ltq); |
| __cilkrts_mutex_destroy(0, &w->l->lock); |
| __cilkrts_mutex_destroy(0, &w->l->steal_lock); |
| __cilkrts_frame_malloc_per_worker_cleanup(w); |
| |
| __cilkrts_free(w->l); |
| |
| // The caller is responsible for freeing the worker memory |
| } |
| |
| /* |
| * Make a worker into a system worker. |
| */ |
| static void make_worker_system(__cilkrts_worker *w) { |
| CILK_ASSERT(WORKER_FREE == w->l->type); |
| w->l->type = WORKER_SYSTEM; |
| w->l->signal_node = signal_node_create(); |
| } |
| |
| void __cilkrts_deinit_internal(global_state_t *g) |
| { |
| int i; |
| __cilkrts_worker *w; |
| |
| // If there's no global state then we're done |
| if (NULL == g) |
| return; |
| |
| #ifdef CILK_PROFILE |
| __cilkrts_dump_stats_to_stderr(g); |
| #endif |
| |
| w = g->workers[0]; |
| if (w->l->frame_ff) { |
| __cilkrts_destroy_full_frame(w, w->l->frame_ff); |
| w->l->frame_ff = 0; |
| } |
| |
| // Release any resources used for record/replay |
| replay_term(g); |
| |
| // Destroy any system dependent global state |
| __cilkrts_destroy_global_sysdep(g); |
| |
| for (i = 0; i < g->total_workers; ++i) |
| destroy_worker(g->workers[i]); |
| |
| // Free memory for all worker blocks which were allocated contiguously |
| __cilkrts_free(g->workers[0]); |
| |
| __cilkrts_free(g->workers); |
| |
| cilk_fiber_pool_destroy(&g->fiber_pool); |
| __cilkrts_frame_malloc_global_cleanup(g); |
| |
| cilkg_deinit_global_state(); |
| } |
| |
| /* |
| * Wake the runtime by notifying the system workers that they can steal. The |
| * first user worker into the runtime should call this. |
| */ |
| static void wake_runtime(global_state_t *g) |
| { |
| __cilkrts_worker *root; |
| if (g->P > 1) { |
| // Send a message to the root node. The message will propagate. |
| root = g->workers[0]; |
| CILK_ASSERT(root->l->signal_node); |
| signal_node_msg(root->l->signal_node, 1); |
| } |
| } |
| |
| /* |
| * Put the runtime to sleep. The last user worker out of the runtime should |
| * call this. Like Dad always said, turn out the lights when nobody's in the |
| * room. |
| */ |
| static void sleep_runtime(global_state_t *g) |
| { |
| __cilkrts_worker *root; |
| if (g->P > 1) { |
| // Send a message to the root node. The message will propagate. |
| root = g->workers[0]; |
| CILK_ASSERT(root->l->signal_node); |
| signal_node_msg(root->l->signal_node, 0); |
| } |
| } |
| |
| /* Called when a user thread joins Cilk. |
| Global lock must be held. */ |
| void __cilkrts_enter_cilk(global_state_t *g) |
| { |
| if (g->Q++ == 0) { |
| // If this is the first user thread to enter Cilk wake |
| // up all the workers. |
| wake_runtime(g); |
| } |
| } |
| |
| /* Called when a user thread leaves Cilk. |
| Global lock must be held. */ |
| void __cilkrts_leave_cilk(global_state_t *g) |
| { |
| if (--g->Q == 0) { |
| // Put the runtime to sleep. |
| sleep_runtime(g); |
| } |
| } |
| |
| /* |
| * worker_runnable |
| * |
 * Return SCHEDULE_RUN if the worker should continue to try to steal,
 * SCHEDULE_WAIT if it should wait, or SCHEDULE_EXIT if the computation is done.
| */ |
| |
| NOINLINE |
| static enum schedule_t worker_runnable(__cilkrts_worker *w) |
| { |
| global_state_t *g = w->g; |
| |
| /* If this worker has something to do, do it. |
| Otherwise the work would be lost. */ |
| if (w->l->next_frame_ff) |
| return SCHEDULE_RUN; |
| |
    // If the user has explicitly told Cilk to exit (i.e., via
    // __cilkrts_end_cilk() -> __cilkrts_stop_workers(g)), then exit the scheduler.
| if (g->work_done) |
| return SCHEDULE_EXIT; |
| |
| if (0 == w->self) { |
| // This worker is the root node and is the only one that may query the |
| // global state to see if there are still any user workers in Cilk. |
| if (w->l->steal_failure_count > g->max_steal_failures) { |
| if (signal_node_should_wait(w->l->signal_node)) { |
| return SCHEDULE_WAIT; |
| } else { |
| // Reset the steal_failure_count since we have verified that |
| // user workers are still in Cilk. |
| w->l->steal_failure_count = 0; |
| } |
| } |
| } else if (WORKER_SYSTEM == w->l->type && |
| signal_node_should_wait(w->l->signal_node)) { |
| // This worker has been notified by its parent that it should stop |
| // trying to steal. |
| return SCHEDULE_WAIT; |
| } |
| |
| return SCHEDULE_RUN; |
| } |
| |
| |
| |
| // Initialize the worker structs, but don't start the workers themselves. |
| static void init_workers(global_state_t *g) |
| { |
| int total_workers = g->total_workers; |
| int i; |
| struct CILK_ALIGNAS(256) buffered_worker { |
| __cilkrts_worker w; |
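        // Padding after each worker struct; presumably this, together with
        // the 256-byte alignment, keeps adjacent workers off shared cache lines.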
| char buf[64]; |
| } *workers_memory; |
| |
| /* not needed if only one worker */ |
| cilk_fiber_pool_init(&g->fiber_pool, |
| NULL, |
| g->stack_size, |
| g->global_fiber_pool_size, // buffer_size |
| g->max_stacks, // maximum # to allocate |
| 1); |
| |
| cilk_fiber_pool_set_fiber_limit(&g->fiber_pool, |
| (g->max_stacks ? g->max_stacks : INT_MAX)); |
| |
| g->workers = (__cilkrts_worker **) |
| __cilkrts_malloc(total_workers * sizeof(*g->workers)); |
| |
| // Allocate 1 block of memory for workers to make life easier for tools |
| // like Inspector which run multithreaded and need to know the memory |
| // range for all the workers that will be accessed in a user's program |
| workers_memory = (struct buffered_worker*) |
| __cilkrts_malloc(sizeof(*workers_memory) * total_workers); |
| |
| // Notify any tools that care (Cilkscreen and Inspector) that they should |
| // ignore memory allocated for the workers |
| __cilkrts_cilkscreen_ignore_block(&workers_memory[0], |
| &workers_memory[total_workers]); |
| |
| // Initialize worker structs, including unused worker slots. |
| for (i = 0; i < total_workers; ++i) { |
| g->workers[i] = make_worker(g, i, &workers_memory[i].w); |
| } |
| |
    // Set the workers in the first g->system_workers slots to be system workers.
    // Remaining worker structs keep the WORKER_FREE type set in make_worker().
| for (i = 0; i < g->system_workers; ++i) { |
| make_worker_system(g->workers[i]); |
| } |
| } |
| |
| void __cilkrts_init_internal(int start) |
| { |
| global_state_t *g = NULL; |
| |
| if (cilkg_is_published()) { |
| g = cilkg_init_global_state(); |
| } |
| else { |
| |
| // We think the state has not been published yet. |
| // Grab the lock and try to initialize/publish. |
| global_os_mutex_lock(); |
| |
| if (cilkg_is_published()) { |
| // Some other thread must have snuck in and published. |
| g = cilkg_init_global_state(); |
| } |
| else { |
| // Initialize and retrieve global state |
| g = cilkg_init_global_state(); |
| |
| // Set the scheduler pointer |
| g->scheduler = worker_scheduler_function; |
| |
| // If we're running under a sequential P-Tool (Cilkscreen or |
| // Cilkview) then there's only one worker and we need to tell |
| // the tool about the extent of the stack |
| if (g->under_ptool) |
| __cilkrts_establish_c_stack(); |
| init_workers(g); |
| |
            // Initialize per-worker record/replay logging
| replay_init_workers(g); |
| |
| // Initialize any system dependent global state |
| __cilkrts_init_global_sysdep(g); |
| |
| |
| cilkg_publish_global_state(g); |
| } |
| |
| global_os_mutex_unlock(); |
| } |
| |
| CILK_ASSERT(g); |
| |
| if (start && !g->workers_running) |
| { |
| // Acquire the global OS mutex while we're starting the workers |
| global_os_mutex_lock(); |
| if (!g->workers_running) |
| // Start P - 1 system workers since P includes the first user |
| // worker. |
| __cilkrts_start_workers(g, g->P - 1); |
| global_os_mutex_unlock(); |
| } |
| } |
| |
| |
| /************************************************************************ |
| Methods for reducer protocol. |
| |
| Reductions occur in two places: |
| A. A full frame "ff" is returning from a spawn with a stolen parent. |
| B. A full frame "ff" is stalling at a sync. |
| |
| To support parallel reductions, reduction functions need to be |
| executed while control is on a user stack, before jumping into the |
| runtime. These reductions can not occur while holding a worker or |
| frame lock. |
| |
| Before a worker w executes a reduction in either Case A or B, w's |
| deque is empty. |
| |
| Since parallel reductions push work onto the deque, we must do extra |
| work to set up runtime data structures properly before reductions |
| begin to allow stealing. ( Normally, when we have only serial |
| reductions, once a worker w starts a reduction, its deque remains |
| empty until w either steals another frame or resumes a suspended |
| frame. Thus, we don't care about the state of the deque, since w |
| will reset its deque when setting up execution of a frame. ) |
| |
| To allow for parallel reductions, we coerce the runtime data |
| structures so that, from their perspective, it looks as though we |
| have spliced in an "execute_reductions()" function. Consider the |
| two cases for reductions: |
| |
| Case A: Return from a spawn with a stolen parent. |
     Consider a spawned function g that is returning on a worker w.
| Assume: |
| - g was spawned from a parent function f. |
| - ff is the full frame for g's spawn helper |
| - sf be the __cilkrts_stack_frame for g's spawn helper. |
| |
| We are conceptually splicing "execute_reductions()" so that it |
| occurs immediately before the spawn helper of g returns to f. |
| |
| We do so by creating two different world views --- one for the |
| runtime data structures, and one for the actual control flow. |
| |
| - Before reductions begin, the runtime data structures should |
| look as though the spawn helper of g is calling |
| "execute_reductions()", in terms of both the user stack and |
| worker deque. More precisely, w should satisfy the |
| following properties: |
| |
| (a) w has ff as its full frame, |
| (b) w has sf as its __cilkrts_stack_frame, and |
| (c) w has an empty deque. |
| |
| If the runtime satisfies these properties, then if w |
| encounters a spawn in a parallel reduction, it can push onto |
       a valid deque.  Also, when w is stolen from, the thief will
       build the correct tree of full frames.
| |
| - In actual control flow, however, once the |
| "execute_reductions()" function returns, it is actually |
| returning to runtime code instead of g's spawn helper. |
| |
| At the point a worker w began executing reductions, the |
| control flow / compiled code had already finished g's spawn |
| helper, and w was about to enter the runtime. With parallel |
| reductions, some worker v (which might be different from w) |
| is the one returning to the runtime. |
| |
| |
| The reduction logic consists of 4 steps: |
| |
| A1. Restore runtime data structures to make it look as though |
| the spawn helper of g() is still the currently executing |
| frame for w. |
| |
| A2. Execute reductions on the user stack. Reductions also |
        include the logic for exceptions and stacks.  Note that
| reductions start on w, but may finish on a different |
| worker if there is parallelism in the reduce. |
| |
| A3. Splice out ff from the tree of full frames. |
| |
| A4. Jump into the runtime/scheduling stack and execute |
| "do_return_from_spawn". This method |
| |
| (a) Frees the user stack we were just on if it is no longer needed. |
        (b) Decrements the join counter on ff->parent, and tries to do a
| provably good steal. |
        (c) Cleans up the full frame ff.
| |
| |
| Case B: Stalling at a sync. |
| |
| Consider a function g(), with full frame ff and |
| __cilkrts_stack_frame sf. Suppose g() stalls at a sync, and we |
| are executing reductions. |
| |
| Conceptually, we are splicing in an "execute_reductions()" |
| function into g() as the last action that g() takes immediately |
| before it executes the cilk_sync. |
| |
| The reduction logic for this case is similar to Case A. |
| |
| B1. Restore the runtime data structures. |
| |
| The main difference from Case A is that ff/sf is still a |
| frame that needs to be executed later (since it is stalling |
| at a cilk_sync). Thus, we also need to save the current |
| stack information into "ff" so that we can correctly resume |
| execution of "ff" after the sync. |
| |
| B2. Execute reductions on the user stack. |
| |
| B3. No frame to splice out of the tree. |
| |
| B4. Jump into the runtime/scheduling stack and execute "do_sync". |
| This method: |
| (a) Frees the user stack we were just on if it is no longer needed. |
| (b) Tries to execute a provably good steal. |
| |
| Finally, for the reducer protocol, we consider two reduction paths, |
| namely a "fast" and "slow" path. On a fast path, only trivial |
| merges of reducer maps happen (i.e., one or both of the maps are |
| NULL). Otherwise, on the slow path, a reduction actually needs to |
| happen. |
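
   (For example, if a returning child never touched a reducer, its
    reducer map is NULL, so merging it into its left sibling is essentially
    a pointer hand-off and we stay on the fast path.  Only when both maps
    are non-NULL must the reducers' reduce operations actually run, which
    is the slow path.)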
| |
| *****************************************************************/ |
| |
| /** |
| * @brief Locations to store the result of a reduction. |
| * |
| * Struct storing pointers to the fields in our "left" sibling that we |
| * should update when splicing out a full frame or stalling at a sync. |
| */ |
| typedef struct { |
| /** A pointer to the location of our left reducer map. */ |
| struct cilkred_map **map_ptr; |
| |
| /** A pointer to the location of our left exception. */ |
| struct pending_exception_info **exception_ptr; |
| } splice_left_ptrs; |
| |
| /** |
| * For a full frame returning from a spawn, calculate the pointers to |
| * the maps and exceptions to my left. |
| * |
| * @param w The currently executing worker. |
| * @param ff Full frame that is dying |
| * @return Pointers to our "left" for reducers and exceptions. |
| */ |
| static inline |
| splice_left_ptrs compute_left_ptrs_for_spawn_return(__cilkrts_worker *w, |
| full_frame *ff) |
| { |
| // ASSERT: we hold the lock on ff->parent |
| |
| splice_left_ptrs left_ptrs; |
| if (ff->left_sibling) { |
| left_ptrs.map_ptr = &ff->left_sibling->right_reducer_map; |
| left_ptrs.exception_ptr = &ff->left_sibling->right_pending_exception; |
| } |
| else { |
| full_frame *parent_ff = ff->parent; |
| left_ptrs.map_ptr = &parent_ff->children_reducer_map; |
| left_ptrs.exception_ptr = &parent_ff->child_pending_exception; |
| } |
| return left_ptrs; |
| } |
| |
| /** |
| * For a full frame at a sync, calculate the pointers to the maps and |
| * exceptions to my left. |
| * |
| * @param w The currently executing worker. |
| * @param ff Full frame that is stalling at a sync. |
| * @return Pointers to our "left" for reducers and exceptions. |
| */ |
| static inline |
| splice_left_ptrs compute_left_ptrs_for_sync(__cilkrts_worker *w, |
| full_frame *ff) |
| { |
| // ASSERT: we hold the lock on ff |
| splice_left_ptrs left_ptrs; |
| |
| // Figure out which map to the left we should merge into. |
| if (ff->rightmost_child) { |
| CILK_ASSERT(ff->rightmost_child->parent == ff); |
| left_ptrs.map_ptr = &(ff->rightmost_child->right_reducer_map); |
| left_ptrs.exception_ptr = &(ff->rightmost_child->right_pending_exception); |
| } |
| else { |
| // We have no children. Then, we should be the last |
| // worker at the sync... "left" is our child map. |
| left_ptrs.map_ptr = &(ff->children_reducer_map); |
| left_ptrs.exception_ptr = &(ff->child_pending_exception); |
| } |
| return left_ptrs; |
|