/* cilk-abi-cilk-for.cpp -*-C++-*-
*
*************************************************************************
*
* Copyright (C) 2011-2016, Intel Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* *********************************************************************
*
* PLEASE NOTE: This file is a downstream copy of a file maintained in
* a repository at cilkplus.org. Changes made to this file that are not
* submitted through the contribution process detailed at
* http://www.cilkplus.org/submit-cilk-contribution will be lost the next
* time that a new version is released. Changes only submitted to the
* GNU Compiler Collection or posted to the git repository at
* https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are
* not tracked.
*
* We welcome your contributions to this open source project. Thank you
* for your assistance in helping us improve Cilk Plus.
*
**************************************************************************/
/* Implementation of cilk_for ABI.
*
* This file must be C++, not C, in order to handle C++ exceptions correctly
* from within the body of the cilk_for loop
*/
#include "internal/abi.h"
#include "metacall_impl.h"
#include "global_state.h"
// Icky macros to determine if we're compiled with optimization. Based on
// the declaration of __CILKRTS_ASSERT in common.h
#if defined(_WIN32)
# if defined (_DEBUG)
# define CILKRTS_OPTIMIZED 0 // Assumes /MDd is always used with /Od
# else
# define CILKRTS_OPTIMIZED 1
# endif // defined(_DEBUG)
#else
# if defined(__OPTIMIZE__)
# define CILKRTS_OPTIMIZED 1
# else
# define CILKRTS_OPTIMIZED 0
# endif
#endif
template <typename count_t>
static inline int grainsize(int req, count_t count)
{
// A positive requested grain size comes from the user. A very high grain
// size risks losing parallelism, but the user told us what they want for
// grainsize. Who are we to argue?
if (req > 0)
return req;
// At present, a negative requested grain size is treated the same way as
// a zero grain size, i.e., the runtime computes the actual grainsize
// using a heuristic. In the future, the compiler may give us additional
// information about the size of the cilk_for body by passing a negative
// grain size.
// Avoid generating a zero grainsize, even for empty loops.
if (count < 1)
return 1;
global_state_t* g = cilkg_get_global_state();
if (g->under_ptool)
{
// Use a grainsize of 1 when running under PIN and the grainsize has
// not been explicitly set by the user.
return 1;
}
else
{
// Divide loop count by 8 times the worker count and round up.
const int Px8 = g->P * 8;
count_t n = (count + Px8 - 1) / Px8;
// 2K should be enough to amortize the cost of the cilk_for. Any
// larger grainsize risks losing parallelism.
if (n > 2048)
return 2048;
return (int) n; // n <= 2048, so no loss of precision on cast to int
}
}
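/* Worked example of the heuristic above, assuming a hypothetical machine
 * with g->P == 8 workers (so Px8 == 64) and no user-requested grainsize:
 *
 *   count == 10,000     ->  n == (10000 + 63) / 64   == 157    -> grainsize 157
 *   count == 1,000,000  ->  n == (1000000 + 63) / 64 == 15625  -> capped at 2048
 *
 * A positive requested grain size (req > 0) bypasses the heuristic entirely.
 */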
/*
* call_cilk_for_loop_body
*
* Centralizes the code to call the loop body. The compiler is expected to
* inline this code.
*
* low - Low loop index we're considering in this portion of the algorithm
* high - High loop index we're considering in this portion of the algorithm
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* w - __cilkrts_worker we're currently executing on
* loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
* the cilk_for loop to flatten out the internal nodes
*/
template <typename count_t, typename F>
inline static
void call_cilk_for_loop_body(count_t low, count_t high,
F body, void *data,
__cilkrts_worker *w,
__cilkrts_pedigree *loop_root_pedigree)
{
// Cilkscreen should not report this call in a stack trace
NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
// The worker is only valid until the first spawn. Fetch the
// __cilkrts_stack_frame out of the worker, since it will be stable across
// steals. The sf pointer actually points to the *parent's*
// __cilkrts_stack_frame, since this function is a non-spawning function
// and therefore has no cilk stack frame of its own.
__cilkrts_stack_frame *sf = w->current_stack_frame;
// Save the pedigree node pointed to by the worker. We'll need to restore
// that when we exit since the spawn helpers in the cilk_for call tree
// will assume that it's valid
const __cilkrts_pedigree *saved_next_pedigree_node = w->pedigree.parent;
// Add the leaf pedigree node to the chain. The parent is the root node
// to flatten the tree regardless of the DAG branches in the cilk_for
// divide-and-conquer recursion.
//
// The rank is initialized to the low index. The user is
// expected to call __cilkrts_bump_loop_rank at the end of the cilk_for
// loop body.
__cilkrts_pedigree loop_leaf_pedigree;
loop_leaf_pedigree.rank = (uint64_t)low;
loop_leaf_pedigree.parent = loop_root_pedigree;
// The worker's pedigree always starts with a rank of 0
w->pedigree.rank = 0;
w->pedigree.parent = &loop_leaf_pedigree;
// Call the compiler generated cilk_for loop body lambda function
body(data, low, high);
// The loop body may have included spawns, so we must refetch the worker
// from the __cilkrts_stack_frame, which is stable regardless of which
// worker we're executing on.
w = sf->worker;
// Restore the pedigree chain. It must be valid because the spawn helpers
// generated by the cilk_for implementation will access it.
w->pedigree.parent = saved_next_pedigree_node;
}
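/* Illustrative sketch (not part of the runtime) of the kind of
 * compiler-generated loop body function that 'body(data, low, high)' above
 * invokes. The signature matches __cilk_abi_f32_t; the 'captured' struct and
 * its fields are hypothetical.
 *
 *   struct captured { int *array; };
 *
 *   static void loop_body(void *data, cilk32_t low, cilk32_t high)
 *   {
 *       struct captured *c = (struct captured *)data;
 *       for (cilk32_t i = low; i < high; ++i) {
 *           c->array[i] = 2 * (int)i;        // original loop body
 *           __cilkrts_bump_loop_rank();      // advance the leaf pedigree rank
 *       }
 *   }
 */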
/* capture_spawn_arg_stack_frame
*
* Efficiently get the address of the caller's __cilkrts_stack_frame. The
* preconditions are that 'w' is the worker at the time of the call and
* 'w->current_stack_frame' points to the __cilkrts_stack_frame within the
* spawn helper. This function should be called only within the argument list
* of a function that is being spawned because that is the only situation in
* which these preconditions hold. This function returns the worker
* (unchanged) after the captured stack frame pointer has been stored in the
* 'sf' argument.
*
* The purpose of this function is to get the caller's stack frame in a
* context where the caller's worker is known but its stack frame is not
* necessarily initialized. The "shrink wrap" optimization delays
* initializing the contents of a spawning function's '__cilkrts_stack_frame'
* as well as the 'current_stack_frame' pointer within the worker. By calling
* this function within a spawning function's argument list, we can ensure
* that these initializations have occurred but that a detach (which would
* invalidate the worker pointer in the caller) has not yet occurred. Once the
* '__cilkrts_stack_frame' has been retrieved in this way, it is stable for the
* remainder of the caller's execution, and becomes an efficient way to get
* the worker (much more efficient than calling '__cilkrts_get_tls_worker()'),
* even after a spawn or sync.
*/
inline __cilkrts_worker*
capture_spawn_arg_stack_frame(__cilkrts_stack_frame* &sf, __cilkrts_worker* w)
{
// Get current stack frame
sf = w->current_stack_frame;
#ifdef __INTEL_COMPILER
# if __INTEL_COMPILER <= 1300 && __INTEL_COMPILER_BUILD_DATE < 20130101
// In older compilers 'w->current_stack_frame' points to the
// spawn-helper's stack frame. In newer compilers, however, it points
// directly to the caller's stack frame. (This change was made to avoid
// having the spawn helper in the frame list when evaluating function
// arguments, thus avoiding corruption when those arguments themselves
// contain cilk_spawns.)
// w->current_stack_frame is the spawn helper's stack frame.
// w->current_stack_frame->call_parent is the caller's stack frame.
sf = sf->call_parent;
# endif
#endif
return w;
}
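/* Typical use (as in cilk_for_recursive() below): call this function inside
 * the argument list of the spawned function so that 'sf' is captured after
 * the caller's frame is initialized but before the detach:
 *
 *   __cilkrts_stack_frame *sf;
 *   _Cilk_spawn some_func(..., capture_spawn_arg_stack_frame(sf, w), ...);
 *   w = sf->worker;    // 'sf' is stable; refetch the worker through it
 *
 * 'some_func' is a placeholder for any spawned function taking a worker.
 */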
/*
* cilk_for_recursive
*
* Templatized function to implement the recursive divide-and-conquer
* algorithm that underlies a cilk_for.
*
* low - Low loop index we're considering in this portion of the algorithm
* high - High loop index we're considering in this portion of the algorithm
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* grain - grain size (0 if it should be computed)
* w - __cilkrts_worker we're currently executing on
* loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
* the cilk_for loop to flatten out the internal nodes
*/
template <typename count_t, typename F>
static
void cilk_for_recursive(count_t low, count_t high,
F body, void *data, int grain,
__cilkrts_worker *w,
__cilkrts_pedigree *loop_root_pedigree)
{
tail_recurse:
// Cilkscreen should not report this call in a stack trace
// This needs to be done every time the worker resumes
NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
count_t count = high - low;
// Invariant: count > 0, grain >= 1
if (count > grain)
{
// Invariant: count >= 2
count_t mid = low + count / 2;
// The worker is valid only until the first spawn and is expensive to
// retrieve (using '__cilkrts_get_tls_worker') after the spawn. The
// '__cilkrts_stack_frame' is more stable, but isn't initialized until
// the first spawn. Thus, we want to grab the address of the
// '__cilkrts_stack_frame' after it is initialized but before the
// spawn detaches. The only place we can do that is within the
// argument list of the spawned function, hence the call to
// capture_spawn_arg_stack_frame().
__cilkrts_stack_frame *sf;
#if defined(__GNUC__) && ! defined(__INTEL_COMPILER) && ! defined(__clang__)
// The current version of gcc initializes the sf structure eagerly.
// We can take advantage of this fact to avoid calling
// `capture_spawn_arg_stack_frame` when compiling with gcc.
// Remove this if the "shrink-wrap" optimization is implemented.
sf = w->current_stack_frame;
_Cilk_spawn cilk_for_recursive(low, mid, body, data, grain, w,
loop_root_pedigree);
#else
_Cilk_spawn cilk_for_recursive(low, mid, body, data, grain,
capture_spawn_arg_stack_frame(sf, w),
loop_root_pedigree);
#endif
w = sf->worker;
low = mid;
goto tail_recurse;
}
// Call the cilk_for loop body lambda function passed in by the compiler to
// execute one grain
call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree);
}
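/* Worked example of the recursion above: with low == 0, high == 100 and
 * grain == 16, the first call spawns cilk_for_recursive(0, 50, ...) and then
 * tail-recurses on [50, 100); the spawned half in turn spawns [0, 25) and
 * tail-recurses on [25, 50), and so on until each leaf range holds at most
 * 16 iterations, each of which is handed to call_cilk_for_loop_body() as a
 * single grain.
 */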
static void noop() { }
/*
* cilk_for_root
*
* Templatized function to implement the top level of a cilk_for loop.
*
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* count - trip count for loop
* grain - grain size (0 if it should be computed)
*/
template <typename count_t, typename F>
static void cilk_for_root(F body, void *data, count_t count, int grain)
{
// Cilkscreen should not report this call in a stack trace
NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
// Pedigree computation:
//
// If the last pedigree node on entry to the _Cilk_for has value X,
// then at the start of each iteration of the loop body, the value of
// the last pedigree node should be 0, the value of the second-to-last
// node should equal the loop counter, and the value of the
// third-to-last node should be X. On return from the _Cilk_for, the
// value of the last pedigree node should be incremented to X+2. The
// pedigree within the loop is thus flattened, such that the depth of
// recursion does not affect the results either inside or outside of
// the loop. Note that the pedigree after the loop exits is the same
// as if a single spawn and sync were executed within this function.
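//
// For example, if the last pedigree node has value X == 5 on entry, then
// iteration i of the loop body begins with a pedigree ending in
// <..., 5, i, 0>, and after the _Cilk_for completes the pedigree ends in
// <..., 7>.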
// TBD: Since the shrink-wrap optimization was turned on in the compiler,
// it is not possible to get the current stack frame without actually
// forcing a call to bind-thread. This spurious spawn is a temporary
// stopgap until the correct intrinsics are added to give us total control
// over frame initialization.
_Cilk_spawn noop();
// Fetch the current worker. From that we can get the current stack frame
// which will be constant even if we're stolen
__cilkrts_worker *w = __cilkrts_get_tls_worker();
__cilkrts_stack_frame *sf = w->current_stack_frame;
// Decrement the rank by one to undo the pedigree change from the
// _Cilk_spawn
--w->pedigree.rank;
// Save the current worker pedigree into loop_root_pedigree, which will be
// the root node for our flattened pedigree.
__cilkrts_pedigree loop_root_pedigree = w->pedigree;
// Don't splice the loop_root node in yet. It will be done when we
// call the loop body lambda function
// w->pedigree.rank = 0;
// w->pedigree.next = &loop_root_pedigree;
/* Spawn is necessary at top-level to force runtime to start up.
* Runtime must be started in order to call the grainsize() function.
*/
int gs = grainsize(grain, count);
cilk_for_recursive((count_t) 0, count, body, data, gs, w,
&loop_root_pedigree);
// Need to refetch the worker after calling a spawning function.
w = sf->worker;
// Restore the pedigree in the worker.
w->pedigree = loop_root_pedigree;
// Bump the worker pedigree.
++w->pedigree.rank;
// Implicit sync will increment the pedigree leaf rank again, for a total
// of two increments. If the noop spawn above is removed, then we'll need
// to re-enable the following code:
// // If this is an optimized build, then the compiler will have optimized
// // out the increment of the worker's pedigree in the implied sync. We
// // need to add one to make the pedigree_loop test work correctly.
// #if CILKRTS_OPTIMIZED
// ++sf->worker->pedigree.rank;
// #endif
}
// Use extern "C" to suppress name mangling of __cilkrts_cilk_for_32 and
// __cilkrts_cilk_for_64.
extern "C" {
/*
* __cilkrts_cilk_for_32
*
* Implementation of cilk_for for 32-bit trip counts (regardless of processor
* word size). Assumes that the range is 0 to count.
*
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* count - trip count for loop
* grain - grain size (0 if it should be computed)
*/
CILK_ABI_THROWS_VOID __cilkrts_cilk_for_32(__cilk_abi_f32_t body, void *data,
cilk32_t count, int grain)
{
// Cilkscreen should not report this call in a stack trace
NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
// Check for an empty range here as an optimization - don't need to do any
// __cilkrts_stack_frame initialization
if (count > 0)
cilk_for_root(body, data, count, grain);
}
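/* Illustrative sketch (not part of the runtime) of the lowering a compiler
 * might perform for a loop such as
 *
 *   _Cilk_for (int i = 0; i < n; ++i)
 *       a[i] = i;
 *
 * The body is outlined into a __cilk_abi_f32_t function, and the loop itself
 * becomes, roughly (names are hypothetical):
 *
 *   struct ctx { int *a; };
 *   struct ctx data = { a };
 *   __cilkrts_cilk_for_32(outlined_body, &data, (cilk32_t)n, 0);
 *
 * where outlined_body(&data, low, high) runs iterations [low, high) and the
 * final argument 0 asks the runtime to compute the grainsize.
 */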
/*
* __cilkrts_cilk_for_64
*
* Implementation of cilk_for for 64-bit trip counts (regardless of processor
* word size). Assumes that the range is 0 to count.
*
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* count - trip count for loop
* grain - grain size (0 if it should be computed)
*/
CILK_ABI_THROWS_VOID __cilkrts_cilk_for_64(__cilk_abi_f64_t body, void *data,
cilk64_t count, int grain)
{
// Cilkscreen should not report this call in a stack trace
NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
// Check for an empty range here as an optimization - don't need to do any
// __cilkrts_stack_frame initialization
if (count > 0)
cilk_for_root(body, data, count, grain);
}
} // end extern "C"
/* End cilk-abi-cilk-for.cpp */