| /* cilk-abi-vla.cpp -*-C++-*- |
| * |
| ************************************************************************* |
| * |
| * Copyright (C) 2013-2016, Intel Corporation |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in |
| * the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of Intel Corporation nor the names of its |
| * contributors may be used to endorse or promote products derived |
| * from this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS |
| * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
| * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY |
| * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| * |
| * ********************************************************************* |
| * |
| * PLEASE NOTE: This file is a downstream copy of a file mainitained in |
| * a repository at cilkplus.org. Changes made to this file that are not |
| * submitted through the contribution process detailed at |
| * http://www.cilkplus.org/submit-cilk-contribution will be lost the next |
| * time that a new version is released. Changes only submitted to the |
| * GNU compiler collection or posted to the git repository at |
| * https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are |
| * not tracked. |
| * |
| * We welcome your contributions to this open source project. Thank you |
| * for your assistance in helping us improve Cilk Plus. |
| * |
| **************************************************************************/ |
| |
| /* |
| * Implementation of Variable Length Array (VLA) ABI. |
| * |
| * __cilkrts_stack_alloc() and __cilkrts_stack_free must be compiled |
| * such that ebp/rbp is used for the stack frames. This is done by having |
| * each of them use alloca, which forces the special frame types needed on |
| * each of the ABIs. Additionally, for some forms of stack frame, special |
| * care must be taken because the alloca space may not be at the bottom of the |
| * stack frame of the caller. For Intel64 windows, and for some options |
| * with other ABIs, a preallocated parameter block may exist on the stack |
| * at a lower address than the alloca. If this is the case, the parameter |
| * distance_from_sp_to_alloca_area will be non-zero, and will indicate how |
| * much pre-allocated parameter space resides in the caller's stack frame |
| * between the alloca area, and the bottom of the stack when the call to |
| * the cilkrts is made. As such, when non-zero it also includes any space |
| * used for passing the cilkrts_stack_alloc or cilkrts_stack_free parameters. |
| */ |
| |
| #include <assert.h> |
| #include <stdlib.h> |
| #include <stdint.h> |
| #include "declare-alloca.h" |
| |
| #ifdef _WIN32 |
| # define INLINE static __inline |
| # pragma warning(disable:1025) // Don't whine about zero extending result of unary operation |
| #else |
| # define INLINE static inline |
| #endif |
| |
| |
| #include "internal/abi.h" |
| #include "cilk-abi-vla-internal.h" |
| |
| #ifdef __INTEL_COMPILER |
| // These functions are used only within __INTEL_COMPILER. |
| #if defined(__x86_64) || defined(_M_X64) |
| INLINE void setsp(void *val) |
| { |
| __asm__("movq %0, %%rsp" : : "r"(val): "rsp"); |
| } |
| INLINE char* getsp(void) |
| { |
| void *res; |
| |
| __asm__("movq %%rsp, %0" : "=r"(res): : "rsp"); |
| return res; |
| } |
| INLINE char* getbp(void) |
| { |
| void *res; |
| |
| __asm__("movq %%rbp, %0" : "=r"(res): : "rbp"); |
| return res; |
| } |
| INLINE void copy_frame_down_and_move_bp( |
| char *dst, |
| char *src, |
| size_t cpy_bytes, |
| char *new_ebp |
| ) |
| { |
| // In this version, dst is guaranteed to be lower address than src, |
| // therefore copying upwards from src into dst is safe in case |
| // there is overlap. The number of bytes is also guaranteed to be |
| // a multiple of 8, and the copy is done in 64 bit word chunks for |
| // best efficiency. |
| __asm__( |
| "movq %0, %%rdi;" |
| "movq %1, %%rsi;" |
| "movq %2, %%rcx;" |
| "shrq $3, %%rcx;" |
| "rep movsq;" |
| "movq %3, %%rbp" : |
| : |
| "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) : |
| "rsi", "rdi", "rcx", "rbp", "memory"); |
| } |
| INLINE void copy_frame_up_and_move_bp( |
| char *dst, |
| char *src, |
| size_t cpy_bytes, |
| char *new_ebp |
| ) |
| { |
| // In this version, dst is guaranteed to be higher address than src, |
| // therefore copying downwards from src into dst is safe in case |
| // there is overlap. The number of bytes is also guaranteed to be |
| // a multiple of 8, and the copy is done in 64 bit word chunks for |
| // best efficiency. |
| dst += cpy_bytes - 8; |
| src += cpy_bytes - 8; |
| __asm__( |
| "movq %0, %%rdi;" |
| "movq %1, %%rsi;" |
| "movq %2, %%rcx;" |
| "shrq $3, %%rcx;" |
| "std; rep movsq; cld;" |
| "movq %3, %%rbp;" : |
| : |
| "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) : |
| "rsi", "rdi", "rcx", "rbp", "memory"); |
| } |
| #else |
| INLINE void setsp(void *val) |
| { |
| __asm__("movl %0, %%esp" : : "r"(val): "esp"); |
| } |
| INLINE char* getsp(void) |
| { |
| void *res; |
| |
| __asm__("movl %%esp, %0" : "=r"(res): : "esp"); |
| return res; |
| } |
| INLINE char* getbp(void) |
| { |
| void *res; |
| |
| __asm__("movl %%ebp, %0" : "=r"(res): : "ebp"); |
| return res; |
| } |
| INLINE void copy_frame_down_and_move_bp( |
| char *dst, |
| char *src, |
| size_t cpy_bytes, |
| char *new_ebp |
| ) |
| { |
| // In this version, dst is guaranteed to be lower address than src, |
| // therefore copying upwards from src into dst is safe in case |
| // there is overlap. The number of bytes is also guaranteed to be |
| // a multiple of 4, and the copy is done in 32 bit word chunks for |
| // best efficiency. |
| __asm__( |
| "movl %0, %%edi;" |
| "movl %1, %%esi;" |
| "movl %2, %%ecx;" |
| "shrl $2, %%ecx;" |
| "rep movsd;" |
| "movl %3, %%ebp" : |
| : |
| "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) : |
| "esi", "edi", "ecx", "ebp", "memory"); |
| } |
| INLINE void copy_frame_up_and_move_bp( |
| char *dst, |
| char *src, |
| size_t cpy_bytes, |
| char *new_ebp |
| ) |
| { |
| // In this version, dst is guaranteed to be higher address than src, |
| // therefore copying downwards from src into dst is safe in case |
| // there is overlap. The number of bytes is also guaranteed to be |
| // a multiple of 4, and the copy is done in 32 bit word chunks for |
| // best efficiency. |
| dst += cpy_bytes - 4; |
| src += cpy_bytes - 4; |
| __asm__( |
| "movl %0, %%edi;" |
| "movl %1, %%esi;" |
| "movl %2, %%ecx;" |
| "shrl $2, %%ecx;" |
| "std; rep movsd; cld;" |
| "movl %3, %%ebp" : |
| // "=D"(dst), "=S"(src), "=C"(cpy_bytes) : |
| : |
| "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) : |
| "esi", "edi", "ecx", "ebp", "memory"); |
| } |
| #endif |
| |
| #endif // __INTEL_COMPILER |
| |
| #define c_cilk_ptr_from_heap 0xc2f2f00d |
| #define c_cilk_ptr_from_stack 0xc3f30d0f |
| |
| CILK_ABI(__cilkrts_void_ptr) |
| __cilkrts_stack_alloc( |
| __cilkrts_stack_frame *sf, |
| size_t size, |
| size_t distance_from_sp_to_alloca_area, |
| uint32_t align, // align is always >= minimum stack alignment and |
| // >= ptr_size as well, and must be a power of 2. |
| uint32_t needs_tag // non-zero if the pointer being returned needs to |
| // be tagged |
| ) |
| { |
| // full_size will be a multiple of align, and contains |
| // enough extra space to allocate a marker. |
| size_t full_size = (size + align - 1) & ~(align - 1); |
| |
| #ifndef __INTEL_COMPILER |
| // Allocate memory from the heap. The compiler is responsible |
| // for guaranteeing us a chance to free it before the function |
| // exits |
| return (void *)vla_internal_heap_alloc(sf, full_size, align); |
| |
| #else |
| if (needs_tag) { |
| full_size += align; |
| } |
| |
| char *t; |
| if (sf->worker != 0 && |
| ((sf->flags & CILK_FRAME_UNSYNCHED) != 0)) { |
| t = vla_internal_heap_alloc(sf, full_size, align); |
| if (needs_tag) { |
| t += align; |
| ((uint32_t*)t)[-1] = c_cilk_ptr_from_heap; |
| } |
| return (void *)t; |
| } |
| |
| // stack is still synced, allocate full_size from esp, |
| // and record in 32 bits immediately below the space |
| // allocated that this was space that this was |
| // allocated in the stack. |
| char *old_ebp = getbp(); |
| char *old_esp = getsp(); |
| |
| // make top_ptr point to base of first parameter. |
| char *top_ptr = ((char *)(_AddressOfReturnAddress()) + |
| sizeof(char *)); |
| size_t param_size = 0; |
| |
| #if defined(__x86_64) |
| // For Intel64 linux & MACH ABI, all the parameters were passed in |
| // register, so top of the stack frame above the return address |
| // is just the size of the return address plus |
| // distance_from_sp_to_alloca_area on the chance that the alloca |
| // area isn't at the very bottom of the calling functions stack. |
| #elif defined(__MACH__) |
| // For ia32 MACH, parameter size is always a mutliple of 16 |
| // bytes to keep the stack 16 byte aligned. So we need to round |
| // number of parameters up to multiple of 4. |
| param_size = 8 * sizeof(char *); |
| #else |
| // For both windows Intel64 ABI, and the IA32 windows and |
| // linux ABIs, space is reserved on the stack for all these |
| // parameters. param_size is 5 * size of a stack slot. |
| param_size = 5 * sizeof(char *); |
| #endif |
| |
| // now make top_ptr point above the params, or if |
| // distance_from_sp_to_alloca_area is not zero, make |
| // it point above that area. When non-zero, |
| // distance_from_sp_to_alloca area is expected to contain |
| // the parameter space, so we only add one or the other, |
| // not both. |
| top_ptr += (distance_from_sp_to_alloca_area != 0) ? |
| distance_from_sp_to_alloca_area : param_size; |
| |
| // t needs to end up at current value of top_ptr less full_size and less |
| // distance_from_sp_to_alloca_area and |
| // then rounded down to the alignment needed. Then we have to bump |
| // esp down by current frame_size, so that when all is done with respect |
| // to executing the return sequence, the final value of esp will be the |
| // same value as t. |
| t = (top_ptr - full_size) - distance_from_sp_to_alloca_area; |
| intptr_t temp = (intptr_t)t; |
| temp &= ~((intptr_t)(align - 1)); |
| t = (char *)temp; |
| |
| // ok, the value of t is set where we need it. Now set esp |
| // to the value of t less the current frame size. |
| // So now when we do regular return esp should be left such |
| // that it has moved down by full_size. |
| size_t cur_fm_size = (top_ptr - old_esp); |
| char *new_esp = t - cur_fm_size; |
| char *new_ebp = old_ebp - (old_esp - new_esp); |
| |
| // extend the stack down by at least the difference between where |
| // I want it to be and where it currently is. This should take care |
| // of touching any pages necessary. |
| char *foo = alloca(old_esp - new_esp); |
| setsp(foo < new_esp ? foo : new_esp); |
| |
| // Now set esp exactly where I want it. |
| // setsp(new_esp); |
| |
| copy_frame_down_and_move_bp(new_esp, old_esp, cur_fm_size, new_ebp); |
| |
| if (needs_tag) { |
| t += align; |
| ((uint32_t*)t)[-1] = c_cilk_ptr_from_stack; |
| } |
| |
| return t; |
| #endif // __INTEL_COMPILER |
| } |
| |
| // This frees the space allocated for a variable length array. |
| CILK_ABI(void) |
| __cilkrts_stack_free( |
| __cilkrts_stack_frame *sf, |
| void *p, |
| size_t size, |
| size_t distance_from_sp_to_alloca_area, |
| uint32_t align, // same requirements as for align in allocation, |
| // and must match alignment that was passed when |
| // doing the allocation |
| uint32_t known_from_stack // non-zero if this is known to be allocated |
| // on the stack, and therefore has no tag |
| ) |
| { |
| // full_size will be a multiple of align, and contains |
| // enough extra space to allocate a marker if one was needed. |
| size_t full_size = (size + align - 1) & ~(align - 1); |
| |
| #ifndef __INTEL_COMPILER |
| // Just free the allocated memory to the heap since we don't know |
| // how to expand/contract the calling frame |
| vla_internal_heap_free(p, full_size); |
| |
| #else |
| uint32_t *t = (uint32_t*)p; |
| if (known_from_stack == 0) { |
| // if the compiler hasn't told the run-time that this is |
| // known to be on the stack, then this pointer must have been |
| // tagged such that the run-time can tell. |
| assert(t[-1] == c_cilk_ptr_from_stack || |
| t[-1] == c_cilk_ptr_from_heap); |
| |
| known_from_stack = t[-1] == c_cilk_ptr_from_stack; |
| full_size += align; // accounts for extra space for marker |
| t = (uint32_t *)(((char *)t) - align); |
| } |
| |
| if (known_from_stack) { |
| // alloca useage forces an ebp/rbp based stack frame even though |
| // 0 and unused. |
| char *foo = alloca(0); |
| if (sf->worker == 0 || (sf->flags & CILK_FRAME_UNSYNCHED) == 0) { |
| // p was allocated from current stack frame and we |
| // are synced on current stack frame. Return the |
| // amount of the stack that needs to be freed. |
| char *old_ebp = getbp(); |
| char *old_esp = getsp(); |
| |
| // make top_ptr point to base of first parameter. |
| char *top_ptr = ((char *)(_AddressOfReturnAddress()) + |
| sizeof(char *)); |
| size_t param_size = 0; |
| |
| #if defined(__x86_64) |
| // For Intel64 linux & MACH ABI, all the parameters were passed in |
| // register, so top of the stack frame above the return address |
| // is just the size of the return address plus |
| // distance_from_sp_to_alloca_area on the chance that the alloca |
| // area isn't at the very bottom of the calling functions stack. |
| #elif defined(__MACH__) |
| // For ia32 MACH, parameter size is always a mutliple of 16 |
| // bytes to keep the stack 16 byte aligned. So we need to round |
| // number of parameters up to multiple of 4. |
| param_size = 8 * sizeof(char *); |
| #else |
| // For both windows Intel64 ABI, and the IA32 windows and |
| // linux ABIs, space is reserved on the stack for all these |
| // parameters. param_size is 5 * size of a stack slot. |
| param_size = 6 * sizeof(char *); |
| #endif |
| |
| // now make top_ptr point above the params, or if |
| // distance_from_sp_to_alloca_area is not zero, make |
| // it point above that area. When non-zero, |
| // distance_from_sp_to_alloca area is expected to contain |
| // the parameter space, so we only add one or the other, |
| // not both. |
| top_ptr += (distance_from_sp_to_alloca_area != 0) ? |
| distance_from_sp_to_alloca_area : param_size; |
| |
| size_t cur_fm_size = (top_ptr - old_esp); |
| char *new_esp = old_esp + full_size; |
| char *new_ebp = old_ebp + full_size; |
| |
| copy_frame_up_and_move_bp(new_esp, old_esp, cur_fm_size, new_ebp); |
| setsp(new_esp); |
| } |
| else { |
| // p was allocated on stack frame, but that is |
| // no longer the current stack frame. Need to adjust the |
| // saved esp that is somewhere in the cilk runtime so that |
| // on sync, esp will be cut back correctly. |
| vla_free_from_original_stack(sf, full_size); |
| } |
| } |
| else { |
| vla_internal_heap_free(t, full_size); |
| } |
| #endif // __INTEL_COMPILER |
| } |