| /* Code locality based function cloning. |
| Copyright The GNU Toolchain Authors |
| |
| This file is part of GCC. |
| |
| GCC is free software; you can redistribute it and/or modify it under |
| the terms of the GNU General Public License as published by the Free |
| Software Foundation; either version 3, or (at your option) any later |
| version. |
| |
| GCC is distributed in the hope that it will be useful, but WITHOUT ANY |
| WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| /* This file implements cloning required to improve partitioning of the |
| callgraph for locality considerations. |
| |
| Partitioning for improving code locality. |
| This pass aims to place frequently executed callchains closer together in |
| memory to improve performance through improved locality. If any frequent |
| callchains cannot be placed together because they are already placed |
| elsewhere, local function clones are created and all callers near to the |
| clones are redirected to use this copy. |
| |
| Locality code placement is done in 2 parts. |
| 1. IPA pass to be executed after ipa-inline and before ipa-pure-const. |
| Execute stage prepares the plan to place all nodes into partitions. |
| 2. WPA Partition stage actually implements the plan. |
| |
| Brief overview of the IPA pass: |
| 1. Create and sort callchains. If PGO is available, use real profile |
| counts. Otherwise, use a set of heuristics to sort the callchains. |
| 2. Create a partition plan for the callchains, processing them in the sorted |
| order. |
| 1. If a function is unpartitioned, place it in the current partition. |
| 2. If a function is already placed in a partition away from current |
| partition as part of another callchain: |
| Create a local clone in current partition, if cloning criteria is |
| satisfied. |
| 3. Redirect any new caller to a local clone if one exists. |
| |
| Another heuristics can be used in the absense of profile information. This |
| approach uses C++ template instantiation types to group functions together. |
| Entry functions are sorted in the beginning by most used template types, and |
| callees are sorted as part of partition_callchain (), again by template |
| types. |
| |
| For bothe schemes, partition size is param controlled to fine tune per |
| program behavior. */ |
| |
| #include "config.h" |
| #define INCLUDE_ALGORITHM |
| #include "system.h" |
| #include "coretypes.h" |
| #include "target.h" |
| #include "function.h" |
| #include "tree.h" |
| #include "alloc-pool.h" |
| #include "tree-pass.h" |
| #include "cgraph.h" |
| #include "symbol-summary.h" |
| #include "tree-vrp.h" |
| #include "symtab-thunks.h" |
| #include "sreal.h" |
| #include "ipa-cp.h" |
| #include "ipa-prop.h" |
| #include "ipa-fnsummary.h" |
| #include "ipa-modref-tree.h" |
| #include "ipa-modref.h" |
| #include "symtab-clones.h" |
| #include "demangle.h" |
| #include "ipa-locality-cloning.h" |
| |
| /* Locality partitions, assigns nodes to partitions. These are used later in |
| WPA partitioning. */ |
| vec<locality_partition> locality_partitions; |
| |
| using loc_map_hash = int_hash<int, 0, -1>; |
| |
| /* Map from original node to its latest clone. Gets overwritten whenever a new |
| clone is created from the same node. */ |
| static std::unique_ptr<hash_map<loc_map_hash, cgraph_node *>> node_to_clone; |
| /* Map from clone to its original node. */ |
| static std::unique_ptr<hash_map<loc_map_hash, cgraph_node *>> clone_to_node; |
| |
| /* Data structure to hold static heuristics and orders for cgraph_nodes. */ |
| struct locality_order |
| { |
| cgraph_node *node; |
| sreal order; |
| locality_order (cgraph_node *node, sreal order) : node (node), order (order) |
| {} |
| }; |
| |
| /* Data structure to hold information about unique template instantiations. */ |
| struct templ_info |
| { |
| hashval_t templ_hash; |
| int count; |
| int order; |
| }; |
| |
| /* Data structure to hold accumulated edge frequencies for unique callees. |
| Frequencies of aliased callees are accumulated with the ultimate target |
| alias represented by ultimate_callee. |
| ultimate_caller is the ultimate non-inlined caller of the unique callee. |
| Edge is the first edge encountered for the callee. */ |
| struct locality_callee_info |
| { |
| cgraph_node *ultimate_caller; |
| cgraph_node *ultimate_callee; |
| cgraph_edge *edge; |
| sreal freq; |
| }; |
| |
| /* Data structure to hold precomputed callchain information. */ |
| struct locality_info |
| { |
| cgraph_node *node; |
| |
| /* Consolidated callees, including callees of inlined nodes. */ |
| vec<locality_callee_info *> all_callees; |
| |
| /* Accumulated caller->node edge frequencies for unique callers. */ |
| hash_map<loc_map_hash, sreal> caller_freq; |
| /* Accumulated node->callee edge frequencies for unique callees. */ |
| hash_map<loc_map_hash, locality_callee_info> callee_info; |
| |
| /* Template instantiation info. */ |
| hashval_t templ_hash; |
| |
| locality_info () |
| { |
| all_callees.create (1); |
| templ_hash = 0; |
| } |
| ~locality_info () |
| { |
| all_callees.release (); |
| } |
| }; |
| |
| /* Template instantiation hash_map. */ |
| typedef hash_map<int_hash<hashval_t, 0>, templ_info> hash_freq_map_t; |
| static std::unique_ptr<hash_freq_map_t> templ_hash_map; |
| |
| /* Pool allocation for locality_info. */ |
| static object_allocator<locality_info> loc_infos ("IPA locality callchain"); |
| static |
| std::unique_ptr<hash_map<loc_map_hash, locality_info *>> node_to_ch_info; |
| |
| /* Return locality_info for NODE if present, otherwise return NULL. */ |
| static inline locality_info * |
| get_locality_info (cgraph_node *node) |
| { |
| locality_info **ninfo = node_to_ch_info->get (node->get_uid ()); |
| if (ninfo) |
| return *ninfo; |
| return NULL; |
| } |
| |
| /* Forward declaration. */ |
| static int compare_node_uids (cgraph_node *n1, cgraph_node *n2); |
| |
| /* Helper function to qsort all_callees in locality_info by edge hotness. */ |
| static int |
| callee_default_cmp (const void *pa, const void *pb) |
| { |
| const locality_callee_info *a = *(static_cast<const locality_callee_info |
| *const *> (pa)); |
| const locality_callee_info *b = *(static_cast<const locality_callee_info |
| *const *> (pb)); |
| if (a->freq < b->freq) |
| return 1; |
| else if (a->freq > b->freq) |
| return -1; |
| return compare_node_uids (a->ultimate_callee, b->ultimate_callee); |
| } |
| |
| /* Sort all_callees of NODE by template hashes and edge hotness. |
| |
| all_callees is already sorted by call frequency. If templ_hash of the |
| caller is same as a callee's templ_hash then that callee is ordered first. |
| Callees having same templ_hash are sorted by call frequency, callees with |
| different templ_hash are sorted by the template order. |
| |
| if (Both PA and PB have templates && PA's template == PB'template) || |
| (Both A and B don't have templates): sort by frequency |
| |
| if (PA doesn't have template && PB has) || (Both have different templates && |
| caller's template == PB's template): PB before PA |
| |
| if (PA has template && PB doesn't) || (Both have different templates && |
| caller's template == PA's template): PA before PB |
| |
| if (Both have different templates && both are different than caller's |
| template: sort by template order. */ |
| static int |
| callee_templ_cmp (const void *pa, const void *pb) |
| { |
| const locality_callee_info *a = *static_cast<const locality_callee_info |
| *const *> (pa); |
| const locality_callee_info *b = *static_cast<const locality_callee_info |
| *const *> (pb); |
| |
| locality_info *caller_info = get_locality_info (a->ultimate_caller); |
| hashval_t caller_hash = 0; |
| if (caller_info) |
| caller_hash = caller_info->templ_hash; |
| |
| locality_info *ainfo = get_locality_info (a->edge->callee); |
| locality_info *binfo = get_locality_info (b->edge->callee); |
| templ_info *atinfo = NULL, *btinfo = NULL; |
| hashval_t a_hash = 0, b_hash = 0; |
| if (ainfo) |
| { |
| a_hash = ainfo->templ_hash; |
| atinfo = templ_hash_map->get (a_hash); |
| } |
| if (binfo) |
| { |
| b_hash = binfo->templ_hash; |
| btinfo = templ_hash_map->get (b_hash); |
| } |
| |
| if (a_hash == b_hash) |
| { |
| /* We get here if both have same template type or both have 0 as hash. */ |
| return callee_default_cmp (pa, pb); |
| } |
| else if (a_hash && (!b_hash || a_hash == caller_hash)) |
| return -1; |
| else if (b_hash && (!a_hash || b_hash == caller_hash)) |
| return 1; |
| else |
| { |
| gcc_assert (atinfo && btinfo); |
| if (atinfo->order < btinfo->order) |
| return -1; |
| /* Order being same is already handled above. */ |
| return 1; |
| } |
| return callee_default_cmp (pa, pb); |
| } |
| |
| /* Sort all_callees in INFO by edge hotness. */ |
| static void |
| sort_all_callees_default (locality_info *info) |
| { |
| hash_map<loc_map_hash, locality_callee_info>::iterator iter |
| = info->callee_info.begin (); |
| for (; iter != info->callee_info.end (); ++iter) |
| info->all_callees.safe_push (&((*iter).second)); |
| info->all_callees.qsort (callee_default_cmp); |
| } |
| |
| /* Populate locality_info for NODE from its direct callees and callees via |
| inlined nodes. N is used to iterate callees of NODE and callees of inlined |
| callees of NODE. */ |
| static void |
| populate_callee_locality_info (cgraph_node *node, cgraph_node *n, |
| locality_info *info) |
| { |
| for (cgraph_edge *e = n->callees; e; e = e->next_callee) |
| { |
| cgraph_node *c = e->callee->ultimate_alias_target (); |
| if (c->inlined_to == node) |
| populate_callee_locality_info (node, c, info); |
| else |
| { |
| bool existed; |
| locality_callee_info &cinfo = info->callee_info.get_or_insert |
| (c->get_uid (), &existed); |
| if (!existed) |
| { |
| cinfo.ultimate_callee = c; |
| cinfo.ultimate_caller = node; |
| cinfo.edge = e; |
| cinfo.freq = e->sreal_frequency (); |
| } |
| else |
| cinfo.freq = cinfo.freq + e->sreal_frequency (); |
| } |
| } |
| } |
| |
| /* Forward declaration. */ |
| static int static_profile_cmp (const void *pa, const void *pb); |
| |
| /* Helper function for qsort; sort nodes in the descending order derived of |
| template instantiation types by frequency count. */ |
| static int |
| static_profile_templ_cmp (const void *pa, const void *pb) |
| { |
| const locality_order *a = *static_cast<const locality_order *const *> (pa); |
| const locality_order *b = *static_cast<const locality_order *const *> (pb); |
| locality_info *ainfo = get_locality_info (a->node); |
| templ_info *atinfo = templ_hash_map->get (ainfo->templ_hash); |
| locality_info *binfo = get_locality_info (b->node); |
| templ_info *btinfo = templ_hash_map->get (binfo->templ_hash); |
| if ((atinfo && !btinfo) |
| || (atinfo && btinfo && atinfo->order < btinfo->order)) |
| return -1; |
| else if ((btinfo && !atinfo) |
| || (atinfo && btinfo && atinfo->order > btinfo->order)) |
| return 1; |
| return static_profile_cmp (pa, pb); |
| } |
| |
| /* Helper function for qsort; sort template instantiation types in descending |
| order of their frequency count. */ |
| static int |
| sort_templ_hashes_cmp (const void *pa, const void *pb) |
| { |
| const std::pair<hashval_t, int> *a = static_cast<const std::pair<hashval_t, |
| int> *> (pa); |
| const std::pair<hashval_t, int> *b = static_cast<const std::pair<hashval_t, |
| int> *> (pb); |
| if (a->second < b->second) |
| return 1; |
| else if (a->second > b->second) |
| return -1; |
| else |
| { |
| long long int res = (long long int)a->first - (long long int)b->first; |
| /* Hashes from templ_hash_map cannot be equal. */ |
| gcc_assert (res != 0); |
| return res > 0 ? 1 : -1; |
| } |
| } |
| |
| /* Sort template instantiation types and record the sorted order. */ |
| static void |
| order_templ_hashes () |
| { |
| auto_vec<std::pair<hashval_t, int>> templ_hashes; |
| hash_freq_map_t::iterator iter = templ_hash_map->begin (); |
| for (; iter != templ_hash_map->end (); ++iter) |
| templ_hashes.safe_push (std::make_pair ((*iter).first, |
| (*iter).second.count)); |
| templ_hashes.qsort (sort_templ_hashes_cmp); |
| for (unsigned i = 0; i < templ_hashes.length (); i++) |
| { |
| templ_info *tinfo = templ_hash_map->get (templ_hashes[i].first); |
| gcc_assert (tinfo); |
| tinfo->order = i; |
| } |
| } |
| |
| /* Populate locality_info for NODE from its direct callers. */ |
| static void |
| populate_caller_locality_info (cgraph_node *node, locality_info *info) |
| { |
| struct cgraph_edge *e; |
| for (e = node->callers; e; e = e->next_caller) |
| { |
| /* Make a local decision about all edges for EDGE->caller but not the |
| other nodes already in the partition. Their edges will be visited |
| later or may have been visited before and not fit the |
| cut-off criteria. */ |
| if (auto cfreq = info->caller_freq.get (e->caller->get_uid ())) |
| (*cfreq) = (*cfreq) + e->sreal_frequency (); |
| else |
| info->caller_freq.put (e->caller->get_uid (), e->sreal_frequency ()); |
| } |
| } |
| |
| /* Return true if template component is found in the demangled function name in |
| DC. */ |
| static bool |
| locality_dc_template_p (struct demangle_component *dc) |
| { |
| switch (dc->type) |
| { |
| case DEMANGLE_COMPONENT_QUAL_NAME: |
| return locality_dc_template_p (dc->u.s_binary.left); |
| case DEMANGLE_COMPONENT_TEMPLATE: |
| return true; |
| default: |
| return false; |
| } |
| return false; |
| } |
| |
| /* Generate hash for the template type from NODE's name if NODE represents a |
| templated functions. If present, record in INFO. */ |
| |
| static void |
| populate_templ_info (cgraph_node *node, locality_info *info) |
| { |
| if (!info) |
| return; |
| void *alloc = NULL; |
| const char *asm_name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME |
| (node->decl)); |
| const char *demangled_name = cplus_demangle_v3 (asm_name, 0); |
| struct demangle_component *dc = |
| cplus_demangle_v3_components (asm_name, DMGL_NO_OPTS, &alloc); |
| |
| if (demangled_name && dc && locality_dc_template_p (dc)) |
| { |
| const char *templ = strchr (demangled_name, '<'); |
| hashval_t t_hash = htab_hash_string (templ); |
| info->templ_hash = t_hash; |
| |
| bool existed; |
| templ_info &tinfo = templ_hash_map->get_or_insert (t_hash, &existed); |
| if (existed) |
| tinfo.count = tinfo.count + 1; |
| else |
| tinfo.count = 1; |
| } |
| free (alloc); |
| } |
| |
| /* Initialize locality_info for node V. If CLONE_P is true, V is a locality |
| clone; populate callee information for locality clones because caller info |
| is needed for cloning decisions and clones are not cloned again. Populate |
| both caller and callee info for non-clone nodes. */ |
| |
| static inline void |
| create_locality_info (cgraph_node *v, bool templ_p = false, |
| bool clone_p = false) |
| { |
| locality_info **info = node_to_ch_info->get (v->get_uid ()); |
| gcc_assert (!info); |
| |
| locality_info *vinfo = loc_infos.allocate (); |
| vinfo->node = v; |
| node_to_ch_info->put (v->get_uid (), vinfo); |
| |
| /* Locality clones are not cloned again. */ |
| if (!clone_p) |
| populate_caller_locality_info (v, vinfo); |
| populate_callee_locality_info (v, v, vinfo); |
| sort_all_callees_default (vinfo); |
| if (templ_p) |
| populate_templ_info (v, vinfo); |
| } |
| |
| /* Return true if NODE is already in some partition. */ |
| static inline bool |
| node_partitioned_p (cgraph_node *node) |
| { |
| return node->aux; |
| } |
| |
| /* Add symbol NODE to partition PART. */ |
| static void |
| add_node_to_partition (locality_partition part, cgraph_node *node) |
| { |
| struct cgraph_edge *e; |
| if (node_partitioned_p (node)) |
| return; |
| |
| part->nodes.safe_push (node); |
| node->aux = (void *) (uintptr_t) (part->part_id); |
| |
| if (!node->alias && node->get_partitioning_class () == SYMBOL_PARTITION) |
| part->insns += ipa_size_summaries->get (node)->size; |
| |
| /* Add all inline clones and callees that are duplicated. */ |
| for (e = node->callees; e; e = e->next_callee) |
| if (!e->inline_failed) |
| add_node_to_partition (part, e->callee); |
| /* omp declare_variant_alt or transparent_alias with definition or linker |
| discardable (non-local comdat but not forced and not |
| used by non-LTO). */ |
| else if (e->callee->get_partitioning_class () == SYMBOL_DUPLICATE) |
| add_node_to_partition (part, e->callee); |
| |
| /* Add all thunks associated with the function. */ |
| for (e = node->callers; e; e = e->next_caller) |
| if (e->caller->thunk && !e->caller->inlined_to) |
| add_node_to_partition (part, e->caller); |
| |
| /* Add all aliases associated with the symbol. */ |
| struct ipa_ref *ref; |
| FOR_EACH_ALIAS (node, ref) |
| if (!ref->referring->transparent_alias) |
| { |
| cgraph_node *referring = dyn_cast<cgraph_node *> (ref->referring); |
| /* Only add function aliases. |
| Varpool refs are added later in LTO partitioning pass. */ |
| if (referring) |
| add_node_to_partition (part, referring); |
| } |
| else |
| { |
| struct ipa_ref *ref2; |
| /* We do not need to add transparent aliases if they are not used. |
| However we must add aliases of transparent aliases if they exist. */ |
| FOR_EACH_ALIAS (ref->referring, ref2) |
| { |
| /* Nested transparent aliases are not permitted. */ |
| gcc_checking_assert (!ref2->referring->transparent_alias); |
| cgraph_node *referring = dyn_cast<cgraph_node *> (ref2->referring); |
| if (referring) |
| add_node_to_partition (part, referring); |
| } |
| } |
| } |
| |
| /* Return TRUE if NODE is in PARTITION. */ |
| static bool |
| node_in_partition_p (locality_partition partition, cgraph_node *node) |
| { |
| return ((uintptr_t) (partition->part_id) == (uintptr_t) (node->aux)); |
| } |
| |
| /* Helper function for qsort; to break ties. */ |
| static int |
| compare_node_uids (cgraph_node *n1, cgraph_node *n2) |
| { |
| int res = n1->get_uid () - n2->get_uid (); |
| gcc_assert (res != 0); |
| return res > 0 ? 1 : -1; |
| } |
| |
| /* Helper function for qsort; sort nodes by order. */ |
| static int |
| static_profile_cmp (const void *pa, const void *pb) |
| { |
| const locality_order *a = *static_cast<const locality_order *const *> (pa); |
| const locality_order *b = *static_cast<const locality_order *const *> (pb); |
| /* Ascending order. */ |
| if (b->order < a->order) |
| return 1; |
| if (b->order > a->order) |
| return -1; |
| return compare_node_uids (a->node, b->node); |
| } |
| |
| /* Helper function for qsort; sort nodes by profile count. */ |
| static int |
| compare_edge_profile_counts (const void *pa, const void *pb) |
| { |
| const locality_order *a = *static_cast<const locality_order *const *> (pa); |
| const locality_order *b = *static_cast<const locality_order *const *> (pb); |
| |
| profile_count cnt1 = a->node->count.ipa (); |
| profile_count cnt2 = b->node->count.ipa (); |
| if (!cnt1.compatible_p (cnt2)) |
| return static_profile_cmp (pa, pb); |
| |
| if (cnt1 < cnt2) |
| return 1; |
| if (cnt1 > cnt2) |
| return -1; |
| return static_profile_cmp (pa, pb); |
| } |
| |
| /* Create and return a new partition and increment NPARTITIONS. */ |
| |
| static locality_partition |
| create_partition (int &npartitions) |
| { |
| locality_partition part = XCNEW (struct locality_partition_def); |
| npartitions++; |
| part->part_id = npartitions; |
| part->nodes.create (1); |
| part->insns = 0; |
| locality_partitions.safe_push (part); |
| return part; |
| } |
| |
| /* Structure for holding profile count information of callers of a node. */ |
| struct profile_stats |
| { |
| /* Sum of non-recursive call counts. */ |
| profile_count nonrec_count; |
| |
| /* Sum of recursive call counts. */ |
| profile_count rec_count; |
| |
| /* If non-NULL, this node is the target of alias or thunk and calls from this |
| should be count in rec_count. */ |
| cgraph_node *target; |
| }; |
| |
| /* Initialize fields of STATS. */ |
| static inline void |
| init_profile_stats (profile_stats *stats, cgraph_node *target = NULL) |
| { |
| stats->nonrec_count = profile_count::zero (); |
| stats->rec_count = profile_count::zero (); |
| stats->target = target; |
| } |
| |
| /* Helper function of to accumulate call counts. */ |
| static bool |
| accumulate_profile_counts_after_cloning (cgraph_node *node, void *data) |
| { |
| struct profile_stats *stats = (struct profile_stats *) data; |
| for (cgraph_edge *e = node->callers; e; e = e->next_caller) |
| { |
| if (!e->count.initialized_p ()) |
| continue; |
| |
| if (e->caller == stats->target) |
| stats->rec_count += e->count.ipa (); |
| else |
| stats->nonrec_count += e->count.ipa (); |
| } |
| return false; |
| } |
| |
| /* NEW_NODE is a previously created clone of ORIG_NODE already present in |
| current partition. EDGES contains newly redirected edges to NEW_NODE. |
| Adjust profile information for both nodes and the edge. */ |
| |
| static void |
| adjust_profile_info_for_non_self_rec_edges (auto_vec<cgraph_edge *> &edges, |
| cgraph_node *new_node, |
| cgraph_node *orig_node) |
| { |
| profile_count orig_node_count = orig_node->count.ipa (); |
| profile_count edge_count = profile_count::zero (); |
| profile_count final_new_count = profile_count::zero (); |
| profile_count final_orig_count = profile_count::zero (); |
| |
| for (unsigned i = 0; i < edges.length (); ++i) |
| if (edges[i]->count.initialized_p ()) |
| edge_count += edges[i]->count.ipa (); |
| |
| final_orig_count = orig_node_count - edge_count; |
| |
| /* NEW_NODE->count was adjusted for other callers when the clone was |
| first created. Just add the new edge count. */ |
| final_new_count = new_node->count + edge_count; |
| |
| final_new_count = orig_node_count.combine_with_ipa_count (final_new_count); |
| orig_node->count = final_orig_count; |
| new_node->count = final_new_count; |
| |
| if (dump_file) |
| { |
| fprintf (dump_file, "Adjusting profile information for %s\n", |
| new_node->dump_asm_name ()); |
| fprintf (dump_file, "\tOriginal node %s\n", orig_node->dump_asm_name ()); |
| fprintf (dump_file, "\tOriginal count: "); |
| orig_node_count.dump (dump_file); |
| fprintf (dump_file, "\n\tAdjusted original count to: "); |
| final_orig_count.dump (dump_file); |
| fprintf (dump_file, "\n\tAdjusted clone count to: "); |
| final_new_count.dump (dump_file); |
| fprintf (dump_file, "\n"); |
| } |
| |
| /* Scale all callee edges according to adjusted counts. */ |
| profile_count orig_node_count_copy = orig_node_count; |
| profile_count::adjust_for_ipa_scaling (&final_new_count, |
| &orig_node_count_copy); |
| for (cgraph_edge *cs = new_node->callees; cs; cs = cs->next_callee) |
| cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy); |
| for (cgraph_edge *cs = new_node->indirect_calls; cs; cs = cs->next_callee) |
| cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy); |
| |
| profile_count::adjust_for_ipa_scaling (&final_orig_count, &orig_node_count); |
| for (cgraph_edge *cs = orig_node->callees; cs; cs = cs->next_callee) |
| cs->count = cs->count.apply_scale (final_orig_count, orig_node_count); |
| for (cgraph_edge *cs = orig_node->indirect_calls; cs; cs = cs->next_callee) |
| cs->count = cs->count.apply_scale (final_orig_count, orig_node_count); |
| } |
| |
| /* Adjust profile counts of NEW_NODE and ORIG_NODE, where NEW_NODE is a clone |
| of OLD_NODE. |
| Assumes that all eligible edges from current partition so far are redirected |
| to NEW_NODE and recursive edges are adjusted. */ |
| |
| static void |
| adjust_profile_info (cgraph_node *new_node, cgraph_node *orig_node) |
| { |
| /* If all calls to NEW_NODE are non-recursive, subtract corresponding count |
| from ORIG_NODE and assign to NEW_NODE, any unexpected remainder stays with |
| ORIG_NODE. |
| Recursive calls if present, likely contribute to majority of count; |
| scale according to redirected callers' count. */ |
| |
| profile_count orig_node_count = orig_node->count.ipa (); |
| profile_stats new_stats, orig_stats; |
| |
| init_profile_stats (&new_stats); |
| init_profile_stats (&orig_stats); |
| |
| new_node->call_for_symbol_thunks_and_aliases |
| (accumulate_profile_counts_after_cloning, &new_stats, false); |
| orig_node->call_for_symbol_thunks_and_aliases |
| (accumulate_profile_counts_after_cloning, &orig_stats, false); |
| |
| profile_count orig_nonrec_count = orig_stats.nonrec_count; |
| profile_count orig_rec_count = orig_stats.rec_count; |
| profile_count new_nonrec_count = new_stats.nonrec_count; |
| profile_count new_rec_count = new_stats.rec_count; |
| |
| profile_count final_new_count = new_nonrec_count; |
| profile_count final_orig_count = profile_count::zero (); |
| |
| /* All calls to NEW_NODE are non-recursive or recursive calls have |
| zero count. */ |
| if (!new_rec_count.nonzero_p ()) |
| final_orig_count = orig_node_count - new_nonrec_count; |
| else |
| { |
| /* If ORIG_NODE is externally visible, indirect calls or calls from |
| another part of the code may contribute to the count. |
| update_profiling_info () from ipa-cp.cc pretends to have an extra |
| caller to represent the extra counts. */ |
| if (!orig_node->local) |
| { |
| profile_count pretend_count = (orig_node_count - new_nonrec_count - |
| orig_nonrec_count - orig_rec_count); |
| orig_nonrec_count += pretend_count; |
| } |
| |
| /* Remaining rec_count is assigned in proportion to clone's non-recursive |
| count. */ |
| profile_count rec_count = orig_node_count - new_nonrec_count |
| - orig_nonrec_count; |
| profile_count new_rec_scaled |
| = rec_count.apply_scale (new_nonrec_count, |
| new_nonrec_count + orig_nonrec_count); |
| final_new_count += new_rec_scaled; |
| final_orig_count = orig_node_count - final_new_count; |
| } |
| |
| final_new_count = orig_node_count.combine_with_ipa_count (final_new_count); |
| new_node->count = final_new_count; |
| orig_node->count = final_orig_count; |
| |
| if (dump_file) |
| { |
| fprintf (dump_file, "Adjusting profile information for %s\n", |
| new_node->dump_asm_name ()); |
| fprintf (dump_file, "\tOriginal node %s\n", orig_node->dump_asm_name ()); |
| fprintf (dump_file, "\tOriginal count: "); |
| orig_node_count.dump (dump_file); |
| fprintf (dump_file, "\n\tAdjusted original count to: "); |
| final_orig_count.dump (dump_file); |
| fprintf (dump_file, "\n\tAdjusted clone count to: "); |
| final_new_count.dump (dump_file); |
| fprintf (dump_file, "\n"); |
| } |
| |
| /* Scale all callee edges according to adjusted counts. */ |
| profile_count orig_node_count_copy = orig_node_count; |
| profile_count::adjust_for_ipa_scaling (&final_new_count, |
| &orig_node_count_copy); |
| for (cgraph_edge *cs = new_node->callees; cs; cs = cs->next_callee) |
| cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy); |
| for (cgraph_edge *cs = new_node->indirect_calls; cs; cs = cs->next_callee) |
| cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy); |
| |
| profile_count::adjust_for_ipa_scaling (&final_orig_count, &orig_node_count); |
| for (cgraph_edge *cs = orig_node->callees; cs; cs = cs->next_callee) |
| cs->count = cs->count.apply_scale (final_orig_count, orig_node_count); |
| for (cgraph_edge *cs = orig_node->indirect_calls; cs; cs = cs->next_callee) |
| cs->count = cs->count.apply_scale (final_orig_count, orig_node_count); |
| } |
| |
| /* Return true if EDGE can be safely redirected to another callee. */ |
| static inline bool |
| edge_redirectable_p (cgraph_edge *edge, lto_locality_cloning_model cm) |
| { |
| if (cm == LTO_LOCALITY_NON_INTERPOSABLE_CLONING) |
| { |
| /* Interposability may change on edge basis. */ |
| enum availability avail; |
| avail = edge->callee->get_availability (edge->caller); |
| if (avail <= AVAIL_INTERPOSABLE) |
| return false; |
| } |
| return true; |
| } |
| |
| /* Create a locality clone of CNODE and redirect all callers present in |
| PARTITION. |
| Create a clone dpending on whether CNODE itself is a clone or not. */ |
| |
| static cgraph_node * |
| create_locality_clone (cgraph_node *cnode, |
| locality_partition partition, int &cl_num, |
| lto_locality_cloning_model cm) |
| { |
| cgraph_node *cl_node = NULL; |
| vec<cgraph_edge *> redirect_callers = vNULL; |
| /* All callers of cnode in current partition are redirected. */ |
| struct cgraph_edge *edge; |
| for (edge = cnode->callers; edge; edge = edge->next_caller) |
| { |
| struct cgraph_node *caller = edge->caller; |
| if (node_in_partition_p (partition, caller) && caller->definition |
| && caller != cnode && edge_redirectable_p (edge, cm)) |
| redirect_callers.safe_push (edge); |
| } |
| |
| const char *suffix = "locality_clone"; |
| |
| tree old_decl = cnode->decl; |
| tree new_decl = copy_node (old_decl); |
| |
| /* Generate a new name for the new version. */ |
| const char *name = IDENTIFIER_POINTER (DECL_NAME (old_decl)); |
| DECL_NAME (new_decl) = clone_function_name (name, suffix, cl_num); |
| SET_DECL_ASSEMBLER_NAME (new_decl, |
| clone_function_name (old_decl, suffix, cl_num)); |
| cl_num++; |
| if (dump_file) |
| fprintf (dump_file, "\tNew name %s\n", |
| IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (new_decl))); |
| |
| cl_node = cnode->create_clone (new_decl, cnode->count /*profile_count*/, |
| false /*update_original*/, redirect_callers, |
| false /*call_duplication_hook*/, |
| NULL /*new_inlined_to*/, |
| NULL /*param_adjustments*/, suffix); |
| |
| set_new_clone_decl_and_node_flags (cl_node); |
| |
| if (cnode->ipa_transforms_to_apply.exists ()) |
| cl_node->ipa_transforms_to_apply |
| = cnode->ipa_transforms_to_apply.copy (); |
| |
| if (dump_file) |
| { |
| fprintf (dump_file, "Cloned Node: %s %s\n", cnode->dump_asm_name (), |
| cl_node->dump_asm_name ()); |
| |
| for (edge = cl_node->callers; edge; edge = edge->next_caller) |
| fprintf (dump_file, "Redirected callers: %s\n", |
| edge->caller->dump_asm_name ()); |
| |
| for (edge = cl_node->callees; edge; edge = edge->next_callee) |
| fprintf (dump_file, "Callees of clone: %s %d\n", |
| edge->callee->dump_asm_name (), edge->frequency ()); |
| } |
| return cl_node; |
| } |
| |
| /* Redirect recursive edges of CLONE to correctly point to CLONE. As part of |
| cloning process, all callee edges of a node are just duplicated but not |
| redirected. Therefore, these edges still call to original of CLONE. |
| |
| For non-inlined CLONEs, NEW_CALLEE == CLONE and ORIG_CALLEE is CLONE's |
| original node. |
| |
| For inlined node, self recursion to CLONE's original same as non-inlined, |
| additionally, calls to CLONE->inlined_to are also recursive: |
| NEW_CALLEE == CLONE->inlined_into and |
| ORIG_CALLEE == original node of CLONE->inlined_into. */ |
| |
| static void |
| adjust_recursive_callees (cgraph_node *clone, cgraph_node *new_callee, |
| cgraph_node *orig_callee) |
| { |
| cgraph_node *alias = NULL; |
| for (cgraph_edge *e = clone->callees; e; e = e->next_callee) |
| { |
| if (!e->inline_failed) |
| continue; |
| |
| /* Only self-cycle or local alias are handled. */ |
| cgraph_node *callee = e->callee; |
| if (callee == orig_callee) |
| { |
| cgraph_node **cl = node_to_clone->get (orig_callee->get_uid ()); |
| gcc_assert (cl && *cl == new_callee); |
| e->redirect_callee_duplicating_thunks (new_callee); |
| if (dump_file) |
| fprintf (dump_file, "recursive call from %s to %s orig %s\n", |
| e->caller->dump_asm_name (), e->callee->dump_asm_name (), |
| callee->dump_asm_name ()); |
| } |
| else if (callee->alias |
| && e->callee->ultimate_alias_target () == orig_callee) |
| { |
| if (!alias) |
| { |
| alias = dyn_cast<cgraph_node *> ( |
| new_callee->noninterposable_alias ()); |
| } |
| e->redirect_callee_duplicating_thunks (alias); |
| if (dump_file) |
| fprintf (dump_file, "recursive call from %s to %s orig %s\n", |
| e->caller->dump_asm_name (), e->callee->dump_asm_name (), |
| callee->dump_asm_name ()); |
| } |
| } |
| new_callee->expand_all_artificial_thunks (); |
| if (alias) |
| alias->expand_all_artificial_thunks (); |
| } |
| |
| /* Create clones for CALLER's inlined callees, ORIG_INLINED_TO is the original |
| node from clone_as_needed () such that new_inlined_to is a clone of it. */ |
| |
| static void |
| inline_clones (cgraph_node *caller, cgraph_node *orig_inlined_to) |
| { |
| struct cgraph_edge *edge; |
| for (edge = caller->callees; edge; edge = edge->next_callee) |
| { |
| struct cgraph_node *callee = edge->callee; |
| if (edge->inline_failed) |
| continue; |
| |
| if (callee->inlined_to != orig_inlined_to) |
| continue; |
| |
| struct cgraph_node *new_inlined_to, *cl; |
| if (caller->inlined_to) |
| new_inlined_to = caller->inlined_to; |
| else |
| new_inlined_to = caller; |
| |
| cl = callee->create_clone (callee->decl, |
| edge->count /*profile_count*/, |
| true /*update_original*/, |
| vNULL /*redirect_callers*/, |
| false /*call_duplication_hook*/, |
| new_inlined_to /*new_inlined_to*/, |
| NULL /*param_adjustments*/, |
| "locality_clone" /*suffix*/); |
| edge->redirect_callee (cl); |
| |
| node_to_clone->put (callee->get_uid (), cl); |
| clone_to_node->put (cl->get_uid (), callee); |
| |
| if (callee->thunk) |
| { |
| thunk_info *info = thunk_info::get (callee); |
| *thunk_info::get_create (cl) = *info; |
| } |
| |
| adjust_recursive_callees (cl, new_inlined_to, orig_inlined_to); |
| adjust_recursive_callees (cl, cl, callee); |
| if (dump_file) |
| { |
| fprintf (dump_file, "Inline cloned\n"); |
| cl->dump (dump_file); |
| } |
| |
| /* Recursively inline till end of this callchain. */ |
| inline_clones (cl, orig_inlined_to); |
| } |
| } |
| |
| /* Clone EDGE->CALLEE if it or a clone of it is not already in PARTITION. |
| Redirect all callers of EDGE->CALLEE that are in PARTITION, not just the |
| EDGE. If a clone is already present in PARTITION, redirect all edges from |
| EDGE->CALLER to EDGE->CALLEE. This is because we only visit one edge per |
| caller to callee and redirect for all others from there. |
| |
| If cloning, also recursively clone inlined functions till the end of the |
| callchain because inlined clones have 1-1 exclusive copy and edge from |
| caller to inlined node. |
| |
| There are 2 flows possible: |
| 1. Only redirect |
| 1.1. cnode is already in current partition - cnode mustn't be a |
| locality_clone -> nothing to do |
| 1.2. A clone of cnode is in current partition - find out if it's the |
| correct clone for edge - must be a locality_clone but the exact same |
| kind as callee i.e. orig or cp/sra clone, if yes, redirect, else go to #2 |
| 1.3. Cnode/a clone of cnode is in current partition but caller is inlined |
| 2. Clone and redirect |
| 2.1. cnode is original node |
| 2.2. cnode itself is a clone |
| Clone inlines |
| Flavors of edges: |
| 1. Normal -> orig nodes, locality clones or cp/sra clones |
| 2. Recursive -> direct recursion |
| 3. Alias -> recursion via aliasing or as a result of IPA code duplication |
| 4. Inline -> shouldn't be included in callchain. */ |
| |
| static cgraph_node * |
| clone_node_as_needed (cgraph_edge *edge, locality_partition partition, |
| int &cl_num, lto_locality_cloning_model cm) |
| { |
| /* suitable_for_locality_cloning_p () currently prohibits cloning aliases due |
| to potential versioning and materialization issues. Could be enabled in |
| the future. suitable_for_locality_cloning_p () also checks for |
| interposability for CNODE but not for edge redirection. */ |
| struct cgraph_node *cnode = edge->callee; |
| struct cgraph_node *caller = edge->caller; |
| |
| /* If clone of cnode is already in the partition |
| Get latest clone of cnode. If current partition has cloned cnode, that |
| clone should be returned. Otherwise, clone from previous partition is |
| returned |
| Original node and its clone shouldn't co-exist in current partition |
| |
| This is required if callee is partitioned via another edge before caller |
| was, and we are now visiting caller->callee edge |
| |
| 1) a -> b ==> a -> bc1; b was cloned say via d -> bc1, a is orig |
| 2) ac1 -> b ==> ac1 -> bc1; b was cloned and a was just cloned |
| 3) a -> bc1 and bc2 present, mustn't happen, b was cloned and a was |
| redirected without being partitioned first. |
| Why will we do this again - multiple edges and something's wrong in |
| partition_callchain () |
| 4) ac1 -> bc1 ==> ac1 -> bc2; a was cloned and we already got (1) in some |
| other partition |
| 5) ac1 -> bc1 but no clone present in this PARTITION. Create from b, not |
| from bc1? |
| 6) a -> b; a -> bc0; create new clone, no clone present |
| 7) ac0 -> b; ac0 -> bc0 same as (6) |
| 8) a -> bc0 and no clone present, mustn't happen, same as (3) |
| |
| Redirect when bc1 is present and: |
| a -> b or ac -> b or ac -> bc0 */ |
| |
| cgraph_node *orig_cnode = cnode; |
| cgraph_node **o_cnode = clone_to_node->get (cnode->get_uid ()); |
| if (o_cnode) |
| orig_cnode = *o_cnode; |
| |
| cgraph_node **cnode_cl = node_to_clone->get (orig_cnode->get_uid ()); |
| |
| if (cnode_cl && node_in_partition_p (partition, *cnode_cl)) |
| { |
| if (node_in_partition_p (partition, caller)) |
| { |
| bool clone_p = false; |
| auto_vec<cgraph_edge *> redirected_edges; |
| for (cgraph_edge *ec = caller->callees; ec; ec = ec->next_callee) |
| if (ec->callee == cnode && edge_redirectable_p (ec, cm)) |
| { |
| ec->redirect_callee_duplicating_thunks (*cnode_cl); |
| clone_p = true; |
| redirected_edges.safe_push (ec); |
| if (dump_file) |
| { |
| fprintf (dump_file, "clone present %s %s redirecting %s\n", |
| cnode->dump_asm_name (), |
| (*cnode_cl)->dump_asm_name (), |
| caller->dump_asm_name ()); |
| } |
| } |
| if (clone_p) |
| { |
| (*cnode_cl)->expand_all_artificial_thunks (); |
| adjust_profile_info_for_non_self_rec_edges (redirected_edges, |
| *cnode_cl, cnode); |
| return NULL; |
| } |
| } |
| } |
| |
| /* Create a new clone for a -> b, ac -> b. |
| For ac -> bc, should be done on bc or b? |
| bc could be from b_cp/b_sra or b. */ |
| |
| if (orig_cnode != cnode) |
| { |
| if (dump_file) |
| fprintf (dump_file, "Clone of clone %s %s\n", cnode->dump_asm_name (), |
| orig_cnode->dump_asm_name ()); |
| return NULL; |
| } |
| |
| struct cgraph_node *cloned_node |
| = create_locality_clone (cnode, partition, cl_num, cm); |
| |
| gcc_assert (cloned_node); |
| if (!cloned_node) |
| return NULL; |
| |
| node_to_clone->put (cnode->get_uid (), cloned_node); |
| clone_to_node->put (cloned_node->get_uid (), cnode); |
| |
| adjust_recursive_callees (cloned_node, cloned_node, cnode); |
| symtab->call_cgraph_duplication_hooks (cnode, cloned_node); |
| |
| adjust_profile_info (cloned_node, cnode); |
| /* Inline clones are created iff their inlined_to == CNODE. */ |
| inline_clones (cloned_node, cnode); |
| |
| return cloned_node; |
| } |
| |
| /* Determine if EDGE->CALLEE is suitable for cloning. It is assummed that the |
| callee is not an inlined node. */ |
| |
| static bool |
| suitable_for_locality_cloning_p (cgraph_edge *edge, |
| lto_locality_cloning_model cm) |
| { |
| cgraph_node *node = edge->callee; |
| if (!node->versionable) |
| return false; |
| |
| /* Out-of-line locality clones of ipcp or sra clones will be created in this |
| pass after IPA inline is run. A locality clone has the same function |
| body and the same updated signature as the ipcp/sra clone. |
| This fails or asserts based on how the clone is created: |
| 1. If param_adjustments and tree_map are not recorded for locality clone: |
| clone materialization (tree_function_versioning ()) fails when |
| updating signature and remapping calls because clone_of (ipcp/sra |
| clone) and locality clone differ in param information. |
| 2. If param_adjustments and tree_map are provided: asserts are triggered |
| in fnsummary duplication because IPA inline resets some summaries. |
| |
| One inelegant solution is to provide param_adjustments and tree_map, and |
| then set clone_of to ipcp/sra clone's clone_of. However, this sometimes |
| results in segmentation fault when the compiled program is run. |
| Disabling clone of clones altogether for now with an aim to resolve this |
| is future. */ |
| if (node->clone_of) |
| return false; |
| |
| if (node->alias) |
| return false; |
| |
| if (edge->recursive_p ()) |
| return false; |
| |
| if (!node->definition) |
| return false; |
| |
| /* Don't clone NODE if IPA count of NODE or EDGE is zero. */ |
| if (!node->count.ipa ().nonzero_p () || !edge->count.ipa ().nonzero_p ()) |
| return false; |
| |
| if (cm == LTO_LOCALITY_NON_INTERPOSABLE_CLONING) |
| { |
| /* Interposability may change on edge basis. */ |
| enum availability avail; |
| edge->callee->ultimate_alias_target (&avail, edge->caller); |
| if (avail <= AVAIL_INTERPOSABLE) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| /* Return true if edge->callee->ultimate_alias_target can be cloned. */ |
| static bool |
| clone_node_p (cgraph_edge *edge, lto_locality_cloning_model cloning_model, |
| double freq_cutoff, int size) |
| { |
| cgraph_node *node = edge->callee->ultimate_alias_target (); |
| |
| if (!suitable_for_locality_cloning_p (edge, cloning_model)) |
| return false; |
| |
| if (!node->alias) |
| if (ipa_size_summaries->get (node)->size >= size) |
| return false; |
| |
| if (freq_cutoff != 0.0) |
| { |
| locality_info *info = get_locality_info (node); |
| gcc_assert (info); |
| if (auto cfreq = info->caller_freq.get (edge->caller->get_uid ())) |
| { |
| if ((*cfreq).to_double () < freq_cutoff) |
| return false; |
| } |
| else if (edge->sreal_frequency ().to_double () < freq_cutoff) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| /* Partition NODE's callees into PARTITION or clone if already partitioned and |
| satisfies cloning criteria such as CLONING_MODEL, REAL_FREQ and SIZE |
| cut-offs. */ |
| |
| /* callgraph can have multiple caller to callee edges for multiple callsites |
| For the first such edge, we make decisions about cutoffs and cloning because |
| we redirect ALL callsites to cloned callee, not just one of them. */ |
| |
| static void |
| partition_callchain (cgraph_node *node, locality_partition &partition, |
| lto_locality_cloning_model cloning_model, |
| double freq_cutoff, int size, int &cl_num, |
| int &npartitions, int64_t partition_size, |
| lto_locality_heuristics scheme) |
| { |
| /* Aliases are added in the same partition as their targets. |
| Aliases are not cloned and their callees are not processed separately. */ |
| cgraph_node *cl_node = NULL; |
| if (partition->insns > partition_size) |
| partition = create_partition (npartitions); |
| |
| /* Iterate over all unique callees of NODE, direct callees and callees via |
| inlined nodes. This avoids calling partition_callchain () separately for |
| inlined nodes which themselves are already partitioned along with their |
| inlined_to nodes. */ |
| locality_info *info = get_locality_info (node); |
| if (scheme == LTO_LOCALITY_CPP_TEMPLATE) |
| info->all_callees.qsort (callee_templ_cmp); |
| for (unsigned i = 0; i < info->all_callees.length (); i++) |
| { |
| cgraph_edge *e = info->all_callees[i]->edge; |
| cgraph_node *n = e->callee->ultimate_alias_target (); |
| if (n->get_partitioning_class () == SYMBOL_PARTITION) |
| { |
| if (!node_partitioned_p (n)) |
| { |
| add_node_to_partition (partition, n); |
| if (dump_file) |
| fprintf (dump_file, "Partitioned node: %s\n", |
| n->dump_asm_name ()); |
| partition_callchain (n, partition, cloning_model, freq_cutoff, |
| size, cl_num, npartitions, partition_size, |
| scheme); |
| } |
| else if (cloning_model >= LTO_LOCALITY_NON_INTERPOSABLE_CLONING |
| && (!e->callee->alias) |
| && node_in_partition_p (partition, e->caller) |
| && (!node_in_partition_p (partition, n))) |
| |
| { |
| /* 3 possible scenarios if N is already partitioned but not in |
| present in PARTITION: |
| 1. There's a clone of N present in PARTITION, redirect to that |
| clone, no need to check for suitability. |
| 2. N itself is a locality clone, cloned as part of another |
| callchain. If a clone of N's original node is present in |
| PARTITION, redirect to it without checking for suitability. |
| Cloned node itself is not cloned again. |
| Example: suppose N = B_clone (). |
| In partition X, edge A->B was transformed to A->B_clone0. |
| In current partition, A was cloned to A_clone0 and now |
| B_clone0 is visited via edge A_clone0->B_clone0. If a |
| B_clonei is present, redirect A_clone0 to it, otherise do |
| nothing. |
| 3. N is not a locality clone and no clone of N is present in |
| PARTITION, check for suitability and clone. */ |
| cgraph_node *orig_cnode = n; |
| cgraph_node **o_cnode = clone_to_node->get (n->get_uid ()); |
| if (o_cnode) |
| orig_cnode = *o_cnode; |
| |
| cgraph_node **cnode_cl = node_to_clone->get (orig_cnode->get_uid |
| ()); |
| |
| if ((cnode_cl && node_in_partition_p (partition, *cnode_cl)) |
| || (orig_cnode == n |
| && clone_node_p (e, cloning_model, freq_cutoff, size))) |
| { |
| cl_node = clone_node_as_needed (e, partition, cl_num, |
| cloning_model); |
| if (cl_node) |
| { |
| create_locality_info (cl_node, |
| scheme == LTO_LOCALITY_CPP_TEMPLATE, |
| true); |
| add_node_to_partition (partition, cl_node); |
| partition_callchain (cl_node, partition, cloning_model, |
| freq_cutoff, size, cl_num, |
| npartitions, partition_size, scheme); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| /* Determine whether NODE is an entrypoint to a callchain. */ |
| |
| static bool |
| is_entry_node_p (cgraph_node *node) |
| { |
| /* node->inlined_to is returned as SYMBOL_DUPLICATE. */ |
| if (node->get_partitioning_class () != SYMBOL_PARTITION) |
| return false; |
| |
| /* NODE and all its aliases or thunk are local. */ |
| if (node->local_p ()) |
| return false; |
| |
| if (!node->callers) |
| return true; |
| |
| for (cgraph_edge *e = node->callers; e; e = e->next_caller) |
| { |
| if (! e->recursive_p ()) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| /* Determine order of all external nodes if PGO profile is available. |
| Store the order in ORDER. */ |
| |
| static bool |
| locality_determine_ipa_order (auto_vec<locality_order *> *order) |
| { |
| struct cgraph_node *node; |
| auto_vec<locality_order *> non_comparable_nodes; |
| FOR_EACH_DEFINED_FUNCTION (node) |
| if (node->get_partitioning_class () == SYMBOL_PARTITION) |
| { |
| create_locality_info (node); |
| if (node->no_reorder) |
| { |
| if (dump_file) |
| fprintf (dump_file, "no reorder %s\n", node->dump_asm_name ()); |
| return false; |
| } |
| else if (is_entry_node_p (node)) |
| { |
| profile_count pcnt = node->count.ipa (); |
| if (!pcnt.initialized_p () || !pcnt.ipa_p ()) |
| { |
| sreal cnt = 0; |
| locality_order *lo = new locality_order (node, cnt); |
| non_comparable_nodes.safe_push (lo); |
| continue; |
| } |
| sreal count = 0; |
| struct cgraph_edge *edge; |
| for (edge = node->callees; edge; edge = edge->next_callee) |
| { |
| /* For PGO, frequency is not used in |
| compare_edge_profile_counts (), it's used only as part of |
| static profile order. */ |
| sreal freq = edge->sreal_frequency (); |
| count += freq; |
| } |
| locality_order *cl = new locality_order (node, count); |
| order->safe_push (cl); |
| } |
| } |
| order->qsort (compare_edge_profile_counts); |
| for (auto el : non_comparable_nodes) |
| order->safe_push (el); |
| return true; |
| } |
| |
| /* Determine order of all external nodes if only static profile is available. |
| Store the order in ORDER. */ |
| |
| static bool |
| locality_determine_static_order (auto_vec<locality_order *> *order, |
| lto_locality_heuristics scheme) |
| { |
| struct cgraph_node *node; |
| FOR_EACH_DEFINED_FUNCTION (node) |
| if (node->get_partitioning_class () == SYMBOL_PARTITION) |
| { |
| create_locality_info (node, scheme == LTO_LOCALITY_CPP_TEMPLATE); |
| if (node->no_reorder) |
| { |
| if (dump_file) |
| fprintf (dump_file, "no reorder %s\n", node->dump_asm_name ()); |
| return false; |
| } |
| else if (is_entry_node_p (node)) |
| { |
| sreal count = 0; |
| struct cgraph_edge *edge; |
| for (edge = node->callees; edge; edge = edge->next_callee) |
| { |
| sreal freq = edge->sreal_frequency (); |
| count += freq; |
| } |
| locality_order *cl = new locality_order (node, count); |
| order->safe_push (cl); |
| } |
| } |
| |
| if (scheme == LTO_LOCALITY_CPP_TEMPLATE) |
| { |
| order_templ_hashes (); |
| order->qsort (static_profile_templ_cmp); |
| return true; |
| } |
| |
| order->qsort (static_profile_cmp); |
| return true; |
| } |
| |
| /* Partitioning for code locality. |
| 1. Create and sort callchains. If PGO is available, use real profile |
| counts. Otherwise, use a set of heuristics to sort the callchains. |
| 2. Partition the external nodes and their callchains in the determined order |
| 2.1. If !partition, partition, else try and clone if it satisfies cloning |
| criteria. |
| 3. Partition all other unpartitioned nodes. */ |
| |
| static void |
| locality_partition_and_clone (int max_locality_partition_size, |
| lto_locality_cloning_model cloning_model, |
| lto_locality_heuristics scheme, |
| int freq_denominator, int size) |
| { |
| locality_partition partition; |
| int npartitions = 0; |
| |
| auto_vec<locality_order *> order; |
| auto_vec<varpool_node *> varpool_order; |
| struct cgraph_node *node; |
| bool order_p; |
| |
| int cl_num = 0; |
| |
| double real_freq = 0.0; |
| if (freq_denominator > 0) |
| real_freq = 1.0 / (double) freq_denominator; |
| |
| cgraph_node *n = symtab->first_defined_function (); |
| if (n && n->count.ipa_p ()) |
| order_p = locality_determine_ipa_order (&order); |
| else |
| order_p = locality_determine_static_order (&order, scheme); |
| if (!order_p) |
| { |
| if (dump_file) |
| { |
| fprintf (dump_file, "Locality partition: falling back to balanced" |
| "model\n"); |
| } |
| |
| return; |
| } |
| |
| int64_t partition_size |
| = max_locality_partition_size |
| ? max_locality_partition_size : param_max_partition_size; |
| partition = create_partition (npartitions); |
| |
| for (unsigned i = 0; i < order.length (); i++) |
| { |
| node = order[i]->node; |
| if (node_partitioned_p (node)) |
| continue; |
| |
| if (partition->insns > partition_size) |
| partition = create_partition (npartitions); |
| if (dump_file) |
| fprintf (dump_file, "Partition id: %d\n", partition->part_id); |
| |
| add_node_to_partition (partition, node); |
| if (dump_file) |
| fprintf (dump_file, "Ordered Node: %s\n", node->dump_asm_name ()); |
| |
| partition_callchain (node, partition, cloning_model, real_freq, size, |
| cl_num, npartitions, partition_size, scheme); |
| } |
| |
| for (unsigned i = 0; i < order.length (); i++) |
| delete order[i]; |
| order = vNULL; |
| |
| loc_infos.release (); |
| } |
| |
| /* Entry point to locality-clone pass. */ |
| static int |
| lc_execute (void) |
| { |
| symtab_node *node; |
| FOR_EACH_SYMBOL (node) |
| node->aux = NULL; |
| |
| node_to_ch_info = std::make_unique<hash_map<loc_map_hash, locality_info *>> |
| (); |
| node_to_clone = std::make_unique<hash_map<loc_map_hash, cgraph_node *>> (); |
| clone_to_node = std::make_unique<hash_map<loc_map_hash, cgraph_node *>> (); |
| templ_hash_map = std::make_unique<hash_freq_map_t> (); |
| |
| locality_partition_and_clone (param_max_locality_partition_size, |
| flag_lto_locality_cloning, |
| flag_lto_locality_heuristics, |
| param_lto_locality_frequency, |
| param_lto_locality_size); |
| |
| FOR_EACH_SYMBOL (node) |
| node->aux = NULL; |
| return 0; |
| } |
| |
| namespace { |
| |
| const pass_data pass_data_ipa_locality_clone = { |
| IPA_PASS, /* type */ |
| "locality-clone", /* name */ |
| OPTGROUP_NONE, /* optinfo_flags */ |
| TV_IPA_LC, /* tv_id */ |
| 0, /* properties_required */ |
| 0, /* properties_provided */ |
| 0, /* properties_destroyed */ |
| 0, /* todo_flags_start */ |
| (TODO_dump_symtab | TODO_remove_functions), /* todo_flags_finish */ |
| }; |
| |
| class pass_ipa_locality_cloning : public ipa_opt_pass_d |
| { |
| public: |
| pass_ipa_locality_cloning (gcc::context *ctxt) |
| : ipa_opt_pass_d (pass_data_ipa_locality_clone, ctxt, |
| NULL, /* generate_summary */ |
| NULL, /* write_summary */ |
| NULL, /* read_summary */ |
| NULL, /* write_optimization_summary */ |
| NULL, /* read_optimization_summary */ |
| NULL, /* stmt_fixup */ |
| 0, /* function_transform_todo_flags_start */ |
| NULL, /* function_transform */ |
| NULL) /* variable_transform */ |
| {} |
| |
| /* opt_pass methods: */ |
| virtual bool gate (function *) |
| { |
| return (flag_wpa && flag_ipa_reorder_for_locality); |
| } |
| |
| virtual unsigned int execute (function *) { return lc_execute (); } |
| |
| }; // class pass_ipa_locality_cloning |
| |
| } // namespace |
| |
| ipa_opt_pass_d * |
| make_pass_ipa_locality_cloning (gcc::context *ctxt) |
| { |
| return new pass_ipa_locality_cloning (ctxt); |
| } |