| /* GIMPLE store merging and byte swapping passes. |
| Copyright (C) 2009-2022 Free Software Foundation, Inc. |
| Contributed by ARM Ltd. |
| |
| This file is part of GCC. |
| |
| GCC is free software; you can redistribute it and/or modify it |
| under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 3, or (at your option) |
| any later version. |
| |
| GCC is distributed in the hope that it will be useful, but |
| WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| /* The purpose of the store merging pass is to combine multiple memory stores |
| of constant values, values loaded from memory, bitwise operations on those, |
| or bit-field values, to consecutive locations, into fewer wider stores. |
| |
For example, if we have a sequence performing four byte stores to
| consecutive memory locations: |
| [p ] := imm1; |
| [p + 1B] := imm2; |
| [p + 2B] := imm3; |
| [p + 3B] := imm4; |
| we can transform this into a single 4-byte store if the target supports it: |
| [p] := imm1:imm2:imm3:imm4 concatenated according to endianness. |
| |
| Or: |
| [p ] := [q ]; |
| [p + 1B] := [q + 1B]; |
| [p + 2B] := [q + 2B]; |
| [p + 3B] := [q + 3B]; |
if there is no overlap, this can be transformed into a single 4-byte
load followed by a single 4-byte store.
| |
| Or: |
| [p ] := [q ] ^ imm1; |
| [p + 1B] := [q + 1B] ^ imm2; |
| [p + 2B] := [q + 2B] ^ imm3; |
| [p + 3B] := [q + 3B] ^ imm4; |
if there is no overlap, this can be transformed into a single 4-byte
load, XORed with imm1:imm2:imm3:imm4 and stored using a single 4-byte store.
| |
| Or: |
| [p:1 ] := imm; |
| [p:31] := val & 0x7FFFFFFF; |
| we can transform this into a single 4-byte store if the target supports it: |
| [p] := imm:(val & 0x7FFFFFFF) concatenated according to endianness. |
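
As a concrete illustration, a hypothetical C sketch (the struct, field and
function names here are made up, not taken from any testcase) that produces
four such adjacent one-byte constant stores is:

  struct S { char a, b, c, d; };
  void init_s (struct S *p) { p->a = 1; p->b = 2; p->c = 3; p->d = 4; }

which, subject to the target checks described below, can be rewritten into
a single 4-byte store of the merged constant.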
| |
| The algorithm is applied to each basic block in three phases: |
| |
| 1) Scan through the basic block and record assignments to destinations |
| that can be expressed as a store to memory of a certain size at a certain |
| bit offset from base expressions we can handle. For bit-fields we also |
| record the surrounding bit region, i.e. bits that could be stored in |
| a read-modify-write operation when storing the bit-field. Record store |
| chains to different bases in a hash_map (m_stores) and make sure to |
| terminate such chains when appropriate (for example when the stored |
| values get used subsequently). |
These stores can be the result of structure element initializers, array
stores, etc. A store_immediate_info object is recorded for every such store.
| Record as many such assignments to a single base as possible until a |
| statement that interferes with the store sequence is encountered. |
Each store has up to 2 operands, each of which can be either a constant, a
memory load or an SSA name, from which the value to be stored can be
computed. At most one of the operands can be a constant. The operands are
recorded in a store_operand_info struct.
| |
| 2) Analyze the chains of stores recorded in phase 1) (i.e. the vector of |
| store_immediate_info objects) and coalesce contiguous stores into |
merged_store_group objects. For bit-field stores, we don't require the
stores themselves to be contiguous, only their surrounding bit regions
have to be. If the expression being stored differs between adjacent
stores, such as one store storing a constant and the following one
storing a value loaded from memory, or if the loaded memory objects are
not adjacent, a new merged_store_group is created as well.
| |
| For example, given the stores: |
| [p ] := 0; |
| [p + 1B] := 1; |
| [p + 3B] := 0; |
| [p + 4B] := 1; |
| [p + 5B] := 0; |
| [p + 6B] := 0; |
| This phase would produce two merged_store_group objects, one recording the |
| two bytes stored in the memory region [p : p + 1] and another |
| recording the four bytes stored in the memory region [p + 3 : p + 6]. |
| |
| 3) The merged_store_group objects produced in phase 2) are processed |
to generate the sequence of wider stores that set the contiguous memory
regions to the sequence of bytes that corresponds to them. This may emit
multiple stores per store group to handle contiguous stores that are not
of a size that is a power of 2. For example, it can try to emit a 40-bit
store as a 32-bit store followed by an 8-bit store.
We try to emit stores as wide as we can while respecting STRICT_ALIGNMENT
or TARGET_SLOW_UNALIGNED_ACCESS settings.
| |
| Note on endianness and example: |
| Consider 2 contiguous 16-bit stores followed by 2 contiguous 8-bit stores: |
| [p ] := 0x1234; |
| [p + 2B] := 0x5678; |
| [p + 4B] := 0xab; |
| [p + 5B] := 0xcd; |
| |
| The memory layout for little-endian (LE) and big-endian (BE) must be: |
| p |LE|BE| |
| --------- |
| 0 |34|12| |
| 1 |12|34| |
| 2 |78|56| |
| 3 |56|78| |
| 4 |ab|ab| |
| 5 |cd|cd| |
| |
| To merge these into a single 48-bit merged value 'val' in phase 2) |
| on little-endian we insert stores to higher (consecutive) bitpositions |
| into the most significant bits of the merged value. |
| The final merged value would be: 0xcdab56781234 |
| |
| For big-endian we insert stores to higher bitpositions into the least |
| significant bits of the merged value. |
| The final merged value would be: 0x12345678abcd |
| |
| Then, in phase 3), we want to emit this 48-bit value as a 32-bit store |
| followed by a 16-bit store. Again, we must consider endianness when |
| breaking down the 48-bit value 'val' computed above. |
| For little endian we emit: |
| [p] (32-bit) := 0x56781234; // val & 0x0000ffffffff; |
| [p + 4B] (16-bit) := 0xcdab; // (val & 0xffff00000000) >> 32; |
| |
| Whereas for big-endian we emit: |
| [p] (32-bit) := 0x12345678; // (val & 0xffffffff0000) >> 16; |
| [p + 4B] (16-bit) := 0xabcd; // val & 0x00000000ffff; */ |
| |
| #include "config.h" |
| #include "system.h" |
| #include "coretypes.h" |
| #include "backend.h" |
| #include "tree.h" |
| #include "gimple.h" |
| #include "builtins.h" |
| #include "fold-const.h" |
| #include "tree-pass.h" |
| #include "ssa.h" |
| #include "gimple-pretty-print.h" |
| #include "alias.h" |
| #include "fold-const.h" |
| #include "print-tree.h" |
| #include "tree-hash-traits.h" |
| #include "gimple-iterator.h" |
| #include "gimplify.h" |
| #include "gimple-fold.h" |
| #include "stor-layout.h" |
| #include "timevar.h" |
| #include "cfganal.h" |
| #include "cfgcleanup.h" |
| #include "tree-cfg.h" |
| #include "except.h" |
| #include "tree-eh.h" |
| #include "target.h" |
| #include "gimplify-me.h" |
| #include "rtl.h" |
| #include "expr.h" /* For get_bit_range. */ |
| #include "optabs-tree.h" |
| #include "dbgcnt.h" |
| #include "selftest.h" |
| |
| /* The maximum size (in bits) of the stores this pass should generate. */ |
| #define MAX_STORE_BITSIZE (BITS_PER_WORD) |
| #define MAX_STORE_BYTES (MAX_STORE_BITSIZE / BITS_PER_UNIT) |
| |
| /* Limit to bound the number of aliasing checks for loads with the same |
| vuse as the corresponding store. */ |
| #define MAX_STORE_ALIAS_CHECKS 64 |
| |
| namespace { |
| |
| struct bswap_stat |
| { |
| /* Number of hand-written 16-bit nop / bswaps found. */ |
| int found_16bit; |
| |
| /* Number of hand-written 32-bit nop / bswaps found. */ |
| int found_32bit; |
| |
| /* Number of hand-written 64-bit nop / bswaps found. */ |
| int found_64bit; |
| } nop_stats, bswap_stats; |
| |
/* A symbolic number structure is used to detect byte permutation and selection
patterns of a source. To achieve that, its field N contains an artificial
number consisting of BITS_PER_MARKER-sized markers tracking where each
byte comes from in the source:
| |
| 0 - target byte has the value 0 |
FF - target byte has an unknown value (e.g. due to sign extension)
1..size - marker value is one plus the byte index in the source (1 for the lsb).
| |
To detect permutations on memory sources (arrays and structures), a symbolic
number also records:
| - a base address BASE_ADDR and an OFFSET giving the address of the source; |
| - a range which gives the difference between the highest and lowest accessed |
| memory location to make such a symbolic number; |
| - the address SRC of the source element of lowest address as a convenience |
| to easily get BASE_ADDR + offset + lowest bytepos; |
- the number N_OPS of expressions bitwise ORed together, representing the
approximate cost of the computation.
| |
| Note 1: the range is different from size as size reflects the size of the |
| type of the current expression. For instance, for an array char a[], |
| (short) a[0] | (short) a[3] would have a size of 2 but a range of 4 while |
| (short) a[0] | ((short) a[0] << 1) would still have a size of 2 but this |
| time a range of 1. |
| |
| Note 2: for non-memory sources, range holds the same value as size. |
| |
| Note 3: SRC points to the SSA_NAME in case of non-memory source. */ |
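
/* For illustration: for a plain 4-byte value the markers in N are 0x04030201
(cf. CMPNOP below), i.e. target byte 0 carries marker 1 and thus comes from
the least significant source byte, target byte 1 carries marker 2, and so
on. After a full byte swap of that value the markers read 0x01020304
instead (cf. CMPXCHG). */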
| |
| struct symbolic_number { |
| uint64_t n; |
| tree type; |
| tree base_addr; |
| tree offset; |
| poly_int64_pod bytepos; |
| tree src; |
| tree alias_set; |
| tree vuse; |
| unsigned HOST_WIDE_INT range; |
| int n_ops; |
| }; |
| |
| #define BITS_PER_MARKER 8 |
| #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1) |
| #define MARKER_BYTE_UNKNOWN MARKER_MASK |
| #define HEAD_MARKER(n, size) \ |
| ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER))) |
| |
| /* The number which the find_bswap_or_nop_1 result should match in |
| order to have a nop. The number is masked according to the size of |
| the symbolic number before using it. */ |
| #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \ |
| (uint64_t)0x08070605 << 32 | 0x04030201) |
| |
| /* The number which the find_bswap_or_nop_1 result should match in |
| order to have a byte swap. The number is masked according to the |
| size of the symbolic number before using it. */ |
| #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \ |
| (uint64_t)0x01020304 << 32 | 0x05060708) |
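
/* As an example of what the bswap pass recognizes, consider this
hypothetical hand-written 32-bit byte swap (the function name is made up):

     unsigned int
     my_bswap32 (unsigned int x)
     {
       return ((x & 0x000000ffU) << 24)
              | ((x & 0x0000ff00U) << 8)
              | ((x & 0x00ff0000U) >> 8)
              | ((x & 0xff000000U) >> 24);
     }

Walking the ORs, shifts and masks with find_bswap_or_nop_1 yields the
symbolic number 0x01020304, which matches CMPXCHG for a 4-byte value, so
the whole computation can be replaced by a call to __builtin_bswap32 when
the target supports it. */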
| |
| /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic |
| number N. Return false if the requested operation is not permitted |
| on a symbolic number. */ |
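
/* For instance, shifting the 4-byte symbolic number 0x04030201 left by 8 bits
yields 0x03020100: the least significant byte of the result is now known to
be zero and the most significant source byte has been shifted out. */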
| |
| inline bool |
| do_shift_rotate (enum tree_code code, |
| struct symbolic_number *n, |
| int count) |
| { |
| int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT; |
| unsigned head_marker; |
| |
| if (count < 0 |
| || count >= TYPE_PRECISION (n->type) |
| || count % BITS_PER_UNIT != 0) |
| return false; |
| count = (count / BITS_PER_UNIT) * BITS_PER_MARKER; |
| |
| /* Zero out the extra bits of N in order to avoid them being shifted |
| into the significant bits. */ |
| if (size < 64 / BITS_PER_MARKER) |
| n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1; |
| |
| switch (code) |
| { |
| case LSHIFT_EXPR: |
| n->n <<= count; |
| break; |
| case RSHIFT_EXPR: |
| head_marker = HEAD_MARKER (n->n, size); |
| n->n >>= count; |
| /* Arithmetic shift of signed type: result is dependent on the value. */ |
| if (!TYPE_UNSIGNED (n->type) && head_marker) |
| for (i = 0; i < count / BITS_PER_MARKER; i++) |
| n->n |= (uint64_t) MARKER_BYTE_UNKNOWN |
| << ((size - 1 - i) * BITS_PER_MARKER); |
| break; |
| case LROTATE_EXPR: |
| n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count)); |
| break; |
| case RROTATE_EXPR: |
| n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count)); |
| break; |
| default: |
| return false; |
| } |
| /* Zero unused bits for size. */ |
| if (size < 64 / BITS_PER_MARKER) |
| n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1; |
| return true; |
| } |
| |
| /* Perform sanity checking for the symbolic number N and the gimple |
| statement STMT. */ |
| |
| inline bool |
| verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt) |
| { |
| tree lhs_type; |
| |
| lhs_type = TREE_TYPE (gimple_get_lhs (stmt)); |
| |
| if (TREE_CODE (lhs_type) != INTEGER_TYPE |
| && TREE_CODE (lhs_type) != ENUMERAL_TYPE) |
| return false; |
| |
| if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type)) |
| return false; |
| |
| return true; |
| } |
| |
| /* Initialize the symbolic number N for the bswap pass from the base element |
| SRC manipulated by the bitwise OR expression. */ |
| |
| bool |
| init_symbolic_number (struct symbolic_number *n, tree src) |
| { |
| int size; |
| |
| if (!INTEGRAL_TYPE_P (TREE_TYPE (src)) && !POINTER_TYPE_P (TREE_TYPE (src))) |
| return false; |
| |
| n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE; |
| n->src = src; |
| |
/* Set up the symbolic number N by setting each byte to a value between 1 and
the byte size of SRC. The highest order byte is set to that size and the
lowest order byte to 1. */
| n->type = TREE_TYPE (src); |
| size = TYPE_PRECISION (n->type); |
| if (size % BITS_PER_UNIT != 0) |
| return false; |
| size /= BITS_PER_UNIT; |
| if (size > 64 / BITS_PER_MARKER) |
| return false; |
| n->range = size; |
| n->n = CMPNOP; |
| n->n_ops = 1; |
| |
| if (size < 64 / BITS_PER_MARKER) |
| n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1; |
| |
| return true; |
| } |
| |
/* Check if STMT might be a byte swap or a nop from a memory source and return
the answer. If so, REF is that memory source and the base of the memory area
accessed and the offset of the access from that base are recorded in N. */
| |
| bool |
| find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n) |
| { |
/* Leaf node is an array or component ref. Memorize its base and
offset from base to compare to other such leaf nodes. */
| poly_int64 bitsize, bitpos, bytepos; |
| machine_mode mode; |
| int unsignedp, reversep, volatilep; |
| tree offset, base_addr; |
| |
| /* Not prepared to handle PDP endian. */ |
| if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN) |
| return false; |
| |
| if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt)) |
| return false; |
| |
| base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode, |
| &unsignedp, &reversep, &volatilep); |
| |
| if (TREE_CODE (base_addr) == TARGET_MEM_REF) |
| /* Do not rewrite TARGET_MEM_REF. */ |
| return false; |
| else if (TREE_CODE (base_addr) == MEM_REF) |
| { |
| poly_offset_int bit_offset = 0; |
| tree off = TREE_OPERAND (base_addr, 1); |
| |
| if (!integer_zerop (off)) |
| { |
| poly_offset_int boff = mem_ref_offset (base_addr); |
| boff <<= LOG2_BITS_PER_UNIT; |
| bit_offset += boff; |
| } |
| |
| base_addr = TREE_OPERAND (base_addr, 0); |
| |
| /* Avoid returning a negative bitpos as this may wreak havoc later. */ |
| if (maybe_lt (bit_offset, 0)) |
| { |
| tree byte_offset = wide_int_to_tree |
| (sizetype, bits_to_bytes_round_down (bit_offset)); |
| bit_offset = num_trailing_bits (bit_offset); |
| if (offset) |
| offset = size_binop (PLUS_EXPR, offset, byte_offset); |
| else |
| offset = byte_offset; |
| } |
| |
| bitpos += bit_offset.force_shwi (); |
| } |
| else |
| base_addr = build_fold_addr_expr (base_addr); |
| |
| if (!multiple_p (bitpos, BITS_PER_UNIT, &bytepos)) |
| return false; |
| if (!multiple_p (bitsize, BITS_PER_UNIT)) |
| return false; |
| if (reversep) |
| return false; |
| |
| if (!init_symbolic_number (n, ref)) |
| return false; |
| n->base_addr = base_addr; |
| n->offset = offset; |
| n->bytepos = bytepos; |
| n->alias_set = reference_alias_ptr_type (ref); |
| n->vuse = gimple_vuse (stmt); |
| return true; |
| } |
| |
/* Compute the symbolic number N representing the result of a bitwise OR,
bitwise XOR or plus on 2 symbolic numbers N1 and N2 whose source statements
are respectively SOURCE_STMT1 and SOURCE_STMT2. CODE is the operation. */
| |
| gimple * |
| perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1, |
| gimple *source_stmt2, struct symbolic_number *n2, |
| struct symbolic_number *n, enum tree_code code) |
| { |
| int i, size; |
| uint64_t mask; |
| gimple *source_stmt; |
| struct symbolic_number *n_start; |
| |
| tree rhs1 = gimple_assign_rhs1 (source_stmt1); |
| if (TREE_CODE (rhs1) == BIT_FIELD_REF |
| && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME) |
| rhs1 = TREE_OPERAND (rhs1, 0); |
| tree rhs2 = gimple_assign_rhs1 (source_stmt2); |
| if (TREE_CODE (rhs2) == BIT_FIELD_REF |
| && TREE_CODE (TREE_OPERAND (rhs2, 0)) == SSA_NAME) |
| rhs2 = TREE_OPERAND (rhs2, 0); |
| |
/* Sources are different, cancel bswap if they are not memory locations with
the same base (array, structure, ...). */
| if (rhs1 != rhs2) |
| { |
| uint64_t inc; |
| HOST_WIDE_INT start1, start2, start_sub, end_sub, end1, end2, end; |
| struct symbolic_number *toinc_n_ptr, *n_end; |
| basic_block bb1, bb2; |
| |
| if (!n1->base_addr || !n2->base_addr |
| || !operand_equal_p (n1->base_addr, n2->base_addr, 0)) |
| return NULL; |
| |
| if (!n1->offset != !n2->offset |
| || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0))) |
| return NULL; |
| |
| start1 = 0; |
| if (!(n2->bytepos - n1->bytepos).is_constant (&start2)) |
| return NULL; |
| |
| if (start1 < start2) |
| { |
| n_start = n1; |
| start_sub = start2 - start1; |
| } |
| else |
| { |
| n_start = n2; |
| start_sub = start1 - start2; |
| } |
| |
| bb1 = gimple_bb (source_stmt1); |
| bb2 = gimple_bb (source_stmt2); |
| if (dominated_by_p (CDI_DOMINATORS, bb1, bb2)) |
| source_stmt = source_stmt1; |
| else |
| source_stmt = source_stmt2; |
| |
| /* Find the highest address at which a load is performed and |
| compute related info. */ |
| end1 = start1 + (n1->range - 1); |
| end2 = start2 + (n2->range - 1); |
| if (end1 < end2) |
| { |
| end = end2; |
| end_sub = end2 - end1; |
| } |
| else |
| { |
| end = end1; |
| end_sub = end1 - end2; |
| } |
| n_end = (end2 > end1) ? n2 : n1; |
| |
| /* Find symbolic number whose lsb is the most significant. */ |
| if (BYTES_BIG_ENDIAN) |
| toinc_n_ptr = (n_end == n1) ? n2 : n1; |
| else |
| toinc_n_ptr = (n_start == n1) ? n2 : n1; |
| |
| n->range = end - MIN (start1, start2) + 1; |
| |
| /* Check that the range of memory covered can be represented by |
| a symbolic number. */ |
| if (n->range > 64 / BITS_PER_MARKER) |
| return NULL; |
| |
| /* Reinterpret byte marks in symbolic number holding the value of |
| bigger weight according to target endianness. */ |
| inc = BYTES_BIG_ENDIAN ? end_sub : start_sub; |
| size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT; |
| for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER) |
| { |
| unsigned marker |
| = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK; |
| if (marker && marker != MARKER_BYTE_UNKNOWN) |
| toinc_n_ptr->n += inc; |
| } |
| } |
| else |
| { |
| n->range = n1->range; |
| n_start = n1; |
| source_stmt = source_stmt1; |
| } |
| |
| if (!n1->alias_set |
| || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set)) |
| n->alias_set = n1->alias_set; |
| else |
| n->alias_set = ptr_type_node; |
| n->vuse = n_start->vuse; |
| n->base_addr = n_start->base_addr; |
| n->offset = n_start->offset; |
| n->src = n_start->src; |
| n->bytepos = n_start->bytepos; |
| n->type = n_start->type; |
| size = TYPE_PRECISION (n->type) / BITS_PER_UNIT; |
| uint64_t res_n = n1->n | n2->n; |
| |
| for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER) |
| { |
| uint64_t masked1, masked2; |
| |
| masked1 = n1->n & mask; |
| masked2 = n2->n & mask; |
| /* If at least one byte is 0, all of 0 | x == 0 ^ x == 0 + x == x. */ |
| if (masked1 && masked2) |
| { |
| /* + can carry into upper bits, just punt. */ |
| if (code == PLUS_EXPR) |
| return NULL; |
| /* x | x is still x. */ |
| if (code == BIT_IOR_EXPR && masked1 == masked2) |
| continue; |
| if (code == BIT_XOR_EXPR) |
| { |
| /* x ^ x is 0, but MARKER_BYTE_UNKNOWN stands for |
| unknown values and unknown ^ unknown is unknown. */ |
| if (masked1 == masked2 |
| && masked1 != ((uint64_t) MARKER_BYTE_UNKNOWN |
| << i * BITS_PER_MARKER)) |
| { |
| res_n &= ~mask; |
| continue; |
| } |
| } |
| /* Otherwise set the byte to unknown, it might still be |
| later masked off. */ |
| res_n |= mask; |
| } |
| } |
| n->n = res_n; |
| n->n_ops = n1->n_ops + n2->n_ops; |
| |
| return source_stmt; |
| } |
| |
/* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
the operation given by the rhs of STMT on the result. If the operation
was performed successfully, the function returns a gimple stmt whose
rhs's first tree is the expression of the source operand; otherwise it
returns NULL. */
| |
| gimple * |
| find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit) |
| { |
| enum tree_code code; |
| tree rhs1, rhs2 = NULL; |
| gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1; |
| enum gimple_rhs_class rhs_class; |
| |
| if (!limit || !is_gimple_assign (stmt)) |
| return NULL; |
| |
| rhs1 = gimple_assign_rhs1 (stmt); |
| |
| if (find_bswap_or_nop_load (stmt, rhs1, n)) |
| return stmt; |
| |
| /* Handle BIT_FIELD_REF. */ |
| if (TREE_CODE (rhs1) == BIT_FIELD_REF |
| && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME) |
| { |
| if (!tree_fits_uhwi_p (TREE_OPERAND (rhs1, 1)) |
| || !tree_fits_uhwi_p (TREE_OPERAND (rhs1, 2))) |
| return NULL; |
| |
| unsigned HOST_WIDE_INT bitsize = tree_to_uhwi (TREE_OPERAND (rhs1, 1)); |
| unsigned HOST_WIDE_INT bitpos = tree_to_uhwi (TREE_OPERAND (rhs1, 2)); |
| if (bitpos % BITS_PER_UNIT == 0 |
| && bitsize % BITS_PER_UNIT == 0 |
| && init_symbolic_number (n, TREE_OPERAND (rhs1, 0))) |
| { |
| /* Handle big-endian bit numbering in BIT_FIELD_REF. */ |
| if (BYTES_BIG_ENDIAN) |
| bitpos = TYPE_PRECISION (n->type) - bitpos - bitsize; |
| |
| /* Shift. */ |
| if (!do_shift_rotate (RSHIFT_EXPR, n, bitpos)) |
| return NULL; |
| |
| /* Mask. */ |
| uint64_t mask = 0; |
| uint64_t tmp = (1 << BITS_PER_UNIT) - 1; |
| for (unsigned i = 0; i < bitsize / BITS_PER_UNIT; |
| i++, tmp <<= BITS_PER_UNIT) |
| mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER); |
| n->n &= mask; |
| |
| /* Convert. */ |
| n->type = TREE_TYPE (rhs1); |
| if (!n->base_addr) |
| n->range = TYPE_PRECISION (n->type) / BITS_PER_UNIT; |
| |
| return verify_symbolic_number_p (n, stmt) ? stmt : NULL; |
| } |
| |
| return NULL; |
| } |
| |
| if (TREE_CODE (rhs1) != SSA_NAME) |
| return NULL; |
| |
| code = gimple_assign_rhs_code (stmt); |
| rhs_class = gimple_assign_rhs_class (stmt); |
| rhs1_stmt = SSA_NAME_DEF_STMT (rhs1); |
| |
| if (rhs_class == GIMPLE_BINARY_RHS) |
| rhs2 = gimple_assign_rhs2 (stmt); |
| |
| /* Handle unary rhs and binary rhs with integer constants as second |
| operand. */ |
| |
| if (rhs_class == GIMPLE_UNARY_RHS |
| || (rhs_class == GIMPLE_BINARY_RHS |
| && TREE_CODE (rhs2) == INTEGER_CST)) |
| { |
| if (code != BIT_AND_EXPR |
| && code != LSHIFT_EXPR |
| && code != RSHIFT_EXPR |
| && code != LROTATE_EXPR |
| && code != RROTATE_EXPR |
| && !CONVERT_EXPR_CODE_P (code)) |
| return NULL; |
| |
| source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1); |
| |
| /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and |
| we have to initialize the symbolic number. */ |
| if (!source_stmt1) |
| { |
| if (gimple_assign_load_p (stmt) |
| || !init_symbolic_number (n, rhs1)) |
| return NULL; |
| source_stmt1 = stmt; |
| } |
| |
| switch (code) |
| { |
| case BIT_AND_EXPR: |
| { |
| int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT; |
| uint64_t val = int_cst_value (rhs2), mask = 0; |
| uint64_t tmp = (1 << BITS_PER_UNIT) - 1; |
| |
| /* Only constants masking full bytes are allowed. */ |
| for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT) |
| if ((val & tmp) != 0 && (val & tmp) != tmp) |
| return NULL; |
| else if (val & tmp) |
| mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER); |
| |
| n->n &= mask; |
| } |
| break; |
| case LSHIFT_EXPR: |
| case RSHIFT_EXPR: |
| case LROTATE_EXPR: |
| case RROTATE_EXPR: |
| if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2))) |
| return NULL; |
| break; |
| CASE_CONVERT: |
| { |
| int i, type_size, old_type_size; |
| tree type; |
| |
| type = TREE_TYPE (gimple_assign_lhs (stmt)); |
| type_size = TYPE_PRECISION (type); |
| if (type_size % BITS_PER_UNIT != 0) |
| return NULL; |
| type_size /= BITS_PER_UNIT; |
| if (type_size > 64 / BITS_PER_MARKER) |
| return NULL; |
| |
| /* Sign extension: result is dependent on the value. */ |
| old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT; |
| if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size |
| && HEAD_MARKER (n->n, old_type_size)) |
| for (i = 0; i < type_size - old_type_size; i++) |
| n->n |= (uint64_t) MARKER_BYTE_UNKNOWN |
| << ((type_size - 1 - i) * BITS_PER_MARKER); |
| |
| if (type_size < 64 / BITS_PER_MARKER) |
| { |
| /* If STMT casts to a smaller type mask out the bits not |
| belonging to the target type. */ |
| n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1; |
| } |
| n->type = type; |
| if (!n->base_addr) |
| n->range = type_size; |
| } |
| break; |
| default: |
| return NULL; |
| }; |
| return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL; |
| } |
| |
| /* Handle binary rhs. */ |
| |
| if (rhs_class == GIMPLE_BINARY_RHS) |
| { |
| struct symbolic_number n1, n2; |
| gimple *source_stmt, *source_stmt2; |
| |
| if (!rhs2 || TREE_CODE (rhs2) != SSA_NAME) |
| return NULL; |
| |
| rhs2_stmt = SSA_NAME_DEF_STMT (rhs2); |
| |
| switch (code) |
| { |
| case BIT_IOR_EXPR: |
| case BIT_XOR_EXPR: |
| case PLUS_EXPR: |
| source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1); |
| |
| if (!source_stmt1) |
| return NULL; |
| |
| source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1); |
| |
| if (!source_stmt2) |
| return NULL; |
| |
| if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type)) |
| return NULL; |
| |
| if (n1.vuse != n2.vuse) |
| return NULL; |
| |
| source_stmt |
| = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n, |
| code); |
| |
| if (!source_stmt) |
| return NULL; |
| |
| if (!verify_symbolic_number_p (n, stmt)) |
| return NULL; |
| |
| break; |
| default: |
| return NULL; |
| } |
| return source_stmt; |
| } |
| return NULL; |
| } |
| |
| /* Helper for find_bswap_or_nop and try_coalesce_bswap to compute |
| *CMPXCHG, *CMPNOP and adjust *N. */ |
| |
| void |
| find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg, |
| uint64_t *cmpnop, bool *cast64_to_32) |
| { |
| unsigned rsize; |
| uint64_t tmpn, mask; |
| |
| /* The number which the find_bswap_or_nop_1 result should match in order |
| to have a full byte swap. The number is shifted to the right |
| according to the size of the symbolic number before using it. */ |
| *cmpxchg = CMPXCHG; |
| *cmpnop = CMPNOP; |
| *cast64_to_32 = false; |
| |
| /* Find real size of result (highest non-zero byte). */ |
| if (n->base_addr) |
| for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++); |
| else |
| rsize = n->range; |
| |
/* Zero out the bits corresponding to untouched bytes in the original gimple
expression. */
| if (n->range < (int) sizeof (int64_t)) |
| { |
| mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1; |
| if (n->base_addr == NULL |
| && n->range == 4 |
| && int_size_in_bytes (TREE_TYPE (n->src)) == 8) |
| { |
| /* If all bytes in n->n are either 0 or in [5..8] range, this |
| might be a candidate for (unsigned) __builtin_bswap64 (src). |
| It is not worth it for (unsigned short) __builtin_bswap64 (src) |
| or (unsigned short) __builtin_bswap32 (src). */ |
| *cast64_to_32 = true; |
| for (tmpn = n->n; tmpn; tmpn >>= BITS_PER_MARKER) |
| if ((tmpn & MARKER_MASK) |
| && ((tmpn & MARKER_MASK) <= 4 || (tmpn & MARKER_MASK) > 8)) |
| { |
| *cast64_to_32 = false; |
| break; |
| } |
| } |
| if (*cast64_to_32) |
| *cmpxchg &= mask; |
| else |
| *cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER; |
| *cmpnop &= mask; |
| } |
| |
| /* Zero out the bits corresponding to unused bytes in the result of the |
| gimple expression. */ |
| if (rsize < n->range) |
| { |
| if (BYTES_BIG_ENDIAN) |
| { |
| mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1; |
| *cmpxchg &= mask; |
| if (n->range - rsize == sizeof (int64_t)) |
| *cmpnop = 0; |
| else |
| *cmpnop >>= (n->range - rsize) * BITS_PER_MARKER; |
| } |
| else |
| { |
| mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1; |
| if (n->range - rsize == sizeof (int64_t)) |
| *cmpxchg = 0; |
| else |
| *cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER; |
| *cmpnop &= mask; |
| } |
| n->range = rsize; |
| } |
| |
| if (*cast64_to_32) |
| n->range = 8; |
| n->range *= BITS_PER_UNIT; |
| } |
| |
/* Check if STMT completes a bswap implementation or a read in a given
endianness consisting of ORs, SHIFTs and ANDs and set *BSWAP
accordingly. It also sets N to represent the kind of operations
performed: size of the resulting expression and whether it works on
a memory source, and if so alias set and vuse. Finally, the
function returns a stmt whose rhs's first tree is the source
expression. */
| |
| gimple * |
| find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap, |
| bool *cast64_to_32, uint64_t *mask) |
| { |
| tree type_size = TYPE_SIZE_UNIT (TREE_TYPE (gimple_get_lhs (stmt))); |
| if (!tree_fits_uhwi_p (type_size)) |
| return NULL; |
| |
/* The last parameter determines the depth search limit. It usually
correlates directly to the number n of bytes to be touched. We
increase that number by 2 * (log2(n) + 1) here in order to also
cover signed -> unsigned conversions of the src operand as can be seen
in libgcc, and for an initial shift/and operation of the src operand. */
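/* For example, for an 8-byte result the limit is 8 + 2 * (1 + 3) == 16. */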
| int limit = tree_to_uhwi (type_size); |
| limit += 2 * (1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit)); |
| gimple *ins_stmt = find_bswap_or_nop_1 (stmt, n, limit); |
| |
| if (!ins_stmt) |
| { |
| if (gimple_assign_rhs_code (stmt) != CONSTRUCTOR |
| || BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN) |
| return NULL; |
| unsigned HOST_WIDE_INT sz = tree_to_uhwi (type_size) * BITS_PER_UNIT; |
| if (sz != 16 && sz != 32 && sz != 64) |
| return NULL; |
| tree rhs = gimple_assign_rhs1 (stmt); |
| if (CONSTRUCTOR_NELTS (rhs) == 0) |
| return NULL; |
| tree eltype = TREE_TYPE (TREE_TYPE (rhs)); |
| unsigned HOST_WIDE_INT eltsz |
| = int_size_in_bytes (eltype) * BITS_PER_UNIT; |
| if (TYPE_PRECISION (eltype) != eltsz) |
| return NULL; |
| constructor_elt *elt; |
| unsigned int i; |
| tree type = build_nonstandard_integer_type (sz, 1); |
| FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (rhs), i, elt) |
| { |
| if (TREE_CODE (elt->value) != SSA_NAME |
| || !INTEGRAL_TYPE_P (TREE_TYPE (elt->value))) |
| return NULL; |
| struct symbolic_number n1; |
| gimple *source_stmt |
| = find_bswap_or_nop_1 (SSA_NAME_DEF_STMT (elt->value), &n1, |
| limit - 1); |
| |
| if (!source_stmt) |
| return NULL; |
| |
| n1.type = type; |
| if (!n1.base_addr) |
| n1.range = sz / BITS_PER_UNIT; |
| |
| if (i == 0) |
| { |
| ins_stmt = source_stmt; |
| *n = n1; |
| } |
| else |
| { |
| if (n->vuse != n1.vuse) |
| return NULL; |
| |
| struct symbolic_number n0 = *n; |
| |
| if (!BYTES_BIG_ENDIAN) |
| { |
| if (!do_shift_rotate (LSHIFT_EXPR, &n1, i * eltsz)) |
| return NULL; |
| } |
| else if (!do_shift_rotate (LSHIFT_EXPR, &n0, eltsz)) |
| return NULL; |
| ins_stmt |
| = perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n, |
| BIT_IOR_EXPR); |
| |
| if (!ins_stmt) |
| return NULL; |
| } |
| } |
| } |
| |
| uint64_t cmpxchg, cmpnop; |
| find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop, cast64_to_32); |
| |
/* A complete byte swap should make the symbolic number start with
the largest digit in the highest order byte. An unchanged symbolic
number indicates a read with the same endianness as the target
architecture. */
| *mask = ~(uint64_t) 0; |
| if (n->n == cmpnop) |
| *bswap = false; |
| else if (n->n == cmpxchg) |
| *bswap = true; |
| else |
| { |
| int set = 0; |
| for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER) |
| if ((n->n & msk) == 0) |
| *mask &= ~msk; |
| else if ((n->n & msk) == (cmpxchg & msk)) |
| set++; |
| else |
| return NULL; |
| if (set < 2) |
| return NULL; |
| *bswap = true; |
| } |
| |
| /* Useless bit manipulation performed by code. */ |
| if (!n->base_addr && n->n == cmpnop && n->n_ops == 1) |
| return NULL; |
| |
| return ins_stmt; |
| } |
| |
| const pass_data pass_data_optimize_bswap = |
| { |
| GIMPLE_PASS, /* type */ |
| "bswap", /* name */ |
| OPTGROUP_NONE, /* optinfo_flags */ |
| TV_NONE, /* tv_id */ |
| PROP_ssa, /* properties_required */ |
| 0, /* properties_provided */ |
| 0, /* properties_destroyed */ |
| 0, /* todo_flags_start */ |
| 0, /* todo_flags_finish */ |
| }; |
| |
| class pass_optimize_bswap : public gimple_opt_pass |
| { |
| public: |
| pass_optimize_bswap (gcc::context *ctxt) |
| : gimple_opt_pass (pass_data_optimize_bswap, ctxt) |
| {} |
| |
| /* opt_pass methods: */ |
| virtual bool gate (function *) |
| { |
| return flag_expensive_optimizations && optimize && BITS_PER_UNIT == 8; |
| } |
| |
| virtual unsigned int execute (function *); |
| |
| }; // class pass_optimize_bswap |
| |
/* Helper function for bswap_replace. Build a VIEW_CONVERT_EXPR from
VAL to TYPE. If VAL has a different type size, emit a NOP_EXPR cast
first. */
| |
| static tree |
| bswap_view_convert (gimple_stmt_iterator *gsi, tree type, tree val, |
| bool before) |
| { |
| gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (val)) |
| || POINTER_TYPE_P (TREE_TYPE (val))); |
| if (TYPE_SIZE (type) != TYPE_SIZE (TREE_TYPE (val))) |
| { |
| HOST_WIDE_INT prec = TREE_INT_CST_LOW (TYPE_SIZE (type)); |
| if (POINTER_TYPE_P (TREE_TYPE (val))) |
| { |
| gimple *g |
| = gimple_build_assign (make_ssa_name (pointer_sized_int_node), |
| NOP_EXPR, val); |
| if (before) |
| gsi_insert_before (gsi, g, GSI_SAME_STMT); |
| else |
| gsi_insert_after (gsi, g, GSI_NEW_STMT); |
| val = gimple_assign_lhs (g); |
| } |
| tree itype = build_nonstandard_integer_type (prec, 1); |
| gimple *g = gimple_build_assign (make_ssa_name (itype), NOP_EXPR, val); |
| if (before) |
| gsi_insert_before (gsi, g, GSI_SAME_STMT); |
| else |
| gsi_insert_after (gsi, g, GSI_NEW_STMT); |
| val = gimple_assign_lhs (g); |
| } |
| return build1 (VIEW_CONVERT_EXPR, type, val); |
| } |
| |
/* Perform the bswap optimization: replace the expression computed in the rhs
of gsi_stmt (GSI) (or if NULL add instead of replace) by an equivalent
bswap, load or load + bswap expression.
Which of these alternatives replaces the rhs is given by N->base_addr (non
null if a load is needed) and BSWAP. The type, VUSE and alias set of the
load to perform are also given in N while the builtin bswap invocation is
given in FNDECL. Finally, if a load is involved, INS_STMT refers to one of
the load statements involved to construct the rhs in gsi_stmt (GSI) and
N->range gives the size of the rhs expression for maintaining some
statistics.

Note that if the replacement involves a load and if gsi_stmt (GSI) is
non-NULL, that stmt is moved just before INS_STMT to do the load with the
same VUSE, which can lead to gsi_stmt (GSI) changing basic block. */
| |
| tree |
| bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, |
| tree bswap_type, tree load_type, struct symbolic_number *n, |
| bool bswap, uint64_t mask) |
| { |
| tree src, tmp, tgt = NULL_TREE; |
| gimple *bswap_stmt, *mask_stmt = NULL; |
| tree_code conv_code = NOP_EXPR; |
| |
| gimple *cur_stmt = gsi_stmt (gsi); |
| src = n->src; |
| if (cur_stmt) |
| { |
| tgt = gimple_assign_lhs (cur_stmt); |
| if (gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR |
| && tgt |
| && VECTOR_TYPE_P (TREE_TYPE (tgt))) |
| conv_code = VIEW_CONVERT_EXPR; |
| } |
| |
| /* Need to load the value from memory first. */ |
| if (n->base_addr) |
| { |
| gimple_stmt_iterator gsi_ins = gsi; |
| if (ins_stmt) |
| gsi_ins = gsi_for_stmt (ins_stmt); |
| tree addr_expr, addr_tmp, val_expr, val_tmp; |
| tree load_offset_ptr, aligned_load_type; |
| gimple *load_stmt; |
| unsigned align = get_object_alignment (src); |
| poly_int64 load_offset = 0; |
| |
| if (cur_stmt) |
| { |
| basic_block ins_bb = gimple_bb (ins_stmt); |
| basic_block cur_bb = gimple_bb (cur_stmt); |
| if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb)) |
| return NULL_TREE; |
| |
/* Move cur_stmt just before one of the loads of the original
to ensure it has the same VUSE. See PR61517 for what could
go wrong. */
| if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt)) |
| reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt)); |
| gsi_move_before (&gsi, &gsi_ins); |
| gsi = gsi_for_stmt (cur_stmt); |
| } |
| else |
| gsi = gsi_ins; |
| |
| /* Compute address to load from and cast according to the size |
| of the load. */ |
| addr_expr = build_fold_addr_expr (src); |
| if (is_gimple_mem_ref_addr (addr_expr)) |
| addr_tmp = unshare_expr (addr_expr); |
| else |
| { |
| addr_tmp = unshare_expr (n->base_addr); |
| if (!is_gimple_mem_ref_addr (addr_tmp)) |
| addr_tmp = force_gimple_operand_gsi_1 (&gsi, addr_tmp, |
| is_gimple_mem_ref_addr, |
| NULL_TREE, true, |
| GSI_SAME_STMT); |
| load_offset = n->bytepos; |
| if (n->offset) |
| { |
| tree off |
| = force_gimple_operand_gsi (&gsi, unshare_expr (n->offset), |
| true, NULL_TREE, true, |
| GSI_SAME_STMT); |
| gimple *stmt |
| = gimple_build_assign (make_ssa_name (TREE_TYPE (addr_tmp)), |
| POINTER_PLUS_EXPR, addr_tmp, off); |
| gsi_insert_before (&gsi, stmt, GSI_SAME_STMT); |
| addr_tmp = gimple_assign_lhs (stmt); |
| } |
| } |
| |
| /* Perform the load. */ |
| aligned_load_type = load_type; |
| if (align < TYPE_ALIGN (load_type)) |
| aligned_load_type = build_aligned_type (load_type, align); |
| load_offset_ptr = build_int_cst (n->alias_set, load_offset); |
| val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp, |
| load_offset_ptr); |
| |
| if (!bswap) |
| { |
| if (n->range == 16) |
| nop_stats.found_16bit++; |
| else if (n->range == 32) |
| nop_stats.found_32bit++; |
| else |
| { |
| gcc_assert (n->range == 64); |
| nop_stats.found_64bit++; |
| } |
| |
| /* Convert the result of load if necessary. */ |
| if (tgt && !useless_type_conversion_p (TREE_TYPE (tgt), load_type)) |
| { |
| val_tmp = make_temp_ssa_name (aligned_load_type, NULL, |
| "load_dst"); |
| load_stmt = gimple_build_assign (val_tmp, val_expr); |
| gimple_set_vuse (load_stmt, n->vuse); |
| gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT); |
| if (conv_code == VIEW_CONVERT_EXPR) |
| val_tmp = bswap_view_convert (&gsi, TREE_TYPE (tgt), val_tmp, |
| true); |
| gimple_assign_set_rhs_with_ops (&gsi, conv_code, val_tmp); |
| update_stmt (cur_stmt); |
| } |
| else if (cur_stmt) |
| { |
| gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr); |
| gimple_set_vuse (cur_stmt, n->vuse); |
| update_stmt (cur_stmt); |
| } |
| else |
| { |
| tgt = make_ssa_name (load_type); |
| cur_stmt = gimple_build_assign (tgt, MEM_REF, val_expr); |
| gimple_set_vuse (cur_stmt, n->vuse); |
| gsi_insert_before (&gsi, cur_stmt, GSI_SAME_STMT); |
| } |
| |
| if (dump_file) |
| { |
| fprintf (dump_file, |
| "%d bit load in target endianness found at: ", |
| (int) n->range); |
| print_gimple_stmt (dump_file, cur_stmt, 0); |
| } |
| return tgt; |
| } |
| else |
| { |
| val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst"); |
| load_stmt = gimple_build_assign (val_tmp, val_expr); |
| gimple_set_vuse (load_stmt, n->vuse); |
| gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT); |
| } |
| src = val_tmp; |
| } |
| else if (!bswap) |
| { |
| gimple *g = NULL; |
| if (tgt && !useless_type_conversion_p (TREE_TYPE (tgt), TREE_TYPE (src))) |
| { |
| if (!is_gimple_val (src)) |
| return NULL_TREE; |
| if (conv_code == VIEW_CONVERT_EXPR) |
| src = bswap_view_convert (&gsi, TREE_TYPE (tgt), src, true); |
| g = gimple_build_assign (tgt, conv_code, src); |
| } |
| else if (cur_stmt) |
| g = gimple_build_assign (tgt, src); |
| else |
| tgt = src; |
| if (n->range == 16) |
| nop_stats.found_16bit++; |
| else if (n->range == 32) |
| nop_stats.found_32bit++; |
| else |
| { |
| gcc_assert (n->range == 64); |
| nop_stats.found_64bit++; |
| } |
| if (dump_file) |
| { |
| fprintf (dump_file, |
| "%d bit reshuffle in target endianness found at: ", |
| (int) n->range); |
| if (cur_stmt) |
| print_gimple_stmt (dump_file, cur_stmt, 0); |
| else |
| { |
| print_generic_expr (dump_file, tgt, TDF_NONE); |
| fprintf (dump_file, "\n"); |
| } |
| } |
| if (cur_stmt) |
| gsi_replace (&gsi, g, true); |
| return tgt; |
| } |
| else if (TREE_CODE (src) == BIT_FIELD_REF) |
| src = TREE_OPERAND (src, 0); |
| |
| if (n->range == 16) |
| bswap_stats.found_16bit++; |
| else if (n->range == 32) |
| bswap_stats.found_32bit++; |
| else |
| { |
| gcc_assert (n->range == 64); |
| bswap_stats.found_64bit++; |
| } |
| |
| tmp = src; |
| |
| /* Convert the src expression if necessary. */ |
| if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type)) |
| { |
| gimple *convert_stmt; |
| |
| tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc"); |
| convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src); |
| gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT); |
| } |
| |
/* The canonical form for a 16-bit bswap is a rotate expression. Only 16-bit
values are handled this way, as a rotation of a 2N-bit value by N bits is
generally not equivalent to a bswap. Consider for instance 0x01020304 r>> 16
which gives 0x03040102 while a bswap for that value is 0x04030201. */
| if (bswap && n->range == 16) |
| { |
| tree count = build_int_cst (NULL, BITS_PER_UNIT); |
| src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count); |
| bswap_stmt = gimple_build_assign (NULL, src); |
| } |
| else |
| bswap_stmt = gimple_build_call (fndecl, 1, tmp); |
| |
| if (tgt == NULL_TREE) |
| tgt = make_ssa_name (bswap_type); |
| tmp = tgt; |
| |
| if (mask != ~(uint64_t) 0) |
| { |
| tree m = build_int_cst (bswap_type, mask); |
| tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst"); |
| gimple_set_lhs (bswap_stmt, tmp); |
| mask_stmt = gimple_build_assign (tgt, BIT_AND_EXPR, tmp, m); |
| tmp = tgt; |
| } |
| |
| /* Convert the result if necessary. */ |
| if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type)) |
| { |
| tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst"); |
| tree atmp = tmp; |
| gimple_stmt_iterator gsi2 = gsi; |
| if (conv_code == VIEW_CONVERT_EXPR) |
| atmp = bswap_view_convert (&gsi2, TREE_TYPE (tgt), tmp, false); |
| gimple *convert_stmt = gimple_build_assign (tgt, conv_code, atmp); |
| gsi_insert_after (&gsi2, convert_stmt, GSI_SAME_STMT); |
| } |
| |
| gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp); |
| |
| if (dump_file) |
| { |
| fprintf (dump_file, "%d bit bswap implementation found at: ", |
| (int) n->range); |
| if (cur_stmt) |
| print_gimple_stmt (dump_file, cur_stmt, 0); |
| else |
| { |
| print_generic_expr (dump_file, tgt, TDF_NONE); |
| fprintf (dump_file, "\n"); |
| } |
| } |
| |
| if (cur_stmt) |
| { |
| if (mask_stmt) |
| gsi_insert_after (&gsi, mask_stmt, GSI_SAME_STMT); |
| gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT); |
| gsi_remove (&gsi, true); |
| } |
| else |
| { |
| gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT); |
| if (mask_stmt) |
| gsi_insert_before (&gsi, mask_stmt, GSI_SAME_STMT); |
| } |
| return tgt; |
| } |
| |
/* Try to optimize an assignment CUR_STMT with a CONSTRUCTOR on the rhs
using bswap optimizations. CDI_DOMINATORS needs to be
computed on entry. Return true if it has been optimized and
TODO_update_ssa is needed. */
| |
| static bool |
| maybe_optimize_vector_constructor (gimple *cur_stmt) |
| { |
| tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type; |
| struct symbolic_number n; |
| bool bswap; |
| |
| gcc_assert (is_gimple_assign (cur_stmt) |
| && gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR); |
| |
| tree rhs = gimple_assign_rhs1 (cur_stmt); |
| if (!VECTOR_TYPE_P (TREE_TYPE (rhs)) |
| || !INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs))) |
| || gimple_assign_lhs (cur_stmt) == NULL_TREE) |
| return false; |
| |
| HOST_WIDE_INT sz = int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT; |
| switch (sz) |
| { |
| case 16: |
| load_type = bswap_type = uint16_type_node; |
| break; |
| case 32: |
| if (builtin_decl_explicit_p (BUILT_IN_BSWAP32) |
| && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing) |
| { |
| load_type = uint32_type_node; |
| fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32); |
| bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); |
| } |
| else |
| return false; |
| break; |
| case 64: |
| if (builtin_decl_explicit_p (BUILT_IN_BSWAP64) |
| && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing |
| || (word_mode == SImode |
| && builtin_decl_explicit_p (BUILT_IN_BSWAP32) |
| && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing))) |
| { |
| load_type = uint64_type_node; |
| fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64); |
| bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); |
| } |
| else |
| return false; |
| break; |
| default: |
| return false; |
| } |
| |
| bool cast64_to_32; |
| uint64_t mask; |
| gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap, |
| &cast64_to_32, &mask); |
| if (!ins_stmt |
| || n.range != (unsigned HOST_WIDE_INT) sz |
| || cast64_to_32 |
| || mask != ~(uint64_t) 0) |
| return false; |
| |
| if (bswap && !fndecl && n.range != 16) |
| return false; |
| |
| memset (&nop_stats, 0, sizeof (nop_stats)); |
| memset (&bswap_stats, 0, sizeof (bswap_stats)); |
| return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, |
| bswap_type, load_type, &n, bswap, mask) != NULL_TREE; |
| } |
| |
/* Find manual byte swap implementations as well as loads in a given
endianness. Byte swaps are turned into a bswap builtin invocation
while endian loads are converted to a bswap builtin invocation or a
simple load according to the target endianness. */
| |
| unsigned int |
| pass_optimize_bswap::execute (function *fun) |
| { |
| basic_block bb; |
| bool bswap32_p, bswap64_p; |
| bool changed = false; |
| tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE; |
| |
| bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32) |
| && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing); |
| bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64) |
| && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing |
| || (bswap32_p && word_mode == SImode))); |
| |
| /* Determine the argument type of the builtins. The code later on |
| assumes that the return and argument type are the same. */ |
| if (bswap32_p) |
| { |
| tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32); |
| bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); |
| } |
| |
| if (bswap64_p) |
| { |
| tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64); |
| bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); |
| } |
| |
| memset (&nop_stats, 0, sizeof (nop_stats)); |
| memset (&bswap_stats, 0, sizeof (bswap_stats)); |
| calculate_dominance_info (CDI_DOMINATORS); |
| |
| FOR_EACH_BB_FN (bb, fun) |
| { |
| gimple_stmt_iterator gsi; |
| |
| /* We do a reverse scan for bswap patterns to make sure we get the |
| widest match. As bswap pattern matching doesn't handle previously |
| inserted smaller bswap replacements as sub-patterns, the wider |
| variant wouldn't be detected. */ |
| for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);) |
| { |
| gimple *ins_stmt, *cur_stmt = gsi_stmt (gsi); |
| tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type; |
| enum tree_code code; |
| struct symbolic_number n; |
| bool bswap, cast64_to_32; |
| uint64_t mask; |
| |
/* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
might be moved to a different basic block by bswap_replace and gsi
must not point to it if that's the case. Moving the gsi_prev
here makes sure that gsi points to the statement preceding
cur_stmt while still making sure that all statements are
considered in this basic block. */
| gsi_prev (&gsi); |
| |
| if (!is_gimple_assign (cur_stmt)) |
| continue; |
| |
| code = gimple_assign_rhs_code (cur_stmt); |
| switch (code) |
| { |
| case LROTATE_EXPR: |
| case RROTATE_EXPR: |
| if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt)) |
| || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt)) |
| % BITS_PER_UNIT) |
| continue; |
| /* Fall through. */ |
| case BIT_IOR_EXPR: |
| case BIT_XOR_EXPR: |
| case PLUS_EXPR: |
| break; |
| case CONSTRUCTOR: |
| { |
| tree rhs = gimple_assign_rhs1 (cur_stmt); |
| if (VECTOR_TYPE_P (TREE_TYPE (rhs)) |
| && INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs)))) |
| break; |
| } |
| continue; |
| default: |
| continue; |
| } |
| |
| ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap, |
| &cast64_to_32, &mask); |
| |
| if (!ins_stmt) |
| continue; |
| |
| switch (n.range) |
| { |
| case 16: |
| /* Already in canonical form, nothing to do. */ |
| if (code == LROTATE_EXPR || code == RROTATE_EXPR) |
| continue; |
| load_type = bswap_type = uint16_type_node; |
| break; |
| case 32: |
| load_type = uint32_type_node; |
| if (bswap32_p) |
| { |
| fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32); |
| bswap_type = bswap32_type; |
| } |
| break; |
| case 64: |
| load_type = uint64_type_node; |
| if (bswap64_p) |
| { |
| fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64); |
| bswap_type = bswap64_type; |
| } |
| break; |
| default: |
| continue; |
| } |
| |
| if (bswap && !fndecl && n.range != 16) |
| continue; |
| |
| if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, |
| bswap_type, load_type, &n, bswap, mask)) |
| changed = true; |
| } |
| } |
| |
| statistics_counter_event (fun, "16-bit nop implementations found", |
| nop_stats.found_16bit); |
| statistics_counter_event (fun, "32-bit nop implementations found", |
| nop_stats.found_32bit); |
| statistics_counter_event (fun, "64-bit nop implementations found", |
| nop_stats.found_64bit); |
| statistics_counter_event (fun, "16-bit bswap implementations found", |
| bswap_stats.found_16bit); |
| statistics_counter_event (fun, "32-bit bswap implementations found", |
| bswap_stats.found_32bit); |
| statistics_counter_event (fun, "64-bit bswap implementations found", |
| bswap_stats.found_64bit); |
| |
| return (changed ? TODO_update_ssa : 0); |
| } |
| |
| } // anon namespace |
| |
| gimple_opt_pass * |
| make_pass_optimize_bswap (gcc::context *ctxt) |
| { |
| return new pass_optimize_bswap (ctxt); |
| } |
| |
| namespace { |
| |
/* Struct recording one operand for the store, which is either a constant,
in which case VAL represents the constant and all the other fields are
zero, or a memory load, in which case VAL represents the reference,
BASE_ADDR is non-NULL and the other fields also reflect the memory load,
or an SSA name, in which case VAL represents the SSA name and all the
other fields are zero. */
| |
| class store_operand_info |
| { |
| public: |
| tree val; |
| tree base_addr; |
| poly_uint64 bitsize; |
| poly_uint64 bitpos; |
| poly_uint64 bitregion_start; |
| poly_uint64 bitregion_end; |
| gimple *stmt; |
| bool bit_not_p; |
| store_operand_info (); |
| }; |
| |
| store_operand_info::store_operand_info () |
| : val (NULL_TREE), base_addr (NULL_TREE), bitsize (0), bitpos (0), |
| bitregion_start (0), bitregion_end (0), stmt (NULL), bit_not_p (false) |
| { |
| } |
| |
| /* Struct recording the information about a single store of an immediate |
| to memory. These are created in the first phase and coalesced into |
| merged_store_group objects in the second phase. */ |
| |
| class store_immediate_info |
| { |
| public: |
| unsigned HOST_WIDE_INT bitsize; |
| unsigned HOST_WIDE_INT bitpos; |
| unsigned HOST_WIDE_INT bitregion_start; |
| /* This is one past the last bit of the bit region. */ |
| unsigned HOST_WIDE_INT bitregion_end; |
| gimple *stmt; |
| unsigned int order; |
| /* INTEGER_CST for constant store, STRING_CST for string store, |
| MEM_REF for memory copy, BIT_*_EXPR for logical bitwise operation, |
| BIT_INSERT_EXPR for bit insertion. |
| LROTATE_EXPR if it can be only bswap optimized and |
| ops are not really meaningful. |
| NOP_EXPR if bswap optimization detected identity, ops |
| are not meaningful. */ |
| enum tree_code rhs_code; |
| /* Two fields for bswap optimization purposes. */ |
| struct symbolic_number n; |
| gimple *ins_stmt; |
| /* True if BIT_{AND,IOR,XOR}_EXPR result is inverted before storing. */ |
| bool bit_not_p; |
| /* True if ops have been swapped and thus ops[1] represents |
| rhs1 of BIT_{AND,IOR,XOR}_EXPR and ops[0] represents rhs2. */ |
| bool ops_swapped_p; |
| /* The index number of the landing pad, or 0 if there is none. */ |
| int lp_nr; |
| /* Operands. For BIT_*_EXPR rhs_code both operands are used, otherwise |
| just the first one. */ |
| store_operand_info ops[2]; |
| store_immediate_info (unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT, |
| unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT, |
| gimple *, unsigned int, enum tree_code, |
| struct symbolic_number &, gimple *, bool, int, |
| const store_operand_info &, |
| const store_operand_info &); |
| }; |
| |
| store_immediate_info::store_immediate_info (unsigned HOST_WIDE_INT bs, |
| unsigned HOST_WIDE_INT bp, |
| unsigned HOST_WIDE_INT brs, |
| unsigned HOST_WIDE_INT bre, |
| gimple *st, |
| unsigned int ord, |
| enum tree_code rhscode, |
| struct symbolic_number &nr, |
| gimple *ins_stmtp, |
| bool bitnotp, |
| int nr2, |
| const store_operand_info &op0r, |
| const store_operand_info &op1r) |
| : bitsize (bs), bitpos (bp), bitregion_start (brs), bitregion_end (bre), |
| stmt (st), order (ord), rhs_code (rhscode), n (nr), |
| ins_stmt (ins_stmtp), bit_not_p (bitnotp), ops_swapped_p (false), |
| lp_nr (nr2), ops { op0r, op1r } |
| { |
| } |
| |
| /* Struct representing a group of stores to contiguous memory locations. |
| These are produced by the second phase (coalescing) and consumed in the |
| third phase that outputs the widened stores. */ |
| |
| class merged_store_group |
| { |
| public: |
| unsigned HOST_WIDE_INT start; |
| unsigned HOST_WIDE_INT width; |
| unsigned HOST_WIDE_INT bitregion_start; |
| unsigned HOST_WIDE_INT bitregion_end; |
| /* The size of the allocated memory for val and mask. */ |
| unsigned HOST_WIDE_INT buf_size; |
| unsigned HOST_WIDE_INT align_base; |
| poly_uint64 load_align_base[2]; |
| |
| unsigned int align; |
| unsigned int load_align[2]; |
| unsigned int first_order; |
| unsigned int last_order; |
| bool bit_insertion; |
| bool string_concatenation; |
| bool only_constants; |
| bool consecutive; |
| unsigned int first_nonmergeable_order; |
| int lp_nr; |
| |
| auto_vec<store_immediate_info *> stores; |
| /* We record the first and last original statements in the sequence because |
| we'll need their vuse/vdef and replacement position. It's easier to keep |
| track of them separately as 'stores' is reordered by apply_stores. */ |
| gimple *last_stmt; |
| gimple *first_stmt; |
| unsigned char *val; |
| unsigned char *mask; |
| |
| merged_store_group (store_immediate_info *); |
| ~merged_store_group (); |
| bool can_be_merged_into (store_immediate_info *); |
| void merge_into (store_immediate_info *); |
| void merge_overlapping (store_immediate_info *); |
| bool apply_stores (); |
| private: |
| void do_merge (store_immediate_info *); |
| }; |
| |
| /* Debug helper. Dump LEN elements of byte array PTR to FD in hex. */ |
| |
| static void |
| dump_char_array (FILE *fd, unsigned char *ptr, unsigned int len) |
| { |
| if (!fd) |
| return; |
| |
| for (unsigned int i = 0; i < len; i++) |
| fprintf (fd, "%02x ", ptr[i]); |
| fprintf (fd, "\n"); |
| } |
| |
/* Clear out LEN bits starting from bit START in the byte array
PTR. This clears the bits to the *right* of START.
START must be within [0, BITS_PER_UNIT) and counts starting from
the least significant bit. */
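
/* For instance, clear_bit_region_be (ptr, 6, 4) clears bits 6, 5, 4 and 3 of
ptr[0]. */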
| |
| static void |
| clear_bit_region_be (unsigned char *ptr, unsigned int start, |
| unsigned int len) |
| { |
| if (len == 0) |
| return; |
| /* Clear len bits to the right of start. */ |
| else if (len <= start + 1) |
| { |
| unsigned char mask = (~(~0U << len)); |
| mask = mask << (start + 1U - len); |
| ptr[0] &= ~mask; |
| } |
| else if (start != BITS_PER_UNIT - 1) |
| { |
| clear_bit_region_be (ptr, start, (start % BITS_PER_UNIT) + 1); |
| clear_bit_region_be (ptr + 1, BITS_PER_UNIT - 1, |
| len - (start % BITS_PER_UNIT) - 1); |
| } |
| else if (start == BITS_PER_UNIT - 1 |
| && len > BITS_PER_UNIT) |
| { |
| unsigned int nbytes = len / BITS_PER_UNIT; |
| memset (ptr, 0, nbytes); |
| if (len % BITS_PER_UNIT != 0) |
| clear_bit_region_be (ptr + nbytes, BITS_PER_UNIT - 1, |
| len % BITS_PER_UNIT); |
| } |
| else |
| gcc_unreachable (); |
| } |
| |
/* In the byte array PTR clear the bit region starting at bit
START that is LEN bits wide.
For regions spanning multiple bytes do this recursively until we reach
zero LEN or a region contained within a single byte. */
| |
| static void |
| clear_bit_region (unsigned char *ptr, unsigned int start, |
| unsigned int len) |
| { |
| /* Degenerate base case. */ |
| if (len == 0) |
| return; |
| else if (start >= BITS_PER_UNIT) |
| clear_bit_region (ptr + 1, start - BITS_PER_UNIT, len); |
| /* Second base case. */ |
| else if ((start + len) <= BITS_PER_UNIT) |
| { |
| unsigned char mask = (~0U) << (unsigned char) (BITS_PER_UNIT - len); |
| mask >>= BITS_PER_UNIT - (start + len); |
| |
| ptr[0] &= ~mask; |
| |
| return; |
| } |
| /* Clear most significant bits in a byte and proceed with the next byte. */ |
| else if (start != 0) |
| { |
| clear_bit_region (ptr, start, BITS_PER_UNIT - start); |
| clear_bit_region (ptr + 1, 0, len - (BITS_PER_UNIT - start)); |
| } |
| /* Whole bytes need to be cleared. */ |
| else if (start == 0 && len > BITS_PER_UNIT) |
| { |
| unsigned int nbytes = len / BITS_PER_UNIT; |
| /* We could recurse on each byte but we clear whole bytes, so a simple |
| memset will do. */ |
| memset (ptr, '\0', nbytes); |
| /* Clear the remaining sub-byte region if there is one. */ |
| if (len % BITS_PER_UNIT != 0) |
| clear_bit_region (ptr + nbytes, 0, len % BITS_PER_UNIT); |
| } |
| else |
| gcc_unreachable (); |
| } |
| |
| /* Write BITLEN bits of EXPR to the byte array PTR at |
| bit position BITPOS. PTR should contain TOTAL_BYTES elements. |
| Return true if the operation succeeded. */ |
| |
| static bool |
| encode_tree_to_bitpos (tree expr, unsigned char *ptr, int bitlen, int bitpos, |
| unsigned int total_bytes) |
| { |
| unsigned int first_byte = bitpos / BITS_PER_UNIT; |
| bool sub_byte_op_p = ((bitlen % BITS_PER_UNIT) |
| || (bitpos % BITS_PER_UNIT) |
| || !int_mode_for_size (bitlen, 0).exists ()); |
| bool empty_ctor_p |
| = (TREE_CODE (expr) == CONSTRUCTOR |
| && CONSTRUCTOR_NELTS (expr) == 0 |
| && TYPE_SIZE_UNIT (TREE_TYPE (expr)) |
| && tree_fits_uhwi_p (TYPE_SIZE_UNIT (TREE_TYPE (expr)))); |
| |
| if (!sub_byte_op_p) |
| { |
| if (first_byte >= total_bytes) |
| return false; |
| total_bytes -= first_byte; |
| if (empty_ctor_p) |
| { |
| unsigned HOST_WIDE_INT rhs_bytes |
| = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (expr))); |
| if (rhs_bytes > total_bytes) |
| return false; |
| memset (ptr + first_byte, '\0', rhs_bytes); |
| return true; |
| } |
| return native_encode_expr (expr, ptr + first_byte, total_bytes) != 0; |
| } |
| |
| /* LITTLE-ENDIAN |
| We are writing a non byte-sized quantity or at a position that is not |
| at a byte boundary. |
| |--------|--------|--------| ptr + first_byte |
| ^ ^ |
| xxx xxxxxxxx xxx< bp> |
| |______EXPR____| |
| |
| First native_encode_expr EXPR into a temporary buffer and shift each |
| byte in the buffer by 'bp' (carrying the bits over as necessary). |
| |00000000|00xxxxxx|xxxxxxxx| << bp = |000xxxxx|xxxxxxxx|xxx00000| |
| <------bitlen---->< bp> |
| Then we clear the destination bits: |
| |---00000|00000000|000-----| ptr + first_byte |
| <-------bitlen--->< bp> |
| |
     Finally we OR the bytes of the shifted EXPR into the cleared region:
| |---xxxxx||xxxxxxxx||xxx-----| ptr + first_byte. |
| |
| BIG-ENDIAN |
| We are writing a non byte-sized quantity or at a position that is not |
| at a byte boundary. |
| ptr + first_byte |--------|--------|--------| |
| ^ ^ |
| <bp >xxx xxxxxxxx xxx |
| |_____EXPR_____| |
| |
| First native_encode_expr EXPR into a temporary buffer and shift each |
     byte in the buffer to the right (carrying the bits over as necessary).
| We shift by as much as needed to align the most significant bit of EXPR |
| with bitpos: |
| |00xxxxxx|xxxxxxxx| >> 3 = |00000xxx|xxxxxxxx|xxxxx000| |
| <---bitlen----> <bp ><-----bitlen-----> |
| Then we clear the destination bits: |
| ptr + first_byte |-----000||00000000||00000---| |
| <bp ><-------bitlen-----> |
| |
     Finally we OR the bytes of the shifted EXPR into the cleared region:
| ptr + first_byte |---xxxxx||xxxxxxxx||xxx-----|. |
| The awkwardness comes from the fact that bitpos is counted from the |
| most significant bit of a byte. */ |
| |
| /* We must be dealing with fixed-size data at this point, since the |
| total size is also fixed. */ |
| unsigned int byte_size; |
| if (empty_ctor_p) |
| { |
| unsigned HOST_WIDE_INT rhs_bytes |
| = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (expr))); |
| if (rhs_bytes > total_bytes) |
| return false; |
| byte_size = rhs_bytes; |
| } |
| else |
| { |
| fixed_size_mode mode |
| = as_a <fixed_size_mode> (TYPE_MODE (TREE_TYPE (expr))); |
| byte_size |
| = mode == BLKmode |
| ? tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (expr))) |
| : GET_MODE_SIZE (mode); |
| } |
| /* Allocate an extra byte so that we have space to shift into. */ |
| byte_size++; |
| unsigned char *tmpbuf = XALLOCAVEC (unsigned char, byte_size); |
| memset (tmpbuf, '\0', byte_size); |
| /* The store detection code should only have allowed constants that are |
| accepted by native_encode_expr or empty ctors. */ |
| if (!empty_ctor_p |
| && native_encode_expr (expr, tmpbuf, byte_size - 1) == 0) |
| gcc_unreachable (); |
| |
| /* The native_encode_expr machinery uses TYPE_MODE to determine how many |
| bytes to write. This means it can write more than |
| ROUND_UP (bitlen, BITS_PER_UNIT) / BITS_PER_UNIT bytes (for example |
| write 8 bytes for a bitlen of 40). Skip the bytes that are not within |
| bitlen and zero out the bits that are not relevant as well (that may |
| contain a sign bit due to sign-extension). */ |
| unsigned int padding |
| = byte_size - ROUND_UP (bitlen, BITS_PER_UNIT) / BITS_PER_UNIT - 1; |
| /* On big-endian the padding is at the 'front' so just skip the initial |
| bytes. */ |
| if (BYTES_BIG_ENDIAN) |
| tmpbuf += padding; |
| |
| byte_size -= padding; |
| |
| if (bitlen % BITS_PER_UNIT != 0) |
| { |
| if (BYTES_BIG_ENDIAN) |
| clear_bit_region_be (tmpbuf, BITS_PER_UNIT - 1, |
| BITS_PER_UNIT - (bitlen % BITS_PER_UNIT)); |
| else |
| clear_bit_region (tmpbuf, bitlen, |
| byte_size * BITS_PER_UNIT - bitlen); |
| } |
  /* Left shifting relies on the last byte being zero if bitlen is
     a multiple of BITS_PER_UNIT, which might not hold if there are
     padding bytes.  */
| else if (!BYTES_BIG_ENDIAN) |
| tmpbuf[byte_size - 1] = '\0'; |
| |
  /* Clear the bit region in PTR into which the bits from TMPBUF will be
     inserted.  */
| if (BYTES_BIG_ENDIAN) |
| clear_bit_region_be (ptr + first_byte, |
| BITS_PER_UNIT - 1 - (bitpos % BITS_PER_UNIT), bitlen); |
| else |
| clear_bit_region (ptr + first_byte, bitpos % BITS_PER_UNIT, bitlen); |
| |
| int shift_amnt; |
| int bitlen_mod = bitlen % BITS_PER_UNIT; |
| int bitpos_mod = bitpos % BITS_PER_UNIT; |
| |
| bool skip_byte = false; |
| if (BYTES_BIG_ENDIAN) |
| { |
| /* BITPOS and BITLEN are exactly aligned and no shifting |
| is necessary. */ |
| if (bitpos_mod + bitlen_mod == BITS_PER_UNIT |
| || (bitpos_mod == 0 && bitlen_mod == 0)) |
| shift_amnt = 0; |
| /* |. . . . . . . .| |
| <bp > <blen >. |
| We always shift right for BYTES_BIG_ENDIAN so shift the beginning |
| of the value until it aligns with 'bp' in the next byte over. */ |
| else if (bitpos_mod + bitlen_mod < BITS_PER_UNIT) |
| { |
| shift_amnt = bitlen_mod + bitpos_mod; |
| skip_byte = bitlen_mod != 0; |
| } |
| /* |. . . . . . . .| |
| <----bp---> |
| <---blen---->. |
| Shift the value right within the same byte so it aligns with 'bp'. */ |
| else |
| shift_amnt = bitlen_mod + bitpos_mod - BITS_PER_UNIT; |
| } |
| else |
| shift_amnt = bitpos % BITS_PER_UNIT; |
| |
| /* Create the shifted version of EXPR. */ |
| if (!BYTES_BIG_ENDIAN) |
| { |
| shift_bytes_in_array_left (tmpbuf, byte_size, shift_amnt); |
| if (shift_amnt == 0) |
| byte_size--; |
| } |
| else |
| { |
| gcc_assert (BYTES_BIG_ENDIAN); |
| shift_bytes_in_array_right (tmpbuf, byte_size, shift_amnt); |
| /* If shifting right forced us to move into the next byte skip the now |
| empty byte. */ |
| if (skip_byte) |
| { |
| tmpbuf++; |
| byte_size--; |
| } |
| } |
| |
| /* Insert the bits from TMPBUF. */ |
| for (unsigned int i = 0; i < byte_size; i++) |
| ptr[first_byte + i] |= tmpbuf[i]; |
| |
| return true; |
| } |
| |
| /* Sorting function for store_immediate_info objects. |
   Sorts them by bit position.  */
| |
| static int |
| sort_by_bitpos (const void *x, const void *y) |
| { |
| store_immediate_info *const *tmp = (store_immediate_info * const *) x; |
| store_immediate_info *const *tmp2 = (store_immediate_info * const *) y; |
| |
| if ((*tmp)->bitpos < (*tmp2)->bitpos) |
| return -1; |
| else if ((*tmp)->bitpos > (*tmp2)->bitpos) |
| return 1; |
| else |
| /* If they are the same let's use the order which is guaranteed to |
| be different. */ |
| return (*tmp)->order - (*tmp2)->order; |
| } |
| |
| /* Sorting function for store_immediate_info objects. |
| Sorts them by the order field. */ |
| |
| static int |
| sort_by_order (const void *x, const void *y) |
| { |
| store_immediate_info *const *tmp = (store_immediate_info * const *) x; |
| store_immediate_info *const *tmp2 = (store_immediate_info * const *) y; |
| |
| if ((*tmp)->order < (*tmp2)->order) |
| return -1; |
| else if ((*tmp)->order > (*tmp2)->order) |
| return 1; |
| |
| gcc_unreachable (); |
| } |
| |
| /* Initialize a merged_store_group object from a store_immediate_info |
| object. */ |
| |
| merged_store_group::merged_store_group (store_immediate_info *info) |
| { |
| start = info->bitpos; |
| width = info->bitsize; |
| bitregion_start = info->bitregion_start; |
| bitregion_end = info->bitregion_end; |
| /* VAL has memory allocated for it in apply_stores once the group |
| width has been finalized. */ |
| val = NULL; |
| mask = NULL; |
| bit_insertion = info->rhs_code == BIT_INSERT_EXPR; |
| string_concatenation = info->rhs_code == STRING_CST; |
| only_constants = info->rhs_code == INTEGER_CST; |
| consecutive = true; |
| first_nonmergeable_order = ~0U; |
| lp_nr = info->lp_nr; |
| unsigned HOST_WIDE_INT align_bitpos = 0; |
| get_object_alignment_1 (gimple_assign_lhs (info->stmt), |
| &align, &align_bitpos); |
| align_base = start - align_bitpos; |
| for (int i = 0; i < 2; ++i) |
| { |
| store_operand_info &op = info->ops[i]; |
| if (op.base_addr == NULL_TREE) |
| { |
| load_align[i] = 0; |
| load_align_base[i] = 0; |
| } |
| else |
| { |
| get_object_alignment_1 (op.val, &load_align[i], &align_bitpos); |
| load_align_base[i] = op.bitpos - align_bitpos; |
| } |
| } |
| stores.create (1); |
| stores.safe_push (info); |
| last_stmt = info->stmt; |
| last_order = info->order; |
| first_stmt = last_stmt; |
| first_order = last_order; |
| buf_size = 0; |
| } |
| |
| merged_store_group::~merged_store_group () |
| { |
| if (val) |
| XDELETEVEC (val); |
| } |
| |
| /* Return true if the store described by INFO can be merged into the group. */ |
| |
| bool |
| merged_store_group::can_be_merged_into (store_immediate_info *info) |
| { |
| /* Do not merge bswap patterns. */ |
| if (info->rhs_code == LROTATE_EXPR) |
| return false; |
| |
| if (info->lp_nr != lp_nr) |
| return false; |
| |
| /* The canonical case. */ |
| if (info->rhs_code == stores[0]->rhs_code) |
| return true; |
| |
| /* BIT_INSERT_EXPR is compatible with INTEGER_CST if no STRING_CST. */ |
| if (info->rhs_code == BIT_INSERT_EXPR && stores[0]->rhs_code == INTEGER_CST) |
| return !string_concatenation; |
| |
| if (stores[0]->rhs_code == BIT_INSERT_EXPR && info->rhs_code == INTEGER_CST) |
| return !string_concatenation; |
| |
| /* We can turn MEM_REF into BIT_INSERT_EXPR for bit-field stores, but do it |
| only for small regions since this can generate a lot of instructions. */ |
| if (info->rhs_code == MEM_REF |
| && (stores[0]->rhs_code == INTEGER_CST |
| || stores[0]->rhs_code == BIT_INSERT_EXPR) |
| && info->bitregion_start == stores[0]->bitregion_start |
| && info->bitregion_end == stores[0]->bitregion_end |
| && info->bitregion_end - info->bitregion_start <= MAX_FIXED_MODE_SIZE) |
| return !string_concatenation; |
| |
| if (stores[0]->rhs_code == MEM_REF |
| && (info->rhs_code == INTEGER_CST |
| || info->rhs_code == BIT_INSERT_EXPR) |
| && info->bitregion_start == stores[0]->bitregion_start |
| && info->bitregion_end == stores[0]->bitregion_end |
| && info->bitregion_end - info->bitregion_start <= MAX_FIXED_MODE_SIZE) |
| return !string_concatenation; |
| |
| /* STRING_CST is compatible with INTEGER_CST if no BIT_INSERT_EXPR. */ |
| if (info->rhs_code == STRING_CST |
| && stores[0]->rhs_code == INTEGER_CST |
| && stores[0]->bitsize == CHAR_BIT) |
| return !bit_insertion; |
| |
| if (stores[0]->rhs_code == STRING_CST |
| && info->rhs_code == INTEGER_CST |
| && info->bitsize == CHAR_BIT) |
| return !bit_insertion; |
| |
| return false; |
| } |
| |
| /* Helper method for merge_into and merge_overlapping to do |
| the common part. */ |
| |
| void |
| merged_store_group::do_merge (store_immediate_info *info) |
| { |
| bitregion_start = MIN (bitregion_start, info->bitregion_start); |
| bitregion_end = MAX (bitregion_end, info->bitregion_end); |
| |
| unsigned int this_align; |
| unsigned HOST_WIDE_INT align_bitpos = 0; |
| get_object_alignment_1 (gimple_assign_lhs (info->stmt), |
| &this_align, &align_bitpos); |
| if (this_align > align) |
| { |
| align = this_align; |
| align_base = info->bitpos - align_bitpos; |
| } |
| for (int i = 0; i < 2; ++i) |
| { |
| store_operand_info &op = info->ops[i]; |
| if (!op.base_addr) |
| continue; |
| |
| get_object_alignment_1 (op.val, &this_align, &align_bitpos); |
| if (this_align > load_align[i]) |
| { |
| load_align[i] = this_align; |
| load_align_base[i] = op.bitpos - align_bitpos; |
| } |
| } |
| |
| gimple *stmt = info->stmt; |
| stores.safe_push (info); |
| if (info->order > last_order) |
| { |
| last_order = info->order; |
| last_stmt = stmt; |
| } |
| else if (info->order < first_order) |
| { |
| first_order = info->order; |
| first_stmt = stmt; |
| } |
| |
| if (info->bitpos != start + width) |
| consecutive = false; |
| |
| /* We need to use extraction if there is any bit-field. */ |
| if (info->rhs_code == BIT_INSERT_EXPR) |
| { |
| bit_insertion = true; |
| gcc_assert (!string_concatenation); |
| } |
| |
| /* We want to use concatenation if there is any string. */ |
| if (info->rhs_code == STRING_CST) |
| { |
| string_concatenation = true; |
| gcc_assert (!bit_insertion); |
| } |
| |
| /* But we cannot use it if we don't have consecutive stores. */ |
| if (!consecutive) |
| string_concatenation = false; |
| |
| if (info->rhs_code != INTEGER_CST) |
| only_constants = false; |
| } |
| |
| /* Merge a store recorded by INFO into this merged store. |
| The store is not overlapping with the existing recorded |
| stores. */ |
| |
| void |
| merged_store_group::merge_into (store_immediate_info *info) |
| { |
| do_merge (info); |
| |
| /* Make sure we're inserting in the position we think we're inserting. */ |
| gcc_assert (info->bitpos >= start + width |
| && info->bitregion_start <= bitregion_end); |
| |
| width = info->bitpos + info->bitsize - start; |
| } |
| |
| /* Merge a store described by INFO into this merged store. |
   INFO overlaps in some way with the current store (i.e. it is not
   contiguous with it; the contiguous case is handled by
   merged_store_group::merge_into).  */
| |
| void |
| merged_store_group::merge_overlapping (store_immediate_info *info) |
| { |
| do_merge (info); |
| |
| /* If the store extends the size of the group, extend the width. */ |
| if (info->bitpos + info->bitsize > start + width) |
| width = info->bitpos + info->bitsize - start; |
| } |
| |
| /* Go through all the recorded stores in this group in program order and |
| apply their values to the VAL byte array to create the final merged |
| value. Return true if the operation succeeded. */ |
| |
| bool |
| merged_store_group::apply_stores () |
| { |
| store_immediate_info *info; |
| unsigned int i; |
| |
| /* Make sure we have more than one store in the group, otherwise we cannot |
| merge anything. */ |
| if (bitregion_start % BITS_PER_UNIT != 0 |
| || bitregion_end % BITS_PER_UNIT != 0 |
| || stores.length () == 1) |
| return false; |
| |
| buf_size = (bitregion_end - bitregion_start) / BITS_PER_UNIT; |
| |
| /* Really do string concatenation for large strings only. */ |
| if (buf_size <= MOVE_MAX) |
| string_concatenation = false; |
| |
| /* Create a power-of-2-sized buffer for native_encode_expr. */ |
| if (!string_concatenation) |
| buf_size = 1 << ceil_log2 (buf_size); |
| |
| val = XNEWVEC (unsigned char, 2 * buf_size); |
| mask = val + buf_size; |
| memset (val, 0, buf_size); |
| memset (mask, ~0U, buf_size); |
| |
| stores.qsort (sort_by_order); |
| |
| FOR_EACH_VEC_ELT (stores, i, info) |
| { |
| unsigned int pos_in_buffer = info->bitpos - bitregion_start; |
| tree cst; |
| if (info->ops[0].val && info->ops[0].base_addr == NULL_TREE) |
| cst = info->ops[0].val; |
| else if (info->ops[1].val && info->ops[1].base_addr == NULL_TREE) |
| cst = info->ops[1].val; |
| else |
| cst = NULL_TREE; |
| bool ret = true; |
| if (cst && info->rhs_code != BIT_INSERT_EXPR) |
| ret = encode_tree_to_bitpos (cst, val, info->bitsize, pos_in_buffer, |
| buf_size); |
| unsigned char *m = mask + (pos_in_buffer / BITS_PER_UNIT); |
| if (BYTES_BIG_ENDIAN) |
| clear_bit_region_be (m, (BITS_PER_UNIT - 1 |
| - (pos_in_buffer % BITS_PER_UNIT)), |
| info->bitsize); |
| else |
| clear_bit_region (m, pos_in_buffer % BITS_PER_UNIT, info->bitsize); |
| if (cst && dump_file && (dump_flags & TDF_DETAILS)) |
| { |
| if (ret) |
| { |
| fputs ("After writing ", dump_file); |
| print_generic_expr (dump_file, cst, TDF_NONE); |
| fprintf (dump_file, " of size " HOST_WIDE_INT_PRINT_DEC |
| " at position %d\n", info->bitsize, pos_in_buffer); |
| fputs (" the merged value contains ", dump_file); |
| dump_char_array (dump_file, val, buf_size); |
| fputs (" the merged mask contains ", dump_file); |
| dump_char_array (dump_file, mask, buf_size); |
| if (bit_insertion) |
| fputs (" bit insertion is required\n", dump_file); |
| if (string_concatenation) |
| fputs (" string concatenation is required\n", dump_file); |
| } |
| else |
| fprintf (dump_file, "Failed to merge stores\n"); |
| } |
| if (!ret) |
| return false; |
| } |
| stores.qsort (sort_by_bitpos); |
| return true; |
| } |
| |
| /* Structure describing the store chain. */ |
| |
| class imm_store_chain_info |
| { |
| public: |
| /* Doubly-linked list that imposes an order on chain processing. |
| PNXP (prev's next pointer) points to the head of a list, or to |
| the next field in the previous chain in the list. |
| See pass_store_merging::m_stores_head for more rationale. */ |
| imm_store_chain_info *next, **pnxp; |
| tree base_addr; |
| auto_vec<store_immediate_info *> m_store_info; |
| auto_vec<merged_store_group *> m_merged_store_groups; |
| |
| imm_store_chain_info (imm_store_chain_info *&inspt, tree b_a) |
| : next (inspt), pnxp (&inspt), base_addr (b_a) |
| { |
| inspt = this; |
| if (next) |
| { |
| gcc_checking_assert (pnxp == next->pnxp); |
| next->pnxp = &next; |
| } |
| } |
| ~imm_store_chain_info () |
| { |
| *pnxp = next; |
| if (next) |
| { |
| gcc_checking_assert (&next == next->pnxp); |
| next->pnxp = pnxp; |
| } |
| } |
| bool terminate_and_process_chain (); |
| bool try_coalesce_bswap (merged_store_group *, unsigned int, unsigned int, |
| unsigned int); |
| bool coalesce_immediate_stores (); |
| bool output_merged_store (merged_store_group *); |
| bool output_merged_stores (); |
| }; |
| |
| const pass_data pass_data_tree_store_merging = { |
| GIMPLE_PASS, /* type */ |
| "store-merging", /* name */ |
| OPTGROUP_NONE, /* optinfo_flags */ |
| TV_GIMPLE_STORE_MERGING, /* tv_id */ |
| PROP_ssa, /* properties_required */ |
| 0, /* properties_provided */ |
| 0, /* properties_destroyed */ |
| 0, /* todo_flags_start */ |
| TODO_update_ssa, /* todo_flags_finish */ |
| }; |
| |
| class pass_store_merging : public gimple_opt_pass |
| { |
| public: |
| pass_store_merging (gcc::context *ctxt) |
| : gimple_opt_pass (pass_data_tree_store_merging, ctxt), m_stores_head (), |
| m_n_chains (0), m_n_stores (0) |
| { |
| } |
| |
| /* Pass not supported for PDP-endian, nor for insane hosts or |
| target character sizes where native_{encode,interpret}_expr |
| doesn't work properly. */ |
| virtual bool |
| gate (function *) |
| { |
| return flag_store_merging |
| && BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN |
| && CHAR_BIT == 8 |
| && BITS_PER_UNIT == 8; |
| } |
| |
| virtual unsigned int execute (function *); |
| |
| private: |
| hash_map<tree_operand_hash, class imm_store_chain_info *> m_stores; |
| |
| /* Form a doubly-linked stack of the elements of m_stores, so that |
| we can iterate over them in a predictable way. Using this order |
| avoids extraneous differences in the compiler output just because |
| of tree pointer variations (e.g. different chains end up in |
| different positions of m_stores, so they are handled in different |
| orders, so they allocate or release SSA names in different |
| orders, and when they get reused, subsequent passes end up |
| getting different SSA names, which may ultimately change |
| decisions when going out of SSA). */ |
| imm_store_chain_info *m_stores_head; |
| |
| /* The number of store chains currently tracked. */ |
| unsigned m_n_chains; |
| /* The number of stores currently tracked. */ |
| unsigned m_n_stores; |
| |
| bool process_store (gimple *); |
| bool terminate_and_process_chain (imm_store_chain_info *); |
| bool terminate_all_aliasing_chains (imm_store_chain_info **, gimple *); |
| bool terminate_and_process_all_chains (); |
| }; // class pass_store_merging |
| |
| /* Terminate and process all recorded chains. Return true if any changes |
| were made. */ |
| |
| bool |
| pass_store_merging::terminate_and_process_all_chains () |
| { |
| bool ret = false; |
| while (m_stores_head) |
| ret |= terminate_and_process_chain (m_stores_head); |
| gcc_assert (m_stores.is_empty ()); |
| return ret; |
| } |
| |
| /* Terminate all chains that are affected by the statement STMT. |
| CHAIN_INFO is the chain we should ignore from the checks if |
| non-NULL. Return true if any changes were made. */ |
| |
| bool |
| pass_store_merging::terminate_all_aliasing_chains (imm_store_chain_info |
| **chain_info, |
| gimple *stmt) |
| { |
| bool ret = false; |
| |
| /* If the statement doesn't touch memory it can't alias. */ |
| if (!gimple_vuse (stmt)) |
| return false; |
| |
| tree store_lhs = gimple_store_p (stmt) ? gimple_get_lhs (stmt) : NULL_TREE; |
| ao_ref store_lhs_ref; |
| ao_ref_init (&store_lhs_ref, store_lhs); |
| for (imm_store_chain_info *next = m_stores_head, *cur = next; cur; cur = next) |
| { |
| next = cur->next; |
| |
| /* We already checked all the stores in chain_info and terminated the |
| chain if necessary. Skip it here. */ |
| if (chain_info && *chain_info == cur) |
| continue; |
| |
| store_immediate_info *info; |
| unsigned int i; |
| FOR_EACH_VEC_ELT (cur->m_store_info, i, info) |
| { |
| tree lhs = gimple_assign_lhs (info->stmt); |
| ao_ref lhs_ref; |
| ao_ref_init (&lhs_ref, lhs); |
| if (ref_maybe_used_by_stmt_p (stmt, &lhs_ref) |
| || stmt_may_clobber_ref_p_1 (stmt, &lhs_ref) |
| || (store_lhs && refs_may_alias_p_1 (&store_lhs_ref, |
| &lhs_ref, false))) |
| { |
| if (dump_file && (dump_flags & TDF_DETAILS)) |
| { |
| fprintf (dump_file, "stmt causes chain termination:\n"); |
| print_gimple_stmt (dump_file, stmt, 0); |
| } |
| ret |= terminate_and_process_chain (cur); |
| break; |
| } |
| } |
| } |
| |
| return ret; |
| } |
| |
/* Helper function.  Terminate the chain recorded in CHAIN_INFO.
   Return true if the merging and output was successful.  The m_stores
   entry is removed after the processing in any case.  */
| |
| bool |
| pass_store_merging::terminate_and_process_chain (imm_store_chain_info *chain_info) |
| { |
| m_n_stores -= chain_info->m_store_info.length (); |
| m_n_chains--; |
| bool ret = chain_info->terminate_and_process_chain (); |
| m_stores.remove (chain_info->base_addr); |
| delete chain_info; |
| return ret; |
| } |
| |
| /* Return true if stmts in between FIRST (inclusive) and LAST (exclusive) |
| may clobber REF. FIRST and LAST must have non-NULL vdef. We want to |
   be able to sink a load of REF across stores between FIRST and LAST, up
| to right before LAST. */ |
| |
static bool
| stmts_may_clobber_ref_p (gimple *first, gimple *last, tree ref) |
| { |
| ao_ref r; |
| ao_ref_init (&r, ref); |
| unsigned int count = 0; |
| tree vop = gimple_vdef (last); |
| gimple *stmt; |
| |
| /* Return true conservatively if the basic blocks are different. */ |
| if (gimple_bb (first) != gimple_bb (last)) |
| return true; |
| |
| do |
| { |
| stmt = SSA_NAME_DEF_STMT (vop); |
| if (stmt_may_clobber_ref_p_1 (stmt, &r)) |
| return true; |
| if (gimple_store_p (stmt) |
| && refs_anti_dependent_p (ref, gimple_get_lhs (stmt))) |
| return true; |
| /* Avoid quadratic compile time by bounding the number of checks |
| we perform. */ |
| if (++count > MAX_STORE_ALIAS_CHECKS) |
| return true; |
| vop = gimple_vuse (stmt); |
| } |
| while (stmt != first); |
| |
| return false; |
| } |
| |
| /* Return true if INFO->ops[IDX] is mergeable with the |
| corresponding loads already in MERGED_STORE group. |
| BASE_ADDR is the base address of the whole store group. */ |
| |
static bool
| compatible_load_p (merged_store_group *merged_store, |
| store_immediate_info *info, |
| tree base_addr, int idx) |
| { |
| store_immediate_info *infof = merged_store->stores[0]; |
| if (!info->ops[idx].base_addr |
| || maybe_ne (info->ops[idx].bitpos - infof->ops[idx].bitpos, |
| info->bitpos - infof->bitpos) |
| || !operand_equal_p (info->ops[idx].base_addr, |
| infof->ops[idx].base_addr, 0)) |
| return false; |
| |
| store_immediate_info *infol = merged_store->stores.last (); |
| tree load_vuse = gimple_vuse (info->ops[idx].stmt); |
| /* In this case all vuses should be the same, e.g. |
| _1 = s.a; _2 = s.b; _3 = _1 | 1; t.a = _3; _4 = _2 | 2; t.b = _4; |
| or |
| _1 = s.a; _2 = s.b; t.a = _1; t.b = _2; |
| and we can emit the coalesced load next to any of those loads. */ |
| if (gimple_vuse (infof->ops[idx].stmt) == load_vuse |
| && gimple_vuse (infol->ops[idx].stmt) == load_vuse) |
| return true; |
| |
| /* Otherwise, at least for now require that the load has the same |
| vuse as the store. See following examples. */ |
| if (gimple_vuse (info->stmt) != load_vuse) |
| return false; |
| |
| if (gimple_vuse (infof->stmt) != gimple_vuse (infof->ops[idx].stmt) |
| || (infof != infol |
| && gimple_vuse (infol->stmt) != gimple_vuse (infol->ops[idx].stmt))) |
| return false; |
| |
| /* If the load is from the same location as the store, already |
| the construction of the immediate chain info guarantees no intervening |
| stores, so no further checks are needed. Example: |
| _1 = s.a; _2 = _1 & -7; s.a = _2; _3 = s.b; _4 = _3 & -7; s.b = _4; */ |
| if (known_eq (info->ops[idx].bitpos, info->bitpos) |
| && operand_equal_p (info->ops[idx].base_addr, base_addr, 0)) |
| return true; |
| |
| /* Otherwise, we need to punt if any of the loads can be clobbered by any |
| of the stores in the group, or any other stores in between those. |
| Previous calls to compatible_load_p ensured that for all the |
| merged_store->stores IDX loads, no stmts starting with |
| merged_store->first_stmt and ending right before merged_store->last_stmt |
     clobber those loads.  */
| gimple *first = merged_store->first_stmt; |
| gimple *last = merged_store->last_stmt; |
| /* The stores are sorted by increasing store bitpos, so if info->stmt store |
| comes before the so far first load, we'll be changing |
| merged_store->first_stmt. In that case we need to give up if |
     any of the earlier processed loads are clobbered by the stmts in the
     new range.  */
| if (info->order < merged_store->first_order) |
| { |
| for (store_immediate_info *infoc : merged_store->stores) |
| if (stmts_may_clobber_ref_p (info->stmt, first, infoc->ops[idx].val)) |
| return false; |
| first = info->stmt; |
| } |
| /* Similarly, we could change merged_store->last_stmt, so ensure |
| in that case no stmts in the new range clobber any of the earlier |
| processed loads. */ |
| else if (info->order > merged_store->last_order) |
| { |
| for (store_immediate_info *infoc : merged_store->stores) |
| if (stmts_may_clobber_ref_p (last, info->stmt, infoc->ops[idx].val)) |
| return false; |
| last = info->stmt; |
| } |
  /* And finally, we'd be adding a new load to the set, so ensure it isn't
     clobbered in the new range.  */
| if (stmts_may_clobber_ref_p (first, last, info->ops[idx].val)) |
| return false; |
| |
| /* Otherwise, we are looking for: |
| _1 = s.a; _2 = _1 ^ 15; t.a = _2; _3 = s.b; _4 = _3 ^ 15; t.b = _4; |
| or |
| _1 = s.a; t.a = _1; _2 = s.b; t.b = _2; */ |
| return true; |
| } |
| |
/* Add to the REFS vector all the memory references loaded to compute VAL.  */
| |
static void
| gather_bswap_load_refs (vec<tree> *refs, tree val) |
| { |
| if (TREE_CODE (val) != SSA_NAME) |
| return; |
| |
| gimple *stmt = SSA_NAME_DEF_STMT (val); |
| if (!is_gimple_assign (stmt)) |
| return; |
| |
| if (gimple_assign_load_p (stmt)) |
| { |
| refs->safe_push (gimple_assign_rhs1 (stmt)); |
| return; |
| } |
| |
| switch (gimple_assign_rhs_class (stmt)) |
| { |
| case GIMPLE_BINARY_RHS: |
| gather_bswap_load_refs (refs, gimple_assign_rhs2 (stmt)); |
| /* FALLTHRU */ |
| case GIMPLE_UNARY_RHS: |
| gather_bswap_load_refs (refs, gimple_assign_rhs1 (stmt)); |
| break; |
| default: |
| gcc_unreachable (); |
| } |
| } |
| |
| /* Check if there are any stores in M_STORE_INFO after index I |
   (where M_STORE_INFO must be sorted by sort_by_bitpos) that overlap
   a potential group ending at bit position END and have their order
   smaller than LAST_ORDER.  ALL_INTEGER_CST_P is true if
| all the stores already merged and the one under consideration |
| have rhs_code of INTEGER_CST. Return true if there are no such stores. |
| Consider: |
| MEM[(long long int *)p_28] = 0; |
| MEM[(long long int *)p_28 + 8B] = 0; |
| MEM[(long long int *)p_28 + 16B] = 0; |
| MEM[(long long int *)p_28 + 24B] = 0; |
| _129 = (int) _130; |
| MEM[(int *)p_28 + 8B] = _129; |
| MEM[(int *)p_28].a = -1; |
| We already have |
| MEM[(long long int *)p_28] = 0; |
| MEM[(int *)p_28].a = -1; |
| stmts in the current group and need to consider if it is safe to |
| add MEM[(long long int *)p_28 + 8B] = 0; store into the same group. |
| There is an overlap between that store and the MEM[(int *)p_28 + 8B] = _129; |
| store though, so if we add the MEM[(long long int *)p_28 + 8B] = 0; |
| into the group and merging of those 3 stores is successful, merged |
| stmts will be emitted at the latest store from that group, i.e. |
| LAST_ORDER, which is the MEM[(int *)p_28].a = -1; store. |
| The MEM[(int *)p_28 + 8B] = _129; store that originally follows |
| the MEM[(long long int *)p_28 + 8B] = 0; would now be before it, |
| so we need to refuse merging MEM[(long long int *)p_28 + 8B] = 0; |
| into the group. That way it will be its own store group and will |
| not be touched. If ALL_INTEGER_CST_P and there are overlapping |
| INTEGER_CST stores, those are mergeable using merge_overlapping, |
| so don't return false for those. |
| |
| Similarly, check stores from FIRST_EARLIER (inclusive) to END_EARLIER |
| (exclusive), whether they don't overlap the bitrange START to END |
| and have order in between FIRST_ORDER and LAST_ORDER. This is to |
| prevent merging in cases like: |
| MEM <char[12]> [&b + 8B] = {}; |
| MEM[(short *) &b] = 5; |
| _5 = *x_4(D); |
| MEM <long long unsigned int> [&b + 2B] = _5; |
| MEM[(char *)&b + 16B] = 88; |
| MEM[(int *)&b + 20B] = 1; |
| The = {} store comes in sort_by_bitpos before the = 88 store, and can't |
| be merged with it, because the = _5 store overlaps these and is in between |
| them in sort_by_order ordering. If it was merged, the merged store would |
| go after the = _5 store and thus change behavior. */ |
| |
| static bool |
| check_no_overlap (const vec<store_immediate_info *> &m_store_info, |
| unsigned int i, |
| bool all_integer_cst_p, unsigned int first_order, |
| unsigned int last_order, unsigned HOST_WIDE_INT start, |
| unsigned HOST_WIDE_INT end, unsigned int first_earlier, |
| unsigned end_earlier) |
| { |
| unsigned int len = m_store_info.length (); |
| for (unsigned int j = first_earlier; j < end_earlier; j++) |
| { |
| store_immediate_info *info = m_store_info[j]; |
| if (info->order > first_order |
| && info->order < last_order |
| && info->bitpos + info->bitsize > start) |
| return false; |
| } |
| for (++i; i < len; ++i) |
| { |
| store_immediate_info *info = m_store_info[i]; |
| if (info->bitpos >= end) |
| break; |
| if (info->order < last_order |
| && (!all_integer_cst_p || info->rhs_code != INTEGER_CST)) |
| return false; |
| } |
| return true; |
| } |
| |
/* Return true if m_store_info[first] and at least one following store
   form a group which stores a try_size-bit value that is byte swapped
   from a memory load or from some value, or is the identity of some value.
   This uses the bswap pass APIs.  */
| |
| bool |
| imm_store_chain_info::try_coalesce_bswap (merged_store_group *merged_store, |
| unsigned int first, |
| unsigned int try_size, |
| unsigned int first_earlier) |
| { |
| unsigned int len = m_store_info.length (), last = first; |
| unsigned HOST_WIDE_INT width = m_store_info[first]->bitsize; |
| if (width >= try_size) |
| return false; |
| for (unsigned int i = first + 1; i < len; ++i) |
| { |
| if (m_store_info[i]->bitpos != m_store_info[first]->bitpos + width |
| || m_store_info[i]->lp_nr != merged_store->lp_nr |
| || m_store_info[i]->ins_stmt == NULL) |
| return false; |
| width += m_store_info[i]->bitsize; |
| if (width >= try_size) |
| { |
| last = i; |
| break; |
| } |
| } |
| if (width != try_size) |
| return false; |
| |
| bool allow_unaligned |
| = !STRICT_ALIGNMENT && param_store_merging_allow_unaligned; |
| /* Punt if the combined store would not be aligned and we need alignment. */ |
| if (!allow_unaligned) |
| { |
| unsigned int align = merged_store->align; |
| unsigned HOST_WIDE_INT align_base = merged_store->align_base; |
| for (unsigned int i = first + 1; i <= last; ++i) |
| { |
| unsigned int this_align; |
| unsigned HOST_WIDE_INT align_bitpos = 0; |
| get_object_alignment_1 (gimple_assign_lhs (m_store_info[i]->stmt), |
| &this_align, &align_bitpos); |
| if (this_align > align) |
| { |
| align = this_align; |
| align_base = m_store_info[i]->bitpos - align_bitpos; |
| } |
| } |
| unsigned HOST_WIDE_INT align_bitpos |
| = (m_store_info[first]->bitpos - align_base) & (align - 1); |
| if (align_bitpos) |
| align = least_bit_hwi (align_bitpos); |
| if (align < try_size) |
| return false; |
| } |
| |
| tree type; |
| switch (try_size) |
| { |
| case 16: type = uint16_type_node; break; |
| case 32: type = uint32_type_node; break; |
| case 64: type = uint64_type_node; break; |
| default: gcc_unreachable (); |
| } |
| struct symbolic_number n; |
| gimple *ins_stmt = NULL; |
| int vuse_store = -1; |
| unsigned int first_order = merged_store->first_order; |
| unsigned int last_order = merged_store->last_order; |
| gimple *first_stmt = merged_store->first_stmt; |
| gimple *last_stmt = merged_store->last_stmt; |
| unsigned HOST_WIDE_INT end = merged_store->start + merged_store->width; |
| store_immediate_info *infof = m_store_info[first]; |
| |
| for (unsigned int i = first; i <= last; ++i) |
| { |
| store_immediate_info *info = m_store_info[i]; |
| struct symbolic_number this_n = info->n; |
| this_n.type = type; |
| if (!this_n.base_addr) |
| this_n.range = try_size / BITS_PER_UNIT; |
| else |
        /* Update vuse in case it has been changed by output_merged_stores.  */
| this_n.vuse = gimple_vuse (info->ins_stmt); |
| unsigned int bitpos = info->bitpos - infof->bitpos; |
| if (!do_shift_rotate (LSHIFT_EXPR, &this_n, |
| BYTES_BIG_ENDIAN |
| ? try_size - info->bitsize - bitpos |
| : bitpos)) |
| return false; |
| if (this_n.base_addr && vuse_store) |
| { |
| unsigned int j; |
| for (j = first; j <= last; ++j) |
| if (this_n.vuse == gimple_vuse (m_store_info[j]->stmt)) |
| break; |
| if (j > last) |
| { |
| if (vuse_store == 1) |
| return false; |
| vuse_store = 0; |
| } |
| } |
| if (i == first) |
| { |
| n = this_n; |
| ins_stmt = info->ins_stmt; |
| } |
| else |
| { |
| if (n.base_addr && n.vuse != this_n.vuse) |
| { |
| if (vuse_store == 0) |
| return false; |
| vuse_store = 1; |
| } |
| if (info->order > last_order) |
| { |
| last_order = info->order; |
| last_stmt = info->stmt; |
| } |
| else if (info->order < first_order) |
| { |
| first_order = info->order; |
| first_stmt = info->stmt; |
| } |
| end = MAX (end, info->bitpos + info->bitsize); |
| |
| ins_stmt = perform_symbolic_merge (ins_stmt, &n, info->ins_stmt, |
| &this_n, &n, BIT_IOR_EXPR); |
| if (ins_stmt == NULL) |
| return false; |
| } |
| } |
| |
| uint64_t cmpxchg, cmpnop; |
| bool cast64_to_32; |
| find_bswap_or_nop_finalize (&n, &cmpxchg, &cmpnop, &cast64_to_32); |
| |
  /* A complete byte swap should make the symbolic number start with
     the largest digit in the highest order byte.  An unchanged symbolic
     number indicates a read with the same endianness as the target
     architecture.  */
| if (n.n != cmpnop && n.n != cmpxchg) |
| return false; |
| |
| /* For now. */ |
| if (cast64_to_32) |
| return false; |
| |
| if (n.base_addr == NULL_TREE && !is_gimple_val (n.src)) |
| return false; |
| |
| if (!check_no_overlap (m_store_info, last, false, first_order, last_order, |
| merged_store->start, end, first_earlier, first)) |
| return false; |
| |
| /* Don't handle memory copy this way if normal non-bswap processing |
| would handle it too. */ |
| if (n.n == cmpnop && (unsigned) n.n_ops == last - first + 1) |
| { |
| unsigned int i; |
| for (i = first; i <= last; ++i) |
| if (m_store_info[i]->rhs_code != MEM_REF) |
| break; |
| if (i == last + 1) |
| return false; |
| } |
| |
| if (n.n == cmpxchg) |
| switch (try_size) |
| { |
| case 16: |
| /* Will emit LROTATE_EXPR. */ |
| break; |
| case 32: |
| if (builtin_decl_explicit_p (BUILT_IN_BSWAP32) |
| && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing) |
| break; |
| return false; |
| case 64: |
| if (builtin_decl_explicit_p (BUILT_IN_BSWAP64) |
| && optab_handler (bswap_optab, DImode) != CODE_FOR_nothing) |
| break; |
| return false; |
| default: |
| gcc_unreachable (); |
| } |
| |
| if (!allow_unaligned && n.base_addr) |
| { |
| unsigned int align = get_object_alignment (n.src); |
| if (align < try_size) |
| return false; |
| } |
| |
| /* If each load has vuse of the corresponding store, need to verify |
| the loads can be sunk right before the last store. */ |
| if (vuse_store == 1) |
| { |
| auto_vec<tree, 64> refs; |
| for (unsigned int i = first; i <= last; ++i) |
| gather_bswap_load_refs (&refs, |
| gimple_assign_rhs1 (m_store_info[i]->stmt)); |
| |
| for (tree ref : refs) |
| if (stmts_may_clobber_ref_p (first_stmt, last_stmt, ref)) |
| return false; |
| n.vuse = NULL_TREE; |
| } |
| |
| infof->n = n; |
| infof->ins_stmt = ins_stmt; |
| for (unsigned int i = first; i <= last; ++i) |
| { |
| m_store_info[i]->rhs_code = n.n == cmpxchg ? LROTATE_EXPR : NOP_EXPR; |
| m_store_info[i]->ops[0].base_addr = NULL_TREE; |
| m_store_info[i]->ops[1].base_addr = NULL_TREE; |
| if (i != first) |
| merged_store->merge_into (m_store_info[i]); |
| } |
| |
| return true; |
| } |
| |
| /* Go through the candidate stores recorded in m_store_info and merge them |
| into merged_store_group objects recorded into m_merged_store_groups |
| representing the widened stores. Return true if coalescing was successful |
| and the number of widened stores is fewer than the original number |
| of stores. */ |
| |
| bool |
| imm_store_chain_info::coalesce_immediate_stores () |
| { |
| /* Anything less can't be processed. */ |
| if (m_store_info.length () < 2) |
| return false; |
| |
| if (dump_file && (dump_flags & TDF_DETAILS)) |
| fprintf (dump_file, "Attempting to coalesce %u stores in chain\n", |
| m_store_info.length ()); |
| |
| store_immediate_info *info; |
| unsigned int i, ignore = 0; |
| unsigned int first_earlier = 0; |
| unsigned int end_earlier = 0; |
| |
| /* Order the stores by the bitposition they write to. */ |
| m_store_info.qsort (sort_by_bitpos); |
| |
| info = m_store_info[0]; |
| merged_store_group *merged_store = new merged_store_group (info); |
| if (dump_file && (dump_flags & TDF_DETAILS)) |
| fputs ("New store group\n", dump_file); |
| |
| FOR_EACH_VEC_ELT (m_store_info, i, info) |
| { |
| unsigned HOST_WIDE_INT new_bitregion_start, new_bitregion_end; |
| |
| if (i <= ignore) |
| goto done; |
| |
| while (first_earlier < end_earlier |
| && (m_store_info[first_earlier]->bitpos |
| + m_store_info[first_earlier]->bitsize |
| <= merged_store->start)) |
| first_earlier++; |
| |
| /* First try to handle group of stores like: |
| p[0] = data >> 24; |
| p[1] = data >> 16; |
| p[2] = data >> 8; |
| p[3] = data; |
| using the bswap framework. */ |
| if (info->bitpos == merged_store->start + merged_store->width |
| && merged_store->stores.length () == 1 |
| && merged_store->stores[0]->ins_stmt != NULL |
| && info->lp_nr == merged_store->lp_nr |
| && info->ins_stmt != NULL) |
| { |
| unsigned int try_size; |
| for (try_size = 64; try_size >= 16; try_size >>= 1) |
| if (try_coalesce_bswap (merged_store, i - 1, try_size, |
| first_earlier)) |
| break; |
| |
| if (try_size >= 16) |
| { |
| ignore = i + merged_store->stores.length () - 1; |
| m_merged_store_groups.safe_push (merged_store); |
| if (ignore < m_store_info.length ()) |
| { |
| merged_store = new merged_store_group (m_store_info[ignore]); |
| end_earlier = ignore; |
| } |
| else |
| merged_store = NULL; |
| goto done; |
| } |
| } |
| |
| new_bitregion_start |
| = MIN (merged_store->bitregion_start, info->bitregion_start); |
| new_bitregion_end |
| = MAX (merged_store->bitregion_end, info->bitregion_end); |
| |
| if (info->order >= merged_store->first_nonmergeable_order |
| || (((new_bitregion_end - new_bitregion_start + 1) / BITS_PER_UNIT) |
| > (unsigned) param_store_merging_max_size)) |
|