/* brig-basic-inst-handler.cc -- brig basic instruction handling
Copyright (C) 2016-2020 Free Software Foundation, Inc.
Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
for General Processor Tech.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#include <sstream>
#include "brig-code-entry-handler.h"
#include "brig-util.h"
#include "errors.h"
#include "gimple-expr.h"
#include "convert.h"
#include "print-tree.h"
#include "tree-pretty-print.h"
#include "langhooks.h"
#include "stor-layout.h"
#include "diagnostic-core.h"
#include "brig-builtins.h"
#include "fold-const.h"
brig_basic_inst_handler::brig_basic_inst_handler (brig_to_generic &parent)
: brig_code_entry_handler (parent)
{
}
class scalarized_sat_arithmetics : public tree_element_binary_visitor
{
public:
scalarized_sat_arithmetics (const BrigInstBase &brig_inst)
: m_brig_inst (brig_inst)
{
BrigType16_t element_type = brig_inst.type & BRIG_TYPE_BASE_MASK;
#undef DEF_HSAIL_SAT_BUILTIN
#undef DEF_HSAIL_BUILTIN
#undef DEF_HSAIL_ATOMIC_BUILTIN
#undef DEF_HSAIL_INTR_BUILTIN
#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN
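/* brig-builtins.def expands the macro below into an if/else chain
that picks the builtin matching this instruction's opcode and
element type; the dangling else at the end of the chain falls
through to the gcc_unreachable below when nothing matches. */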
#define DEF_HSAIL_SAT_BUILTIN(ENUM, BRIG_OPCODE, HSAIL_TYPE, \
NAME, TYPE, ATTRS) \
if (brig_inst.opcode == BRIG_OPCODE && element_type == HSAIL_TYPE) \
m_builtin = builtin_decl_explicit (ENUM); \
else
#include "brig-builtins.def"
gcc_unreachable ();
}
virtual tree
visit_element (brig_code_entry_handler &, tree operand0, tree operand1)
{
/* Implement saturating arithmetic with scalar built-ins for now.
TODO: emit GENERIC nodes for the simplest cases, or at least
emit vector built-ins. */
return call_builtin (m_builtin, 2, TREE_TYPE (operand0),
TREE_TYPE (operand0), operand0,
TREE_TYPE (operand1), operand1);
}
const BrigInstBase &m_brig_inst;
tree m_builtin;
};
/* Implements a vector shuffle. ARITH_TYPE is the type of the vector,
OPERANDS[0] is the first vector, OPERANDS[1] the second vector and
OPERANDS[2] the shuffle mask in HSAIL format. The output is a VEC_PERM_EXPR
that implements the shuffle as a GENERIC expression. */
tree
brig_basic_inst_handler::build_shuffle (tree arith_type,
tree_stl_vec &operands)
{
tree element_type
= get_unsigned_int_type (TREE_TYPE (TREE_TYPE (operands[0])));
/* Offsets to add to the mask values to convert from the
HSAIL mask to VEC_PERM_EXPR masks. A VEC_PERM_EXPR mask
uses indices spanning from 0 to two times the vector
width, while HSAIL refers separately to two different
input vectors; an HSAIL shuffle is thus not a "full shuffle"
where all output elements can originate from any input element. */
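/* For illustration: with 4-element vectors, the two low output
elements come from the first input and the two high ones from the
second, so the offsets added below are {0, 0, 4, 4}, mapping the
HSAIL selectors into the 0..7 index space of VEC_PERM_EXPR. */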
vec<constructor_elt, va_gc> *mask_offset_vals = NULL;
unsigned int element_count = gccbrig_type_vector_subparts (arith_type);
vec<constructor_elt, va_gc> *input_mask_vals = NULL;
size_t input_mask_element_size = exact_log2 (element_count);
/* Unpack the tightly packed mask elements to BIT_FIELD_REFs
from which to construct the mask vector as understood by
VEC_PERM_EXPR. */
tree mask_operand
= m_parent.m_cf->add_temp_var ("shuffle_mask", operands[2]);
tree mask_element_type
= build_nonstandard_integer_type (input_mask_element_size, true);
for (size_t i = 0; i < element_count; ++i)
{
tree mask_element
= build3 (BIT_FIELD_REF, mask_element_type, mask_operand,
bitsize_int (input_mask_element_size),
bitsize_int (i * input_mask_element_size));
mask_element = convert (element_type, mask_element);
tree offset;
if (i < element_count / 2)
offset = build_int_cst (element_type, 0);
else
offset = build_int_cst (element_type, element_count);
CONSTRUCTOR_APPEND_ELT (mask_offset_vals, NULL_TREE, offset);
CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);
}
tree mask_vec_type = build_vector_type (element_type, element_count);
tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);
tree offset_vec = build_constructor (mask_vec_type, mask_offset_vals);
tree mask = build2 (PLUS_EXPR, mask_vec_type, mask_vec, offset_vec);
tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
operands[1], mask);
return perm;
}
/* Unpacks (extracts) a scalar element with an index in OPERANDS[1]
from the vector expression in OPERANDS[0]. */
tree
brig_basic_inst_handler::build_unpack (tree_stl_vec &operands)
{
/* Implement the unpack with a shuffle that moves the unpacked
element to the lowest element position in the dest. After that
a bitwise AND is used to clear the uppermost bits. */
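/* For example, when unpacking element 2 of a 4-element vector, the
shuffle mask built below is {2, 0, 0, 0}, moving element 2 to the
lowest lane, and the AND mask is {-1, 0, 0, 0}, clearing the other
lanes before the vector is viewed as one wide integer. */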
tree src_element_type = TREE_TYPE (TREE_TYPE (operands[0]));
/* Perform the operations with a raw (unsigned int type) type. */
tree element_type = get_unsigned_int_type (src_element_type);
vec<constructor_elt, va_gc> *input_mask_vals = NULL;
vec<constructor_elt, va_gc> *and_mask_vals = NULL;
size_t element_count
= gccbrig_type_vector_subparts (TREE_TYPE (operands[0]));
tree vec_type = build_vector_type (element_type, element_count);
for (size_t i = 0; i < element_count; ++i)
{
tree mask_element;
if (i == 0)
mask_element = convert (element_type, operands[1]);
else
mask_element = build_int_cst (element_type, 0);
CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);
tree and_mask_element;
if (i == 0)
and_mask_element = build_int_cst (element_type, -1);
else
and_mask_element = build_int_cst (element_type, 0);
CONSTRUCTOR_APPEND_ELT (and_mask_vals, NULL_TREE, and_mask_element);
}
tree mask_vec = build_constructor (vec_type, input_mask_vals);
tree and_mask_vec = build_constructor (vec_type, and_mask_vals);
tree perm = build3 (VEC_PERM_EXPR, vec_type,
build_resize_convert_view (vec_type, operands[0]),
build_resize_convert_view (vec_type, operands[0]),
mask_vec);
tree cleared = build2 (BIT_AND_EXPR, vec_type, perm, and_mask_vec);
size_t s = int_size_in_bytes (TREE_TYPE (cleared)) * BITS_PER_UNIT;
tree raw_type = build_nonstandard_integer_type (s, true);
tree as_int = build_resize_convert_view (raw_type, cleared);
if (int_size_in_bytes (src_element_type) < 4
&& INTEGRAL_TYPE_P (src_element_type))
return extend_int (as_int, uint32_type_node, src_element_type);
return as_int;
}
/* Packs (inserts) a scalar element in OPERANDS[1]
into the vector in OPERANDS[0] at the element position defined by
OPERANDS[2]. */
tree
brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
{
/* Implement using a bit-level insertion.
TODO: Reuse this for implementing 'bitinsert'
without a builtin call. */
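/* For example, inserting a scalar into element 2 of a vector of 8b
elements viewed as a 32b integer: bitoffset = 8 * 2 and the result
is (src & ~(0xff << 16)) | ((scalar & 0xff) << 16). */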
size_t ecount = gccbrig_type_vector_subparts (TREE_TYPE (operands[0]));
size_t vecsize = int_size_in_bytes (TREE_TYPE (operands[0])) * BITS_PER_UNIT;
tree wide_type = build_nonstandard_integer_type (vecsize, 1);
tree src_vect = build_resize_convert_view (wide_type, operands[0]);
src_vect = m_parent.m_cf->add_temp_var ("src_vect", src_vect);
tree scalar = operands[1];
scalar = m_parent.m_cf->add_temp_var ("scalar",
convert_to_integer (wide_type, scalar));
tree pos = operands[2];
/* The upper bits of the position can contain garbage.
Zero them for well-defined semantics. */
tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2],
build_int_cstu (TREE_TYPE (pos), ecount - 1));
pos = m_parent.m_cf->add_temp_var ("pos", convert (wide_type, t));
tree element_type = TREE_TYPE (TREE_TYPE (operands[0]));
size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT;
tree ewidth = build_int_cstu (wide_type, element_width);
tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos);
bitoffset = m_parent.m_cf->add_temp_var ("offset", bitoffset);
uint64_t mask_int
= element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1;
tree mask = build_int_cstu (wide_type, mask_int);
mask = m_parent.m_cf->add_temp_var ("mask",
convert_to_integer (wide_type, mask));
tree clearing_mask
= build1 (BIT_NOT_EXPR, wide_type,
build2 (LSHIFT_EXPR, wide_type, mask, bitoffset));
tree zeroed_element
= build2 (BIT_AND_EXPR, wide_type, src_vect, clearing_mask);
/* TODO: Is the AND necessary? Does HSA define what
happens if the upper bits of the inserted element are not
zero? */
tree element_in_position
= build2 (LSHIFT_EXPR, wide_type,
build2 (BIT_AND_EXPR, wide_type, scalar, mask), bitoffset);
tree inserted
= build2 (BIT_IOR_EXPR, wide_type, zeroed_element, element_in_position);
return inserted;
}
/* Implements unpack{lo,hi}. BRIG_OPCODE tells which one and
ARITH_TYPE describes the type of the vector arithmetic.
OPERANDS[0] and OPERANDS[1] are the input vectors. */
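/* For example, with 4-element inputs a and b, UNPACKLO interleaves
the low halves into {a0, b0, a1, b1} via the mask {0, 4, 1, 5},
while UNPACKHI produces {a2, b2, a3, b3} via {2, 6, 3, 7}. */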
tree
brig_basic_inst_handler::build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode,
tree arith_type,
tree_stl_vec &operands)
{
tree element_type = get_unsigned_int_type (TREE_TYPE (arith_type));
tree mask_vec_type
= build_vector_type (element_type,
gccbrig_type_vector_subparts (arith_type));
size_t element_count = gccbrig_type_vector_subparts (arith_type);
vec<constructor_elt, va_gc> *input_mask_vals = NULL;
size_t offset = (brig_opcode == BRIG_OPCODE_UNPACKLO) ? 0 : element_count / 2;
for (size_t i = 0; i < element_count / 2; ++i)
{
CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
build_int_cst (element_type, offset + i));
CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
build_int_cst (element_type,
offset + i + element_count));
}
tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);
tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
operands[1], mask_vec);
return perm;
}
/* Builds a basic instruction expression from a BRIG instruction. BRIG_OPCODE
is the opcode, BRIG_TYPE the brig type of the instruction, ARITH_TYPE the
desired tree type for the instruction, and OPERANDS the instruction's
input operands already converted to tree nodes. */
tree
brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
BrigType16_t brig_type,
tree arith_type,
tree_stl_vec &operands)
{
tree_code opcode
= brig_function::get_tree_code_for_hsa_opcode (brig_opcode, brig_type);
BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK;
tree instr_inner_type
= VECTOR_TYPE_P (arith_type) ? TREE_TYPE (arith_type) : arith_type;
if (opcode == RSHIFT_EXPR || opcode == LSHIFT_EXPR)
{
/* HSA defines modulo/clipping behavior for shift amounts larger
than the bit width, while tree.def leaves it undefined.
We need to mask the upper bits to ensure the defined behavior. */
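/* For example, a 32b shift by 35 must behave like a shift by
35 & 31 = 3, which the AND with the bit-width-minus-one mask
below implements. */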
tree scalar_mask
= build_int_cst (instr_inner_type,
gccbrig_hsa_type_bit_size (inner_type) - 1);
tree mask = VECTOR_TYPE_P (arith_type)
? build_vector_from_val (arith_type, scalar_mask)
: scalar_mask;
/* The shift amount is a scalar, broadcast it to produce
a vector shift. */
if (VECTOR_TYPE_P (arith_type))
operands[1] = build_vector_from_val (arith_type, operands[1]);
operands[1] = build2 (BIT_AND_EXPR, arith_type, operands[1], mask);
}
size_t input_count = operands.size ();
size_t output_count
= gccbrig_hsa_opcode_op_output_p (brig_opcode, 0) ? 1 : 0;
if (opcode == TREE_LIST)
{
/* There was no direct GENERIC opcode for the instruction;
try to emulate it with a chain of GENERIC nodes. */
if (brig_opcode == BRIG_OPCODE_MAD || brig_opcode == BRIG_OPCODE_MAD24)
{
/* There doesn't seem to be a "standard" MAD built-in in gcc, so let's
use a chain of multiply + add for now (double rounding method).
It should be easier for optimizers to handle than a custom built-in
call. WIDEN_MULT_EXPR is close, but requires a double-size result
type. */
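/* "Double rounding" here means the multiply result is rounded and
the sum is rounded again for float types, in contrast to a fused
multiply-add that rounds only once. */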
tree mult_res
= build2 (MULT_EXPR, arith_type, operands[0], operands[1]);
return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
}
else if (brig_opcode == BRIG_OPCODE_MAD24HI)
{
tree mult_res
= build2 (MULT_HIGHPART_EXPR, arith_type, operands[0], operands[1]);
return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
}
else if (brig_opcode == BRIG_OPCODE_SHUFFLE)
{
return build_shuffle (arith_type, operands);
}
else if (brig_opcode == BRIG_OPCODE_UNPACKLO
|| brig_opcode == BRIG_OPCODE_UNPACKHI)
{
return build_unpack_lo_or_hi (brig_opcode, arith_type, operands);
}
else if (brig_opcode == BRIG_OPCODE_UNPACK)
{
return build_unpack (operands);
}
else if (brig_opcode == BRIG_OPCODE_PACK)
{
return build_pack (operands);
}
else if (brig_opcode == BRIG_OPCODE_NRSQRT)
{
/* Implement as 1.0 / sqrt (x) and assume gcc's instruction selection
maps it to a native ISA instruction other than a division, if one
is available.
TODO: this will happen only with unsafe math optimizations enabled,
which cannot be used in general while remaining HSAIL compliant.
Perhaps a builtin call would be a better option here. */
return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
m_parent.m_cf->expand_or_call_builtin
(BRIG_OPCODE_SQRT, brig_type, arith_type, operands));
}
else if (brig_opcode == BRIG_OPCODE_NRCP)
{
/* Implement as 1.0 / x and assume gcc's instruction selection maps
it to a native ISA instruction other than a division, if one is
available. */
return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
operands[0]);
}
else if (brig_opcode == BRIG_OPCODE_LANEID
|| brig_opcode == BRIG_OPCODE_MAXWAVEID
|| brig_opcode == BRIG_OPCODE_WAVEID)
{
/* Assuming WAVESIZE 1 (for now); therefore LANEID, WAVEID and
MAXWAVEID always return 0. */
return build_zero_cst (arith_type);
}
else
gcc_unreachable ();
}
else if (opcode == CALL_EXPR)
return m_parent.m_cf->expand_or_call_builtin (brig_opcode, brig_type,
arith_type, operands);
else if (output_count == 1)
{
if (input_count == 1)
{
if (opcode == MODIFY_EXPR)
return operands[0];
else
return build1 (opcode, arith_type, operands[0]);
}
else if (input_count == 2)
return build2 (opcode, arith_type, operands[0], operands[1]);
else if (input_count == 3)
return build3 (opcode, arith_type, operands[0], operands[1],
operands[2]);
else
gcc_unreachable ();
}
else
gcc_unreachable ();
return NULL_TREE;
}
/* Handles the basic instructions, including packed instructions. Deals
with the different packing modes by unpacking/packing the wanted
elements. Delegates most of the instruction cases to build_inst_expr(). */
size_t
brig_basic_inst_handler::operator () (const BrigBase *base)
{
const BrigInstBase *brig_inst = (const BrigInstBase *) base;
if (brig_inst->opcode == BRIG_OPCODE_NOP)
return base->byteCount;
tree_stl_vec operands = build_operands (*brig_inst);
size_t output_count
= gccbrig_hsa_opcode_op_output_p (brig_inst->opcode, 0) ? 1 : 0;
size_t input_count
= operands.size () == 0 ? 0 : (operands.size () - output_count);
gcc_assert (output_count == 0 || output_count == 1);
tree_stl_vec::iterator first_input_i = operands.begin ();
if (output_count > 0 && operands.size () > 0)
++first_input_i;
tree_stl_vec in_operands;
in_operands.assign (first_input_i, operands.end ());
BrigType16_t brig_inst_type = brig_inst->type;
if (brig_inst->opcode == BRIG_OPCODE_FIRSTBIT
|| brig_inst->opcode == BRIG_OPCODE_LASTBIT
|| brig_inst->opcode == BRIG_OPCODE_SAD)
/* These instructions are reported to be always 32b in HSAIL, but we want
to treat them according to their input argument's type to select the
correct instruction/builtin. */
brig_inst_type
= gccbrig_tree_type_to_hsa_type (TREE_TYPE (in_operands[0]));
tree instr_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);
if (!instr_type)
{
gcc_unreachable ();
return base->byteCount;
}
bool is_vec_instr = hsa_type_packed_p (brig_inst_type);
size_t element_size_bits;
size_t element_count;
if (is_vec_instr)
{
BrigType16_t brig_element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
element_size_bits = gccbrig_hsa_type_bit_size (brig_element_type);
element_count = gccbrig_hsa_type_bit_size (brig_inst_type)
/ gccbrig_hsa_type_bit_size (brig_element_type);
}
else
{
element_size_bits = gccbrig_hsa_type_bit_size (brig_inst_type);
element_count = 1;
}
/* The type the arithmetic should actually be performed in. This is
not always the same as the original BRIG opcode's type, due to the
implicit conversions of the storage-only f16 type. */
tree arith_type = gccbrig_is_bit_operation (brig_inst->opcode)
? gccbrig_tree_type_for_hsa_type (brig_inst_type)
: get_tree_expr_type_for_hsa_type (brig_inst_type);
tree instr_expr = NULL_TREE;
BrigPack8_t p = BRIG_PACK_NONE;
if (brig_inst->base.kind == BRIG_KIND_INST_MOD)
p = ((const BrigInstMod *) brig_inst)->pack;
else if (brig_inst->base.kind == BRIG_KIND_INST_CMP)
p = ((const BrigInstCmp *) brig_inst)->pack;
if (p == BRIG_PACK_PS || p == BRIG_PACK_PSSAT)
in_operands[1] = build_lower_element_broadcast (in_operands[1]);
else if (p == BRIG_PACK_SP || p == BRIG_PACK_SPSAT)
in_operands[0] = build_lower_element_broadcast (in_operands[0]);
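/* That is, for a _ps_ instruction the second source acts as a
scalar: its lowest element is broadcast across all lanes before
the packed operation, and symmetrically for _sp_ with the first
source. */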
tree_code opcode
= brig_function::get_tree_code_for_hsa_opcode (brig_inst->opcode,
brig_inst_type);
if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT)
{
scalarized_sat_arithmetics sat_arith (*brig_inst);
gcc_assert (input_count == 2);
instr_expr = sat_arith (*this, in_operands[0], in_operands[1]);
}
else if (opcode == RETURN_EXPR)
{
if (m_parent.m_cf->m_is_kernel)
{
tree goto_stmt
= build1 (GOTO_EXPR, void_type_node, m_parent.m_cf->m_exit_label);
m_parent.m_cf->append_statement (goto_stmt);
return base->byteCount;
}
else
{
m_parent.m_cf->append_return_stmt ();
return base->byteCount;
}
}
else if (opcode == MULT_HIGHPART_EXPR && is_vec_instr
&& element_size_bits < 64)
{
/* MULT_HIGHPART_EXPR works only on target-dependent vector sizes,
and even the scalar versions do not seem to work, at least for char
elements. Fall back to scalarization, promoting the vector elements
to larger types and computing the MULHI as a regular MUL followed
by a shift. MULHI for 2x64b seems to work with the Intel CPUs
tested so far, so that case is passed on for vector processing and
there is no need for 128b scalar arithmetic.
This is not modular, as this kind of lowering does not belong in
the frontend; there should be a legalization phase before the
backend that figures out the best way to compute the MULHI for any
integer vector datatype.
TODO: promote to larger vector types instead. For example,
MULT_HIGHPART_EXPR with s8x8 doesn't work, but s16x8 seems to, at
least on x86-64.
*/
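/* For example, for u8 elements each lane is promoted to u16,
multiplied, and shifted right by 8 (half the promoted width), so
the low half of each promoted lane holds the high part of the
8b x 8b product. */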
tree_stl_vec operand0_elements;
if (input_count > 0)
m_parent.m_cf->unpack (in_operands[0], operand0_elements);
tree_stl_vec operand1_elements;
if (input_count > 1)
m_parent.m_cf->unpack (in_operands[1], operand1_elements);
tree_stl_vec result_elements;
tree scalar_type = TREE_TYPE (arith_type);
BrigType16_t element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
tree promoted_type = short_integer_type_node;
switch (element_type)
{
case BRIG_TYPE_S8:
promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S16);
break;
case BRIG_TYPE_U8:
promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U16);
break;
case BRIG_TYPE_S16:
promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S32);
break;
case BRIG_TYPE_U16:
promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U32);
break;
case BRIG_TYPE_S32:
promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S64);
break;
case BRIG_TYPE_U32:
promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U64);
break;
default:
gcc_unreachable ();
}
size_t promoted_type_size = int_size_in_bytes (promoted_type) * 8;
size_t element_count = gccbrig_type_vector_subparts (arith_type);
for (size_t i = 0; i < element_count; ++i)
{
tree operand0 = convert (promoted_type, operand0_elements.at (i));
tree operand1 = convert (promoted_type, operand1_elements.at (i));
tree scalar_expr
= build2 (MULT_EXPR, promoted_type, operand0, operand1);
scalar_expr
= build2 (RSHIFT_EXPR, promoted_type, scalar_expr,
build_int_cstu (promoted_type, promoted_type_size / 2));
result_elements.push_back (convert (scalar_type, scalar_expr));
}
instr_expr = m_parent.m_cf->pack (result_elements);
}
else
{
/* 'class' is always of b1 type; let's treat it according to its
float source type when building the instruction, to find the
correct builtin. */
if (brig_inst->opcode == BRIG_OPCODE_CLASS)
brig_inst_type = ((const BrigInstSourceType *) base)->sourceType;
instr_expr = build_inst_expr (brig_inst->opcode, brig_inst_type,
arith_type, in_operands);
}
if (instr_expr == NULL_TREE)
{
gcc_unreachable ();
return base->byteCount;
}
if (p == BRIG_PACK_SS || p == BRIG_PACK_S || p == BRIG_PACK_SSSAT
|| p == BRIG_PACK_SSAT)
{
/* In case of _s_ or _ss_, select only the lowest element of the
new result into the output. We could extract the element and use
a scalar operation, but try to keep data in vector registers as
much as possible to avoid copies between the scalar and vector
datapaths. */
tree old_value;
tree half_storage_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);
bool is_fp16_operation
= (brig_inst_type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16
&& !gccbrig_is_bit_operation (brig_inst->opcode);
if (is_fp16_operation)
old_value = build_h2f_conversion
(build_resize_convert_view (half_storage_type, operands[0]));
else
old_value
= build_resize_convert_view (TREE_TYPE (instr_expr), operands[0]);
size_t esize = is_fp16_operation ? 32 : element_size_bits;
/* Construct a permutation mask where all elements other than the
lowest one are picked from old_value. */
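/* E.g. for 4 elements the mask is {4, 1, 2, 3}: index 4 selects the
lowest element of new_value (the second VEC_PERM_EXPR input) while
indices 1..3 keep the elements of old_value. */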
tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
vec<constructor_elt, va_gc> *constructor_vals = NULL;
for (size_t i = 0; i < element_count; ++i)
{
tree cst;
if (i == 0)
cst = build_int_cstu (mask_inner_type, element_count);
else
cst = build_int_cstu (mask_inner_type, i);
CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
}
tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);
tree new_value = create_tmp_var (TREE_TYPE (instr_expr), "new_output");
tree assign
= build2 (MODIFY_EXPR, TREE_TYPE (instr_expr), new_value, instr_expr);
m_parent.m_cf->append_statement (assign);
instr_expr
= build3 (VEC_PERM_EXPR, arith_type, old_value, new_value, mask);
tree lower_output = create_tmp_var (TREE_TYPE (instr_expr), "s_output");
tree assign_lower = build2 (MODIFY_EXPR, TREE_TYPE (instr_expr),
lower_output, instr_expr);
m_parent.m_cf->append_statement (assign_lower);
instr_expr = lower_output;
}
if (output_count == 1)
build_output_assignment (*brig_inst, operands[0], instr_expr);
else
m_parent.m_cf->append_statement (instr_expr);
return base->byteCount;
}
/* Create an expression that broadcasts the lowest element of the
vector in VEC_OPERAND to all elements of the returned vector. */
tree
brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand)
{
/* Build the broadcast using a shuffle, because GENERIC has no
direct broadcast node and this way there is no need for a
separate extract of the lowest element. */
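/* E.g. for a 4-element vector the mask built below is {4, 4, 4, 4}:
every output lane selects element 0 of the second input, which is
the same vector. */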
tree element_type = TREE_TYPE (TREE_TYPE (vec_operand));
size_t esize = 8 * int_size_in_bytes (element_type);
size_t element_count
= gccbrig_type_vector_subparts (TREE_TYPE (vec_operand));
tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
vec<constructor_elt, va_gc> *constructor_vals = NULL;
/* Construct the mask. */
for (size_t i = 0; i < element_count; ++i)
{
tree cst = build_int_cstu (mask_inner_type, element_count);
CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
}
tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);
return build3 (VEC_PERM_EXPR, TREE_TYPE (vec_operand), vec_operand,
vec_operand, mask);
}