blob: 2c70bb9de9c518afa101d2271a30d13742905b13 [file] [log] [blame]
/* { dg-do run { target { aarch64*-*-* } } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
#include <arm_neon.h>
#include "arm-neon-ref.h"
#include "compute-ref-data.h"
/* Expected results for vst2, chunk 0. */
VECT_VAR_DECL(expected_st2_0,hbfloat,16,4) [] = { 0xABAB, 0x3210, 0x0, 0x0 };
VECT_VAR_DECL(expected_st2_0,hbfloat,16,8) [] = { 0xABAB, 0x3210, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0 };
/* Expected results for vst2, chunk 1. */
VECT_VAR_DECL(expected_st2_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
VECT_VAR_DECL(expected_st2_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0 };
/* Expected results for vst3, chunk 0. */
VECT_VAR_DECL(expected_st3_0,hbfloat,16,4) [] = { 0xABAB, 0x3210, 0xCAFE, 0x0 };
VECT_VAR_DECL(expected_st3_0,hbfloat,16,8) [] = { 0xABAB, 0x3210, 0xCAFE, 0x0,
0x0, 0x0, 0x0, 0x0 };
/* Expected results for vst3, chunk 1. */
VECT_VAR_DECL(expected_st3_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
VECT_VAR_DECL(expected_st3_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0 };
/* Expected results for vst3, chunk 2. */
VECT_VAR_DECL(expected_st3_2,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
VECT_VAR_DECL(expected_st3_2,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0 };
/* Expected results for vst4, chunk 0. */
VECT_VAR_DECL(expected_st4_0,hbfloat,16,4) [] =
{ 0xABAB, 0x3210, 0xCAFE, 0x1234 };
VECT_VAR_DECL(expected_st4_0,hbfloat,16,8) [] =
{ 0xABAB, 0x3210, 0xCAFE, 0x1234, 0x0, 0x0, 0x0, 0x0 };
/* Expected results for vst4, chunk 1. */
VECT_VAR_DECL(expected_st4_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
VECT_VAR_DECL(expected_st4_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0 };
/* Expected results for vst4, chunk 2. */
VECT_VAR_DECL(expected_st4_2,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
VECT_VAR_DECL(expected_st4_2,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0 };
/* Expected results for vst4, chunk 3. */
VECT_VAR_DECL(expected_st4_3,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
VECT_VAR_DECL(expected_st4_3,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0 };
typedef union
{
bfloat16_t bf16;
uint16_t u16;
} bfloat16_u_t;
static bfloat16_t result_bfloat16x4[4];
static bfloat16_t result_bfloat16x8[8];
void exec_vstX_lane (void)
{
bfloat16_u_t bfloat16_data[4];
bfloat16_data[0].u16 = 0xABAB;
bfloat16_data[1].u16 = 0x3210;
bfloat16_data[2].u16 = 0xCAFE;
bfloat16_data[3].u16 = 0x1234;
bfloat16_t buffer_vld2_lane_bfloat16x2 [2] =
{ bfloat16_data[0].bf16,
bfloat16_data[1].bf16 };
bfloat16_t buffer_vld3_lane_bfloat16x3 [3] =
{ bfloat16_data[0].bf16,
bfloat16_data[1].bf16,
bfloat16_data[2].bf16 };
bfloat16_t buffer_vld4_lane_bfloat16x4 [4] =
{ bfloat16_data[0].bf16,
bfloat16_data[1].bf16,
bfloat16_data[2].bf16,
bfloat16_data[3].bf16 };
/* In this case, input variables are arrays of vectors. */
#define DECL_VSTX_LANE(T1, W, N, X) \
VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X); \
VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X); \
VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N]
/* We need to use a temporary result buffer (result_bis), because
the one used for other tests is not large enough. A subset of the
result data is moved from result_bis to result, and it is this
subset which is used to check the actual behavior. The next
macro enables to move another chunk of data from result_bis to
result. */
/* We also use another extra input buffer (buffer_src), which we
fill with 0xAA, and which it used to load a vector from which we
read a given lane. */
#define TEST_VSTX_LANE(Q, T1, T2, W, N, X, L) \
memset (VECT_VAR(buffer_src, T1, W, N), 0xAA, \
sizeof(VECT_VAR(buffer_src, T1, W, N))); \
memset (VECT_VAR(result_bis_##X, T1, W, N), 0, \
sizeof(VECT_VAR(result_bis_##X, T1, W, N))); \
\
VECT_ARRAY_VAR(vector_src, T1, W, N, X) = \
vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N)); \
\
VECT_ARRAY_VAR(vector, T1, W, N, X) = \
/* Use dedicated init buffer, of size X. */ \
vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X), \
VECT_ARRAY_VAR(vector_src, T1, W, N, X), \
L); \
vst##X##Q##_lane_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \
VECT_ARRAY_VAR(vector, T1, W, N, X), \
L); \
memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \
sizeof(VECT_VAR(result, T1, W, N)));
/* Overwrite "result" with the contents of "result_bis"[Y]. */
#define TEST_EXTRA_CHUNK(T1, W, N, X, Y) \
memcpy(VECT_VAR(result, T1, W, N), \
&(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]), \
sizeof(VECT_VAR(result, T1, W, N)));
#define DUMMY_ARRAY(V, T, W, N, L) VECT_VAR_DECL(V,T,W,N)[N*L]
DECL_VSTX_LANE(bfloat, 16, 4, 2);
DECL_VSTX_LANE(bfloat, 16, 8, 2);
DECL_VSTX_LANE(bfloat, 16, 4, 3);
DECL_VSTX_LANE(bfloat, 16, 8, 3);
DECL_VSTX_LANE(bfloat, 16, 4, 4);
DECL_VSTX_LANE(bfloat, 16, 8, 4);
DUMMY_ARRAY(buffer_src, bfloat, 16, 4, 4);
DUMMY_ARRAY(buffer_src, bfloat, 16, 8, 4);
/* Check vst2_lane/vst2q_lane. */
clean_results ();
TEST_VSTX_LANE(, bfloat, bf, 16, 4, 2, 2);
TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 2, 6);
#undef CMT
#define CMT " (chunk 0)"
#undef TEST_MSG
#define TEST_MSG "VST2_LANE/VST2Q_LANE"
CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st2_0, CMT);
CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st2_0, CMT);
TEST_EXTRA_CHUNK(bfloat, 16, 4, 2, 1);
TEST_EXTRA_CHUNK(bfloat, 16, 8, 2, 1);
#undef CMT
#define CMT " (chunk 1)"
CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st2_1, CMT);
CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st2_1, CMT);
/* Check vst3_lane/vst3q_lane. */
clean_results ();
#undef TEST_MSG
#define TEST_MSG "VST3_LANE/VST3Q_LANE"
TEST_VSTX_LANE(, bfloat, bf, 16, 4, 3, 2);
TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 3, 6);
#undef CMT
#define CMT " (chunk 0)"
CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_0, CMT);
CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_0, CMT);
TEST_EXTRA_CHUNK(bfloat, 16, 4, 3, 1);
TEST_EXTRA_CHUNK(bfloat, 16, 8, 3, 1);
#undef CMT
#define CMT " (chunk 1)"
CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_1, CMT);
CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_1, CMT);
TEST_EXTRA_CHUNK(bfloat, 16, 4, 3, 2);
TEST_EXTRA_CHUNK(bfloat, 16, 8, 3, 2);
#undef CMT
#define CMT " (chunk 2)"
CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_2, CMT);
CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_2, CMT);
/* Check vst4_lane/vst4q_lane. */
clean_results ();
#undef TEST_MSG
#define TEST_MSG "VST4_LANE/VST4Q_LANE"
TEST_VSTX_LANE(, bfloat, bf, 16, 4, 4, 2);
TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 4, 6);
#undef CMT
#define CMT " (chunk 0)"
CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_0, CMT);
CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_0, CMT);
TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 1);
TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 1);
#undef CMT
#define CMT " (chunk 1)"
CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_1, CMT);
CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_1, CMT);
TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 2);
TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 2);
#undef CMT
#define CMT " (chunk 2)"
CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_2, CMT);
CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_2, CMT);
TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 3);
TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 3);
#undef CMT
#define CMT " (chunk 3)"
CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_3, CMT);
CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_3, CMT);
}
int main (void)
{
exec_vstX_lane ();
return 0;
}