blob: 69948f8ee4f55cbfea1fec9bb4d5bb25ec57b163 [file] [log] [blame]
/* This file is used for emulation in avx512fp16 runtime tests.  To
verify the correctness of _Float16 calculations, the idea is to
convert _Float16 to float and do the emulation with float instructions.
The _Float16 type should not be emulated or checked by itself.  */
#include "avx512f-helper.h"
#ifndef AVX512FP16_HELPER_INCLUDED
#define AVX512FP16_HELPER_INCLUDED
#ifdef DEBUG
#include <string.h>
#endif
#include <math.h>
#include <limits.h>
#include <float.h>
/* Useful macros.  */

/* Function attributes: never inline and never clone the helper.  */
#define NOINLINE __attribute__((noinline,noclone))
/* Rounding argument combining _MM_FROUND_TO_NEAREST_INT with
   _MM_FROUND_NO_EXC.  */
#define _ROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
/* NOTE(review): bare literal; it looks like a rounding-control value
   for "current direction" intrinsic arguments -- confirm which
   _MM_FROUND_* flags it is meant to stand for.  */
#define _ROUND_CUR 8
/* Number of 32-bit elements in a 512-bit vector.  The expansion is
   parenthesized so the macro behaves as a single operand inside a
   larger expression (e.g. x / AVX512F_MAX_ELEM).  */
#define AVX512F_MAX_ELEM (512 / 32)
/* Structure for _Float16 emulation.  Every member of the union
   overlays the same storage, so a value written through one view can
   be read back through any of the others.  */
typedef union
{
/* Whole-vector views (half, single, integer, double).  */
__m512 zmm;
__m512h zmmh;
__m512i zmmi;
__m512d zmmd;
/* The two 256-bit halves; index 0 is the low half.  */
__m256 ymm[2];
__m256h ymmh[2];
__m256i ymmi[2];
__m256d ymmd[2];
/* The four 128-bit quarters; index 0 is the lowest.  */
__m128h xmmh[4];
__m128 xmm[4];
__m128i xmmi[4];
__m128d xmmd[4];
/* Scalar lane views: 32 x 16-bit, 16 x 32-bit and 8 x 64-bit.  */
unsigned short u16[32];
unsigned int u32[16];
int i32[16];
long long s64[8];
unsigned long long u64[8];
double f64[8];
float f32[16];
_Float16 f16[32];
} V512;
/* Global variables.  src1 and src2 are filled by init_src and
   init_src_nanf with the output of pack_twops_2ph; src3 is filled
   through its u32 lanes by both of them, and src3f through its f64
   lanes by init_src only.  */
V512 src1, src2, src3, src3f;
/* Failure counter; incremented by check_results and
   check_results_mask when a comparison fails.  */
int n_errs = 0;
/* Helper functions for packing/unpacking ph operands.  */

/* Widen the 32 half-precision lanes of SRC to single precision.
   The low 256 bits of SRC (lanes 0-15) are converted into *OP1 and
   the high 256 bits (lanes 16-31) into *OP2.  */
void NOINLINE
unpack_ph_2twops(V512 src, V512 *op1, V512 *op2)
{
  /* The union already exposes both 256-bit halves as integer vectors,
     which is the operand type the conversion expects.  */
  __m256i low_half = src.ymmi[0];
  __m256i high_half = src.ymmi[1];

  op1->zmm = _mm512_cvtph_ps(low_half);
  op2->zmm = _mm512_cvtph_ps(high_half);
}
/* Narrow the 16 single-precision lanes of OP1 and of OP2 to half
   precision, passing _MM_FROUND_TO_NEAREST_INT as the rounding
   argument, and return them packed into one vector: the lanes from
   OP1 occupy the low 256 bits, the lanes from OP2 the high 256 bits.  */
V512 NOINLINE
pack_twops_2ph(V512 op1, V512 op2)
{
  V512 packed;

  packed.ymmi[0] = _mm512_cvtps_ph(op1.zmm, _MM_FROUND_TO_NEAREST_INT);
  packed.ymmi[1] = _mm512_cvtps_ph(op2.zmm, _MM_FROUND_TO_NEAREST_INT);
  return packed;
}
/* Helper functions used for result debugging.  */
#ifdef DEBUG
/* End the current output row and, when BANNER is non-null, indent
   the next row by the banner's width so the columns line up.  */
static void
display_ps_row_break(const char *banner)
{
  printf("\n");
  if (banner)
    printf("%*s", (int)strlen(banner), "");
}

/* Dump the vector at P as 32-bit words in hex, walking from word 15
   down to word 0 with a row break after word 8.  Words whose index is
   N_ELEMS or higher are shown as a " --------" placeholder instead of
   their value.  BANNER, when non-null, is printed in front of the
   first row.  */
void NOINLINE
display_ps(const void *p, const char *banner, int n_elems)
{
  const V512 *vec = (const V512 *)p;
  int idx = 15;

  if (banner)
    printf("%s", banner);

  /* Placeholders for the words that are not part of the result.  */
  while (idx >= n_elems)
    {
      printf(" --------");
      if (idx == 8)
	display_ps_row_break(banner);
      idx--;
    }

  /* The remaining words, still from high index to low.  */
  while (idx >= 0)
    {
      printf(" %x", vec->u32[idx]);
      if (idx == 8)
	display_ps_row_break(banner);
      idx--;
    }

  printf("\n");
}
#endif
/* Functions/macros used for init/result checking.
   Only check components within AVX512F_LEN.  */

/* Two-level stringification: STRINGIFY macro-expands its argument
   before TO_STRING turns it into a string literal.  */
#define TO_STRING(x) #x
#define STRINGIFY(x) TO_STRING(x)
/* String form of INTRINSIC (NAME) after expansion.  INTRINSIC is not
   defined in this file.  */
#define NAME_OF(NAME) STRINGIFY(INTRINSIC (NAME))
/* Forward RES and EXP (as untyped pointers) and SIZE to check_results,
   using the stringified intrinsic name as the banner.  Arguments are
   parenthesized so the casts apply to the whole argument expression.  */
#define CHECK_RESULT(res, exp, size, intrin) \
  check_results ((void*)(res), (void*)(exp), (size),\
		 NAME_OF(intrin))
/* Same for mask results: RES and EXP are converted to __mmask32 and
   handed to check_results_mask.  */
#define CHECK_RESULT_MASK(res, exp, size, intrin) \
  check_results_mask ((__mmask32)(res), (__mmask32)(exp), (size),\
		      NAME_OF(intrin))
/* To evaluate whether a result matches _Float16 precision, only the
   last bit of the real and the emulated result may differ: two lanes
   are accepted when their 16-bit encodings are equal or differ by
   exactly one.

   GOT and EXP point to V512 objects, N_ELEMS is the number of 16-bit
   lanes to compare starting at lane 0, and BANNER (may be null) names
   the operation in the DEBUG report.  The first mismatching lane
   increments n_errs and stops the scan, so one call adds at most one
   error.  */
void NOINLINE
check_results(void *got, void *exp, int n_elems, char *banner)
{
  int i;
  V512 *v1 = (V512*)got;
  V512 *v2 = (V512*)exp;

  for (i = 0; i < n_elems; i++) {
    /* Computed in int, so the distance cannot wrap at 0 or 0xffff.  */
    int diff = (int)v1->u16[i] - (int)v2->u16[i];

    if (diff > 1 || diff < -1) {
#ifdef DEBUG
      /* Print the lane's value through the f16 view.  Reinterpreting
	 the address of a 16-bit lane as float would read four bytes,
	 i.e. two lanes at once and past the end of the vector for the
	 last lane.  */
      printf("ERROR: %s failed at %d'th element: %x(%f) != %x(%f)\n",
	     banner ? banner : "", i,
	     v1->u16[i], (double)v1->f16[i],
	     v2->u16[i], (double)v2->f16[i]);
      /* NOTE(review): display_ps counts 32-bit words while n_elems
	 here counts 16-bit lanes -- confirm the intended dump width.  */
      display_ps(got, "got:", n_elems);
      display_ps(exp, "exp:", n_elems);
#endif
      n_errs++;
      break;
    }
  }
}
/* Compare the mask GOT against the expected mask EXP.  A mismatch
   adds one to n_errs and, under DEBUG, prints both masks together
   with BANNER (an empty string is used when BANNER is null).
   N_ELEMS is accepted but not used.  */
void NOINLINE
check_results_mask(__mmask32 got, __mmask32 exp, int n_elems, char *banner)
{
  if (got == exp)
    return;

#ifdef DEBUG
  printf("ERROR: %s failed : got mask %x != exp mask %x\n",
	 banner ? banner : "", got, exp);
#endif
  n_errs += 1;
}
/* Functions for src/dest initialization.  */

/* Fill the global source operands with fixed patterns.
   src1 packs (i + 1) in its low half and (i + 2) * 0.5 in its high
   half; src2 packs i * 1.5 and i - 1.5 the same way.  src3 receives
   (i + 1) * 10 in each u32 lane and src3f (i + 1) * 7.5 in each f64
   lane.  */
void NOINLINE
init_src()
{
  V512 src1_low, src1_high, src2_low, src2_high;
  int lane;

  for (lane = 0; lane < 8; lane++)
    src3f.f64[lane] = (lane + 1) * 7.5;

  for (lane = 0; lane < AVX512F_MAX_ELEM; lane++)
    {
      src3.u32[lane] = (lane + 1) * 10;
      src1_low.f32[lane] = lane + 1;
      src1_high.f32[lane] = (lane + 2) * 0.5f;
      src2_low.f32[lane] = lane * 1.5f;
      src2_high.f32[lane] = lane - 1.5f;
    }

  src1 = pack_twops_2ph(src1_low, src1_high);
  src2 = pack_twops_2ph(src2_low, src2_high);
}
/* Variant of init_src that places a NaN in the first lane of src1.
   src1 packs i + 1.5 (low half, lane 0 replaced by NaN) and i + 17.5
   (high half); src2 packs 2 * i + 2.5 and 2 * i + 34.5.  src3 gets
   (i + 1) * 10 in each u32 lane; src3f is left untouched.  */
void NOINLINE
init_src_nanf()
{
  V512 src1_low, src1_high, src2_low, src2_high;
  int lane = 0;

  while (lane < 16)
    {
      src3.u32[lane] = (lane + 1) * 10;
      src1_low.f32[lane] = lane + 1 + 0.5;
      src1_high.f32[lane] = lane + 17 + 0.5;
      src2_low.f32[lane] = lane * 2 + 2 + 0.5;
      src2_high.f32[lane] = lane * 2 + 34 + 0.5;
      lane++;
    }

  /* Overwrite the first lane once the regular pattern is in place.  */
  src1_low.f32[0] = __builtin_nanf("");

  src1 = pack_twops_2ph(src1_low, src1_high);
  src2 = pack_twops_2ph(src2_low, src2_high);
}
/* Give *RES and *EXP the same starting contents: the values
   12 + 0.5 * i for i in 0..15, packed by pack_twops_2ph into both the
   low and the high half of the vector.  */
void NOINLINE
init_dest(V512 * res, V512 * exp)
{
  V512 seed;
  int lane = 0;

  while (lane < AVX512F_MAX_ELEM)
    {
      seed.f32[lane] = 12 + 0.5f * lane;
      lane++;
    }

  *exp = pack_twops_2ph(seed, seed);
  *res = *exp;
}
/* Hand the emulate_ prefix, NAME and the current AVX512F_LEN to EVAL.
   EVAL is not defined in this file; presumably it pastes its
   arguments into the name of the emulation routine -- confirm
   against its definition.  */
#define EMULATE(NAME) EVAL(emulate_, NAME, AVX512F_LEN)
#endif /* AVX512FP16_HELPER_INCLUDED */
/* Macros for AVX512VL Testing.  Include V512 component usage
   and mask type for emulation.

   This section lies outside the include guard, so it is evaluated on
   every inclusion and selects its definitions from the current value
   of AVX512F_LEN.  All macros are dropped first so that each width,
   including the default 512-bit one, can be selected after another
   without redefinition diagnostics.  */
#undef HF
#undef SF
#undef DF
#undef SI
#undef H_HF
#undef NET_MASK
#undef NET_CMASK
#undef MASK_VALUE
#undef ZMASK_VALUE
#undef HALF_MASK

/* HF, SF, DF and SI pick the V512 member that views the vector under
   test as half, single, double and integer data; H_HF picks a
   half-precision member (see each branch for which one).  The
   argument is parenthesized so expressions such as *p can be passed.  */
#if AVX512F_LEN == 256
#define NET_MASK 0xffff
#define NET_CMASK 0xff
#define MASK_VALUE 0xcccc
#define ZMASK_VALUE 0xfcc1
#define HALF_MASK 0xcc
#define HF(x) (x).ymmh[0]
#define H_HF(x) (x).xmmh[0]
#define SF(x) (x).ymm[0]
#define DF(x) (x).ymmd[0]
#define SI(x) (x).ymmi[0]
#elif AVX512F_LEN == 128
#define NET_MASK 0xff
#define NET_CMASK 0xff
#define MASK_VALUE 0xcc
#define HALF_MASK MASK_VALUE
#define ZMASK_VALUE 0xc1
#define HF(x) (x).xmmh[0]
#define SF(x) (x).xmm[0]
#define DF(x) (x).xmmd[0]
#define SI(x) (x).xmmi[0]
#define H_HF(x) (x).xmmh[0]
#else
/* Any other value, including an undefined AVX512F_LEN, selects the
   512-bit definitions.  */
#define NET_MASK 0xffffffff
#define NET_CMASK 0xffff
#define MASK_VALUE 0xcccccccc
#define ZMASK_VALUE 0xfcc1fcc1
#define HALF_MASK 0xcccc
#define HF(x) (x).zmmh
#define SF(x) (x).zmm
#define DF(x) (x).zmmd
#define SI(x) (x).zmmi
#define H_HF(x) (x).ymmh[0]
#endif