| // Written in the D programming language. |
| |
| /** |
| * Builtin SIMD intrinsics |
| * |
| * Source: $(DRUNTIMESRC core/_simd.d) |
| * |
| * Copyright: Copyright Digital Mars 2012-2020 |
| * License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). |
 * Authors:   $(HTTP digitalmars.com, Walter Bright)
| */ |
| |
| module core.simd; |
| |
| pure: |
| nothrow: |
| @safe: |
| @nogc: |
| |
| /******************************* |
| * Create a vector type. |
| * |
| * Parameters: |
| * T = one of double[2], float[4], void[16], byte[16], ubyte[16], |
| * short[8], ushort[8], int[4], uint[4], long[2], ulong[2]. |
| * For 256 bit vectors, |
| * one of double[4], float[8], void[32], byte[32], ubyte[32], |
| * short[16], ushort[16], int[8], uint[8], long[4], ulong[4] |
| */ |
| |
template Vector(T)
{
    /* __vector is compiler magic, hide it behind a template.
     * The compiler will reject T's that don't work.
     * Use the modern `alias name = type;` form; the old
     * `alias type name;` syntax is deprecated.
     */
    alias Vector = __vector(T);
}
| |
| /* Handy aliases |
| */ |
// 64-bit vector aliases (modern `alias name = type;` syntax; the old
// `alias type name;` form is deprecated).
static if (is(Vector!(void[8])))    alias void8   = Vector!(void[8]);    ///
static if (is(Vector!(double[1])))  alias double1 = Vector!(double[1]);  ///
static if (is(Vector!(float[2])))   alias float2  = Vector!(float[2]);   ///
static if (is(Vector!(byte[8])))    alias byte8   = Vector!(byte[8]);    ///
static if (is(Vector!(ubyte[8])))   alias ubyte8  = Vector!(ubyte[8]);   ///
static if (is(Vector!(short[4])))   alias short4  = Vector!(short[4]);   ///
static if (is(Vector!(ushort[4])))  alias ushort4 = Vector!(ushort[4]);  ///
static if (is(Vector!(int[2])))     alias int2    = Vector!(int[2]);     ///
static if (is(Vector!(uint[2])))    alias uint2   = Vector!(uint[2]);    ///
static if (is(Vector!(long[1])))    alias long1   = Vector!(long[1]);    ///
static if (is(Vector!(ulong[1])))   alias ulong1  = Vector!(ulong[1]);   ///
| |
// 128-bit (XMM-sized) vector aliases.
static if (is(Vector!(void[16])))   alias void16  = Vector!(void[16]);   ///
static if (is(Vector!(double[2])))  alias double2 = Vector!(double[2]);  ///
static if (is(Vector!(float[4])))   alias float4  = Vector!(float[4]);   ///
static if (is(Vector!(byte[16])))   alias byte16  = Vector!(byte[16]);   ///
static if (is(Vector!(ubyte[16])))  alias ubyte16 = Vector!(ubyte[16]);  ///
static if (is(Vector!(short[8])))   alias short8  = Vector!(short[8]);   ///
static if (is(Vector!(ushort[8])))  alias ushort8 = Vector!(ushort[8]);  ///
static if (is(Vector!(int[4])))     alias int4    = Vector!(int[4]);     ///
static if (is(Vector!(uint[4])))    alias uint4   = Vector!(uint[4]);    ///
static if (is(Vector!(long[2])))    alias long2   = Vector!(long[2]);    ///
static if (is(Vector!(ulong[2])))   alias ulong2  = Vector!(ulong[2]);   ///
| |
// 256-bit (YMM-sized) vector aliases.
static if (is(Vector!(void[32])))   alias void32   = Vector!(void[32]);   ///
static if (is(Vector!(double[4])))  alias double4  = Vector!(double[4]);  ///
static if (is(Vector!(float[8])))   alias float8   = Vector!(float[8]);   ///
static if (is(Vector!(byte[32])))   alias byte32   = Vector!(byte[32]);   ///
static if (is(Vector!(ubyte[32])))  alias ubyte32  = Vector!(ubyte[32]);  ///
static if (is(Vector!(short[16])))  alias short16  = Vector!(short[16]);  ///
static if (is(Vector!(ushort[16]))) alias ushort16 = Vector!(ushort[16]); ///
static if (is(Vector!(int[8])))     alias int8     = Vector!(int[8]);     ///
static if (is(Vector!(uint[8])))    alias uint8    = Vector!(uint[8]);    ///
static if (is(Vector!(long[4])))    alias long4    = Vector!(long[4]);    ///
static if (is(Vector!(ulong[4])))   alias ulong4   = Vector!(ulong[4]);   ///
| |
// 512-bit (ZMM-sized) vector aliases.
static if (is(Vector!(void[64])))   alias void64   = Vector!(void[64]);   ///
static if (is(Vector!(double[8])))  alias double8  = Vector!(double[8]);  ///
static if (is(Vector!(float[16])))  alias float16  = Vector!(float[16]);  ///
static if (is(Vector!(byte[64])))   alias byte64   = Vector!(byte[64]);   ///
static if (is(Vector!(ubyte[64])))  alias ubyte64  = Vector!(ubyte[64]);  ///
static if (is(Vector!(short[32])))  alias short32  = Vector!(short[32]);  ///
static if (is(Vector!(ushort[32]))) alias ushort32 = Vector!(ushort[32]); ///
static if (is(Vector!(int[16])))    alias int16    = Vector!(int[16]);    ///
static if (is(Vector!(uint[16])))   alias uint16   = Vector!(uint[16]);   ///
static if (is(Vector!(long[8])))    alias long8    = Vector!(long[8]);    ///
static if (is(Vector!(ulong[8])))   alias ulong8   = Vector!(ulong[8]);   ///
| |
| version (D_SIMD) |
| { |
    /** XMM opcodes that conform to the following:
     *
     *  opcode xmm1,xmm2/mem
     *
     * and do not have side effects (i.e. do not write to memory).
     *
     * Encoding scheme: each value packs the instruction's mandatory
     * prefix (F3, F2, 66, or none) followed by the 0F escape byte
     * (plus 38/3A for the SSSE3/SSE4 three-byte maps) and the opcode
     * byte, e.g. ADDSS = F3 0F 58. For PSLLDQ/PSRLDQ an extra leading
     * byte carries the ModRM reg-field opcode extension (/7 and /3).
     */
    enum XMM
    {
        // Packed/scalar add
        ADDSS = 0xF30F58,
        ADDSD = 0xF20F58,
        ADDPS = 0x000F58,
        ADDPD = 0x660F58,
        PADDB = 0x660FFC,
        PADDW = 0x660FFD,
        PADDD = 0x660FFE,
        PADDQ = 0x660FD4,

        // Packed/scalar subtract
        SUBSS = 0xF30F5C,
        SUBSD = 0xF20F5C,
        SUBPS = 0x000F5C,
        SUBPD = 0x660F5C,
        PSUBB = 0x660FF8,
        PSUBW = 0x660FF9,
        PSUBD = 0x660FFA,
        PSUBQ = 0x660FFB,

        // Multiply
        MULSS = 0xF30F59,
        MULSD = 0xF20F59,
        MULPS = 0x000F59,
        MULPD = 0x660F59,
        PMULLW = 0x660FD5,

        // Divide
        DIVSS = 0xF30F5E,
        DIVSD = 0xF20F5E,
        DIVPS = 0x000F5E,
        DIVPD = 0x660F5E,

        // Bitwise logic
        PAND  = 0x660FDB,
        POR   = 0x660FEB,

        // Unordered compare (set EFLAGS)
        UCOMISS = 0x000F2E,
        UCOMISD = 0x660F2E,

        XORPS = 0x000F57,
        XORPD = 0x660F57,

        // Use STO and LOD instead of MOV to distinguish the direction
        // (Destination is first operand, Source is second operand)
        STOSS  = 0xF30F11,        /// MOVSS xmm1/m32, xmm2
        STOSD  = 0xF20F11,        /// MOVSD xmm1/m64, xmm2
        STOAPS = 0x000F29,        /// MOVAPS xmm2/m128, xmm1
        STOAPD = 0x660F29,        /// MOVAPD xmm2/m128, xmm1
        STODQA = 0x660F7F,        /// MOVDQA xmm2/m128, xmm1
        STOD   = 0x660F7E,        /// MOVD reg/mem64, xmm   66 0F 7E /r
        STOQ   = 0x660FD6,        /// MOVQ xmm2/m64, xmm1

        LODSS  = 0xF30F10,        /// MOVSS xmm1, xmm2/m32
        LODSD  = 0xF20F10,        /// MOVSD xmm1, xmm2/m64
        LODAPS = 0x000F28,        /// MOVAPS xmm1, xmm2/m128
        LODAPD = 0x660F28,        /// MOVAPD xmm1, xmm2/m128
        LODDQA = 0x660F6F,        /// MOVDQA xmm1, xmm2/m128
        LODD   = 0x660F6E,        /// MOVD xmm, reg/mem64   66 0F 6E /r
        LODQ   = 0xF30F7E,        /// MOVQ xmm1, xmm2/m64

        LODDQU   = 0xF30F6F,      /// MOVDQU xmm1, xmm2/mem128  F3 0F 6F /r
        STODQU   = 0xF30F7F,      /// MOVDQU xmm1/mem128, xmm2  F3 0F 7F /r
        MOVDQ2Q  = 0xF20FD6,      /// MOVDQ2Q mmx, xmm          F2 0F D6 /r
        MOVHLPS  = 0x0F12,        /// MOVHLPS xmm1, xmm2        0F 12 /r
                                  /// (same encoding as LODLPS; reg vs mem
                                  /// operand selects the mnemonic)
        LODHPD   = 0x660F16,      /// MOVHPD xmm1, m64
        STOHPD   = 0x660F17,      /// MOVHPD mem64, xmm1        66 0F 17 /r
        LODHPS   = 0x0F16,        /// MOVHPS xmm1, m64
        STOHPS   = 0x0F17,        /// MOVHPS m64, xmm1
        MOVLHPS  = 0x0F16,        /// MOVLHPS xmm1, xmm2 (same encoding as
                                  /// LODHPS, register form)
        LODLPD   = 0x660F12,      /// MOVLPD xmm1, m64
        STOLPD   = 0x660F13,      /// MOVLPD m64, xmm1
        LODLPS   = 0x0F12,        /// MOVLPS xmm1, m64
        STOLPS   = 0x0F13,        /// MOVLPS m64, xmm1
        MOVMSKPD = 0x660F50,      /// MOVMSKPD reg, xmm
        MOVMSKPS = 0x0F50,        /// MOVMSKPS reg, xmm
        MOVNTDQ  = 0x660FE7,      /// MOVNTDQ m128, xmm1
        MOVNTI   = 0x0FC3,        /// MOVNTI m32, r32
        MOVNTPD  = 0x660F2B,      /// MOVNTPD m128, xmm1
        MOVNTPS  = 0x0F2B,        /// MOVNTPS m128, xmm1
        MOVNTQ   = 0x0FE7,        /// MOVNTQ m64, mm
        MOVQ2DQ  = 0xF30FD6,      /// MOVQ2DQ
        LODUPD   = 0x660F10,      /// MOVUPD xmm1, xmm2/m128
        STOUPD   = 0x660F11,      /// MOVUPD xmm2/m128, xmm1
        LODUPS   = 0x0F10,        /// MOVUPS xmm1, xmm2/m128
        STOUPS   = 0x0F11,        /// MOVUPS xmm2/m128, xmm1

        // Pack/unpack, compares, shifts and saturating arithmetic
        PACKSSDW = 0x660F6B,
        PACKSSWB = 0x660F63,
        PACKUSWB = 0x660F67,
        PADDSB = 0x660FEC,
        PADDSW = 0x660FED,
        PADDUSB = 0x660FDC,
        PADDUSW = 0x660FDD,
        PANDN = 0x660FDF,
        PCMPEQB = 0x660F74,
        PCMPEQD = 0x660F76,
        PCMPEQW = 0x660F75,
        PCMPGTB = 0x660F64,
        PCMPGTD = 0x660F66,
        PCMPGTW = 0x660F65,
        PMADDWD = 0x660FF5,
        PSLLW = 0x660FF1,
        PSLLD = 0x660FF2,
        PSLLQ = 0x660FF3,
        PSRAW = 0x660FE1,
        PSRAD = 0x660FE2,
        PSRLW = 0x660FD1,
        PSRLD = 0x660FD2,
        PSRLQ = 0x660FD3,
        PSUBSB = 0x660FE8,
        PSUBSW = 0x660FE9,
        PSUBUSB = 0x660FD8,
        PSUBUSW = 0x660FD9,
        PUNPCKHBW = 0x660F68,
        PUNPCKHDQ = 0x660F6A,
        PUNPCKHWD = 0x660F69,
        PUNPCKLBW = 0x660F60,
        PUNPCKLDQ = 0x660F62,
        PUNPCKLWD = 0x660F61,
        PXOR = 0x660FEF,
        ANDPD = 0x660F54,
        ANDPS = 0x0F54,
        ANDNPD = 0x660F55,
        ANDNPS = 0x0F55,
        CMPPS = 0x0FC2,
        CMPPD = 0x660FC2,
        CMPSD = 0xF20FC2,
        CMPSS = 0xF30FC2,
        COMISD = 0x660F2F,
        COMISS = 0x0F2F,
        // Conversions between int/float/double, packed and scalar
        CVTDQ2PD = 0xF30FE6,
        CVTDQ2PS = 0x0F5B,
        CVTPD2DQ = 0xF20FE6,
        CVTPD2PI = 0x660F2D,
        CVTPD2PS = 0x660F5A,
        CVTPI2PD = 0x660F2A,
        CVTPI2PS = 0x0F2A,
        CVTPS2DQ = 0x660F5B,
        CVTPS2PD = 0x0F5A,
        CVTPS2PI = 0x0F2D,
        CVTSD2SI = 0xF20F2D,
        CVTSD2SS = 0xF20F5A,
        CVTSI2SD = 0xF20F2A,
        CVTSI2SS = 0xF30F2A,
        CVTSS2SD = 0xF30F5A,
        CVTSS2SI = 0xF30F2D,
        CVTTPD2PI = 0x660F2C,
        CVTTPD2DQ = 0x660FE6,
        CVTTPS2DQ = 0xF30F5B,
        CVTTPS2PI = 0x0F2C,
        CVTTSD2SI = 0xF20F2C,
        CVTTSS2SI = 0xF30F2C,
        MASKMOVDQU = 0x660FF7,
        MASKMOVQ = 0x0FF7,
        MAXPD = 0x660F5F,
        MAXPS = 0x0F5F,
        MAXSD = 0xF20F5F,
        MAXSS = 0xF30F5F,
        MINPD = 0x660F5D,
        MINPS = 0x0F5D,
        MINSD = 0xF20F5D,
        MINSS = 0xF30F5D,
        ORPD = 0x660F56,
        ORPS = 0x0F56,
        PAVGB = 0x660FE0,
        PAVGW = 0x660FE3,
        PMAXSW = 0x660FEE,
        //PINSRW = 0x660FC4,
        PMAXUB = 0x660FDE,
        PMINSW = 0x660FEA,
        PMINUB = 0x660FDA,
        //PMOVMSKB = 0x660FD7,
        PMULHUW = 0x660FE4,
        PMULHW = 0x660FE5,
        PMULUDQ = 0x660FF4,
        PSADBW = 0x660FF6,
        PUNPCKHQDQ = 0x660F6D,
        PUNPCKLQDQ = 0x660F6C,
        RCPPS = 0x0F53,
        RCPSS = 0xF30F53,
        RSQRTPS = 0x0F52,
        RSQRTSS = 0xF30F52,
        SQRTPD = 0x660F51,
        SHUFPD = 0x660FC6,
        SHUFPS = 0x0FC6,
        SQRTPS = 0x0F51,
        SQRTSD = 0xF20F51,
        SQRTSS = 0xF30F51,
        UNPCKHPD = 0x660F15,
        UNPCKHPS = 0x0F15,
        UNPCKLPD = 0x660F14,
        UNPCKLPS = 0x0F14,

        PSHUFD = 0x660F70,
        PSHUFHW = 0xF30F70,
        PSHUFLW = 0xF20F70,
        PSHUFW = 0x0F70,
        // PSLLDQ/PSRLDQ share opcode 66 0F 73; the leading byte is the
        // ModRM reg-field extension (/7 for PSLLDQ, /3 for PSRLDQ).
        PSLLDQ = 0x07660F73,
        PSRLDQ = 0x03660F73,

        //PREFETCH = 0x0F18,

        // SSE3 Pentium 4 (Prescott)

        ADDSUBPD = 0x660FD0,
        ADDSUBPS = 0xF20FD0,
        HADDPD = 0x660F7C,
        HADDPS = 0xF20F7C,
        HSUBPD = 0x660F7D,
        HSUBPS = 0xF20F7D,
        MOVDDUP = 0xF20F12,
        MOVSHDUP = 0xF30F16,
        MOVSLDUP = 0xF30F12,
        LDDQU = 0xF20FF0,
        MONITOR = 0x0F01C8,
        MWAIT = 0x0F01C9,

        // SSSE3
        PALIGNR = 0x660F3A0F,
        PHADDD = 0x660F3802,
        PHADDW = 0x660F3801,
        PHADDSW = 0x660F3803,
        PABSB = 0x660F381C,
        PABSD = 0x660F381E,
        PABSW = 0x660F381D,
        PSIGNB = 0x660F3808,
        PSIGND = 0x660F380A,
        PSIGNW = 0x660F3809,
        PSHUFB = 0x660F3800,
        PMADDUBSW = 0x660F3804,
        PMULHRSW = 0x660F380B,
        PHSUBD = 0x660F3806,
        PHSUBW = 0x660F3805,
        PHSUBSW = 0x660F3807,

        // SSE4.1

        BLENDPD   = 0x660F3A0D,
        BLENDPS   = 0x660F3A0C,
        BLENDVPD  = 0x660F3815,
        BLENDVPS  = 0x660F3814,
        DPPD      = 0x660F3A41,
        DPPS      = 0x660F3A40,
        EXTRACTPS = 0x660F3A17,
        INSERTPS  = 0x660F3A21,
        MPSADBW   = 0x660F3A42,
        PBLENDVB  = 0x660F3810,
        PBLENDW   = 0x660F3A0E,
        // PEXTRD/PEXTRQ and PINSRD/PINSRQ share base encodings; the
        // Q forms are selected by REX.W at emission time.
        PEXTRD    = 0x660F3A16,
        PEXTRQ    = 0x660F3A16,
        PINSRB    = 0x660F3A20,
        PINSRD    = 0x660F3A22,
        PINSRQ    = 0x660F3A22,

        MOVNTDQA = 0x660F382A,
        PACKUSDW = 0x660F382B,
        PCMPEQQ = 0x660F3829,
        PEXTRB = 0x660F3A14,
        PHMINPOSUW = 0x660F3841,
        PMAXSB = 0x660F383C,
        PMAXSD = 0x660F383D,
        PMAXUD = 0x660F383F,
        PMAXUW = 0x660F383E,
        PMINSB = 0x660F3838,
        PMINSD = 0x660F3839,
        PMINUD = 0x660F383B,
        PMINUW = 0x660F383A,
        PMOVSXBW = 0x660F3820,
        PMOVSXBD = 0x660F3821,
        PMOVSXBQ = 0x660F3822,
        PMOVSXWD = 0x660F3823,
        PMOVSXWQ = 0x660F3824,
        PMOVSXDQ = 0x660F3825,
        PMOVZXBW = 0x660F3830,
        PMOVZXBD = 0x660F3831,
        PMOVZXBQ = 0x660F3832,
        PMOVZXWD = 0x660F3833,
        PMOVZXWQ = 0x660F3834,
        PMOVZXDQ = 0x660F3835,
        PMULDQ   = 0x660F3828,
        PMULLD   = 0x660F3840,
        PTEST    = 0x660F3817,

        ROUNDPD = 0x660F3A09,
        ROUNDPS = 0x660F3A08,
        ROUNDSD = 0x660F3A0B,
        ROUNDSS = 0x660F3A0A,

        // SSE4.2
        PCMPESTRI  = 0x660F3A61,
        PCMPESTRM  = 0x660F3A60,
        PCMPISTRI  = 0x660F3A63,
        PCMPISTRM  = 0x660F3A62,
        PCMPGTQ    = 0x660F3837,
        //CRC32

        // SSE4a (AMD only)
        // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS

        // POPCNT and LZCNT (have their own CPUID bits)
        POPCNT     = 0xF30FB8,
        // LZCNT
    }
| |
| /** |
| * Generate two operand instruction with XMM 128 bit operands. |
| * |
| * This is a compiler magic function - it doesn't behave like |
| * regular D functions. |
| * |
| * Parameters: |
| * opcode = any of the XMM opcodes; it must be a compile time constant |
| * op1 = first operand |
| * op2 = second operand |
| * Returns: |
| * result of opcode |
| */ |
| pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2); |
| |
| /// |
| unittest |
| { |
| float4 a; |
| a = cast(float4)__simd(XMM.PXOR, a, a); |
| } |
| |
| /** |
| * Unary SIMD instructions. |
| */ |
| pure @safe void16 __simd(XMM opcode, void16 op1); |
| pure @safe void16 __simd(XMM opcode, double d); /// |
| pure @safe void16 __simd(XMM opcode, float f); /// |
| |
| /// |
| unittest |
| { |
| float4 a; |
| a = cast(float4)__simd(XMM.LODSS, a); |
| } |
| |
| /**** |
| * For instructions: |
| * CMPPD, CMPSS, CMPSD, CMPPS, |
| * PSHUFD, PSHUFHW, PSHUFLW, |
| * BLENDPD, BLENDPS, DPPD, DPPS, |
| * MPSADBW, PBLENDW, |
| * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS |
| * Parameters: |
| * opcode = any of the above XMM opcodes; it must be a compile time constant |
| * op1 = first operand |
| * op2 = second operand |
| * imm8 = third operand; must be a compile time constant |
| * Returns: |
| * result of opcode |
| */ |
| pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8); |
| |
| /// |
| unittest |
| { |
| float4 a; |
| a = cast(float4)__simd(XMM.CMPPD, a, a, 0x7A); |
| } |
| |
| /*** |
| * For instructions with the imm8 version: |
| * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW, |
| * PSRLDQ, PSLLDQ |
| * Parameters: |
| * opcode = any of the XMM opcodes; it must be a compile time constant |
| * op1 = first operand |
| * imm8 = second operand; must be a compile time constant |
| * Returns: |
| * result of opcode |
| */ |
| pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8); |
| |
| /// |
| unittest |
| { |
| float4 a; |
| a = cast(float4) __simd_ib(XMM.PSRLQ, a, 0x7A); |
| } |
| |
| /***** |
| * For "store" operations of the form: |
| * op1 op= op2 |
| * Returns: |
| * op2 |
| * These cannot be marked as pure, as semantic() doesn't check them. |
| */ |
| @safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2); |
| @safe void16 __simd_sto(XMM opcode, double op1, void16 op2); /// |
| @safe void16 __simd_sto(XMM opcode, float op1, void16 op2); /// |
| |
| /// |
| unittest |
| { |
| void16 a; |
| float f = 1; |
| double d = 1; |
| |
| cast(void)__simd_sto(XMM.STOUPS, a, a); |
| cast(void)__simd_sto(XMM.STOUPS, f, a); |
| cast(void)__simd_sto(XMM.STOUPS, d, a); |
| } |
| |
| /* The following use overloading to ensure correct typing. |
| * Compile with inlining on for best performance. |
| */ |
| |
| pure @safe short8 pcmpeq()(short8 v1, short8 v2) |
| { |
| return cast(short8)__simd(XMM.PCMPEQW, v1, v2); |
| } |
| |
| pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2) |
| { |
| return cast(ushort8)__simd(XMM.PCMPEQW, v1, v2); |
| } |
| |
| /********************* |
| * Emit prefetch instruction. |
| * Params: |
| * address = address to be prefetched |
| * writeFetch = true for write fetch, false for read fetch |
| * locality = 0..3 (0 meaning least local, 3 meaning most local) |
| * Note: |
| * The Intel mappings are: |
| * $(TABLE |
| * $(THEAD writeFetch, locality, Instruction) |
| * $(TROW false, 0, prefetchnta) |
| * $(TROW false, 1, prefetch2) |
| * $(TROW false, 2, prefetch1) |
| * $(TROW false, 3, prefetch0) |
| * $(TROW true, 0, prefetchw) |
| * $(TROW true, 1, prefetchw) |
| * $(TROW true, 2, prefetchw) |
| * $(TROW true, 3, prefetchw) |
| * ) |
| */ |
| void prefetch(bool writeFetch, ubyte locality)(const(void)* address) |
| { |
| static if (writeFetch) |
| __prefetch(address, 4); |
| else static if (locality < 4) |
| __prefetch(address, 3 - locality); |
| else |
| static assert(0, "0..3 expected for locality"); |
| } |
| |
    // Compiler intrinsic that emits the actual prefetch instruction.
    // Per the mapping used by prefetch() above: encodings 0..3 select the
    // read-prefetch degrees (3 - locality, so 3 = prefetchnta ... 0 =
    // prefetch0) and 4 selects prefetchw.
    private void __prefetch(const(void*) address, ubyte encoding);
| |
| /************************************* |
| * Load unaligned vector from address. |
| * This is a compiler intrinsic. |
| * Params: |
| * p = pointer to vector |
| * Returns: |
| * vector |
| */ |
| |
| V loadUnaligned(V)(const V* p) |
| if (is(V == void16) || |
| is(V == byte16) || |
| is(V == ubyte16) || |
| is(V == short8) || |
| is(V == ushort8) || |
| is(V == int4) || |
| is(V == uint4) || |
| is(V == long2) || |
| is(V == ulong2) || |
| is(V == double2) || |
| is(V == float4)) |
| { |
| pragma(inline, true); |
| static if (is(V == double2)) |
| return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p); |
| else static if (is(V == float4)) |
| return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p); |
| else |
| return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p); |
| } |
| |
| @system |
| unittest |
| { |
| // Memory to load into the vector: |
| // Should have enough data to test all 16-byte alignments, and still |
| // have room for a 16-byte vector |
| ubyte[32] data; |
| foreach (i; 0..data.length) |
| { |
| data[i] = cast(ubyte)i; |
| } |
| |
| // to test all alignments from 1 ~ 16 |
| foreach (i; 0..16) |
| { |
| ubyte* d = &data[i]; |
| |
| void test(T)() |
| { |
| // load the data |
| T v = loadUnaligned(cast(T*)d); |
| |
| // check that the data was loaded correctly |
| ubyte* ptrToV = cast(ubyte*)&v; |
| foreach (j; 0..T.sizeof) |
| { |
| assert(ptrToV[j] == d[j]); |
| } |
| } |
| |
| test!void16(); |
| test!byte16(); |
| test!ubyte16(); |
| test!short8(); |
| test!ushort8(); |
| test!int4(); |
| test!uint4(); |
| test!long2(); |
| test!ulong2(); |
| test!double2(); |
| test!float4(); |
| } |
| } |
| |
| /************************************* |
| * Store vector to unaligned address. |
| * This is a compiler intrinsic. |
| * Params: |
| * p = pointer to vector |
| * value = value to store |
| * Returns: |
| * value |
| */ |
| |
| V storeUnaligned(V)(V* p, V value) |
| if (is(V == void16) || |
| is(V == byte16) || |
| is(V == ubyte16) || |
| is(V == short8) || |
| is(V == ushort8) || |
| is(V == int4) || |
| is(V == uint4) || |
| is(V == long2) || |
| is(V == ulong2) || |
| is(V == double2) || |
| is(V == float4)) |
| { |
| pragma(inline, true); |
| static if (is(V == double2)) |
| return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value); |
| else static if (is(V == float4)) |
| return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value); |
| else |
| return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value); |
| } |
| |
    @system
    unittest
    {
        // Memory to store the vector to:
        // Should have enough data to test all 16-byte alignments, and still
        // have room for a 16-byte vector
        ubyte[32] data;

        // to test all alignments from 1 ~ 16
        foreach (i; 0..16)
        {
            ubyte* d = &data[i];

            void test(T)()
            {
                T v;

                // populate `v` with data
                ubyte* ptrToV = cast(ubyte*)&v;
                foreach (j; 0..T.sizeof)
                {
                    ptrToV[j] = cast(ubyte)j;
                }

                // store `v` to location pointed to by `d`
                storeUnaligned(cast(T*)d, v);

                // check that the data was stored correctly
                foreach (j; 0..T.sizeof)
                {
                    assert(ptrToV[j] == d[j]);
                }
            }

            test!void16();
            test!byte16();
            test!ubyte16();
            test!short8();
            test!ushort8();
            test!int4();
            test!uint4();
            test!long2();
            test!ulong2();
            test!double2();
            test!float4();
        }
    }
| } |