// Copyright 2018 Ulf Adams
//
// The contents of this file may be used under the terms of the Apache License,
// Version 2.0.
//
// (See accompanying file LICENSE-Apache or copy at
// http://www.apache.org/licenses/LICENSE-2.0)
//
// Alternatively, the contents of this file may be used under the terms of
// the Boost Software License, Version 1.0.
// (See accompanying file LICENSE-Boost or copy at
// https://www.boost.org/LICENSE_1_0.txt)
//
// Unless required by applicable law or agreed to in writing, this software
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.
#ifndef RYU_D2S_INTRINSICS_H
#define RYU_D2S_INTRINSICS_H

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

// Defines RYU_32_BIT_PLATFORM if applicable.
#include "ryu/common.h"

// ABSL avoids uint128_t on Win32 even if __SIZEOF_INT128__ is defined.
// Let's do the same for now.
#if defined(__SIZEOF_INT128__) && !defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS)
#define HAS_UINT128
#elif defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS) && defined(_M_X64)
#define HAS_64_BIT_INTRINSICS
#endif

#if defined(HAS_UINT128)
// __uint128_t is the GCC/Clang built-in type signaled by __SIZEOF_INT128__.
typedef __uint128_t uint128_t;
#endif

#if defined(HAS_64_BIT_INTRINSICS)

#include <intrin.h>
static inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
return _umul128(a, b, productHi);
}
// Returns the lower 64 bits of (hi*2^64 + lo) >> dist, with 0 < dist < 64.
static inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
// For the __shiftright128 intrinsic, the shift value is always
// modulo 64.
// In the current implementation of the double-precision version
// of Ryu, the shift value is always < 64. (In the case
// RYU_OPTIMIZE_SIZE == 0, the shift value is in the range [49, 58].
// Otherwise in the range [2, 59].)
// However, this function is now also called by s2d, which requires supporting
// the larger shift range (TODO: what is the actual range?).
// The assertion below checks this in case a future change requires larger
// shift values; if that happens, this function needs to be adjusted.
assert(dist < 64);
return __shiftright128(lo, hi, (unsigned char) dist);
}
#else // defined(HAS_64_BIT_INTRINSICS)
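// Portable 64x64->128-bit multiplication built from four 32x32->64-bit partial
// products: writing a = aHi * 2^32 + aLo and b = bHi * 2^32 + bLo gives
//   a * b = b11 * 2^64 + (b01 + b10) * 2^32 + b00,
// where b00 = aLo*bLo, b01 = aLo*bHi, b10 = aHi*bLo, b11 = aHi*bHi. The carries
// from accumulating the two middle terms are folded into the high word.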
static inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
// The casts here help MSVC to avoid calls to the __allmul library function.
const uint32_t aLo = (uint32_t)a;
const uint32_t aHi = (uint32_t)(a >> 32);
const uint32_t bLo = (uint32_t)b;
const uint32_t bHi = (uint32_t)(b >> 32);
const uint64_t b00 = (uint64_t)aLo * bLo;
const uint64_t b01 = (uint64_t)aLo * bHi;
const uint64_t b10 = (uint64_t)aHi * bLo;
const uint64_t b11 = (uint64_t)aHi * bHi;
const uint32_t b00Lo = (uint32_t)b00;
const uint32_t b00Hi = (uint32_t)(b00 >> 32);
const uint64_t mid1 = b10 + b00Hi;
const uint32_t mid1Lo = (uint32_t)(mid1);
const uint32_t mid1Hi = (uint32_t)(mid1 >> 32);
const uint64_t mid2 = b01 + mid1Lo;
const uint32_t mid2Lo = (uint32_t)(mid2);
const uint32_t mid2Hi = (uint32_t)(mid2 >> 32);
const uint64_t pHi = b11 + mid1Hi + mid2Hi;
const uint64_t pLo = ((uint64_t)mid2Lo << 32) | b00Lo;
*productHi = pHi;
return pLo;
}
static inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
// We don't need to handle the case dist >= 64 here (see above).
assert(dist < 64);
assert(dist > 0);
return (hi << (64 - dist)) | (lo >> dist);
}
#endif // defined(HAS_64_BIT_INTRINSICS)
#if defined(RYU_32_BIT_PLATFORM)
// Returns the high 64 bits of the 128-bit product of a and b.
static inline uint64_t umulh(const uint64_t a, const uint64_t b) {
// Reuse the umul128 implementation.
// Optimizers will likely eliminate the instructions used to compute the
// low part of the product.
uint64_t hi;
umul128(a, b, &hi);
return hi;
}
// On 32-bit platforms, compilers typically generate calls to library
// functions for 64-bit divisions, even if the divisor is a constant.
//
// E.g.:
// https://bugs.llvm.org/show_bug.cgi?id=37932
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=17958
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=37443
//
// The functions here perform division by a constant using multiplications,
// just as 64-bit compilers would.
//
// NB:
// The multipliers and shift values are the ones generated by clang x64
// for expressions like x/5, x/10, etc.
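//
// For example, 0xCCCCCCCCCCCCCCCD == ceil(2^66 / 5), so for any 64-bit x
// umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 2 == (x * 0xCCCCCCCCCCCCCCCD) >> 66 == x / 5,
// and shifting by one more bit divides by 10 instead. div100 and div1e9 first
// shift x right by the power-of-two factor (100 == 4 * 25, 10^9 == 512 * 1953125)
// so that only the odd factor needs a reciprocal multiplier.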
static inline uint64_t div5(const uint64_t x) {
return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 2;
}
static inline uint64_t div10(const uint64_t x) {
return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 3;
}
static inline uint64_t div100(const uint64_t x) {
return umulh(x >> 2, 0x28F5C28F5C28F5C3u) >> 2;
}
static inline uint64_t div1e8(const uint64_t x) {
return umulh(x, 0xABCC77118461CEFDu) >> 26;
}
static inline uint64_t div1e9(const uint64_t x) {
return umulh(x >> 9, 0x44B82FA09B5A53u) >> 11;
}
static inline uint32_t mod1e9(const uint64_t x) {
// Avoid 64-bit math as much as possible.
// Returning (uint32_t) (x - 1000000000 * div1e9(x)) would
// perform 32x64-bit multiplication and 64-bit subtraction.
// x and 1000000000 * div1e9(x) are guaranteed to differ by
// less than 10^9, so their highest 32 bits must be identical,
// so we can truncate both sides to uint32_t before subtracting.
// We can also simplify (uint32_t) (1000000000 * div1e9(x)).
// We can truncate before multiplying instead of after, as multiplying
// the highest 32 bits of div1e9(x) can't affect the lowest 32 bits.
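// Example: for x == 2^32, div1e9(x) == 4, and the 32-bit computation below
// yields 0u - 4000000000u == 294967296 (mod 2^32), which equals
// 2^32 - 4 * 10^9, the correct remainder.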
return ((uint32_t) x) - 1000000000 * ((uint32_t) div1e9(x));
}
#else // defined(RYU_32_BIT_PLATFORM)
static inline uint64_t div5(const uint64_t x) {
return x / 5;
}
static inline uint64_t div10(const uint64_t x) {
return x / 10;
}
static inline uint64_t div100(const uint64_t x) {
return x / 100;
}
static inline uint64_t div1e8(const uint64_t x) {
return x / 100000000;
}
static inline uint64_t div1e9(const uint64_t x) {
return x / 1000000000;
}
static inline uint32_t mod1e9(const uint64_t x) {
return (uint32_t) (x - 1000000000 * div1e9(x));
}
#endif // defined(RYU_32_BIT_PLATFORM)
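// Returns the number of times 5 divides value, i.e. the largest q such that
// 5^q divides value. Requires value != 0. For example, pow5Factor(125) == 3
// and pow5Factor(12) == 0.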
static inline uint32_t pow5Factor(uint64_t value) {
uint32_t count = 0;
for (;;) {
assert(value != 0);
const uint64_t q = div5(value);
const uint32_t r = ((uint32_t) value) - 5 * ((uint32_t) q);
if (r != 0) {
break;
}
value = q;
++count;
}
return count;
}
// Returns true if value is divisible by 5^p.
static inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) {
// I tried a case distinction on p, but there was no performance difference.
return pow5Factor(value) >= p;
}
// Returns true if value is divisible by 2^p.
static inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) {
assert(value != 0);
assert(p < 64);
// __builtin_ctzll doesn't appear to be faster here.
return (value & ((1ull << p) - 1)) == 0;
}
// We need a 64x128-bit multiplication and a subsequent 128-bit shift.
// Multiplication:
// The 64-bit factor is variable and passed in, the 128-bit factor comes
// from a lookup table. We know that the 64-bit factor only has 55
// significant bits (i.e., the 9 topmost bits are zeros). The 128-bit
// factor only has 124 significant bits (i.e., the 4 topmost bits are
// zeros).
// Shift:
// In principle, the multiplication result requires 55 + 124 = 179 bits to
// represent. However, we then shift this value to the right by j, which is
// at least j >= 115, so the result is guaranteed to fit into 179 - 115 = 64
// bits. This means that we only need the topmost 64 significant bits of
// the 64x128-bit multiplication.
//
// There are several ways to do this:
// 1. Best case: the compiler exposes a 128-bit type.
// We perform two 64x64-bit multiplications, add the higher 64 bits of the
// lower result to the higher result, and shift by j - 64 bits.
//
// We explicitly cast from 64-bit to 128-bit, so the compiler can tell
// that these are only 64-bit inputs, and can map these to the best
// possible sequence of assembly instructions.
// x64 machines happen to have matching assembly instructions for
// 64x64-bit multiplications and 128-bit shifts.
//
// 2. Second best case: the compiler exposes intrinsics for the x64 assembly
// instructions mentioned in 1.
//
// 3. We only have 64x64 bit instructions that return the lower 64 bits of
// the result, i.e., we have to use plain C.
// Our inputs are less than the full width, so we have three options:
// a. Ignore this fact and just implement the intrinsics manually.
// b. Split both into 31-bit pieces, which guarantees no internal overflow,
// but requires extra work upfront (unless we change the lookup table).
// c. Split only the first factor into 31-bit pieces, which also guarantees
// no internal overflow, but requires extra work since the intermediate
// results are not perfectly aligned.
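//
// The code below covers case 1 (HAS_UINT128) and case 2 (HAS_64_BIT_INTRINSICS)
// directly; the final fallback corresponds to option 3a, reusing the portable
// umul128 defined above.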
#if defined(HAS_UINT128)
// Best case: use 128-bit type.
static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
const uint128_t b0 = ((uint128_t) m) * mul[0];
const uint128_t b2 = ((uint128_t) m) * mul[1];
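// (b0 >> 64) + b2 equals the full product shifted right by 64 bits, so it
// occupies at most 179 - 64 = 115 bits (see above) and cannot overflow the
// 128-bit type.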
return (uint64_t) (((b0 >> 64) + b2) >> (j - 64));
}
static inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,
uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
// m <<= 2;
// uint128_t b0 = ((uint128_t) m) * mul[0]; // 0
// uint128_t b2 = ((uint128_t) m) * mul[1]; // 64
//
// uint128_t hi = (b0 >> 64) + b2;
// uint128_t lo = b0 & 0xffffffffffffffffull;
// uint128_t factor = (((uint128_t) mul[1]) << 64) + mul[0];
// uint128_t vpLo = lo + (factor << 1);
// *vp = (uint64_t) ((hi + (vpLo >> 64)) >> (j - 64));
// uint128_t vmLo = lo - (factor << mmShift);
// *vm = (uint64_t) ((hi + (vmLo >> 64) - (((uint128_t) 1ull) << 64)) >> (j - 64));
// return (uint64_t) (hi >> (j - 64));
*vp = mulShift64(4 * m + 2, mul, j);
*vm = mulShift64(4 * m - 1 - mmShift, mul, j);
return mulShift64(4 * m, mul, j);
}
#elif defined(HAS_64_BIT_INTRINSICS)
static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
// m is maximum 55 bits
uint64_t high1; // 128
const uint64_t low1 = umul128(m, mul[1], &high1); // 64
uint64_t high0; // 64
umul128(m, mul[0], &high0); // 0
const uint64_t sum = high0 + low1;
if (sum < high0) {
++high1; // overflow into high1
}
return shiftright128(sum, high1, j - 64);
}
static inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,
uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
*vp = mulShift64(4 * m + 2, mul, j);
*vm = mulShift64(4 * m - 1 - mmShift, mul, j);
return mulShift64(4 * m, mul, j);
}
#else // !defined(HAS_UINT128) && !defined(HAS_64_BIT_INTRINSICS)
static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
// m is maximum 55 bits
uint64_t high1; // 128
const uint64_t low1 = umul128(m, mul[1], &high1); // 64
uint64_t high0; // 64
umul128(m, mul[0], &high0); // 0
const uint64_t sum = high0 + low1;
if (sum < high0) {
++high1; // overflow into high1
}
return shiftright128(sum, high1, j - 64);
}
// This is faster if we don't have a 64x64->128-bit multiplication.
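// It computes the 192-bit product (2 * m) * mul once and derives all three
// results from it: shifting that product right by (j - 64 - 1) equals shifting
// (4 * m) * mul right by (j - 64); adding the 128-bit factor before the shift
// gives (4 * m + 2) * mul for vp, and subtracting it (after doubling once more
// when mmShift == 0) gives the (4 * m - 1 - mmShift) * mul needed for vm.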
static inline uint64_t mulShiftAll64(uint64_t m, const uint64_t* const mul, const int32_t j,
uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
m <<= 1;
// m is maximum 55 bits
uint64_t tmp;
const uint64_t lo = umul128(m, mul[0], &tmp);
uint64_t hi;
const uint64_t mid = tmp + umul128(m, mul[1], &hi);
hi += mid < tmp; // overflow into hi
const uint64_t lo2 = lo + mul[0];
const uint64_t mid2 = mid + mul[1] + (lo2 < lo);
const uint64_t hi2 = hi + (mid2 < mid);
*vp = shiftright128(mid2, hi2, (uint32_t) (j - 64 - 1));
if (mmShift == 1) {
const uint64_t lo3 = lo - mul[0];
const uint64_t mid3 = mid - mul[1] - (lo3 > lo);
const uint64_t hi3 = hi - (mid3 > mid);
*vm = shiftright128(mid3, hi3, (uint32_t) (j - 64 - 1));
} else {
const uint64_t lo3 = lo + lo;
const uint64_t mid3 = mid + mid + (lo3 < lo);
const uint64_t hi3 = hi + hi + (mid3 < mid);
const uint64_t lo4 = lo3 - mul[0];
const uint64_t mid4 = mid3 - mul[1] - (lo4 > lo3);
const uint64_t hi4 = hi3 - (mid4 > mid3);
*vm = shiftright128(mid4, hi4, (uint32_t) (j - 64));
}
return shiftright128(mid, hi, (uint32_t) (j - 64 - 1));
}
#endif // HAS_64_BIT_INTRINSICS
#endif // RYU_D2S_INTRINSICS_H