File Coverage

ryu_headers/d2s_intrinsics.h
Criterion     Covered   Total      %
statement           0      17    0.0
branch              0       8    0.0
condition         n/a
subroutine        n/a
pod               n/a
total               0      25    0.0


line stmt bran cond sub pod time code
1             // Copyright 2018 Ulf Adams
2             //
3             // The contents of this file may be used under the terms of the Apache License,
4             // Version 2.0.
5             //
6             // (See accompanying file LICENSE-Apache or copy at
7             // http://www.apache.org/licenses/LICENSE-2.0)
8             //
9             // Alternatively, the contents of this file may be used under the terms of
10             // the Boost Software License, Version 1.0.
11             // (See accompanying file LICENSE-Boost or copy at
12             // https://www.boost.org/LICENSE_1_0.txt)
13             //
14             // Unless required by applicable law or agreed to in writing, this software
15             // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16             // KIND, either express or implied.
17             #ifndef RYU_D2S_INTRINSICS_H
18             #define RYU_D2S_INTRINSICS_H
19              
20             #include <assert.h>
21             #include <stdint.h>
22              
23             // Defines RYU_32_BIT_PLATFORM if applicable.
24             #include "common.h"
25              
26             // ABSL avoids uint128_t on Win32 even if __SIZEOF_INT128__ is defined.
27             // Let's do the same for now.
28             #if defined(__SIZEOF_INT128__) && !defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS)
29             #define HAS_UINT128
30             #elif defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS) && defined(_M_X64)
31             #define HAS_64_BIT_INTRINSICS
32             #endif
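
The macros above mean that defining RYU_ONLY_64_BIT_OPS disables both fast paths. A minimal sketch of forcing the portable fallback from an including translation unit (my illustration, not part of the ryu sources):

    // Define RYU_ONLY_64_BIT_OPS before this header is processed (e.g. via
    // -DRYU_ONLY_64_BIT_OPS) so that neither HAS_UINT128 nor
    // HAS_64_BIT_INTRINSICS is defined and the plain C implementations of
    // umul128/shiftright128 below are selected.
    #define RYU_ONLY_64_BIT_OPS
    #include "d2s_intrinsics.h"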
33              
34             #if defined(HAS_UINT128)
35             typedef __uint128_t uint128_t;
36             #endif
37              
38             #if defined(HAS_64_BIT_INTRINSICS)
39              
40             #include <intrin.h>
41              
42             static inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
43             return _umul128(a, b, productHi);
44             }
45              
46             // Returns the lower 64 bits of (hi*2^64 + lo) >> dist, with 0 < dist < 64.
47             static inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
48             // For the __shiftright128 intrinsic, the shift value is always
49             // modulo 64.
50             // In the current implementation of the double-precision version
51             // of Ryu, the shift value is always < 64. (In the case
52             // RYU_OPTIMIZE_SIZE == 0, the shift value is in the range [49, 58].
53             // Otherwise in the range [2, 59].)
54             // However, this function is now also called by s2d, which requires supporting
55             // the larger shift range (TODO: what is the actual range?).
56             // Check this here in case a future change requires larger shift
57             // values. In this case this function needs to be adjusted.
58             assert(dist < 64);
59             return __shiftright128(lo, hi, (unsigned char) dist);
60             }
61              
62             #else // defined(HAS_64_BIT_INTRINSICS)
63              
64             static inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
65             // The casts here help MSVC to avoid calls to the __allmul library function.
66             const uint32_t aLo = (uint32_t)a;
67             const uint32_t aHi = (uint32_t)(a >> 32);
68             const uint32_t bLo = (uint32_t)b;
69             const uint32_t bHi = (uint32_t)(b >> 32);
70              
71             const uint64_t b00 = (uint64_t)aLo * bLo;
72             const uint64_t b01 = (uint64_t)aLo * bHi;
73             const uint64_t b10 = (uint64_t)aHi * bLo;
74             const uint64_t b11 = (uint64_t)aHi * bHi;
75              
76             const uint32_t b00Lo = (uint32_t)b00;
77             const uint32_t b00Hi = (uint32_t)(b00 >> 32);
78              
79             const uint64_t mid1 = b10 + b00Hi;
80             const uint32_t mid1Lo = (uint32_t)(mid1);
81             const uint32_t mid1Hi = (uint32_t)(mid1 >> 32);
82              
83             const uint64_t mid2 = b01 + mid1Lo;
84             const uint32_t mid2Lo = (uint32_t)(mid2);
85             const uint32_t mid2Hi = (uint32_t)(mid2 >> 32);
86              
87             const uint64_t pHi = b11 + mid1Hi + mid2Hi;
88             const uint64_t pLo = ((uint64_t)mid2Lo << 32) | b00Lo;
89              
90             *productHi = pHi;
91             return pLo;
92             }
93              
94             static inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
95             // We don't need to handle the case dist >= 64 here (see above).
96             assert(dist < 64);
97             assert(dist > 0);
98             return (hi << (64 - dist)) | (lo >> dist);
99             }
100              
101             #endif // defined(HAS_64_BIT_INTRINSICS)
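
Both umul128 variants compute the full 64x64->128-bit product, and shiftright128 returns the low 64 bits of the 128-bit value (hi, lo) shifted right by dist. A small self-contained check (my addition, assuming the ryu headers are on the include path; the __int128 reference is only used where the compiler provides it):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include "d2s_intrinsics.h"

    int main(void) {
      const uint64_t a = 0x0123456789ABCDEFull;
      const uint64_t b = 0xFEDCBA9876543210ull;
      uint64_t hi;
      const uint64_t lo = umul128(a, b, &hi);
    #if defined(__SIZEOF_INT128__)
      // Compare against a native 128-bit product where one is available.
      const unsigned __int128 p = (unsigned __int128) a * b;
      assert(lo == (uint64_t) p);
      assert(hi == (uint64_t) (p >> 64));
      assert(shiftright128(lo, hi, 17) == (uint64_t) (p >> 17));
    #endif
      printf("hi=%016llx lo=%016llx\n", (unsigned long long) hi, (unsigned long long) lo);
      return 0;
    }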
102              
103             #if defined(RYU_32_BIT_PLATFORM)
104              
105             // Returns the high 64 bits of the 128-bit product of a and b.
106             static inline uint64_t umulh(const uint64_t a, const uint64_t b) {
107             // Reuse the umul128 implementation.
108             // Optimizers will likely eliminate the instructions used to compute the
109             // low part of the product.
110             uint64_t hi;
111             umul128(a, b, &hi);
112             return hi;
113             }
114              
115             // On 32-bit platforms, compilers typically generate calls to library
116             // functions for 64-bit divisions, even if the divisor is a constant.
117             //
118             // E.g.:
119             // https://bugs.llvm.org/show_bug.cgi?id=37932
120             // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=17958
121             // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=37443
122             //
123             // The functions here perform division-by-constant using multiplications
124             // in the same way as 64-bit compilers would do.
125             //
126             // NB:
127             // The multipliers and shift values are the ones generated by clang x64
128             // for expressions like x/5, x/10, etc.
129              
130             static inline uint64_t div5(const uint64_t x) {
131             return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 2;
132             }
133              
134             static inline uint64_t div10(const uint64_t x) {
135             return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 3;
136             }
137              
138             static inline uint64_t div100(const uint64_t x) {
139             return umulh(x >> 2, 0x28F5C28F5C28F5C3u) >> 2;
140             }
141              
142             static inline uint64_t div1e8(const uint64_t x) {
143             return umulh(x, 0xABCC77118461CEFDu) >> 26;
144             }
145              
146             static inline uint64_t div1e9(const uint64_t x) {
147             return umulh(x >> 9, 0x44B82FA09B5A53u) >> 11;
148             }
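
The magic constants above can be spot-checked against plain 64-bit division; on 64-bit platforms the functions below reduce to plain division anyway, so the same test covers either path. A small sketch (my addition, assuming the ryu headers are on the include path):

    #include <assert.h>
    #include <stdint.h>
    #include "d2s_intrinsics.h"

    static void check(const uint64_t x) {
      // Each multiply-and-shift division must agree with the plain quotient.
      assert(div5(x) == x / 5);
      assert(div10(x) == x / 10);
      assert(div100(x) == x / 100);
      assert(div1e8(x) == x / 100000000);
      assert(div1e9(x) == x / 1000000000);
    }

    int main(void) {
      check(0);
      check(1234567890123456789ull);
      check(UINT64_MAX);
      return 0;
    }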
149              
150             static inline uint32_t mod1e9(const uint64_t x) {
151             // Avoid 64-bit math as much as possible.
152             // Returning (uint32_t) (x - 1000000000 * div1e9(x)) would
153             // perform 32x64-bit multiplication and 64-bit subtraction.
154             // x and 1000000000 * div1e9(x) are guaranteed to differ by
155             // less than 10^9, so their difference fits in 32 bits, and
156             // truncating both sides to uint32_t preserves it (mod 2^32).
157             // We can also simplify (uint32_t) (1000000000 * div1e9(x)).
158             // We can truncate before multiplying instead of after, as multiplying
159             // the highest 32 bits of div1e9(x) can't affect the lowest 32 bits.
160             return ((uint32_t) x) - 1000000000 * ((uint32_t) div1e9(x));
161             }
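
A concrete instance of this reasoning (my addition): the true difference always fits in 32 bits, so the 32-bit subtraction wraps around to exactly that difference even when the two upper halves are not equal.

    #include <assert.h>
    #include <stdint.h>
    #include "d2s_intrinsics.h"

    int main(void) {
      const uint64_t x = 8589934599ull;   // 2^33 + 7
      // div1e9(x) = 8, and x - 8 * 10^9 = 589934599 < 10^9 fits in 32 bits,
      // so subtracting the truncated operands (mod 2^32) gives the remainder.
      assert(div1e9(x) == 8);
      assert(mod1e9(x) == 589934599u);
      assert(mod1e9(x) == (uint32_t) (x % 1000000000));
      return 0;
    }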
162              
163             #else // defined(RYU_32_BIT_PLATFORM)
164              
165             static inline uint64_t div5(const uint64_t x) {
166             return x / 5;
167             }
168              
169             static inline uint64_t div10(const uint64_t x) {
170             return x / 10;
171             }
172              
173             static inline uint64_t div100(const uint64_t x) {
174             return x / 100;
175             }
176              
177             static inline uint64_t div1e8(const uint64_t x) {
178             return x / 100000000;
179             }
180              
181             static inline uint64_t div1e9(const uint64_t x) {
182             return x / 1000000000;
183             }
184              
185             static inline uint32_t mod1e9(const uint64_t x) {
186             return (uint32_t) (x - 1000000000 * div1e9(x));
187             }
188              
189             #endif // defined(RYU_32_BIT_PLATFORM)
190              
191 0           static inline uint32_t pow5Factor(uint64_t value) {
192 0           const uint64_t m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64)
193 0           const uint64_t n_div_5 = 3689348814741910323u; // #{ n < 2^64 | n = 0 (mod 5) } = 2^64 / 5
194 0           uint32_t count = 0;
195             for (;;) {
196 0 0         assert(value != 0);
197 0           value *= m_inv_5;
198 0 0         if (value > n_div_5)
199 0           break;
200 0           ++count;
201 0           }
202 0           return count;
203             }
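
The loop works because multiplying by m_inv_5 maps a multiple 5*k to k (mod 2^64), so the comparison against n_div_5 = floor(2^64 / 5) is exactly a divisibility-by-5 test, and each surviving iteration has already divided one factor of 5 out of value. A short sketch exercising this and the multipleOfPowerOf5 wrapper (my addition, assuming the ryu headers are on the include path):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include "d2s_intrinsics.h"

    int main(void) {
      assert(pow5Factor(1) == 0);
      assert(pow5Factor(10) == 1);           // 10 = 2 * 5
      assert(pow5Factor(48828125ull) == 11); // 5^11
      assert(multipleOfPowerOf5(250, 3));    // 250 = 2 * 5^3
      assert(!multipleOfPowerOf5(250, 4));
      return 0;
    }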
204              
205             // Returns true if value is divisible by 5^p.
206 0           static inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) {
207             // I tried a case distinction on p, but there was no performance difference.
208 0           return pow5Factor(value) >= p;
209             }
210              
211             // Returns true if value is divisible by 2^p.
212 0           static inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) {
213 0 0         assert(value != 0);
214 0 0         assert(p < 64);
215             // __builtin_ctzll doesn't appear to be faster here.
216 0           return (value & ((1ull << p) - 1)) == 0;
217             }
218              
219             // We need a 64x128-bit multiplication and a subsequent 128-bit shift.
220             // Multiplication:
221             // The 64-bit factor is variable and passed in, the 128-bit factor comes
222             // from a lookup table. We know that the 64-bit factor only has 55
223             // significant bits (i.e., the 9 topmost bits are zeros). The 128-bit
224             // factor only has 124 significant bits (i.e., the 4 topmost bits are
225             // zeros).
226             // Shift:
227             // In principle, the multiplication result requires 55 + 124 = 179 bits to
228             // represent. However, we then shift this value to the right by j, which is
229             // at least j >= 115, so the result is guaranteed to fit into 179 - 115 = 64
230             // bits. This means that we only need the topmost 64 significant bits of
231             // the 64x128-bit multiplication.
232             //
233             // There are several ways to do this:
234             // 1. Best case: the compiler exposes a 128-bit type.
235             // We perform two 64x64-bit multiplications, add the higher 64 bits of the
236             // lower result to the higher result, and shift by j - 64 bits.
237             //
238             // We explicitly cast from 64-bit to 128-bit, so the compiler can tell
239             // that these are only 64-bit inputs, and can map these to the best
240             // possible sequence of assembly instructions.
241             // x64 machines happen to have matching assembly instructions for
242             // 64x64-bit multiplications and 128-bit shifts.
243             //
244             // 2. Second best case: the compiler exposes intrinsics for the x64 assembly
245             // instructions mentioned in 1.
246             //
247             // 3. We only have 64x64 bit instructions that return the lower 64 bits of
248             // the result, i.e., we have to use plain C.
249             // Our inputs are less than the full width, so we have three options:
250             // a. Ignore this fact and just implement the intrinsics manually.
251             // b. Split both into 31-bit pieces, which guarantees no internal overflow,
252             // but requires extra work upfront (unless we change the lookup table).
253             // c. Split only the first factor into 31-bit pieces, which also guarantees
254             // no internal overflow, but requires extra work since the intermediate
255             // results are not perfectly aligned.
256             #if defined(HAS_UINT128)
257              
258             // Best case: use 128-bit type.
259             static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
260             const uint128_t b0 = ((uint128_t) m) * mul[0];
261             const uint128_t b2 = ((uint128_t) m) * mul[1];
262             return (uint64_t) (((b0 >> 64) + b2) >> (j - 64));
263             }
264              
265             static inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,
266             uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
267             // m <<= 2;
268             // uint128_t b0 = ((uint128_t) m) * mul[0]; // 0
269             // uint128_t b2 = ((uint128_t) m) * mul[1]; // 64
270             //
271             // uint128_t hi = (b0 >> 64) + b2;
272             // uint128_t lo = b0 & 0xffffffffffffffffull;
273             // uint128_t factor = (((uint128_t) mul[1]) << 64) + mul[0];
274             // uint128_t vpLo = lo + (factor << 1);
275             // *vp = (uint64_t) ((hi + (vpLo >> 64)) >> (j - 64));
276             // uint128_t vmLo = lo - (factor << mmShift);
277             // *vm = (uint64_t) ((hi + (vmLo >> 64) - (((uint128_t) 1ull) << 64)) >> (j - 64));
278             // return (uint64_t) (hi >> (j - 64));
279             *vp = mulShift64(4 * m + 2, mul, j);
280             *vm = mulShift64(4 * m - 1 - mmShift, mul, j);
281             return mulShift64(4 * m, mul, j);
282             }
283              
284             #elif defined(HAS_64_BIT_INTRINSICS)
285              
286             static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
287             // m is maximum 55 bits
288             uint64_t high1; // 128
289             const uint64_t low1 = umul128(m, mul[1], &high1); // 64
290             uint64_t high0; // 64
291             umul128(m, mul[0], &high0); // 0
292             const uint64_t sum = high0 + low1;
293             if (sum < high0) {
294             ++high1; // overflow into high1
295             }
296             return shiftright128(sum, high1, j - 64);
297             }
298              
299             static inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,
300             uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
301             *vp = mulShift64(4 * m + 2, mul, j);
302             *vm = mulShift64(4 * m - 1 - mmShift, mul, j);
303             return mulShift64(4 * m, mul, j);
304             }
305              
306             #else // !defined(HAS_UINT128) && !defined(HAS_64_BIT_INTRINSICS)
307              
308             static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
309             // m is maximum 55 bits
310             uint64_t high1; // 128
311             const uint64_t low1 = umul128(m, mul[1], &high1); // 64
312             uint64_t high0; // 64
313             umul128(m, mul[0], &high0); // 0
314             const uint64_t sum = high0 + low1;
315             if (sum < high0) {
316             ++high1; // overflow into high1
317             }
318             return shiftright128(sum, high1, j - 64);
319             }
320              
321             // This is faster if we don't have a 64x64->128-bit multiplication.
322             static inline uint64_t mulShiftAll64(uint64_t m, const uint64_t* const mul, const int32_t j,
323             uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
324             m <<= 1;
325             // m is maximum 55 bits
326             uint64_t tmp;
327             const uint64_t lo = umul128(m, mul[0], &tmp);
328             uint64_t hi;
329             const uint64_t mid = tmp + umul128(m, mul[1], &hi);
330             hi += mid < tmp; // overflow into hi
331              
332             const uint64_t lo2 = lo + mul[0];
333             const uint64_t mid2 = mid + mul[1] + (lo2 < lo);
334             const uint64_t hi2 = hi + (mid2 < mid);
335             *vp = shiftright128(mid2, hi2, (uint32_t) (j - 64 - 1));
336              
337             if (mmShift == 1) {
338             const uint64_t lo3 = lo - mul[0];
339             const uint64_t mid3 = mid - mul[1] - (lo3 > lo);
340             const uint64_t hi3 = hi - (mid3 > mid);
341             *vm = shiftright128(mid3, hi3, (uint32_t) (j - 64 - 1));
342             } else {
343             const uint64_t lo3 = lo + lo;
344             const uint64_t mid3 = mid + mid + (lo3 < lo);
345             const uint64_t hi3 = hi + hi + (mid3 < mid);
346             const uint64_t lo4 = lo3 - mul[0];
347             const uint64_t mid4 = mid3 - mul[1] - (lo4 > lo3);
348             const uint64_t hi4 = hi3 - (mid4 > mid3);
349             *vm = shiftright128(mid4, hi4, (uint32_t) (j - 64));
350             }
351              
352             return shiftright128(mid, hi, (uint32_t) (j - 64 - 1));
353             }
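
A note on the scaling (my reading of the code, not a comment from the ryu sources): because m is doubled here instead of quadrupled, (lo, mid, hi) holds the 192-bit value 2m*M, where M is the 128-bit table factor, and every shift is one smaller than in the other variants. The three results therefore agree with the 4*m, 4*m + 2, and 4*m - 1 - mmShift arguments used in the other mulShiftAll64 implementations:

    (2m*M)     >> (j - 1) == (4m*M)       >> j    (return value)
    (2m*M + M) >> (j - 1) == ((4m + 2)*M) >> j    (*vp)
    (2m*M - M) >> (j - 1) == ((4m - 2)*M) >> j    (*vm, mmShift == 1)
    (4m*M - M) >>  j      == ((4m - 1)*M) >> j    (*vm, mmShift == 0)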
354              
355             #endif // HAS_64_BIT_INTRINSICS
356              
357             #endif // RYU_D2S_INTRINSICS_H