| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // Copyright 2018 Ulf Adams | ||
| 2 | // | ||
| 3 | // The contents of this file may be used under the terms of the Apache License, | ||
| 4 | // Version 2.0. | ||
| 5 | // | ||
| 6 | // (See accompanying file LICENSE-Apache or copy at | ||
| 7 | // http://www.apache.org/licenses/LICENSE-2.0) | ||
| 8 | // | ||
| 9 | // Alternatively, the contents of this file may be used under the terms of | ||
| 10 | // the Boost Software License, Version 1.0. | ||
| 11 | // (See accompanying file LICENSE-Boost or copy at | ||
| 12 | // https://www.boost.org/LICENSE_1_0.txt) | ||
| 13 | // | ||
| 14 | // Unless required by applicable law or agreed to in writing, this software | ||
| 15 | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| 16 | // KIND, either express or implied. | ||
| 17 | |||
| 18 | /* | ||
| 19 | This is a derivative work | ||
| 20 | */ | ||
| 21 | |||
| 22 | #ifndef BOOST_JSON_DETAIL_RYU_DETAIL_D2S_INTRINSICS_HPP | ||
| 23 | #define BOOST_JSON_DETAIL_RYU_DETAIL_D2S_INTRINSICS_HPP | ||
| 24 | |||
| 25 | #include <boost/json/detail/config.hpp> | ||
| 26 | |||
| 27 | // This sets BOOST_JSON_RYU_32_BIT_PLATFORM as a side effect if applicable. | ||
| 28 | #include <boost/json/detail/ryu/detail/common.hpp> | ||
| 29 | |||
| 30 | #if defined(BOOST_JSON_RYU_HAS_64_BIT_INTRINSICS) | ||
| 31 | #include <intrin.h> | ||
| 32 | #endif | ||
| 33 | |||
| 34 | namespace boost { | ||
| 35 | namespace json { | ||
| 36 | namespace detail { | ||
| 37 | |||
| 38 | namespace ryu { | ||
| 39 | namespace detail { | ||
| 40 | |||
| 41 | #if defined(BOOST_JSON_RYU_HAS_64_BIT_INTRINSICS) | ||
| 42 | |||
| 43 | inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) { | ||
| 44 | return _umul128(a, b, productHi); | ||
| 45 | } | ||
| 46 | |||
| 47 | inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) { | ||
| 48 | // For the __shiftright128 intrinsic, the shift value is always | ||
| 49 | // modulo 64. | ||
| 50 | // In the current implementation of the double-precision version | ||
| 51 | // of Ryu, the shift value is always < 64. (In the case | ||
| 52 | // RYU_OPTIMIZE_SIZE == 0, the shift value is in the range [49, 58]. | ||
| 53 | // Otherwise in the range [2, 59].) | ||
| 54 | // Check this here in case a future change requires larger shift | ||
| 55 | // values. In this case this function needs to be adjusted. | ||
| 56 | BOOST_ASSERT(dist < 64); | ||
| 57 | return __shiftright128(lo, hi, (unsigned char) dist); | ||
| 58 | } | ||
| 59 | |||
| 60 | #else // defined(HAS_64_BIT_INTRINSICS) | ||
| 61 | |||
| 62 | inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) { | ||
| 63 | // The casts here help MSVC to avoid calls to the __allmul library function. | ||
| 64 | const uint32_t aLo = (uint32_t)a; | ||
| 65 | const uint32_t aHi = (uint32_t)(a >> 32); | ||
| 66 | const uint32_t bLo = (uint32_t)b; | ||
| 67 | const uint32_t bHi = (uint32_t)(b >> 32); | ||
| 68 | |||
| 69 | const uint64_t b00 = (uint64_t)aLo * bLo; | ||
| 70 | const uint64_t b01 = (uint64_t)aLo * bHi; | ||
| 71 | const uint64_t b10 = (uint64_t)aHi * bLo; | ||
| 72 | const uint64_t b11 = (uint64_t)aHi * bHi; | ||
| 73 | |||
| 74 | const uint32_t b00Lo = (uint32_t)b00; | ||
| 75 | const uint32_t b00Hi = (uint32_t)(b00 >> 32); | ||
| 76 | |||
| 77 | const uint64_t mid1 = b10 + b00Hi; | ||
| 78 | const uint32_t mid1Lo = (uint32_t)(mid1); | ||
| 79 | const uint32_t mid1Hi = (uint32_t)(mid1 >> 32); | ||
| 80 | |||
| 81 | const uint64_t mid2 = b01 + mid1Lo; | ||
| 82 | const uint32_t mid2Lo = (uint32_t)(mid2); | ||
| 83 | const uint32_t mid2Hi = (uint32_t)(mid2 >> 32); | ||
| 84 | |||
| 85 | const uint64_t pHi = b11 + mid1Hi + mid2Hi; | ||
| 86 | const uint64_t pLo = ((uint64_t)mid2Lo << 32) | b00Lo; | ||
| 87 | |||
| 88 | *productHi = pHi; | ||
| 89 | return pLo; | ||
| 90 | } | ||
| 91 | |||
| 92 | inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) { | ||
| 93 | // We don't need to handle the case dist >= 64 here (see above). | ||
| 94 | BOOST_ASSERT(dist < 64); | ||
| 95 | #if defined(RYU_OPTIMIZE_SIZE) || !defined(RYU_32_BIT_PLATFORM) | ||
| 96 | BOOST_ASSERT(dist > 0); | ||
| 97 | return (hi << (64 - dist)) | (lo >> dist); | ||
| 98 | #else | ||
| 99 | // Avoid a 64-bit shift by taking advantage of the range of shift values. | ||
| 100 | BOOST_ASSERT(dist >= 32); | ||
| 101 | return (hi << (64 - dist)) | ((uint32_t)(lo >> 32) >> (dist - 32)); | ||
| 102 | #endif | ||
| 103 | } | ||
| 104 | |||
| 105 | #endif // defined(HAS_64_BIT_INTRINSICS) | ||
| 106 | |||
| 107 | #ifdef RYU_32_BIT_PLATFORM | ||
| 108 | |||
| 109 | // Returns the high 64 bits of the 128-bit product of a and b. | ||
| 110 | inline uint64_t umulh(const uint64_t a, const uint64_t b) { | ||
| 111 | // Reuse the umul128 implementation. | ||
| 112 | // Optimizers will likely eliminate the instructions used to compute the | ||
| 113 | // low part of the product. | ||
| 114 | uint64_t hi; | ||
| 115 | umul128(a, b, &hi); | ||
| 116 | return hi; | ||
| 117 | } | ||
| 118 | |||
| 119 | // On 32-bit platforms, compilers typically generate calls to library | ||
| 120 | // functions for 64-bit divisions, even if the divisor is a constant. | ||
| 121 | // | ||
| 122 | // E.g.: | ||
| 123 | // https://bugs.llvm.org/show_bug.cgi?id=37932 | ||
| 124 | // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=17958 | ||
| 125 | // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=37443 | ||
| 126 | // | ||
| 127 | // The functions here perform division-by-constant using multiplications | ||
| 128 | // in the same way as 64-bit compilers would do. | ||
| 129 | // | ||
| 130 | // NB: | ||
| 131 | // The multipliers and shift values are the ones generated by clang x64 | ||
| 132 | // for expressions like x/5, x/10, etc. | ||
| 133 | |||
| 134 | inline uint64_t div5(const uint64_t x) { | ||
| 135 | return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 2; | ||
| 136 | } | ||
| 137 | |||
| 138 | inline uint64_t div10(const uint64_t x) { | ||
| 139 | return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 3; | ||
| 140 | } | ||
| 141 | |||
| 142 | inline uint64_t div100(const uint64_t x) { | ||
| 143 | return umulh(x >> 2, 0x28F5C28F5C28F5C3u) >> 2; | ||
| 144 | } | ||
| 145 | |||
| 146 | inline uint64_t div1e8(const uint64_t x) { | ||
| 147 | return umulh(x, 0xABCC77118461CEFDu) >> 26; | ||
| 148 | } | ||
| 149 | |||
| 150 | inline uint64_t div1e9(const uint64_t x) { | ||
| 151 | return umulh(x >> 9, 0x44B82FA09B5A53u) >> 11; | ||
| 152 | } | ||
| 153 | |||
| 154 | inline uint32_t mod1e9(const uint64_t x) { | ||
| 155 | // Avoid 64-bit math as much as possible. | ||
| 156 | // Returning (uint32_t) (x - 1000000000 * div1e9(x)) would | ||
| 157 | // perform 32x64-bit multiplication and 64-bit subtraction. | ||
| 158 | // x and 1000000000 * div1e9(x) are guaranteed to differ by | ||
| 159 | // less than 10^9, so their highest 32 bits must be identical, | ||
| 160 | // so we can truncate both sides to uint32_t before subtracting. | ||
| 161 | // We can also simplify (uint32_t) (1000000000 * div1e9(x)). | ||
| 162 | // We can truncate before multiplying instead of after, as multiplying | ||
| 163 | // the highest 32 bits of div1e9(x) can't affect the lowest 32 bits. | ||
| 164 | return ((uint32_t) x) - 1000000000 * ((uint32_t) div1e9(x)); | ||
| 165 | } | ||
| 166 | |||
| 167 | #else // RYU_32_BIT_PLATFORM | ||
| 168 | |||
| 169 | 1974 | inline uint64_t div5(const uint64_t x) { | |
| 170 | 1974 | return x / 5; | |
| 171 | } | ||
| 172 | |||
| 173 | 12198 | inline uint64_t div10(const uint64_t x) { | |
| 174 | 12198 | return x / 10; | |
| 175 | } | ||
| 176 | |||
| 177 | 497 | inline uint64_t div100(const uint64_t x) { | |
| 178 | 497 | return x / 100; | |
| 179 | } | ||
| 180 | |||
| 181 | 59 | inline uint64_t div1e8(const uint64_t x) { | |
| 182 | 59 | return x / 100000000; | |
| 183 | } | ||
| 184 | |||
| 185 | 17 | inline uint64_t div1e9(const uint64_t x) { | |
| 186 | 17 | return x / 1000000000; | |
| 187 | } | ||
| 188 | |||
| 189 | 17 | inline uint32_t mod1e9(const uint64_t x) { | |
| 190 | 17 | return (uint32_t) (x - 1000000000 * div1e9(x)); | |
| 191 | } | ||
| 192 | |||
| 193 | #endif // RYU_32_BIT_PLATFORM | ||
| 194 | |||
| 195 | 114 | inline uint32_t pow5Factor(uint64_t value) { | |
| 196 | 114 | uint32_t count = 0; | |
| 197 | for (;;) { | ||
| 198 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1860 times.
|
1860 | BOOST_ASSERT(value != 0); |
| 199 | 1860 | const uint64_t q = div5(value); | |
| 200 | 1860 | const uint32_t r = ((uint32_t) value) - 5 * ((uint32_t) q); | |
| 201 |
2/2✓ Branch 0 taken 114 times.
✓ Branch 1 taken 1746 times.
|
1860 | if (r != 0) { |
| 202 | 114 | break; | |
| 203 | } | ||
| 204 | 1746 | value = q; | |
| 205 | 1746 | ++count; | |
| 206 | 1746 | } | |
| 207 | 114 | return count; | |
| 208 | } | ||
| 209 | |||
| 210 | // Returns true if value is divisible by 5^p. | ||
| 211 | 114 | inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) { | |
| 212 | // I tried a case distinction on p, but there was no performance difference. | ||
| 213 | 114 | return pow5Factor(value) >= p; | |
| 214 | } | ||
| 215 | |||
| 216 | // Returns true if value is divisible by 2^p. | ||
| 217 | 96 | inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) { | |
| 218 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 96 times.
|
96 | BOOST_ASSERT(value != 0); |
| 219 | // return __builtin_ctzll(value) >= p; | ||
| 220 | 96 | return (value & ((1ull << p) - 1)) == 0; | |
| 221 | } | ||
| 222 | |||
| 223 | |||
| 224 | } // detail | ||
| 225 | } // ryu | ||
| 226 | |||
| 227 | } // detail | ||
| 228 | } // namespace json | ||
| 229 | } // namespace boost | ||
| 230 | |||
| 231 | #endif | ||
| 232 |