detail/ryu/detail/d2s_intrinsics.hpp

1

2

//

3

// The contents of this file may be used under the terms of the Apache License,

4

// Version 2.0.

5

//

6

// (See accompanying file LICENSE-Apache or copy at

7

// http://www.apache.org/licenses/LICENSE-2.0)

8

//

9

// Alternatively, the contents of this file may be used under the terms of

10

// the Boost Software License, Version 1.0.

11

// (See accompanying file LICENSE-Boost or copy at

12

// https://www.boost.org/LICENSE_1_0.txt)

13

//

14

// Unless required by applicable law or agreed to in writing, this software

15

// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

16

// KIND, either express or implied.

17

18

/*

19

This is a derivative work

20

*/

21

22

#ifndef BOOST_JSON_DETAIL_RYU_DETAIL_D2S_INTRINSICS_HPP

23

#define BOOST_JSON_DETAIL_RYU_DETAIL_D2S_INTRINSICS_HPP

24

25

#include <boost/json/detail/config.hpp>

26

27

// This sets BOOST_JSON_RYU_32_BIT_PLATFORM as a side effect if applicable.

28

#include <boost/json/detail/ryu/detail/common.hpp>

29

30

#if defined(BOOST_JSON_RYU_HAS_64_BIT_INTRINSICS)

31

#include <intrin.h>

32

#endif

33

34

namespace boost {

35

namespace json {

36

namespace detail {

37

38

namespace ryu {

39

namespace detail {

40

41

#if defined(BOOST_JSON_RYU_HAS_64_BIT_INTRINSICS)

42

43

inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {

44

return _umul128(a, b, productHi);

45

}

46

47

inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {

48

// For the __shiftright128 intrinsic, the shift value is always

49

// modulo 64.

50

// In the current implementation of the double-precision version

51

// of Ryu, the shift value is always < 64. (In the case

52

// RYU_OPTIMIZE_SIZE == 0, the shift value is in the range [49, 58].

53

// Otherwise in the range [2, 59].)

54

// Check this here in case a future change requires larger shift

55

// values. In this case this function needs to be adjusted.

56

BOOST_ASSERT(dist < 64);

57

return __shiftright128(lo, hi, (unsigned char) dist);

58

}

59

60

#else // defined(HAS_64_BIT_INTRINSICS)

61

62

inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {

63

// The casts here help MSVC to avoid calls to the __allmul library function.

64

const uint32_t aLo = (uint32_t)a;

65

const uint32_t aHi = (uint32_t)(a >> 32);

66

const uint32_t bLo = (uint32_t)b;

67

const uint32_t bHi = (uint32_t)(b >> 32);

68

69

const uint64_t b00 = (uint64_t)aLo * bLo;

70

const uint64_t b01 = (uint64_t)aLo * bHi;

71

const uint64_t b10 = (uint64_t)aHi * bLo;

72

const uint64_t b11 = (uint64_t)aHi * bHi;

73

74

const uint32_t b00Lo = (uint32_t)b00;

75

const uint32_t b00Hi = (uint32_t)(b00 >> 32);

76

77

const uint64_t mid1 = b10 + b00Hi;

78

const uint32_t mid1Lo = (uint32_t)(mid1);

79

const uint32_t mid1Hi = (uint32_t)(mid1 >> 32);

80

81

const uint64_t mid2 = b01 + mid1Lo;

82

const uint32_t mid2Lo = (uint32_t)(mid2);

83

const uint32_t mid2Hi = (uint32_t)(mid2 >> 32);

84

85

const uint64_t pHi = b11 + mid1Hi + mid2Hi;

86

const uint64_t pLo = ((uint64_t)mid2Lo << 32) | b00Lo;

87

88

*productHi = pHi;

89

return pLo;

90

}

91

92

inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {

93

// We don't need to handle the case dist >= 64 here (see above).

94

BOOST_ASSERT(dist < 64);

95

#if defined(RYU_OPTIMIZE_SIZE) || !defined(RYU_32_BIT_PLATFORM)

96

BOOST_ASSERT(dist > 0);

97

return (hi << (64 - dist)) | (lo >> dist);

98

#else

99

// Avoid a 64-bit shift by taking advantage of the range of shift values.

100

BOOST_ASSERT(dist >= 32);

101

return (hi << (64 - dist)) | ((uint32_t)(lo >> 32) >> (dist - 32));

102

#endif

103

}

104

105

#endif // defined(HAS_64_BIT_INTRINSICS)

106

107

#ifdef RYU_32_BIT_PLATFORM

108

109

// Returns the high 64 bits of the 128-bit product of a and b.

110

inline uint64_t umulh(const uint64_t a, const uint64_t b) {

111

// Reuse the umul128 implementation.

112

// Optimizers will likely eliminate the instructions used to compute the

113

// low part of the product.

114

uint64_t hi;

115

umul128(a, b, &hi);

116

return hi;

117

}

118

119

// On 32-bit platforms, compilers typically generate calls to library

120

// functions for 64-bit divisions, even if the divisor is a constant.

121

//

122

// E.g.:

123

// https://bugs.llvm.org/show_bug.cgi?id=37932

124

// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=17958

125

// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=37443

126

//

127

// The functions here perform division-by-constant using multiplications

128

// in the same way as 64-bit compilers would do.

129

//

130

// NB:

131

// The multipliers and shift values are the ones generated by clang x64

132

// for expressions like x/5, x/10, etc.

133

134

inline uint64_t div5(const uint64_t x) {

135

return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 2;

136

}

137

138

inline uint64_t div10(const uint64_t x) {

139

return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 3;

140

}

141

142

inline uint64_t div100(const uint64_t x) {

143

return umulh(x >> 2, 0x28F5C28F5C28F5C3u) >> 2;

144

}

145

146

inline uint64_t div1e8(const uint64_t x) {

147

return umulh(x, 0xABCC77118461CEFDu) >> 26;

148

}

149

150

inline uint64_t div1e9(const uint64_t x) {

151

return umulh(x >> 9, 0x44B82FA09B5A53u) >> 11;

152

}

153

154

inline uint32_t mod1e9(const uint64_t x) {

155

// Avoid 64-bit math as much as possible.

156

// Returning (uint32_t) (x - 1000000000 * div1e9(x)) would

157

// perform 32x64-bit multiplication and 64-bit subtraction.

158

// x and 1000000000 * div1e9(x) are guaranteed to differ by

159

// less than 10^9, so their highest 32 bits must be identical,

160

// so we can truncate both sides to uint32_t before subtracting.

161

// We can also simplify (uint32_t) (1000000000 * div1e9(x)).

162

// We can truncate before multiplying instead of after, as multiplying

163

// the highest 32 bits of div1e9(x) can't affect the lowest 32 bits.

164

return ((uint32_t) x) - 1000000000 * ((uint32_t) div1e9(x));

165

}

166

167

#else // RYU_32_BIT_PLATFORM

168

169

1974

inline uint64_t div5(const uint64_t x) {

170

1974

return x / 5;

171

}

172

173

12198

inline uint64_t div10(const uint64_t x) {

174

12198

return x / 10;

175

}

176

177

497

inline uint64_t div100(const uint64_t x) {

178

497

return x / 100;

179

}

180

181

59

inline uint64_t div1e8(const uint64_t x) {

182

59

return x / 100000000;

183

}

184

185

17

inline uint64_t div1e9(const uint64_t x) {

186

17

return x / 1000000000;

187

}

188

189

17

inline uint32_t mod1e9(const uint64_t x) {

190

17

return (uint32_t) (x - 1000000000 * div1e9(x));

191

}

192

193

#endif // RYU_32_BIT_PLATFORM

194

195

114

inline uint32_t pow5Factor(uint64_t value) {

196

114

uint32_t count = 0;

197

for (;;) {

198

1/2

1860

BOOST_ASSERT(value != 0);

199

1860

const uint64_t q = div5(value);

200

1860

const uint32_t r = ((uint32_t) value) - 5 * ((uint32_t) q);

201

2/2

1860

if (r != 0) {

202

114

break;

203

}

204

1746

value = q;

205

1746

++count;

206

1746

}

207

114

return count;

208

}

209

210

// Returns true if value is divisible by 5^p.

211

114

inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) {

212

// I tried a case distinction on p, but there was no performance difference.

213

114

return pow5Factor(value) >= p;

214

}

215

216

// Returns true if value is divisible by 2^p.

217

96

inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) {

218

1/2

96

BOOST_ASSERT(value != 0);

219

// return __builtin_ctzll(value) >= p;

220

96

return (value & ((1ull << p) - 1)) == 0;

221

}

222

223

224

} // detail

225

} // ryu

226

227

} // detail

228

} // namespace json

229

} // namespace boost

230

231

#endif

232

Line	Branch	Hits	Source Code
1			// Copyright 2018 Ulf Adams
2			//
3			// The contents of this file may be used under the terms of the Apache License,
4			// Version 2.0.
5			//
6			// (See accompanying file LICENSE-Apache or copy at
7			// http://www.apache.org/licenses/LICENSE-2.0)
8			//
9			// Alternatively, the contents of this file may be used under the terms of
10			// the Boost Software License, Version 1.0.
11			// (See accompanying file LICENSE-Boost or copy at
12			// https://www.boost.org/LICENSE_1_0.txt)
13			//
14			// Unless required by applicable law or agreed to in writing, this software
15			// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16			// KIND, either express or implied.
17
18			/*
19			This is a derivative work
20			*/
21
22			#ifndef BOOST_JSON_DETAIL_RYU_DETAIL_D2S_INTRINSICS_HPP
23			#define BOOST_JSON_DETAIL_RYU_DETAIL_D2S_INTRINSICS_HPP
24
25			#include <boost/json/detail/config.hpp>
26
27			// This sets BOOST_JSON_RYU_32_BIT_PLATFORM as a side effect if applicable.
28			#include <boost/json/detail/ryu/detail/common.hpp>
29
30			#if defined(BOOST_JSON_RYU_HAS_64_BIT_INTRINSICS)
31			#include <intrin.h>
32			#endif
33
34			namespace boost {
35			namespace json {
36			namespace detail {
37
38			namespace ryu {
39			namespace detail {
40
41			#if defined(BOOST_JSON_RYU_HAS_64_BIT_INTRINSICS)
42
43			inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
44			return _umul128(a, b, productHi);
45			}
46
47			inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
48			// For the __shiftright128 intrinsic, the shift value is always
49			// modulo 64.
50			// In the current implementation of the double-precision version
51			// of Ryu, the shift value is always < 64. (In the case
52			// RYU_OPTIMIZE_SIZE == 0, the shift value is in the range [49, 58].
53			// Otherwise in the range [2, 59].)
54			// Check this here in case a future change requires larger shift
55			// values. In this case this function needs to be adjusted.
56			BOOST_ASSERT(dist < 64);
57			return __shiftright128(lo, hi, (unsigned char) dist);
58			}
59
60			#else // defined(HAS_64_BIT_INTRINSICS)
61
62			inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
63			// The casts here help MSVC to avoid calls to the __allmul library function.
64			const uint32_t aLo = (uint32_t)a;
65			const uint32_t aHi = (uint32_t)(a >> 32);
66			const uint32_t bLo = (uint32_t)b;
67			const uint32_t bHi = (uint32_t)(b >> 32);
68
69			const uint64_t b00 = (uint64_t)aLo * bLo;
70			const uint64_t b01 = (uint64_t)aLo * bHi;
71			const uint64_t b10 = (uint64_t)aHi * bLo;
72			const uint64_t b11 = (uint64_t)aHi * bHi;
73
74			const uint32_t b00Lo = (uint32_t)b00;
75			const uint32_t b00Hi = (uint32_t)(b00 >> 32);
76
77			const uint64_t mid1 = b10 + b00Hi;
78			const uint32_t mid1Lo = (uint32_t)(mid1);
79			const uint32_t mid1Hi = (uint32_t)(mid1 >> 32);
80
81			const uint64_t mid2 = b01 + mid1Lo;
82			const uint32_t mid2Lo = (uint32_t)(mid2);
83			const uint32_t mid2Hi = (uint32_t)(mid2 >> 32);
84
85			const uint64_t pHi = b11 + mid1Hi + mid2Hi;
86			const uint64_t pLo = ((uint64_t)mid2Lo << 32) \| b00Lo;
87
88			*productHi = pHi;
89			return pLo;
90			}
91
92			inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
93			// We don't need to handle the case dist >= 64 here (see above).
94			BOOST_ASSERT(dist < 64);
95			#if defined(RYU_OPTIMIZE_SIZE) \|\| !defined(RYU_32_BIT_PLATFORM)
96			BOOST_ASSERT(dist > 0);
97			return (hi << (64 - dist)) \| (lo >> dist);
98			#else
99			// Avoid a 64-bit shift by taking advantage of the range of shift values.
100			BOOST_ASSERT(dist >= 32);
101			return (hi << (64 - dist)) \| ((uint32_t)(lo >> 32) >> (dist - 32));
102			#endif
103			}
104
105			#endif // defined(HAS_64_BIT_INTRINSICS)
106
107			#ifdef RYU_32_BIT_PLATFORM
108
109			// Returns the high 64 bits of the 128-bit product of a and b.
110			inline uint64_t umulh(const uint64_t a, const uint64_t b) {
111			// Reuse the umul128 implementation.
112			// Optimizers will likely eliminate the instructions used to compute the
113			// low part of the product.
114			uint64_t hi;
115			umul128(a, b, &hi);
116			return hi;
117			}
118
119			// On 32-bit platforms, compilers typically generate calls to library
120			// functions for 64-bit divisions, even if the divisor is a constant.
121			//
122			// E.g.:
123			// https://bugs.llvm.org/show_bug.cgi?id=37932
124			// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=17958
125			// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=37443
126			//
127			// The functions here perform division-by-constant using multiplications
128			// in the same way as 64-bit compilers would do.
129			//
130			// NB:
131			// The multipliers and shift values are the ones generated by clang x64
132			// for expressions like x/5, x/10, etc.
133
134			inline uint64_t div5(const uint64_t x) {
135			return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 2;
136			}
137
138			inline uint64_t div10(const uint64_t x) {
139			return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 3;
140			}
141
142			inline uint64_t div100(const uint64_t x) {
143			return umulh(x >> 2, 0x28F5C28F5C28F5C3u) >> 2;
144			}
145
146			inline uint64_t div1e8(const uint64_t x) {
147			return umulh(x, 0xABCC77118461CEFDu) >> 26;
148			}
149
150			inline uint64_t div1e9(const uint64_t x) {
151			return umulh(x >> 9, 0x44B82FA09B5A53u) >> 11;
152			}
153
154			inline uint32_t mod1e9(const uint64_t x) {
155			// Avoid 64-bit math as much as possible.
156			// Returning (uint32_t) (x - 1000000000 * div1e9(x)) would
157			// perform 32x64-bit multiplication and 64-bit subtraction.
158			// x and 1000000000 * div1e9(x) are guaranteed to differ by
159			// less than 10^9, so their highest 32 bits must be identical,
160			// so we can truncate both sides to uint32_t before subtracting.
161			// We can also simplify (uint32_t) (1000000000 * div1e9(x)).
162			// We can truncate before multiplying instead of after, as multiplying
163			// the highest 32 bits of div1e9(x) can't affect the lowest 32 bits.
164			return ((uint32_t) x) - 1000000000 * ((uint32_t) div1e9(x));
165			}
166
167			#else // RYU_32_BIT_PLATFORM
168
169		1974	inline uint64_t div5(const uint64_t x) {
170		1974	return x / 5;
171			}
172
173		12198	inline uint64_t div10(const uint64_t x) {
174		12198	return x / 10;
175			}
176
177		497	inline uint64_t div100(const uint64_t x) {
178		497	return x / 100;
179			}
180
181		59	inline uint64_t div1e8(const uint64_t x) {
182		59	return x / 100000000;
183			}
184
185		17	inline uint64_t div1e9(const uint64_t x) {
186		17	return x / 1000000000;
187			}
188
189		17	inline uint32_t mod1e9(const uint64_t x) {
190		17	return (uint32_t) (x - 1000000000 * div1e9(x));
191			}
192
193			#endif // RYU_32_BIT_PLATFORM
194
195		114	inline uint32_t pow5Factor(uint64_t value) {
196		114	uint32_t count = 0;
197			for (;;) {
198	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 1860 times.	1860	BOOST_ASSERT(value != 0);
199		1860	const uint64_t q = div5(value);
200		1860	const uint32_t r = ((uint32_t) value) - 5 * ((uint32_t) q);
201	2/2 ✓ Branch 0 taken 114 times. ✓ Branch 1 taken 1746 times.	1860	if (r != 0) {
202		114	break;
203			}
204		1746	value = q;
205		1746	++count;
206		1746	}
207		114	return count;
208			}
209
210			// Returns true if value is divisible by 5^p.
211		114	inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) {
212			// I tried a case distinction on p, but there was no performance difference.
213		114	return pow5Factor(value) >= p;
214			}
215
216			// Returns true if value is divisible by 2^p.
217		96	inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) {
218	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 96 times.	96	BOOST_ASSERT(value != 0);
219			// return __builtin_ctzll(value) >= p;
220		96	return (value & ((1ull << p) - 1)) == 0;
221			}
222
223
224			} // detail
225			} // ryu
226
227			} // detail
228			} // namespace json
229			} // namespace boost
230
231			#endif
232