-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Description
<charconv> is very fast, but it could be faster. One issue is that MSVC's optimizer often generates slower code for <charconv> than Clang/LLVM. We should create reduced test cases and file bugs on Developer Community for the optimizer team to investigate and fix.
At the bottom of this issue, I've prepared a self-contained test case using <charconv>. Here's a table of my timing results (Intel Core i7-8700, VS 2019 16.8 Preview 2, Clang/LLVM 10.0.0). The measured times are average nanoseconds per floating-point value.
| Scenario | MSVC x86 | LLVM x86 | LLVM x86 Speedup Ratio | MSVC x64 | LLVM x64 | LLVM x64 Speedup Ratio |
|---|---|---|---|---|---|---|
| float plain shortest | 48.3 | 45.5 | 1.06 | 41.9 | 35.7 | ⚠️ 1.17 |
| double plain shortest | 92.5 | 71.8 | ❌ 1.29 | 49.1 | 39.4 | ❌ 1.25 |
| float scientific shortest | 47.2 | 41.4 | ⚠️ 1.14 | 38.3 | 31.6 | ❌ 1.21 |
| double scientific shortest | 92.6 | 72.5 | ❌ 1.28 | 48.0 | 39.6 | ❌ 1.21 |
| float fixed shortest | 58.9 | 51.1 | ⚠️ 1.15 | 47.6 | 38.8 | ❌ 1.23 |
| double fixed shortest | 338.5 | 298.4 | ⚠️ 1.13 | 145.9 | 111.6 | ❌ 1.31 |
| float general shortest | 48.8 | 42.5 | ⚠️ 1.15 | 39.3 | 32.5 | ❌ 1.21 |
| double general shortest | 92.9 | 72.4 | ❌ 1.28 | 48.6 | 38.5 | ❌ 1.26 |
| float hex shortest | 25.8 | 26.4 | ✔️ 0.98 | 21.0 | 23.8 | ✔️ 0.88 |
| double hex shortest | 109.3 | 53.8 | 💥 2.03 | 27.7 | 26.6 | 1.04 |
| float scientific 8 | 116.5 | 102.3 | ⚠️ 1.14 | 63.8 | 58.0 | ⚠️ 1.10 |
| double scientific 16 | 145.5 | 128.7 | ⚠️ 1.13 | 77.5 | 65.1 | ⚠️ 1.19 |
| float fixed 6 (lossy) | 83.8 | 79.3 | 1.06 | 45.2 | 41.2 | ⚠️ 1.10 |
| double fixed 6 (lossy) | 294.7 | 270.6 | 1.09 | 121.4 | 94.1 | ❌ 1.29 |
| float general 9 | 134.5 | 118.8 | ⚠️ 1.13 | 79.5 | 70.7 | ⚠️ 1.12 |
| double general 17 | 176.2 | 153.8 | ⚠️ 1.15 | 97.6 | 81.4 | ❌ 1.20 |
| float hex 6 | 25.7 | 21.2 | ❌ 1.21 | 21.0 | 17.4 | ❌ 1.21 |
| double hex 13 | 103.0 | 26.4 | 💥 3.90 | 26.9 | 30.6 | ✔️ 0.88 |
The "LLVM Speedup Ratio" is the MSVC time divided by the LLVM time, so higher values mean that MSVC is slower than what LLVM has proven is possible on this hardware. I've marked values >= 2.0 with 💥, values >= 1.2 with ❌, values >= 1.1 with ⚠️, and values < 1.0 (where MSVC is faster than LLVM) with ✔️.
There are at least two separate issues here:
- x86 `double hex` 💥. This codepath performs bitwise operations with `uint64_t`. Something terrible is happening here, causing code that should be ultra-fast to instead be extremely slow (relatively speaking). This seems specific to the integer type being wider than the machine's preferred width. The good news is that because this codepath is so simple, it should be easy to extract into a minimal test case, and because the issue is so severe, the cause will hopefully be easy for the optimizer devs to find and fix. The functions are (lines 2079 to 2432 at commit b90b3e0):
template <class _Floating> _NODISCARD to_chars_result _Floating_to_chars_hex_precision( char* _First, char* const _Last, const _Floating _Value, int _Precision) noexcept { // * Determine the effective _Precision. // * Later, we'll decrement _Precision when printing each hexit after the decimal point. // The hexits after the decimal point correspond to the explicitly stored fraction bits. // float explicitly stores 23 fraction bits. 23 / 4 == 5.75, which is 6 hexits. // double explicitly stores 52 fraction bits. 52 / 4 == 13, which is 13 hexits. constexpr int _Full_precision = is_same_v<_Floating, float> ? 6 : 13; constexpr int _Adjusted_explicit_bits = _Full_precision * 4; if (_Precision < 0) { // C11 7.21.6.1 "The fprintf function"/5: "A negative precision argument is taken as if the precision were // omitted." /8: "if the precision is missing and FLT_RADIX is a power of 2, then the precision is sufficient // for an exact representation of the value" _Precision = _Full_precision; } // * Extract the _Ieee_mantissa and _Ieee_exponent. using _Traits = _Floating_type_traits<_Floating>; using _Uint_type = typename _Traits::_Uint_type; const _Uint_type _Uint_value = _Bit_cast<_Uint_type>(_Value); const _Uint_type _Ieee_mantissa = _Uint_value & _Traits::_Denormal_mantissa_mask; const int32_t _Ieee_exponent = static_cast<int32_t>(_Uint_value >> _Traits::_Exponent_shift); // * Prepare the _Adjusted_mantissa. This is aligned to hexit boundaries, // * with the implicit bit restored (0 for zero values and subnormal values, 1 for normal values). // * Also calculate the _Unbiased_exponent. This unifies the processing of zero, subnormal, and normal values. 
_Uint_type _Adjusted_mantissa; if constexpr (is_same_v<_Floating, float>) { _Adjusted_mantissa = _Ieee_mantissa << 1; // align to hexit boundary (23 isn't divisible by 4) } else { _Adjusted_mantissa = _Ieee_mantissa; // already aligned (52 is divisible by 4) } int32_t _Unbiased_exponent; if (_Ieee_exponent == 0) { // zero or subnormal // implicit bit is 0 if (_Ieee_mantissa == 0) { // zero // C11 7.21.6.1 "The fprintf function"/8: "If the value is zero, the exponent is zero." _Unbiased_exponent = 0; } else { // subnormal _Unbiased_exponent = 1 - _Traits::_Exponent_bias; } } else { // normal _Adjusted_mantissa |= _Uint_type{1} << _Adjusted_explicit_bits; // implicit bit is 1 _Unbiased_exponent = _Ieee_exponent - _Traits::_Exponent_bias; } // _Unbiased_exponent is within [-126, 127] for float, [-1022, 1023] for double. // * Decompose _Unbiased_exponent into _Sign_character and _Absolute_exponent. char _Sign_character; uint32_t _Absolute_exponent; if (_Unbiased_exponent < 0) { _Sign_character = '-'; _Absolute_exponent = static_cast<uint32_t>(-_Unbiased_exponent); } else { _Sign_character = '+'; _Absolute_exponent = static_cast<uint32_t>(_Unbiased_exponent); } // _Absolute_exponent is within [0, 127] for float, [0, 1023] for double. // * Perform a single bounds check. { int32_t _Exponent_length; if (_Absolute_exponent < 10) { _Exponent_length = 1; } else if (_Absolute_exponent < 100) { _Exponent_length = 2; } else if constexpr (is_same_v<_Floating, float>) { _Exponent_length = 3; } else if (_Absolute_exponent < 1000) { _Exponent_length = 3; } else { _Exponent_length = 4; } // _Precision might be enormous; avoid integer overflow by testing it separately. 
ptrdiff_t _Buffer_size = _Last - _First; if (_Buffer_size < _Precision) { return {_Last, errc::value_too_large}; } _Buffer_size -= _Precision; const int32_t _Length_excluding_precision = 1 // leading hexit + static_cast<int32_t>(_Precision > 0) // possible decimal point // excluding `+ _Precision`, hexits after decimal point + 2 // "p+" or "p-" + _Exponent_length; // exponent if (_Buffer_size < _Length_excluding_precision) { return {_Last, errc::value_too_large}; } } // * Perform rounding when we've been asked to omit hexits. if (_Precision < _Full_precision) { // _Precision is within [0, 5] for float, [0, 12] for double. // _Dropped_bits is within [4, 24] for float, [4, 52] for double. const int _Dropped_bits = (_Full_precision - _Precision) * 4; // Perform rounding by adding an appropriately-shifted bit. // This can propagate carries all the way into the leading hexit. Examples: // "0.ff9" rounded to a precision of 2 is "1.00". // "1.ff9" rounded to a precision of 2 is "2.00". // Note that the leading hexit participates in the rounding decision. Examples: // "0.8" rounded to a precision of 0 is "0". // "1.8" rounded to a precision of 0 is "2". // Reference implementation with suboptimal codegen: // const bool _Lsb_bit = (_Adjusted_mantissa & (_Uint_type{1} << _Dropped_bits)) != 0; // const bool _Round_bit = (_Adjusted_mantissa & (_Uint_type{1} << (_Dropped_bits - 1))) != 0; // const bool _Has_tail_bits = (_Adjusted_mantissa & ((_Uint_type{1} << (_Dropped_bits - 1)) - 1)) != 0; // const bool _Should_round = _Should_round_up(_Lsb_bit, _Round_bit, _Has_tail_bits); // _Adjusted_mantissa += _Uint_type{_Should_round} << _Dropped_bits; // Example for optimized implementation: Let _Dropped_bits be 8. // Bit index: ...[8]76543210 // _Adjusted_mantissa: ...[L]RTTTTTTT (not depicting known details, like hexit alignment) // By focusing on the bit at index _Dropped_bits, we can avoid unnecessary branching and shifting. 
// Bit index: ...[8]76543210 // _Lsb_bit: ...[L]RTTTTTTT const _Uint_type _Lsb_bit = _Adjusted_mantissa; // Bit index: ...9[8]76543210 // _Round_bit: ...L[R]TTTTTTT0 const _Uint_type _Round_bit = _Adjusted_mantissa << 1; // We can detect (without branching) whether any of the trailing bits are set. // Due to _Should_round below, this computation will be used if and only if R is 1, so we can assume that here. // Bit index: ...9[8]76543210 // _Round_bit: ...L[1]TTTTTTT0 // _Has_tail_bits: ....[H]........ // If all of the trailing bits T are 0, then `_Round_bit - 1` will produce 0 for H (due to R being 1). // If any of the trailing bits T are 1, then `_Round_bit - 1` will produce 1 for H (due to R being 1). const _Uint_type _Has_tail_bits = _Round_bit - 1; // Finally, we can use _Should_round_up() logic with bitwise-AND and bitwise-OR, // selecting just the bit at index _Dropped_bits. This is the appropriately-shifted bit that we want. const _Uint_type _Should_round = _Round_bit & (_Has_tail_bits | _Lsb_bit) & (_Uint_type{1} << _Dropped_bits); // This rounding technique is dedicated to the memory of Peppermint. =^..^= _Adjusted_mantissa += _Should_round; } // * Print the leading hexit, then mask it away. { const uint32_t _Nibble = static_cast<uint32_t>(_Adjusted_mantissa >> _Adjusted_explicit_bits); _STL_INTERNAL_CHECK(_Nibble < 3); const char _Leading_hexit = static_cast<char>('0' + _Nibble); *_First++ = _Leading_hexit; constexpr _Uint_type _Mask = (_Uint_type{1} << _Adjusted_explicit_bits) - 1; _Adjusted_mantissa &= _Mask; } // * Print the decimal point and trailing hexits. // C11 7.21.6.1 "The fprintf function"/8: // "if the precision is zero and the # flag is not specified, no decimal-point character appears." 
if (_Precision > 0) { *_First++ = '.'; int32_t _Number_of_bits_remaining = _Adjusted_explicit_bits; // 24 for float, 52 for double for (;;) { _STL_INTERNAL_CHECK(_Number_of_bits_remaining >= 4); _STL_INTERNAL_CHECK(_Number_of_bits_remaining % 4 == 0); _Number_of_bits_remaining -= 4; const uint32_t _Nibble = static_cast<uint32_t>(_Adjusted_mantissa >> _Number_of_bits_remaining); _STL_INTERNAL_CHECK(_Nibble < 16); const char _Hexit = _Charconv_digits[_Nibble]; *_First++ = _Hexit; // _Precision is the number of hexits that still need to be printed. --_Precision; if (_Precision == 0) { break; // We're completely done with this phase. } // Otherwise, we need to keep printing hexits. if (_Number_of_bits_remaining == 0) { // We've finished printing _Adjusted_mantissa, so all remaining hexits are '0'. _CSTD memset(_First, '0', static_cast<size_t>(_Precision)); _First += _Precision; break; } // Mask away the hexit that we just printed, then keep looping. // (We skip this when breaking out of the loop above, because _Adjusted_mantissa isn't used later.) const _Uint_type _Mask = (_Uint_type{1} << _Number_of_bits_remaining) - 1; _Adjusted_mantissa &= _Mask; } } // * Print the exponent. // C11 7.21.6.1 "The fprintf function"/8: "The exponent always contains at least one digit, and only as many more // digits as necessary to represent the decimal exponent of 2." // Performance note: We should take advantage of the known ranges of possible exponents. *_First++ = 'p'; *_First++ = _Sign_character; // We've already printed '-' if necessary, so uint32_t _Absolute_exponent avoids testing that again. return _STD to_chars(_First, _Last, _Absolute_exponent); } template <class _Floating> _NODISCARD to_chars_result _Floating_to_chars_hex_shortest( char* _First, char* const _Last, const _Floating _Value) noexcept { // This prints "1.728p+0" instead of "2.e5p-1". // This prints "0.000002p-126" instead of "1p-149" for float. 
// This prints "0.0000000000001p-1022" instead of "1p-1074" for double. // This prioritizes being consistent with printf's de facto behavior (and hex-precision's behavior) // over minimizing the number of characters printed. using _Traits = _Floating_type_traits<_Floating>; using _Uint_type = typename _Traits::_Uint_type; const _Uint_type _Uint_value = _Bit_cast<_Uint_type>(_Value); if (_Uint_value == 0) { // zero detected; write "0p+0" and return // C11 7.21.6.1 "The fprintf function"/8: "If the value is zero, the exponent is zero." // Special-casing zero is necessary because of the exponent. const char* const _Str = "0p+0"; const size_t _Len = 4; if (_Last - _First < static_cast<ptrdiff_t>(_Len)) { return {_Last, errc::value_too_large}; } _CSTD memcpy(_First, _Str, _Len); return {_First + _Len, errc{}}; } const _Uint_type _Ieee_mantissa = _Uint_value & _Traits::_Denormal_mantissa_mask; const int32_t _Ieee_exponent = static_cast<int32_t>(_Uint_value >> _Traits::_Exponent_shift); char _Leading_hexit; // implicit bit int32_t _Unbiased_exponent; if (_Ieee_exponent == 0) { // subnormal _Leading_hexit = '0'; _Unbiased_exponent = 1 - _Traits::_Exponent_bias; } else { // normal _Leading_hexit = '1'; _Unbiased_exponent = _Ieee_exponent - _Traits::_Exponent_bias; } // Performance note: Consider avoiding per-character bounds checking when there's plenty of space. if (_First == _Last) { return {_Last, errc::value_too_large}; } *_First++ = _Leading_hexit; if (_Ieee_mantissa == 0) { // The fraction bits are all 0. Trim them away, including the decimal point. } else { if (_First == _Last) { return {_Last, errc::value_too_large}; } *_First++ = '.'; // The hexits after the decimal point correspond to the explicitly stored fraction bits. // float explicitly stores 23 fraction bits. 23 / 4 == 5.75, so we'll print at most 6 hexits. // double explicitly stores 52 fraction bits. 52 / 4 == 13, so we'll print at most 13 hexits. 
_Uint_type _Adjusted_mantissa; int32_t _Number_of_bits_remaining; if constexpr (is_same_v<_Floating, float>) { _Adjusted_mantissa = _Ieee_mantissa << 1; // align to hexit boundary (23 isn't divisible by 4) _Number_of_bits_remaining = 24; // 23 fraction bits + 1 alignment bit } else { _Adjusted_mantissa = _Ieee_mantissa; // already aligned (52 is divisible by 4) _Number_of_bits_remaining = 52; // 52 fraction bits } // do-while: The condition _Adjusted_mantissa != 0 is initially true - we have nonzero fraction bits and we've // printed the decimal point. Each iteration, we print a hexit, mask it away, and keep looping if we still have // nonzero fraction bits. If there would be trailing '0' hexits, this trims them. If there wouldn't be trailing // '0' hexits, the same condition works (as we print the final hexit and mask it away); we don't need to test // _Number_of_bits_remaining. do { _STL_INTERNAL_CHECK(_Number_of_bits_remaining >= 4); _STL_INTERNAL_CHECK(_Number_of_bits_remaining % 4 == 0); _Number_of_bits_remaining -= 4; const uint32_t _Nibble = static_cast<uint32_t>(_Adjusted_mantissa >> _Number_of_bits_remaining); _STL_INTERNAL_CHECK(_Nibble < 16); const char _Hexit = _Charconv_digits[_Nibble]; if (_First == _Last) { return {_Last, errc::value_too_large}; } *_First++ = _Hexit; const _Uint_type _Mask = (_Uint_type{1} << _Number_of_bits_remaining) - 1; _Adjusted_mantissa &= _Mask; } while (_Adjusted_mantissa != 0); } // C11 7.21.6.1 "The fprintf function"/8: "The exponent always contains at least one digit, and only as many more // digits as necessary to represent the decimal exponent of 2." // Performance note: We should take advantage of the known ranges of possible exponents. // float: _Unbiased_exponent is within [-126, 127]. // double: _Unbiased_exponent is within [-1022, 1023]. 
if (_Last - _First < 2) { return {_Last, errc::value_too_large}; } *_First++ = 'p'; if (_Unbiased_exponent < 0) { *_First++ = '-'; _Unbiased_exponent = -_Unbiased_exponent; } else { *_First++ = '+'; } // We've already printed '-' if necessary, so static_cast<uint32_t> avoids testing that again. return _STD to_chars(_First, _Last, static_cast<uint32_t>(_Unbiased_exponent)); } -
Decimal slowness, from moderate ⚠️ to significant ❌. These optimizer issues affect @ulfjack's amazing Ryu and Ryu Printf algorithms (used for the "shortest" and "precision" scenarios, respectively, although shortest does need to invoke Ryu Printf in a specific situation). When I worked on importing them to `<charconv>`, I reported as many optimizer issues as I could; MSVC (and LLVM) were able to fix some of them, and I was able to improve the source workarounds for other issues. However, there are still persistent issues with MSVC's optimizer.
Bonus issue: Conversely, MSVC appears to be generating noticeably faster code for the double hex 13 scenario on x64 ✔️. This indicates that LLVM has room for improvement; an issue could be reduced and reported to them.
<charconv> test case ("C1XX/C2" refers to the MSVC compiler front-end/back-end, like "Clang/LLVM"):
C:\Temp>type cppcon.cpp
// cl /std:c++17 /EHsc /nologo /W4 /MT /O2 cppcon.cpp && cppcon
// clang-cl /std:c++17 /EHsc /nologo /W4 /MT /O2 cppcon.cpp && cppcon
#include <charconv>
#include <chrono>
#include <random>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <system_error>
#include <type_traits>
#include <vector>
using namespace std;
using namespace std::chrono;
// Aborts the whole benchmark with a "FAIL" message when a checked condition is false.
void verify(const bool b) {
    if (b) {
        return; // condition holds; nothing to do
    }
    puts("FAIL");
    exit(EXIT_FAILURE);
}
// Formatting modes exercised by the benchmark. Lossy marks conversions that drop
// precision (e.g. fixed-6) and therefore skip the round-trip verification step.
enum class RoundTrip { Sci, Fix, Gen, Hex, Lossy };
constexpr size_t N = 2'000'000; // how many floating-point values to test
constexpr size_t K = 5; // how many times to repeat the test, for cleaner timing
constexpr size_t BufSize = 2'000; // more than enough
// Accumulates a value derived from every conversion's output so the optimizer
// cannot eliminate the timed to_chars() calls as dead code.
unsigned int global_dummy = 0;
// Maps a round-trip verification mode to the chars_format that from_chars() must
// use to parse the output back. Lossy has no round-trip format: asking for one is
// a benchmark bug, reported via "FAIL" and immediate termination.
constexpr chars_format chars_format_from_RoundTrip(const RoundTrip rt) {
    if (rt == RoundTrip::Sci) {
        return chars_format::scientific;
    }
    if (rt == RoundTrip::Fix) {
        return chars_format::fixed;
    }
    if (rt == RoundTrip::Gen) {
        return chars_format::general;
    }
    if (rt == RoundTrip::Hex) {
        return chars_format::hex;
    }
    // RoundTrip::Lossy (or any unexpected value) is a fatal error.
    puts("FAIL");
    exit(EXIT_FAILURE);
}
template <RoundTrip RT, typename Floating, typename... Args>
void test_to_chars(const char* const str, const vector<Floating>& vec, const Args&... args) {
char buf[BufSize];
const auto start = steady_clock::now();
for (size_t k = 0; k < K; ++k) {
for (const auto& elem : vec) {
const auto result = to_chars(buf, buf + BufSize, elem, args...);
global_dummy += static_cast<unsigned int>(result.ptr - buf);
global_dummy += static_cast<unsigned int>(buf[0]);
}
}
const auto finish = steady_clock::now();
printf("%6.1f ns | %s\n", duration<double, nano>{finish - start}.count() / (N * K), str);
for (const auto& elem : vec) {
const auto result = to_chars(buf, buf + BufSize, elem, args...);
verify(result.ec == errc{});
if constexpr (RT == RoundTrip::Lossy) {
// skip lossy conversions
} else {
Floating round_trip;
const auto from_result = from_chars(buf, result.ptr, round_trip, chars_format_from_RoundTrip(RT));
verify(from_result.ec == errc{});
verify(from_result.ptr == result.ptr);
verify(round_trip == elem);
}
}
}
// Benchmark driver: identifies the toolset, builds deterministic random
// float/double datasets (INF/NAN excluded), and runs every timing scenario.
int main() {
// Report which compiler front-end/back-end and architecture produced this binary.
#if defined(__clang__) && defined(_M_IX86)
    const char* const toolset = "Clang/LLVM x86 + MSVC STL";
#elif defined(__clang__) && defined(_M_X64)
    const char* const toolset = "Clang/LLVM x64 + MSVC STL";
#elif !defined(__clang__) && defined(_M_IX86)
    const char* const toolset = "C1XX/C2 x86 + MSVC STL";
#elif !defined(__clang__) && defined(_M_X64)
    const char* const toolset = "C1XX/C2 x64 + MSVC STL";
#else
    const char* const toolset = "Unknown Toolset";
#endif
    puts(toolset);

    vector<float> vec_flt;
    vector<double> vec_dbl;

    {
        // Default-constructed mt19937_64 is deterministically seeded, so every
        // toolset benchmarks exactly the same bit patterns.
        mt19937_64 mt64;

        vec_flt.reserve(N);
        while (vec_flt.size() < N) {
            const uint32_t val = static_cast<uint32_t>(mt64());
            constexpr uint32_t inf_nan = 0x7F800000U; // all float exponent bits set
            if ((val & inf_nan) == inf_nan) {
                continue; // skip INF/NAN
            }
            float flt;
            static_assert(sizeof(flt) == sizeof(val));
            memcpy(&flt, &val, sizeof(flt)); // bit-cast the random bits into a float
            vec_flt.push_back(flt);
        }

        vec_dbl.reserve(N);
        while (vec_dbl.size() < N) {
            const uint64_t val = mt64();
            constexpr uint64_t inf_nan = 0x7FF0000000000000ULL; // all double exponent bits set
            if ((val & inf_nan) == inf_nan) {
                continue; // skip INF/NAN
            }
            double dbl;
            static_assert(sizeof(dbl) == sizeof(val));
            memcpy(&dbl, &val, sizeof(dbl)); // bit-cast the random bits into a double
            vec_dbl.push_back(dbl);
        }
    }

    // Shortest round-trip scenarios.
    test_to_chars<RoundTrip::Gen>("STL float plain shortest", vec_flt);
    test_to_chars<RoundTrip::Gen>("STL double plain shortest", vec_dbl);
    test_to_chars<RoundTrip::Sci>("STL float scientific shortest", vec_flt, chars_format::scientific);
    test_to_chars<RoundTrip::Sci>("STL double scientific shortest", vec_dbl, chars_format::scientific);
    test_to_chars<RoundTrip::Fix>("STL float fixed shortest", vec_flt, chars_format::fixed);
    test_to_chars<RoundTrip::Fix>("STL double fixed shortest", vec_dbl, chars_format::fixed);
    test_to_chars<RoundTrip::Gen>("STL float general shortest", vec_flt, chars_format::general);
    test_to_chars<RoundTrip::Gen>("STL double general shortest", vec_dbl, chars_format::general);
    test_to_chars<RoundTrip::Hex>("STL float hex shortest", vec_flt, chars_format::hex);
    test_to_chars<RoundTrip::Hex>("STL double hex shortest", vec_dbl, chars_format::hex);

    // Explicit-precision scenarios (fixed-6 is lossy, so it skips round-trip checks).
    test_to_chars<RoundTrip::Sci>("STL float scientific 8", vec_flt, chars_format::scientific, 8);
    test_to_chars<RoundTrip::Sci>("STL double scientific 16", vec_dbl, chars_format::scientific, 16);
    test_to_chars<RoundTrip::Lossy>("STL float fixed 6 (lossy)", vec_flt, chars_format::fixed, 6);
    test_to_chars<RoundTrip::Lossy>("STL double fixed 6 (lossy)", vec_dbl, chars_format::fixed, 6);
    test_to_chars<RoundTrip::Gen>("STL float general 9", vec_flt, chars_format::general, 9);
    test_to_chars<RoundTrip::Gen>("STL double general 17", vec_dbl, chars_format::general, 17);
    test_to_chars<RoundTrip::Hex>("STL float hex 6", vec_flt, chars_format::hex, 6);
    test_to_chars<RoundTrip::Hex>("STL double hex 13", vec_dbl, chars_format::hex, 13);

    puts("----------");
    // Print the accumulator so its computation is observable (keeps the timed work live).
    printf("global_dummy: %u\n", global_dummy);
}