diff options
Diffstat (limited to 'absl/strings/internal/charconv_parse.cc')
-rw-r--r-- | absl/strings/internal/charconv_parse.cc | 496 |
1 files changed, 496 insertions, 0 deletions
diff --git a/absl/strings/internal/charconv_parse.cc b/absl/strings/internal/charconv_parse.cc new file mode 100644 index 000000000000..a04cc67669a7 --- /dev/null +++ b/absl/strings/internal/charconv_parse.cc @@ -0,0 +1,496 @@ +// Copyright 2018 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/strings/internal/charconv_parse.h" +#include "absl/strings/charconv.h" + +#include <cassert> +#include <cstdint> +#include <limits> + +#include "absl/strings/internal/memutil.h" + +namespace absl { +namespace { + +// ParseFloat<10> will read the first 19 significant digits of the mantissa. +// This number was chosen for multiple reasons. +// +// (a) First, for whatever integer type we choose to represent the mantissa, we +// want to choose the largest possible number of decimal digits for that integer +// type. We are using uint64_t, which can express any 19-digit unsigned +// integer. +// +// (b) Second, we need to parse enough digits that the binary value of any +// mantissa we capture has more bits of resolution than the mantissa +// representation in the target float. Our algorithm requires at least 3 bits +// of headway, but 19 decimal digits give a little more than that. +// +// The following static assertions verify the above comments: +constexpr int kDecimalMantissaDigitsMax = 19; + +static_assert(std::numeric_limits<uint64_t>::digits10 == + kDecimalMantissaDigitsMax, + "(a) above"); + +// IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa. +static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed"); +static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact"); +static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact"); + +// The lowest valued 19-digit decimal mantissa we can read still contains +// sufficient information to reconstruct a binary mantissa. +static_assert(1000000000000000000u > (uint64_t(1) << (53 + 3)), "(b) above"); + +// ParseFloat<16> will read the first 15 significant digits of the mantissa. +// +// Because a base-16-to-base-2 conversion can be done exactly, we do not need +// to maximize the number of scanned hex digits to improve our conversion. What +// is required is to scan two more bits than the mantissa can represent, so that +// we always round correctly. +// +// (One extra bit does not suffice to perform correct rounding, since a number +// exactly halfway between two representable floats has unique rounding rules, +// so we need to differentiate between a "halfway between" number and a "closer +// to the larger value" number.) +constexpr int kHexadecimalMantissaDigitsMax = 15; + +// The minimum number of significant bits that will be read from +// kHexadecimalMantissaDigitsMax hex digits. We must subtract by three, since +// the most significant digit can be a "1", which only contributes a single +// significant bit. +constexpr int kGuaranteedHexadecimalMantissaBitPrecision = + 4 * kHexadecimalMantissaDigitsMax - 3; + +static_assert(kGuaranteedHexadecimalMantissaBitPrecision > + std::numeric_limits<double>::digits + 2, + "kHexadecimalMantissaDigitsMax too small"); + +// We also impose a limit on the number of significant digits we will read from +// an exponent, to avoid having to deal with integer overflow. We use 9 for +// this purpose. +// +// If we read a 9 digit exponent, the end result of the conversion will +// necessarily be infinity or zero, depending on the sign of the exponent. +// Therefore we can just drop extra digits on the floor without any extra +// logic. +constexpr int kDecimalExponentDigitsMax = 9; +static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax, + "int type too small"); + +// To avoid incredibly large inputs causing integer overflow for our exponent, +// we impose an arbitrary but very large limit on the number of significant +// digits we will accept. The implementation refuses to match a std::string with +// more consecutive significant mantissa digits than this. +constexpr int kDecimalDigitLimit = 50000000; + +// Corresponding limit for hexadecimal digit inputs. This is one fourth the +// amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires +// a binary exponent adjustment of 4. +constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4; + +// The largest exponent we can read is 999999999 (per +// kDecimalExponentDigitsMax), and the largest exponent adjustment we can get +// from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these +// comfortably fits in an integer. +// +// We count kDecimalDigitLimit twice because there are independent limits for +// numbers before and after the decimal point. (In the case where there are no +// significant digits before the decimal point, there are independent limits for +// post-decimal-point leading zeroes and for significant digits.) +static_assert(999999999 + 2 * kDecimalDigitLimit < + std::numeric_limits<int>::max(), + "int type too small"); +static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) < + std::numeric_limits<int>::max(), + "int type too small"); + +// Returns true if the provided bitfield allows parsing an exponent value +// (e.g., "1.5e100"). +bool AllowExponent(chars_format flags) { + bool fixed = (flags & chars_format::fixed) == chars_format::fixed; + bool scientific = + (flags & chars_format::scientific) == chars_format::scientific; + return scientific || !fixed; +} + +// Returns true if the provided bitfield requires an exponent value be present. +bool RequireExponent(chars_format flags) { + bool fixed = (flags & chars_format::fixed) == chars_format::fixed; + bool scientific = + (flags & chars_format::scientific) == chars_format::scientific; + return scientific && !fixed; +} + +const int8_t kAsciiToInt[256] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1}; + +// Returns true if `ch` is a digit in the given base +template <int base> +bool IsDigit(char ch); + +// Converts a valid `ch` to its digit value in the given base. +template <int base> +unsigned ToDigit(char ch); + +// Returns true if `ch` is the exponent delimiter for the given base. +template <int base> +bool IsExponentCharacter(char ch); + +// Returns the maximum number of significant digits we will read for a float +// in the given base. +template <int base> +constexpr int MantissaDigitsMax(); + +// Returns the largest consecutive run of digits we will accept when parsing a +// number in the given base. +template <int base> +constexpr int DigitLimit(); + +// Returns the amount the exponent must be adjusted by for each dropped digit. +// (For decimal this is 1, since the digits are in base 10 and the exponent base +// is also 10, but for hexadecimal this is 4, since the digits are base 16 but +// the exponent base is 2.) +template <int base> +constexpr int DigitMagnitude(); + +template <> +bool IsDigit<10>(char ch) { + return ch >= '0' && ch <= '9'; +} +template <> +bool IsDigit<16>(char ch) { + return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0; +} + +template <> +unsigned ToDigit<10>(char ch) { + return ch - '0'; +} +template <> +unsigned ToDigit<16>(char ch) { + return kAsciiToInt[static_cast<unsigned char>(ch)]; +} + +template <> +bool IsExponentCharacter<10>(char ch) { + return ch == 'e' || ch == 'E'; +} + +template <> +bool IsExponentCharacter<16>(char ch) { + return ch == 'p' || ch == 'P'; +} + +template <> +constexpr int MantissaDigitsMax<10>() { + return kDecimalMantissaDigitsMax; +} +template <> +constexpr int MantissaDigitsMax<16>() { + return kHexadecimalMantissaDigitsMax; +} + +template <> +constexpr int DigitLimit<10>() { + return kDecimalDigitLimit; +} +template <> +constexpr int DigitLimit<16>() { + return kHexadecimalDigitLimit; +} + +template <> +constexpr int DigitMagnitude<10>() { + return 1; +} +template <> +constexpr int DigitMagnitude<16>() { + return 4; +} + +// Reads decimal digits from [begin, end) into *out. Returns the number of +// digits consumed. +// +// After max_digits has been read, keeps consuming characters, but no longer +// adjusts *out. If a nonzero digit is dropped this way, *dropped_nonzero_digit +// is set; otherwise, it is left unmodified. +// +// If no digits are matched, returns 0 and leaves *out unchanged. +// +// ConsumeDigits does not protect against overflow on *out; max_digits must +// be chosen with respect to type T to avoid the possibility of overflow. +template <int base, typename T> +std::size_t ConsumeDigits(const char* begin, const char* end, int max_digits, + T* out, bool* dropped_nonzero_digit) { + if (base == 10) { + assert(max_digits <= std::numeric_limits<T>::digits10); + } else if (base == 16) { + assert(max_digits * 4 <= std::numeric_limits<T>::digits); + } + const char* const original_begin = begin; + T accumulator = *out; + const char* significant_digits_end = + (end - begin > max_digits) ? begin + max_digits : end; + while (begin < significant_digits_end && IsDigit<base>(*begin)) { + // Do not guard against *out overflow; max_digits was chosen to avoid this. + // Do assert against it, to detect problems in debug builds. + auto digit = static_cast<T>(ToDigit<base>(*begin)); + assert(accumulator * base >= accumulator); + accumulator *= base; + assert(accumulator + digit >= accumulator); + accumulator += digit; + ++begin; + } + bool dropped_nonzero = false; + while (begin < end && IsDigit<base>(*begin)) { + dropped_nonzero = dropped_nonzero || (*begin != '0'); + ++begin; + } + if (dropped_nonzero && dropped_nonzero_digit != nullptr) { + *dropped_nonzero_digit = true; + } + *out = accumulator; + return begin - original_begin; +} + +// Returns true if `v` is one of the chars allowed inside parentheses following +// a NaN. +bool IsNanChar(char v) { + return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') || + (v >= 'A' && v <= 'Z'); +} + +// Checks the range [begin, end) for a strtod()-formatted infinity or NaN. If +// one is found, sets `out` appropriately and returns true. +bool ParseInfinityOrNan(const char* begin, const char* end, + strings_internal::ParsedFloat* out) { + if (end - begin < 3) { + return false; + } + switch (*begin) { + case 'i': + case 'I': { + // An infinity std::string consists of the characters "inf" or "infinity", + // case insensitive. + if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) { + return false; + } + out->type = strings_internal::FloatType::kInfinity; + if (end - begin >= 8 && + strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) { + out->end = begin + 8; + } else { + out->end = begin + 3; + } + return true; + } + case 'n': + case 'N': { + // A NaN consists of the characters "nan", case insensitive, optionally + // followed by a parenthesized sequence of zero or more alphanumeric + // characters and/or underscores. + if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) { + return false; + } + out->type = strings_internal::FloatType::kNan; + out->end = begin + 3; + // NaN is allowed to be followed by a parenthesized std::string, consisting of + // only the characters [a-zA-Z0-9_]. Match that if it's present. + begin += 3; + if (begin < end && *begin == '(') { + const char* nan_begin = begin + 1; + while (nan_begin < end && IsNanChar(*nan_begin)) { + ++nan_begin; + } + if (nan_begin < end && *nan_begin == ')') { + // We found an extra NaN specifier range + out->subrange_begin = begin + 1; + out->subrange_end = nan_begin; + out->end = nan_begin + 1; + } + } + return true; + } + default: + return false; + } +} +} // namespace + +namespace strings_internal { + +template <int base> +strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end, + chars_format format_flags) { + strings_internal::ParsedFloat result; + + // Exit early if we're given an empty range. + if (begin == end) return result; + + // Handle the infinity and NaN cases. + if (ParseInfinityOrNan(begin, end, &result)) { + return result; + } + + const char* const mantissa_begin = begin; + while (begin < end && *begin == '0') { + ++begin; // skip leading zeros + } + uint64_t mantissa = 0; + + int exponent_adjustment = 0; + bool mantissa_is_inexact = false; + std::size_t pre_decimal_digits = ConsumeDigits<base>( + begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact); + begin += pre_decimal_digits; + int digits_left; + if (pre_decimal_digits >= DigitLimit<base>()) { + // refuse to parse pathological inputs + return result; + } else if (pre_decimal_digits > MantissaDigitsMax<base>()) { + // We dropped some non-fraction digits on the floor. Adjust our exponent + // to compensate. + exponent_adjustment = + static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>()); + digits_left = 0; + } else { + digits_left = + static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits); + } + if (begin < end && *begin == '.') { + ++begin; + if (mantissa == 0) { + // If we haven't seen any nonzero digits yet, keep skipping zeros. We + // have to adjust the exponent to reflect the changed place value. + const char* begin_zeros = begin; + while (begin < end && *begin == '0') { + ++begin; + } + std::size_t zeros_skipped = begin - begin_zeros; + if (zeros_skipped >= DigitLimit<base>()) { + // refuse to parse pathological inputs + return result; + } + exponent_adjustment -= static_cast<int>(zeros_skipped); + } + std::size_t post_decimal_digits = ConsumeDigits<base>( + begin, end, digits_left, &mantissa, &mantissa_is_inexact); + begin += post_decimal_digits; + + // Since `mantissa` is an integer, each significant digit we read after + // the decimal point requires an adjustment to the exponent. "1.23e0" will + // be stored as `mantissa` == 123 and `exponent` == -2 (that is, + // "123e-2"). + if (post_decimal_digits >= DigitLimit<base>()) { + // refuse to parse pathological inputs + return result; + } else if (post_decimal_digits > digits_left) { + exponent_adjustment -= digits_left; + } else { + exponent_adjustment -= post_decimal_digits; + } + } + // If we've found no mantissa whatsoever, this isn't a number. + if (mantissa_begin == begin) { + return result; + } + // A bare "." doesn't count as a mantissa either. + if (begin - mantissa_begin == 1 && *mantissa_begin == '.') { + return result; + } + + if (mantissa_is_inexact) { + // We dropped significant digits on the floor. Handle this appropriately. + if (base == 10) { + // If we truncated significant decimal digits, store the full range of the + // mantissa for future big integer math for exact rounding. + result.subrange_begin = mantissa_begin; + result.subrange_end = begin; + } else if (base == 16) { + // If we truncated hex digits, reflect this fact by setting the low + // ("sticky") bit. This allows for correct rounding in all cases. + mantissa |= 1; + } + } + result.mantissa = mantissa; + + const char* const exponent_begin = begin; + result.literal_exponent = 0; + bool found_exponent = false; + if (AllowExponent(format_flags) && begin < end && + IsExponentCharacter<base>(*begin)) { + bool negative_exponent = false; + ++begin; + if (begin < end && *begin == '-') { + negative_exponent = true; + ++begin; + } else if (begin < end && *begin == '+') { + ++begin; + } + const char* const exponent_digits_begin = begin; + // Exponent is always expressed in decimal, even for hexadecimal floats. + begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax, + &result.literal_exponent, nullptr); + if (begin == exponent_digits_begin) { + // there were no digits where we expected an exponent. We failed to read + // an exponent and should not consume the 'e' after all. Rewind 'begin'. + found_exponent = false; + begin = exponent_begin; + } else { + found_exponent = true; + if (negative_exponent) { + result.literal_exponent = -result.literal_exponent; + } + } + } + + if (!found_exponent && RequireExponent(format_flags)) { + // Provided flags required an exponent, but none was found. This results + // in a failure to scan. + return result; + } + + // Success! + result.type = strings_internal::FloatType::kNumber; + if (result.mantissa > 0) { + result.exponent = result.literal_exponent + + (DigitMagnitude<base>() * exponent_adjustment); + } else { + result.exponent = 0; + } + result.end = begin; + return result; +} + +template ParsedFloat ParseFloat<10>(const char* begin, const char* end, + chars_format format_flags); +template ParsedFloat ParseFloat<16>(const char* begin, const char* end, + chars_format format_flags); + +} // namespace strings_internal +} // namespace absl |