about summary refs log tree commit diff
path: root/absl/strings/internal/charconv_parse.cc
diff options
context:
space:
mode:
authorAbseil Team <absl-team@google.com>2018-06-18T20·18-0700
committerShaindel Schwartz <shaindel@google.com>2018-06-18T20·20-0400
commitbd40a41cc142b36c73b881099d08a9d83f7f4780 (patch)
treea73a9bbdfdb823edee52f3b8a2973c4e240ceecc /absl/strings/internal/charconv_parse.cc
parentf44e1eed08cd7871de4fb7c40cae27c6f727455d (diff)
--
f28d30df5769bb832dec3ff36d2fcd2bcdf494a3 by Shaindel Schwartz <shaindel@google.com>:

Internal change

PiperOrigin-RevId: 201046831

--
711715a78b7e53dfaafd4d7f08a74e76db22af88 by Mark Barolak <mbar@google.com>:

Internal fix

PiperOrigin-RevId: 201043684

--
64b53edd6bf1fa48f74e7f5d33f00f80d5089147 by Shaindel Schwartz <shaindel@google.com>:

Remove extra whitespace

PiperOrigin-RevId: 201041989

--
0bdd2a0b33657b688e4a04aeba9ebba47e4dc6ca by Shaindel Schwartz <shaindel@google.com>:

Whitespace fix.

PiperOrigin-RevId: 201034413

--
3deb0ac296ef1b74c4789e114a8a8bf53253f26b by Shaindel Schwartz <shaindel@google.com>:

Scrub build tags. No functional changes.

PiperOrigin-RevId: 201032927

--
da75d0f8b73baa7e8f4e9a092bba546012ed3b71 by Alex Strelnikov <strel@google.com>:

Internal change.

PiperOrigin-RevId: 201026131

--
6815d80caa19870d0c441b6b9816c68db41393a5 by Tom Manshreck <shreck@google.com>:

Add documentation for our LTS snapshot branches

PiperOrigin-RevId: 201025191

--
64c3b02006f39e6a8127bbabf9ec947fb45b6504 by Greg Falcon <gfalcon@google.com>:

Provide absl::from_chars for double and float types.  This is a forward-compatible implementation of std::from_chars from C++17.

This provides exact "round_to_nearest" conversions, and has some nice properties:

* Works with string_view (it can convert numbers from non-NUL-terminated buffers)
* Never allocates memory
* Faster than the standard library strtod() in our toolchain
* Uses integer math in its calculations, so is unaffected by floating point environment
* Unaffected by C locale

Also change SimpleAtod/SimpleAtoi to use this new API under the hood.

PiperOrigin-RevId: 201003324

--
542869258eb100779497c899103dc96aced52749 by Greg Falcon <gfalcon@google.com>:

Internal change

PiperOrigin-RevId: 200999200

--
3aba192775c7f80e2cd7f221b0a73537823c54ea by Gennadiy Rozental <rogeeff@google.com>:

Internal change

PiperOrigin-RevId: 200947470

--
daf9b9feedd748d5364a4c06165b7cb7604d3e1e by Mark Barolak <mbar@google.com>:

Add an absl:: qualification to a usage of base_internal::SchedulingMode outside of an absl:: namespace.

PiperOrigin-RevId: 200748234

--
a8d265290a22d629f3d9bf9f872c204200bfe8c8 by Mark Barolak <mbar@google.com>:

Add a missing namespace closing comment to optional.h.

PiperOrigin-RevId: 200739934

--
f05af8ee1c6b864dad2df7c907d424209a3e3202 by Abseil Team <absl-team@google.com>:

Internal change

PiperOrigin-RevId: 200719115
GitOrigin-RevId: f28d30df5769bb832dec3ff36d2fcd2bcdf494a3
Change-Id: Ie4fa601078fd4aa57286372611f1d114fdec82c0
Diffstat (limited to 'absl/strings/internal/charconv_parse.cc')
-rw-r--r--absl/strings/internal/charconv_parse.cc496
1 files changed, 496 insertions, 0 deletions
diff --git a/absl/strings/internal/charconv_parse.cc b/absl/strings/internal/charconv_parse.cc
new file mode 100644
index 000000000000..a04cc67669a7
--- /dev/null
+++ b/absl/strings/internal/charconv_parse.cc
@@ -0,0 +1,496 @@
+// Copyright 2018 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "absl/strings/internal/charconv_parse.h"
+#include "absl/strings/charconv.h"
+
+#include <cassert>
+#include <cstdint>
+#include <limits>
+
+#include "absl/strings/internal/memutil.h"
+
+namespace absl {
+namespace {
+
+// ParseFloat<10> will read the first 19 significant digits of the mantissa.
+// This number was chosen for multiple reasons.
+//
+// (a) First, for whatever integer type we choose to represent the mantissa, we
+// want to choose the largest possible number of decimal digits for that integer
+// type.  We are using uint64_t, which can express any 19-digit unsigned
+// integer.
+//
+// (b) Second, we need to parse enough digits that the binary value of any
+// mantissa we capture has more bits of resolution than the mantissa
+// representation in the target float.  Our algorithm requires at least 3 bits
+// of headway, but 19 decimal digits give a little more than that.
+//
+// The following static assertions verify the above comments:
+constexpr int kDecimalMantissaDigitsMax = 19;
+
+static_assert(std::numeric_limits<uint64_t>::digits10 ==
+                  kDecimalMantissaDigitsMax,
+              "(a) above");
+
+// IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa.
+static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed");
+static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact");
+static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact");
+
+// The lowest valued 19-digit decimal mantissa we can read still contains
+// sufficient information to reconstruct a binary mantissa.
+static_assert(1000000000000000000u > (uint64_t(1) << (53 + 3)), "(b) above");
+
+// ParseFloat<16> will read the first 15 significant digits of the mantissa.
+//
+// Because a base-16-to-base-2 conversion can be done exactly, we do not need
+// to maximize the number of scanned hex digits to improve our conversion.  What
+// is required is to scan two more bits than the mantissa can represent, so that
+// we always round correctly.
+//
+// (One extra bit does not suffice to perform correct rounding, since a number
+// exactly halfway between two representable floats has unique rounding rules,
+// so we need to differentiate between a "halfway between" number and a "closer
+// to the larger value" number.)
+constexpr int kHexadecimalMantissaDigitsMax = 15;
+
+// The minimum number of significant bits that will be read from
+// kHexadecimalMantissaDigitsMax hex digits.  We must subtract by three, since
+// the most significant digit can be a "1", which only contributes a single
+// significant bit.
+constexpr int kGuaranteedHexadecimalMantissaBitPrecision =
+    4 * kHexadecimalMantissaDigitsMax - 3;
+
+static_assert(kGuaranteedHexadecimalMantissaBitPrecision >
+                  std::numeric_limits<double>::digits + 2,
+              "kHexadecimalMantissaDigitsMax too small");
+
+// We also impose a limit on the number of significant digits we will read from
+// an exponent, to avoid having to deal with integer overflow.  We use 9 for
+// this purpose.
+//
+// If we read a 9 digit exponent, the end result of the conversion will
+// necessarily be infinity or zero, depending on the sign of the exponent.
+// Therefore we can just drop extra digits on the floor without any extra
+// logic.
+constexpr int kDecimalExponentDigitsMax = 9;
+static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax,
+              "int type too small");
+
+// To avoid incredibly large inputs causing integer overflow for our exponent,
+// we impose an arbitrary but very large limit on the number of significant
+// digits we will accept.  The implementation refuses to match a std::string with
+// more consecutive significant mantissa digits than this.
+constexpr int kDecimalDigitLimit = 50000000;
+
+// Corresponding limit for hexadecimal digit inputs.  This is one fourth the
+// amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires
+// a binary exponent adjustment of 4.
+constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4;
+
+// The largest exponent we can read is 999999999 (per
+// kDecimalExponentDigitsMax), and the largest exponent adjustment we can get
+// from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these
+// comfortably fits in an integer.
+//
+// We count kDecimalDigitLimit twice because there are independent limits for
+// numbers before and after the decimal point.  (In the case where there are no
+// significant digits before the decimal point, there are independent limits for
+// post-decimal-point leading zeroes and for significant digits.)
+static_assert(999999999 + 2 * kDecimalDigitLimit <
+                  std::numeric_limits<int>::max(),
+              "int type too small");
+static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) <
+                  std::numeric_limits<int>::max(),
+              "int type too small");
+
+// Returns true if the provided bitfield allows parsing an exponent value
+// (e.g., "1.5e100").
+bool AllowExponent(chars_format flags) {
+  bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
+  bool scientific =
+      (flags & chars_format::scientific) == chars_format::scientific;
+  return scientific || !fixed;
+}
+
+// Returns true if the provided bitfield requires an exponent value be present.
+bool RequireExponent(chars_format flags) {
+  bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
+  bool scientific =
+      (flags & chars_format::scientific) == chars_format::scientific;
+  return scientific && !fixed;
+}
+
+const int8_t kAsciiToInt[256] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  1,  2,  3,  4,  5,  6,  7,  8,
+    9,  -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1};
+
+// Returns true if `ch` is a digit in the given base
+template <int base>
+bool IsDigit(char ch);
+
+// Converts a valid `ch` to its digit value in the given base.
+template <int base>
+unsigned ToDigit(char ch);
+
+// Returns true if `ch` is the exponent delimiter for the given base.
+template <int base>
+bool IsExponentCharacter(char ch);
+
+// Returns the maximum number of significant digits we will read for a float
+// in the given base.
+template <int base>
+constexpr int MantissaDigitsMax();
+
+// Returns the largest consecutive run of digits we will accept when parsing a
+// number in the given base.
+template <int base>
+constexpr int DigitLimit();
+
+// Returns the amount the exponent must be adjusted by for each dropped digit.
+// (For decimal this is 1, since the digits are in base 10 and the exponent base
+// is also 10, but for hexadecimal this is 4, since the digits are base 16 but
+// the exponent base is 2.)
+template <int base>
+constexpr int DigitMagnitude();
+
+template <>
+bool IsDigit<10>(char ch) {
+  return ch >= '0' && ch <= '9';
+}
+template <>
+bool IsDigit<16>(char ch) {
+  return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0;
+}
+
+template <>
+unsigned ToDigit<10>(char ch) {
+  return ch - '0';
+}
+template <>
+unsigned ToDigit<16>(char ch) {
+  return kAsciiToInt[static_cast<unsigned char>(ch)];
+}
+
+template <>
+bool IsExponentCharacter<10>(char ch) {
+  return ch == 'e' || ch == 'E';
+}
+
+template <>
+bool IsExponentCharacter<16>(char ch) {
+  return ch == 'p' || ch == 'P';
+}
+
+template <>
+constexpr int MantissaDigitsMax<10>() {
+  return kDecimalMantissaDigitsMax;
+}
+template <>
+constexpr int MantissaDigitsMax<16>() {
+  return kHexadecimalMantissaDigitsMax;
+}
+
+template <>
+constexpr int DigitLimit<10>() {
+  return kDecimalDigitLimit;
+}
+template <>
+constexpr int DigitLimit<16>() {
+  return kHexadecimalDigitLimit;
+}
+
+template <>
+constexpr int DigitMagnitude<10>() {
+  return 1;
+}
+template <>
+constexpr int DigitMagnitude<16>() {
+  return 4;
+}
+
+// Reads decimal digits from [begin, end) into *out.  Returns the number of
+// digits consumed.
+//
+// After max_digits has been read, keeps consuming characters, but no longer
+// adjusts *out.  If a nonzero digit is dropped this way, *dropped_nonzero_digit
+// is set; otherwise, it is left unmodified.
+//
+// If no digits are matched, returns 0 and leaves *out unchanged.
+//
+// ConsumeDigits does not protect against overflow on *out; max_digits must
+// be chosen with respect to type T to avoid the possibility of overflow.
+template <int base, typename T>
+std::size_t ConsumeDigits(const char* begin, const char* end, int max_digits,
+                          T* out, bool* dropped_nonzero_digit) {
+  if (base == 10) {
+    assert(max_digits <= std::numeric_limits<T>::digits10);
+  } else if (base == 16) {
+    assert(max_digits * 4 <= std::numeric_limits<T>::digits);
+  }
+  const char* const original_begin = begin;
+  T accumulator = *out;
+  const char* significant_digits_end =
+      (end - begin > max_digits) ? begin + max_digits : end;
+  while (begin < significant_digits_end && IsDigit<base>(*begin)) {
+    // Do not guard against *out overflow; max_digits was chosen to avoid this.
+    // Do assert against it, to detect problems in debug builds.
+    auto digit = static_cast<T>(ToDigit<base>(*begin));
+    assert(accumulator * base >= accumulator);
+    accumulator *= base;
+    assert(accumulator + digit >= accumulator);
+    accumulator += digit;
+    ++begin;
+  }
+  bool dropped_nonzero = false;
+  while (begin < end && IsDigit<base>(*begin)) {
+    dropped_nonzero = dropped_nonzero || (*begin != '0');
+    ++begin;
+  }
+  if (dropped_nonzero && dropped_nonzero_digit != nullptr) {
+    *dropped_nonzero_digit = true;
+  }
+  *out = accumulator;
+  return begin - original_begin;
+}
+
+// Returns true if `v` is one of the chars allowed inside parentheses following
+// a NaN.
+bool IsNanChar(char v) {
+  return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') ||
+         (v >= 'A' && v <= 'Z');
+}
+
+// Checks the range [begin, end) for a strtod()-formatted infinity or NaN.  If
+// one is found, sets `out` appropriately and returns true.
+bool ParseInfinityOrNan(const char* begin, const char* end,
+                        strings_internal::ParsedFloat* out) {
+  if (end - begin < 3) {
+    return false;
+  }
+  switch (*begin) {
+    case 'i':
+    case 'I': {
+      // An infinity std::string consists of the characters "inf" or "infinity",
+      // case insensitive.
+      if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) {
+        return false;
+      }
+      out->type = strings_internal::FloatType::kInfinity;
+      if (end - begin >= 8 &&
+          strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) {
+        out->end = begin + 8;
+      } else {
+        out->end = begin + 3;
+      }
+      return true;
+    }
+    case 'n':
+    case 'N': {
+      // A NaN consists of the characters "nan", case insensitive, optionally
+      // followed by a parenthesized sequence of zero or more alphanumeric
+      // characters and/or underscores.
+      if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) {
+        return false;
+      }
+      out->type = strings_internal::FloatType::kNan;
+      out->end = begin + 3;
+      // NaN is allowed to be followed by a parenthesized std::string, consisting of
+      // only the characters [a-zA-Z0-9_].  Match that if it's present.
+      begin += 3;
+      if (begin < end && *begin == '(') {
+        const char* nan_begin = begin + 1;
+        while (nan_begin < end && IsNanChar(*nan_begin)) {
+          ++nan_begin;
+        }
+        if (nan_begin < end && *nan_begin == ')') {
+          // We found an extra NaN specifier range
+          out->subrange_begin = begin + 1;
+          out->subrange_end = nan_begin;
+          out->end = nan_begin + 1;
+        }
+      }
+      return true;
+    }
+    default:
+      return false;
+  }
+}
+}  // namespace
+
+namespace strings_internal {
+
+template <int base>
+strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end,
+                                         chars_format format_flags) {
+  strings_internal::ParsedFloat result;
+
+  // Exit early if we're given an empty range.
+  if (begin == end) return result;
+
+  // Handle the infinity and NaN cases.
+  if (ParseInfinityOrNan(begin, end, &result)) {
+    return result;
+  }
+
+  const char* const mantissa_begin = begin;
+  while (begin < end && *begin == '0') {
+    ++begin;  // skip leading zeros
+  }
+  uint64_t mantissa = 0;
+
+  int exponent_adjustment = 0;
+  bool mantissa_is_inexact = false;
+  std::size_t pre_decimal_digits = ConsumeDigits<base>(
+      begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact);
+  begin += pre_decimal_digits;
+  int digits_left;
+  if (pre_decimal_digits >= DigitLimit<base>()) {
+    // refuse to parse pathological inputs
+    return result;
+  } else if (pre_decimal_digits > MantissaDigitsMax<base>()) {
+    // We dropped some non-fraction digits on the floor.  Adjust our exponent
+    // to compensate.
+    exponent_adjustment =
+        static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>());
+    digits_left = 0;
+  } else {
+    digits_left =
+        static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits);
+  }
+  if (begin < end && *begin == '.') {
+    ++begin;
+    if (mantissa == 0) {
+      // If we haven't seen any nonzero digits yet, keep skipping zeros.  We
+      // have to adjust the exponent to reflect the changed place value.
+      const char* begin_zeros = begin;
+      while (begin < end && *begin == '0') {
+        ++begin;
+      }
+      std::size_t zeros_skipped = begin - begin_zeros;
+      if (zeros_skipped >= DigitLimit<base>()) {
+        // refuse to parse pathological inputs
+        return result;
+      }
+      exponent_adjustment -= static_cast<int>(zeros_skipped);
+    }
+    std::size_t post_decimal_digits = ConsumeDigits<base>(
+        begin, end, digits_left, &mantissa, &mantissa_is_inexact);
+    begin += post_decimal_digits;
+
+    // Since `mantissa` is an integer, each significant digit we read after
+    // the decimal point requires an adjustment to the exponent. "1.23e0" will
+    // be stored as `mantissa` == 123 and `exponent` == -2 (that is,
+    // "123e-2").
+    if (post_decimal_digits >= DigitLimit<base>()) {
+      // refuse to parse pathological inputs
+      return result;
+    } else if (post_decimal_digits > digits_left) {
+      exponent_adjustment -= digits_left;
+    } else {
+      exponent_adjustment -= post_decimal_digits;
+    }
+  }
+  // If we've found no mantissa whatsoever, this isn't a number.
+  if (mantissa_begin == begin) {
+    return result;
+  }
+  // A bare "." doesn't count as a mantissa either.
+  if (begin - mantissa_begin == 1 && *mantissa_begin == '.') {
+    return result;
+  }
+
+  if (mantissa_is_inexact) {
+    // We dropped significant digits on the floor.  Handle this appropriately.
+    if (base == 10) {
+      // If we truncated significant decimal digits, store the full range of the
+      // mantissa for future big integer math for exact rounding.
+      result.subrange_begin = mantissa_begin;
+      result.subrange_end = begin;
+    } else if (base == 16) {
+      // If we truncated hex digits, reflect this fact by setting the low
+      // ("sticky") bit.  This allows for correct rounding in all cases.
+      mantissa |= 1;
+    }
+  }
+  result.mantissa = mantissa;
+
+  const char* const exponent_begin = begin;
+  result.literal_exponent = 0;
+  bool found_exponent = false;
+  if (AllowExponent(format_flags) && begin < end &&
+      IsExponentCharacter<base>(*begin)) {
+    bool negative_exponent = false;
+    ++begin;
+    if (begin < end && *begin == '-') {
+      negative_exponent = true;
+      ++begin;
+    } else if (begin < end && *begin == '+') {
+      ++begin;
+    }
+    const char* const exponent_digits_begin = begin;
+    // Exponent is always expressed in decimal, even for hexadecimal floats.
+    begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax,
+                               &result.literal_exponent, nullptr);
+    if (begin == exponent_digits_begin) {
+      // there were no digits where we expected an exponent.  We failed to read
+      // an exponent and should not consume the 'e' after all.  Rewind 'begin'.
+      found_exponent = false;
+      begin = exponent_begin;
+    } else {
+      found_exponent = true;
+      if (negative_exponent) {
+        result.literal_exponent = -result.literal_exponent;
+      }
+    }
+  }
+
+  if (!found_exponent && RequireExponent(format_flags)) {
+    // Provided flags required an exponent, but none was found.  This results
+    // in a failure to scan.
+    return result;
+  }
+
+  // Success!
+  result.type = strings_internal::FloatType::kNumber;
+  if (result.mantissa > 0) {
+    result.exponent = result.literal_exponent +
+                      (DigitMagnitude<base>() * exponent_adjustment);
+  } else {
+    result.exponent = 0;
+  }
+  result.end = begin;
+  return result;
+}
+
+template ParsedFloat ParseFloat<10>(const char* begin, const char* end,
+                                    chars_format format_flags);
+template ParsedFloat ParseFloat<16>(const char* begin, const char* end,
+                                    chars_format format_flags);
+
+}  // namespace strings_internal
+}  // namespace absl