diff options
Diffstat (limited to 'third_party/abseil_cpp/absl/random/zipf_distribution_test.cc')
-rw-r--r-- | third_party/abseil_cpp/absl/random/zipf_distribution_test.cc | 427 |
1 files changed, 0 insertions, 427 deletions
diff --git a/third_party/abseil_cpp/absl/random/zipf_distribution_test.cc b/third_party/abseil_cpp/absl/random/zipf_distribution_test.cc deleted file mode 100644 index f8cf70e0dddf..000000000000 --- a/third_party/abseil_cpp/absl/random/zipf_distribution_test.cc +++ /dev/null @@ -1,427 +0,0 @@ -// Copyright 2017 The Abseil Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "absl/random/zipf_distribution.h" - -#include <algorithm> -#include <cstddef> -#include <cstdint> -#include <iterator> -#include <random> -#include <string> -#include <utility> -#include <vector> - -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "absl/base/internal/raw_logging.h" -#include "absl/random/internal/chi_square.h" -#include "absl/random/internal/pcg_engine.h" -#include "absl/random/internal/sequence_urbg.h" -#include "absl/random/random.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_replace.h" -#include "absl/strings/strip.h" - -namespace { - -using ::absl::random_internal::kChiSquared; -using ::testing::ElementsAre; - -template <typename IntType> -class ZipfDistributionTypedTest : public ::testing::Test {}; - -using IntTypes = ::testing::Types<int, int8_t, int16_t, int32_t, int64_t, - uint8_t, uint16_t, uint32_t, uint64_t>; -TYPED_TEST_CASE(ZipfDistributionTypedTest, IntTypes); - -TYPED_TEST(ZipfDistributionTypedTest, SerializeTest) { - using param_type = typename absl::zipf_distribution<TypeParam>::param_type; - - constexpr int kCount = 1000; - absl::InsecureBitGen gen; - for (const auto& param : { - param_type(), - param_type(32), - param_type(100, 3, 2), - param_type(std::numeric_limits<TypeParam>::max(), 4, 3), - param_type(std::numeric_limits<TypeParam>::max() / 2), - }) { - // Validate parameters. - const auto k = param.k(); - const auto q = param.q(); - const auto v = param.v(); - - absl::zipf_distribution<TypeParam> before(k, q, v); - EXPECT_EQ(before.k(), param.k()); - EXPECT_EQ(before.q(), param.q()); - EXPECT_EQ(before.v(), param.v()); - - { - absl::zipf_distribution<TypeParam> via_param(param); - EXPECT_EQ(via_param, before); - } - - // Validate stream serialization. - std::stringstream ss; - ss << before; - absl::zipf_distribution<TypeParam> after(4, 5.5, 4.4); - - EXPECT_NE(before.k(), after.k()); - EXPECT_NE(before.q(), after.q()); - EXPECT_NE(before.v(), after.v()); - EXPECT_NE(before.param(), after.param()); - EXPECT_NE(before, after); - - ss >> after; - - EXPECT_EQ(before.k(), after.k()); - EXPECT_EQ(before.q(), after.q()); - EXPECT_EQ(before.v(), after.v()); - EXPECT_EQ(before.param(), after.param()); - EXPECT_EQ(before, after); - - // Smoke test. - auto sample_min = after.max(); - auto sample_max = after.min(); - for (int i = 0; i < kCount; i++) { - auto sample = after(gen); - EXPECT_GE(sample, after.min()); - EXPECT_LE(sample, after.max()); - if (sample > sample_max) sample_max = sample; - if (sample < sample_min) sample_min = sample; - } - ABSL_INTERNAL_LOG(INFO, - absl::StrCat("Range: ", +sample_min, ", ", +sample_max)); - } -} - -class ZipfModel { - public: - ZipfModel(size_t k, double q, double v) : k_(k), q_(q), v_(v) {} - - double mean() const { return mean_; } - - // For the other moments of the Zipf distribution, see, for example, - // http://mathworld.wolfram.com/ZipfDistribution.html - - // PMF(k) = (1 / k^s) / H(N,s) - // Returns the probability that any single invocation returns k. - double PMF(size_t i) { return i >= hnq_.size() ? 0.0 : hnq_[i] / sum_hnq_; } - - // CDF = H(k, s) / H(N,s) - double CDF(size_t i) { - if (i >= hnq_.size()) { - return 1.0; - } - auto it = std::begin(hnq_); - double h = 0.0; - for (const auto end = it; it != end; it++) { - h += *it; - } - return h / sum_hnq_; - } - - // The InverseCDF returns the k values which bound p on the upper and lower - // bound. Since there is no closed-form solution, this is implemented as a - // bisction of the cdf. - std::pair<size_t, size_t> InverseCDF(double p) { - size_t min = 0; - size_t max = hnq_.size(); - while (max > min + 1) { - size_t target = (max + min) >> 1; - double x = CDF(target); - if (x > p) { - max = target; - } else { - min = target; - } - } - return {min, max}; - } - - // Compute the probability totals, which are based on the generalized harmonic - // number, H(N,s). - // H(N,s) == SUM(k=1..N, 1 / k^s) - // - // In the limit, H(N,s) == zetac(s) + 1. - // - // NOTE: The mean of a zipf distribution could be computed here as well. - // Mean := H(N, s-1) / H(N,s). - // Given the parameter v = 1, this gives the following function: - // (Hn(100, 1) - Hn(1,1)) / (Hn(100,2) - Hn(1,2)) = 6.5944 - // - void Init() { - if (!hnq_.empty()) { - return; - } - hnq_.clear(); - hnq_.reserve(std::min(k_, size_t{1000})); - - sum_hnq_ = 0; - double qm1 = q_ - 1.0; - double sum_hnq_m1 = 0; - for (size_t i = 0; i < k_; i++) { - // Partial n-th generalized harmonic number - const double x = v_ + i; - - // H(n, q-1) - const double hnqm1 = - (q_ == 2.0) ? (1.0 / x) - : (q_ == 3.0) ? (1.0 / (x * x)) : std::pow(x, -qm1); - sum_hnq_m1 += hnqm1; - - // H(n, q) - const double hnq = - (q_ == 2.0) ? (1.0 / (x * x)) - : (q_ == 3.0) ? (1.0 / (x * x * x)) : std::pow(x, -q_); - sum_hnq_ += hnq; - hnq_.push_back(hnq); - if (i > 1000 && hnq <= 1e-10) { - // The harmonic number is too small. - break; - } - } - assert(sum_hnq_ > 0); - mean_ = sum_hnq_m1 / sum_hnq_; - } - - private: - const size_t k_; - const double q_; - const double v_; - - double mean_; - std::vector<double> hnq_; - double sum_hnq_; -}; - -using zipf_u64 = absl::zipf_distribution<uint64_t>; - -class ZipfTest : public testing::TestWithParam<zipf_u64::param_type>, - public ZipfModel { - public: - ZipfTest() : ZipfModel(GetParam().k(), GetParam().q(), GetParam().v()) {} - - // We use a fixed bit generator for distribution accuracy tests. This allows - // these tests to be deterministic, while still testing the qualify of the - // implementation. - absl::random_internal::pcg64_2018_engine rng_{0x2B7E151628AED2A6}; -}; - -TEST_P(ZipfTest, ChiSquaredTest) { - const auto& param = GetParam(); - Init(); - - size_t trials = 10000; - - // Find the split-points for the buckets. - std::vector<size_t> points; - std::vector<double> expected; - { - double last_cdf = 0.0; - double min_p = 1.0; - for (double p = 0.01; p < 1.0; p += 0.01) { - auto x = InverseCDF(p); - if (points.empty() || points.back() < x.second) { - const double p = CDF(x.second); - points.push_back(x.second); - double q = p - last_cdf; - expected.push_back(q); - last_cdf = p; - if (q < min_p) { - min_p = q; - } - } - } - if (last_cdf < 0.999) { - points.push_back(std::numeric_limits<size_t>::max()); - double q = 1.0 - last_cdf; - expected.push_back(q); - if (q < min_p) { - min_p = q; - } - } else { - points.back() = std::numeric_limits<size_t>::max(); - expected.back() += (1.0 - last_cdf); - } - // The Chi-Squared score is not completely scale-invariant; it works best - // when the small values are in the small digits. - trials = static_cast<size_t>(8.0 / min_p); - } - ASSERT_GT(points.size(), 0); - - // Generate n variates and fill the counts vector with the count of their - // occurrences. - std::vector<int64_t> buckets(points.size(), 0); - double avg = 0; - { - zipf_u64 dis(param); - for (size_t i = 0; i < trials; i++) { - uint64_t x = dis(rng_); - ASSERT_LE(x, dis.max()); - ASSERT_GE(x, dis.min()); - avg += static_cast<double>(x); - auto it = std::upper_bound(std::begin(points), std::end(points), - static_cast<size_t>(x)); - buckets[std::distance(std::begin(points), it)]++; - } - avg = avg / static_cast<double>(trials); - } - - // Validate the output using the Chi-Squared test. - for (auto& e : expected) { - e *= trials; - } - - // The null-hypothesis is that the distribution is a poisson distribution with - // the provided mean (not estimated from the data). - const int dof = static_cast<int>(expected.size()) - 1; - - // NOTE: This test runs about 15x per invocation, so a value of 0.9995 is - // approximately correct for a test suite failure rate of 1 in 100. In - // practice we see failures slightly higher than that. - const double threshold = absl::random_internal::ChiSquareValue(dof, 0.9999); - - const double chi_square = absl::random_internal::ChiSquare( - std::begin(buckets), std::end(buckets), std::begin(expected), - std::end(expected)); - - const double p_actual = - absl::random_internal::ChiSquarePValue(chi_square, dof); - - // Log if the chi_squared value is above the threshold. - if (chi_square > threshold) { - ABSL_INTERNAL_LOG(INFO, "values"); - for (size_t i = 0; i < expected.size(); i++) { - ABSL_INTERNAL_LOG(INFO, absl::StrCat(points[i], ": ", buckets[i], - " vs. E=", expected[i])); - } - ABSL_INTERNAL_LOG(INFO, absl::StrCat("trials ", trials)); - ABSL_INTERNAL_LOG(INFO, - absl::StrCat("mean ", avg, " vs. expected ", mean())); - ABSL_INTERNAL_LOG(INFO, absl::StrCat(kChiSquared, "(data, ", dof, ") = ", - chi_square, " (", p_actual, ")")); - ABSL_INTERNAL_LOG(INFO, - absl::StrCat(kChiSquared, " @ 0.9995 = ", threshold)); - FAIL() << kChiSquared << " value of " << chi_square - << " is above the threshold."; - } -} - -std::vector<zipf_u64::param_type> GenParams() { - using param = zipf_u64::param_type; - const auto k = param().k(); - const auto q = param().q(); - const auto v = param().v(); - const uint64_t k2 = 1 << 10; - return std::vector<zipf_u64::param_type>{ - // Default - param(k, q, v), - // vary K - param(4, q, v), param(1 << 4, q, v), param(k2, q, v), - // vary V - param(k2, q, 0.5), param(k2, q, 1.5), param(k2, q, 2.5), param(k2, q, 10), - // vary Q - param(k2, 1.5, v), param(k2, 3, v), param(k2, 5, v), param(k2, 10, v), - // Vary V & Q - param(k2, 1.5, 0.5), param(k2, 3, 1.5), param(k, 10, 10)}; -} - -std::string ParamName( - const ::testing::TestParamInfo<zipf_u64::param_type>& info) { - const auto& p = info.param; - std::string name = absl::StrCat("k_", p.k(), "__q_", absl::SixDigits(p.q()), - "__v_", absl::SixDigits(p.v())); - return absl::StrReplaceAll(name, {{"+", "_"}, {"-", "_"}, {".", "_"}}); -} - -INSTANTIATE_TEST_SUITE_P(All, ZipfTest, ::testing::ValuesIn(GenParams()), - ParamName); - -// NOTE: absl::zipf_distribution is not guaranteed to be stable. -TEST(ZipfDistributionTest, StabilityTest) { - // absl::zipf_distribution stability relies on - // absl::uniform_real_distribution, std::log, std::exp, std::log1p - absl::random_internal::sequence_urbg urbg( - {0x0003eb76f6f7f755ull, 0xFFCEA50FDB2F953Bull, 0xC332DDEFBE6C5AA5ull, - 0x6558218568AB9702ull, 0x2AEF7DAD5B6E2F84ull, 0x1521B62829076170ull, - 0xECDD4775619F1510ull, 0x13CCA830EB61BD96ull, 0x0334FE1EAA0363CFull, - 0xB5735C904C70A239ull, 0xD59E9E0BCBAADE14ull, 0xEECC86BC60622CA7ull}); - - std::vector<int> output(10); - - { - absl::zipf_distribution<int32_t> dist; - std::generate(std::begin(output), std::end(output), - [&] { return dist(urbg); }); - EXPECT_THAT(output, ElementsAre(10031, 0, 0, 3, 6, 0, 7, 47, 0, 0)); - } - urbg.reset(); - { - absl::zipf_distribution<int32_t> dist(std::numeric_limits<int32_t>::max(), - 3.3); - std::generate(std::begin(output), std::end(output), - [&] { return dist(urbg); }); - EXPECT_THAT(output, ElementsAre(44, 0, 0, 0, 0, 1, 0, 1, 3, 0)); - } -} - -TEST(ZipfDistributionTest, AlgorithmBounds) { - absl::zipf_distribution<int32_t> dist; - - // Small values from absl::uniform_real_distribution map to larger Zipf - // distribution values. - const std::pair<uint64_t, int32_t> kInputs[] = { - {0xffffffffffffffff, 0x0}, {0x7fffffffffffffff, 0x0}, - {0x3ffffffffffffffb, 0x1}, {0x1ffffffffffffffd, 0x4}, - {0xffffffffffffffe, 0x9}, {0x7ffffffffffffff, 0x12}, - {0x3ffffffffffffff, 0x25}, {0x1ffffffffffffff, 0x4c}, - {0xffffffffffffff, 0x99}, {0x7fffffffffffff, 0x132}, - {0x3fffffffffffff, 0x265}, {0x1fffffffffffff, 0x4cc}, - {0xfffffffffffff, 0x999}, {0x7ffffffffffff, 0x1332}, - {0x3ffffffffffff, 0x2665}, {0x1ffffffffffff, 0x4ccc}, - {0xffffffffffff, 0x9998}, {0x7fffffffffff, 0x1332f}, - {0x3fffffffffff, 0x2665a}, {0x1fffffffffff, 0x4cc9e}, - {0xfffffffffff, 0x998e0}, {0x7ffffffffff, 0x133051}, - {0x3ffffffffff, 0x265ae4}, {0x1ffffffffff, 0x4c9ed3}, - {0xffffffffff, 0x98e223}, {0x7fffffffff, 0x13058c4}, - {0x3fffffffff, 0x25b178e}, {0x1fffffffff, 0x4a062b2}, - {0xfffffffff, 0x8ee23b8}, {0x7ffffffff, 0x10b21642}, - {0x3ffffffff, 0x1d89d89d}, {0x1ffffffff, 0x2fffffff}, - {0xffffffff, 0x45d1745d}, {0x7fffffff, 0x5a5a5a5a}, - {0x3fffffff, 0x69ee5846}, {0x1fffffff, 0x73ecade3}, - {0xfffffff, 0x79a9d260}, {0x7ffffff, 0x7cc0532b}, - {0x3ffffff, 0x7e5ad146}, {0x1ffffff, 0x7f2c0bec}, - {0xffffff, 0x7f95adef}, {0x7fffff, 0x7fcac0da}, - {0x3fffff, 0x7fe55ae2}, {0x1fffff, 0x7ff2ac0e}, - {0xfffff, 0x7ff955ae}, {0x7ffff, 0x7ffcaac1}, - {0x3ffff, 0x7ffe555b}, {0x1ffff, 0x7fff2aac}, - {0xffff, 0x7fff9556}, {0x7fff, 0x7fffcaab}, - {0x3fff, 0x7fffe555}, {0x1fff, 0x7ffff2ab}, - {0xfff, 0x7ffff955}, {0x7ff, 0x7ffffcab}, - {0x3ff, 0x7ffffe55}, {0x1ff, 0x7fffff2b}, - {0xff, 0x7fffff95}, {0x7f, 0x7fffffcb}, - {0x3f, 0x7fffffe5}, {0x1f, 0x7ffffff3}, - {0xf, 0x7ffffff9}, {0x7, 0x7ffffffd}, - {0x3, 0x7ffffffe}, {0x1, 0x7fffffff}, - }; - - for (const auto& instance : kInputs) { - absl::random_internal::sequence_urbg urbg({instance.first}); - EXPECT_EQ(instance.second, dist(urbg)); - } -} - -} // namespace |