diff options
Diffstat (limited to 'absl/random/zipf_distribution_test.cc')
-rw-r--r-- | absl/random/zipf_distribution_test.cc | 423 |
1 files changed, 423 insertions, 0 deletions
diff --git a/absl/random/zipf_distribution_test.cc b/absl/random/zipf_distribution_test.cc new file mode 100644 index 000000000000..4d4a0fcf793b --- /dev/null +++ b/absl/random/zipf_distribution_test.cc @@ -0,0 +1,423 @@ +// Copyright 2017 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/random/zipf_distribution.h" + +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <iterator> +#include <random> +#include <string> +#include <utility> +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/internal/raw_logging.h" +#include "absl/random/internal/chi_square.h" +#include "absl/random/internal/sequence_urbg.h" +#include "absl/random/random.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_replace.h" +#include "absl/strings/strip.h" + +namespace { + +using ::absl::random_internal::kChiSquared; +using ::testing::ElementsAre; + +template <typename IntType> +class ZipfDistributionTypedTest : public ::testing::Test {}; + +using IntTypes = ::testing::Types<int, int8_t, int16_t, int32_t, int64_t, + uint8_t, uint16_t, uint32_t, uint64_t>; +TYPED_TEST_CASE(ZipfDistributionTypedTest, IntTypes); + +TYPED_TEST(ZipfDistributionTypedTest, SerializeTest) { + using param_type = typename absl::zipf_distribution<TypeParam>::param_type; + + constexpr int kCount = 1000; + absl::InsecureBitGen gen; + for (const auto& param : { + param_type(), + param_type(32), + param_type(100, 3, 2), + param_type(std::numeric_limits<TypeParam>::max(), 4, 3), + param_type(std::numeric_limits<TypeParam>::max() / 2), + }) { + // Validate parameters. + const auto k = param.k(); + const auto q = param.q(); + const auto v = param.v(); + + absl::zipf_distribution<TypeParam> before(k, q, v); + EXPECT_EQ(before.k(), param.k()); + EXPECT_EQ(before.q(), param.q()); + EXPECT_EQ(before.v(), param.v()); + + { + absl::zipf_distribution<TypeParam> via_param(param); + EXPECT_EQ(via_param, before); + } + + // Validate stream serialization. + std::stringstream ss; + ss << before; + absl::zipf_distribution<TypeParam> after(4, 5.5, 4.4); + + EXPECT_NE(before.k(), after.k()); + EXPECT_NE(before.q(), after.q()); + EXPECT_NE(before.v(), after.v()); + EXPECT_NE(before.param(), after.param()); + EXPECT_NE(before, after); + + ss >> after; + + EXPECT_EQ(before.k(), after.k()); + EXPECT_EQ(before.q(), after.q()); + EXPECT_EQ(before.v(), after.v()); + EXPECT_EQ(before.param(), after.param()); + EXPECT_EQ(before, after); + + // Smoke test. + auto sample_min = after.max(); + auto sample_max = after.min(); + for (int i = 0; i < kCount; i++) { + auto sample = after(gen); + EXPECT_GE(sample, after.min()); + EXPECT_LE(sample, after.max()); + if (sample > sample_max) sample_max = sample; + if (sample < sample_min) sample_min = sample; + } + ABSL_INTERNAL_LOG(INFO, + absl::StrCat("Range: ", +sample_min, ", ", +sample_max)); + } +} + +class ZipfModel { + public: + ZipfModel(size_t k, double q, double v) : k_(k), q_(q), v_(v) {} + + double mean() const { return mean_; } + + // For the other moments of the Zipf distribution, see, for example, + // http://mathworld.wolfram.com/ZipfDistribution.html + + // PMF(k) = (1 / k^s) / H(N,s) + // Returns the probability that any single invocation returns k. + double PMF(size_t i) { return i >= hnq_.size() ? 0.0 : hnq_[i] / sum_hnq_; } + + // CDF = H(k, s) / H(N,s) + double CDF(size_t i) { + if (i >= hnq_.size()) { + return 1.0; + } + auto it = std::begin(hnq_); + double h = 0.0; + for (const auto end = it; it != end; it++) { + h += *it; + } + return h / sum_hnq_; + } + + // The InverseCDF returns the k values which bound p on the upper and lower + // bound. Since there is no closed-form solution, this is implemented as a + // bisction of the cdf. + std::pair<size_t, size_t> InverseCDF(double p) { + size_t min = 0; + size_t max = hnq_.size(); + while (max > min + 1) { + size_t target = (max + min) >> 1; + double x = CDF(target); + if (x > p) { + max = target; + } else { + min = target; + } + } + return {min, max}; + } + + // Compute the probability totals, which are based on the generalized harmonic + // number, H(N,s). + // H(N,s) == SUM(k=1..N, 1 / k^s) + // + // In the limit, H(N,s) == zetac(s) + 1. + // + // NOTE: The mean of a zipf distribution could be computed here as well. + // Mean := H(N, s-1) / H(N,s). + // Given the parameter v = 1, this gives the following function: + // (Hn(100, 1) - Hn(1,1)) / (Hn(100,2) - Hn(1,2)) = 6.5944 + // + void Init() { + if (!hnq_.empty()) { + return; + } + hnq_.clear(); + hnq_.reserve(std::min(k_, size_t{1000})); + + sum_hnq_ = 0; + double qm1 = q_ - 1.0; + double sum_hnq_m1 = 0; + for (size_t i = 0; i < k_; i++) { + // Partial n-th generalized harmonic number + const double x = v_ + i; + + // H(n, q-1) + const double hnqm1 = + (q_ == 2.0) ? (1.0 / x) + : (q_ == 3.0) ? (1.0 / (x * x)) : std::pow(x, -qm1); + sum_hnq_m1 += hnqm1; + + // H(n, q) + const double hnq = + (q_ == 2.0) ? (1.0 / (x * x)) + : (q_ == 3.0) ? (1.0 / (x * x * x)) : std::pow(x, -q_); + sum_hnq_ += hnq; + hnq_.push_back(hnq); + if (i > 1000 && hnq <= 1e-10) { + // The harmonic number is too small. + break; + } + } + assert(sum_hnq_ > 0); + mean_ = sum_hnq_m1 / sum_hnq_; + } + + private: + const size_t k_; + const double q_; + const double v_; + + double mean_; + std::vector<double> hnq_; + double sum_hnq_; +}; + +using zipf_u64 = absl::zipf_distribution<uint64_t>; + +class ZipfTest : public testing::TestWithParam<zipf_u64::param_type>, + public ZipfModel { + public: + ZipfTest() : ZipfModel(GetParam().k(), GetParam().q(), GetParam().v()) {} + + absl::InsecureBitGen rng_; +}; + +TEST_P(ZipfTest, ChiSquaredTest) { + const auto& param = GetParam(); + Init(); + + size_t trials = 10000; + + // Find the split-points for the buckets. + std::vector<size_t> points; + std::vector<double> expected; + { + double last_cdf = 0.0; + double min_p = 1.0; + for (double p = 0.01; p < 1.0; p += 0.01) { + auto x = InverseCDF(p); + if (points.empty() || points.back() < x.second) { + const double p = CDF(x.second); + points.push_back(x.second); + double q = p - last_cdf; + expected.push_back(q); + last_cdf = p; + if (q < min_p) { + min_p = q; + } + } + } + if (last_cdf < 0.999) { + points.push_back(std::numeric_limits<size_t>::max()); + double q = 1.0 - last_cdf; + expected.push_back(q); + if (q < min_p) { + min_p = q; + } + } else { + points.back() = std::numeric_limits<size_t>::max(); + expected.back() += (1.0 - last_cdf); + } + // The Chi-Squared score is not completely scale-invariant; it works best + // when the small values are in the small digits. + trials = static_cast<size_t>(8.0 / min_p); + } + ASSERT_GT(points.size(), 0); + + // Generate n variates and fill the counts vector with the count of their + // occurrences. + std::vector<int64_t> buckets(points.size(), 0); + double avg = 0; + { + zipf_u64 dis(param); + for (size_t i = 0; i < trials; i++) { + uint64_t x = dis(rng_); + ASSERT_LE(x, dis.max()); + ASSERT_GE(x, dis.min()); + avg += static_cast<double>(x); + auto it = std::upper_bound(std::begin(points), std::end(points), + static_cast<size_t>(x)); + buckets[std::distance(std::begin(points), it)]++; + } + avg = avg / static_cast<double>(trials); + } + + // Validate the output using the Chi-Squared test. + for (auto& e : expected) { + e *= trials; + } + + // The null-hypothesis is that the distribution is a poisson distribution with + // the provided mean (not estimated from the data). + const int dof = static_cast<int>(expected.size()) - 1; + + // NOTE: This test runs about 15x per invocation, so a value of 0.9995 is + // approximately correct for a test suite failure rate of 1 in 100. In + // practice we see failures slightly higher than that. + const double threshold = absl::random_internal::ChiSquareValue(dof, 0.9999); + + const double chi_square = absl::random_internal::ChiSquare( + std::begin(buckets), std::end(buckets), std::begin(expected), + std::end(expected)); + + const double p_actual = + absl::random_internal::ChiSquarePValue(chi_square, dof); + + // Log if the chi_squared value is above the threshold. + if (chi_square > threshold) { + ABSL_INTERNAL_LOG(INFO, "values"); + for (size_t i = 0; i < expected.size(); i++) { + ABSL_INTERNAL_LOG(INFO, absl::StrCat(points[i], ": ", buckets[i], + " vs. E=", expected[i])); + } + ABSL_INTERNAL_LOG(INFO, absl::StrCat("trials ", trials)); + ABSL_INTERNAL_LOG(INFO, + absl::StrCat("mean ", avg, " vs. expected ", mean())); + ABSL_INTERNAL_LOG(INFO, absl::StrCat(kChiSquared, "(data, ", dof, ") = ", + chi_square, " (", p_actual, ")")); + ABSL_INTERNAL_LOG(INFO, + absl::StrCat(kChiSquared, " @ 0.9995 = ", threshold)); + FAIL() << kChiSquared << " value of " << chi_square + << " is above the threshold."; + } +} + +std::vector<zipf_u64::param_type> GenParams() { + using param = zipf_u64::param_type; + const auto k = param().k(); + const auto q = param().q(); + const auto v = param().v(); + const uint64_t k2 = 1 << 10; + return std::vector<zipf_u64::param_type>{ + // Default + param(k, q, v), + // vary K + param(4, q, v), param(1 << 4, q, v), param(k2, q, v), + // vary V + param(k2, q, 0.5), param(k2, q, 1.5), param(k2, q, 2.5), param(k2, q, 10), + // vary Q + param(k2, 1.5, v), param(k2, 3, v), param(k2, 5, v), param(k2, 10, v), + // Vary V & Q + param(k2, 1.5, 0.5), param(k2, 3, 1.5), param(k, 10, 10)}; +} + +std::string ParamName( + const ::testing::TestParamInfo<zipf_u64::param_type>& info) { + const auto& p = info.param; + std::string name = absl::StrCat("k_", p.k(), "__q_", absl::SixDigits(p.q()), + "__v_", absl::SixDigits(p.v())); + return absl::StrReplaceAll(name, {{"+", "_"}, {"-", "_"}, {".", "_"}}); +} + +INSTANTIATE_TEST_SUITE_P(All, ZipfTest, ::testing::ValuesIn(GenParams()), + ParamName); + +// NOTE: absl::zipf_distribution is not guaranteed to be stable. +TEST(ZipfDistributionTest, StabilityTest) { + // absl::zipf_distribution stability relies on + // absl::uniform_real_distribution, std::log, std::exp, std::log1p + absl::random_internal::sequence_urbg urbg( + {0x0003eb76f6f7f755ull, 0xFFCEA50FDB2F953Bull, 0xC332DDEFBE6C5AA5ull, + 0x6558218568AB9702ull, 0x2AEF7DAD5B6E2F84ull, 0x1521B62829076170ull, + 0xECDD4775619F1510ull, 0x13CCA830EB61BD96ull, 0x0334FE1EAA0363CFull, + 0xB5735C904C70A239ull, 0xD59E9E0BCBAADE14ull, 0xEECC86BC60622CA7ull}); + + std::vector<int> output(10); + + { + absl::zipf_distribution<int32_t> dist; + std::generate(std::begin(output), std::end(output), + [&] { return dist(urbg); }); + EXPECT_THAT(output, ElementsAre(10031, 0, 0, 3, 6, 0, 7, 47, 0, 0)); + } + urbg.reset(); + { + absl::zipf_distribution<int32_t> dist(std::numeric_limits<int32_t>::max(), + 3.3); + std::generate(std::begin(output), std::end(output), + [&] { return dist(urbg); }); + EXPECT_THAT(output, ElementsAre(44, 0, 0, 0, 0, 1, 0, 1, 3, 0)); + } +} + +TEST(ZipfDistributionTest, AlgorithmBounds) { + absl::zipf_distribution<int32_t> dist; + + // Small values from absl::uniform_real_distribution map to larger Zipf + // distribution values. + const std::pair<uint64_t, int32_t> kInputs[] = { + {0xffffffffffffffff, 0x0}, {0x7fffffffffffffff, 0x0}, + {0x3ffffffffffffffb, 0x1}, {0x1ffffffffffffffd, 0x4}, + {0xffffffffffffffe, 0x9}, {0x7ffffffffffffff, 0x12}, + {0x3ffffffffffffff, 0x25}, {0x1ffffffffffffff, 0x4c}, + {0xffffffffffffff, 0x99}, {0x7fffffffffffff, 0x132}, + {0x3fffffffffffff, 0x265}, {0x1fffffffffffff, 0x4cc}, + {0xfffffffffffff, 0x999}, {0x7ffffffffffff, 0x1332}, + {0x3ffffffffffff, 0x2665}, {0x1ffffffffffff, 0x4ccc}, + {0xffffffffffff, 0x9998}, {0x7fffffffffff, 0x1332f}, + {0x3fffffffffff, 0x2665a}, {0x1fffffffffff, 0x4cc9e}, + {0xfffffffffff, 0x998e0}, {0x7ffffffffff, 0x133051}, + {0x3ffffffffff, 0x265ae4}, {0x1ffffffffff, 0x4c9ed3}, + {0xffffffffff, 0x98e223}, {0x7fffffffff, 0x13058c4}, + {0x3fffffffff, 0x25b178e}, {0x1fffffffff, 0x4a062b2}, + {0xfffffffff, 0x8ee23b8}, {0x7ffffffff, 0x10b21642}, + {0x3ffffffff, 0x1d89d89d}, {0x1ffffffff, 0x2fffffff}, + {0xffffffff, 0x45d1745d}, {0x7fffffff, 0x5a5a5a5a}, + {0x3fffffff, 0x69ee5846}, {0x1fffffff, 0x73ecade3}, + {0xfffffff, 0x79a9d260}, {0x7ffffff, 0x7cc0532b}, + {0x3ffffff, 0x7e5ad146}, {0x1ffffff, 0x7f2c0bec}, + {0xffffff, 0x7f95adef}, {0x7fffff, 0x7fcac0da}, + {0x3fffff, 0x7fe55ae2}, {0x1fffff, 0x7ff2ac0e}, + {0xfffff, 0x7ff955ae}, {0x7ffff, 0x7ffcaac1}, + {0x3ffff, 0x7ffe555b}, {0x1ffff, 0x7fff2aac}, + {0xffff, 0x7fff9556}, {0x7fff, 0x7fffcaab}, + {0x3fff, 0x7fffe555}, {0x1fff, 0x7ffff2ab}, + {0xfff, 0x7ffff955}, {0x7ff, 0x7ffffcab}, + {0x3ff, 0x7ffffe55}, {0x1ff, 0x7fffff2b}, + {0xff, 0x7fffff95}, {0x7f, 0x7fffffcb}, + {0x3f, 0x7fffffe5}, {0x1f, 0x7ffffff3}, + {0xf, 0x7ffffff9}, {0x7, 0x7ffffffd}, + {0x3, 0x7ffffffe}, {0x1, 0x7fffffff}, + }; + + for (const auto& instance : kInputs) { + absl::random_internal::sequence_urbg urbg({instance.first}); + EXPECT_EQ(instance.second, dist(urbg)); + } +} + +} // namespace |