// Copyright 2017 The Abseil Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "absl/random/zipf_distribution.h" #include <algorithm> #include <cstddef> #include <cstdint> #include <iterator> #include <random> #include <string> #include <utility> #include <vector> #include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/base/internal/raw_logging.h" #include "absl/random/internal/chi_square.h" #include "absl/random/internal/pcg_engine.h" #include "absl/random/internal/sequence_urbg.h" #include "absl/random/random.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_replace.h" #include "absl/strings/strip.h" namespace { using ::absl::random_internal::kChiSquared; using ::testing::ElementsAre; template <typename IntType> class ZipfDistributionTypedTest : public ::testing::Test {}; using IntTypes = ::testing::Types<int, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t>; TYPED_TEST_CASE(ZipfDistributionTypedTest, IntTypes); TYPED_TEST(ZipfDistributionTypedTest, SerializeTest) { using param_type = typename absl::zipf_distribution<TypeParam>::param_type; constexpr int kCount = 1000; absl::InsecureBitGen gen; for (const auto& param : { param_type(), param_type(32), param_type(100, 3, 2), param_type(std::numeric_limits<TypeParam>::max(), 4, 3), param_type(std::numeric_limits<TypeParam>::max() / 2), }) { // Validate parameters. const auto k = param.k(); const auto q = param.q(); const auto v = param.v(); absl::zipf_distribution<TypeParam> before(k, q, v); EXPECT_EQ(before.k(), param.k()); EXPECT_EQ(before.q(), param.q()); EXPECT_EQ(before.v(), param.v()); { absl::zipf_distribution<TypeParam> via_param(param); EXPECT_EQ(via_param, before); } // Validate stream serialization. std::stringstream ss; ss << before; absl::zipf_distribution<TypeParam> after(4, 5.5, 4.4); EXPECT_NE(before.k(), after.k()); EXPECT_NE(before.q(), after.q()); EXPECT_NE(before.v(), after.v()); EXPECT_NE(before.param(), after.param()); EXPECT_NE(before, after); ss >> after; EXPECT_EQ(before.k(), after.k()); EXPECT_EQ(before.q(), after.q()); EXPECT_EQ(before.v(), after.v()); EXPECT_EQ(before.param(), after.param()); EXPECT_EQ(before, after); // Smoke test. auto sample_min = after.max(); auto sample_max = after.min(); for (int i = 0; i < kCount; i++) { auto sample = after(gen); EXPECT_GE(sample, after.min()); EXPECT_LE(sample, after.max()); if (sample > sample_max) sample_max = sample; if (sample < sample_min) sample_min = sample; } ABSL_INTERNAL_LOG(INFO, absl::StrCat("Range: ", +sample_min, ", ", +sample_max)); } } class ZipfModel { public: ZipfModel(size_t k, double q, double v) : k_(k), q_(q), v_(v) {} double mean() const { return mean_; } // For the other moments of the Zipf distribution, see, for example, // http://mathworld.wolfram.com/ZipfDistribution.html // PMF(k) = (1 / k^s) / H(N,s) // Returns the probability that any single invocation returns k. double PMF(size_t i) { return i >= hnq_.size() ? 0.0 : hnq_[i] / sum_hnq_; } // CDF = H(k, s) / H(N,s) double CDF(size_t i) { if (i >= hnq_.size()) { return 1.0; } auto it = std::begin(hnq_); double h = 0.0; for (const auto end = it; it != end; it++) { h += *it; } return h / sum_hnq_; } // The InverseCDF returns the k values which bound p on the upper and lower // bound. Since there is no closed-form solution, this is implemented as a // bisction of the cdf. std::pair<size_t, size_t> InverseCDF(double p) { size_t min = 0; size_t max = hnq_.size(); while (max > min + 1) { size_t target = (max + min) >> 1; double x = CDF(target); if (x > p) { max = target; } else { min = target; } } return {min, max}; } // Compute the probability totals, which are based on the generalized harmonic // number, H(N,s). // H(N,s) == SUM(k=1..N, 1 / k^s) // // In the limit, H(N,s) == zetac(s) + 1. // // NOTE: The mean of a zipf distribution could be computed here as well. // Mean := H(N, s-1) / H(N,s). // Given the parameter v = 1, this gives the following function: // (Hn(100, 1) - Hn(1,1)) / (Hn(100,2) - Hn(1,2)) = 6.5944 // void Init() { if (!hnq_.empty()) { return; } hnq_.clear(); hnq_.reserve(std::min(k_, size_t{1000})); sum_hnq_ = 0; double qm1 = q_ - 1.0; double sum_hnq_m1 = 0; for (size_t i = 0; i < k_; i++) { // Partial n-th generalized harmonic number const double x = v_ + i; // H(n, q-1) const double hnqm1 = (q_ == 2.0) ? (1.0 / x) : (q_ == 3.0) ? (1.0 / (x * x)) : std::pow(x, -qm1); sum_hnq_m1 += hnqm1; // H(n, q) const double hnq = (q_ == 2.0) ? (1.0 / (x * x)) : (q_ == 3.0) ? (1.0 / (x * x * x)) : std::pow(x, -q_); sum_hnq_ += hnq; hnq_.push_back(hnq); if (i > 1000 && hnq <= 1e-10) { // The harmonic number is too small. break; } } assert(sum_hnq_ > 0); mean_ = sum_hnq_m1 / sum_hnq_; } private: const size_t k_; const double q_; const double v_; double mean_; std::vector<double> hnq_; double sum_hnq_; }; using zipf_u64 = absl::zipf_distribution<uint64_t>; class ZipfTest : public testing::TestWithParam<zipf_u64::param_type>, public ZipfModel { public: ZipfTest() : ZipfModel(GetParam().k(), GetParam().q(), GetParam().v()) {} // We use a fixed bit generator for distribution accuracy tests. This allows // these tests to be deterministic, while still testing the qualify of the // implementation. absl::random_internal::pcg64_2018_engine rng_{0x2B7E151628AED2A6}; }; TEST_P(ZipfTest, ChiSquaredTest) { const auto& param = GetParam(); Init(); size_t trials = 10000; // Find the split-points for the buckets. std::vector<size_t> points; std::vector<double> expected; { double last_cdf = 0.0; double min_p = 1.0; for (double p = 0.01; p < 1.0; p += 0.01) { auto x = InverseCDF(p); if (points.empty() || points.back() < x.second) { const double p = CDF(x.second); points.push_back(x.second); double q = p - last_cdf; expected.push_back(q); last_cdf = p; if (q < min_p) { min_p = q; } } } if (last_cdf < 0.999) { points.push_back(std::numeric_limits<size_t>::max()); double q = 1.0 - last_cdf; expected.push_back(q); if (q < min_p) { min_p = q; } } else { points.back() = std::numeric_limits<size_t>::max(); expected.back() += (1.0 - last_cdf); } // The Chi-Squared score is not completely scale-invariant; it works best // when the small values are in the small digits. trials = static_cast<size_t>(8.0 / min_p); } ASSERT_GT(points.size(), 0); // Generate n variates and fill the counts vector with the count of their // occurrences. std::vector<int64_t> buckets(points.size(), 0); double avg = 0; { zipf_u64 dis(param); for (size_t i = 0; i < trials; i++) { uint64_t x = dis(rng_); ASSERT_LE(x, dis.max()); ASSERT_GE(x, dis.min()); avg += static_cast<double>(x); auto it = std::upper_bound(std::begin(points), std::end(points), static_cast<size_t>(x)); buckets[std::distance(std::begin(points), it)]++; } avg = avg / static_cast<double>(trials); } // Validate the output using the Chi-Squared test. for (auto& e : expected) { e *= trials; } // The null-hypothesis is that the distribution is a poisson distribution with // the provided mean (not estimated from the data). const int dof = static_cast<int>(expected.size()) - 1; // NOTE: This test runs about 15x per invocation, so a value of 0.9995 is // approximately correct for a test suite failure rate of 1 in 100. In // practice we see failures slightly higher than that. const double threshold = absl::random_internal::ChiSquareValue(dof, 0.9999); const double chi_square = absl::random_internal::ChiSquare( std::begin(buckets), std::end(buckets), std::begin(expected), std::end(expected)); const double p_actual = absl::random_internal::ChiSquarePValue(chi_square, dof); // Log if the chi_squared value is above the threshold. if (chi_square > threshold) { ABSL_INTERNAL_LOG(INFO, "values"); for (size_t i = 0; i < expected.size(); i++) { ABSL_INTERNAL_LOG(INFO, absl::StrCat(points[i], ": ", buckets[i], " vs. E=", expected[i])); } ABSL_INTERNAL_LOG(INFO, absl::StrCat("trials ", trials)); ABSL_INTERNAL_LOG(INFO, absl::StrCat("mean ", avg, " vs. expected ", mean())); ABSL_INTERNAL_LOG(INFO, absl::StrCat(kChiSquared, "(data, ", dof, ") = ", chi_square, " (", p_actual, ")")); ABSL_INTERNAL_LOG(INFO, absl::StrCat(kChiSquared, " @ 0.9995 = ", threshold)); FAIL() << kChiSquared << " value of " << chi_square << " is above the threshold."; } } std::vector<zipf_u64::param_type> GenParams() { using param = zipf_u64::param_type; const auto k = param().k(); const auto q = param().q(); const auto v = param().v(); const uint64_t k2 = 1 << 10; return std::vector<zipf_u64::param_type>{ // Default param(k, q, v), // vary K param(4, q, v), param(1 << 4, q, v), param(k2, q, v), // vary V param(k2, q, 0.5), param(k2, q, 1.5), param(k2, q, 2.5), param(k2, q, 10), // vary Q param(k2, 1.5, v), param(k2, 3, v), param(k2, 5, v), param(k2, 10, v), // Vary V & Q param(k2, 1.5, 0.5), param(k2, 3, 1.5), param(k, 10, 10)}; } std::string ParamName( const ::testing::TestParamInfo<zipf_u64::param_type>& info) { const auto& p = info.param; std::string name = absl::StrCat("k_", p.k(), "__q_", absl::SixDigits(p.q()), "__v_", absl::SixDigits(p.v())); return absl::StrReplaceAll(name, {{"+", "_"}, {"-", "_"}, {".", "_"}}); } INSTANTIATE_TEST_SUITE_P(All, ZipfTest, ::testing::ValuesIn(GenParams()), ParamName); // NOTE: absl::zipf_distribution is not guaranteed to be stable. TEST(ZipfDistributionTest, StabilityTest) { // absl::zipf_distribution stability relies on // absl::uniform_real_distribution, std::log, std::exp, std::log1p absl::random_internal::sequence_urbg urbg( {0x0003eb76f6f7f755ull, 0xFFCEA50FDB2F953Bull, 0xC332DDEFBE6C5AA5ull, 0x6558218568AB9702ull, 0x2AEF7DAD5B6E2F84ull, 0x1521B62829076170ull, 0xECDD4775619F1510ull, 0x13CCA830EB61BD96ull, 0x0334FE1EAA0363CFull, 0xB5735C904C70A239ull, 0xD59E9E0BCBAADE14ull, 0xEECC86BC60622CA7ull}); std::vector<int> output(10); { absl::zipf_distribution<int32_t> dist; std::generate(std::begin(output), std::end(output), [&] { return dist(urbg); }); EXPECT_THAT(output, ElementsAre(10031, 0, 0, 3, 6, 0, 7, 47, 0, 0)); } urbg.reset(); { absl::zipf_distribution<int32_t> dist(std::numeric_limits<int32_t>::max(), 3.3); std::generate(std::begin(output), std::end(output), [&] { return dist(urbg); }); EXPECT_THAT(output, ElementsAre(44, 0, 0, 0, 0, 1, 0, 1, 3, 0)); } } TEST(ZipfDistributionTest, AlgorithmBounds) { absl::zipf_distribution<int32_t> dist; // Small values from absl::uniform_real_distribution map to larger Zipf // distribution values. const std::pair<uint64_t, int32_t> kInputs[] = { {0xffffffffffffffff, 0x0}, {0x7fffffffffffffff, 0x0}, {0x3ffffffffffffffb, 0x1}, {0x1ffffffffffffffd, 0x4}, {0xffffffffffffffe, 0x9}, {0x7ffffffffffffff, 0x12}, {0x3ffffffffffffff, 0x25}, {0x1ffffffffffffff, 0x4c}, {0xffffffffffffff, 0x99}, {0x7fffffffffffff, 0x132}, {0x3fffffffffffff, 0x265}, {0x1fffffffffffff, 0x4cc}, {0xfffffffffffff, 0x999}, {0x7ffffffffffff, 0x1332}, {0x3ffffffffffff, 0x2665}, {0x1ffffffffffff, 0x4ccc}, {0xffffffffffff, 0x9998}, {0x7fffffffffff, 0x1332f}, {0x3fffffffffff, 0x2665a}, {0x1fffffffffff, 0x4cc9e}, {0xfffffffffff, 0x998e0}, {0x7ffffffffff, 0x133051}, {0x3ffffffffff, 0x265ae4}, {0x1ffffffffff, 0x4c9ed3}, {0xffffffffff, 0x98e223}, {0x7fffffffff, 0x13058c4}, {0x3fffffffff, 0x25b178e}, {0x1fffffffff, 0x4a062b2}, {0xfffffffff, 0x8ee23b8}, {0x7ffffffff, 0x10b21642}, {0x3ffffffff, 0x1d89d89d}, {0x1ffffffff, 0x2fffffff}, {0xffffffff, 0x45d1745d}, {0x7fffffff, 0x5a5a5a5a}, {0x3fffffff, 0x69ee5846}, {0x1fffffff, 0x73ecade3}, {0xfffffff, 0x79a9d260}, {0x7ffffff, 0x7cc0532b}, {0x3ffffff, 0x7e5ad146}, {0x1ffffff, 0x7f2c0bec}, {0xffffff, 0x7f95adef}, {0x7fffff, 0x7fcac0da}, {0x3fffff, 0x7fe55ae2}, {0x1fffff, 0x7ff2ac0e}, {0xfffff, 0x7ff955ae}, {0x7ffff, 0x7ffcaac1}, {0x3ffff, 0x7ffe555b}, {0x1ffff, 0x7fff2aac}, {0xffff, 0x7fff9556}, {0x7fff, 0x7fffcaab}, {0x3fff, 0x7fffe555}, {0x1fff, 0x7ffff2ab}, {0xfff, 0x7ffff955}, {0x7ff, 0x7ffffcab}, {0x3ff, 0x7ffffe55}, {0x1ff, 0x7fffff2b}, {0xff, 0x7fffff95}, {0x7f, 0x7fffffcb}, {0x3f, 0x7fffffe5}, {0x1f, 0x7ffffff3}, {0xf, 0x7ffffff9}, {0x7, 0x7ffffffd}, {0x3, 0x7ffffffe}, {0x1, 0x7fffffff}, }; for (const auto& instance : kInputs) { absl::random_internal::sequence_urbg urbg({instance.first}); EXPECT_EQ(instance.second, dist(urbg)); } } } // namespace