author    Abseil Team <absl-team@google.com>  2020-05-19 00:53 -0700
committer Mark Barolak <mbar@google.com>  2020-05-19 14:59 -0400
commit    768eb2ca2857342673fcd462792ce04b8bac3fa3 (patch)
tree      7c7951bde3dd27933b1d85621d7abf01179c0377 /absl/random/internal/randen_slow.cc
parent    3f347c46272886a099852a4cd303ecf37a054de8 (diff)
Export of internal Abseil changes
--
f012012ef78234a6a4585321b67d7b7c92ebc266 by Laramie Leavitt <lar@google.com>:

Slight restructuring of absl/random/internal randen implementation.

Convert randen-keys.inc into a randen_round_keys.cc file.

Consistently use a 128-bit pointer type for internal method parameters. This allows simpler pointer arithmetic in C++ & permits removal of some constants and casts.
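
For illustration only (a minimal sketch inferred from the hunks below, not part of the original commit message), compare branch addressing through a uint64_t pointer with addressing through a 128-bit element type:

    #include <cstddef>
    #include <cstdint>

    // Stand-in for the 128-bit element type used in the diff below.
    struct alignas(16) u64x2 {
      uint64_t data[2];
    };

    // Before: state addressed as uint64_t*, so stepping one 128-bit Feistel
    // branch needs a kLanes (= 2) scaling constant.
    inline const uint64_t* BranchBefore(const uint64_t* state, size_t branch) {
      constexpr size_t kLanes = 2;  // two uint64_t lanes per 128-bit vector
      return state + kLanes * branch;
    }

    // After: state addressed as u64x2*, so branch indexing is plain pointer
    // arithmetic, with no extra constant or cast.
    inline const u64x2* BranchAfter(const u64x2* state, size_t branch) {
      return state + branch;
    }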

Remove some redundancy in comments and constexpr variables. Specifically, all references to Randen algorithm parameters now go through RandenTraits; the duplicated definitions in RandenSlow have been removed.
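
A rough sketch of the parameters involved (only the members referenced in the diff below; the authoritative definitions live in absl/random/internal/randen_traits.h):

    #include <cstddef>

    // Sketch only: the values mirror the constants removed from this file.
    struct RandenTraitsSketch {
      static constexpr size_t kFeistelBlocks = 16;
      static constexpr size_t kFeistelRounds = 16 + 1;  // > 4 * log2(kFeistelBlocks)
      static constexpr size_t kCapacityBytes = 16;      // one 128-bit vector
      static constexpr size_t kStateBytes = kFeistelBlocks * kCapacityBytes;  // 256
    };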

PiperOrigin-RevId: 312190313

--
dc8b42e054046741e9ed65335bfdface997c6063 by Abseil Team <absl-team@google.com>:

Internal change.

PiperOrigin-RevId: 312167304

--
f13d248fafaf206492c1362c3574031aea3abaf7 by Matthew Brown <matthewbr@google.com>:

Cleanup StrFormat extensions a little.

PiperOrigin-RevId: 312166336

--
9d9117589667afe2332bb7ad42bc967ca7c54502 by Derek Mauro <dmauro@google.com>:

Internal change

PiperOrigin-RevId: 312105213

--
9a12b9b3aa0e59b8ee6cf9408ed0029045543a9b by Abseil Team <absl-team@google.com>:

Complete IGNORE_TYPE macro renaming.

PiperOrigin-RevId: 311999699

--
64756f20d61021d999bd0d4c15e9ad3857382f57 by Gennadiy Rozental <rogeeff@google.com>:

Switch to a fixed-bytes-specific default value.

This fixes the Abseil Flags for big endian platforms.

PiperOrigin-RevId: 311844448

--
bdbe6b5b29791dbc3816ada1828458b3010ff1e9 by Laramie Leavitt <lar@google.com>:

Change many distribution tests to use pcg_engine as a deterministic source of entropy.

It's reasonable to test that the BitGen itself has good entropy; however, testing the cross product of all random distributions x all architecture variations x all submitted changes results in a large number of tests. Accounting for these failures while still using good entropy requires that our allowed sigma bounds account for all of these independent tests.
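
To put a number on this (an illustrative calculation, not from the commit): if each statistical check is allowed a failure probability p and N such checks run independently, then

    P(\text{at least one failure}) = 1 - (1 - p)^N \approx N p \quad \text{for small } p,

so a two-sided 3-sigma threshold (p ≈ 0.0027) applied across N = 1000 independent distribution tests already yields roughly a 93% chance of some spurious failure.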

Our current sigma values are too restrictive, and we see a lot of failures, so we have to either relax the sigma values or convert some of the statistical tests to use deterministic values.

This changelist does the latter.
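
A minimal sketch of the resulting test pattern (the engine alias, header path, and seed below are assumptions modelled on the description above, not copied from the changed tests):

    #include "absl/random/internal/pcg_engine.h"
    #include "absl/random/random.h"

    void SampleWithDeterministicEntropy() {
      // A fixed-seed pcg_engine yields a reproducible stream, so statistical
      // checks on the draws below cannot flake across architectures or
      // unrelated changes.
      absl::random_internal::pcg64_2018_engine rng(0x2B7E151628AED2A6);

      double sum = 0;
      for (int i = 0; i < 1000; ++i) {
        sum += absl::Gaussian<double>(rng, /*mean=*/0.0, /*stddev=*/1.0);
      }
      // With deterministic entropy the sample mean is a fixed value, so a
      // tight tolerance can be asserted without a wide sigma allowance.
      (void)sum;
    }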

PiperOrigin-RevId: 311840096
GitOrigin-RevId: f012012ef78234a6a4585321b67d7b7c92ebc266
Change-Id: Ic84886f38ff30d7d72c126e9b63c9a61eb729a1a
Diffstat (limited to 'absl/random/internal/randen_slow.cc')
-rw-r--r--  absl/random/internal/randen_slow.cc | 197
1 file changed, 74 insertions(+), 123 deletions(-)
diff --git a/absl/random/internal/randen_slow.cc b/absl/random/internal/randen_slow.cc
index 8d07458254a5..4e5f3dc1c7b7 100644
--- a/absl/random/internal/randen_slow.cc
+++ b/absl/random/internal/randen_slow.cc
@@ -20,6 +20,7 @@
 
 #include "absl/base/attributes.h"
 #include "absl/random/internal/platform.h"
+#include "absl/random/internal/randen_traits.h"
 
 #if ABSL_HAVE_ATTRIBUTE(always_inline) || \
     (defined(__GNUC__) && !defined(__clang__))
@@ -225,35 +226,16 @@ constexpr uint32_t te3[256] = {
     0xb0b0cb7b, 0x5454fca8, 0xbbbbd66d, 0x16163a2c,
 };
 
-struct alignas(16) u64x2 {
-  constexpr u64x2() : v{0, 0} {};
-  constexpr u64x2(uint64_t hi, uint64_t lo) : v{lo, hi} {}
-
-  uint64_t v[2];
-};
-
 // Software implementation of the Vector128 class, using uint32_t
 // as an underlying vector register.
-//
-struct Vector128 {
-  inline ABSL_RANDOM_INTERNAL_ATTRIBUTE_ALWAYS_INLINE Vector128& operator^=(
-      const Vector128& other) {
-    s[0] ^= other.s[0];
-    s[1] ^= other.s[1];
-    s[2] ^= other.s[2];
-    s[3] ^= other.s[3];
-    return *this;
-  }
-
+struct alignas(16) Vector128 {
   uint32_t s[4];
 };
 
 inline ABSL_RANDOM_INTERNAL_ATTRIBUTE_ALWAYS_INLINE Vector128
-Vector128Load(const void* ABSL_RANDOM_INTERNAL_RESTRICT from) {
+Vector128Load(const void* from) {
   Vector128 result;
-  const uint8_t* ABSL_RANDOM_INTERNAL_RESTRICT src =
-      reinterpret_cast<const uint8_t*>(from);
-
+  const uint8_t* src = reinterpret_cast<const uint8_t*>(from);
   result.s[0] = static_cast<uint32_t>(src[0]) << 24 |
                 static_cast<uint32_t>(src[1]) << 16 |
                 static_cast<uint32_t>(src[2]) << 8 |
@@ -274,7 +256,7 @@ Vector128Load(const void* ABSL_RANDOM_INTERNAL_RESTRICT from) {
 }
 
 inline ABSL_RANDOM_INTERNAL_ATTRIBUTE_ALWAYS_INLINE void Vector128Store(
-    const Vector128& v, void* ABSL_RANDOM_INTERNAL_RESTRICT to) {
+    const Vector128& v, void* to) {
   uint8_t* dst = reinterpret_cast<uint8_t*>(to);
   dst[0] = static_cast<uint8_t>(v.s[0] >> 24);
   dst[1] = static_cast<uint8_t>(v.s[0] >> 16);
@@ -298,91 +280,57 @@ inline ABSL_RANDOM_INTERNAL_ATTRIBUTE_ALWAYS_INLINE void Vector128Store(
 // symmetry of AES (ensures previously equal columns differ afterwards).
 inline ABSL_RANDOM_INTERNAL_ATTRIBUTE_ALWAYS_INLINE Vector128
 AesRound(const Vector128& state, const Vector128& round_key) {
-  // clang-format off
   Vector128 result;
-  result.s[0] = round_key.s[0] ^
-                te0[uint8_t(state.s[0] >> 24)] ^
-                te1[uint8_t(state.s[1] >> 16)] ^
-                te2[uint8_t(state.s[2] >> 8)] ^
+  result.s[0] = round_key.s[0] ^                  //
+                te0[uint8_t(state.s[0] >> 24)] ^  //
+                te1[uint8_t(state.s[1] >> 16)] ^  //
+                te2[uint8_t(state.s[2] >> 8)] ^   //
                 te3[uint8_t(state.s[3])];
-  result.s[1] = round_key.s[1] ^
-                te0[uint8_t(state.s[1] >> 24)] ^
-                te1[uint8_t(state.s[2] >> 16)] ^
-                te2[uint8_t(state.s[3] >> 8)] ^
+  result.s[1] = round_key.s[1] ^                  //
+                te0[uint8_t(state.s[1] >> 24)] ^  //
+                te1[uint8_t(state.s[2] >> 16)] ^  //
+                te2[uint8_t(state.s[3] >> 8)] ^   //
                 te3[uint8_t(state.s[0])];
-  result.s[2] = round_key.s[2] ^
-                te0[uint8_t(state.s[2] >> 24)] ^
-                te1[uint8_t(state.s[3] >> 16)] ^
-                te2[uint8_t(state.s[0] >> 8)] ^
+  result.s[2] = round_key.s[2] ^                  //
+                te0[uint8_t(state.s[2] >> 24)] ^  //
+                te1[uint8_t(state.s[3] >> 16)] ^  //
+                te2[uint8_t(state.s[0] >> 8)] ^   //
                 te3[uint8_t(state.s[1])];
-  result.s[3] = round_key.s[3] ^
-                te0[uint8_t(state.s[3] >> 24)] ^
-                te1[uint8_t(state.s[0] >> 16)] ^
-                te2[uint8_t(state.s[1] >> 8)] ^
+  result.s[3] = round_key.s[3] ^                  //
+                te0[uint8_t(state.s[3] >> 24)] ^  //
+                te1[uint8_t(state.s[0] >> 16)] ^  //
+                te2[uint8_t(state.s[1] >> 8)] ^   //
                 te3[uint8_t(state.s[2])];
   return result;
-  // clang-format on
 }
 
-// RANDen = RANDom generator or beetroots in Swiss German.
-// 'Strong' (well-distributed, unpredictable, backtracking-resistant) random
-// generator, faster in some benchmarks than std::mt19937_64 and pcg64_c32.
-//
-// High-level summary:
-// 1) Reverie (see "A Robust and Sponge-Like PRNG with Improved Efficiency") is
-//    a sponge-like random generator that requires a cryptographic permutation.
-//    It improves upon "Provably Robust Sponge-Based PRNGs and KDFs" by
-//    achieving backtracking resistance with only one Permute() per buffer.
-//
-// 2) "Simpira v2: A Family of Efficient Permutations Using the AES Round
-//    Function" constructs up to 1024-bit permutations using an improved
-//    Generalized Feistel network with 2-round AES-128 functions. This Feistel
-//    block shuffle achieves diffusion faster and is less vulnerable to
-//    sliced-biclique attacks than the Type-2 cyclic shuffle.
-//
-// 3) "Improving the Generalized Feistel" and "New criterion for diffusion
-//    property" extends the same kind of improved Feistel block shuffle to 16
-//    branches, which enables a 2048-bit permutation.
-//
-// Combine these three ideas and also change Simpira's subround keys from
-// structured/low-entropy counters to digits of Pi.
-
-// Randen constants.
-constexpr size_t kFeistelBlocks = 16;
-constexpr size_t kFeistelFunctions = kFeistelBlocks / 2;  // = 8
-constexpr size_t kFeistelRounds = 16 + 1;  // > 4 * log2(kFeistelBlocks)
-constexpr size_t kKeys = kFeistelRounds * kFeistelFunctions;
-
-// INCLUDE keys.
-#include "absl/random/internal/randen-keys.inc"
-
-static_assert(kKeys == kRoundKeys, "kKeys and kRoundKeys must be equal");
+using ::absl::random_internal::RandenTraits;
 
-// 2 uint64_t lanes per Vector128
-static constexpr size_t kLanes = 2;
+// Randen operates on 128-bit vectors.
+struct alignas(16) u64x2 {
+  uint64_t data[2];
+};
 
 // The improved Feistel block shuffle function for 16 blocks.
 inline ABSL_RANDOM_INTERNAL_ATTRIBUTE_ALWAYS_INLINE void BlockShuffle(
-    uint64_t* ABSL_RANDOM_INTERNAL_RESTRICT state_u64) {
-  static_assert(kFeistelBlocks == 16,
+    u64x2* state) {
+  static_assert(RandenTraits::kFeistelBlocks == 16,
                 "Feistel block shuffle only works for 16 blocks.");
 
-  constexpr size_t shuffle[kFeistelBlocks] = {7,  2, 13, 4,  11, 8,  3, 6,
-                                              15, 0, 9,  10, 1,  14, 5, 12};
-
-  u64x2* ABSL_RANDOM_INTERNAL_RESTRICT state =
-      reinterpret_cast<u64x2*>(state_u64);
+  constexpr size_t shuffle[RandenTraits::kFeistelBlocks] = {
+      7, 2, 13, 4, 11, 8, 3, 6, 15, 0, 9, 10, 1, 14, 5, 12};
 
   // The fully unrolled loop without the memcpy improves the speed by about
-  // 30% over the equivalent (leaving code here as a comment):
-  if (false) {
-    u64x2 source[kFeistelBlocks];
-    std::memcpy(source, state, sizeof(source));
-    for (size_t i = 0; i < kFeistelBlocks; i++) {
-      const u64x2 v0 = source[shuffle[i]];
-      state[i] = v0;
-    }
+  // 30% over the equivalent:
+#if 0
+  u64x2 source[RandenTraits::kFeistelBlocks];
+  std::memcpy(source, state, sizeof(source));
+  for (size_t i = 0; i < RandenTraits::kFeistelBlocks; i++) {
+    const u64x2 v0 = source[shuffle[i]];
+    state[i] = v0;
   }
+  return;
+#endif
 
   const u64x2 v0 = state[shuffle[0]];
   const u64x2 v1 = state[shuffle[1]];
@@ -424,23 +372,23 @@ inline ABSL_RANDOM_INTERNAL_ATTRIBUTE_ALWAYS_INLINE void BlockShuffle(
 // parallel hides the 7-cycle AESNI latency on HSW. Note that the Feistel
 // XORs are 'free' (included in the second AES instruction).
 inline ABSL_RANDOM_INTERNAL_ATTRIBUTE_ALWAYS_INLINE const u64x2* FeistelRound(
-    uint64_t* ABSL_RANDOM_INTERNAL_RESTRICT state,
+    u64x2* ABSL_RANDOM_INTERNAL_RESTRICT state,
     const u64x2* ABSL_RANDOM_INTERNAL_RESTRICT keys) {
-  for (size_t branch = 0; branch < kFeistelBlocks; branch += 4) {
-    const Vector128 s0 = Vector128Load(state + kLanes * branch);
-    const Vector128 s1 = Vector128Load(state + kLanes * (branch + 1));
+  for (size_t branch = 0; branch < RandenTraits::kFeistelBlocks; branch += 4) {
+    const Vector128 s0 = Vector128Load(state + branch);
+    const Vector128 s1 = Vector128Load(state + branch + 1);
     const Vector128 f0 = AesRound(s0, Vector128Load(keys));
     keys++;
     const Vector128 o1 = AesRound(f0, s1);
-    Vector128Store(o1, state + kLanes * (branch + 1));
+    Vector128Store(o1, state + branch + 1);
 
     // Manually unroll this loop once. about 10% better than not unrolled.
-    const Vector128 s2 = Vector128Load(state + kLanes * (branch + 2));
-    const Vector128 s3 = Vector128Load(state + kLanes * (branch + 3));
+    const Vector128 s2 = Vector128Load(state + branch + 2);
+    const Vector128 s3 = Vector128Load(state + branch + 3);
     const Vector128 f2 = AesRound(s2, Vector128Load(keys));
     keys++;
     const Vector128 o3 = AesRound(f2, s3);
-    Vector128Store(o3, state + kLanes * (branch + 3));
+    Vector128Store(o3, state + branch + 3);
   }
   return keys;
 }
@@ -450,11 +398,9 @@ inline ABSL_RANDOM_INTERNAL_ATTRIBUTE_ALWAYS_INLINE const u64x2* FeistelRound(
 // 2^64 queries if the round function is a PRF. This is similar to the b=8 case
 // of Simpira v2, but more efficient than its generic construction for b=16.
 inline ABSL_RANDOM_INTERNAL_ATTRIBUTE_ALWAYS_INLINE void Permute(
-    const void* keys, uint64_t* ABSL_RANDOM_INTERNAL_RESTRICT state) {
-  const u64x2* ABSL_RANDOM_INTERNAL_RESTRICT keys128 =
-      static_cast<const u64x2*>(keys);
-  for (size_t round = 0; round < kFeistelRounds; ++round) {
-    keys128 = FeistelRound(state, keys128);
+    u64x2* state, const u64x2* ABSL_RANDOM_INTERNAL_RESTRICT keys) {
+  for (size_t round = 0; round < RandenTraits::kFeistelRounds; ++round) {
+    keys = FeistelRound(state, keys);
     BlockShuffle(state);
   }
 }
@@ -468,37 +414,42 @@ namespace random_internal {
 const void* RandenSlow::GetKeys() {
   // Round keys for one AES per Feistel round and branch.
   // The canonical implementation uses first digits of Pi.
-  return round_keys;
+  return kRandenRoundKeys;
 }
 
 void RandenSlow::Absorb(const void* seed_void, void* state_void) {
-  uint64_t* ABSL_RANDOM_INTERNAL_RESTRICT state =
-      reinterpret_cast<uint64_t*>(state_void);
-  const uint64_t* ABSL_RANDOM_INTERNAL_RESTRICT seed =
-      reinterpret_cast<const uint64_t*>(seed_void);
-
-  constexpr size_t kCapacityBlocks = kCapacityBytes / sizeof(uint64_t);
-  static_assert(kCapacityBlocks * sizeof(uint64_t) == kCapacityBytes,
-                "Not i*V");
-  for (size_t i = kCapacityBlocks; i < kStateBytes / sizeof(uint64_t); ++i) {
+  auto* state =
+      reinterpret_cast<uint64_t * ABSL_RANDOM_INTERNAL_RESTRICT>(state_void);
+  const auto* seed =
+      reinterpret_cast<const uint64_t * ABSL_RANDOM_INTERNAL_RESTRICT>(
+          seed_void);
+
+  constexpr size_t kCapacityBlocks =
+      RandenTraits::kCapacityBytes / sizeof(uint64_t);
+  static_assert(
+      kCapacityBlocks * sizeof(uint64_t) == RandenTraits::kCapacityBytes,
+      "Not i*V");
+
+  for (size_t i = kCapacityBlocks;
+       i < RandenTraits::kStateBytes / sizeof(uint64_t); ++i) {
     state[i] ^= seed[i - kCapacityBlocks];
   }
 }
 
-void RandenSlow::Generate(const void* keys, void* state_void) {
-  static_assert(kCapacityBytes == sizeof(Vector128), "Capacity mismatch");
+void RandenSlow::Generate(const void* keys_void, void* state_void) {
+  static_assert(RandenTraits::kCapacityBytes == sizeof(u64x2),
+                "Capacity mismatch");
 
-  uint64_t* ABSL_RANDOM_INTERNAL_RESTRICT state =
-      reinterpret_cast<uint64_t*>(state_void);
+  auto* state = reinterpret_cast<u64x2*>(state_void);
+  const auto* keys = reinterpret_cast<const u64x2*>(keys_void);
 
-  const Vector128 prev_inner = Vector128Load(state);
+  const u64x2 prev_inner = state[0];
 
-  Permute(keys, state);
+  Permute(state, keys);
 
   // Ensure backtracking resistance.
-  Vector128 inner = Vector128Load(state);
-  inner ^= prev_inner;
-  Vector128Store(inner, state);
+  state[0].data[0] ^= prev_inner.data[0];
+  state[0].data[1] ^= prev_inner.data[1];
 }
 
 }  // namespace random_internal