author    Abseil Team <absl-team@google.com>        2020-05-19 00:53 -0700
committer Mark Barolak <mbar@google.com>            2020-05-19 14:59 -0400
commit    768eb2ca2857342673fcd462792ce04b8bac3fa3 (patch)
tree      7c7951bde3dd27933b1d85621d7abf01179c0377 /absl/random/internal/randen_hwaes.cc
parent    3f347c46272886a099852a4cd303ecf37a054de8 (diff)
Export of internal Abseil changes
--
f012012ef78234a6a4585321b67d7b7c92ebc266 by Laramie Leavitt <lar@google.com>:

Slight restructuring of absl/random/internal randen implementation.

Convert round-keys.inc into randen_round_keys.cc file.
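A sketch of the shape of that conversion (the table names match GetKeys() in the diff below; the element type, array bounds, and header placement are illustrative assumptions, not the exact Abseil declarations):

// Old: the key material was textually included where it was used.
//   #include "absl/random/internal/randen-keys.inc"
//
// New (sketch): declare the tables once, define them in randen_round_keys.cc,
// and let each backend pick the byte order it needs in GetKeys().
#include <cstdint>

namespace absl {
namespace random_internal {
// 128-bit round keys derived from digits of Pi; one table per byte order.
extern const uint64_t kRandenRoundKeys[];    // native little-endian lane order
extern const uint64_t kRandenRoundKeysBE[];  // big-endian layout for PPC/Altivec
}  // namespace random_internal
}  // namespace absl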

Consistently use a 128-bit pointer type for internal method parameters. This allows simpler pointer arithmetic in C++ and permits the removal of some constants and casts.
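For example (a minimal standalone sketch; XorBlockOld/XorBlockNew are hypothetical helpers, and the real code goes through Vector128Load/Vector128Store rather than raw lane XORs):

#include <cstddef>
#include <cstdint>

// 128-bit block type, matching the u64x2 introduced in the diff below.
struct alignas(16) u64x2 {
  uint64_t data[2];
};

// Old style: a uint64_t* parameter needs a kLanes multiplier (and often a
// cast) to address the i-th 128-bit block.
inline void XorBlockOld(uint64_t* state, const uint64_t* seed, size_t i) {
  constexpr size_t kLanes = 2;
  state[kLanes * i + 0] ^= seed[kLanes * i + 0];
  state[kLanes * i + 1] ^= seed[kLanes * i + 1];
}

// New style: a u64x2* parameter addresses whole 128-bit blocks directly,
// so `state + i` is simply the i-th block.
inline void XorBlockNew(u64x2* state, const u64x2* seed, size_t i) {
  state[i].data[0] ^= seed[i].data[0];
  state[i].data[1] ^= seed[i].data[1];
}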

Remove some redundancy in comments and constexpr variables. Specifically, all references to Randen algorithm parameters now go through RandenTraits; the duplication in RandenSlow has been removed.
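Illustrating the deduplication with a compilable stand-in (kFeistelBlocks comes from the static_asserts in the diff below; kFeistelRounds is inferred from the old key-count comment, so verify both against randen_traits.h):

#include <cstddef>

// Stand-in for absl::random_internal::RandenTraits; member values assumed.
struct RandenTraits {
  static constexpr size_t kFeistelBlocks = 16;      // 16 x 128-bit branches
  static constexpr size_t kFeistelRounds = 16 + 1;  // rounds per Permute()
};

// Before: each use site re-declared local constexpr aliases, e.g.
//   constexpr size_t kFeistelBlocks = RandenTraits::kFeistelBlocks;
// After: the trait member is named directly wherever it is needed.
static_assert(RandenTraits::kFeistelBlocks == 16, "Expecting 16 FeistelBlocks.");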

PiperOrigin-RevId: 312190313

--
dc8b42e054046741e9ed65335bfdface997c6063 by Abseil Team <absl-team@google.com>:

Internal change.

PiperOrigin-RevId: 312167304

--
f13d248fafaf206492c1362c3574031aea3abaf7 by Matthew Brown <matthewbr@google.com>:

Cleanup StrFormat extensions a little.

PiperOrigin-RevId: 312166336

--
9d9117589667afe2332bb7ad42bc967ca7c54502 by Derek Mauro <dmauro@google.com>:

Internal change.

PiperOrigin-RevId: 312105213

--
9a12b9b3aa0e59b8ee6cf9408ed0029045543a9b by Abseil Team <absl-team@google.com>:

Complete IGNORE_TYPE macro renaming.

PiperOrigin-RevId: 311999699

--
64756f20d61021d999bd0d4c15e9ad3857382f57 by Gennadiy Rozental <rogeeff@google.com>:

Switch to a fixed-bytes-specific default value.

This fixes the Abseil Flags for big-endian platforms.

PiperOrigin-RevId: 311844448

--
bdbe6b5b29791dbc3816ada1828458b3010ff1e9 by Laramie Leavitt <lar@google.com>:

Change many distribution tests to use pcg_engine as a deterministic source of entropy.

It's reasonable to test that the BitGen itself has good entropy; however, testing the cross product of all random distributions x all architecture variations x all submitted changes results in a very large number of tests. Accounting for the expected failures while still using real entropy requires that our allowed sigma cover all of these independent tests.

Our current sigma values are too restrictive, and we see a lot of failures, so we have to either relax the sigma values or convert some of the statistical tests to use deterministic values.

This changelist does the latter.
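As a sketch of the pattern (std::mt19937_64 stands in for the deterministic engine here; the actual tests use Abseil's internal pcg_engine, and the tolerance below is arbitrary):

#include <cassert>
#include <random>

// A fixed seed makes the engine's output identical on every run, so a
// distribution test can assert a tight, deterministic tolerance instead of a
// sigma wide enough to absorb failures across thousands of independent
// statistical tests. (Abseil's own distributions plus pcg_engine make this
// fully reproducible across platforms; std::normal_distribution may differ
// between standard-library implementations.)
int main() {
  std::mt19937_64 rng(0x2B7E151628AED2A6ULL);  // fixed seed: deterministic entropy
  std::normal_distribution<double> dist(0.0, 1.0);

  double sum = 0.0;
  constexpr int kSamples = 100000;
  for (int i = 0; i < kSamples; ++i) sum += dist(rng);
  const double mean = sum / kSamples;

  // The observed mean is pinned by the seed, not merely bounded by statistics.
  assert(mean > -0.02 && mean < 0.02);
  return 0;
}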

PiperOrigin-RevId: 311840096
GitOrigin-RevId: f012012ef78234a6a4585321b67d7b7c92ebc266
Change-Id: Ic84886f38ff30d7d72c126e9b63c9a61eb729a1a
Diffstat (limited to 'absl/random/internal/randen_hwaes.cc')
-rw-r--r--  absl/random/internal/randen_hwaes.cc  398
1 file changed, 166 insertions(+), 232 deletions(-)
diff --git a/absl/random/internal/randen_hwaes.cc b/absl/random/internal/randen_hwaes.cc
index e23844f12e56..9966486fde92 100644
--- a/absl/random/internal/randen_hwaes.cc
+++ b/absl/random/internal/randen_hwaes.cc
@@ -24,6 +24,7 @@
 
 #include "absl/base/attributes.h"
 #include "absl/random/internal/platform.h"
+#include "absl/random/internal/randen_traits.h"
 
 // ABSL_RANDEN_HWAES_IMPL indicates whether this file will contain
 // a hardware accelerated implementation of randen, or whether it
@@ -115,8 +116,16 @@ ABSL_NAMESPACE_END
 // Accelerated implementations are supported.
 // We need the per-architecture includes and defines.
 //
+namespace {
 
-#include "absl/random/internal/randen_traits.h"
+using absl::random_internal::RandenTraits;
+
+// Randen operates on 128-bit vectors.
+struct alignas(16) u64x2 {
+  uint64_t data[2];
+};
+
+}  // namespace
 
 // TARGET_CRYPTO defines a crypto attribute for each architecture.
 //
@@ -150,7 +159,6 @@ ABSL_NAMESPACE_END
 using Vector128 = __vector unsigned long long;  // NOLINT(runtime/int)
 
 namespace {
-
 inline ABSL_TARGET_CRYPTO Vector128 ReverseBytes(const Vector128& v) {
   // Reverses the bytes of the vector.
   const __vector unsigned char perm = {15, 14, 13, 12, 11, 10, 9, 8,
@@ -177,14 +185,9 @@ inline ABSL_TARGET_CRYPTO Vector128 AesRound(const Vector128& state,
 }
 
 // Enables native loads in the round loop by pre-swapping.
-inline ABSL_TARGET_CRYPTO void SwapEndian(uint64_t* state) {
-  using absl::random_internal::RandenTraits;
-  constexpr size_t kLanes = 2;
-  constexpr size_t kFeistelBlocks = RandenTraits::kFeistelBlocks;
-
-  for (uint32_t branch = 0; branch < kFeistelBlocks; ++branch) {
-    const Vector128 v = ReverseBytes(Vector128Load(state + kLanes * branch));
-    Vector128Store(v, state + kLanes * branch);
+inline ABSL_TARGET_CRYPTO void SwapEndian(u64x2* state) {
+  for (uint32_t block = 0; block < RandenTraits::kFeistelBlocks; ++block) {
+    Vector128Store(ReverseBytes(Vector128Load(state + block)), state + block);
   }
 }
 
@@ -251,7 +254,7 @@ inline ABSL_TARGET_CRYPTO Vector128 AesRound(const Vector128& state,
   return vaesmcq_u8(vaeseq_u8(state, uint8x16_t{})) ^ round_key;
 }
 
-inline ABSL_TARGET_CRYPTO void SwapEndian(uint64_t*) {}
+inline ABSL_TARGET_CRYPTO void SwapEndian(void*) {}
 
 }  // namespace
 
@@ -297,39 +300,12 @@ inline ABSL_TARGET_CRYPTO Vector128 AesRound(const Vector128& state,
   return Vector128(_mm_aesenc_si128(state.data(), round_key.data()));
 }
 
-inline ABSL_TARGET_CRYPTO void SwapEndian(uint64_t*) {}
+inline ABSL_TARGET_CRYPTO void SwapEndian(void*) {}
 
 }  // namespace
 
 #endif
 
-namespace {
-
-// u64x2 is a 128-bit, (2 x uint64_t lanes) struct used to store
-// the randen_keys.
-struct alignas(16) u64x2 {
-  constexpr u64x2(uint64_t hi, uint64_t lo)
-#if defined(ABSL_ARCH_PPC)
-      // This has been tested with PPC running in little-endian mode;
-      // We byte-swap the u64x2 structure from little-endian to big-endian
-      // because altivec always runs in big-endian mode.
-      : v{__builtin_bswap64(hi), __builtin_bswap64(lo)} {
-#else
-      : v{lo, hi} {
-#endif
-  }
-
-  constexpr bool operator==(const u64x2& other) const {
-    return v[0] == other.v[0] && v[1] == other.v[1];
-  }
-
-  constexpr bool operator!=(const u64x2& other) const {
-    return !(*this == other);
-  }
-
-  uint64_t v[2];
-};  // namespace
-
 #ifdef __clang__
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunknown-pragmas"
@@ -338,7 +314,6 @@ struct alignas(16) u64x2 {
 // At this point, all of the platform-specific features have been defined /
 // implemented.
 //
-// REQUIRES: using u64x2 = ...
 // REQUIRES: using Vector128 = ...
 // REQUIRES: Vector128 Vector128Load(void*) {...}
 // REQUIRES: void Vector128Store(Vector128, void*) {...}
@@ -347,94 +322,50 @@ struct alignas(16) u64x2 {
 //
 // PROVIDES: absl::random_internal::RandenHwAes::Absorb
 // PROVIDES: absl::random_internal::RandenHwAes::Generate
-
-// RANDen = RANDom generator or beetroots in Swiss German.
-// 'Strong' (well-distributed, unpredictable, backtracking-resistant) random
-// generator, faster in some benchmarks than std::mt19937_64 and pcg64_c32.
-//
-// High-level summary:
-// 1) Reverie (see "A Robust and Sponge-Like PRNG with Improved Efficiency") is
-//    a sponge-like random generator that requires a cryptographic permutation.
-//    It improves upon "Provably Robust Sponge-Based PRNGs and KDFs" by
-//    achieving backtracking resistance with only one Permute() per buffer.
-//
-// 2) "Simpira v2: A Family of Efficient Permutations Using the AES Round
-//    Function" constructs up to 1024-bit permutations using an improved
-//    Generalized Feistel network with 2-round AES-128 functions. This Feistel
-//    block shuffle achieves diffusion faster and is less vulnerable to
-//    sliced-biclique attacks than the Type-2 cyclic shuffle.
-//
-// 3) "Improving the Generalized Feistel" and "New criterion for diffusion
-//    property" extends the same kind of improved Feistel block shuffle to 16
-//    branches, which enables a 2048-bit permutation.
-//
-// We combine these three ideas and also change Simpira's subround keys from
-// structured/low-entropy counters to digits of Pi.
-
-// Randen constants.
-using absl::random_internal::RandenTraits;
-constexpr size_t kStateBytes = RandenTraits::kStateBytes;
-constexpr size_t kCapacityBytes = RandenTraits::kCapacityBytes;
-constexpr size_t kFeistelBlocks = RandenTraits::kFeistelBlocks;
-constexpr size_t kFeistelRounds = RandenTraits::kFeistelRounds;
-constexpr size_t kFeistelFunctions = RandenTraits::kFeistelFunctions;
-
-// Independent keys (272 = 2.1 KiB) for the first AES subround of each function.
-constexpr size_t kKeys = kFeistelRounds * kFeistelFunctions;
-
-// INCLUDE keys.
-#include "absl/random/internal/randen-keys.inc"
-
-static_assert(kKeys == kRoundKeys, "kKeys and kRoundKeys must be equal");
-static_assert(round_keys[kKeys - 1] != u64x2(0, 0),
-              "Too few round_keys initializers");
-
-// Number of uint64_t lanes per 128-bit vector;
-constexpr size_t kLanes = 2;
+namespace {
 
 // Block shuffles applies a shuffle to the entire state between AES rounds.
 // Improved odd-even shuffle from "New criterion for diffusion property".
-inline ABSL_TARGET_CRYPTO void BlockShuffle(uint64_t* state) {
-  static_assert(kFeistelBlocks == 16, "Expecting 16 FeistelBlocks.");
-
-  constexpr size_t shuffle[kFeistelBlocks] = {7,  2, 13, 4,  11, 8,  3, 6,
-                                              15, 0, 9,  10, 1,  14, 5, 12};
-
-  // The fully unrolled loop without the memcpy improves the speed by about
-  // 30% over the equivalent loop.
-  const Vector128 v0 = Vector128Load(state + kLanes * shuffle[0]);
-  const Vector128 v1 = Vector128Load(state + kLanes * shuffle[1]);
-  const Vector128 v2 = Vector128Load(state + kLanes * shuffle[2]);
-  const Vector128 v3 = Vector128Load(state + kLanes * shuffle[3]);
-  const Vector128 v4 = Vector128Load(state + kLanes * shuffle[4]);
-  const Vector128 v5 = Vector128Load(state + kLanes * shuffle[5]);
-  const Vector128 v6 = Vector128Load(state + kLanes * shuffle[6]);
-  const Vector128 v7 = Vector128Load(state + kLanes * shuffle[7]);
-  const Vector128 w0 = Vector128Load(state + kLanes * shuffle[8]);
-  const Vector128 w1 = Vector128Load(state + kLanes * shuffle[9]);
-  const Vector128 w2 = Vector128Load(state + kLanes * shuffle[10]);
-  const Vector128 w3 = Vector128Load(state + kLanes * shuffle[11]);
-  const Vector128 w4 = Vector128Load(state + kLanes * shuffle[12]);
-  const Vector128 w5 = Vector128Load(state + kLanes * shuffle[13]);
-  const Vector128 w6 = Vector128Load(state + kLanes * shuffle[14]);
-  const Vector128 w7 = Vector128Load(state + kLanes * shuffle[15]);
-
-  Vector128Store(v0, state + kLanes * 0);
-  Vector128Store(v1, state + kLanes * 1);
-  Vector128Store(v2, state + kLanes * 2);
-  Vector128Store(v3, state + kLanes * 3);
-  Vector128Store(v4, state + kLanes * 4);
-  Vector128Store(v5, state + kLanes * 5);
-  Vector128Store(v6, state + kLanes * 6);
-  Vector128Store(v7, state + kLanes * 7);
-  Vector128Store(w0, state + kLanes * 8);
-  Vector128Store(w1, state + kLanes * 9);
-  Vector128Store(w2, state + kLanes * 10);
-  Vector128Store(w3, state + kLanes * 11);
-  Vector128Store(w4, state + kLanes * 12);
-  Vector128Store(w5, state + kLanes * 13);
-  Vector128Store(w6, state + kLanes * 14);
-  Vector128Store(w7, state + kLanes * 15);
+inline ABSL_TARGET_CRYPTO void BlockShuffle(u64x2* state) {
+  static_assert(RandenTraits::kFeistelBlocks == 16,
+                "Expecting 16 FeistelBlocks.");
+
+  constexpr size_t shuffle[RandenTraits::kFeistelBlocks] = {
+      7, 2, 13, 4, 11, 8, 3, 6, 15, 0, 9, 10, 1, 14, 5, 12};
+
+  const Vector128 v0 = Vector128Load(state + shuffle[0]);
+  const Vector128 v1 = Vector128Load(state + shuffle[1]);
+  const Vector128 v2 = Vector128Load(state + shuffle[2]);
+  const Vector128 v3 = Vector128Load(state + shuffle[3]);
+  const Vector128 v4 = Vector128Load(state + shuffle[4]);
+  const Vector128 v5 = Vector128Load(state + shuffle[5]);
+  const Vector128 v6 = Vector128Load(state + shuffle[6]);
+  const Vector128 v7 = Vector128Load(state + shuffle[7]);
+  const Vector128 w0 = Vector128Load(state + shuffle[8]);
+  const Vector128 w1 = Vector128Load(state + shuffle[9]);
+  const Vector128 w2 = Vector128Load(state + shuffle[10]);
+  const Vector128 w3 = Vector128Load(state + shuffle[11]);
+  const Vector128 w4 = Vector128Load(state + shuffle[12]);
+  const Vector128 w5 = Vector128Load(state + shuffle[13]);
+  const Vector128 w6 = Vector128Load(state + shuffle[14]);
+  const Vector128 w7 = Vector128Load(state + shuffle[15]);
+
+  Vector128Store(v0, state + 0);
+  Vector128Store(v1, state + 1);
+  Vector128Store(v2, state + 2);
+  Vector128Store(v3, state + 3);
+  Vector128Store(v4, state + 4);
+  Vector128Store(v5, state + 5);
+  Vector128Store(v6, state + 6);
+  Vector128Store(v7, state + 7);
+  Vector128Store(w0, state + 8);
+  Vector128Store(w1, state + 9);
+  Vector128Store(w2, state + 10);
+  Vector128Store(w3, state + 11);
+  Vector128Store(w4, state + 12);
+  Vector128Store(w5, state + 13);
+  Vector128Store(w6, state + 14);
+  Vector128Store(w7, state + 15);
 }
 
 // Feistel round function using two AES subrounds. Very similar to F()
@@ -443,27 +374,28 @@ inline ABSL_TARGET_CRYPTO void BlockShuffle(uint64_t* state) {
 // parallel hides the 7-cycle AESNI latency on HSW. Note that the Feistel
 // XORs are 'free' (included in the second AES instruction).
 inline ABSL_TARGET_CRYPTO const u64x2* FeistelRound(
-    uint64_t* state, const u64x2* ABSL_RANDOM_INTERNAL_RESTRICT keys) {
-  static_assert(kFeistelBlocks == 16, "Expecting 16 FeistelBlocks.");
+    u64x2* state, const u64x2* ABSL_RANDOM_INTERNAL_RESTRICT keys) {
+  static_assert(RandenTraits::kFeistelBlocks == 16,
+                "Expecting 16 FeistelBlocks.");
 
   // MSVC does a horrible job at unrolling loops.
   // So we unroll the loop by hand to improve the performance.
-  const Vector128 s0 = Vector128Load(state + kLanes * 0);
-  const Vector128 s1 = Vector128Load(state + kLanes * 1);
-  const Vector128 s2 = Vector128Load(state + kLanes * 2);
-  const Vector128 s3 = Vector128Load(state + kLanes * 3);
-  const Vector128 s4 = Vector128Load(state + kLanes * 4);
-  const Vector128 s5 = Vector128Load(state + kLanes * 5);
-  const Vector128 s6 = Vector128Load(state + kLanes * 6);
-  const Vector128 s7 = Vector128Load(state + kLanes * 7);
-  const Vector128 s8 = Vector128Load(state + kLanes * 8);
-  const Vector128 s9 = Vector128Load(state + kLanes * 9);
-  const Vector128 s10 = Vector128Load(state + kLanes * 10);
-  const Vector128 s11 = Vector128Load(state + kLanes * 11);
-  const Vector128 s12 = Vector128Load(state + kLanes * 12);
-  const Vector128 s13 = Vector128Load(state + kLanes * 13);
-  const Vector128 s14 = Vector128Load(state + kLanes * 14);
-  const Vector128 s15 = Vector128Load(state + kLanes * 15);
+  const Vector128 s0 = Vector128Load(state + 0);
+  const Vector128 s1 = Vector128Load(state + 1);
+  const Vector128 s2 = Vector128Load(state + 2);
+  const Vector128 s3 = Vector128Load(state + 3);
+  const Vector128 s4 = Vector128Load(state + 4);
+  const Vector128 s5 = Vector128Load(state + 5);
+  const Vector128 s6 = Vector128Load(state + 6);
+  const Vector128 s7 = Vector128Load(state + 7);
+  const Vector128 s8 = Vector128Load(state + 8);
+  const Vector128 s9 = Vector128Load(state + 9);
+  const Vector128 s10 = Vector128Load(state + 10);
+  const Vector128 s11 = Vector128Load(state + 11);
+  const Vector128 s12 = Vector128Load(state + 12);
+  const Vector128 s13 = Vector128Load(state + 13);
+  const Vector128 s14 = Vector128Load(state + 14);
+  const Vector128 s15 = Vector128Load(state + 15);
 
   // Encode even blocks with keys.
   const Vector128 e0 = AesRound(s0, Vector128Load(keys + 0));
@@ -486,14 +418,14 @@ inline ABSL_TARGET_CRYPTO const u64x2* FeistelRound(
   const Vector128 o15 = AesRound(e14, s15);
 
   // Store odd blocks. (These will be shuffled later).
-  Vector128Store(o1, state + kLanes * 1);
-  Vector128Store(o3, state + kLanes * 3);
-  Vector128Store(o5, state + kLanes * 5);
-  Vector128Store(o7, state + kLanes * 7);
-  Vector128Store(o9, state + kLanes * 9);
-  Vector128Store(o11, state + kLanes * 11);
-  Vector128Store(o13, state + kLanes * 13);
-  Vector128Store(o15, state + kLanes * 15);
+  Vector128Store(o1, state + 1);
+  Vector128Store(o3, state + 3);
+  Vector128Store(o5, state + 5);
+  Vector128Store(o7, state + 7);
+  Vector128Store(o9, state + 9);
+  Vector128Store(o11, state + 11);
+  Vector128Store(o13, state + 13);
+  Vector128Store(o15, state + 15);
 
   return keys + 8;
 }
@@ -503,16 +435,13 @@ inline ABSL_TARGET_CRYPTO const u64x2* FeistelRound(
 // 2^64 queries if the round function is a PRF. This is similar to the b=8 case
 // of Simpira v2, but more efficient than its generic construction for b=16.
 inline ABSL_TARGET_CRYPTO void Permute(
-    const void* ABSL_RANDOM_INTERNAL_RESTRICT keys, uint64_t* state) {
-  const u64x2* ABSL_RANDOM_INTERNAL_RESTRICT keys128 =
-      static_cast<const u64x2*>(keys);
-
+    u64x2* state, const u64x2* ABSL_RANDOM_INTERNAL_RESTRICT keys) {
   // (Successfully unrolled; the first iteration jumps into the second half)
 #ifdef __clang__
 #pragma clang loop unroll_count(2)
 #endif
-  for (size_t round = 0; round < kFeistelRounds; ++round) {
-    keys128 = FeistelRound(state, keys128);
+  for (size_t round = 0; round < RandenTraits::kFeistelRounds; ++round) {
+    keys = FeistelRound(state, keys);
     BlockShuffle(state);
   }
 }
@@ -528,96 +457,101 @@ bool HasRandenHwAesImplementation() { return true; }
 const void* ABSL_TARGET_CRYPTO RandenHwAes::GetKeys() {
   // Round keys for one AES per Feistel round and branch.
   // The canonical implementation uses first digits of Pi.
-  return round_keys;
+#if defined(ABSL_ARCH_PPC)
+  return kRandenRoundKeysBE;
+#else
+  return kRandenRoundKeys;
+#endif
 }
 
 // NOLINTNEXTLINE
 void ABSL_TARGET_CRYPTO RandenHwAes::Absorb(const void* seed_void,
                                             void* state_void) {
-  auto* state = static_cast<uint64_t*>(state_void);
-  const auto* seed = static_cast<const uint64_t*>(seed_void);
-
-  constexpr size_t kCapacityBlocks = kCapacityBytes / sizeof(Vector128);
-  constexpr size_t kStateBlocks = kStateBytes / sizeof(Vector128);
-
-  static_assert(kCapacityBlocks * sizeof(Vector128) == kCapacityBytes,
-                "Not i*V");
-  static_assert(kCapacityBlocks == 1, "Unexpected Randen kCapacityBlocks");
-  static_assert(kStateBlocks == 16, "Unexpected Randen kStateBlocks");
-
-  Vector128 b1 = Vector128Load(state + kLanes * 1);
-  b1 ^= Vector128Load(seed + kLanes * 0);
-  Vector128Store(b1, state + kLanes * 1);
-
-  Vector128 b2 = Vector128Load(state + kLanes * 2);
-  b2 ^= Vector128Load(seed + kLanes * 1);
-  Vector128Store(b2, state + kLanes * 2);
-
-  Vector128 b3 = Vector128Load(state + kLanes * 3);
-  b3 ^= Vector128Load(seed + kLanes * 2);
-  Vector128Store(b3, state + kLanes * 3);
-
-  Vector128 b4 = Vector128Load(state + kLanes * 4);
-  b4 ^= Vector128Load(seed + kLanes * 3);
-  Vector128Store(b4, state + kLanes * 4);
-
-  Vector128 b5 = Vector128Load(state + kLanes * 5);
-  b5 ^= Vector128Load(seed + kLanes * 4);
-  Vector128Store(b5, state + kLanes * 5);
-
-  Vector128 b6 = Vector128Load(state + kLanes * 6);
-  b6 ^= Vector128Load(seed + kLanes * 5);
-  Vector128Store(b6, state + kLanes * 6);
-
-  Vector128 b7 = Vector128Load(state + kLanes * 7);
-  b7 ^= Vector128Load(seed + kLanes * 6);
-  Vector128Store(b7, state + kLanes * 7);
-
-  Vector128 b8 = Vector128Load(state + kLanes * 8);
-  b8 ^= Vector128Load(seed + kLanes * 7);
-  Vector128Store(b8, state + kLanes * 8);
-
-  Vector128 b9 = Vector128Load(state + kLanes * 9);
-  b9 ^= Vector128Load(seed + kLanes * 8);
-  Vector128Store(b9, state + kLanes * 9);
-
-  Vector128 b10 = Vector128Load(state + kLanes * 10);
-  b10 ^= Vector128Load(seed + kLanes * 9);
-  Vector128Store(b10, state + kLanes * 10);
-
-  Vector128 b11 = Vector128Load(state + kLanes * 11);
-  b11 ^= Vector128Load(seed + kLanes * 10);
-  Vector128Store(b11, state + kLanes * 11);
-
-  Vector128 b12 = Vector128Load(state + kLanes * 12);
-  b12 ^= Vector128Load(seed + kLanes * 11);
-  Vector128Store(b12, state + kLanes * 12);
-
-  Vector128 b13 = Vector128Load(state + kLanes * 13);
-  b13 ^= Vector128Load(seed + kLanes * 12);
-  Vector128Store(b13, state + kLanes * 13);
-
-  Vector128 b14 = Vector128Load(state + kLanes * 14);
-  b14 ^= Vector128Load(seed + kLanes * 13);
-  Vector128Store(b14, state + kLanes * 14);
-
-  Vector128 b15 = Vector128Load(state + kLanes * 15);
-  b15 ^= Vector128Load(seed + kLanes * 14);
-  Vector128Store(b15, state + kLanes * 15);
+  static_assert(RandenTraits::kCapacityBytes / sizeof(Vector128) == 1,
+                "Unexpected Randen kCapacityBlocks");
+  static_assert(RandenTraits::kStateBytes / sizeof(Vector128) == 16,
+                "Unexpected Randen kStateBlocks");
+
+  auto* state =
+      reinterpret_cast<u64x2 * ABSL_RANDOM_INTERNAL_RESTRICT>(state_void);
+  const auto* seed =
+      reinterpret_cast<const u64x2 * ABSL_RANDOM_INTERNAL_RESTRICT>(seed_void);
+
+  Vector128 b1 = Vector128Load(state + 1);
+  b1 ^= Vector128Load(seed + 0);
+  Vector128Store(b1, state + 1);
+
+  Vector128 b2 = Vector128Load(state + 2);
+  b2 ^= Vector128Load(seed + 1);
+  Vector128Store(b2, state + 2);
+
+  Vector128 b3 = Vector128Load(state + 3);
+  b3 ^= Vector128Load(seed + 2);
+  Vector128Store(b3, state + 3);
+
+  Vector128 b4 = Vector128Load(state + 4);
+  b4 ^= Vector128Load(seed + 3);
+  Vector128Store(b4, state + 4);
+
+  Vector128 b5 = Vector128Load(state + 5);
+  b5 ^= Vector128Load(seed + 4);
+  Vector128Store(b5, state + 5);
+
+  Vector128 b6 = Vector128Load(state + 6);
+  b6 ^= Vector128Load(seed + 5);
+  Vector128Store(b6, state + 6);
+
+  Vector128 b7 = Vector128Load(state + 7);
+  b7 ^= Vector128Load(seed + 6);
+  Vector128Store(b7, state + 7);
+
+  Vector128 b8 = Vector128Load(state + 8);
+  b8 ^= Vector128Load(seed + 7);
+  Vector128Store(b8, state + 8);
+
+  Vector128 b9 = Vector128Load(state + 9);
+  b9 ^= Vector128Load(seed + 8);
+  Vector128Store(b9, state + 9);
+
+  Vector128 b10 = Vector128Load(state + 10);
+  b10 ^= Vector128Load(seed + 9);
+  Vector128Store(b10, state + 10);
+
+  Vector128 b11 = Vector128Load(state + 11);
+  b11 ^= Vector128Load(seed + 10);
+  Vector128Store(b11, state + 11);
+
+  Vector128 b12 = Vector128Load(state + 12);
+  b12 ^= Vector128Load(seed + 11);
+  Vector128Store(b12, state + 12);
+
+  Vector128 b13 = Vector128Load(state + 13);
+  b13 ^= Vector128Load(seed + 12);
+  Vector128Store(b13, state + 13);
+
+  Vector128 b14 = Vector128Load(state + 14);
+  b14 ^= Vector128Load(seed + 13);
+  Vector128Store(b14, state + 14);
+
+  Vector128 b15 = Vector128Load(state + 15);
+  b15 ^= Vector128Load(seed + 14);
+  Vector128Store(b15, state + 15);
 }
 
 // NOLINTNEXTLINE
-void ABSL_TARGET_CRYPTO RandenHwAes::Generate(const void* keys,
+void ABSL_TARGET_CRYPTO RandenHwAes::Generate(const void* keys_void,
                                               void* state_void) {
-  static_assert(kCapacityBytes == sizeof(Vector128), "Capacity mismatch");
+  static_assert(RandenTraits::kCapacityBytes == sizeof(Vector128),
+                "Capacity mismatch");
 
-  auto* state = static_cast<uint64_t*>(state_void);
+  auto* state = reinterpret_cast<u64x2*>(state_void);
+  const auto* keys = reinterpret_cast<const u64x2*>(keys_void);
 
   const Vector128 prev_inner = Vector128Load(state);
 
   SwapEndian(state);
 
-  Permute(keys, state);
+  Permute(state, keys);
 
   SwapEndian(state);