From 0b3c0725a28786c8d8f2bfc659e8f0a5beedb05a Mon Sep 17 00:00:00 2001 From: edef Date: Mon, 9 Jan 2023 20:12:26 +0000 Subject: feat(users/edef/refscan): high-performance Nix reference scanner Research-grade code, treat with care. Change-Id: I99804df93e64101ef24928238ef0a8a02b59c2aa Reviewed-on: https://cl.tvl.fyi/c/depot/+/7686 Reviewed-by: edef Tested-by: BuildkiteCI --- users/edef/refscan/.gitignore | 2 ++ users/edef/refscan/Cargo.lock | 25 +++++++++++++++ users/edef/refscan/Cargo.toml | 10 ++++++ users/edef/refscan/src/lib.rs | 53 +++++++++++++++++++++++++++++++ users/edef/refscan/src/main.rs | 55 +++++++++++++++++++++++++++++++++ users/edef/refscan/testdata/.gitignore | 3 ++ users/edef/refscan/testdata/generate.sh | 6 ++++ 7 files changed, 154 insertions(+) create mode 100644 users/edef/refscan/.gitignore create mode 100644 users/edef/refscan/Cargo.lock create mode 100644 users/edef/refscan/Cargo.toml create mode 100644 users/edef/refscan/src/lib.rs create mode 100644 users/edef/refscan/src/main.rs create mode 100644 users/edef/refscan/testdata/.gitignore create mode 100755 users/edef/refscan/testdata/generate.sh (limited to 'users/edef') diff --git a/users/edef/refscan/.gitignore b/users/edef/refscan/.gitignore new file mode 100644 index 0000000000..53eaa21960 --- /dev/null +++ b/users/edef/refscan/.gitignore @@ -0,0 +1,2 @@ +/target +**/*.rs.bk diff --git a/users/edef/refscan/Cargo.lock b/users/edef/refscan/Cargo.lock new file mode 100644 index 0000000000..6a079249b3 --- /dev/null +++ b/users/edef/refscan/Cargo.lock @@ -0,0 +1,25 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "packed_simd" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "refscan" +version = "0.1.0" +dependencies = [ + "packed_simd 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[metadata] +"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" +"checksum packed_simd 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a85ea9fc0d4ac0deb6fe7911d38786b32fc11119afd9e9d38b84ff691ce64220" diff --git a/users/edef/refscan/Cargo.toml b/users/edef/refscan/Cargo.toml new file mode 100644 index 0000000000..778d9d24ed --- /dev/null +++ b/users/edef/refscan/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "refscan" +version = "0.1.0" +authors = ["edef "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +packed_simd = "0.3.3" diff --git a/users/edef/refscan/src/lib.rs b/users/edef/refscan/src/lib.rs new file mode 100644 index 0000000000..79cf21a1b9 --- /dev/null +++ b/users/edef/refscan/src/lib.rs @@ -0,0 +1,53 @@ +use packed_simd::{m8x32, u8x32}; + +fn prefilter(haystack: u8x32) -> m8x32 { + let alp = haystack.gt(u8x32::splat(b'a' - 1)) & haystack.lt(u8x32::splat(b'z' + 1)); + let num = haystack.gt(u8x32::splat(b'0' - 1)) & haystack.lt(u8x32::splat(b'9' + 1)); + alp | num +} + +/// scan_clean returns `Err(&buffer[..n])` of known pointer-free data, +/// or `Ok(buffer)` if the entire buffer is pointer-free. +pub fn scan_clean(buffer: &[u8]) -> Result<&[u8], &[u8]> { + let buffer = { + let n = buffer.len() & !31; + &buffer[..n] + }; + + let mut masks = buffer + .chunks_exact(32) + .map(|chunk| prefilter(u8x32::from_slice_unaligned(chunk)).bitmask()) + .enumerate() + .map(|e| (e.0 * 32, e.1)) + .peekable(); + + while let Some((offset, mask)) = masks.next() { + let peek = masks.peek().map(|x| x.1).unwrap_or(!0 >> 1); + let n = (!mask).leading_zeros() + (!peek).trailing_zeros(); + if n >= 32 { + let offset = offset + mask.trailing_zeros() as usize; + return Err(&buffer[..offset]); + } + } + + Ok(buffer) +} + +#[cfg(test)] +mod test { + #[test] + fn scan_tail() { + let buffer = b"_xfbmj7sl2ikicym9x3yq7cms5qx1w39k"; + assert_eq!(crate::scan_clean(buffer), Err(&buffer[..1])); + } + #[test] + fn scan_straddle() { + let buffer = b"________________xfbmj7sl2ikicym9x3yq7cms5qx1w39k________________"; + assert_eq!(crate::scan_clean(buffer), Err(&buffer[..16])); + } + #[test] + fn scan_clean() { + let buffer = b"x_______________xfbmj7sl2ikicym9x3yq-cms5qx1w3-k________________"; + assert_eq!(crate::scan_clean(buffer), Ok(&buffer[..])); + } +} diff --git a/users/edef/refscan/src/main.rs b/users/edef/refscan/src/main.rs new file mode 100644 index 0000000000..9bbb5ed823 --- /dev/null +++ b/users/edef/refscan/src/main.rs @@ -0,0 +1,55 @@ +use std::{ + collections::BTreeSet as Set, + convert::TryInto, + io::{self, Read}, + str, +}; + +fn main() { + let max_refs: Set<[u8; 32]> = include_str!("../testdata/maxrefs") + .lines() + .map(|l| l.as_bytes().try_into().unwrap()) + .collect(); + + let input = { + let stdin = io::stdin(); + let mut buffer = Vec::new(); + stdin.lock().read_to_end(&mut buffer).unwrap(); + buffer + }; + + let base = input.as_ptr() as usize; + let mut input: &[u8] = &input; + while input.len() >= 32 { + match refscan::scan_clean(&input) { + Ok(buffer) | Err(buffer) => { + let n = buffer.len(); + input = &input[n..]; + } + } + + let buffer = { + let idx = input.iter().position(|x| match x { + b'a'..=b'z' | b'0'..=b'9' => false, + _ => true, + }); + idx.map(|idx| &input[..idx]).unwrap_or(input) + }; + + for chunk in buffer.windows(32) { + let offset = (chunk.as_ptr() as usize) - base; + let chunk = { + let mut fixed = [0u8; 32]; + fixed.copy_from_slice(chunk); + fixed + }; + if max_refs.contains(&chunk) { + let seen = unsafe { str::from_utf8_unchecked(&chunk) }; + println!("{} {}", seen, offset); + } + } + + let n = buffer.len(); + input = &input[n..]; + } +} diff --git a/users/edef/refscan/testdata/.gitignore b/users/edef/refscan/testdata/.gitignore new file mode 100644 index 0000000000..93c04efcf6 --- /dev/null +++ b/users/edef/refscan/testdata/.gitignore @@ -0,0 +1,3 @@ +/maxrefs +/nar +/result diff --git a/users/edef/refscan/testdata/generate.sh b/users/edef/refscan/testdata/generate.sh new file mode 100755 index 0000000000..02d8c40567 --- /dev/null +++ b/users/edef/refscan/testdata/generate.sh @@ -0,0 +1,6 @@ +#! /usr/bin/env bash +set -euo pipefail + +drv=$(nix-instantiate '' -A ghc) +nix --extra-experimental-features nix-command show-derivation -r "$drv" | jq -r '.[] | .outputs[].path, .inputSrcs[]' | sort -u | cut -d/ -f4 | cut -d- -f1 > maxrefs +nix-store --dump "$(nix-build "$drv")" > nar -- cgit 1.4.1