about summary refs log tree commit diff
path: root/users/edef/refscan
diff options
context:
space:
mode:
authoredef <edef@edef.eu>2023-01-09T20·12+0000
committeredef <edef@edef.eu>2023-01-09T20·15+0000
commit0b3c0725a28786c8d8f2bfc659e8f0a5beedb05a (patch)
tree0d028b57aec6ef25be236992d0d06c9e4305a0c7 /users/edef/refscan
parent681800b438fa66f897759a197aba82f0122efcc3 (diff)
feat(users/edef/refscan): high-performance Nix reference scanner r/5636
Research-grade code, treat with care.

Change-Id: I99804df93e64101ef24928238ef0a8a02b59c2aa
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7686
Reviewed-by: edef <edef@edef.eu>
Tested-by: BuildkiteCI
Diffstat (limited to 'users/edef/refscan')
-rw-r--r--users/edef/refscan/.gitignore2
-rw-r--r--users/edef/refscan/Cargo.lock25
-rw-r--r--users/edef/refscan/Cargo.toml10
-rw-r--r--users/edef/refscan/src/lib.rs53
-rw-r--r--users/edef/refscan/src/main.rs55
-rw-r--r--users/edef/refscan/testdata/.gitignore3
-rwxr-xr-xusers/edef/refscan/testdata/generate.sh6
7 files changed, 154 insertions, 0 deletions
diff --git a/users/edef/refscan/.gitignore b/users/edef/refscan/.gitignore
new file mode 100644
index 000000000000..53eaa21960d1
--- /dev/null
+++ b/users/edef/refscan/.gitignore
@@ -0,0 +1,2 @@
+/target
+**/*.rs.bk
diff --git a/users/edef/refscan/Cargo.lock b/users/edef/refscan/Cargo.lock
new file mode 100644
index 000000000000..6a079249b3c0
--- /dev/null
+++ b/users/edef/refscan/Cargo.lock
@@ -0,0 +1,25 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+[[package]]
+name = "cfg-if"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
+name = "packed_simd"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "refscan"
+version = "0.1.0"
+dependencies = [
+ "packed_simd 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[metadata]
+"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
+"checksum packed_simd 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a85ea9fc0d4ac0deb6fe7911d38786b32fc11119afd9e9d38b84ff691ce64220"
diff --git a/users/edef/refscan/Cargo.toml b/users/edef/refscan/Cargo.toml
new file mode 100644
index 000000000000..778d9d24edb4
--- /dev/null
+++ b/users/edef/refscan/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "refscan"
+version = "0.1.0"
+authors = ["edef <edef@edef.eu>"]
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+packed_simd = "0.3.3"
diff --git a/users/edef/refscan/src/lib.rs b/users/edef/refscan/src/lib.rs
new file mode 100644
index 000000000000..79cf21a1b984
--- /dev/null
+++ b/users/edef/refscan/src/lib.rs
@@ -0,0 +1,53 @@
+use packed_simd::{m8x32, u8x32};
+
+fn prefilter(haystack: u8x32) -> m8x32 {
+    let alp = haystack.gt(u8x32::splat(b'a' - 1)) & haystack.lt(u8x32::splat(b'z' + 1));
+    let num = haystack.gt(u8x32::splat(b'0' - 1)) & haystack.lt(u8x32::splat(b'9' + 1));
+    alp | num
+}
+
+/// scan_clean returns `Err(&buffer[..n])` of known pointer-free data,
+/// or `Ok(buffer)` if the entire buffer is pointer-free.
+pub fn scan_clean(buffer: &[u8]) -> Result<&[u8], &[u8]> {
+    let buffer = {
+        let n = buffer.len() & !31;
+        &buffer[..n]
+    };
+
+    let mut masks = buffer
+        .chunks_exact(32)
+        .map(|chunk| prefilter(u8x32::from_slice_unaligned(chunk)).bitmask())
+        .enumerate()
+        .map(|e| (e.0 * 32, e.1))
+        .peekable();
+
+    while let Some((offset, mask)) = masks.next() {
+        let peek = masks.peek().map(|x| x.1).unwrap_or(!0 >> 1);
+        let n = (!mask).leading_zeros() + (!peek).trailing_zeros();
+        if n >= 32 {
+            let offset = offset + mask.trailing_zeros() as usize;
+            return Err(&buffer[..offset]);
+        }
+    }
+
+    Ok(buffer)
+}
+
+#[cfg(test)]
+mod test {
+    #[test]
+    fn scan_tail() {
+        let buffer = b"_xfbmj7sl2ikicym9x3yq7cms5qx1w39k";
+        assert_eq!(crate::scan_clean(buffer), Err(&buffer[..1]));
+    }
+    #[test]
+    fn scan_straddle() {
+        let buffer = b"________________xfbmj7sl2ikicym9x3yq7cms5qx1w39k________________";
+        assert_eq!(crate::scan_clean(buffer), Err(&buffer[..16]));
+    }
+    #[test]
+    fn scan_clean() {
+        let buffer = b"x_______________xfbmj7sl2ikicym9x3yq-cms5qx1w3-k________________";
+        assert_eq!(crate::scan_clean(buffer), Ok(&buffer[..]));
+    }
+}
diff --git a/users/edef/refscan/src/main.rs b/users/edef/refscan/src/main.rs
new file mode 100644
index 000000000000..9bbb5ed82312
--- /dev/null
+++ b/users/edef/refscan/src/main.rs
@@ -0,0 +1,55 @@
+use std::{
+    collections::BTreeSet as Set,
+    convert::TryInto,
+    io::{self, Read},
+    str,
+};
+
+fn main() {
+    let max_refs: Set<[u8; 32]> = include_str!("../testdata/maxrefs")
+        .lines()
+        .map(|l| l.as_bytes().try_into().unwrap())
+        .collect();
+
+    let input = {
+        let stdin = io::stdin();
+        let mut buffer = Vec::new();
+        stdin.lock().read_to_end(&mut buffer).unwrap();
+        buffer
+    };
+
+    let base = input.as_ptr() as usize;
+    let mut input: &[u8] = &input;
+    while input.len() >= 32 {
+        match refscan::scan_clean(&input) {
+            Ok(buffer) | Err(buffer) => {
+                let n = buffer.len();
+                input = &input[n..];
+            }
+        }
+
+        let buffer = {
+            let idx = input.iter().position(|x| match x {
+                b'a'..=b'z' | b'0'..=b'9' => false,
+                _ => true,
+            });
+            idx.map(|idx| &input[..idx]).unwrap_or(input)
+        };
+
+        for chunk in buffer.windows(32) {
+            let offset = (chunk.as_ptr() as usize) - base;
+            let chunk = {
+                let mut fixed = [0u8; 32];
+                fixed.copy_from_slice(chunk);
+                fixed
+            };
+            if max_refs.contains(&chunk) {
+                let seen = unsafe { str::from_utf8_unchecked(&chunk) };
+                println!("{} {}", seen, offset);
+            }
+        }
+
+        let n = buffer.len();
+        input = &input[n..];
+    }
+}
diff --git a/users/edef/refscan/testdata/.gitignore b/users/edef/refscan/testdata/.gitignore
new file mode 100644
index 000000000000..93c04efcf655
--- /dev/null
+++ b/users/edef/refscan/testdata/.gitignore
@@ -0,0 +1,3 @@
+/maxrefs
+/nar
+/result
diff --git a/users/edef/refscan/testdata/generate.sh b/users/edef/refscan/testdata/generate.sh
new file mode 100755
index 000000000000..02d8c4056789
--- /dev/null
+++ b/users/edef/refscan/testdata/generate.sh
@@ -0,0 +1,6 @@
+#! /usr/bin/env bash
+set -euo pipefail
+
+drv=$(nix-instantiate '<nixpkgs>' -A ghc)
+nix --extra-experimental-features nix-command show-derivation -r "$drv" | jq -r '.[] | .outputs[].path, .inputSrcs[]' | sort -u | cut -d/ -f4 | cut -d- -f1 > maxrefs
+nix-store --dump "$(nix-build "$drv")" > nar