Diffstat (limited to 'users/zseri')
-rw-r--r--  users/zseri/.gitignore                                            2
-rw-r--r--  users/zseri/OWNERS                                                3
-rw-r--r--  users/zseri/dbwospof.md                                         112
-rw-r--r--  users/zseri/store-ref-scanner/.gitignore                          1
-rw-r--r--  users/zseri/store-ref-scanner/Cargo.toml                          6
-rw-r--r--  users/zseri/store-ref-scanner/default.nix                        49
-rw-r--r--  users/zseri/store-ref-scanner/fuzz/.gitignore                     2
-rw-r--r--  users/zseri/store-ref-scanner/fuzz/Cargo.lock                    44
-rw-r--r--  users/zseri/store-ref-scanner/fuzz/Cargo.toml                    36
-rw-r--r--  users/zseri/store-ref-scanner/fuzz/fuzz_targets/hbm-roundtrip.rs 10
-rw-r--r--  users/zseri/store-ref-scanner/fuzz/fuzz_targets/nocrash.rs        9
-rw-r--r--  users/zseri/store-ref-scanner/src/hbm.rs                        167
-rw-r--r--  users/zseri/store-ref-scanner/src/lib.rs                        215
-rw-r--r--  users/zseri/store-ref-scanner/src/spec.rs                        40
-rw-r--r--  users/zseri/store-ref-scanner/tests.nix                          32
15 files changed, 728 insertions, 0 deletions
diff --git a/users/zseri/.gitignore b/users/zseri/.gitignore
new file mode 100644
index 000000000000..b8553ace5532
--- /dev/null
+++ b/users/zseri/.gitignore
@@ -0,0 +1,2 @@
+.#*
+target/
diff --git a/users/zseri/OWNERS b/users/zseri/OWNERS
new file mode 100644
index 000000000000..4f565712149c
--- /dev/null
+++ b/users/zseri/OWNERS
@@ -0,0 +1,3 @@
+inherited: false
+owners:
+  - zseri
diff --git a/users/zseri/dbwospof.md b/users/zseri/dbwospof.md
new file mode 100644
index 000000000000..f1d68cde069c
--- /dev/null
+++ b/users/zseri/dbwospof.md
@@ -0,0 +1,112 @@
+# distributed build without single points of failure
+
+## problem statement
+> If we want to distribute a build across several build nodes, and want to avoid
+> a "single point of failure", what needs to be considered?
+
+## motivation
+
+* distribute the build across several build nodes, because some packages take
+  extremely long to build
+  (e.g. `firefox`, `thunderbird`, `qtwebengine`, `webkitgtk`, ...)
+* avoid a centralised setup like e.g. with Hydra, because we want to keep the
+  usual on-demand Nix workflow
+  (e.g. `nixos-rebuild` on each host when necessary).
+
+## list of abbreviations
+
+<dl>
+  <dt>CA</dt>		<dd>content-addressed</dd>
+  <dt>drv</dt>		<dd>derivation</dd>
+  <dt>FOD</dt>		<dd>fixed-output derivation</dd>
+  <dt>IHA</dt>		<dd>input-hash-addressed</dd>
+  <dt>inhash</dt>	<dd>input hash</dd>
+  <dt>outhash</dt>	<dd>output hash</dd>
+</dl>
+
+## build graph
+
+The build graph can't easily be distributed. It is instead kept on the
+coordinator, and the build nodes only receive individual build jobs, each
+consisting of a derivation plus some information about how to fetch its inputs
+from a central or distributed store (e.g. Ceph); this information may be
+transmitted "out of band".
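+
+As a rough illustration, such a build job could be as small as the following
+sketch (all names here are hypothetical and not part of any existing yzix
+interface):
+
+```rust
+/// opaque serialised derivation (ATerm, JSON, ...)
+pub struct Derivation(pub String);
+
+/// how a build node reaches the input store (e.g. a Ceph or HTTP endpoint)
+pub struct StoreEndpoint(pub String);
+
+/// what the coordinator sends to a build node for a single derivation build
+pub struct BuildJob {
+    /// the derivation to realise
+    pub drv: Derivation,
+    /// input-fetch information; may also be transmitted "out of band"
+    pub inputs_from: StoreEndpoint,
+}
+```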
+
+## inhash-exclusive
+
+It is necessary that each derivation build is exclusive, in the sense that
+the same derivation is never built multiple times simultaneously: otherwise
+this wastes compute resources (obviously) and, in the case of
+non-deterministic builds, increases complexity
+(the store needs to decide which result to prefer, and the build nodes with
+"losing" build results need to pull the "winning" build results from the store,
+replacing the local version). Although this might be unnecessary in the case
+of IHA drvs, always enforcing it reduces the number of possible surprising
+results when mixing CA drvs and IHA drvs.
+
+## what can be meaningfully distributed
+
+The following is strongly opinionated, based upon the original build graph
+implementation from yzix (12.2021):
+* We can't push the entire build graph to each build node, because they would
+  overlap 100%, and thus create extreme contention on the inhash-exclusive lock
+* We could try to split the build graph into multiple parts with independent
+  inputs (partitioning), but this can be really complex, and I'm not sure
+  if it is worth it... This also basically excludes the yzix node types
+  [ `Eval`, `AssertEqual` ] (these should be done by the evaluator).
+  Implementing this option, however, would make aborting a build graph
+  (the simple variant does not kill running tasks,
+  it just stops new tasks from being scheduled) really hard and complex to get right.
+* It does not make sense to distribute "node tasks" which almost exclusively
+  interact with the store and are I/O-bound rather than CPU-bound across
+  build nodes. This applies to most, if not all, useful FODs, and to the yzix
+  node types [ `Dump`, `UnDump`, `Fetch`, `Require` ]
+  (these should be performed by the evaluator+store).
+* TODO: figure out how to do forced rebuilds (e.g. also prefer a node which is not
+  the build node of the previous realisation of that task)
+
+## coarse per-derivation workflow
+
+```
+    derivation
+    |        |
+    |        |
+   key     build
+    |        |
+    |        |
+    V        V
+  inhash  outhash
+    |     (either CA or IHA)
+     \      /
+      \    /
+       \  /
+    realisation
+```
+
+## build results
+
+Just for completeness, two build results are currently considered:
+
+* success: the build succeeded, and the result is uploaded to the central store
+* failure: the build failed (e.g. build process terminated via error exit code or was killed)
+* another case might be "partial": the build succeeded, but uploading to the
+  central store failed (the result is only available on the build node that built it).
+  This case is interesting because we don't need to rerun the build; only the
+  upload step needs to be fixed or redone semi-manually (e.g. maybe the central
+  store ran out of storage, or the network was unavailable); see the sketch below.
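+
+These outcomes could be encoded roughly as in the following sketch
+(illustrative only; `BuildResult` is not an existing yzix type):
+
+```rust
+pub enum BuildResult {
+    /// the build succeeded and the result was uploaded to the central store
+    Success { outhash: String },
+    /// the build process exited with an error code or was killed
+    Failure { details: String },
+    /// the build succeeded, but the upload failed; the result only exists on
+    /// `build_node`, so only the upload step needs to be redone
+    Partial { outhash: String, build_node: String },
+}
+```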
+
+## build task queue
+
+It is naïve to think that something like a queue via `rabbitmq` (`AMQP`) or `MQTT`
+suffices, because some requirements are missing:
+
+1. some way to push build results back to the clients, associated with the
+  build inputs they correspond to (a hacky way might use multiple queues for
+  that, e.g. a `tasks` input queue and a `done` output queue).
+2. some way to lock "inhashes" (see the inhash-exclusive section above).
+
+The second point is fairly easy to realise using `etcd`; its `watch` mechanism
+can also be used to simulate a queue, and the inhash-addressing of queued
+derivations integrates seamlessly with it.
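+
+A minimal sketch of the inhash-exclusive lock, assuming only a key-value store
+with an atomic "create if absent" operation (etcd transactions can provide
+this); the trait and all names are hypothetical:
+
+```rust
+/// hypothetical abstraction over etcd (or any KV store with an atomic
+/// compare-and-set); it only illustrates the locking scheme
+pub trait Kv {
+    /// atomically create `key` with `value` iff it does not exist yet;
+    /// returns true iff this call created it (i.e. we won the lock)
+    fn create_if_absent(&mut self, key: &str, value: &str) -> bool;
+    /// release the lock again (e.g. once the realisation has been registered)
+    fn unlock(&mut self, key: &str);
+}
+
+/// try to claim the exclusive right to build `inhash` on behalf of `node_id`
+pub fn try_lock_inhash<K: Kv>(kv: &mut K, inhash: &str, node_id: &str) -> bool {
+    kv.create_if_absent(&format!("/locks/{}", inhash), node_id)
+}
+```
+
+Build nodes that lose this race would then `watch` the same key (or the
+resulting realisation entry) instead of starting the build themselves.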
+
+TODO: maybe we want to adjust the priorities of tasks in the queue, but Nix currently
+doesn't seem to do this, so consider this only when it starts to make sense as a
+performance or lag optimization.
diff --git a/users/zseri/store-ref-scanner/.gitignore b/users/zseri/store-ref-scanner/.gitignore
new file mode 100644
index 000000000000..5a44eef09a54
--- /dev/null
+++ b/users/zseri/store-ref-scanner/.gitignore
@@ -0,0 +1 @@
+/Cargo.lock
diff --git a/users/zseri/store-ref-scanner/Cargo.toml b/users/zseri/store-ref-scanner/Cargo.toml
new file mode 100644
index 000000000000..ad565f09af2e
--- /dev/null
+++ b/users/zseri/store-ref-scanner/Cargo.toml
@@ -0,0 +1,6 @@
+[package]
+name = "store-ref-scanner"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
diff --git a/users/zseri/store-ref-scanner/default.nix b/users/zseri/store-ref-scanner/default.nix
new file mode 100644
index 000000000000..38f3fd64ecd7
--- /dev/null
+++ b/users/zseri/store-ref-scanner/default.nix
@@ -0,0 +1,49 @@
+{ depot, lib, pkgs, ... }:
+
+let
+  sourceFilter = name: type:
+    let
+      baseName = builtins.baseNameOf (builtins.toString name);
+    in
+    (baseName == "Cargo.toml")
+    || (type == "directory" && baseName == "src")
+    || (lib.hasSuffix ".rs" baseName)
+  ;
+in
+
+pkgs.buildRustCrate rec {
+  pname = "store-ref-scanner";
+  crateName = "store-ref-scanner";
+  version = "0.1.0";
+  edition = "2021";
+  src = lib.cleanSourceWith { filter = sourceFilter; src = ./.; };
+
+  passthru.tests = pkgs.buildRustCrate {
+    pname = "store-ref-scanner-tests";
+    inherit crateName src version edition;
+    buildTests = true;
+    postInstall = ''
+      set -ex
+      export RUST_BACKTRACE=1
+      # recreate a file hierarchy as when running tests with cargo
+      # the source for test data
+      # build outputs
+      testRoot=target/debug
+      mkdir -p $testRoot
+      chmod +w -R .
+      # test harness executables are suffixed with a hash, like cargo does;
+      # this prevents name collisions with the main executables of the crate
+      hash=$(basename $out)
+      ls -lasR $out
+      for file in $out/tests/*; do
+        f=$testRoot/$(basename $file)-$hash
+        cp $file $f
+        $f 2>&1 | tee -a $out/tests.log
+      done
+      rm -rf $out/tests
+      set +ex
+    '';
+  };
+
+}
diff --git a/users/zseri/store-ref-scanner/fuzz/.gitignore b/users/zseri/store-ref-scanner/fuzz/.gitignore
new file mode 100644
index 000000000000..b400c2782601
--- /dev/null
+++ b/users/zseri/store-ref-scanner/fuzz/.gitignore
@@ -0,0 +1,2 @@
+corpus
+artifacts
diff --git a/users/zseri/store-ref-scanner/fuzz/Cargo.lock b/users/zseri/store-ref-scanner/fuzz/Cargo.lock
new file mode 100644
index 000000000000..7395dec05e45
--- /dev/null
+++ b/users/zseri/store-ref-scanner/fuzz/Cargo.lock
@@ -0,0 +1,44 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "arbitrary"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "510c76ecefdceada737ea728f4f9a84bd2e1ef29f1ba555e560940fe279954de"
+
+[[package]]
+name = "cc"
+version = "1.0.72"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee"
+
+[[package]]
+name = "libfuzzer-sys"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "36a9a84a6e8b55dfefb04235e55edb2b9a2a18488fcae777a6bdaa6f06f1deb3"
+dependencies = [
+ "arbitrary",
+ "cc",
+ "once_cell",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5"
+
+[[package]]
+name = "store-ref-scanner"
+version = "0.1.0"
+
+[[package]]
+name = "store-ref-scanner-fuzz"
+version = "0.0.0"
+dependencies = [
+ "libfuzzer-sys",
+ "store-ref-scanner",
+]
diff --git a/users/zseri/store-ref-scanner/fuzz/Cargo.toml b/users/zseri/store-ref-scanner/fuzz/Cargo.toml
new file mode 100644
index 000000000000..1832be00329c
--- /dev/null
+++ b/users/zseri/store-ref-scanner/fuzz/Cargo.toml
@@ -0,0 +1,36 @@
+[package]
+name = "store-ref-scanner-fuzz"
+version = "0.0.0"
+authors = ["Automatically generated"]
+publish = false
+edition = "2018"
+
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+libfuzzer-sys = "0.4"
+
+[dependencies.store-ref-scanner]
+path = ".."
+
+# Prevent this from interfering with workspaces
+[workspace]
+members = ["."]
+
+[[bin]]
+name = "hbm-roundtrip"
+path = "fuzz_targets/hbm-roundtrip.rs"
+test = false
+doc = false
+
+[[bin]]
+name = "nocrash"
+path = "fuzz_targets/nocrash.rs"
+test = false
+doc = false
+
+[profile.release]
+incremental = false
+overflow-checks = true
+panic = "abort"
diff --git a/users/zseri/store-ref-scanner/fuzz/fuzz_targets/hbm-roundtrip.rs b/users/zseri/store-ref-scanner/fuzz/fuzz_targets/hbm-roundtrip.rs
new file mode 100644
index 000000000000..9e21a7738a38
--- /dev/null
+++ b/users/zseri/store-ref-scanner/fuzz/fuzz_targets/hbm-roundtrip.rs
@@ -0,0 +1,10 @@
+#![no_main]
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: [u8; 16]| {
+    use store_ref_scanner::HalfBytesMask;
+    let a = HalfBytesMask(data);
+    let b = a.into_expanded();
+    let c = HalfBytesMask::from_expanded(b);
+    assert_eq!(a, c);
+});
diff --git a/users/zseri/store-ref-scanner/fuzz/fuzz_targets/nocrash.rs b/users/zseri/store-ref-scanner/fuzz/fuzz_targets/nocrash.rs
new file mode 100644
index 000000000000..48100a628d7a
--- /dev/null
+++ b/users/zseri/store-ref-scanner/fuzz/fuzz_targets/nocrash.rs
@@ -0,0 +1,9 @@
+#![no_main]
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    use store_ref_scanner::{StoreRefScanner, StoreSpec};
+
+    StoreRefScanner::new(&data[..], &StoreSpec::DFL_NIX2).count();
+    StoreRefScanner::new(&data[..], &StoreSpec::DFL_YZIX1).count();
+});
diff --git a/users/zseri/store-ref-scanner/src/hbm.rs b/users/zseri/store-ref-scanner/src/hbm.rs
new file mode 100644
index 000000000000..2520efd8363d
--- /dev/null
+++ b/users/zseri/store-ref-scanner/src/hbm.rs
@@ -0,0 +1,167 @@
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
+pub struct HalfBytesMask(pub [u8; 16]);
+
+#[allow(clippy::as_conversions, clippy::zero_prefixed_literal)]
+impl HalfBytesMask {
+    pub const B32_REVSHA256: HalfBytesMask =
+        HalfBytesMask([0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 222, 127, 207, 7]);
+
+    pub const B64_BLAKE2B256: HalfBytesMask = HalfBytesMask([
+        0, 0, 0, 0, 0, 8, 255, 3, 254, 255, 255, 135, 254, 255, 255, 7,
+    ]);
+
+    pub const DFL_REST: HalfBytesMask = HalfBytesMask([
+        0, 0, 0, 0, 0, 104, 255, 163, 254, 255, 255, 135, 254, 255, 255, 7,
+    ]);
+
+    #[inline]
+    pub const fn from_expanded(x: [bool; 128]) -> Self {
+        let mut ret = [0u8; 16];
+        let mut idx = 0;
+        while idx < 16 {
+            let fin = idx * 8;
+            let mut idx2 = 0;
+            while idx2 < 8 {
+                if x[fin + idx2] {
+                    ret[idx] += (1 << idx2) as u8;
+                }
+                idx2 += 1;
+            }
+            idx += 1;
+        }
+        Self(ret)
+    }
+
+    /// create a mask which allows exactly those characters that are included in the given string
+    pub fn from_bytes(s: &[u8]) -> Self {
+        s.iter().fold(Self([0u8; 16]), |mut ret, &i| {
+            ret.set(i, true);
+            ret
+        })
+    }
+
+    pub const fn into_expanded(self) -> [bool; 128] {
+        let Self(ihbm) = self;
+        let mut ret = [false; 128];
+        let mut idx = 0;
+        while idx < 16 {
+            let fin = idx * 8;
+            let curi = ihbm[idx];
+            let mut idx2 = 0;
+            while idx2 < 8 {
+                ret[fin + idx2] = (curi >> idx2) & 0b1 != 0;
+                idx2 += 1;
+            }
+            idx += 1;
+        }
+        ret
+    }
+
+    pub fn contains(&self, byte: u8) -> bool {
+        if byte >= 0x80 {
+            false
+        } else {
+            (self.0[usize::from(byte / 8)] >> u32::from(byte % 8)) & 0b1 != 0
+        }
+    }
+
+    pub fn set(&mut self, byte: u8, allow: bool) {
+        if byte >= 0x80 {
+            if cfg!(debug_assertions) {
+                panic!(
+                    "tried to manipulate invalid byte {:?} in HalfBytesMask",
+                    byte
+                );
+            } else {
+                return;
+            }
+        }
+        let block = &mut self.0[usize::from(byte / 8)];
+        let bitpat = (1 << u32::from(byte % 8)) as u8;
+        if allow {
+            *block |= bitpat;
+        } else {
+            *block &= !bitpat;
+        }
+    }
+
+    #[cfg(test)]
+    fn count_ones(&self) -> u8 {
+        self.0
+            .iter()
+            .map(|i| i.count_ones())
+            .sum::<u32>()
+            .try_into()
+            .unwrap()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn maskbase() {
+        assert_eq!(HalfBytesMask::B32_REVSHA256.count_ones(), 32);
+        assert_eq!(HalfBytesMask::B64_BLAKE2B256.count_ones(), 64);
+    }
+
+    #[test]
+    fn non_ascii() {
+        for i in 0x80..=0xff {
+            assert!(!HalfBytesMask::DFL_REST.contains(i));
+        }
+    }
+
+    #[test]
+    fn dflmask() {
+        assert_eq!(
+            HalfBytesMask::from_expanded(
+                [
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                ]
+                .map(|i| i != 0)
+            ),
+            Default::default(),
+        );
+
+        assert_eq!(
+            HalfBytesMask::from_expanded(
+                [
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
+                    1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
+                    1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+                ]
+                .map(|i| i != 0)
+            ),
+            HalfBytesMask::B32_REVSHA256,
+        );
+
+        assert_eq!(
+            HalfBytesMask::from_expanded(
+                [
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
+                    1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
+                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+                ]
+                .map(|i| i != 0)
+            ),
+            HalfBytesMask::B64_BLAKE2B256,
+        );
+
+        assert_eq!(
+            HalfBytesMask::from_bytes(
+                b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-._?="
+            ),
+            HalfBytesMask::DFL_REST,
+        );
+    }
+}
diff --git a/users/zseri/store-ref-scanner/src/lib.rs b/users/zseri/store-ref-scanner/src/lib.rs
new file mode 100644
index 000000000000..0f86a769fe63
--- /dev/null
+++ b/users/zseri/store-ref-scanner/src/lib.rs
@@ -0,0 +1,215 @@
+#![no_std]
+#![forbid(clippy::cast_ptr_alignment, trivial_casts, unconditional_recursion)]
+#![deny(clippy::as_conversions)]
+
+mod hbm;
+pub use hbm::HalfBytesMask;
+
+mod spec;
+pub use spec::*;
+
+/// limit maximal length of store basename
+const BASENAME_MAXLEN: usize = 255;
+
+/// this is a trait which implements the interface of possible inputs
+/// (usually byte slices)
+pub trait ScannerInput: AsRef<[u8]> + Sized {
+    /// Splits the input into two at the given index.
+    /// Afterwards self contains elements [at, len), and the returned input part contains elements [0, at).
+    fn split_to(&mut self, at: usize) -> Self;
+    fn finish(&mut self);
+}
+
+impl ScannerInput for &[u8] {
+    fn split_to(&mut self, at: usize) -> Self {
+        let (a, b) = self.split_at(at);
+        *self = b;
+        a
+    }
+
+    fn finish(&mut self) {
+        *self = &[];
+    }
+}
+
+impl ScannerInput for &mut [u8] {
+    fn split_to(&mut self, at: usize) -> Self {
+        // Lifetime dance taken from `impl Write for &mut [u8]`.
+        // Taken from crate `std`.
+        let (a, b) = core::mem::take(self).split_at_mut(at);
+        *self = b;
+        a
+    }
+
+    fn finish(&mut self) {
+        *self = &mut [];
+    }
+}
+
+/// this is the primary structure of this crate
+///
+/// it represents a scanner which scans binary slices for store references,
+/// and implements an iterator interface which returns them as byte slices.
+pub struct StoreRefScanner<'x, Input: 'x> {
+    input: Input,
+    spec: &'x StoreSpec<'x>,
+}
+
+impl<'x, Input> StoreRefScanner<'x, Input>
+where
+    Input: ScannerInput + 'x,
+{
+    pub fn new(input: Input, spec: &'x StoreSpec<'x>) -> Self {
+        for i in [&spec.valid_hashbytes, &spec.valid_restbytes] {
+            for j in [b'\0', b' ', b'\t', b'\n', b'/', b'\\'] {
+                assert!(!i.contains(j));
+            }
+        }
+        Self { input, spec }
+    }
+}
+
+impl<'x, Input: 'x> Iterator for StoreRefScanner<'x, Input>
+where
+    Input: ScannerInput + 'x,
+{
+    type Item = Input;
+
+    fn next(&mut self) -> Option<Input> {
+        let hbl: usize = self.spec.hashbytes_len.into();
+        'outer: while !self.input.as_ref().is_empty() {
+            if !self.spec.path_to_store.is_empty() {
+                let p2sas = self.spec.path_to_store;
+                while !self.input.as_ref().starts_with(p2sas.as_bytes()) {
+                    if self.input.as_ref().is_empty() {
+                        break 'outer;
+                    }
+                    self.input.split_to(1);
+                }
+                self.input.split_to(p2sas.len());
+                if self.input.as_ref().is_empty() {
+                    break 'outer;
+                }
+            }
+            let hsep = matches!(self.input.as_ref().iter().next(), Some(b'/') | Some(b'\\'));
+            self.input.split_to(1);
+            if hsep && self.spec.check_rest(self.input.as_ref()) {
+                // we have found a valid hash
+                // rest contains the store basename and all following components
+                // now let's search for the end
+                // and then cut off possible following components after the basename
+                let rlen = self
+                    .input
+                    .as_ref()
+                    .iter()
+                    .enumerate()
+                    .take(BASENAME_MAXLEN)
+                    .skip(hbl)
+                    .find(|&(_, &i)| !self.spec.valid_restbytes.contains(i))
+                    .map(|(eosp, _)| eosp)
+                    .unwrap_or_else(|| core::cmp::min(BASENAME_MAXLEN, self.input.as_ref().len()));
+                return Some(self.input.split_to(rlen));
+            }
+        }
+        self.input.finish();
+        None
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    extern crate alloc;
+    use alloc::{vec, vec::Vec};
+
+    #[test]
+    fn simple_nix2() {
+        let drv: &[u8] = br#"
+            Derive([("out","","r:sha256","")],[("/nix/store/2ax7bvjdfkzim69q957i0jlg0nvmapg0-util-linux-2.37.2.drv",["dev"]),("/nix/store/6b55ssmh8pzqsc4q4kw1yl3kqvr4fvqj-bash-5.1-p12.drv",["out"]),("/nix/store/fp2vx24kczlzv84avds28wyzsmrn8kyv-source.drv",["out"]),("/nix/store/s6c2lm5hpsvdwnxq9y1g3ngncghjzc3k-stdenv-linux.drv",["out"]),("/nix/store/xlnzpf4mzghi8vl0krabrgcbnqk5qjf3-pkg-config-wrapper-0.29.2.drv",["out"])],["/nix/store/03sl46khd8gmjpsad7223m32ma965vy9-fix-static.patch","/nix/store/2q3z7587yhlz0i2xvfvvap42zk5carlv-bcache-udev-modern.patch","/nix/store/9krlzvny65gdc8s7kpb6lkx8cd02c25b-default-builder.sh"],"x86_64-linux","/0g15yibzzi3rmw29gqlbms05x9dbghbvh61v1qggydvmzh3bginw/bin/bash",["-e","/nix/store/9krlzvny65gdc8s7kpb6lkx8cd02c25b-default-builder.sh"],[("buildInputs","/0sdk1r4l43yw4g6lmqdhd92vhdfhlwz3m76jxzvzsqsv63czw2km"),("builder","/0g15yibzzi3rmw29gqlbms05x9dbghbvh61v1qggydvmzh3bginw/bin/bash"),("configureFlags",""),("depsBuildBuild",""),("depsBuildBuildPropagated",""),("depsBuildTarget",""),("depsBuildTargetPropagated",""),("depsHostHost",""),("depsHostHostPropagated",""),("depsTargetTarget",""),("depsTargetTargetPropagated",""),("doCheck",""),("doInstallCheck",""),("makeFlags","PREFIX=/1rz4g4znpzjwh1xymhjpm42vipw92pr73vdgl6xs1hycac8kf2n9 UDEVLIBDIR=/1rz4g4znpzjwh1xymhjpm42vipw92pr73vdgl6xs1hycac8kf2n9/lib/udev/"),("name","bcache-tools-1.0.7"),("nativeBuildInputs","/1kw0rwgdyq9q69wmmsa5d2kap6p52b0yldbzi4w17bhcq5g5cp2f"),("out","/1rz4g4znpzjwh1xymhjpm42vipw92pr73vdgl6xs1hycac8kf2n9"),("outputHashAlgo","sha256"),("outputHashMode","recursive"),("outputs","out"),("patches","/nix/store/2q3z7587yhlz0i2xvfvvap42zk5carlv-bcache-udev-modern.patch /nix/store/03sl46khd8gmjpsad7223m32ma965vy9-fix-static.patch"),("pname","bcache-tools"),("preBuild","sed -e \"s|/bin/sh|/0g15yibzzi3rmw29gqlbms05x9dbghbvh61v1qggydvmzh3bginw/bin/sh|\" -i *.rules\n"),("preInstall","mkdir -p \"$out/sbin\" \"$out/lib/udev/rules.d\" \"$out/share/man/man8\"\n"),("prePatch","sed -e \"/INSTALL.*initramfs\\/hook/d\" \\\n    -e \"/INSTALL.*initcpio\\/install/d\" \\\n    -e \"/INSTALL.*dracut\\/module-setup.sh/d\" \\\n    -e \"s/pkg-config/$PKG_CONFIG/\" \\\n    -i Makefile\n"),("propagatedBuildInputs",""),("propagatedNativeBuildInputs",""),("src","/nix/store/6izcafvfcbz19chi7hl20834g0fa043n-source"),("stdenv","/01ncyv8bxibj0imgfvmxgqy648n697bachil6aw6i46g1jk0bbds"),("strictDeps",""),("system","x86_64-linux"),("version","1.0.7")])
+        "#;
+        // we convert everything into strings because it is way easier to compare elements in error messages
+        let refs: Vec<&str> = StoreRefScanner::new(drv, &StoreSpec::DFL_NIX2)
+            .map(|i| core::str::from_utf8(i).unwrap())
+            .collect();
+        let refs_expect: Vec<&[u8]> = vec![
+            b"2ax7bvjdfkzim69q957i0jlg0nvmapg0-util-linux-2.37.2.drv",
+            b"6b55ssmh8pzqsc4q4kw1yl3kqvr4fvqj-bash-5.1-p12.drv",
+            b"fp2vx24kczlzv84avds28wyzsmrn8kyv-source.drv",
+            b"s6c2lm5hpsvdwnxq9y1g3ngncghjzc3k-stdenv-linux.drv",
+            b"xlnzpf4mzghi8vl0krabrgcbnqk5qjf3-pkg-config-wrapper-0.29.2.drv",
+            b"03sl46khd8gmjpsad7223m32ma965vy9-fix-static.patch",
+            b"2q3z7587yhlz0i2xvfvvap42zk5carlv-bcache-udev-modern.patch",
+            b"9krlzvny65gdc8s7kpb6lkx8cd02c25b-default-builder.sh",
+            b"9krlzvny65gdc8s7kpb6lkx8cd02c25b-default-builder.sh",
+            b"2q3z7587yhlz0i2xvfvvap42zk5carlv-bcache-udev-modern.patch",
+            b"03sl46khd8gmjpsad7223m32ma965vy9-fix-static.patch",
+            b"6izcafvfcbz19chi7hl20834g0fa043n-source",
+        ];
+        let refs_expect: Vec<&str> = refs_expect
+            .into_iter()
+            .map(|i| core::str::from_utf8(i).unwrap())
+            .collect();
+        assert_eq!(refs, refs_expect);
+    }
+
+    #[test]
+    fn simple_yzix1() {
+        // I haven't yet produced any yzix derivation which included /yzixs absolute paths...
+        let fake: &[u8] = br#"
+            /yzixs/4Zx1PBoft1YyAuKdhjAY1seZFHloxQ+8voHQRkRMuys:         ASCII text
+            /yzixs/dNE3yogD4JHKHzNa2t3jQMZddT8wjqlMDB0naDIFo0A:         ASCII text
+            /yzixs/FMluSVOHLc4bxX7F4lBCXafNljBnDn+rAM5HzG7k8LI:         unified diff output, ASCII text
+            /yzixs/g2G3GRL87hGEdw9cq2BZWqDQP_HeHSPRLbJ9P9KH+HI:         unified diff output, ASCII text
+            /yzixs/H08Av1ZAONwFdzVLpFQm0Sc0dvyk0sbnk82waoBig7I:         ASCII text
+            /yzixs/IndARQp+gaGDLS3K+PeyXdaRqAcCyS3EIbRXkkYjC94:         unified diff output, ASCII text
+            /yzixs/IrLPnbkEolTAuWRxkXpuvVs6Imb1iB6wUJcI+fxWwkU:         POSIX shell script, ASCII text executable
+            /yzixs/JsS_H3n3TSh2R6fiIzgOPZdjSmRkV71vGxstJJKPmr4:         unified diff output, ASCII text
+            /yzixs/LZ6pQh1x8DRxZ2IYzetBRS4LuE__IXFjpOfQPxHVwpw:         unified diff output, ASCII text
+            /yzixs/mEi2RPep9daRs0JUvwt1JsDfgYSph5sH_+_ihwn8IGQ:         ASCII text
+            /yzixs/nd4DyljinP3auDMHL_LrpsRJkWQpSHQK2jqtyyzWcBA:         POSIX shell script, ASCII text executable
+            /yzixs/nzpaknF0_ONSHtd0i_e1E3pkLF1QPeJQhAB7x9Ogo_M:         unified diff output, ASCII text
+            /yzixs/UZ3uzVUUMC1gKGLw6tg_aLFwoFrJedXB3xbhEgQOaiY:         unified diff output, ASCII text
+            /yzixs/VKyXxKTXsDGxYJ24YgbvCc1bZkA5twp3TC+Gbi4Kwd8:         unified diff output, ASCII text
+            /yzixs/VPJMl8O1xkc1LsJznpoQrCrQO0Iy+ODCPsgoUBLiRZc:         unified diff output, ASCII text
+            /yzixs/W6r1ow001ASHRj+gtRfyj9Fb_gCO_pBztX8WhYXVdIc:         unified diff output, ASCII text
+            /yzixs/xvwEcXIob_rQynUEtQiQbwaDXEobTVKEGaBMir9oH9k:         unified diff output, ASCII text
+            /yzixs/ZPvQbRJrtyeSITvW3FUZvw99hhNOO3CFqGgmWgScxcg:         ASCII text
+        "#;
+        let refs: Vec<&str> = StoreRefScanner::new(fake, &StoreSpec::DFL_YZIX1)
+            .map(|i| core::str::from_utf8(i).unwrap())
+            .collect();
+        let refs_expect: Vec<&[u8]> = vec![
+            b"4Zx1PBoft1YyAuKdhjAY1seZFHloxQ+8voHQRkRMuys",
+            b"dNE3yogD4JHKHzNa2t3jQMZddT8wjqlMDB0naDIFo0A",
+            b"FMluSVOHLc4bxX7F4lBCXafNljBnDn+rAM5HzG7k8LI",
+            b"g2G3GRL87hGEdw9cq2BZWqDQP_HeHSPRLbJ9P9KH+HI",
+            b"H08Av1ZAONwFdzVLpFQm0Sc0dvyk0sbnk82waoBig7I",
+            b"IndARQp+gaGDLS3K+PeyXdaRqAcCyS3EIbRXkkYjC94",
+            b"IrLPnbkEolTAuWRxkXpuvVs6Imb1iB6wUJcI+fxWwkU",
+            b"JsS_H3n3TSh2R6fiIzgOPZdjSmRkV71vGxstJJKPmr4",
+            b"LZ6pQh1x8DRxZ2IYzetBRS4LuE__IXFjpOfQPxHVwpw",
+            b"mEi2RPep9daRs0JUvwt1JsDfgYSph5sH_+_ihwn8IGQ",
+            b"nd4DyljinP3auDMHL_LrpsRJkWQpSHQK2jqtyyzWcBA",
+            b"nzpaknF0_ONSHtd0i_e1E3pkLF1QPeJQhAB7x9Ogo_M",
+            b"UZ3uzVUUMC1gKGLw6tg_aLFwoFrJedXB3xbhEgQOaiY",
+            b"VKyXxKTXsDGxYJ24YgbvCc1bZkA5twp3TC+Gbi4Kwd8",
+            b"VPJMl8O1xkc1LsJznpoQrCrQO0Iy+ODCPsgoUBLiRZc",
+            b"W6r1ow001ASHRj+gtRfyj9Fb_gCO_pBztX8WhYXVdIc",
+            b"xvwEcXIob_rQynUEtQiQbwaDXEobTVKEGaBMir9oH9k",
+            b"ZPvQbRJrtyeSITvW3FUZvw99hhNOO3CFqGgmWgScxcg",
+        ];
+        let refs_expect: Vec<&str> = refs_expect
+            .into_iter()
+            .map(|i| core::str::from_utf8(i).unwrap())
+            .collect();
+        assert_eq!(refs, refs_expect);
+    }
+
+    #[test]
+    fn just_store() {
+        for i in [&StoreSpec::DFL_NIX2, &StoreSpec::DFL_YZIX1] {
+            let refs: Vec<&[u8]> = StoreRefScanner::new(i.path_to_store.as_bytes(), i).collect();
+            assert!(refs.is_empty());
+        }
+    }
+}
diff --git a/users/zseri/store-ref-scanner/src/spec.rs b/users/zseri/store-ref-scanner/src/spec.rs
new file mode 100644
index 000000000000..79da0842c529
--- /dev/null
+++ b/users/zseri/store-ref-scanner/src/spec.rs
@@ -0,0 +1,40 @@
+use crate::hbm::HalfBytesMask;
+
+pub struct StoreSpec<'path> {
+    /// path to store without trailing slash
+    pub path_to_store: &'path str,
+
+    /// compressed map of allowed ASCII characters in hash part
+    pub valid_hashbytes: HalfBytesMask,
+
+    /// compressed map of allowed ASCII characters in part after hash
+    pub valid_restbytes: HalfBytesMask,
+
+    /// exact length of hash part of store paths
+    pub hashbytes_len: u8,
+}
+
+impl StoreSpec<'_> {
+    pub(crate) fn check_rest(&self, rest: &[u8]) -> bool {
+        let hbl = self.hashbytes_len.into();
+        rest.iter()
+            .take(hbl)
+            .take_while(|&&i| self.valid_hashbytes.contains(i))
+            .count()
+            == hbl
+    }
+
+    pub const DFL_NIX2: StoreSpec<'static> = StoreSpec {
+        path_to_store: "/nix/store",
+        valid_hashbytes: HalfBytesMask::B32_REVSHA256,
+        valid_restbytes: HalfBytesMask::DFL_REST,
+        hashbytes_len: 32,
+    };
+
+    pub const DFL_YZIX1: StoreSpec<'static> = StoreSpec {
+        path_to_store: "/yzixs",
+        valid_hashbytes: HalfBytesMask::B64_BLAKE2B256,
+        valid_restbytes: HalfBytesMask::DFL_REST,
+        hashbytes_len: 43,
+    };
+}
diff --git a/users/zseri/store-ref-scanner/tests.nix b/users/zseri/store-ref-scanner/tests.nix
new file mode 100644
index 000000000000..a4c82fe3a936
--- /dev/null
+++ b/users/zseri/store-ref-scanner/tests.nix
@@ -0,0 +1,32 @@
+{ depot, lib, pkgs, ... }:
+
+let
+  parent = depot.users.zseri.store-ref-scanner;
+in
+pkgs.buildRustCrate {
+  pname = "store-ref-scanner-tests";
+  inherit (parent) crateName src version edition;
+  buildTests = true;
+  postInstall = ''
+    set -ex
+    export RUST_BACKTRACE=1
+    # recreate a file hierarchy as when running tests with cargo
+    # the source for test data
+    # build outputs
+    testRoot=target/debug
+    mkdir -p $testRoot
+    chmod +w -R .
+    # test harness executables are suffixed with a hash, like cargo does;
+    # this prevents name collisions with the main executables of the crate
+    hash=$(basename $out)
+    ls -lasR $out
+    for file in $out/tests/*; do
+      f=$testRoot/$(basename $file)-$hash
+      cp $file $f
+      $f 2>&1 | tee -a $out/tests.log
+    done
+    rm -rf $out/tests
+    set +ex
+  '';
+}