about summary refs log tree commit diff
path: root/tvix/cli
diff options
context:
space:
mode:
authorVincent Ambo <mail@tazj.in>2023-01-11T15·14+0300
committerclbot <clbot@tvl.fyi>2023-01-11T16·03+0000
commit3045645df07ffdb54f9d2a11ee2e41e31999986f (patch)
treeee768e7a42e4addd6c5f9591163b8b8a683d68ca /tvix/cli
parent9382afdb0d7258cee8e7f20d646a85076f38011a (diff)
feat(tvix/cli): implement initial refscan module r/5643
This module implements a ReferenceScanner struct which uses the
aho_corasick crate to scan string inputs for known, non-overlapping
candidates (store paths, in our case).

I experimented with several different APIs, and landed on this version
with an initial accumulator in the scanner. The scanner is
instantiated from the candidates and "fed" all the strings, then
consumed by the caller to retrieve the result.

Right now only things that look vaguely like bytestrings can be fed to
the scanner, there is no streaming support in the API yet.

Change-Id: I7782f0f0df5fc64bccd813aa14712f5525b0168c
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7808
Autosubmit: tazjin <tazjin@tvl.su>
Reviewed-by: flokli <flokli@flokli.de>
Tested-by: BuildkiteCI
Diffstat (limited to 'tvix/cli')
-rw-r--r--tvix/cli/Cargo.toml1
-rw-r--r--tvix/cli/src/main.rs1
-rw-r--r--tvix/cli/src/refscan.rs97
3 files changed, 99 insertions, 0 deletions
diff --git a/tvix/cli/Cargo.toml b/tvix/cli/Cargo.toml
index 099002353dc2..f3324f2611ea 100644
--- a/tvix/cli/Cargo.toml
+++ b/tvix/cli/Cargo.toml
@@ -13,3 +13,4 @@ rustyline = "10.0.0"
 clap = { version = "4.0", features = ["derive", "env"] }
 dirs = "4.0.0"
 smol_str = "0.1"
+aho-corasick = "0.7"
diff --git a/tvix/cli/src/main.rs b/tvix/cli/src/main.rs
index eec4d8bbb2ac..0f837b346c2d 100644
--- a/tvix/cli/src/main.rs
+++ b/tvix/cli/src/main.rs
@@ -1,4 +1,5 @@
 mod nix_compat;
+mod refscan;
 
 use std::{fs, path::PathBuf};
 
diff --git a/tvix/cli/src/refscan.rs b/tvix/cli/src/refscan.rs
new file mode 100644
index 000000000000..76857142e87e
--- /dev/null
+++ b/tvix/cli/src/refscan.rs
@@ -0,0 +1,97 @@
+//! Simple scanner for non-overlapping, known references of Nix store paths in a
+//! given string.
+//!
+//! This is used for determining build references (see
+//! //tvix/eval/docs/build-references.md for more details).
+//!
+//! The scanner itself is an Aho-Corasick automaton, using the `aho-corasick`
+//! crate.
+
+use aho_corasick::AhoCorasick;
+use std::collections::BTreeSet;
+
+/// Represents a "primed" reference scanner with an automaton that knows the set
+/// of store paths to scan for.
+pub struct ReferenceScanner<'c, 's> {
+    candidates: &'c [&'s str],
+    searcher: AhoCorasick,
+    matches: BTreeSet<&'s str>,
+}
+
+impl<'c, 's> ReferenceScanner<'c, 's> {
+    /// Construct a new `ReferenceScanner` that knows how to scan for the given
+    /// candidate store paths.
+    pub fn new(candidates: &'c [&'s str]) -> Self {
+        let searcher = AhoCorasick::new_auto_configured(candidates);
+
+        ReferenceScanner {
+            searcher,
+            candidates,
+            matches: Default::default(),
+        }
+    }
+
+    /// Scan the given string for all non-overlapping matches and collect them
+    /// in the scanner.
+    pub fn scan_str<H: AsRef<[u8]>>(&mut self, haystack: H) {
+        for m in self.searcher.find_iter(&haystack) {
+            let needle = self.candidates[m.pattern()];
+            self.matches.insert(needle);
+        }
+    }
+
+    /// Finalise the reference scanner and return the resulting matches.
+    pub fn finalise(self) -> BTreeSet<&'s str> {
+        self.matches
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // The actual derivation of `nixpkgs.hello`.
+    const HELLO_DRV: &'static str = r#"Derive([("out","/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1","","")],[("/nix/store/6z1jfnqqgyqr221zgbpm30v91yfj3r45-bash-5.1-p16.drv",["out"]),("/nix/store/ap9g09fxbicj836zm88d56dn3ff4clxl-stdenv-linux.drv",["out"]),("/nix/store/pf80kikyxr63wrw56k00i1kw6ba76qik-hello-2.12.1.tar.gz.drv",["out"])],["/nix/store/9krlzvny65gdc8s7kpb6lkx8cd02c25b-default-builder.sh"],"x86_64-linux","/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16/bin/bash",["-e","/nix/store/9krlzvny65gdc8s7kpb6lkx8cd02c25b-default-builder.sh"],[("buildInputs",""),("builder","/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16/bin/bash"),("cmakeFlags",""),("configureFlags",""),("depsBuildBuild",""),("depsBuildBuildPropagated",""),("depsBuildTarget",""),("depsBuildTargetPropagated",""),("depsHostHost",""),("depsHostHostPropagated",""),("depsTargetTarget",""),("depsTargetTargetPropagated",""),("doCheck","1"),("doInstallCheck",""),("mesonFlags",""),("name","hello-2.12.1"),("nativeBuildInputs",""),("out","/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1"),("outputs","out"),("patches",""),("pname","hello"),("propagatedBuildInputs",""),("propagatedNativeBuildInputs",""),("src","/nix/store/pa10z4ngm0g83kx9mssrqzz30s84vq7k-hello-2.12.1.tar.gz"),("stdenv","/nix/store/cp65c8nk29qq5cl1wyy5qyw103cwmax7-stdenv-linux"),("strictDeps",""),("system","x86_64-linux"),("version","2.12.1")])"#;
+
+    #[test]
+    fn test_empty() {
+        let mut scanner = ReferenceScanner::new(&[]);
+        scanner.scan_str("hello world");
+        assert!(scanner.finalise().is_empty());
+    }
+
+    #[test]
+    fn test_single_match() {
+        let mut scanner =
+            ReferenceScanner::new(&["/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16"]);
+        scanner.scan_str(HELLO_DRV);
+
+        let result = scanner.finalise();
+
+        assert_eq!(result.len(), 1);
+        assert!(result.contains("/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16"));
+    }
+
+    #[test]
+    fn test_multiple_matches() {
+        let candidates = &[
+            // these exist in the drv:
+            "/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1",
+            "/nix/store/pf80kikyxr63wrw56k00i1kw6ba76qik-hello-2.12.1.tar.gz.drv",
+            "/nix/store/cp65c8nk29qq5cl1wyy5qyw103cwmax7-stdenv-linux",
+            // this doesn't:
+            "/nix/store/fn7zvafq26f0c8b17brs7s95s10ibfzs-emacs-28.2.drv",
+        ];
+
+        let mut scanner = ReferenceScanner::new(candidates);
+        scanner.scan_str(HELLO_DRV);
+
+        let result = scanner.finalise();
+
+        assert_eq!(result.len(), 3);
+
+        for c in candidates[..3].iter() {
+            assert!(result.contains(c));
+        }
+    }
+}