diff options
author | Vincent Ambo <mail@tazj.in> | 2023-01-11T15·14+0300 |
---|---|---|
committer | clbot <clbot@tvl.fyi> | 2023-01-11T16·03+0000 |
commit | 3045645df07ffdb54f9d2a11ee2e41e31999986f (patch) | |
tree | ee768e7a42e4addd6c5f9591163b8b8a683d68ca | |
parent | 9382afdb0d7258cee8e7f20d646a85076f38011a (diff) |
feat(tvix/cli): implement initial refscan module r/5643
This module implements a ReferenceScanner struct which uses the aho_corasick crate to scan string inputs for known, non-overlapping candidates (store paths, in our case). I experimented with several different APIs, and landed on this version with an initial accumulator in the scanner. The scanner is instantiated from the candidates and "fed" all the strings, then consumed by the caller to retrieve the result. Right now only things that look vaguely like bytestrings can be fed to the scanner, there is no streaming support in the API yet. Change-Id: I7782f0f0df5fc64bccd813aa14712f5525b0168c Reviewed-on: https://cl.tvl.fyi/c/depot/+/7808 Autosubmit: tazjin <tazjin@tvl.su> Reviewed-by: flokli <flokli@flokli.de> Tested-by: BuildkiteCI
-rw-r--r-- | tvix/Cargo.lock | 1 | ||||
-rw-r--r-- | tvix/Cargo.nix | 4 | ||||
-rw-r--r-- | tvix/cli/Cargo.toml | 1 | ||||
-rw-r--r-- | tvix/cli/src/main.rs | 1 | ||||
-rw-r--r-- | tvix/cli/src/refscan.rs | 97 |
5 files changed, 104 insertions, 0 deletions
diff --git a/tvix/Cargo.lock b/tvix/Cargo.lock index 197bee75e6c8..a5ea4b82e85c 100644 --- a/tvix/Cargo.lock +++ b/tvix/Cargo.lock @@ -2177,6 +2177,7 @@ checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" name = "tvix-cli" version = "0.1.0" dependencies = [ + "aho-corasick", "clap 4.0.32", "dirs", "rustyline", diff --git a/tvix/Cargo.nix b/tvix/Cargo.nix index 9f13a1607336..0f2b6bdd98b4 100644 --- a/tvix/Cargo.nix +++ b/tvix/Cargo.nix @@ -6478,6 +6478,10 @@ rec { else ./cli; dependencies = [ { + name = "aho-corasick"; + packageId = "aho-corasick"; + } + { name = "clap"; packageId = "clap 4.0.32"; features = [ "derive" "env" ]; diff --git a/tvix/cli/Cargo.toml b/tvix/cli/Cargo.toml index 099002353dc2..f3324f2611ea 100644 --- a/tvix/cli/Cargo.toml +++ b/tvix/cli/Cargo.toml @@ -13,3 +13,4 @@ rustyline = "10.0.0" clap = { version = "4.0", features = ["derive", "env"] } dirs = "4.0.0" smol_str = "0.1" +aho-corasick = "0.7" diff --git a/tvix/cli/src/main.rs b/tvix/cli/src/main.rs index eec4d8bbb2ac..0f837b346c2d 100644 --- a/tvix/cli/src/main.rs +++ b/tvix/cli/src/main.rs @@ -1,4 +1,5 @@ mod nix_compat; +mod refscan; use std::{fs, path::PathBuf}; diff --git a/tvix/cli/src/refscan.rs b/tvix/cli/src/refscan.rs new file mode 100644 index 000000000000..76857142e87e --- /dev/null +++ b/tvix/cli/src/refscan.rs @@ -0,0 +1,97 @@ +//! Simple scanner for non-overlapping, known references of Nix store paths in a +//! given string. +//! +//! This is used for determining build references (see +//! //tvix/eval/docs/build-references.md for more details). +//! +//! The scanner itself is an Aho-Corasick automaton, using the `aho-corasick` +//! crate. + +use aho_corasick::AhoCorasick; +use std::collections::BTreeSet; + +/// Represents a "primed" reference scanner with an automaton that knows the set +/// of store paths to scan for. +pub struct ReferenceScanner<'c, 's> { + candidates: &'c [&'s str], + searcher: AhoCorasick, + matches: BTreeSet<&'s str>, +} + +impl<'c, 's> ReferenceScanner<'c, 's> { + /// Construct a new `ReferenceScanner` that knows how to scan for the given + /// candidate store paths. + pub fn new(candidates: &'c [&'s str]) -> Self { + let searcher = AhoCorasick::new_auto_configured(candidates); + + ReferenceScanner { + searcher, + candidates, + matches: Default::default(), + } + } + + /// Scan the given string for all non-overlapping matches and collect them + /// in the scanner. + pub fn scan_str<H: AsRef<[u8]>>(&mut self, haystack: H) { + for m in self.searcher.find_iter(&haystack) { + let needle = self.candidates[m.pattern()]; + self.matches.insert(needle); + } + } + + /// Finalise the reference scanner and return the resulting matches. + pub fn finalise(self) -> BTreeSet<&'s str> { + self.matches + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // The actual derivation of `nixpkgs.hello`. + const HELLO_DRV: &'static str = r#"Derive([("out","/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1","","")],[("/nix/store/6z1jfnqqgyqr221zgbpm30v91yfj3r45-bash-5.1-p16.drv",["out"]),("/nix/store/ap9g09fxbicj836zm88d56dn3ff4clxl-stdenv-linux.drv",["out"]),("/nix/store/pf80kikyxr63wrw56k00i1kw6ba76qik-hello-2.12.1.tar.gz.drv",["out"])],["/nix/store/9krlzvny65gdc8s7kpb6lkx8cd02c25b-default-builder.sh"],"x86_64-linux","/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16/bin/bash",["-e","/nix/store/9krlzvny65gdc8s7kpb6lkx8cd02c25b-default-builder.sh"],[("buildInputs",""),("builder","/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16/bin/bash"),("cmakeFlags",""),("configureFlags",""),("depsBuildBuild",""),("depsBuildBuildPropagated",""),("depsBuildTarget",""),("depsBuildTargetPropagated",""),("depsHostHost",""),("depsHostHostPropagated",""),("depsTargetTarget",""),("depsTargetTargetPropagated",""),("doCheck","1"),("doInstallCheck",""),("mesonFlags",""),("name","hello-2.12.1"),("nativeBuildInputs",""),("out","/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1"),("outputs","out"),("patches",""),("pname","hello"),("propagatedBuildInputs",""),("propagatedNativeBuildInputs",""),("src","/nix/store/pa10z4ngm0g83kx9mssrqzz30s84vq7k-hello-2.12.1.tar.gz"),("stdenv","/nix/store/cp65c8nk29qq5cl1wyy5qyw103cwmax7-stdenv-linux"),("strictDeps",""),("system","x86_64-linux"),("version","2.12.1")])"#; + + #[test] + fn test_empty() { + let mut scanner = ReferenceScanner::new(&[]); + scanner.scan_str("hello world"); + assert!(scanner.finalise().is_empty()); + } + + #[test] + fn test_single_match() { + let mut scanner = + ReferenceScanner::new(&["/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16"]); + scanner.scan_str(HELLO_DRV); + + let result = scanner.finalise(); + + assert_eq!(result.len(), 1); + assert!(result.contains("/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16")); + } + + #[test] + fn test_multiple_matches() { + let candidates = &[ + // these exist in the drv: + "/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1", + "/nix/store/pf80kikyxr63wrw56k00i1kw6ba76qik-hello-2.12.1.tar.gz.drv", + "/nix/store/cp65c8nk29qq5cl1wyy5qyw103cwmax7-stdenv-linux", + // this doesn't: + "/nix/store/fn7zvafq26f0c8b17brs7s95s10ibfzs-emacs-28.2.drv", + ]; + + let mut scanner = ReferenceScanner::new(candidates); + scanner.scan_str(HELLO_DRV); + + let result = scanner.finalise(); + + assert_eq!(result.len(), 3); + + for c in candidates[..3].iter() { + assert!(result.contains(c)); + } + } +} |