From 9d6f29a72b3b466dd697c2eaa97f9a41b767fdff Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Thu, 2 Feb 2023 15:51:59 +0300 Subject: refactor(tvix/cli): use Wu-Manber string scanning for drv references Switch out the string-scanning algorithm used in the reference scanner. The construction of aho-corasick automata made up the vast majority of runtime when evaluating nixpkgs previously. While the actual scanning with a constructed automaton is relatively fast, we almost never scan for the same set of strings twice and the cost is not worth it. An algorithm that better matches our needs is the Wu-Manber multiple string match algorithm, which works efficiently on *long* and *random* strings of the *same length*, which describes store paths (up to their hash component). This switches the refscanner crate to a Rust implementation[0][1] of this algorithm. This has several implications: 1. This crate does not provide a way to scan streams. I'm not sure if this is an inherent problem with the algorithm (probably not, but it would need buffering). Either way, related functions and tests (which were actually unused) have been removed. 2. All strings need to be of the same length. For this reason, we truncate the known paths after their hash part (they are still unique, of course). 3. Passing an empty set of matches, or a match that is shorter than the length of a store path, causes the crate to panic. We safeguard against this by completely skipping the refscanning if there are no known paths (i.e. when evaluating the first derivation of an eval), and by bailing out of scanning a string that is shorter than a store path. On the upside, this reduces overall runtime to less 1/5 of what it was before when evaluating `pkgs.stdenv.drvPath`. [0]: Frankly, it's a random, research-grade MIT-licensed crate that I found on Github: https://github.com/jneem/wu-manber [1]: We probably want to rewrite or at least fork the above crate, and add things like a three-byte wide scanner. Evaluating large portions of nixpkgs can easily lead to more than 65k derivations being scanned for. Change-Id: I08926778e1e5d5a87fc9ac26e0437aed8bbd9eb0 Reviewed-on: https://cl.tvl.fyi/c/depot/+/8017 Tested-by: BuildkiteCI Reviewed-by: flokli --- tvix/Cargo.lock | 7 +++++- tvix/Cargo.nix | 19 ++++++++++++++ tvix/cli/Cargo.toml | 4 ++- tvix/cli/src/derivation.rs | 19 ++++++++++---- tvix/cli/src/known_paths.rs | 12 +++++++-- tvix/cli/src/refscan.rs | 61 ++++++++------------------------------------- tvix/crate-hashes.json | 3 ++- 7 files changed, 64 insertions(+), 61 deletions(-) diff --git a/tvix/Cargo.lock b/tvix/Cargo.lock index dd7d71906a..131b868608 100644 --- a/tvix/Cargo.lock +++ b/tvix/Cargo.lock @@ -2622,7 +2622,6 @@ checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" name = "tvix-cli" version = "0.1.0" dependencies = [ - "aho-corasick", "clap 4.1.4", "data-encoding", "dirs", @@ -2632,6 +2631,7 @@ dependencies = [ "ssri", "thiserror", "tvix-eval", + "wu-manber", ] [[package]] @@ -2957,6 +2957,11 @@ version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" +[[package]] +name = "wu-manber" +version = "0.1.0" +source = "git+https://github.com/jneem/wu-manber.git#acff1ffc303935ce55a6c36bd89b0819688b27b7" + [[package]] name = "xml-rs" version = "0.8.4" diff --git a/tvix/Cargo.nix b/tvix/Cargo.nix index 287e209557..1b566b5417 100644 --- a/tvix/Cargo.nix +++ b/tvix/Cargo.nix @@ -7809,6 +7809,10 @@ rec { name = "tvix-eval"; packageId = "tvix-eval"; } + { + name = "wu-manber"; + packageId = "wu-manber"; + } ]; }; @@ -9463,6 +9467,21 @@ rec { "Microsoft" ]; + }; + "wu-manber" = rec { + crateName = "wu-manber"; + version = "0.1.0"; + edition = "2015"; + workspace_member = null; + src = pkgs.fetchgit { + url = "https://github.com/jneem/wu-manber.git"; + rev = "acff1ffc303935ce55a6c36bd89b0819688b27b7"; + sha256 = "1r6r87746qry48rmsdk7x3l4w5wlhkny8al9mz2ic83j7vm1wivz"; + }; + authors = [ + "Joe Neeman " + ]; + }; "xml-rs" = rec { crateName = "xml-rs"; diff --git a/tvix/cli/Cargo.toml b/tvix/cli/Cargo.toml index 69b54bd299..2cda1c6c15 100644 --- a/tvix/cli/Cargo.toml +++ b/tvix/cli/Cargo.toml @@ -14,7 +14,9 @@ rustyline = "10.0.0" clap = { version = "4.0", features = ["derive", "env"] } dirs = "4.0.0" smol_str = "0.1" -aho-corasick = "0.7" ssri = "7.0.0" data-encoding = "2.3.3" thiserror = "1.0.38" + +[dependencies.wu-manber] +git = "https://github.com/jneem/wu-manber.git" diff --git a/tvix/cli/src/derivation.rs b/tvix/cli/src/derivation.rs index fce9a62859..6af3d24a24 100644 --- a/tvix/cli/src/derivation.rs +++ b/tvix/cli/src/derivation.rs @@ -298,10 +298,19 @@ mod derivation_builtins { } // Scan references in relevant attributes to detect any build-references. - let mut refscan = state.borrow().reference_scanner(); - drv.arguments.iter().for_each(|s| refscan.scan_str(s)); - drv.environment.values().for_each(|s| refscan.scan_str(s)); - refscan.scan_str(&drv.builder); + let references = { + let state = state.borrow(); + if state.is_empty() { + // skip reference scanning, create an empty result + Default::default() + } else { + let mut refscan = state.reference_scanner(); + drv.arguments.iter().for_each(|s| refscan.scan_str(s)); + drv.environment.values().for_each(|s| refscan.scan_str(s)); + refscan.scan_str(&drv.builder); + refscan.finalise() + } + }; // Each output name needs to exist in the environment, at this // point initialised as an empty string because that is the @@ -317,7 +326,7 @@ mod derivation_builtins { } let mut known_paths = state.borrow_mut(); - populate_inputs(&mut drv, &known_paths, refscan.finalise()); + populate_inputs(&mut drv, &known_paths, references); // At this point, derivation fields are fully populated from // eval data structures. diff --git a/tvix/cli/src/known_paths.rs b/tvix/cli/src/known_paths.rs index 165bc3ea41..69651d4180 100644 --- a/tvix/cli/src/known_paths.rs +++ b/tvix/cli/src/known_paths.rs @@ -11,7 +11,7 @@ //! Please see //tvix/eval/docs/build-references.md for more //! information. -use crate::refscan::ReferenceScanner; +use crate::refscan::{ReferenceScanner, STORE_PATH_LEN}; use std::{ collections::{hash_map, BTreeSet, HashMap}, ops::Index, @@ -45,12 +45,14 @@ impl Index<&str> for KnownPaths { type Output = PathType; fn index(&self, index: &str) -> &Self::Output { - &self.paths[index] + &self.paths[&index[..STORE_PATH_LEN]] } } impl KnownPaths { fn insert_path(&mut self, path: String, path_type: PathType) { + let path = path[..STORE_PATH_LEN].to_owned(); + assert_eq!(path.len(), STORE_PATH_LEN, "should match"); match self.paths.entry(path) { hash_map::Entry::Vacant(entry) => { entry.insert(path_type); @@ -108,6 +110,12 @@ impl KnownPaths { ); } + /// Checks whether there are any known paths. If not, a reference + /// scanner can not be created. + pub fn is_empty(&self) -> bool { + self.paths.is_empty() + } + /// Create a reference scanner from the current set of known paths. pub fn reference_scanner(&self) -> ReferenceScanner { let candidates = self.paths.keys().map(Clone::clone).collect(); diff --git a/tvix/cli/src/refscan.rs b/tvix/cli/src/refscan.rs index 74110e1088..4314e01644 100644 --- a/tvix/cli/src/refscan.rs +++ b/tvix/cli/src/refscan.rs @@ -7,15 +7,16 @@ //! The scanner itself is an Aho-Corasick automaton, using the `aho-corasick` //! crate. -use aho_corasick::AhoCorasick; use std::collections::BTreeSet; -use std::io; +use wu_manber::TwoByteWM; + +pub const STORE_PATH_LEN: usize = "/nix/store/00000000000000000000000000000000".len(); /// Represents a "primed" reference scanner with an automaton that knows the set /// of store paths to scan for. pub struct ReferenceScanner { candidates: Vec, - searcher: AhoCorasick, + searcher: TwoByteWM, matches: Vec, } @@ -23,7 +24,7 @@ impl ReferenceScanner { /// Construct a new `ReferenceScanner` that knows how to scan for the given /// candidate store paths. pub fn new(candidates: Vec) -> Self { - let searcher = AhoCorasick::new_auto_configured(&candidates); + let searcher = TwoByteWM::new(&candidates); ReferenceScanner { searcher, @@ -34,25 +35,14 @@ impl ReferenceScanner { /// Scan the given string for all non-overlapping matches and collect them /// in the scanner. - pub fn scan_str>(&mut self, haystack: H) { - for m in self.searcher.find_iter(&haystack) { - self.matches.push(m.pattern()); + pub fn scan_str(&mut self, haystack: &str) { + if haystack.len() < STORE_PATH_LEN { + return; } - } - /// Scan the given reader for all non-overlapping matches, and collect them - /// in the scanner. On read failures, this method aborts and returns an - /// error to the caller. - /// - /// Please note that the internal machinery has its own buffering mechanism, - /// and where possible the given reader should be unbuffered. See - /// [`AhoCorasick::stream_find_iter`] for details on this. - pub fn scan_stream(&mut self, stream: R) -> io::Result<()> { - for m in self.searcher.stream_find_iter(stream) { - self.matches.push(m?.pattern()); + for m in self.searcher.find(&haystack) { + self.matches.push(m.pat_idx); } - - Ok(()) } /// Finalise the reference scanner and return the resulting matches. @@ -71,13 +61,6 @@ mod tests { // The actual derivation of `nixpkgs.hello`. const HELLO_DRV: &'static str = r#"Derive([("out","/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1","","")],[("/nix/store/6z1jfnqqgyqr221zgbpm30v91yfj3r45-bash-5.1-p16.drv",["out"]),("/nix/store/ap9g09fxbicj836zm88d56dn3ff4clxl-stdenv-linux.drv",["out"]),("/nix/store/pf80kikyxr63wrw56k00i1kw6ba76qik-hello-2.12.1.tar.gz.drv",["out"])],["/nix/store/9krlzvny65gdc8s7kpb6lkx8cd02c25b-default-builder.sh"],"x86_64-linux","/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16/bin/bash",["-e","/nix/store/9krlzvny65gdc8s7kpb6lkx8cd02c25b-default-builder.sh"],[("buildInputs",""),("builder","/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16/bin/bash"),("cmakeFlags",""),("configureFlags",""),("depsBuildBuild",""),("depsBuildBuildPropagated",""),("depsBuildTarget",""),("depsBuildTargetPropagated",""),("depsHostHost",""),("depsHostHostPropagated",""),("depsTargetTarget",""),("depsTargetTargetPropagated",""),("doCheck","1"),("doInstallCheck",""),("mesonFlags",""),("name","hello-2.12.1"),("nativeBuildInputs",""),("out","/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1"),("outputs","out"),("patches",""),("pname","hello"),("propagatedBuildInputs",""),("propagatedNativeBuildInputs",""),("src","/nix/store/pa10z4ngm0g83kx9mssrqzz30s84vq7k-hello-2.12.1.tar.gz"),("stdenv","/nix/store/cp65c8nk29qq5cl1wyy5qyw103cwmax7-stdenv-linux"),("strictDeps",""),("system","x86_64-linux"),("version","2.12.1")])"#; - #[test] - fn test_empty() { - let mut scanner = ReferenceScanner::new(vec![]); - scanner.scan_str("hello world"); - assert!(scanner.finalise().is_empty()); - } - #[test] fn test_single_match() { let mut scanner = ReferenceScanner::new(vec![ @@ -112,28 +95,4 @@ mod tests { assert!(result.contains(c)); } } - - #[test] - fn test_multiple_stream() { - let candidates = vec![ - // these exist in the drv: - "/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1".into(), - "/nix/store/pf80kikyxr63wrw56k00i1kw6ba76qik-hello-2.12.1.tar.gz.drv".into(), - "/nix/store/cp65c8nk29qq5cl1wyy5qyw103cwmax7-stdenv-linux".into(), - // this doesn't: - "/nix/store/fn7zvafq26f0c8b17brs7s95s10ibfzs-emacs-28.2.drv".into(), - ]; - - let mut scanner = ReferenceScanner::new(candidates.clone()); - scanner - .scan_stream(HELLO_DRV.as_bytes()) - .expect("scanning should succeed"); - - let result = scanner.finalise(); - assert_eq!(result.len(), 3); - - for c in candidates[..3].iter() { - assert!(result.contains(c)); - } - } } diff --git a/tvix/crate-hashes.json b/tvix/crate-hashes.json index 5b07633342..2528588b18 100644 --- a/tvix/crate-hashes.json +++ b/tvix/crate-hashes.json @@ -1,4 +1,5 @@ { "test-generator 0.3.0 (git+https://github.com/JamesGuthrie/test-generator.git?rev=82e799979980962aec1aa324ec6e0e4cad781f41#82e799979980962aec1aa324ec6e0e4cad781f41)": "08brp3qqa55hijc7xby3lam2cc84hvx1zzfqv6lj7smlczh8k32y", - "tonic-mock 0.1.0 (git+https://github.com/brainrake/tonic-mock?branch=bump-dependencies#ec1a15510875de99d709d684190db5d9beab175e)": "0lwa03hpp0mxa6aa1zv5w68k61y4hccfm0q2ykyq392fwal8vb50" + "tonic-mock 0.1.0 (git+https://github.com/brainrake/tonic-mock?branch=bump-dependencies#ec1a15510875de99d709d684190db5d9beab175e)": "0lwa03hpp0mxa6aa1zv5w68k61y4hccfm0q2ykyq392fwal8vb50", + "wu-manber 0.1.0 (git+https://github.com/jneem/wu-manber.git#acff1ffc303935ce55a6c36bd89b0819688b27b7)": "1r6r87746qry48rmsdk7x3l4w5wlhkny8al9mz2ic83j7vm1wivz" } \ No newline at end of file -- cgit 1.4.1