diff options
author | Vincent Ambo <mail@tazj.in> | 2023-01-11T15·23+0300 |
---|---|---|
committer | clbot <clbot@tvl.fyi> | 2023-01-11T16·03+0000 |
commit | e63bff5545385089508ee2588b17c17d2e4d7352 (patch) | |
tree | 03b2c8598527d27d727e65166ebabd9ffbc29fd9 /tvix/cli/src | |
parent | 3045645df07ffdb54f9d2a11ee2e41e31999986f (diff) |
feat(tvix/refscan): implement reference scanning over data streams r/5644
Using yet more machinery from the pretty comprehensive aho_corasick crate, this makes it possible to pass anything implementing `io::Read` to the `ReferenceScanner` to accumulate matches.

Change-Id: I5b0e28eb44ea4df24010f40831e29f2cbb8c1f80
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7810
Autosubmit: tazjin <tazjin@tvl.su>
Reviewed-by: flokli <flokli@flokli.de>
Tested-by: BuildkiteCI
Diffstat (limited to 'tvix/cli/src')
-rw-r--r-- | tvix/cli/src/refscan.rs | 40 |
1 files changed, 40 insertions, 0 deletions
```diff
diff --git a/tvix/cli/src/refscan.rs b/tvix/cli/src/refscan.rs
index 76857142e87e..31fccb797a5e 100644
--- a/tvix/cli/src/refscan.rs
+++ b/tvix/cli/src/refscan.rs
@@ -9,6 +9,7 @@
 
 use aho_corasick::AhoCorasick;
 use std::collections::BTreeSet;
+use std::io;
 
 /// Represents a "primed" reference scanner with an automaton that knows the set
 /// of store paths to scan for.
@@ -40,6 +41,22 @@ impl<'c, 's> ReferenceScanner<'c, 's> {
         }
     }
 
+    /// Scan the given reader for all non-overlapping matches, and collect them
+    /// in the scanner. On read failures, this method aborts and returns an
+    /// error to the caller.
+    ///
+    /// Please note that the internal machinery has its own buffering mechanism,
+    /// and where possible the given reader should be unbuffered. See
+    /// [`AhoCorasick::stream_find_iter`] for details on this.
+    pub fn scan_stream<R: io::Read>(&mut self, stream: R) -> io::Result<()> {
+        for m in self.searcher.stream_find_iter(stream) {
+            let needle = self.candidates[m?.pattern()];
+            self.matches.insert(needle);
+        }
+
+        Ok(())
+    }
+
     /// Finalise the reference scanner and return the resulting matches.
     pub fn finalise(self) -> BTreeSet<&'s str> {
         self.matches
@@ -87,7 +104,30 @@ mod tests {
         scanner.scan_str(HELLO_DRV);
 
         let result = scanner.finalise();
+        assert_eq!(result.len(), 3);
+
+        for c in candidates[..3].iter() {
+            assert!(result.contains(c));
+        }
+    }
+
+    #[test]
+    fn test_multiple_stream() {
+        let candidates = &[
+            // these exist in the drv:
+            "/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1",
+            "/nix/store/pf80kikyxr63wrw56k00i1kw6ba76qik-hello-2.12.1.tar.gz.drv",
+            "/nix/store/cp65c8nk29qq5cl1wyy5qyw103cwmax7-stdenv-linux",
+            // this doesn't:
+            "/nix/store/fn7zvafq26f0c8b17brs7s95s10ibfzs-emacs-28.2.drv",
+        ];
+        let mut scanner = ReferenceScanner::new(candidates);
+        scanner
+            .scan_stream(HELLO_DRV.as_bytes())
+            .expect("scanning should succeed");
+
+        let result = scanner.finalise();
         assert_eq!(result.len(), 3);
 
         for c in candidates[..3].iter() {
```