about summary refs log tree commit diff
diff options
context:
space:
mode:
author Vincent Ambo <mail@tazj.in> 2023-01-11T15·23+0300
committer clbot <clbot@tvl.fyi> 2023-01-11T16·03+0000
commit e63bff5545385089508ee2588b17c17d2e4d7352 (patch)
tree 03b2c8598527d27d727e65166ebabd9ffbc29fd9
parent 3045645df07ffdb54f9d2a11ee2e41e31999986f (diff)
feat(tvix/refscan): implement reference scanning over data streams r/5644
Using yet more machinery from the pretty comprehensive aho_corasick
crate, this makes it possible to pass anything implementing `io::Read`
to the `ReferenceScanner` to accumulate matches.

Change-Id: I5b0e28eb44ea4df24010f40831e29f2cbb8c1f80
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7810
Autosubmit: tazjin <tazjin@tvl.su>
Reviewed-by: flokli <flokli@flokli.de>
Tested-by: BuildkiteCI
-rw-r--r-- tvix/cli/src/refscan.rs 40
1 files changed, 40 insertions, 0 deletions
diff --git a/tvix/cli/src/refscan.rs b/tvix/cli/src/refscan.rs
index 76857142e8..31fccb797a 100644
--- a/tvix/cli/src/refscan.rs
+++ b/tvix/cli/src/refscan.rs
@@ -9,6 +9,7 @@
 
 use aho_corasick::AhoCorasick;
 use std::collections::BTreeSet;
+use std::io;
 
 /// Represents a "primed" reference scanner with an automaton that knows the set
 /// of store paths to scan for.
@@ -40,6 +41,22 @@ impl<'c, 's> ReferenceScanner<'c, 's> {
         }
     }
 
+    /// Scan the given reader for all non-overlapping matches, and collect them
+    /// in the scanner. On read failures, this method aborts and returns an
+    /// error to the caller.
+    ///
+    /// Please note that the internal machinery has its own buffering mechanism,
+    /// and where possible the given reader should be unbuffered. See
+    /// [`AhoCorasick::stream_find_iter`] for details on this.
+    pub fn scan_stream<R: io::Read>(&mut self, stream: R) -> io::Result<()> {
+        for m in self.searcher.stream_find_iter(stream) {
+            let needle = self.candidates[m?.pattern()];
+            self.matches.insert(needle);
+        }
+
+        Ok(())
+    }
+
     /// Finalise the reference scanner and return the resulting matches.
     pub fn finalise(self) -> BTreeSet<&'s str> {
         self.matches
@@ -87,7 +104,30 @@ mod tests {
         scanner.scan_str(HELLO_DRV);
 
         let result = scanner.finalise();
+        assert_eq!(result.len(), 3);
+
+        for c in candidates[..3].iter() {
+            assert!(result.contains(c));
+        }
+    }
+
+    #[test]
+    fn test_multiple_stream() {
+        let candidates = &[
+            // these exist in the drv:
+            "/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1",
+            "/nix/store/pf80kikyxr63wrw56k00i1kw6ba76qik-hello-2.12.1.tar.gz.drv",
+            "/nix/store/cp65c8nk29qq5cl1wyy5qyw103cwmax7-stdenv-linux",
+            // this doesn't:
+            "/nix/store/fn7zvafq26f0c8b17brs7s95s10ibfzs-emacs-28.2.drv",
+        ];
 
+        let mut scanner = ReferenceScanner::new(candidates);
+        scanner
+            .scan_stream(HELLO_DRV.as_bytes())
+            .expect("scanning should succeed");
+
+        let result = scanner.finalise();
         assert_eq!(result.len(), 3);
 
         for c in candidates[..3].iter() {