about summary refs log tree commit diff
path: root/tvix/glue
diff options
context:
space:
mode:
authorYureka <tvl@yuka.dev>2024-09-27T14·56+0200
committerclbot <clbot@tvl.fyi>2024-09-27T20·39+0000
commitcaf597db8107d7cb8c4772e83552e9774f85c5c5 (patch)
treeec166388b2d8ddef9b1d42656c4dd14f5f06d501 /tvix/glue
parented2bd8885052494bcf208f97e292d746f223a117 (diff)
refactor(tvix): move refscan module to castore r/8724
This is required to add the optional refscanner parameter to the
ingest functions.

Change-Id: Ib40a7287cf857eb55e31e0df309a79474fefb518
Reviewed-on: https://cl.tvl.fyi/c/depot/+/12527
Autosubmit: yuka <yuka@yuka.dev>
Reviewed-by: flokli <flokli@flokli.de>
Tested-by: BuildkiteCI
Diffstat (limited to 'tvix/glue')
-rw-r--r--tvix/glue/Cargo.toml2
-rw-r--r--tvix/glue/src/lib.rs1
-rw-r--r--tvix/glue/src/refscan.rs339
3 files changed, 0 insertions, 342 deletions
diff --git a/tvix/glue/Cargo.toml b/tvix/glue/Cargo.toml
index bb522dceb902..4836e1e3a5d8 100644
--- a/tvix/glue/Cargo.toml
+++ b/tvix/glue/Cargo.toml
@@ -32,7 +32,6 @@ md-5 = { workspace = true }
 url = { workspace = true }
 walkdir = { workspace = true }
 clap = { workspace = true }
-wu-manber = { workspace = true }
 
 [dev-dependencies]
 criterion = { workspace = true, features = ["html_reports"] }
@@ -43,7 +42,6 @@ nix = { workspace = true, features = ["fs"] }
 pretty_assertions = { workspace = true }
 rstest = { workspace = true }
 tempfile = { workspace = true }
-tokio-test = { workspace = true }
 
 [features]
 default = ["nix_tests"]
diff --git a/tvix/glue/src/lib.rs b/tvix/glue/src/lib.rs
index f6f99f3c32a3..320d1f6fede2 100644
--- a/tvix/glue/src/lib.rs
+++ b/tvix/glue/src/lib.rs
@@ -1,7 +1,6 @@
 pub mod builtins;
 pub mod fetchers;
 pub mod known_paths;
-pub mod refscan;
 pub mod tvix_build;
 pub mod tvix_io;
 pub mod tvix_store_io;
diff --git a/tvix/glue/src/refscan.rs b/tvix/glue/src/refscan.rs
deleted file mode 100644
index 80a126349746..000000000000
--- a/tvix/glue/src/refscan.rs
+++ /dev/null
@@ -1,339 +0,0 @@
-//! Simple scanner for non-overlapping, known references of Nix store paths in a
-//! given string.
-//!
-//! This is used for determining build references (see
-//! //tvix/eval/docs/build-references.md for more details).
-//!
-//! The scanner itself is using the Wu-Manber string-matching algorithm, using
-//! our fork of the `wu-mamber` crate.
-use pin_project::pin_project;
-use std::collections::BTreeSet;
-use std::pin::Pin;
-use std::sync::Arc;
-use std::task::{ready, Poll};
-use tokio::io::{AsyncBufRead, AsyncRead, ReadBuf};
-use wu_manber::TwoByteWM;
-
-/// A searcher that incapsulates the candidates and the Wu-Manber searcher.
-/// This is separate from the scanner because we need to look for the same
-/// pattern in multiple outputs and don't want to pay the price of constructing
-/// the searcher for each build output.
-pub struct ReferencePatternInner<P> {
-    candidates: Vec<P>,
-    longest_candidate: usize,
-    // FUTUREWORK: Support overlapping patterns to be compatible with cpp Nix
-    searcher: Option<TwoByteWM>,
-}
-
-#[derive(Clone)]
-pub struct ReferencePattern<P> {
-    inner: Arc<ReferencePatternInner<P>>,
-}
-
-impl<P> ReferencePattern<P> {
-    pub fn candidates(&self) -> &[P] {
-        &self.inner.candidates
-    }
-
-    pub fn longest_candidate(&self) -> usize {
-        self.inner.longest_candidate
-    }
-}
-
-impl<P: AsRef<[u8]>> ReferencePattern<P> {
-    /// Construct a new `ReferencePattern` that knows how to scan for the given
-    /// candidates.
-    pub fn new(candidates: Vec<P>) -> Self {
-        let searcher = if candidates.is_empty() {
-            None
-        } else {
-            Some(TwoByteWM::new(&candidates))
-        };
-        let longest_candidate = candidates.iter().fold(0, |v, c| v.max(c.as_ref().len()));
-
-        ReferencePattern {
-            inner: Arc::new(ReferencePatternInner {
-                searcher,
-                candidates,
-                longest_candidate,
-            }),
-        }
-    }
-}
-
-impl<P> From<Vec<P>> for ReferencePattern<P>
-where
-    P: AsRef<[u8]>,
-{
-    fn from(candidates: Vec<P>) -> Self {
-        Self::new(candidates)
-    }
-}
-
-/// Represents a "primed" reference scanner with an automaton that knows the set
-/// of bytes patterns to scan for.
-pub struct ReferenceScanner<P> {
-    pattern: ReferencePattern<P>,
-    matches: Vec<bool>,
-}
-
-impl<P: AsRef<[u8]>> ReferenceScanner<P> {
-    /// Construct a new `ReferenceScanner` that knows how to scan for the given
-    /// candidate bytes patterns.
-    pub fn new<IP: Into<ReferencePattern<P>>>(pattern: IP) -> Self {
-        let pattern = pattern.into();
-        let matches = vec![false; pattern.candidates().len()];
-        ReferenceScanner { pattern, matches }
-    }
-
-    /// Scan the given buffer for all non-overlapping matches and collect them
-    /// in the scanner.
-    pub fn scan<S: AsRef<[u8]>>(&mut self, haystack: S) {
-        if haystack.as_ref().len() < self.pattern.longest_candidate() {
-            return;
-        }
-
-        if let Some(searcher) = &self.pattern.inner.searcher {
-            for m in searcher.find(haystack) {
-                self.matches[m.pat_idx] = true;
-            }
-        }
-    }
-
-    pub fn pattern(&self) -> &ReferencePattern<P> {
-        &self.pattern
-    }
-
-    pub fn matches(&self) -> &[bool] {
-        &self.matches
-    }
-
-    pub fn candidate_matches(&self) -> impl Iterator<Item = &P> {
-        let candidates = self.pattern.candidates();
-        self.matches.iter().enumerate().filter_map(|(idx, found)| {
-            if *found {
-                Some(&candidates[idx])
-            } else {
-                None
-            }
-        })
-    }
-}
-
-impl<P: Clone + Ord + AsRef<[u8]>> ReferenceScanner<P> {
-    /// Finalise the reference scanner and return the resulting matches.
-    pub fn finalise(self) -> BTreeSet<P> {
-        self.candidate_matches().cloned().collect()
-    }
-}
-
-const DEFAULT_BUF_SIZE: usize = 8 * 1024;
-
-#[pin_project]
-pub struct ReferenceReader<P, R> {
-    scanner: ReferenceScanner<P>,
-    buffer: Vec<u8>,
-    consumed: usize,
-    #[pin]
-    reader: R,
-}
-
-impl<P, R> ReferenceReader<P, R>
-where
-    P: AsRef<[u8]>,
-{
-    pub fn new(pattern: ReferencePattern<P>, reader: R) -> ReferenceReader<P, R> {
-        Self::with_capacity(DEFAULT_BUF_SIZE, pattern, reader)
-    }
-
-    pub fn with_capacity(
-        capacity: usize,
-        pattern: ReferencePattern<P>,
-        reader: R,
-    ) -> ReferenceReader<P, R> {
-        // If capacity is not at least as long as longest_candidate we can't do a scan
-        let capacity = capacity.max(pattern.longest_candidate());
-        ReferenceReader {
-            scanner: ReferenceScanner::new(pattern),
-            buffer: Vec::with_capacity(capacity),
-            consumed: 0,
-            reader,
-        }
-    }
-
-    pub fn scanner(&self) -> &ReferenceScanner<P> {
-        &self.scanner
-    }
-}
-
-impl<P, R> ReferenceReader<P, R>
-where
-    P: Clone + Ord + AsRef<[u8]>,
-{
-    pub fn finalise(self) -> BTreeSet<P> {
-        self.scanner.finalise()
-    }
-}
-
-impl<P, R> AsyncRead for ReferenceReader<P, R>
-where
-    R: AsyncRead,
-    P: AsRef<[u8]>,
-{
-    fn poll_read(
-        mut self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut tokio::io::ReadBuf<'_>,
-    ) -> Poll<std::io::Result<()>> {
-        let internal_buf = ready!(self.as_mut().poll_fill_buf(cx))?;
-        let amt = buf.remaining().min(internal_buf.len());
-        buf.put_slice(&internal_buf[..amt]);
-        self.consume(amt);
-        Poll::Ready(Ok(()))
-    }
-}
-
-impl<P, R> AsyncBufRead for ReferenceReader<P, R>
-where
-    R: AsyncRead,
-    P: AsRef<[u8]>,
-{
-    fn poll_fill_buf(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<std::io::Result<&[u8]>> {
-        let overlap = self.scanner.pattern.longest_candidate() - 1;
-        let mut this = self.project();
-        // Still data in buffer
-        if *this.consumed < this.buffer.len() {
-            return Poll::Ready(Ok(&this.buffer[*this.consumed..]));
-        }
-        // We need to copy last `overlap` bytes to front to deal with references that overlap reads
-        if *this.consumed > overlap {
-            let start = this.buffer.len() - overlap;
-            this.buffer.copy_within(start.., 0);
-            this.buffer.truncate(overlap);
-            *this.consumed = overlap;
-        }
-        // Read at least until self.buffer.len() > overlap so we can do one scan
-        loop {
-            let filled = {
-                let mut buf = ReadBuf::uninit(this.buffer.spare_capacity_mut());
-                ready!(this.reader.as_mut().poll_read(cx, &mut buf))?;
-                buf.filled().len()
-            };
-            // SAFETY: We just read `filled` amount of data above
-            unsafe {
-                this.buffer.set_len(filled + this.buffer.len());
-            }
-            if filled == 0 || this.buffer.len() > overlap {
-                break;
-            }
-        }
-
-        #[allow(clippy::needless_borrows_for_generic_args)] // misfiring lint (breaks code below)
-        this.scanner.scan(&this.buffer);
-
-        Poll::Ready(Ok(&this.buffer[*this.consumed..]))
-    }
-
-    fn consume(self: Pin<&mut Self>, amt: usize) {
-        debug_assert!(self.consumed + amt <= self.buffer.len());
-        let this = self.project();
-        *this.consumed += amt;
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use rstest::rstest;
-    use tokio::io::AsyncReadExt as _;
-    use tokio_test::io::Builder;
-
-    use super::*;
-
-    // The actual derivation of `nixpkgs.hello`.
-    const HELLO_DRV: &str = r#"Derive([("out","/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1","","")],[("/nix/store/6z1jfnqqgyqr221zgbpm30v91yfj3r45-bash-5.1-p16.drv",["out"]),("/nix/store/ap9g09fxbicj836zm88d56dn3ff4clxl-stdenv-linux.drv",["out"]),("/nix/store/pf80kikyxr63wrw56k00i1kw6ba76qik-hello-2.12.1.tar.gz.drv",["out"])],["/nix/store/9krlzvny65gdc8s7kpb6lkx8cd02c25b-default-builder.sh"],"x86_64-linux","/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16/bin/bash",["-e","/nix/store/9krlzvny65gdc8s7kpb6lkx8cd02c25b-default-builder.sh"],[("buildInputs",""),("builder","/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16/bin/bash"),("cmakeFlags",""),("configureFlags",""),("depsBuildBuild",""),("depsBuildBuildPropagated",""),("depsBuildTarget",""),("depsBuildTargetPropagated",""),("depsHostHost",""),("depsHostHostPropagated",""),("depsTargetTarget",""),("depsTargetTargetPropagated",""),("doCheck","1"),("doInstallCheck",""),("mesonFlags",""),("name","hello-2.12.1"),("nativeBuildInputs",""),("out","/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1"),("outputs","out"),("patches",""),("pname","hello"),("propagatedBuildInputs",""),("propagatedNativeBuildInputs",""),("src","/nix/store/pa10z4ngm0g83kx9mssrqzz30s84vq7k-hello-2.12.1.tar.gz"),("stdenv","/nix/store/cp65c8nk29qq5cl1wyy5qyw103cwmax7-stdenv-linux"),("strictDeps",""),("system","x86_64-linux"),("version","2.12.1")])"#;
-
-    #[test]
-    fn test_no_patterns() {
-        let mut scanner: ReferenceScanner<String> = ReferenceScanner::new(vec![]);
-
-        scanner.scan(HELLO_DRV);
-
-        let result = scanner.finalise();
-
-        assert_eq!(result.len(), 0);
-    }
-
-    #[test]
-    fn test_single_match() {
-        let mut scanner = ReferenceScanner::new(vec![
-            "/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16".to_string(),
-        ]);
-        scanner.scan(HELLO_DRV);
-
-        let result = scanner.finalise();
-
-        assert_eq!(result.len(), 1);
-        assert!(result.contains("/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16"));
-    }
-
-    #[test]
-    fn test_multiple_matches() {
-        let candidates = vec![
-            // these exist in the drv:
-            "/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1".to_string(),
-            "/nix/store/pf80kikyxr63wrw56k00i1kw6ba76qik-hello-2.12.1.tar.gz.drv".to_string(),
-            "/nix/store/cp65c8nk29qq5cl1wyy5qyw103cwmax7-stdenv-linux".to_string(),
-            // this doesn't:
-            "/nix/store/fn7zvafq26f0c8b17brs7s95s10ibfzs-emacs-28.2.drv".to_string(),
-        ];
-
-        let mut scanner = ReferenceScanner::new(candidates.clone());
-        scanner.scan(HELLO_DRV);
-
-        let result = scanner.finalise();
-        assert_eq!(result.len(), 3);
-
-        for c in candidates[..3].iter() {
-            assert!(result.contains(c));
-        }
-    }
-
-    #[rstest]
-    #[case::normal(8096, 8096)]
-    #[case::small_capacity(8096, 1)]
-    #[case::small_read(1, 8096)]
-    #[case::all_small(1, 1)]
-    #[tokio::test]
-    async fn test_reference_reader(#[case] chunk_size: usize, #[case] capacity: usize) {
-        let candidates = vec![
-            // these exist in the drv:
-            "33l4p0pn0mybmqzaxfkpppyh7vx1c74p",
-            "pf80kikyxr63wrw56k00i1kw6ba76qik",
-            "cp65c8nk29qq5cl1wyy5qyw103cwmax7",
-            // this doesn't:
-            "fn7zvafq26f0c8b17brs7s95s10ibfzs",
-        ];
-        let pattern = ReferencePattern::new(candidates.clone());
-        let mut mock = Builder::new();
-        for c in HELLO_DRV.as_bytes().chunks(chunk_size) {
-            mock.read(c);
-        }
-        let mock = mock.build();
-        let mut reader = ReferenceReader::with_capacity(capacity, pattern, mock);
-        let mut s = String::new();
-        reader.read_to_string(&mut s).await.unwrap();
-        assert_eq!(s, HELLO_DRV);
-
-        let result = reader.finalise();
-        assert_eq!(result.len(), 3);
-
-        for c in candidates[..3].iter() {
-            assert!(result.contains(c));
-        }
-    }
-
-    // FUTUREWORK: Test with large file
-}