1 files changed, 479 insertions, 0 deletions
diff --git a/tvix/nix-compat/src/nar/reader/mod.rs b/tvix/nix-compat/src/nar/reader/mod.rs
new file mode 100644
index 0000000000..ecf9c3d789
--- /dev/null
+++ b/tvix/nix-compat/src/nar/reader/mod.rs
@@ -0,0 +1,479 @@
+//! Parser for the Nix archive format, aka NAR.
+//!
+//! NAR files (and their hashed representations) are used in C++ Nix for
+//! a variety of things, including addressing fixed-output derivations
+//! and transferring store paths between Nix stores.
+
+use std::io::{
+    self, BufRead,
+    ErrorKind::{InvalidData, UnexpectedEof},
+    Read, Write,
+};
+
+#[cfg(not(debug_assertions))]
+use std::marker::PhantomData;
+
+// Required reading for understanding this module.
+use crate::nar::wire;
+
+mod read;
+#[cfg(test)]
+mod test;
+
+pub type Reader<'a> = dyn BufRead + Send + 'a;
+
+struct ArchiveReader<'a, 'r> {
+    inner: &'a mut Reader<'r>,
+
+    /// In debug mode, also track when we need to abandon this archive reader.
+    /// The archive reader must be abandoned when:
+    ///   * An error is encountered at any point
+    ///   * A file or directory reader is dropped before being read entirely.
+    /// All of these checks vanish in release mode.
+    status: ArchiveReaderStatus<'a>,
+}
+
+macro_rules! try_or_poison {
+    ($it:expr, $ex:expr) => {
+        match $ex {
+            Ok(x) => x,
+            Err(e) => {
+                $it.status.poison();
+                return Err(e.into());
+            }
+        }
+    };
+}
+/// Start reading a NAR file from `reader`.
+pub fn open<'a, 'r>(reader: &'a mut Reader<'r>) -> io::Result<Node<'a, 'r>> {
+    read::token(reader, &wire::TOK_NAR)?;
+    Node::new(ArchiveReader {
+        inner: reader,
+        status: ArchiveReaderStatus::top(),
+    })
+}
+
+pub enum Node<'a, 'r> {
+    Symlink {
+        target: Vec<u8>,
+    },
+    File {
+        executable: bool,
+        reader: FileReader<'a, 'r>,
+    },
+    Directory(DirReader<'a, 'r>),
+}
+
+impl<'a, 'r> Node<'a, 'r> {
+    /// Start reading a [Node], matching the next [wire::Node].
+    ///
+    /// Reading the terminating [wire::TOK_PAR] is done immediately for [Node::Symlink],
+    /// but is otherwise left to [DirReader] or [FileReader].
+    fn new(mut reader: ArchiveReader<'a, 'r>) -> io::Result<Self> {
+        Ok(match read::tag(reader.inner)? {
+            wire::Node::Sym => {
+                let target =
+                    try_or_poison!(reader, read::bytes(reader.inner, wire::MAX_TARGET_LEN));
+
+                if target.is_empty() || target.contains(&0) {
+                    reader.status.poison();
+                    return Err(InvalidData.into());
+                }
+
+                try_or_poison!(reader, read::token(reader.inner, &wire::TOK_PAR));
+                reader.status.ready_parent(); // Immediately allow reading from parent again
+
+                Node::Symlink { target }
+            }
+            tag @ (wire::Node::Reg | wire::Node::Exe) => {
+                let len = try_or_poison!(&mut reader, read::u64(reader.inner));
+
+                Node::File {
+                    executable: tag == wire::Node::Exe,
+                    reader: FileReader::new(reader, len)?,
+                }
+            }
+            wire::Node::Dir => Node::Directory(DirReader::new(reader)),
+        })
+    }
+}
+
+/// File contents, readable through the [Read] trait.
+///
+/// It comes with some caveats:
+///  * You must always read the entire file, unless you intend to abandon the entire archive reader.
+///  * You must abandon the entire archive reader upon the first error.
+///
+/// It's fine to read exactly `reader.len()` bytes without ever seeing an explicit EOF.
+pub struct FileReader<'a, 'r> {
+    reader: ArchiveReader<'a, 'r>,
+    len: u64,
+    /// Truncated original file length for padding computation.
+    /// We only care about the 3 least significant bits; semantically, this is a u3.
+    pad: u8,
+}
+
+impl<'a, 'r> FileReader<'a, 'r> {
+    /// Instantiate a new reader, starting after [wire::TOK_REG] or [wire::TOK_EXE].
+    /// We handle the terminating [wire::TOK_PAR] on semantic EOF.
+    fn new(mut reader: ArchiveReader<'a, 'r>, len: u64) -> io::Result<Self> {
+        // For zero-length files, we have to read the terminating TOK_PAR
+        // immediately, since FileReader::read may never be called; we've
+        // already reached semantic EOF by definition.
+        if len == 0 {
+            read::token(reader.inner, &wire::TOK_PAR)?;
+            reader.status.ready_parent();
+        }
+
+        Ok(Self {
+            reader,
+            len,
+            pad: len as u8,
+        })
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    pub fn len(&self) -> u64 {
+        self.len
+    }
+}
+
+impl FileReader<'_, '_> {
+    /// Equivalent to [BufRead::fill_buf]
+    ///
+    /// We can't directly implement [BufRead], because [FileReader::consume] needs
+    /// to perform fallible I/O.
+    pub fn fill_buf(&mut self) -> io::Result<&[u8]> {
+        if self.is_empty() {
+            return Ok(&[]);
+        }
+
+        self.reader.check_correct();
+
+        let mut buf = try_or_poison!(self.reader, self.reader.inner.fill_buf());
+
+        if buf.is_empty() {
+            self.reader.status.poison();
+            return Err(UnexpectedEof.into());
+        }
+
+        if buf.len() as u64 > self.len {
+            buf = &buf[..self.len as usize];
+        }
+
+        Ok(buf)
+    }
+
+    /// Analogous to [BufRead::consume], differing only in that it needs
+    /// to perform I/O in order to read padding and terminators.
+    pub fn consume(&mut self, n: usize) -> io::Result<()> {
+        if n == 0 {
+            return Ok(());
+        }
+
+        self.reader.check_correct();
+
+        self.len = self
+            .len
+            .checked_sub(n as u64)
+            .expect("consumed bytes past EOF");
+
+        self.reader.inner.consume(n);
+
+        if self.is_empty() {
+            self.finish()?;
+        }
+
+        Ok(())
+    }
+
+    /// Copy the (remaining) contents of the file into `dst`.
+    pub fn copy(&mut self, mut dst: impl Write) -> io::Result<()> {
+        while !self.is_empty() {
+            let buf = self.fill_buf()?;
+            let n = try_or_poison!(self.reader, dst.write(buf));
+            self.consume(n)?;
+        }
+
+        Ok(())
+    }
+}
+
+impl Read for FileReader<'_, '_> {
+    fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
+        if buf.is_empty() || self.is_empty() {
+            return Ok(0);
+        }
+
+        self.reader.check_correct();
+
+        if buf.len() as u64 > self.len {
+            buf = &mut buf[..self.len as usize];
+        }
+
+        let n = try_or_poison!(self.reader, self.reader.inner.read(buf));
+        self.len -= n as u64;
+
+        if n == 0 {
+            self.reader.status.poison();
+            return Err(UnexpectedEof.into());
+        }
+
+        if self.is_empty() {
+            self.finish()?;
+        }
+
+        Ok(n)
+    }
+}
+
+impl FileReader<'_, '_> {
+    /// We've reached semantic EOF, consume and verify the padding and terminating TOK_PAR.
+    /// Files are padded to 64 bits (8 bytes), just like any other byte string in the wire format.
+    fn finish(&mut self) -> io::Result<()> {
+        let pad = (self.pad & 7) as usize;
+
+        if pad != 0 {
+            let mut buf = [0; 8];
+            try_or_poison!(self.reader, self.reader.inner.read_exact(&mut buf[pad..]));
+
+            if buf != [0; 8] {
+                self.reader.status.poison();
+                return Err(InvalidData.into());
+            }
+        }
+
+        try_or_poison!(self.reader, read::token(self.reader.inner, &wire::TOK_PAR));
+
+        // Done with reading this file, allow going back up the chain of readers
+        self.reader.status.ready_parent();
+
+        Ok(())
+    }
+}
+
+/// A directory iterator, yielding a sequence of [Node]s.
+/// It must be fully consumed before reading further from the [DirReader] that produced it, if any.
+pub struct DirReader<'a, 'r> {
+    reader: ArchiveReader<'a, 'r>,
+    /// Previous directory entry name.
+    /// We have to hang onto this to enforce name monotonicity.
+    prev_name: Option<Vec<u8>>,
+}
+
+pub struct Entry<'a, 'r> {
+    pub name: Vec<u8>,
+    pub node: Node<'a, 'r>,
+}
+
+impl<'a, 'r> DirReader<'a, 'r> {
+    fn new(reader: ArchiveReader<'a, 'r>) -> Self {
+        Self {
+            reader,
+            prev_name: None,
+        }
+    }
+
+    /// Read the next [Entry] from the directory.
+    ///
+    /// We explicitly don't implement [Iterator], since treating this as
+    /// a regular Rust iterator will surely lead you astray.
+    ///
+    ///  * You must always consume the entire iterator, unless you abandon the entire archive reader.
+    ///  * You must abandon the entire archive reader on the first error.
+    ///  * You must abandon the directory reader upon the first [None].
+    ///  * Even if you know the amount of elements up front, you must keep reading until you encounter [None].
+    #[allow(clippy::should_implement_trait)]
+    pub fn next(&mut self) -> io::Result<Option<Entry<'_, 'r>>> {
+        self.reader.check_correct();
+
+        // COME FROM the previous iteration: if we've already read an entry,
+        // read its terminating TOK_PAR here.
+        if self.prev_name.is_some() {
+            try_or_poison!(self.reader, read::token(self.reader.inner, &wire::TOK_PAR));
+        }
+
+        // Determine if there are more entries to follow
+        if let wire::Entry::None = try_or_poison!(self.reader, read::tag(self.reader.inner)) {
+            // We've reached the end of this directory.
+            self.reader.status.ready_parent();
+            return Ok(None);
+        }
+
+        let name = try_or_poison!(
+            self.reader,
+            read::bytes(self.reader.inner, wire::MAX_NAME_LEN)
+        );
+
+        if name.is_empty()
+            || name.contains(&0)
+            || name.contains(&b'/')
+            || name == b"."
+            || name == b".."
+        {
+            self.reader.status.poison();
+            return Err(InvalidData.into());
+        }
+
+        // Enforce strict monotonicity of directory entry names.
+        match &mut self.prev_name {
+            None => {
+                self.prev_name = Some(name.clone());
+            }
+            Some(prev_name) => {
+                if *prev_name >= name {
+                    self.reader.status.poison();
+                    return Err(InvalidData.into());
+                }
+
+                name[..].clone_into(prev_name);
+            }
+        }
+
+        try_or_poison!(self.reader, read::token(self.reader.inner, &wire::TOK_NOD));
+
+        Ok(Some(Entry {
+            name,
+            // Don't need to worry about poisoning here: Node::new will do it for us if needed
+            node: Node::new(self.reader.child())?,
+        }))
+    }
+}
+
+/// We use a stack of statuses to:
+///   * Share poisoned state across all objects from the same underlying reader,
+///     so we can check they are abandoned when an error occurs
+///   * Make sure only the most recently created object is read from, and is fully exhausted
+///     before anything it was created from is used again.
+enum ArchiveReaderStatus<'a> {
+    #[cfg(not(debug_assertions))]
+    None(PhantomData<&'a ()>),
+    #[cfg(debug_assertions)]
+    StackTop { poisoned: bool, ready: bool },
+    #[cfg(debug_assertions)]
+    StackChild {
+        poisoned: &'a mut bool,
+        parent_ready: &'a mut bool,
+        ready: bool,
+    },
+}
+
+impl ArchiveReaderStatus<'_> {
+    fn top() -> Self {
+        #[cfg(debug_assertions)]
+        {
+            ArchiveReaderStatus::StackTop {
+                poisoned: false,
+                ready: true,
+            }
+        }
+
+        #[cfg(not(debug_assertions))]
+        ArchiveReaderStatus::None(PhantomData)
+    }
+
+    /// Poison all the objects sharing the same reader, to be used when an error occurs
+    fn poison(&mut self) {
+        match self {
+            #[cfg(not(debug_assertions))]
+            ArchiveReaderStatus::None(_) => {}
+            #[cfg(debug_assertions)]
+            ArchiveReaderStatus::StackTop { poisoned: x, .. } => *x = true,
+            #[cfg(debug_assertions)]
+            ArchiveReaderStatus::StackChild { poisoned: x, .. } => **x = true,
+        }
+    }
+
+    /// Mark the parent as ready, allowing it to be used again and preventing this reference to the reader being used again.
+    fn ready_parent(&mut self) {
+        match self {
+            #[cfg(not(debug_assertions))]
+            ArchiveReaderStatus::None(_) => {}
+            #[cfg(debug_assertions)]
+            ArchiveReaderStatus::StackTop { ready, .. } => {
+                *ready = false;
+            }
+            #[cfg(debug_assertions)]
+            ArchiveReaderStatus::StackChild {
+                ready,
+                parent_ready,
+                ..
+            } => {
+                *ready = false;
+                **parent_ready = true;
+            }
+        };
+    }
+
+    fn poisoned(&self) -> bool {
+        match self {
+            #[cfg(not(debug_assertions))]
+            ArchiveReaderStatus::None(_) => false,
+            #[cfg(debug_assertions)]
+            ArchiveReaderStatus::StackTop { poisoned, .. } => *poisoned,
+            #[cfg(debug_assertions)]
+            ArchiveReaderStatus::StackChild { poisoned, .. } => **poisoned,
+        }
+    }
+
+    fn ready(&self) -> bool {
+        match self {
+            #[cfg(not(debug_assertions))]
+            ArchiveReaderStatus::None(_) => true,
+            #[cfg(debug_assertions)]
+            ArchiveReaderStatus::StackTop { ready, .. } => *ready,
+            #[cfg(debug_assertions)]
+            ArchiveReaderStatus::StackChild { ready, .. } => *ready,
+        }
+    }
+}
+
+impl<'a, 'r> ArchiveReader<'a, 'r> {
+    /// Create a new child reader from this one.
+    /// In debug mode, this reader will panic if called before the new child is exhausted / calls `ready_parent`
+    fn child(&mut self) -> ArchiveReader<'_, 'r> {
+        ArchiveReader {
+            inner: self.inner,
+            #[cfg(not(debug_assertions))]
+            status: ArchiveReaderStatus::None(PhantomData),
+            #[cfg(debug_assertions)]
+            status: match &mut self.status {
+                ArchiveReaderStatus::StackTop { poisoned, ready } => {
+                    *ready = false;
+                    ArchiveReaderStatus::StackChild {
+                        poisoned,
+                        parent_ready: ready,
+                        ready: true,
+                    }
+                }
+                ArchiveReaderStatus::StackChild {
+                    poisoned, ready, ..
+                } => {
+                    *ready = false;
+                    ArchiveReaderStatus::StackChild {
+                        poisoned,
+                        parent_ready: ready,
+                        ready: true,
+                    }
+                }
+            },
+        }
+    }
+
+    /// Check the reader is in the correct status.
+    /// Only does anything when debug assertions are on.
+    #[inline(always)]
+    fn check_correct(&self) {
+        assert!(
+            !self.status.poisoned(),
+            "Archive reader used after it was meant to be abandoned!"
+        );
+        assert!(
+            self.status.ready(),
+            "Non-ready archive reader used! (Should've been reading from something else)"
+        );
+    }
+}