about summary refs log tree commit diff
path: root/tvix/nix-compat/src/nar/reader
diff options
context:
space:
mode:
authoredef <edef@edef.eu>2023-10-15T14·59+0000
committeredef <edef@edef.eu>2023-10-18T11·40+0000
commit6638f4d4ea6848fe6fc2dc89271f0b6950764729 (patch)
tree470e909133069a49403298d182972c9fff8c7a82 /tvix/nix-compat/src/nar/reader
parent08b98b7503edbde5277ce11faf48b94ca2f1c0b7 (diff)
feat(tvix/nix-compat): NAR reader r/6853
Change-Id: I50d51baf62c0419eaf17f0dc262f728aaff9794d
Reviewed-on: https://cl.tvl.fyi/c/depot/+/9688
Reviewed-by: raitobezarius <tvl@lahfa.xyz>
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
Diffstat (limited to 'tvix/nix-compat/src/nar/reader')
-rw-r--r--tvix/nix-compat/src/nar/reader/mod.rs229
-rw-r--r--tvix/nix-compat/src/nar/reader/read.rs109
2 files changed, 338 insertions, 0 deletions
diff --git a/tvix/nix-compat/src/nar/reader/mod.rs b/tvix/nix-compat/src/nar/reader/mod.rs
new file mode 100644
index 000000000000..e0b2e1c84574
--- /dev/null
+++ b/tvix/nix-compat/src/nar/reader/mod.rs
@@ -0,0 +1,229 @@
+//! Parser for the Nix archive format, aka NAR.
+//!
+//! NAR files (and their hashed representations) are used in C++ Nix for
+//! a variety of things, including addressing fixed-output derivations
+//! and transferring store paths between Nix stores.
+
+use std::io::{
+    self,
+    ErrorKind::{InvalidData, UnexpectedEof},
+    Read,
+};
+
+// Required reading for understanding this module.
+use crate::nar::wire;
+
+mod read;
+
+pub type Reader<'a> = dyn Read + Send + 'a;
+
+/// Start reading a NAR file from `reader`.
+pub fn open<'a, 'r>(reader: &'a mut Reader<'r>) -> io::Result<Node<'a, 'r>> {
+    read::token(reader, &wire::TOK_NAR)?;
+    Node::new(reader)
+}
+
+pub enum Node<'a, 'r> {
+    Symlink {
+        target: Vec<u8>,
+    },
+    File {
+        executable: bool,
+        reader: FileReader<'a, 'r>,
+    },
+    Directory(DirReader<'a, 'r>),
+}
+
+impl<'a, 'r> Node<'a, 'r> {
+    /// Start reading a [Node], matching the next [wire::Node].
+    ///
+    /// Reading the terminating [wire::TOK_PAR] is done immediately for [Node::Symlink],
+    /// but is otherwise left to [DirReader] or [FileReader].
+    fn new(reader: &'a mut Reader<'r>) -> io::Result<Self> {
+        Ok(match read::tag(reader)? {
+            wire::Node::Sym => {
+                let target = read::bytes(reader, wire::MAX_TARGET_LEN)?;
+
+                if target.is_empty() || target.contains(&0) {
+                    return Err(InvalidData.into());
+                }
+
+                read::token(reader, &wire::TOK_PAR)?;
+
+                Node::Symlink { target }
+            }
+            tag @ (wire::Node::Reg | wire::Node::Exe) => {
+                let len = read::u64(reader)?;
+
+                Node::File {
+                    executable: tag == wire::Node::Exe,
+                    reader: FileReader::new(reader, len)?,
+                }
+            }
+            wire::Node::Dir => Node::Directory(DirReader::new(reader)),
+        })
+    }
+}
+
+/// File contents, readable through the [Read] trait.
+///
+/// It comes with some caveats:
+///  * You must always read the entire file, unless you intend to abandon the entire archive reader.
+///  * You must abandon the entire archive reader upon the first error.
+///
+/// It's fine to read exactly `reader.len()` bytes without ever seeing an explicit EOF.
+///
+/// TODO(edef): enforce these in `#[cfg(debug_assertions)]`
+pub struct FileReader<'a, 'r> {
+    reader: &'a mut Reader<'r>,
+    len: u64,
+    /// Truncated original file length for padding computation.
+    /// We only care about the 3 least significant bits; semantically, this is a u3.
+    pad: u8,
+}
+
+impl<'a, 'r> FileReader<'a, 'r> {
+    /// Instantiate a new reader, starting after [wire::TOK_REG] or [wire::TOK_EXE].
+    /// We handle the terminating [wire::TOK_PAR] on semantic EOF.
+    fn new(reader: &'a mut Reader<'r>, len: u64) -> io::Result<Self> {
+        // For zero-length files, we have to read the terminating TOK_PAR
+        // immediately, since FileReader::read may never be called; we've
+        // already reached semantic EOF by definition.
+        if len == 0 {
+            read::token(reader, &wire::TOK_PAR)?;
+        }
+
+        Ok(Self {
+            reader,
+            len,
+            pad: len as u8,
+        })
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    pub fn len(&self) -> u64 {
+        self.len
+    }
+}
+
+impl Read for FileReader<'_, '_> {
+    fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
+        if buf.is_empty() || self.is_empty() {
+            return Ok(0);
+        }
+
+        if buf.len() as u64 > self.len {
+            buf = &mut buf[..self.len as usize];
+        }
+
+        let n = self.reader.read(buf)?;
+        self.len -= n as u64;
+
+        if n == 0 {
+            return Err(UnexpectedEof.into());
+        }
+
+        // If we've reached semantic EOF, consume and verify the padding and terminating TOK_PAR.
+        // Files are padded to 64 bits (8 bytes), just like any other byte string in the wire format.
+        if self.is_empty() {
+            let pad = (self.pad & 7) as usize;
+
+            if pad != 0 {
+                let mut buf = [0; 8];
+                self.reader.read_exact(&mut buf[pad..])?;
+
+                if buf != [0; 8] {
+                    return Err(InvalidData.into());
+                }
+            }
+
+            read::token(self.reader, &wire::TOK_PAR)?;
+        }
+
+        Ok(n)
+    }
+}
+
+/// A directory iterator, yielding a sequence of [Node]s.
+/// It must be fully consumed before reading further from the [DirReader] that produced it, if any.
+pub struct DirReader<'a, 'r> {
+    reader: &'a mut Reader<'r>,
+    /// Previous directory entry name.
+    /// We have to hang onto this to enforce name monotonicity.
+    prev_name: Option<Vec<u8>>,
+}
+
+pub struct Entry<'a, 'r> {
+    pub name: Vec<u8>,
+    pub node: Node<'a, 'r>,
+}
+
+impl<'a, 'r> DirReader<'a, 'r> {
+    fn new(reader: &'a mut Reader<'r>) -> Self {
+        Self {
+            reader,
+            prev_name: None,
+        }
+    }
+
+    /// Read the next [Entry] from the directory.
+    ///
+    /// We explicitly don't implement [Iterator], since treating this as
+    /// a regular Rust iterator will surely lead you astray.
+    ///
+    ///  * You must always consume the entire iterator, unless you abandon the entire archive reader.
+    ///  * You must abandon the entire archive reader on the first error.
+    ///  * You must abandon the directory reader upon the first [None].
+    ///  * Even if you know the amount of elements up front, you must keep reading until you encounter [None].
+    ///
+    /// TODO(edef): enforce these in `#[cfg(debug_assertions)]`
+    #[allow(clippy::should_implement_trait)]
+    pub fn next(&mut self) -> io::Result<Option<Entry>> {
+        // COME FROM the previous iteration: if we've already read an entry,
+        // read its terminating TOK_PAR here.
+        if self.prev_name.is_some() {
+            read::token(self.reader, &wire::TOK_PAR)?;
+        }
+
+        // Determine if there are more entries to follow
+        if let wire::Entry::None = read::tag(self.reader)? {
+            // We've reached the end of this directory.
+            return Ok(None);
+        }
+
+        let name = read::bytes(self.reader, wire::MAX_NAME_LEN)?;
+
+        if name.is_empty()
+            || name.contains(&0)
+            || name.contains(&b'/')
+            || name == b"."
+            || name == b".."
+        {
+            return Err(InvalidData.into());
+        }
+
+        // Enforce strict monotonicity of directory entry names.
+        match &mut self.prev_name {
+            None => {
+                self.prev_name = Some(name.clone());
+            }
+            Some(prev_name) => {
+                if *prev_name >= name {
+                    return Err(InvalidData.into());
+                }
+
+                name[..].clone_into(prev_name);
+            }
+        }
+
+        read::token(self.reader, &wire::TOK_NOD)?;
+
+        Ok(Some(Entry {
+            name,
+            node: Node::new(&mut self.reader)?,
+        }))
+    }
+}
diff --git a/tvix/nix-compat/src/nar/reader/read.rs b/tvix/nix-compat/src/nar/reader/read.rs
new file mode 100644
index 000000000000..3c886dcfb87a
--- /dev/null
+++ b/tvix/nix-compat/src/nar/reader/read.rs
@@ -0,0 +1,109 @@
+//! Helpers for reading [crate::nar::wire] format.
+
+use std::io::{
+    self,
+    ErrorKind::{Interrupted, InvalidData, UnexpectedEof},
+};
+
+use super::Reader;
+use crate::nar::wire::Tag;
+
+/// Consume a little-endian [u64] from the reader.
+pub fn u64(reader: &mut Reader) -> io::Result<u64> {
+    let mut buf = [0; 8];
+    reader.read_exact(&mut buf)?;
+    Ok(u64::from_le_bytes(buf))
+}
+
+/// Consume a byte string of up to `max_len` bytes from the reader.
+pub fn bytes(reader: &mut Reader, max_len: usize) -> io::Result<Vec<u8>> {
+    assert!(max_len <= isize::MAX as usize);
+
+    // read the length, and reject excessively large values
+    let len = self::u64(reader)?;
+    if len > max_len as u64 {
+        return Err(InvalidData.into());
+    }
+    // we know the length fits in a usize now
+    let len = len as usize;
+
+    // read the data and padding into a buffer
+    let buf_len = (len + 7) & !7;
+    let mut buf = vec![0; buf_len];
+    reader.read_exact(&mut buf)?;
+
+    // verify that the padding is all zeroes
+    for b in buf.drain(len..) {
+        if b != 0 {
+            return Err(InvalidData.into());
+        }
+    }
+
+    Ok(buf)
+}
+
+/// Consume a known token from the reader.
+pub fn token<const N: usize>(reader: &mut Reader, token: &[u8; N]) -> io::Result<()> {
+    let mut buf = [0u8; N];
+
+    // This implements something similar to [Read::read_exact], but verifies that
+    // the input data matches the token while we read it. These two slices respectively
+    // represent the remaining token to be verified, and the remaining input buffer.
+    let mut token = &token[..];
+    let mut buf = &mut buf[..];
+
+    while !token.is_empty() {
+        match reader.read(buf) {
+            Ok(0) => {
+                return Err(UnexpectedEof.into());
+            }
+            Ok(n) => {
+                let (t, b);
+                (t, token) = token.split_at(n);
+                (b, buf) = buf.split_at_mut(n);
+
+                if t != b {
+                    return Err(InvalidData.into());
+                }
+            }
+            Err(e) => {
+                if e.kind() != Interrupted {
+                    return Err(e);
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Consume a [Tag] from the reader.
+pub fn tag<T: Tag>(reader: &mut Reader) -> io::Result<T> {
+    let mut buf = T::make_buf();
+    let buf = buf.as_mut();
+
+    // first read the known minimum length…
+    reader.read_exact(&mut buf[..T::MIN])?;
+
+    // then decide which tag we're expecting
+    let tag = T::from_u8(buf[T::OFF]).ok_or(InvalidData)?;
+    let (head, tail) = tag.as_bytes().split_at(T::MIN);
+
+    // make sure what we've read so far is valid
+    if buf[..T::MIN] != *head {
+        return Err(InvalidData.into());
+    }
+
+    // …then read the rest, if any
+    if !tail.is_empty() {
+        let rest = tail.len();
+        reader.read_exact(&mut buf[..rest])?;
+
+        // and make sure it's what we expect
+        if buf[..rest] != *tail {
+            return Err(InvalidData.into());
+        }
+    }
+
+    Ok(tag)
+}