about summary refs log tree commit diff
path: root/tvix/nix-compat
diff options
context:
space:
mode:
authoredef <edef@edef.eu>2023-10-15T14·59+0000
committeredef <edef@edef.eu>2023-10-18T11·40+0000
commit6638f4d4ea6848fe6fc2dc89271f0b6950764729 (patch)
tree470e909133069a49403298d182972c9fff8c7a82 /tvix/nix-compat
parent08b98b7503edbde5277ce11faf48b94ca2f1c0b7 (diff)
feat(tvix/nix-compat): NAR reader r/6853
Change-Id: I50d51baf62c0419eaf17f0dc262f728aaff9794d
Reviewed-on: https://cl.tvl.fyi/c/depot/+/9688
Reviewed-by: raitobezarius <tvl@lahfa.xyz>
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
Diffstat (limited to 'tvix/nix-compat')
-rw-r--r--tvix/nix-compat/src/nar/mod.rs3
-rw-r--r--tvix/nix-compat/src/nar/reader/mod.rs229
-rw-r--r--tvix/nix-compat/src/nar/reader/read.rs109
-rw-r--r--tvix/nix-compat/src/nar/wire/mod.rs (renamed from tvix/nix-compat/src/nar/writer/wire.rs)22
-rw-r--r--tvix/nix-compat/src/nar/wire/tag.rs165
-rw-r--r--tvix/nix-compat/src/nar/writer/async.rs2
-rw-r--r--tvix/nix-compat/src/nar/writer/mod.rs2
-rw-r--r--tvix/nix-compat/src/nar/writer/sync.rs2
8 files changed, 530 insertions, 4 deletions
diff --git a/tvix/nix-compat/src/nar/mod.rs b/tvix/nix-compat/src/nar/mod.rs
index d3baa817825a..058977f4fcd1 100644
--- a/tvix/nix-compat/src/nar/mod.rs
+++ b/tvix/nix-compat/src/nar/mod.rs
@@ -1 +1,4 @@
+mod wire;
+
+pub mod reader;
 pub mod writer;
diff --git a/tvix/nix-compat/src/nar/reader/mod.rs b/tvix/nix-compat/src/nar/reader/mod.rs
new file mode 100644
index 000000000000..e0b2e1c84574
--- /dev/null
+++ b/tvix/nix-compat/src/nar/reader/mod.rs
@@ -0,0 +1,229 @@
+//! Parser for the Nix archive format, aka NAR.
+//!
+//! NAR files (and their hashed representations) are used in C++ Nix for
+//! a variety of things, including addressing fixed-output derivations
+//! and transferring store paths between Nix stores.
+
+use std::io::{
+    self,
+    ErrorKind::{InvalidData, UnexpectedEof},
+    Read,
+};
+
+// Required reading for understanding this module.
+use crate::nar::wire;
+
+mod read;
+
+pub type Reader<'a> = dyn Read + Send + 'a;
+
+/// Start reading a NAR file from `reader`.
+pub fn open<'a, 'r>(reader: &'a mut Reader<'r>) -> io::Result<Node<'a, 'r>> {
+    read::token(reader, &wire::TOK_NAR)?;
+    Node::new(reader)
+}
+
+pub enum Node<'a, 'r> {
+    Symlink {
+        target: Vec<u8>,
+    },
+    File {
+        executable: bool,
+        reader: FileReader<'a, 'r>,
+    },
+    Directory(DirReader<'a, 'r>),
+}
+
+impl<'a, 'r> Node<'a, 'r> {
+    /// Start reading a [Node], matching the next [wire::Node].
+    ///
+    /// Reading the terminating [wire::TOK_PAR] is done immediately for [Node::Symlink],
+    /// but is otherwise left to [DirReader] or [FileReader].
+    fn new(reader: &'a mut Reader<'r>) -> io::Result<Self> {
+        Ok(match read::tag(reader)? {
+            wire::Node::Sym => {
+                let target = read::bytes(reader, wire::MAX_TARGET_LEN)?;
+
+                if target.is_empty() || target.contains(&0) {
+                    return Err(InvalidData.into());
+                }
+
+                read::token(reader, &wire::TOK_PAR)?;
+
+                Node::Symlink { target }
+            }
+            tag @ (wire::Node::Reg | wire::Node::Exe) => {
+                let len = read::u64(reader)?;
+
+                Node::File {
+                    executable: tag == wire::Node::Exe,
+                    reader: FileReader::new(reader, len)?,
+                }
+            }
+            wire::Node::Dir => Node::Directory(DirReader::new(reader)),
+        })
+    }
+}
+
+/// File contents, readable through the [Read] trait.
+///
+/// It comes with some caveats:
+///  * You must always read the entire file, unless you intend to abandon the entire archive reader.
+///  * You must abandon the entire archive reader upon the first error.
+///
+/// It's fine to read exactly `reader.len()` bytes without ever seeing an explicit EOF.
+///
+/// TODO(edef): enforce these in `#[cfg(debug_assertions)]`
+pub struct FileReader<'a, 'r> {
+    reader: &'a mut Reader<'r>,
+    len: u64,
+    /// Truncated original file length for padding computation.
+    /// We only care about the 3 least significant bits; semantically, this is a u3.
+    pad: u8,
+}
+
+impl<'a, 'r> FileReader<'a, 'r> {
+    /// Instantiate a new reader, starting after [wire::TOK_REG] or [wire::TOK_EXE].
+    /// We handle the terminating [wire::TOK_PAR] on semantic EOF.
+    fn new(reader: &'a mut Reader<'r>, len: u64) -> io::Result<Self> {
+        // For zero-length files, we have to read the terminating TOK_PAR
+        // immediately, since FileReader::read may never be called; we've
+        // already reached semantic EOF by definition.
+        if len == 0 {
+            read::token(reader, &wire::TOK_PAR)?;
+        }
+
+        Ok(Self {
+            reader,
+            len,
+            pad: len as u8,
+        })
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    pub fn len(&self) -> u64 {
+        self.len
+    }
+}
+
+impl Read for FileReader<'_, '_> {
+    fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
+        if buf.is_empty() || self.is_empty() {
+            return Ok(0);
+        }
+
+        if buf.len() as u64 > self.len {
+            buf = &mut buf[..self.len as usize];
+        }
+
+        let n = self.reader.read(buf)?;
+        self.len -= n as u64;
+
+        if n == 0 {
+            return Err(UnexpectedEof.into());
+        }
+
+        // If we've reached semantic EOF, consume and verify the padding and terminating TOK_PAR.
+        // Files are padded to 64 bits (8 bytes), just like any other byte string in the wire format.
+        if self.is_empty() {
+            let pad = (self.pad & 7) as usize;
+
+            if pad != 0 {
+                let mut buf = [0; 8];
+                self.reader.read_exact(&mut buf[pad..])?;
+
+                if buf != [0; 8] {
+                    return Err(InvalidData.into());
+                }
+            }
+
+            read::token(self.reader, &wire::TOK_PAR)?;
+        }
+
+        Ok(n)
+    }
+}
+
+/// A directory iterator, yielding a sequence of [Node]s.
+/// It must be fully consumed before reading further from the [DirReader] that produced it, if any.
+pub struct DirReader<'a, 'r> {
+    reader: &'a mut Reader<'r>,
+    /// Previous directory entry name.
+    /// We have to hang onto this to enforce name monotonicity.
+    prev_name: Option<Vec<u8>>,
+}
+
+pub struct Entry<'a, 'r> {
+    pub name: Vec<u8>,
+    pub node: Node<'a, 'r>,
+}
+
+impl<'a, 'r> DirReader<'a, 'r> {
+    fn new(reader: &'a mut Reader<'r>) -> Self {
+        Self {
+            reader,
+            prev_name: None,
+        }
+    }
+
+    /// Read the next [Entry] from the directory.
+    ///
+    /// We explicitly don't implement [Iterator], since treating this as
+    /// a regular Rust iterator will surely lead you astray.
+    ///
+    ///  * You must always consume the entire iterator, unless you abandon the entire archive reader.
+    ///  * You must abandon the entire archive reader on the first error.
+    ///  * You must abandon the directory reader upon the first [None].
+    ///  * Even if you know the amount of elements up front, you must keep reading until you encounter [None].
+    ///
+    /// TODO(edef): enforce these in `#[cfg(debug_assertions)]`
+    #[allow(clippy::should_implement_trait)]
+    pub fn next(&mut self) -> io::Result<Option<Entry>> {
+        // COME FROM the previous iteration: if we've already read an entry,
+        // read its terminating TOK_PAR here.
+        if self.prev_name.is_some() {
+            read::token(self.reader, &wire::TOK_PAR)?;
+        }
+
+        // Determine if there are more entries to follow
+        if let wire::Entry::None = read::tag(self.reader)? {
+            // We've reached the end of this directory.
+            return Ok(None);
+        }
+
+        let name = read::bytes(self.reader, wire::MAX_NAME_LEN)?;
+
+        if name.is_empty()
+            || name.contains(&0)
+            || name.contains(&b'/')
+            || name == b"."
+            || name == b".."
+        {
+            return Err(InvalidData.into());
+        }
+
+        // Enforce strict monotonicity of directory entry names.
+        match &mut self.prev_name {
+            None => {
+                self.prev_name = Some(name.clone());
+            }
+            Some(prev_name) => {
+                if *prev_name >= name {
+                    return Err(InvalidData.into());
+                }
+
+                name[..].clone_into(prev_name);
+            }
+        }
+
+        read::token(self.reader, &wire::TOK_NOD)?;
+
+        Ok(Some(Entry {
+            name,
+            node: Node::new(&mut self.reader)?,
+        }))
+    }
+}
diff --git a/tvix/nix-compat/src/nar/reader/read.rs b/tvix/nix-compat/src/nar/reader/read.rs
new file mode 100644
index 000000000000..3c886dcfb87a
--- /dev/null
+++ b/tvix/nix-compat/src/nar/reader/read.rs
@@ -0,0 +1,109 @@
+//! Helpers for reading [crate::nar::wire] format.
+
+use std::io::{
+    self,
+    ErrorKind::{Interrupted, InvalidData, UnexpectedEof},
+};
+
+use super::Reader;
+use crate::nar::wire::Tag;
+
+/// Consume a little-endian [u64] from the reader.
+pub fn u64(reader: &mut Reader) -> io::Result<u64> {
+    let mut buf = [0; 8];
+    reader.read_exact(&mut buf)?;
+    Ok(u64::from_le_bytes(buf))
+}
+
+/// Consume a byte string of up to `max_len` bytes from the reader.
+pub fn bytes(reader: &mut Reader, max_len: usize) -> io::Result<Vec<u8>> {
+    assert!(max_len <= isize::MAX as usize);
+
+    // read the length, and reject excessively large values
+    let len = self::u64(reader)?;
+    if len > max_len as u64 {
+        return Err(InvalidData.into());
+    }
+    // we know the length fits in a usize now
+    let len = len as usize;
+
+    // read the data and padding into a buffer
+    let buf_len = (len + 7) & !7;
+    let mut buf = vec![0; buf_len];
+    reader.read_exact(&mut buf)?;
+
+    // verify that the padding is all zeroes
+    for b in buf.drain(len..) {
+        if b != 0 {
+            return Err(InvalidData.into());
+        }
+    }
+
+    Ok(buf)
+}
+
+/// Consume a known token from the reader.
+pub fn token<const N: usize>(reader: &mut Reader, token: &[u8; N]) -> io::Result<()> {
+    let mut buf = [0u8; N];
+
+    // This implements something similar to [Read::read_exact], but verifies that
+    // the input data matches the token while we read it. These two slices respectively
+    // represent the remaining token to be verified, and the remaining input buffer.
+    let mut token = &token[..];
+    let mut buf = &mut buf[..];
+
+    while !token.is_empty() {
+        match reader.read(buf) {
+            Ok(0) => {
+                return Err(UnexpectedEof.into());
+            }
+            Ok(n) => {
+                let (t, b);
+                (t, token) = token.split_at(n);
+                (b, buf) = buf.split_at_mut(n);
+
+                if t != b {
+                    return Err(InvalidData.into());
+                }
+            }
+            Err(e) => {
+                if e.kind() != Interrupted {
+                    return Err(e);
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Consume a [Tag] from the reader.
+pub fn tag<T: Tag>(reader: &mut Reader) -> io::Result<T> {
+    let mut buf = T::make_buf();
+    let buf = buf.as_mut();
+
+    // first read the known minimum length…
+    reader.read_exact(&mut buf[..T::MIN])?;
+
+    // then decide which tag we're expecting
+    let tag = T::from_u8(buf[T::OFF]).ok_or(InvalidData)?;
+    let (head, tail) = tag.as_bytes().split_at(T::MIN);
+
+    // make sure what we've read so far is valid
+    if buf[..T::MIN] != *head {
+        return Err(InvalidData.into());
+    }
+
+    // …then read the rest, if any
+    if !tail.is_empty() {
+        let rest = tail.len();
+        reader.read_exact(&mut buf[..rest])?;
+
+        // and make sure it's what we expect
+        if buf[..rest] != *tail {
+            return Err(InvalidData.into());
+        }
+    }
+
+    Ok(tag)
+}
diff --git a/tvix/nix-compat/src/nar/writer/wire.rs b/tvix/nix-compat/src/nar/wire/mod.rs
index a6c19f0759c3..fc8b8d3510aa 100644
--- a/tvix/nix-compat/src/nar/writer/wire.rs
+++ b/tvix/nix-compat/src/nar/wire/mod.rs
@@ -108,3 +108,25 @@ fn tokens() {
         assert_eq!(tok, token(xs));
     }
 }
+
+pub use tag::Tag;
+mod tag;
+
+tag::make! {
+    /// These are the node tokens, succeeding [TOK_NAR] or [TOK_NOD],
+    /// and preceding the next variable-length element.
+    pub enum Node[16] {
+        Sym = TOK_SYM,
+        Reg = TOK_REG,
+        Exe = TOK_EXE,
+        Dir = TOK_DIR,
+    }
+
+    /// Directory entry or terminator
+    pub enum Entry[0] {
+        /// End of directory
+        None = TOK_PAR,
+        /// Directory entry, which must be followed by [Node]
+        Some = TOK_ENT,
+    }
+}
diff --git a/tvix/nix-compat/src/nar/wire/tag.rs b/tvix/nix-compat/src/nar/wire/tag.rs
new file mode 100644
index 000000000000..55ce4b4a2a87
--- /dev/null
+++ b/tvix/nix-compat/src/nar/wire/tag.rs
@@ -0,0 +1,165 @@
+/// A type implementing Tag represents a static hash set of byte strings,
+/// with a very simple perfect hash function: every element has a unique
+/// discriminant at a common byte offset. The values of the type represent
+/// the members by this single discriminant byte; they are indices into the
+/// hash set.
+pub trait Tag: Sized {
+    /// Discriminant offset
+    const OFF: usize;
+    /// Minimum variant length
+    const MIN: usize;
+
+    /// Minimal suitably sized buffer for reading the wire representation
+    /// HACK: This is a workaround for const generics limitations.
+    type Buf: AsMut<[u8]> + Send;
+
+    /// Make an instance of [Self::Buf]
+    fn make_buf() -> Self::Buf;
+
+    /// Convert a discriminant into the corresponding variant
+    fn from_u8(x: u8) -> Option<Self>;
+
+    /// Convert a variant back into the wire representation
+    fn as_bytes(&self) -> &'static [u8];
+}
+
+/// Generate an enum implementing [Tag], enforcing at compile time that
+/// the discriminant values are distinct.
+macro_rules! make {
+    (
+        $(
+            $(#[doc = $doc:expr])*
+            $vis:vis enum $Enum:ident[$off:expr] {
+                $(
+                    $(#[doc = $var_doc:expr])*
+                    $Var:ident = $TOK:ident,
+                )+
+            }
+        )*
+    ) => {
+        $(
+            $(#[doc = $doc])*
+            #[derive(Debug, PartialEq, Eq)]
+            #[repr(u8)]
+            $vis enum $Enum {
+                $(
+                    $(#[doc = $var_doc])*
+                    $Var = $TOK[$Enum::OFF]
+                ),+
+            }
+
+            impl Tag for $Enum {
+                /// Discriminant offset
+                const OFF: usize = $off;
+                /// Minimum variant length
+                const MIN: usize = tag::min_of(&[$($TOK.len()),+]);
+
+                /// Minimal suitably sized buffer for reading the wire representation
+                type Buf = [u8; tag::buf_of(&[$($TOK.len()),+])];
+
+                /// Make an instance of [Self::Buf]
+                #[inline(always)]
+                fn make_buf() -> Self::Buf {
+                    [0u8; tag::buf_of(&[$($TOK.len()),+])]
+                }
+
+                /// Convert a discriminant into the corresponding variant
+                #[inline(always)]
+                fn from_u8(x: u8) -> Option<Self> {
+                    #[allow(non_upper_case_globals)]
+                    mod __variant {
+                        $(
+                            pub const $Var: u8 = super::$Enum::$Var as u8;
+                        )+
+                    }
+
+                    match x {
+                        $(__variant::$Var => Some(Self::$Var),)+
+                        _ => None
+                    }
+                }
+
+                /// Convert a variant back into the wire representation
+                #[inline(always)]
+                fn as_bytes(&self) -> &'static [u8] {
+                    match self {
+                        $(Self::$Var => &$TOK,)+
+                    }
+                }
+            }
+        )*
+    };
+}
+
+pub(crate) use make;
+
+#[cfg(test)]
+mod test {
+    use super::super::tag::{self, Tag};
+
+    const TOK_A: [u8; 3] = [0xed, 0xef, 0x1c];
+    const TOK_B: [u8; 3] = [0xed, 0xf0, 0x1c];
+
+    const OFFSET: usize = 1;
+
+    make! {
+        enum Token[OFFSET] {
+            A = TOK_A,
+            B = TOK_B,
+        }
+    }
+
+    #[test]
+    fn example() {
+        assert_eq!(Token::from_u8(0xed), None);
+
+        let tag = Token::from_u8(0xef).unwrap();
+        assert_eq!(tag.as_bytes(), &TOK_A[..]);
+
+        let tag = Token::from_u8(0xf0).unwrap();
+        assert_eq!(tag.as_bytes(), &TOK_B[..]);
+    }
+}
+
+// The following functions are written somewhat unusually,
+// since they're const functions that cannot use iterators.
+
+/// Maximum element of a slice
+const fn max_of(mut xs: &[usize]) -> usize {
+    let mut y = usize::MIN;
+    while let &[x, ref tail @ ..] = xs {
+        y = if x > y { x } else { y };
+        xs = tail;
+    }
+    y
+}
+
+/// Minimum element of a slice
+pub const fn min_of(mut xs: &[usize]) -> usize {
+    let mut y = usize::MAX;
+    while let &[x, ref tail @ ..] = xs {
+        y = if x < y { x } else { y };
+        xs = tail;
+    }
+    y
+}
+
+/// Minimum buffer size to contain either of `0..Tag::MIN` and `Tag::MIN..`
+/// at a particular time, for all possible tag wire representations, given
+/// the sizes of all wire representations.
+///
+/// # Example
+///
+/// ```plain
+/// OFF = 16
+/// MIN = 24
+/// MAX = 64
+///
+/// BUF = max(MIN, MAX-MIN)
+///     = max(24, 64-24)
+///     = max(24, 40)
+///     = 40
+/// ```
+pub const fn buf_of(xs: &[usize]) -> usize {
+    max_of(&[min_of(xs), max_of(xs) - min_of(xs)])
+}
diff --git a/tvix/nix-compat/src/nar/writer/async.rs b/tvix/nix-compat/src/nar/writer/async.rs
index 2a77fa736677..2b73e56737ce 100644
--- a/tvix/nix-compat/src/nar/writer/async.rs
+++ b/tvix/nix-compat/src/nar/writer/async.rs
@@ -30,7 +30,7 @@
 //! # });
 //! ```
 
-use super::wire;
+use crate::nar::wire;
 use bstr::ByteSlice;
 use futures_util::{AsyncBufRead, AsyncBufReadExt, AsyncWrite, AsyncWriteExt};
 use std::{
diff --git a/tvix/nix-compat/src/nar/writer/mod.rs b/tvix/nix-compat/src/nar/writer/mod.rs
index 79f78f37fc40..bf81ccd4df32 100644
--- a/tvix/nix-compat/src/nar/writer/mod.rs
+++ b/tvix/nix-compat/src/nar/writer/mod.rs
@@ -1,7 +1,5 @@
 pub use sync::*;
 
-mod wire;
-
 pub mod sync;
 
 #[cfg(feature = "async")]
diff --git a/tvix/nix-compat/src/nar/writer/sync.rs b/tvix/nix-compat/src/nar/writer/sync.rs
index 996815149dd6..9290e4a5bdfe 100644
--- a/tvix/nix-compat/src/nar/writer/sync.rs
+++ b/tvix/nix-compat/src/nar/writer/sync.rs
@@ -28,7 +28,7 @@
 //! # Ok::<(), std::io::Error>(())
 //! ```
 
-use super::wire;
+use crate::nar::wire;
 use bstr::ByteSlice;
 use std::io::{
     self, BufRead,