about summary refs log tree commit diff
path: root/tvix/nix-compat/src/nar/reader/mod.rs
//! Parser for the Nix archive format, aka NAR.
//!
//! NAR files (and their hashed representations) are used in C++ Nix for
//! a variety of things, including addressing fixed-output derivations
//! and transferring store paths between Nix stores.

use std::io::{
    self, BufRead,
    ErrorKind::{InvalidData, UnexpectedEof},
    Read, Write,
};

#[cfg(not(debug_assertions))]
use std::marker::PhantomData;

// Required reading for understanding this module.
use crate::nar::wire;

mod read;
#[cfg(test)]
mod test;

pub type Reader<'a> = dyn BufRead + Send + 'a;

struct ArchiveReader<'a, 'r> {
    inner: &'a mut Reader<'r>,

    /// In debug mode, also track when we need to abandon this archive reader.
    /// The archive reader must be abandoned when:
    ///   * An error is encountered at any point
    ///   * A file or directory reader is dropped before being read entirely.
    /// All of these checks vanish in release mode.
    status: ArchiveReaderStatus<'a>,
}

macro_rules! try_or_poison {
    ($it:expr, $ex:expr) => {
        match $ex {
            Ok(x) => x,
            Err(e) => {
                $it.status.poison();
                return Err(e.into());
            }
        }
    };
}
/// Start reading a NAR file from `reader`.
pub fn open<'a, 'r>(reader: &'a mut Reader<'r>) -> io::Result<Node<'a, 'r>> {
    read::token(reader, &wire::TOK_NAR)?;
    Node::new(ArchiveReader {
        inner: reader,
        status: ArchiveReaderStatus::top(),
    })
}

pub enum Node<'a, 'r> {
    Symlink {
        target: Vec<u8>,
    },
    File {
        executable: bool,
        reader: FileReader<'a, 'r>,
    },
    Directory(DirReader<'a, 'r>),
}

impl<'a, 'r> Node<'a, 'r> {
    /// Start reading a [Node], matching the next [wire::Node].
    ///
    /// Reading the terminating [wire::TOK_PAR] is done immediately for [Node::Symlink],
    /// but is otherwise left to [DirReader] or [FileReader].
    fn new(mut reader: ArchiveReader<'a, 'r>) -> io::Result<Self> {
        Ok(match read::tag(reader.inner)? {
            wire::Node::Sym => {
                let target =
                    try_or_poison!(reader, read::bytes(reader.inner, wire::MAX_TARGET_LEN));

                if target.is_empty() || target.contains(&0) {
                    reader.status.poison();
                    return Err(InvalidData.into());
                }

                try_or_poison!(reader, read::token(reader.inner, &wire::TOK_PAR));
                reader.status.ready_parent(); // Immediately allow reading from parent again

                Node::Symlink { target }
            }
            tag @ (wire::Node::Reg | wire::Node::Exe) => {
                let len = try_or_poison!(&mut reader, read::u64(reader.inner));

                Node::File {
                    executable: tag == wire::Node::Exe,
                    reader: FileReader::new(reader, len)?,
                }
            }
            wire::Node::Dir => Node::Directory(DirReader::new(reader)),
        })
    }
}

/// File contents, readable through the [Read] trait.
///
/// It comes with some caveats:
///  * You must always read the entire file, unless you intend to abandon the entire archive reader.
///  * You must abandon the entire archive reader upon the first error.
///
/// It's fine to read exactly `reader.len()` bytes without ever seeing an explicit EOF.
pub struct FileReader<'a, 'r> {
    reader: ArchiveReader<'a, 'r>,
    len: u64,
    /// Truncated original file length for padding computation.
    /// We only care about the 3 least significant bits; semantically, this is a u3.
    pad: u8,
}

impl<'a, 'r> FileReader<'a, 'r> {
    /// Instantiate a new reader, starting after [wire::TOK_REG] or [wire::TOK_EXE].
    /// We handle the terminating [wire::TOK_PAR] on semantic EOF.
    fn new(mut reader: ArchiveReader<'a, 'r>, len: u64) -> io::Result<Self> {
        // For zero-length files, we have to read the terminating TOK_PAR
        // immediately, since FileReader::read may never be called; we've
        // already reached semantic EOF by definition.
        if len == 0 {
            read::token(reader.inner, &wire::TOK_PAR)?;
            reader.status.ready_parent();
        }

        Ok(Self {
            reader,
            len,
            pad: len as u8,
        })
    }

    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    pub fn len(&self) -> u64 {
        self.len
    }
}

impl FileReader<'_, '_> {
    /// Equivalent to [BufRead::fill_buf]
    ///
    /// We can't directly implement [BufRead], because [FileReader::consume] needs
    /// to perform fallible I/O.
    pub fn fill_buf(&mut self) -> io::Result<&[u8]> {
        if self.is_empty() {
            return Ok(&[]);
        }

        self.reader.check_correct();

        let mut buf = try_or_poison!(self.reader, self.reader.inner.fill_buf());

        if buf.is_empty() {
            self.reader.status.poison();
            return Err(UnexpectedEof.into());
        }

        if buf.len() as u64 > self.len {
            buf = &buf[..self.len as usize];
        }

        Ok(buf)
    }

    /// Analogous to [BufRead::consume], differing only in that it needs
    /// to perform I/O in order to read padding and terminators.
    pub fn consume(&mut self, n: usize) -> io::Result<()> {
        if n == 0 {
            return Ok(());
        }

        self.reader.check_correct();

        self.len = self
            .len
            .checked_sub(n as u64)
            .expect("consumed bytes past EOF");

        self.reader.inner.consume(n);

        if self.is_empty() {
            self.finish()?;
        }

        Ok(())
    }

    /// Copy the (remaining) contents of the file into `dst`.
    pub fn copy(&mut self, mut dst: impl Write) -> io::Result<()> {
        while !self.is_empty() {
            let buf = self.fill_buf()?;
            let n = try_or_poison!(self.reader, dst.write(buf));
            self.consume(n)?;
        }

        Ok(())
    }
}

impl Read for FileReader<'_, '_> {
    fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
        if buf.is_empty() || self.is_empty() {
            return Ok(0);
        }

        self.reader.check_correct();

        if buf.len() as u64 > self.len {
            buf = &mut buf[..self.len as usize];
        }

        let n = try_or_poison!(self.reader, self.reader.inner.read(buf));
        self.len -= n as u64;

        if n == 0 {
            self.reader.status.poison();
            return Err(UnexpectedEof.into());
        }

        if self.is_empty() {
            self.finish()?;
        }

        Ok(n)
    }
}

impl FileReader<'_, '_> {
    /// We've reached semantic EOF, consume and verify the padding and terminating TOK_PAR.
    /// Files are padded to 64 bits (8 bytes), just like any other byte string in the wire format.
    fn finish(&mut self) -> io::Result<()> {
        let pad = (self.pad & 7) as usize;

        if pad != 0 {
            let mut buf = [0; 8];
            try_or_poison!(self.reader, self.reader.inner.read_exact(&mut buf[pad..]));

            if buf != [0; 8] {
                self.reader.status.poison();
                return Err(InvalidData.into());
            }
        }

        try_or_poison!(self.reader, read::token(self.reader.inner, &wire::TOK_PAR));

        // Done with reading this file, allow going back up the chain of readers
        self.reader.status.ready_parent();

        Ok(())
    }
}

/// A directory iterator, yielding a sequence of [Node]s.
/// It must be fully consumed before reading further from the [DirReader] that produced it, if any.
pub struct DirReader<'a, 'r> {
    reader: ArchiveReader<'a, 'r>,
    /// Previous directory entry name.
    /// We have to hang onto this to enforce name monotonicity.
    prev_name: Option<Vec<u8>>,
}

pub struct Entry<'a, 'r> {
    pub name: Vec<u8>,
    pub node: Node<'a, 'r>,
}

impl<'a, 'r> DirReader<'a, 'r> {
    fn new(reader: ArchiveReader<'a, 'r>) -> Self {
        Self {
            reader,
            prev_name: None,
        }
    }

    /// Read the next [Entry] from the directory.
    ///
    /// We explicitly don't implement [Iterator], since treating this as
    /// a regular Rust iterator will surely lead you astray.
    ///
    ///  * You must always consume the entire iterator, unless you abandon the entire archive reader.
    ///  * You must abandon the entire archive reader on the first error.
    ///  * You must abandon the directory reader upon the first [None].
    ///  * Even if you know the amount of elements up front, you must keep reading until you encounter [None].
    #[allow(clippy::should_implement_trait)]
    pub fn next(&mut self) -> io::Result<Option<Entry<'_, 'r>>> {
        self.reader.check_correct();

        // COME FROM the previous iteration: if we've already read an entry,
        // read its terminating TOK_PAR here.
        if self.prev_name.is_some() {
            try_or_poison!(self.reader, read::token(self.reader.inner, &wire::TOK_PAR));
        }

        // Determine if there are more entries to follow
        if let wire::Entry::None = try_or_poison!(self.reader, read::tag(self.reader.inner)) {
            // We've reached the end of this directory.
            self.reader.status.ready_parent();
            return Ok(None);
        }

        let name = try_or_poison!(
            self.reader,
            read::bytes(self.reader.inner, wire::MAX_NAME_LEN)
        );

        if name.is_empty()
            || name.contains(&0)
            || name.contains(&b'/')
            || name == b"."
            || name == b".."
        {
            self.reader.status.poison();
            return Err(InvalidData.into());
        }

        // Enforce strict monotonicity of directory entry names.
        match &mut self.prev_name {
            None => {
                self.prev_name = Some(name.clone());
            }
            Some(prev_name) => {
                if *prev_name >= name {
                    self.reader.status.poison();
                    return Err(InvalidData.into());
                }

                name[..].clone_into(prev_name);
            }
        }

        try_or_poison!(self.reader, read::token(self.reader.inner, &wire::TOK_NOD));

        Ok(Some(Entry {
            name,
            // Don't need to worry about poisoning here: Node::new will do it for us if needed
            node: Node::new(self.reader.child())?,
        }))
    }
}

/// We use a stack of statuses to:
///   * Share poisoned state across all objects from the same underlying reader,
///     so we can check they are abandoned when an error occurs
///   * Make sure only the most recently created object is read from, and is fully exhausted
///     before anything it was created from is used again.
enum ArchiveReaderStatus<'a> {
    #[cfg(not(debug_assertions))]
    None(PhantomData<&'a ()>),
    #[cfg(debug_assertions)]
    StackTop { poisoned: bool, ready: bool },
    #[cfg(debug_assertions)]
    StackChild {
        poisoned: &'a mut bool,
        parent_ready: &'a mut bool,
        ready: bool,
    },
}

impl ArchiveReaderStatus<'_> {
    fn top() -> Self {
        #[cfg(debug_assertions)]
        {
            ArchiveReaderStatus::StackTop {
                poisoned: false,
                ready: true,
            }
        }

        #[cfg(not(debug_assertions))]
        ArchiveReaderStatus::None(PhantomData)
    }

    /// Poison all the objects sharing the same reader, to be used when an error occurs
    fn poison(&mut self) {
        match self {
            #[cfg(not(debug_assertions))]
            ArchiveReaderStatus::None(_) => {}
            #[cfg(debug_assertions)]
            ArchiveReaderStatus::StackTop { poisoned: x, .. } => *x = true,
            #[cfg(debug_assertions)]
            ArchiveReaderStatus::StackChild { poisoned: x, .. } => **x = true,
        }
    }

    /// Mark the parent as ready, allowing it to be used again and preventing this reference to the reader being used again.
    fn ready_parent(&mut self) {
        match self {
            #[cfg(not(debug_assertions))]
            ArchiveReaderStatus::None(_) => {}
            #[cfg(debug_assertions)]
            ArchiveReaderStatus::StackTop { ready, .. } => {
                *ready = false;
            }
            #[cfg(debug_assertions)]
            ArchiveReaderStatus::StackChild {
                ready,
                parent_ready,
                ..
            } => {
                *ready = false;
                **parent_ready = true;
            }
        };
    }

    fn poisoned(&self) -> bool {
        match self {
            #[cfg(not(debug_assertions))]
            ArchiveReaderStatus::None(_) => false,
            #[cfg(debug_assertions)]
            ArchiveReaderStatus::StackTop { poisoned, .. } => *poisoned,
            #[cfg(debug_assertions)]
            ArchiveReaderStatus::StackChild { poisoned, .. } => **poisoned,
        }
    }

    fn ready(&self) -> bool {
        match self {
            #[cfg(not(debug_assertions))]
            ArchiveReaderStatus::None(_) => true,
            #[cfg(debug_assertions)]
            ArchiveReaderStatus::StackTop { ready, .. } => *ready,
            #[cfg(debug_assertions)]
            ArchiveReaderStatus::StackChild { ready, .. } => *ready,
        }
    }
}

impl<'a, 'r> ArchiveReader<'a, 'r> {
    /// Create a new child reader from this one.
    /// In debug mode, this reader will panic if called before the new child is exhausted / calls `ready_parent`
    fn child(&mut self) -> ArchiveReader<'_, 'r> {
        ArchiveReader {
            inner: self.inner,
            #[cfg(not(debug_assertions))]
            status: ArchiveReaderStatus::None(PhantomData),
            #[cfg(debug_assertions)]
            status: match &mut self.status {
                ArchiveReaderStatus::StackTop { poisoned, ready } => {
                    *ready = false;
                    ArchiveReaderStatus::StackChild {
                        poisoned,
                        parent_ready: ready,
                        ready: true,
                    }
                }
                ArchiveReaderStatus::StackChild {
                    poisoned, ready, ..
                } => {
                    *ready = false;
                    ArchiveReaderStatus::StackChild {
                        poisoned,
                        parent_ready: ready,
                        ready: true,
                    }
                }
            },
        }
    }

    /// Check the reader is in the correct status.
    /// Only does anything when debug assertions are on.
    #[inline(always)]
    fn check_correct(&self) {
        assert!(
            !self.status.poisoned(),
            "Archive reader used after it was meant to be abandoned!"
        );
        assert!(
            self.status.ready(),
            "Non-ready archive reader used! (Should've been reading from something else)"
        );
    }
}