From 6638f4d4ea6848fe6fc2dc89271f0b6950764729 Mon Sep 17 00:00:00 2001
From: edef
Date: Sun, 15 Oct 2023 14:59:39 +0000
Subject: feat(tvix/nix-compat): NAR reader

Change-Id: I50d51baf62c0419eaf17f0dc262f728aaff9794d
Reviewed-on: https://cl.tvl.fyi/c/depot/+/9688
Reviewed-by: raitobezarius
Tested-by: BuildkiteCI
Reviewed-by: flokli
---
 tvix/nix-compat/src/nar/mod.rs          |   3 +
 tvix/nix-compat/src/nar/reader/mod.rs   | 229 ++++++++++++++++++++++++++++++++
 tvix/nix-compat/src/nar/reader/read.rs  | 109 +++++++++++++++
 tvix/nix-compat/src/nar/wire/mod.rs     | 132 ++++++++++++++++++
 tvix/nix-compat/src/nar/wire/tag.rs     | 165 +++++++++++++++++++++++
 tvix/nix-compat/src/nar/writer/async.rs |   2 +-
 tvix/nix-compat/src/nar/writer/mod.rs   |   2 -
 tvix/nix-compat/src/nar/writer/sync.rs  |   2 +-
 tvix/nix-compat/src/nar/writer/wire.rs  | 110 ---------------
 9 files changed, 640 insertions(+), 114 deletions(-)
 create mode 100644 tvix/nix-compat/src/nar/reader/mod.rs
 create mode 100644 tvix/nix-compat/src/nar/reader/read.rs
 create mode 100644 tvix/nix-compat/src/nar/wire/mod.rs
 create mode 100644 tvix/nix-compat/src/nar/wire/tag.rs
 delete mode 100644 tvix/nix-compat/src/nar/writer/wire.rs

diff --git a/tvix/nix-compat/src/nar/mod.rs b/tvix/nix-compat/src/nar/mod.rs
index d3baa817825a..058977f4fcd1 100644
--- a/tvix/nix-compat/src/nar/mod.rs
+++ b/tvix/nix-compat/src/nar/mod.rs
@@ -1 +1,4 @@
+mod wire;
+
+pub mod reader;
 pub mod writer;
diff --git a/tvix/nix-compat/src/nar/reader/mod.rs b/tvix/nix-compat/src/nar/reader/mod.rs
new file mode 100644
index 000000000000..e0b2e1c84574
--- /dev/null
+++ b/tvix/nix-compat/src/nar/reader/mod.rs
@@ -0,0 +1,229 @@
+//! Parser for the Nix archive format, aka NAR.
+//!
+//! NAR files (and their hashed representations) are used in C++ Nix for
+//! a variety of things, including addressing fixed-output derivations
+//! and transferring store paths between Nix stores.
+
+use std::io::{
+    self,
+    ErrorKind::{InvalidData, UnexpectedEof},
+    Read,
+};
+
+// Required reading for understanding this module.
+use crate::nar::wire;
+
+mod read;
+
+pub type Reader<'a> = dyn Read + Send + 'a;
+
+/// Start reading a NAR file from `reader`.
+pub fn open<'a, 'r>(reader: &'a mut Reader<'r>) -> io::Result<Node<'a, 'r>> {
+    read::token(reader, &wire::TOK_NAR)?;
+    Node::new(reader)
+}
+
+pub enum Node<'a, 'r> {
+    Symlink {
+        target: Vec<u8>,
+    },
+    File {
+        executable: bool,
+        reader: FileReader<'a, 'r>,
+    },
+    Directory(DirReader<'a, 'r>),
+}
+
+impl<'a, 'r> Node<'a, 'r> {
+    /// Start reading a [Node], matching the next [wire::Node].
+    ///
+    /// Reading the terminating [wire::TOK_PAR] is done immediately for [Node::Symlink],
+    /// but is otherwise left to [DirReader] or [FileReader].
+    fn new(reader: &'a mut Reader<'r>) -> io::Result<Self> {
+        Ok(match read::tag(reader)? {
+            wire::Node::Sym => {
+                let target = read::bytes(reader, wire::MAX_TARGET_LEN)?;
+
+                if target.is_empty() || target.contains(&0) {
+                    return Err(InvalidData.into());
+                }
+
+                read::token(reader, &wire::TOK_PAR)?;
+
+                Node::Symlink { target }
+            }
+            tag @ (wire::Node::Reg | wire::Node::Exe) => {
+                let len = read::u64(reader)?;
+
+                Node::File {
+                    executable: tag == wire::Node::Exe,
+                    reader: FileReader::new(reader, len)?,
+                }
+            }
+            wire::Node::Dir => Node::Directory(DirReader::new(reader)),
+        })
+    }
+}
+
+/// File contents, readable through the [Read] trait.
+///
+/// It comes with some caveats:
+/// * You must always read the entire file, unless you intend to abandon the entire archive reader.
+/// * You must abandon the entire archive reader upon the first error.
+///
+/// It's fine to read exactly `reader.len()` bytes without ever seeing an explicit EOF.
+///
+/// TODO(edef): enforce these in `#[cfg(debug_assertions)]`
+pub struct FileReader<'a, 'r> {
+    reader: &'a mut Reader<'r>,
+    len: u64,
+    /// Truncated original file length for padding computation.
+    /// We only care about the 3 least significant bits; semantically, this is a u3.
+    pad: u8,
+}
+
+impl<'a, 'r> FileReader<'a, 'r> {
+    /// Instantiate a new reader, starting after [wire::TOK_REG] or [wire::TOK_EXE].
+    /// We handle the terminating [wire::TOK_PAR] on semantic EOF.
+    fn new(reader: &'a mut Reader<'r>, len: u64) -> io::Result<Self> {
+        // For zero-length files, we have to read the terminating TOK_PAR
+        // immediately, since FileReader::read may never be called; we've
+        // already reached semantic EOF by definition.
+        if len == 0 {
+            read::token(reader, &wire::TOK_PAR)?;
+        }
+
+        Ok(Self {
+            reader,
+            len,
+            pad: len as u8,
+        })
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    pub fn len(&self) -> u64 {
+        self.len
+    }
+}
+
+impl Read for FileReader<'_, '_> {
+    fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
+        if buf.is_empty() || self.is_empty() {
+            return Ok(0);
+        }
+
+        if buf.len() as u64 > self.len {
+            buf = &mut buf[..self.len as usize];
+        }
+
+        let n = self.reader.read(buf)?;
+        self.len -= n as u64;
+
+        if n == 0 {
+            return Err(UnexpectedEof.into());
+        }
+
+        // If we've reached semantic EOF, consume and verify the padding and terminating TOK_PAR.
+        // Files are padded to 64 bits (8 bytes), just like any other byte string in the wire format.
+        if self.is_empty() {
+            let pad = (self.pad & 7) as usize;
+
+            if pad != 0 {
+                let mut buf = [0; 8];
+                self.reader.read_exact(&mut buf[pad..])?;
+
+                if buf != [0; 8] {
+                    return Err(InvalidData.into());
+                }
+            }
+
+            read::token(self.reader, &wire::TOK_PAR)?;
+        }
+
+        Ok(n)
+    }
+}
+
+/// A directory iterator, yielding a sequence of [Node]s.
+/// It must be fully consumed before reading further from the [DirReader] that produced it, if any.
+pub struct DirReader<'a, 'r> {
+    reader: &'a mut Reader<'r>,
+    /// Previous directory entry name.
+    /// We have to hang onto this to enforce name monotonicity.
+    prev_name: Option<Vec<u8>>,
+}
+
+pub struct Entry<'a, 'r> {
+    pub name: Vec<u8>,
+    pub node: Node<'a, 'r>,
+}
+
+impl<'a, 'r> DirReader<'a, 'r> {
+    fn new(reader: &'a mut Reader<'r>) -> Self {
+        Self {
+            reader,
+            prev_name: None,
+        }
+    }
+
+    /// Read the next [Entry] from the directory.
+    ///
+    /// We explicitly don't implement [Iterator], since treating this as
+    /// a regular Rust iterator will surely lead you astray.
+    ///
+    /// * You must always consume the entire iterator, unless you abandon the entire archive reader.
+    /// * You must abandon the entire archive reader on the first error.
+    /// * You must abandon the directory reader upon the first [None].
+    /// * Even if you know the amount of elements up front, you must keep reading until you encounter [None].
+    ///
+    /// TODO(edef): enforce these in `#[cfg(debug_assertions)]`
+    #[allow(clippy::should_implement_trait)]
+    pub fn next(&mut self) -> io::Result<Option<Entry>> {
+        // COME FROM the previous iteration: if we've already read an entry,
+        // read its terminating TOK_PAR here.
+        if self.prev_name.is_some() {
+            read::token(self.reader, &wire::TOK_PAR)?;
+        }
+
+        // Determine if there are more entries to follow
+        if let wire::Entry::None = read::tag(self.reader)? {
+            // We've reached the end of this directory.
+            return Ok(None);
+        }
+
+        let name = read::bytes(self.reader, wire::MAX_NAME_LEN)?;
+
+        if name.is_empty()
+            || name.contains(&0)
+            || name.contains(&b'/')
+            || name == b"."
+            || name == b".."
+        {
+            return Err(InvalidData.into());
+        }
+
+        // Enforce strict monotonicity of directory entry names.
+        match &mut self.prev_name {
+            None => {
+                self.prev_name = Some(name.clone());
+            }
+            Some(prev_name) => {
+                if *prev_name >= name {
+                    return Err(InvalidData.into());
+                }
+
+                name[..].clone_into(prev_name);
+            }
+        }
+
+        read::token(self.reader, &wire::TOK_NOD)?;
+
+        Ok(Some(Entry {
+            name,
+            node: Node::new(&mut self.reader)?,
+        }))
+    }
+}
diff --git a/tvix/nix-compat/src/nar/reader/read.rs b/tvix/nix-compat/src/nar/reader/read.rs
new file mode 100644
index 000000000000..3c886dcfb87a
--- /dev/null
+++ b/tvix/nix-compat/src/nar/reader/read.rs
@@ -0,0 +1,109 @@
+//! Helpers for reading [crate::nar::wire] format.
+
+use std::io::{
+    self,
+    ErrorKind::{Interrupted, InvalidData, UnexpectedEof},
+};
+
+use super::Reader;
+use crate::nar::wire::Tag;
+
+/// Consume a little-endian [u64] from the reader.
+pub fn u64(reader: &mut Reader) -> io::Result<u64> {
+    let mut buf = [0; 8];
+    reader.read_exact(&mut buf)?;
+    Ok(u64::from_le_bytes(buf))
+}
+
+/// Consume a byte string of up to `max_len` bytes from the reader.
+pub fn bytes(reader: &mut Reader, max_len: usize) -> io::Result<Vec<u8>> {
+    assert!(max_len <= isize::MAX as usize);
+
+    // read the length, and reject excessively large values
+    let len = self::u64(reader)?;
+    if len > max_len as u64 {
+        return Err(InvalidData.into());
+    }
+    // we know the length fits in a usize now
+    let len = len as usize;
+
+    // read the data and padding into a buffer
+    let buf_len = (len + 7) & !7;
+    let mut buf = vec![0; buf_len];
+    reader.read_exact(&mut buf)?;
+
+    // verify that the padding is all zeroes
+    for b in buf.drain(len..) {
+        if b != 0 {
+            return Err(InvalidData.into());
+        }
+    }
+
+    Ok(buf)
+}
+
+/// Consume a known token from the reader.
+pub fn token<const N: usize>(reader: &mut Reader, token: &[u8; N]) -> io::Result<()> {
+    let mut buf = [0u8; N];
+
+    // This implements something similar to [Read::read_exact], but verifies that
+    // the input data matches the token while we read it. These two slices respectively
+    // represent the remaining token to be verified, and the remaining input buffer.
+    let mut token = &token[..];
+    let mut buf = &mut buf[..];
+
+    while !token.is_empty() {
+        match reader.read(buf) {
+            Ok(0) => {
+                return Err(UnexpectedEof.into());
+            }
+            Ok(n) => {
+                let (t, b);
+                (t, token) = token.split_at(n);
+                (b, buf) = buf.split_at_mut(n);
+
+                if t != b {
+                    return Err(InvalidData.into());
+                }
+            }
+            Err(e) => {
+                if e.kind() != Interrupted {
+                    return Err(e);
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Consume a [Tag] from the reader.
+pub fn tag<T: Tag>(reader: &mut Reader) -> io::Result<T> {
+    let mut buf = T::make_buf();
+    let buf = buf.as_mut();
+
+    // first read the known minimum length…
+    reader.read_exact(&mut buf[..T::MIN])?;
+
+    // then decide which tag we're expecting
+    let tag = T::from_u8(buf[T::OFF]).ok_or(InvalidData)?;
+    let (head, tail) = tag.as_bytes().split_at(T::MIN);
+
+    // make sure what we've read so far is valid
+    if buf[..T::MIN] != *head {
+        return Err(InvalidData.into());
+    }
+
+    // …then read the rest, if any
+    if !tail.is_empty() {
+        let rest = tail.len();
+        reader.read_exact(&mut buf[..rest])?;
+
+        // and make sure it's what we expect
+        if buf[..rest] != *tail {
+            return Err(InvalidData.into());
+        }
+    }
+
+    Ok(tag)
+}
diff --git a/tvix/nix-compat/src/nar/wire/mod.rs b/tvix/nix-compat/src/nar/wire/mod.rs
new file mode 100644
index 000000000000..fc8b8d3510aa
--- /dev/null
+++ b/tvix/nix-compat/src/nar/wire/mod.rs
@@ -0,0 +1,132 @@
+//! NAR wire format, without I/O details, since those differ between
+//! the synchronous and asynchronous implementations.
+//!
+//! The wire format is an S-expression format, encoded onto the wire
+//! using simple encoding rules.
+//!
+//! # Encoding
+//!
+//! Lengths are represented as 64-bit unsigned integers in little-endian
+//! format. Byte strings, including file contents and syntactic strings
+//! part of the grammar, are prefixed by their 64-bit length, and padded
+//! to 8-byte (64-bit) alignment with zero bytes. The zero-length string
+//! is therefore encoded as eight zero bytes representing its length.
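+//!
+//! For example, the 4-byte string `"type"` is encoded as its little-endian
+//! length, followed by the string itself and four bytes of zero padding,
+//! 16 bytes in total:
+//!
+//! ```plain
+//! 04 00 00 00 00 00 00 00  74 79 70 65 00 00 00 00
+//! ```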
+//!
+//! # Grammar
+//!
+//! The NAR grammar is as follows:
+//! ```plain
+//! archive ::= "nix-archive-1" node
+//!
+//! node ::= "(" "type" "symlink" "target" string ")"
+//!      ||= "(" "type" "regular" ("executable" "")? "contents" string ")"
+//!      ||= "(" "type" "directory" entry* ")"
+//!
+//! entry ::= "entry" "(" "name" string "node" node ")"
+//! ```
+//!
+//! We rewrite it to pull together the purely syntactic elements into
+//! unified tokens, producing an equivalent grammar that can be parsed
+//! and serialized more elegantly:
+//! ```plain
+//! archive ::= TOK_NAR node
+//! node ::= TOK_SYM string TOK_PAR
+//!      ||= (TOK_REG | TOK_EXE) string TOK_PAR
+//!      ||= TOK_DIR entry* TOK_PAR
+//!
+//! entry ::= TOK_ENT string TOK_NOD node TOK_PAR
+//!
+//! TOK_NAR ::= "nix-archive-1" "(" "type"
+//! TOK_SYM ::= "symlink" "target"
+//! TOK_REG ::= "regular" "contents"
+//! TOK_EXE ::= "regular" "executable" ""
+//! TOK_DIR ::= "directory"
+//! TOK_ENT ::= "entry" "(" "name"
+//! TOK_NOD ::= "node" "(" "type"
+//! TOK_PAR ::= ")"
+//! ```
+//!
+//! # Restrictions
+//!
+//! NOTE: These restrictions are not (and cannot be) enforced by this module,
+//! but must be enforced by its consumers, [super::reader] and [super::writer].
+//!
+//! Directory entry names cannot have the reserved names `.` and `..`, nor contain
+//! forward slashes. They must appear in strictly ascending lexicographic order
+//! within a directory, and can be at most [MAX_NAME_LEN] bytes in length.
+//!
+//! Symlink targets can be at most [MAX_TARGET_LEN] bytes in length.
+//!
+//! Neither is permitted to be empty, or contain null bytes.
+
+// These values are the standard Linux length limits
+/// Maximum length of a directory entry name
+pub const MAX_NAME_LEN: usize = 255;
+/// Maximum length of a symlink target
+pub const MAX_TARGET_LEN: usize = 4095;
+
+#[cfg(test)]
+fn token(xs: &[&str]) -> Vec<u8> {
+    let mut out = vec![];
+    for x in xs {
+        let len = x.len() as u64;
+        out.extend_from_slice(&len.to_le_bytes());
+        out.extend_from_slice(x.as_bytes());
+
+        let n = x.len() & 7;
+        if n != 0 {
+            const ZERO: [u8; 8] = [0; 8];
+            out.extend_from_slice(&ZERO[n..]);
+        }
+    }
+    out
+}
+
+pub const TOK_NAR: [u8; 56] = *b"\x0d\0\0\0\0\0\0\0nix-archive-1\0\0\0\x01\0\0\0\0\0\0\0(\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0type\0\0\0\0";
+pub const TOK_SYM: [u8; 32] = *b"\x07\0\0\0\0\0\0\0symlink\0\x06\0\0\0\0\0\0\0target\0\0";
+pub const TOK_REG: [u8; 32] = *b"\x07\0\0\0\0\0\0\0regular\0\x08\0\0\0\0\0\0\0contents";
+pub const TOK_EXE: [u8; 64] = *b"\x07\0\0\0\0\0\0\0regular\0\x0a\0\0\0\0\0\0\0executable\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0contents";
+pub const TOK_DIR: [u8; 24] = *b"\x09\0\0\0\0\0\0\0directory\0\0\0\0\0\0\0";
+pub const TOK_ENT: [u8; 48] = *b"\x05\0\0\0\0\0\0\0entry\0\0\0\x01\0\0\0\0\0\0\0(\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0name\0\0\0\0";
+pub const TOK_NOD: [u8; 48] = *b"\x04\0\0\0\0\0\0\0node\0\0\0\0\x01\0\0\0\0\0\0\0(\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0type\0\0\0\0";
+pub const TOK_PAR: [u8; 16] = *b"\x01\0\0\0\0\0\0\0)\0\0\0\0\0\0\0";
+
+#[test]
+fn tokens() {
+    let cases: &[(&[u8], &[&str])] = &[
+        (&TOK_NAR, &["nix-archive-1", "(", "type"]),
+        (&TOK_SYM, &["symlink", "target"]),
+        (&TOK_REG, &["regular", "contents"]),
+        (&TOK_EXE, &["regular", "executable", "", "contents"]),
+        (&TOK_DIR, &["directory"]),
+        (&TOK_ENT, &["entry", "(", "name"]),
+        (&TOK_NOD, &["node", "(", "type"]),
+        (&TOK_PAR, &[")"]),
+    ];
+
+    for &(tok, xs) in cases {
+        assert_eq!(tok, token(xs));
+    }
+}
+
+pub use tag::Tag;
+mod tag;
+
+tag::make! {
+    /// These are the node tokens, succeeding [TOK_NAR] or [TOK_NOD],
+    /// and preceding the next variable-length element.
+    pub enum Node[16] {
+        Sym = TOK_SYM,
+        Reg = TOK_REG,
+        Exe = TOK_EXE,
+        Dir = TOK_DIR,
+    }
+
+    /// Directory entry or terminator
+    pub enum Entry[0] {
+        /// End of directory
+        None = TOK_PAR,
+        /// Directory entry, which must be followed by [Node]
+        Some = TOK_ENT,
+    }
+}
diff --git a/tvix/nix-compat/src/nar/wire/tag.rs b/tvix/nix-compat/src/nar/wire/tag.rs
new file mode 100644
index 000000000000..55ce4b4a2a87
--- /dev/null
+++ b/tvix/nix-compat/src/nar/wire/tag.rs
@@ -0,0 +1,165 @@
+/// A type implementing Tag represents a static hash set of byte strings,
+/// with a very simple perfect hash function: every element has a unique
+/// discriminant at a common byte offset. The values of the type represent
+/// the members by this single discriminant byte; they are indices into the
+/// hash set.
+pub trait Tag: Sized {
+    /// Discriminant offset
+    const OFF: usize;
+    /// Minimum variant length
+    const MIN: usize;
+
+    /// Minimal suitably sized buffer for reading the wire representation
+    /// HACK: This is a workaround for const generics limitations.
+    type Buf: AsMut<[u8]> + Send;
+
+    /// Make an instance of [Self::Buf]
+    fn make_buf() -> Self::Buf;
+
+    /// Convert a discriminant into the corresponding variant
+    fn from_u8(x: u8) -> Option<Self>;
+
+    /// Convert a variant back into the wire representation
+    fn as_bytes(&self) -> &'static [u8];
+}
+
+/// Generate an enum implementing [Tag], enforcing at compile time that
+/// the discriminant values are distinct.
+macro_rules! make {
+    (
+        $(
+            $(#[doc = $doc:expr])*
+            $vis:vis enum $Enum:ident[$off:expr] {
+                $(
+                    $(#[doc = $var_doc:expr])*
+                    $Var:ident = $TOK:ident,
+                )+
+            }
+        )*
+    ) => {
+        $(
+            $(#[doc = $doc])*
+            #[derive(Debug, PartialEq, Eq)]
+            #[repr(u8)]
+            $vis enum $Enum {
+                $(
+                    $(#[doc = $var_doc])*
+                    $Var = $TOK[$Enum::OFF]
+                ),+
+            }
+
+            impl Tag for $Enum {
+                /// Discriminant offset
+                const OFF: usize = $off;
+                /// Minimum variant length
+                const MIN: usize = tag::min_of(&[$($TOK.len()),+]);
+
+                /// Minimal suitably sized buffer for reading the wire representation
+                type Buf = [u8; tag::buf_of(&[$($TOK.len()),+])];
+
+                /// Make an instance of [Self::Buf]
+                #[inline(always)]
+                fn make_buf() -> Self::Buf {
+                    [0u8; tag::buf_of(&[$($TOK.len()),+])]
+                }
+
+                /// Convert a discriminant into the corresponding variant
+                #[inline(always)]
+                fn from_u8(x: u8) -> Option<Self> {
+                    #[allow(non_upper_case_globals)]
+                    mod __variant {
+                        $(
+                            pub const $Var: u8 = super::$Enum::$Var as u8;
+                        )+
+                    }
+
+                    match x {
+                        $(__variant::$Var => Some(Self::$Var),)+
+                        _ => None
+                    }
+                }
+
+                /// Convert a variant back into the wire representation
+                #[inline(always)]
+                fn as_bytes(&self) -> &'static [u8] {
+                    match self {
+                        $(Self::$Var => &$TOK,)+
+                    }
+                }
+            }
+        )*
+    };
+}
+
+pub(crate) use make;
+
+#[cfg(test)]
+mod test {
+    use super::super::tag::{self, Tag};
+
+    const TOK_A: [u8; 3] = [0xed, 0xef, 0x1c];
+    const TOK_B: [u8; 3] = [0xed, 0xf0, 0x1c];
+
+    const OFFSET: usize = 1;
+
+    make! {
+        enum Token[OFFSET] {
+            A = TOK_A,
+            B = TOK_B,
+        }
+    }
+
+    #[test]
+    fn example() {
+        assert_eq!(Token::from_u8(0xed), None);
+
+        let tag = Token::from_u8(0xef).unwrap();
+        assert_eq!(tag.as_bytes(), &TOK_A[..]);
+
+        let tag = Token::from_u8(0xf0).unwrap();
+        assert_eq!(tag.as_bytes(), &TOK_B[..]);
+    }
+}
+
+// The following functions are written somewhat unusually,
+// since they're const functions that cannot use iterators.
+
+/// Maximum element of a slice
+const fn max_of(mut xs: &[usize]) -> usize {
+    let mut y = usize::MIN;
+    while let &[x, ref tail @ ..] = xs {
+        y = if x > y { x } else { y };
+        xs = tail;
+    }
+    y
+}
+
+/// Minimum element of a slice
+pub const fn min_of(mut xs: &[usize]) -> usize {
+    let mut y = usize::MAX;
+    while let &[x, ref tail @ ..] = xs {
+        y = if x < y { x } else { y };
+        xs = tail;
+    }
+    y
+}
+
+/// Minimum buffer size to contain either of `0..Tag::MIN` and `Tag::MIN..`
+/// at a particular time, for all possible tag wire representations, given
+/// the sizes of all wire representations.
+///
+/// # Example
+///
+/// ```plain
+/// OFF = 16
+/// MIN = 24
+/// MAX = 64
+///
+/// BUF = max(MIN, MAX-MIN)
+///     = max(24, 64-24)
+///     = max(24, 40)
+///     = 40
+/// ```
+pub const fn buf_of(xs: &[usize]) -> usize {
+    max_of(&[min_of(xs), max_of(xs) - min_of(xs)])
+}
diff --git a/tvix/nix-compat/src/nar/writer/async.rs b/tvix/nix-compat/src/nar/writer/async.rs
index 2a77fa736677..2b73e56737ce 100644
--- a/tvix/nix-compat/src/nar/writer/async.rs
+++ b/tvix/nix-compat/src/nar/writer/async.rs
@@ -30,7 +30,7 @@
 //! # });
 //! ```
 
-use super::wire;
+use crate::nar::wire;
 use bstr::ByteSlice;
 use futures_util::{AsyncBufRead, AsyncBufReadExt, AsyncWrite, AsyncWriteExt};
 use std::{
diff --git a/tvix/nix-compat/src/nar/writer/mod.rs b/tvix/nix-compat/src/nar/writer/mod.rs
index 79f78f37fc40..bf81ccd4df32 100644
--- a/tvix/nix-compat/src/nar/writer/mod.rs
+++ b/tvix/nix-compat/src/nar/writer/mod.rs
@@ -1,7 +1,5 @@
 pub use sync::*;
 
-mod wire;
-
 pub mod sync;
 
 #[cfg(feature = "async")]
diff --git a/tvix/nix-compat/src/nar/writer/sync.rs b/tvix/nix-compat/src/nar/writer/sync.rs
index 996815149dd6..9290e4a5bdfe 100644
--- a/tvix/nix-compat/src/nar/writer/sync.rs
+++ b/tvix/nix-compat/src/nar/writer/sync.rs
@@ -28,7 +28,7 @@
 //! # Ok::<(), std::io::Error>(())
 //! ```
 
-use super::wire;
+use crate::nar::wire;
 use bstr::ByteSlice;
 use std::io::{
     self, BufRead,
diff --git a/tvix/nix-compat/src/nar/writer/wire.rs b/tvix/nix-compat/src/nar/writer/wire.rs
deleted file mode 100644
index a6c19f0759c3..000000000000
--- a/tvix/nix-compat/src/nar/writer/wire.rs
+++ /dev/null
@@ -1,110 +0,0 @@
-//! NAR wire format, without I/O details, since those differ between
-//! the synchronous and asynchronous implementations.
-//!
-//! The wire format is an S-expression format, encoded onto the wire
-//! using simple encoding rules.
-//!
-//! # Encoding
-//!
-//! Lengths are represented as 64-bit unsigned integers in little-endian
-//! format. Byte strings, including file contents and syntactic strings
-//! part of the grammar, are prefixed by their 64-bit length, and padded
-//! to 8-byte (64-bit) alignment with zero bytes. The zero-length string
-//! is therefore encoded as eight zero bytes representing its length.
-//!
-//! # Grammar
-//!
-//! The NAR grammar is as follows:
-//! ```plain
-//! archive ::= "nix-archive-1" node
-//!
-//! node ::= "(" "type" "symlink" "target" string ")"
-//!      ||= "(" "type" "regular" ("executable" "")? "contents" string ")"
-//!      ||= "(" "type" "directory" entry* ")"
-//!
-//! entry ::= "entry" "(" "name" string "node" node ")"
-//! ```
-//!
-//! We rewrite it to pull together the purely syntactic elements into
-//! unified tokens, producing an equivalent grammar that can be parsed
-//! and serialized more elegantly:
-//! ```plain
-//! archive ::= TOK_NAR node
-//! node ::= TOK_SYM string TOK_PAR
-//!      ||= (TOK_REG | TOK_EXE) string TOK_PAR
-//!      ||= TOK_DIR entry* TOK_PAR
-//!
-//! entry ::= TOK_ENT string TOK_NOD node TOK_PAR
-//!
-//! TOK_NAR ::= "nix-archive-1" "(" "type"
-//! TOK_SYM ::= "symlink" "target"
-//! TOK_REG ::= "regular" "contents"
-//! TOK_EXE ::= "regular" "executable" ""
-//! TOK_DIR ::= "directory"
-//! TOK_ENT ::= "entry" "(" "name"
-//! TOK_NOD ::= "node" "(" "type"
-//! TOK_PAR ::= ")"
-//! ```
-//!
-//! # Restrictions
-//!
-//! NOTE: These restrictions are not (and cannot be) enforced by this module,
-//! but must be enforced by its consumers, [super::reader] and [super::writer].
-//!
-//! Directory entry names cannot have the reserved names `.` and `..`, nor contain
-//! forward slashes. They must appear in strictly ascending lexicographic order
-//! within a directory, and can be at most [MAX_NAME_LEN] bytes in length.
-//!
-//! Symlink targets can be at most [MAX_TARGET_LEN] bytes in length.
-//!
-//! Neither is permitted to be empty, or contain null bytes.
-
-// These values are the standard Linux length limits
-/// Maximum length of a directory entry name
-pub const MAX_NAME_LEN: usize = 255;
-/// Maximum length of a symlink target
-pub const MAX_TARGET_LEN: usize = 4095;
-
-#[cfg(test)]
-fn token(xs: &[&str]) -> Vec<u8> {
-    let mut out = vec![];
-    for x in xs {
-        let len = x.len() as u64;
-        out.extend_from_slice(&len.to_le_bytes());
-        out.extend_from_slice(x.as_bytes());
-
-        let n = x.len() & 7;
-        if n != 0 {
-            const ZERO: [u8; 8] = [0; 8];
-            out.extend_from_slice(&ZERO[n..]);
-        }
-    }
-    out
-}
-
-pub const TOK_NAR: [u8; 56] = *b"\x0d\0\0\0\0\0\0\0nix-archive-1\0\0\0\x01\0\0\0\0\0\0\0(\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0type\0\0\0\0";
-pub const TOK_SYM: [u8; 32] = *b"\x07\0\0\0\0\0\0\0symlink\0\x06\0\0\0\0\0\0\0target\0\0";
-pub const TOK_REG: [u8; 32] = *b"\x07\0\0\0\0\0\0\0regular\0\x08\0\0\0\0\0\0\0contents";
-pub const TOK_EXE: [u8; 64] = *b"\x07\0\0\0\0\0\0\0regular\0\x0a\0\0\0\0\0\0\0executable\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0contents";
-pub const TOK_DIR: [u8; 24] = *b"\x09\0\0\0\0\0\0\0directory\0\0\0\0\0\0\0";
-pub const TOK_ENT: [u8; 48] = *b"\x05\0\0\0\0\0\0\0entry\0\0\0\x01\0\0\0\0\0\0\0(\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0name\0\0\0\0";
-pub const TOK_NOD: [u8; 48] = *b"\x04\0\0\0\0\0\0\0node\0\0\0\0\x01\0\0\0\0\0\0\0(\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0type\0\0\0\0";
-pub const TOK_PAR: [u8; 16] = *b"\x01\0\0\0\0\0\0\0)\0\0\0\0\0\0\0";
-
-#[test]
-fn tokens() {
-    let cases: &[(&[u8], &[&str])] = &[
-        (&TOK_NAR, &["nix-archive-1", "(", "type"]),
-        (&TOK_SYM, &["symlink", "target"]),
-        (&TOK_REG, &["regular", "contents"]),
-        (&TOK_EXE, &["regular", "executable", "", "contents"]),
-        (&TOK_DIR, &["directory"]),
-        (&TOK_ENT, &["entry", "(", "name"]),
-        (&TOK_NOD, &["node", "(", "type"]),
-        (&TOK_PAR, &[")"]),
-    ];
-
-    for &(tok, xs) in cases {
-        assert_eq!(tok, token(xs));
-    }
-}
-- 
cgit 1.4.1
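Below is a minimal usage sketch (illustrative only, not part of the change itself) of how the new reader API might be driven from a crate that depends on `nix_compat`. It hand-assembles the NAR serialization of a single symlink — the byte strings mirror the `TOK_NAR`, `TOK_SYM` and `TOK_PAR` constants defined in `nar::wire` — then opens it with `nar::reader::open` and walks the resulting node while observing the documented contracts: file contents are drained in full, and directory iterators are consumed until they yield `None`. The `walk` helper and the `/dev/null` target are made up for the example.

```rust
use std::io::{self, Cursor, Read};

use nix_compat::nar::reader::{self, Node};

/// Recursively walk a NAR node, honouring the reader's contract:
/// file contents are read to the end, and directory readers are
/// driven until they return `None`.
fn walk(node: Node, depth: usize) -> io::Result<()> {
    let indent = "  ".repeat(depth);
    match node {
        Node::Symlink { target } => {
            println!("{indent}symlink -> {}", String::from_utf8_lossy(&target));
        }
        Node::File { executable, mut reader } => {
            // The entire file must be consumed before moving on.
            let mut contents = Vec::new();
            reader.read_to_end(&mut contents)?;
            println!("{indent}file, {} bytes, executable: {executable}", contents.len());
        }
        Node::Directory(mut dir) => {
            println!("{indent}directory");
            // Entries must be read until `next` yields `None`.
            while let Some(entry) = dir.next()? {
                println!("{indent}  {}", String::from_utf8_lossy(&entry.name));
                walk(entry.node, depth + 1)?;
            }
        }
    }
    Ok(())
}

fn main() -> io::Result<()> {
    // A tiny, hand-assembled NAR: a single symlink pointing at /dev/null.
    // The byte strings below mirror TOK_NAR, TOK_SYM and TOK_PAR.
    let mut nar = Vec::new();
    nar.extend_from_slice(b"\x0d\0\0\0\0\0\0\0nix-archive-1\0\0\0\x01\0\0\0\0\0\0\0(\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0type\0\0\0\0");
    nar.extend_from_slice(b"\x07\0\0\0\0\0\0\0symlink\0\x06\0\0\0\0\0\0\0target\0\0");
    nar.extend_from_slice(b"\x09\0\0\0\0\0\0\0/dev/null\0\0\0\0\0\0\0"); // "/dev/null", padded to 16 bytes
    nar.extend_from_slice(b"\x01\0\0\0\0\0\0\0)\0\0\0\0\0\0\0");

    let mut cursor = Cursor::new(nar);
    let root = reader::open(&mut cursor)?;
    walk(root, 0)
}
```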