about summary refs log tree commit diff
path: root/tvix/nix-compat/src/nar/wire
diff options
context:
space:
mode:
authoredef <edef@edef.eu>2023-10-15T14·59+0000
committeredef <edef@edef.eu>2023-10-18T11·40+0000
commit6638f4d4ea6848fe6fc2dc89271f0b6950764729 (patch)
tree470e909133069a49403298d182972c9fff8c7a82 /tvix/nix-compat/src/nar/wire
parent08b98b7503edbde5277ce11faf48b94ca2f1c0b7 (diff)
feat(tvix/nix-compat): NAR reader r/6853
Change-Id: I50d51baf62c0419eaf17f0dc262f728aaff9794d
Reviewed-on: https://cl.tvl.fyi/c/depot/+/9688
Reviewed-by: raitobezarius <tvl@lahfa.xyz>
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
Diffstat (limited to 'tvix/nix-compat/src/nar/wire')
-rw-r--r--tvix/nix-compat/src/nar/wire/mod.rs132
-rw-r--r--tvix/nix-compat/src/nar/wire/tag.rs165
2 files changed, 297 insertions, 0 deletions
diff --git a/tvix/nix-compat/src/nar/wire/mod.rs b/tvix/nix-compat/src/nar/wire/mod.rs
new file mode 100644
index 000000000000..fc8b8d3510aa
--- /dev/null
+++ b/tvix/nix-compat/src/nar/wire/mod.rs
@@ -0,0 +1,132 @@
+//! NAR wire format, without I/O details, since those differ between
+//! the synchronous and asynchronous implementations.
+//!
+//! The wire format is an S-expression format, encoded onto the wire
+//! using simple encoding rules.
+//!
+//! # Encoding
+//!
+//! Lengths are represented as 64-bit unsigned integers in little-endian
+//! format. Byte strings, including file contents and syntactic strings
+//! part of the grammar, are prefixed by their 64-bit length, and padded
+//! to 8-byte (64-bit) alignment with zero bytes. The zero-length string
+//! is therefore encoded as eight zero bytes representing its length.
+//!
+//! # Grammar
+//!
+//! The NAR grammar is as follows:
+//! ```plain
+//! archive ::= "nix-archive-1" node
+//!
+//! node ::= "(" "type" "symlink" "target" string ")"
+//!      ||= "(" "type" "regular" ("executable" "")? "contents" string ")"
+//!      ||= "(" "type" "directory" entry* ")"
+//!
+//! entry ::= "entry" "(" "name" string "node" node ")"
+//! ```
+//!
+//! We rewrite it to pull together the purely syntactic elements into
+//! unified tokens, producing an equivalent grammar that can be parsed
+//! and serialized more elegantly:
+//! ```plain
+//! archive ::= TOK_NAR node
+//! node ::= TOK_SYM string             TOK_PAR
+//!      ||= (TOK_REG | TOK_EXE) string TOK_PAR
+//!      ||= TOK_DIR entry*             TOK_PAR
+//!
+//! entry ::= TOK_ENT string TOK_NOD node TOK_PAR
+//!
+//! TOK_NAR ::= "nix-archive-1" "(" "type"
+//! TOK_SYM ::= "symlink" "target"
+//! TOK_REG ::= "regular" "contents"
+//! TOK_EXE ::= "regular" "executable" ""
+//! TOK_DIR ::= "directory"
+//! TOK_ENT ::= "entry" "(" "name"
+//! TOK_NOD ::= "node" "(" "type"
+//! TOK_PAR ::= ")"
+//! ```
+//!
+//! # Restrictions
+//!
+//! NOTE: These restrictions are not (and cannot be) enforced by this module,
+//! but must be enforced by its consumers, [super::reader] and [super::writer].
+//!
+//! Directory entry names cannot have the reserved names `.` and `..`, nor contain
+//! forward slashes. They must appear in strictly ascending lexicographic order
+//! within a directory, and can be at most [MAX_NAME_LEN] bytes in length.
+//!
+//! Symlink targets can be at most [MAX_TARGET_LEN] bytes in length.
+//!
+//! Neither is permitted to be empty, or contain null bytes.
+
+// These values are the standard Linux length limits
+/// Maximum length of a directory entry name
+pub const MAX_NAME_LEN: usize = 255;
+/// Maximum length of a symlink target
+pub const MAX_TARGET_LEN: usize = 4095;
+
+#[cfg(test)]
+fn token(xs: &[&str]) -> Vec<u8> {
+    let mut out = vec![];
+    for x in xs {
+        let len = x.len() as u64;
+        out.extend_from_slice(&len.to_le_bytes());
+        out.extend_from_slice(x.as_bytes());
+
+        let n = x.len() & 7;
+        if n != 0 {
+            const ZERO: [u8; 8] = [0; 8];
+            out.extend_from_slice(&ZERO[n..]);
+        }
+    }
+    out
+}
+
+pub const TOK_NAR: [u8; 56] = *b"\x0d\0\0\0\0\0\0\0nix-archive-1\0\0\0\x01\0\0\0\0\0\0\0(\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0type\0\0\0\0";
+pub const TOK_SYM: [u8; 32] = *b"\x07\0\0\0\0\0\0\0symlink\0\x06\0\0\0\0\0\0\0target\0\0";
+pub const TOK_REG: [u8; 32] = *b"\x07\0\0\0\0\0\0\0regular\0\x08\0\0\0\0\0\0\0contents";
+pub const TOK_EXE: [u8; 64] = *b"\x07\0\0\0\0\0\0\0regular\0\x0a\0\0\0\0\0\0\0executable\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0contents";
+pub const TOK_DIR: [u8; 24] = *b"\x09\0\0\0\0\0\0\0directory\0\0\0\0\0\0\0";
+pub const TOK_ENT: [u8; 48] = *b"\x05\0\0\0\0\0\0\0entry\0\0\0\x01\0\0\0\0\0\0\0(\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0name\0\0\0\0";
+pub const TOK_NOD: [u8; 48] = *b"\x04\0\0\0\0\0\0\0node\0\0\0\0\x01\0\0\0\0\0\0\0(\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0type\0\0\0\0";
+pub const TOK_PAR: [u8; 16] = *b"\x01\0\0\0\0\0\0\0)\0\0\0\0\0\0\0";
+
+#[test]
+fn tokens() {
+    let cases: &[(&[u8], &[&str])] = &[
+        (&TOK_NAR, &["nix-archive-1", "(", "type"]),
+        (&TOK_SYM, &["symlink", "target"]),
+        (&TOK_REG, &["regular", "contents"]),
+        (&TOK_EXE, &["regular", "executable", "", "contents"]),
+        (&TOK_DIR, &["directory"]),
+        (&TOK_ENT, &["entry", "(", "name"]),
+        (&TOK_NOD, &["node", "(", "type"]),
+        (&TOK_PAR, &[")"]),
+    ];
+
+    for &(tok, xs) in cases {
+        assert_eq!(tok, token(xs));
+    }
+}
+
+pub use tag::Tag;
+mod tag;
+
+tag::make! {
+    /// These are the node tokens, succeeding [TOK_NAR] or [TOK_NOD],
+    /// and preceding the next variable-length element.
+    pub enum Node[16] {
+        Sym = TOK_SYM,
+        Reg = TOK_REG,
+        Exe = TOK_EXE,
+        Dir = TOK_DIR,
+    }
+
+    /// Directory entry or terminator
+    pub enum Entry[0] {
+        /// End of directory
+        None = TOK_PAR,
+        /// Directory entry, which must be followed by [Node]
+        Some = TOK_ENT,
+    }
+}
diff --git a/tvix/nix-compat/src/nar/wire/tag.rs b/tvix/nix-compat/src/nar/wire/tag.rs
new file mode 100644
index 000000000000..55ce4b4a2a87
--- /dev/null
+++ b/tvix/nix-compat/src/nar/wire/tag.rs
@@ -0,0 +1,165 @@
+/// A type implementing Tag represents a static hash set of byte strings,
+/// with a very simple perfect hash function: every element has a unique
+/// discriminant at a common byte offset. The values of the type represent
+/// the members by this single discriminant byte; they are indices into the
+/// hash set.
+pub trait Tag: Sized {
+    /// Discriminant offset
+    const OFF: usize;
+    /// Minimum variant length
+    const MIN: usize;
+
+    /// Minimal suitably sized buffer for reading the wire representation
+    /// HACK: This is a workaround for const generics limitations.
+    type Buf: AsMut<[u8]> + Send;
+
+    /// Make an instance of [Self::Buf]
+    fn make_buf() -> Self::Buf;
+
+    /// Convert a discriminant into the corresponding variant
+    fn from_u8(x: u8) -> Option<Self>;
+
+    /// Convert a variant back into the wire representation
+    fn as_bytes(&self) -> &'static [u8];
+}
+
+/// Generate an enum implementing [Tag], enforcing at compile time that
+/// the discriminant values are distinct.
+macro_rules! make {
+    (
+        $(
+            $(#[doc = $doc:expr])*
+            $vis:vis enum $Enum:ident[$off:expr] {
+                $(
+                    $(#[doc = $var_doc:expr])*
+                    $Var:ident = $TOK:ident,
+                )+
+            }
+        )*
+    ) => {
+        $(
+            $(#[doc = $doc])*
+            #[derive(Debug, PartialEq, Eq)]
+            #[repr(u8)]
+            $vis enum $Enum {
+                $(
+                    $(#[doc = $var_doc])*
+                    $Var = $TOK[$Enum::OFF]
+                ),+
+            }
+
+            impl Tag for $Enum {
+                /// Discriminant offset
+                const OFF: usize = $off;
+                /// Minimum variant length
+                const MIN: usize = tag::min_of(&[$($TOK.len()),+]);
+
+                /// Minimal suitably sized buffer for reading the wire representation
+                type Buf = [u8; tag::buf_of(&[$($TOK.len()),+])];
+
+                /// Make an instance of [Self::Buf]
+                #[inline(always)]
+                fn make_buf() -> Self::Buf {
+                    [0u8; tag::buf_of(&[$($TOK.len()),+])]
+                }
+
+                /// Convert a discriminant into the corresponding variant
+                #[inline(always)]
+                fn from_u8(x: u8) -> Option<Self> {
+                    #[allow(non_upper_case_globals)]
+                    mod __variant {
+                        $(
+                            pub const $Var: u8 = super::$Enum::$Var as u8;
+                        )+
+                    }
+
+                    match x {
+                        $(__variant::$Var => Some(Self::$Var),)+
+                        _ => None
+                    }
+                }
+
+                /// Convert a variant back into the wire representation
+                #[inline(always)]
+                fn as_bytes(&self) -> &'static [u8] {
+                    match self {
+                        $(Self::$Var => &$TOK,)+
+                    }
+                }
+            }
+        )*
+    };
+}
+
+pub(crate) use make;
+
+#[cfg(test)]
+mod test {
+    use super::super::tag::{self, Tag};
+
+    const TOK_A: [u8; 3] = [0xed, 0xef, 0x1c];
+    const TOK_B: [u8; 3] = [0xed, 0xf0, 0x1c];
+
+    const OFFSET: usize = 1;
+
+    make! {
+        enum Token[OFFSET] {
+            A = TOK_A,
+            B = TOK_B,
+        }
+    }
+
+    #[test]
+    fn example() {
+        assert_eq!(Token::from_u8(0xed), None);
+
+        let tag = Token::from_u8(0xef).unwrap();
+        assert_eq!(tag.as_bytes(), &TOK_A[..]);
+
+        let tag = Token::from_u8(0xf0).unwrap();
+        assert_eq!(tag.as_bytes(), &TOK_B[..]);
+    }
+}
+
+// The following functions are written somewhat unusually,
+// since they're const functions that cannot use iterators.
+
+/// Maximum element of a slice
+const fn max_of(mut xs: &[usize]) -> usize {
+    let mut y = usize::MIN;
+    while let &[x, ref tail @ ..] = xs {
+        y = if x > y { x } else { y };
+        xs = tail;
+    }
+    y
+}
+
+/// Minimum element of a slice
+pub const fn min_of(mut xs: &[usize]) -> usize {
+    let mut y = usize::MAX;
+    while let &[x, ref tail @ ..] = xs {
+        y = if x < y { x } else { y };
+        xs = tail;
+    }
+    y
+}
+
+/// Minimum buffer size to contain either of `0..Tag::MIN` and `Tag::MIN..`
+/// at a particular time, for all possible tag wire representations, given
+/// the sizes of all wire representations.
+///
+/// # Example
+///
+/// ```plain
+/// OFF = 16
+/// MIN = 24
+/// MAX = 64
+///
+/// BUF = max(MIN, MAX-MIN)
+///     = max(24, 64-24)
+///     = max(24, 40)
+///     = 40
+/// ```
+pub const fn buf_of(xs: &[usize]) -> usize {
+    max_of(&[min_of(xs), max_of(xs) - min_of(xs)])
+}