From 08b98b7503edbde5277ce11faf48b94ca2f1c0b7 Mon Sep 17 00:00:00 2001 From: edef Date: Tue, 17 Oct 2023 22:29:33 +0000 Subject: docs(tvix/nix-compat/nar): document the wire format Change-Id: I6c8e23bad27fa6ada1b8973482b4d99190cf050d Reviewed-on: https://cl.tvl.fyi/c/depot/+/9767 Tested-by: BuildkiteCI Reviewed-by: flokli Reviewed-by: raitobezarius --- tvix/nix-compat/src/nar/writer/wire.rs | 68 +++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/tvix/nix-compat/src/nar/writer/wire.rs b/tvix/nix-compat/src/nar/writer/wire.rs index 98581ae3aa7c..a6c19f0759c3 100644 --- a/tvix/nix-compat/src/nar/writer/wire.rs +++ b/tvix/nix-compat/src/nar/writer/wire.rs @@ -1,4 +1,68 @@ +//! NAR wire format, without I/O details, since those differ between +//! the synchronous and asynchronous implementations. +//! +//! The wire format is an S-expression format, encoded onto the wire +//! using simple encoding rules. +//! +//! # Encoding +//! +//! Lengths are represented as 64-bit unsigned integers in little-endian +//! format. Byte strings, including file contents and syntactic strings +//! part of the grammar, are prefixed by their 64-bit length, and padded +//! to 8-byte (64-bit) alignment with zero bytes. The zero-length string +//! is therefore encoded as eight zero bytes representing its length. +//! +//! # Grammar +//! +//! The NAR grammar is as follows: +//! ```plain +//! archive ::= "nix-archive-1" node +//! +//! node ::= "(" "type" "symlink" "target" string ")" +//! ||= "(" "type" "regular" ("executable" "")? "contents" string ")" +//! ||= "(" "type" "directory" entry* ")" +//! +//! entry ::= "entry" "(" "name" string "node" node ")" +//! ``` +//! +//! We rewrite it to pull together the purely syntactic elements into +//! unified tokens, producing an equivalent grammar that can be parsed +//! and serialized more elegantly: +//! ```plain +//! archive ::= TOK_NAR node +//! node ::= TOK_SYM string TOK_PAR +//! 
||= (TOK_REG | TOK_EXE) string TOK_PAR +//! ||= TOK_DIR entry* TOK_PAR +//! +//! entry ::= TOK_ENT string TOK_NOD node TOK_PAR +//! +//! TOK_NAR ::= "nix-archive-1" "(" "type" +//! TOK_SYM ::= "symlink" "target" +//! TOK_REG ::= "regular" "contents" +//! TOK_EXE ::= "regular" "executable" "" "contents" +//! TOK_DIR ::= "directory" +//! TOK_ENT ::= "entry" "(" "name" +//! TOK_NOD ::= "node" "(" "type" +//! TOK_PAR ::= ")" +//! ``` +//! +//! # Restrictions +//! +//! NOTE: These restrictions are not (and cannot be) enforced by this module, +//! but must be enforced by its consumers, [super::reader] and [super::writer]. +//! +//! Directory entry names cannot have the reserved names `.` and `..`, nor contain +//! forward slashes. They must appear in strictly ascending lexicographic order +//! within a directory, and can be at most [MAX_NAME_LEN] bytes in length. +//! +//! Symlink targets can be at most [MAX_TARGET_LEN] bytes in length. +//! +//! Neither is permitted to be empty, or contain null bytes. + +// These values are the standard Linux length limits +/// Maximum length of a directory entry name pub const MAX_NAME_LEN: usize = 255; +/// Maximum length of a symlink target pub const MAX_TARGET_LEN: usize = 4095; #[cfg(test)] @@ -19,9 +83,9 @@ fn token(xs: &[&str]) -> Vec<u8> { } pub const TOK_NAR: [u8; 56] = *b"\x0d\0\0\0\0\0\0\0nix-archive-1\0\0\0\x01\0\0\0\0\0\0\0(\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0type\0\0\0\0"; +pub const TOK_SYM: [u8; 32] = *b"\x07\0\0\0\0\0\0\0symlink\0\x06\0\0\0\0\0\0\0target\0\0"; pub const TOK_REG: [u8; 32] = *b"\x07\0\0\0\0\0\0\0regular\0\x08\0\0\0\0\0\0\0contents"; pub const TOK_EXE: [u8; 64] = *b"\x07\0\0\0\0\0\0\0regular\0\x0a\0\0\0\0\0\0\0executable\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0contents"; -pub const TOK_SYM: [u8; 32] = *b"\x07\0\0\0\0\0\0\0symlink\0\x06\0\0\0\0\0\0\0target\0\0"; pub const TOK_DIR: [u8; 24] = *b"\x09\0\0\0\0\0\0\0directory\0\0\0\0\0\0\0"; pub const TOK_ENT: [u8; 48] = 
*b"\x05\0\0\0\0\0\0\0entry\0\0\0\x01\0\0\0\0\0\0\0(\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0name\0\0\0\0"; pub const TOK_NOD: [u8; 48] = *b"\x04\0\0\0\0\0\0\0node\0\0\0\0\x01\0\0\0\0\0\0\0(\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0type\0\0\0\0"; @@ -31,9 +95,9 @@ pub const TOK_PAR: [u8; 16] = *b"\x01\0\0\0\0\0\0\0)\0\0\0\0\0\0\0"; fn tokens() { let cases: &[(&[u8], &[&str])] = &[ (&TOK_NAR, &["nix-archive-1", "(", "type"]), + (&TOK_SYM, &["symlink", "target"]), (&TOK_REG, &["regular", "contents"]), (&TOK_EXE, &["regular", "executable", "", "contents"]), - (&TOK_SYM, &["symlink", "target"]), (&TOK_DIR, &["directory"]), (&TOK_ENT, &["entry", "(", "name"]), (&TOK_NOD, &["node", "(", "type"]), -- cgit 1.4.1