From 2410f2292f53a17242ed54b0af2d7b04ec3173f6 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Mon, 31 Jul 2023 15:46:39 +0200 Subject: feat(nix-compat/{aterm,derivation}): init parser This provides a nom-based parser for Nix derivations in ATerm format, which can be reached via `Derivation::from_aterm_bytes`. Some of the lower-level ATerm primitives are moved into a (new) aterm module, and some more higher-level ones that construct derivation-specific types. Also, move the escape_bytes function into there, as this is a generic ATerm thing. Change-Id: I2b03b8a1461c7ea2fcb8640c2fc3d1fa3ea719fb Reviewed-on: https://cl.tvl.fyi/c/depot/+/9730 Autosubmit: flokli Reviewed-by: raitobezarius Tested-by: BuildkiteCI --- tvix/nix-compat/src/aterm/escape.rs | 27 ++++++++ tvix/nix-compat/src/aterm/mod.rs | 7 ++ tvix/nix-compat/src/aterm/parser.rs | 123 ++++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+) create mode 100644 tvix/nix-compat/src/aterm/escape.rs create mode 100644 tvix/nix-compat/src/aterm/mod.rs create mode 100644 tvix/nix-compat/src/aterm/parser.rs (limited to 'tvix/nix-compat/src/aterm') diff --git a/tvix/nix-compat/src/aterm/escape.rs b/tvix/nix-compat/src/aterm/escape.rs new file mode 100644 index 000000000000..06b550bbf02d --- /dev/null +++ b/tvix/nix-compat/src/aterm/escape.rs @@ -0,0 +1,27 @@ +use bstr::ByteSlice; + +/// Escapes a byte sequence. Does not add surrounding quotes.
+pub fn escape_bytes<P: AsRef<[u8]>>(s: P) -> Vec<u8> {
+    let mut s: Vec<u8> = s.as_ref().to_vec();
+
+    s = s.replace(b"\\", b"\\\\"); // backslash first: the passes below insert backslashes
+    s = s.replace(b"\n", b"\\n");
+    s = s.replace(b"\r", b"\\r");
+    s = s.replace(b"\t", b"\\t");
+    s = s.replace(b"\"", b"\\\"");
+
+    s
+}
+
+#[cfg(test)]
+mod tests {
+    use super::escape_bytes;
+    use test_case::test_case;
+
+    #[test_case(b"", b""; "empty")]
+    #[test_case(b"\"", b"\\\""; "doublequote")]
+    #[test_case(b":", b":"; "colon")]
+    fn escape(input: &[u8], expected: &[u8]) {
+        assert_eq!(expected, escape_bytes(input))
+    }
+}
diff --git a/tvix/nix-compat/src/aterm/mod.rs b/tvix/nix-compat/src/aterm/mod.rs
new file mode 100644
index 000000000000..8806b6caf2e5
--- /dev/null
+++ b/tvix/nix-compat/src/aterm/mod.rs
@@ -0,0 +1,7 @@
+mod escape;
+mod parser;
+
+pub(crate) use escape::escape_bytes;
+pub(crate) use parser::parse_bstr_field;
+pub(crate) use parser::parse_str_list;
+pub(crate) use parser::parse_string_field;
diff --git a/tvix/nix-compat/src/aterm/parser.rs b/tvix/nix-compat/src/aterm/parser.rs
new file mode 100644
index 000000000000..883eeb60b984
--- /dev/null
+++ b/tvix/nix-compat/src/aterm/parser.rs
@@ -0,0 +1,123 @@
+//! This module implements parsing code for some basic building blocks
+//! of the [ATerm][] format, which is used by C++ Nix to serialize Derivations.
+//!
+//! [ATerm]: http://program-transformation.org/Tools/ATermFormat.html
+use bstr::BString;
+use nom::branch::alt;
+use nom::bytes::complete::{escaped_transform, is_not, tag};
+use nom::character::complete::char as nomchar;
+use nom::combinator::{map, value};
+use nom::multi::separated_list0;
+use nom::sequence::delimited;
+use nom::IResult;
+
+/// Parse a bstr and undo any escaping.
+fn parse_escaped_bstr(i: &[u8]) -> IResult<&[u8], BString> {
+    escaped_transform(
+        is_not("\"\\"),
+        '\\',
+        alt((
+            value("\\".as_bytes(), nomchar('\\')),
+            value("\n".as_bytes(), nomchar('n')),
+            value("\t".as_bytes(), nomchar('t')),
+            value("\r".as_bytes(), nomchar('r')),
+            value("\"".as_bytes(), nomchar('\"')),
+        )),
+    )(i)
+    .map(|(i, v)| (i, BString::new(v)))
+}
+
+/// Parse a field in double quotes, undo any escaping, and return the unquoted
+/// and decoded Vec<u8>.
+pub(crate) fn parse_bstr_field(i: &[u8]) -> IResult<&[u8], BString> {
+    // inside double quotes…
+    delimited(
+        nomchar('\"'),
+        // There is
+        alt((
+            // …either is a bstr after unescaping
+            parse_escaped_bstr,
+            // …or an empty string.
+            map(tag(b""), |_| BString::default()),
+        )),
+        nomchar('\"'),
+    )(i)
+}
+
+/// Parse a field in double quotes, undo any escaping, and return the unquoted
+/// and decoded string, if it's a valid string. Or fail parsing if the bytes are
+/// not valid UTF-8.
+pub(crate) fn parse_string_field(i: &[u8]) -> IResult<&[u8], String> {
+    // inside double quotes…
+    delimited(
+        nomchar('\"'),
+        // There is
+        alt((
+            // either is a String after unescaping
+            nom::combinator::map_opt(parse_escaped_bstr, |escaped_bstr| {
+                String::from_utf8(escaped_bstr.into()).ok()
+            }),
+            // or an empty string.
+            map(tag(b""), |_| String::new()),
+        )),
+        nomchar('\"'),
+    )(i)
+}
+
+/// Parse a list of string fields (enclosed in brackets)
+pub(crate) fn parse_str_list(i: &[u8]) -> IResult<&[u8], Vec<String>> {
+    // inside brackets
+    delimited(
+        nomchar('['),
+        separated_list0(nomchar(','), parse_string_field),
+        nomchar(']'),
+    )(i)
+}
+
+#[cfg(test)]
+mod tests {
+    use test_case::test_case;
+
+    #[test_case(br#""""#, b"", b""; "empty")]
+    #[test_case(br#""Hello World""#, b"Hello World", b""; "hello world")]
+    #[test_case(br#""\"""#, br#"""#, b""; "doublequote")]
+    #[test_case(br#"":""#, b":", b""; "colon")]
+    #[test_case(br#""\""Rest"#, br#"""#, b"Rest"; "doublequote rest")]
+    fn parse_bstr_field(input: &[u8], expected: &[u8], exp_rest: &[u8]) {
+        let (rest, parsed) = super::parse_bstr_field(input).expect("must parse");
+        assert_eq!(exp_rest, rest, "expected remainder");
+        assert_eq!(expected, parsed);
+    }
+
+    #[test_case(br#""""#, "", b""; "empty")]
+    #[test_case(br#""Hello World""#, "Hello World", b""; "hello world")]
+    #[test_case(br#""\"""#, r#"""#, b""; "doublequote")]
+    #[test_case(br#"":""#, ":", b""; "colon")]
+    #[test_case(br#""\""Rest"#, r#"""#, b"Rest"; "doublequote rest")]
+    fn parse_string_field(input: &[u8], expected: &str, exp_rest: &[u8]) {
+        let (rest, parsed) = super::parse_string_field(input).expect("must parse");
+        assert_eq!(exp_rest, rest, "expected remainder");
+        assert_eq!(expected, &parsed);
+    }
+
+    #[test]
+    fn parse_string_field_invalid_encoding_fail() {
+        let mut input: Vec<u8> = Vec::new();
+        input.push(b'"');
+        input.push(0xc5);
+        input.push(0xc4);
+        input.push(0xd6);
+        input.push(b'"');
+
+        super::parse_string_field(&input).expect_err("must fail");
+    }
+
+    #[test_case(br#"["foo"]"#, vec!["foo".to_string()], b""; "single foo")]
+    #[test_case(b"[]", vec![], b""; "empty list")]
+    #[test_case(b"[]blub", vec![], b"blub"; "empty list with rest")]
+    fn parse_list(input: &[u8], expected: Vec<String>, exp_rest: &[u8]) {
+        let (rest, parsed) = super::parse_str_list(input).expect("must parse");
+        assert_eq!(exp_rest, rest, "expected remainder");
+        assert_eq!(expected, parsed);
+    }
+}
-- 
cgit 1.4.1