feat(nix-compat/{aterm,derivation}): init parser r/6831

This provides a nom-based parser for Nix derivations in ATerm format, which can be reached via `Derivation::from_aterm_bytes`. Some of the lower-level ATerm primitives are moved into a (new) aterm module, and some more higher-level ones that construct derivation-specific types. Also, move the escape_bytes function into there; this is a generic ATerm thing. Change-Id: I2b03b8a1461c7ea2fcb8640c2fc3d1fa3ea719fb Reviewed-on: https://cl.tvl.fyi/c/depot/+/9730 Autosubmit: flokli <flokli@flokli.de> Reviewed-by: raitobezarius <tvl@lahfa.xyz> Tested-by: BuildkiteCI
author: Florian Klink <flokli@flokli.de> 2023-07-31T13·46+0200
committer: clbot <clbot@tvl.fyi> 2023-10-16T12·23+0000
commit: 2410f2292f53a17242ed54b0af2d7b04ec3173f6 (patch)
tree: 476d93504a2c21d48011fce4f4afe79bfb0b0cca /tvix/nix-compat/src/aterm
parent: 8b09ae54b1d635e77c394dd965915479283489a2 (diff)
3 files changed, 157 insertions, 0 deletions
diff --git a/tvix/nix-compat/src/aterm/escape.rs b/tvix/nix-compat/src/aterm/escape.rs
new file mode 100644
index 000000000000..06b550bbf02d
--- /dev/null
+++ b/tvix/nix-compat/src/aterm/escape.rs
@@ -0,0 +1,27 @@
+use bstr::ByteSlice;
+
+/// Backslash-escapes backslash, newline, CR, tab and double quote in `s`. Does not add surrounding quotes.
+pub fn escape_bytes<P: AsRef<[u8]>>(s: P) -> Vec<u8> {
+    let mut s: Vec<u8> = s.as_ref().to_vec();
+    // Backslashes are escaped first, so the backslashes inserted below are not escaped a second time.
+    s = s.replace(b"\\", b"\\\\");
+    s = s.replace(b"\n", b"\\n");
+    s = s.replace(b"\r", b"\\r");
+    s = s.replace(b"\t", b"\\t");
+    s = s.replace(b"\"", b"\\\"");
+
+    s
+}
+
+#[cfg(test)]
+mod tests {
+    use super::escape_bytes;
+    use test_case::test_case;
+    // Each case is (input bytes, expected escaped bytes); the string after the semicolon names the case.
+    #[test_case(b"", b""; "empty")]
+    #[test_case(b"\"", b"\\\""; "doublequote")]
+    #[test_case(b":", b":"; "colon")]
+    fn escape(input: &[u8], expected: &[u8]) {
+        assert_eq!(expected, escape_bytes(input))
+    }
+}
diff --git a/tvix/nix-compat/src/aterm/mod.rs b/tvix/nix-compat/src/aterm/mod.rs
new file mode 100644
index 000000000000..8806b6caf2e5
--- /dev/null
+++ b/tvix/nix-compat/src/aterm/mod.rs
@@ -0,0 +1,7 @@
+mod escape;
+mod parser;
+
+pub(crate) use escape::escape_bytes;
+pub(crate) use parser::parse_bstr_field;
+pub(crate) use parser::parse_str_list;
+pub(crate) use parser::parse_string_field;
diff --git a/tvix/nix-compat/src/aterm/parser.rs b/tvix/nix-compat/src/aterm/parser.rs
new file mode 100644
index 000000000000..883eeb60b984
--- /dev/null
+++ b/tvix/nix-compat/src/aterm/parser.rs
@@ -0,0 +1,123 @@
+//! This module implements parsing code for some basic building blocks
+//! of the [ATerm][] format, which is used by C++ Nix to serialize Derivations.
+//!
+//! [ATerm]: http://program-transformation.org/Tools/ATermFormat.html
+use bstr::BString;
+use nom::branch::alt;
+use nom::bytes::complete::{escaped_transform, is_not, tag};
+use nom::character::complete::char as nomchar;
+use nom::combinator::{map, value};
+use nom::multi::separated_list0;
+use nom::sequence::delimited;
+use nom::IResult;
+
+/// Parse a bstr, decoding the backslash escapes for backslash, newline, tab, CR and double quote.
+fn parse_escaped_bstr(i: &[u8]) -> IResult<&[u8], BString> {
+    escaped_transform(
+        is_not("\"\\"), // plain bytes: anything except a double quote or a backslash
+        '\\', // the character that introduces an escape sequence
+        alt((
+            value("\\".as_bytes(), nomchar('\\')),
+            value("\n".as_bytes(), nomchar('n')),
+            value("\t".as_bytes(), nomchar('t')),
+            value("\r".as_bytes(), nomchar('r')),
+            value("\"".as_bytes(), nomchar('\"')),
+        )),
+    )(i)
+    .map(|(i, v)| (i, BString::new(v))) // wrap the decoded bytes in a BString
+}
+
+/// Parse a field in double quotes, undo any escaping, and return the unquoted
+/// and decoded bytes as a BString.
+pub(crate) fn parse_bstr_field(i: &[u8]) -> IResult<&[u8], BString> {
+    // inside double quotes…
+    delimited(
+        nomchar('\"'),
+        // …there is
+        alt((
+            // …either a bstr, which gets unescaped,
+            parse_escaped_bstr,
+            // …or nothing at all (the empty tag matches zero bytes), giving an empty BString.
+            map(tag(b""), |_| BString::default()),
+        )),
+        nomchar('\"'),
+    )(i)
+}
+
+/// Parse a field in double quotes, undo any escaping, and return the unquoted
+/// and decoded string, if it is valid UTF-8. Parsing fails if the bytes are
+/// not valid UTF-8.
+pub(crate) fn parse_string_field(i: &[u8]) -> IResult<&[u8], String> {
+    // inside double quotes…
+    delimited(
+        nomchar('\"'),
+        // …there is
+        alt((
+            // …either a String after unescaping (branch is rejected if the bytes are not UTF-8),
+            nom::combinator::map_opt(parse_escaped_bstr, |escaped_bstr| {
+                String::from_utf8(escaped_bstr.into()).ok()
+            }),
+            // …or an empty string.
+            map(tag(b""), |_| String::new()),
+        )),
+        nomchar('\"'),
+    )(i)
+}
+
+/// Parse a list of string fields (enclosed in brackets, separated by commas)
+pub(crate) fn parse_str_list(i: &[u8]) -> IResult<&[u8], Vec<String>> {
+    // inside brackets: zero or more string fields
+    delimited(
+        nomchar('['),
+        separated_list0(nomchar(','), parse_string_field),
+        nomchar(']'),
+    )(i)
+}
+
+#[cfg(test)]
+mod tests {
+    use test_case::test_case;
+    // Each case is (input, expected decoded bytes, expected unparsed remainder).
+    #[test_case(br#""""#, b"", b""; "empty")]
+    #[test_case(br#""Hello World""#, b"Hello World", b""; "hello world")]
+    #[test_case(br#""\"""#, br#"""#, b""; "doublequote")]
+    #[test_case(br#"":""#, b":", b""; "colon")]
+    #[test_case(br#""\""Rest"#, br#"""#, b"Rest"; "doublequote rest")]
+    fn parse_bstr_field(input: &[u8], expected: &[u8], exp_rest: &[u8]) {
+        let (rest, parsed) = super::parse_bstr_field(input).expect("must parse");
+        assert_eq!(exp_rest, rest, "expected remainder");
+        assert_eq!(expected, parsed);
+    }
+    // Same inputs as for parse_bstr_field, but the decoded value is compared as a str.
+    #[test_case(br#""""#, "", b""; "empty")]
+    #[test_case(br#""Hello World""#, "Hello World", b""; "hello world")]
+    #[test_case(br#""\"""#, r#"""#, b""; "doublequote")]
+    #[test_case(br#"":""#, ":", b""; "colon")]
+    #[test_case(br#""\""Rest"#, r#"""#, b"Rest"; "doublequote rest")]
+    fn parse_string_field(input: &[u8], expected: &str, exp_rest: &[u8]) {
+        let (rest, parsed) = super::parse_string_field(input).expect("must parse");
+        assert_eq!(exp_rest, rest, "expected remainder");
+        assert_eq!(expected, &parsed);
+    }
+    // The quoted bytes 0xc5 0xc4 0xd6 are not a valid UTF-8 sequence, so a String field must not parse.
+    #[test]
+    fn parse_string_field_invalid_encoding_fail() {
+        let mut input: Vec<u8> = Vec::new();
+        input.push(b'"');
+        input.push(0xc5);
+        input.push(0xc4);
+        input.push(0xd6);
+        input.push(b'"');
+
+        super::parse_string_field(&input).expect_err("must fail");
+    }
+    // Each case is (input, expected list of strings, expected unparsed remainder).
+    #[test_case(br#"["foo"]"#, vec!["foo".to_string()], b""; "single foo")]
+    #[test_case(b"[]", vec![], b""; "empty list")]
+    #[test_case(b"[]blub", vec![], b"blub"; "empty list with rest")]
+    fn parse_list(input: &[u8], expected: Vec<String>, exp_rest: &[u8]) {
+        let (rest, parsed) = super::parse_str_list(input).expect("must parse");
+        assert_eq!(exp_rest, rest, "expected remainder");
+        assert_eq!(expected, parsed);
+    }
+}
author	Florian Klink <flokli@flokli.de>	2023-07-31T13·46+0200
committer	clbot <clbot@tvl.fyi>	2023-10-16T12·23+0000
commit	2410f2292f53a17242ed54b0af2d7b04ec3173f6 (patch)
tree	476d93504a2c21d48011fce4f4afe79bfb0b0cca /tvix/nix-compat/src/aterm
parent	8b09ae54b1d635e77c394dd965915479283489a2 (diff)