feat(users/Profpatsch/netencode): fully streaming parser r/2382

In order to arbitrarily split netencode over multiple reads, we need to make the parser completely streaming, so that it recognizes all cases where it needs more input. Luckily, this is fairly trivial, after working around a bunch of overeager parsing. The tricky part was the giant `alt`, where inner parsers would start consuming input and thus become incomplete when they fail afterwards. Since the format *always* starts the different types with one discriminator char, we can use that to instantly return from the parser and try the next one instead. The other tricky part was that lists and records would parse all inner elements and then choke on the empty string after the last element, because the inner parser would consume at least the discriminator, and an empty string is always `Incomplete`. We wrap these into a small combinator which plays nice with `many0` in that regard. Change-Id: Ib8d15d9a7cab19d432c6b24a35fcad6a5a72b246 Reviewed-on: https://cl.tvl.fyi/c/depot/+/2704 Tested-by: BuildkiteCI Reviewed-by: Profpatsch <mail@profpatsch.de> Reviewed-by: sterni <sternenseemann@systemli.org>
author: Profpatsch <mail@profpatsch.de> 2021-03-30T02·49+0200
committer: Profpatsch <mail@profpatsch.de> 2021-04-01T07·28+0000
commit: 59a9955d753d8f9deb705d36922f6e8d77307f1d (patch)
tree: 525f8269bfa15160b58d2d73498306c0899c837e /users/Profpatsch/netencode
parent: 53d8dd6a1e56533dbe33a711bdec792cd477f0c7 (diff)
1 files changed, 95 insertions, 39 deletions
diff --git a/users/Profpatsch/netencode/netencode.rs b/users/Profpatsch/netencode/netencode.rs
index 2800326092..249cc33ed1 100644
--- a/users/Profpatsch/netencode/netencode.rs
+++ b/users/Profpatsch/netencode/netencode.rs
@@ -210,9 +210,9 @@ pub mod parse {
     use std::collections::HashMap;
 
     use nom::{IResult};
-    use nom::bytes::complete::{tag, take};
     use nom::branch::{alt};
-    use nom::character::complete::{digit1, char};
+    use nom::bytes::streaming::{tag, take};
+    use nom::character::streaming::{digit1, char};
     use nom::sequence::{tuple};
     use nom::combinator::{map, map_res, flat_map, map_parser, opt};
     use nom::error::{context, ErrorKind, ParseError};
@@ -233,8 +233,10 @@ pub mod parse {
 
     fn sized(begin: char, end: char) -> impl Fn(&[u8]) -> IResult<&[u8], &[u8]> {
         move |s: &[u8]| {
-            let (s, (_, len, _)) = tuple((
-                char(begin),
+            // This is the point where we check the discriminator;
+            // if the beginning char does not match, we can immediately return.
+            let (s, _) = char(begin)(s)?;
+            let (s, (len, _)) = tuple((
                 usize_t,
                 char(':')
             ))(s)?;
@@ -344,14 +346,33 @@ pub mod parse {
         list_g(t_t)(s)
     }
 
+    /// Wrap the inner parser of a `many0`/`fold_many0`, so that the parser
+    /// is not called when the `s` is empty already, preventing it from
+    /// returning `Incomplete` on streaming parsing.
+    fn inner_no_empty_string<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], O>
+    where
+        O: Clone,
+        P: Fn(&'a [u8]) -> IResult<&'a [u8], O>,
+    {
+        move |s: &'a [u8]| {
+            if s.is_empty() {
+                // This is a bit hacky, `many0` considers the inside done
+                // when a parser returns `Err::Error`, ignoring the actual error content
+                Err(nom::Err::Error((s, nom::error::ErrorKind::Many0)))
+            } else {
+                inner(s)
+            }
+        }
+    }
+
     fn list_g<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], Vec<O>>
     where
         O: Clone,
-        P: Fn(&'a [u8]) -> IResult<&'a [u8], O>
+        P: Fn(&'a [u8]) -> IResult<&'a [u8], O>,
     {
         map_parser(
             sized('[', ']'),
-            nom::multi::many0(inner)
+            nom::multi::many0(inner_no_empty_string(inner))
         )
     }
 
@@ -368,21 +389,29 @@ pub mod parse {
         O: Clone,
         P: Fn(&'a [u8]) -> IResult<&'a [u8], O>
     {
-        map_parser(
-            sized('{', '}'),
-            nom::multi::fold_many1(
-                tag_g(inner),
-                HashMap::new(),
-                |mut acc: HashMap<_,_>, Tag { tag, mut val }| {
-                    // ignore duplicated tag names that appear later
-                    // according to netencode spec
-                    if ! acc.contains_key(tag) {
-                        acc.insert(tag, *val);
+        move |s: &'a [u8]| {
+            let (s, map) = map_parser(
+                sized('{', '}'),
+                nom::multi::fold_many0(
+                    inner_no_empty_string(tag_g(&inner)),
+                    HashMap::new(),
+                    |mut acc: HashMap<_,_>, Tag { tag, mut val }| {
+                        // ignore duplicated tag names that appear later
+                        // according to netencode spec
+                        if ! acc.contains_key(tag) {
+                            acc.insert(tag, *val);
+                        }
+                        acc
                     }
-                    acc
-                }
-            )
-        )
+                )
+            )(s)?;
+            if map.is_empty() {
+                // records must not be empty, according to the spec
+                Err(nom::Err::Failure((s,nom::error::ErrorKind::Many1)))
+            } else {
+                Ok((s, map))
+            }
+        }
     }
 
     pub fn u_u(s: &[u8]) -> IResult<&[u8], U> {
@@ -512,16 +541,19 @@ pub mod parse {
         fn test_parse_text() {
             assert_eq!(
                 text("t5:hello,".as_bytes()),
-                Ok(("".as_bytes(), T::Text("hello".to_owned())))
+                Ok(("".as_bytes(), T::Text("hello".to_owned()))),
+                "{}", r"t5:hello,"
             );
             assert_eq!(
-                text("t4:fo,".as_bytes()),
-                // TODO: way better parse error messages
-                Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof)))
+                text("t4:fo".as_bytes()),
+                // The content of the text should be 4 long
+                Err(nom::Err::Incomplete(nom::Needed::Size(4))),
+                "{}", r"t4:fo,"
             );
             assert_eq!(
                 text("t9:今日は,".as_bytes()),
-                Ok(("".as_bytes(), T::Text("今日は".to_owned())))
+                Ok(("".as_bytes(), T::Text("今日は".to_owned()))),
+                "{}", r"t9:今日は,"
             );
         }
 
@@ -529,16 +561,25 @@ pub mod parse {
         fn test_parse_binary() {
             assert_eq!(
                 binary()("b5:hello,".as_bytes()),
-                Ok(("".as_bytes(), T::Binary(Vec::from("hello".to_owned()))))
+                Ok(("".as_bytes(), T::Binary(Vec::from("hello".to_owned())))),
+                "{}", r"b5:hello,"
             );
             assert_eq!(
-                binary()("b4:fo,".as_bytes()),
-                // TODO: way better parse error messages
-                Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof)))
+                binary()("b4:fo".as_bytes()),
+                // The content of the byte should be 4 long
+                Err(nom::Err::Incomplete(nom::Needed::Size(4))),
+                "{}", r"b4:fo,"
             );
             assert_eq!(
+                binary()("b4:foob".as_bytes()),
+                // The content is 4 bytes now, but the finishing , is missing
+                Err(nom::Err::Incomplete(nom::Needed::Size(1))),
+                    "{}", r"b4:fo,"
+                );
+            assert_eq!(
                 binary()("b9:今日は,".as_bytes()),
-                Ok(("".as_bytes(), T::Binary(Vec::from("今日は".as_bytes()))))
+                Ok(("".as_bytes(), T::Binary(Vec::from("今日は".as_bytes())))),
+                "{}", r"b9:今日は,"
             );
         }
 
@@ -546,7 +587,8 @@ pub mod parse {
         fn test_list() {
             assert_eq!(
                 list_t("[0:]".as_bytes()),
-                Ok(("".as_bytes(), vec![]))
+                Ok(("".as_bytes(), vec![])),
+                "{}", r"[0:]"
             );
             assert_eq!(
                 list_t("[6:u,u,u,]".as_bytes()),
@@ -554,7 +596,8 @@ pub mod parse {
                     T::Unit,
                     T::Unit,
                     T::Unit,
-                ]))
+                ])),
+                "{}", r"[6:u,u,u,]"
             );
             assert_eq!(
                 list_t("[15:u,[7:t3:foo,]u,]".as_bytes()),
@@ -562,7 +605,8 @@ pub mod parse {
                     T::Unit,
                     T::List(vec![T::Text("foo".to_owned())]),
                     T::Unit,
-                ]))
+                ])),
+                "{}", r"[15:u,[7:t3:foo,]u,]"
             );
         }
 
@@ -574,7 +618,8 @@ pub mod parse {
                     ("a".to_owned(), T::Unit),
                     ("b".to_owned(), T::Unit),
                     ("c".to_owned(), T::Unit),
-                ].into_iter().collect::<HashMap<String, T>>()))
+                ].into_iter().collect::<HashMap<String, T>>())),
+                "{}", r"{21:<1:a|u,<1:b|u,<1:c|u,}"
             );
             // duplicated keys are ignored (first is taken)
             assert_eq!(
@@ -582,7 +627,14 @@ pub mod parse {
                 Ok(("".as_bytes(), vec![
                     ("a".to_owned(), T::Unit),
                     ("b".to_owned(), T::Unit),
-                ].into_iter().collect::<HashMap<_,_>>()))
+                ].into_iter().collect::<HashMap<_,_>>())),
+                "{}", r"{25:<1:a|u,<1:b|u,<1:a|i1:-1,}"
+            );
+            // empty records are not allowed
+            assert_eq!(
+                record_t("{0:}".as_bytes()),
+                Err(nom::Err::Failure(("".as_bytes(), nom::error::ErrorKind::Many1))),
+                "{}", r"{0:}"
             );
         }
 
@@ -590,18 +642,21 @@ pub mod parse {
         fn test_parse() {
             assert_eq!(
                 t_t("n3:255,".as_bytes()),
-                Ok(("".as_bytes(), T::N3(255)))
+                Ok(("".as_bytes(), T::N3(255))),
+                "{}", r"n3:255,"
             );
             assert_eq!(
                 t_t("t6:halloo,".as_bytes()),
-                Ok(("".as_bytes(), T::Text("halloo".to_owned())))
+                Ok(("".as_bytes(), T::Text("halloo".to_owned()))),
+                "{}", r"t6:halloo,"
             );
             assert_eq!(
                 t_t("<3:foo|t6:halloo,".as_bytes()),
                 Ok(("".as_bytes(), T::Sum (Tag {
                     tag: "foo".to_owned(),
                     val: Box::new(T::Text("halloo".to_owned()))
-                })))
+                }))),
+                "{}", r"<3:foo|t6:halloo,"
             );
             // { a: Unit
             // , foo: List <A: Unit | B: List i3> }
@@ -614,7 +669,8 @@ pub mod parse {
                         T::Sum(Tag { tag: "A".to_owned(), val: Box::new(T::N1(true)) }),
                         T::Sum(Tag { tag: "B".to_owned(), val: Box::new(T::List(vec![T::I3(127)])) }),
                     ]))
-                ].into_iter().collect::<HashMap<String, T>>())))
+                ].into_iter().collect::<HashMap<String, T>>()))),
+                "{}", r"{52:<1:a|u,<3:foo|[33:<1:A|u,<1:A|n1:1,<1:B|[7:i3:127,]]}"
             );
         }
author	Profpatsch <mail@profpatsch.de>	2021-03-30T02·49+0200
committer	Profpatsch <mail@profpatsch.de>	2021-04-01T07·28+0000
commit	59a9955d753d8f9deb705d36922f6e8d77307f1d (patch)
tree	525f8269bfa15160b58d2d73498306c0899c837e /users/Profpatsch/netencode
parent	53d8dd6a1e56533dbe33a711bdec792cd477f0c7 (diff)