diff options
Diffstat (limited to 'users/Profpatsch/netencode')
-rw-r--r-- | users/Profpatsch/netencode/Netencode.hs | 433 | ||||
-rw-r--r-- | users/Profpatsch/netencode/Netencode/Parse.hs | 103 | ||||
-rw-r--r-- | users/Profpatsch/netencode/README.md | 133 | ||||
-rw-r--r-- | users/Profpatsch/netencode/default.nix | 184 | ||||
-rw-r--r-- | users/Profpatsch/netencode/gen.nix | 73 | ||||
-rw-r--r-- | users/Profpatsch/netencode/netencode-mustache.rs | 52 | ||||
-rw-r--r-- | users/Profpatsch/netencode/netencode.cabal | 74 | ||||
-rw-r--r-- | users/Profpatsch/netencode/netencode.rs | 969 | ||||
-rw-r--r-- | users/Profpatsch/netencode/pretty.rs | 163 |
9 files changed, 2184 insertions, 0 deletions
diff --git a/users/Profpatsch/netencode/Netencode.hs b/users/Profpatsch/netencode/Netencode.hs new file mode 100644 index 000000000000..ca93ab2fefdf --- /dev/null +++ b/users/Profpatsch/netencode/Netencode.hs @@ -0,0 +1,433 @@ +{-# LANGUAGE AllowAmbiguousTypes #-} +{-# LANGUAGE QuasiQuotes #-} +{-# LANGUAGE TemplateHaskell #-} + +module Netencode where + +import Control.Applicative (many) +import Data.Attoparsec.ByteString qualified as Atto +import Data.Attoparsec.ByteString.Char8 qualified as Atto.Char +import Data.ByteString qualified as ByteString +import Data.ByteString.Builder (Builder) +import Data.ByteString.Builder qualified as Builder +import Data.ByteString.Lazy qualified as ByteString.Lazy +import Data.Fix (Fix (Fix)) +import Data.Fix qualified as Fix +import Data.Functor.Classes (Eq1 (liftEq)) +import Data.Int (Int16, Int32, Int64, Int8) +import Data.Map.NonEmpty (NEMap) +import Data.Map.NonEmpty qualified as NEMap +import Data.Semigroup qualified as Semi +import Data.String (IsString) +import Data.Word (Word16, Word32, Word64) +import GHC.Exts (fromString) +import Hedgehog qualified as Hedge +import Hedgehog.Gen qualified as Gen +import Hedgehog.Range qualified as Range +import PossehlAnalyticsPrelude +import Text.Show.Deriving +import Prelude hiding (sum) + +-- | Netencode type base functor. +-- +-- Recursive elements have a @rec@. 
+data TF rec + = -- | Unit value + Unit + | -- | Boolean (2^1) + N1 Bool + | -- | Byte (2^3) + N3 Word8 + | -- | 64-bit Natural (2^6) + N6 Word64 + | -- | 64-bit Integer (2^6) + I6 Int64 + | -- | Unicode Text + Text Text + | -- | Arbitrary Bytestring + Bytes ByteString + | -- | A constructor of a(n open) Sum + Sum (Tag Text rec) + | -- | Record + Record (NEMap Text rec) + | -- | List + List [rec] + deriving stock (Show, Eq, Functor) + +instance Eq1 TF where + liftEq _ Unit Unit = True + liftEq _ (N1 b) (N1 b') = b == b' + liftEq _ (N3 w8) (N3 w8') = w8 == w8' + liftEq _ (N6 w64) (N6 w64') = w64 == w64' + liftEq _ (I6 i64) (I6 i64') = i64 == i64' + liftEq _ (Text t) (Text t') = t == t' + liftEq _ (Bytes b) (Bytes b') = b == b' + liftEq eq (Sum t) (Sum t') = eq (t.tagVal) (t'.tagVal) + liftEq eq (Record m) (Record m') = liftEq eq m m' + liftEq eq (List xs) (List xs') = liftEq eq xs xs' + liftEq _ _ _ = False + +-- | A tagged value +data Tag tag val = Tag + { tagTag :: tag, + tagVal :: val + } + deriving stock (Show, Eq, Functor) + +$(Text.Show.Deriving.deriveShow1 ''Tag) +$(Text.Show.Deriving.deriveShow1 ''TF) + +-- | The Netencode type +newtype T = T {unT :: Fix TF} + deriving stock (Eq, Show) + +-- | Create a unit +unit :: T +unit = T $ Fix Unit + +-- | Create a boolean +n1 :: Bool -> T +n1 = T . Fix . N1 + +-- | Create a byte +n3 :: Word8 -> T +n3 = T . Fix . N3 + +-- | Create a 64-bit natural +n6 :: Word64 -> T +n6 = T . Fix . N6 + +-- | Create a 64-bit integer +i6 :: Int64 -> T +i6 = T . Fix . I6 + +-- | Create a UTF-8 unicode text +text :: Text -> T +text = T . Fix . Text + +-- | Create an arbitrary bytestring +bytes :: ByteString -> T +bytes = T . Fix . Bytes + +-- | Create a tagged value from a tag name and a value +tag :: Text -> T -> T +tag key val = T $ Fix $ Sum $ coerce @(Tag Text T) @(Tag Text (Fix TF)) $ Tag key val + +-- | Create a record from a non-empty map +record :: NEMap Text T -> T +record = T . Fix . Record . 
coerce @(NEMap Text T) @(NEMap Text (Fix TF)) + +-- | Create a list +list :: [T] -> T +list = T . Fix . List . coerce @[T] @([Fix TF]) + +-- | Stable encoding of a netencode value. Record keys will be sorted lexicographically ascending. +netencodeEncodeStable :: T -> Builder +netencodeEncodeStable (T fix) = Fix.foldFix (netencodeEncodeStableF id) fix + +-- | Stable encoding of a netencode functor value. Record keys will be sorted lexicographically ascending. +-- +-- The given function is used for encoding the recursive values. +netencodeEncodeStableF :: (rec -> Builder) -> TF rec -> Builder +netencodeEncodeStableF inner tf = builder go + where + -- TODO: directly pass in BL? + innerBL = fromBuilder . inner + go = case tf of + Unit -> "u," + N1 False -> "n1:0," + N1 True -> "n1:1," + N3 w8 -> "n3:" <> fromBuilder (Builder.word8Dec w8) <> "," + N6 w64 -> "n6:" <> fromBuilder (Builder.word64Dec w64) <> "," + I6 i64 -> "i6:" <> fromBuilder (Builder.int64Dec i64) <> "," + Text t -> + let b = fromText t + in "t" <> builderLenDec b <> ":" <> b <> "," + Bytes b -> "b" <> builderLenDec (fromByteString b) <> ":" <> fromByteString b <> "," + Sum (Tag key val) -> encTag key val + Record m -> + -- NEMap uses Map internally, and that folds in lexicographic ascending order over the key. + -- Since these are `Text` in our case, this is stable. 
+ let mBuilder = m & NEMap.foldMapWithKey encTag + in "{" <> builderLenDec mBuilder <> ":" <> mBuilder <> "}" + List xs -> + let xsBuilder = xs <&> innerBL & mconcat + in "[" <> builderLenDec xsBuilder <> ":" <> xsBuilder <> "]" + where + encTag key val = + let bKey = fromText key + in "<" <> builderLenDec bKey <> ":" <> bKey <> "|" <> innerBL val + +-- | A builder that knows its own size in bytes +newtype BL = BL (Builder, Semi.Sum Natural) + deriving newtype (Monoid, Semigroup) + +instance IsString BL where + fromString s = + BL + ( fromString @Builder s, + fromString @ByteString s + & ByteString.length + & intToNatural + & fromMaybe 0 + & Semi.Sum + ) + +-- | Retrieve the builder +builder :: BL -> Builder +builder (BL (b, _)) = b + +-- | Retrieve the bytestring length +builderLen :: BL -> Natural +builderLen (BL (_, len)) = Semi.getSum $ len + +-- | Take a 'BL' and create a new 'BL' that represents the length as a decimal integer +builderLenDec :: BL -> BL +builderLenDec (BL (_, len)) = + let b = Builder.intDec $ (len & Semi.getSum & fromIntegral @Natural @Int) + in b & fromBuilder + +-- | Create a 'BL' from a 'Builder'. +-- +-- Not efficient, goes back to a lazy bytestring to get the length +fromBuilder :: Builder -> BL +fromBuilder b = + BL + ( b, + b + & Builder.toLazyByteString + & ByteString.Lazy.length + & fromIntegral @Int64 @Natural + & Semi.Sum + ) + +-- | Create a 'BL' from a 'ByteString'. +fromByteString :: ByteString -> BL +fromByteString b = + BL + ( Builder.byteString b, + b + & ByteString.length + & fromIntegral @Int @Natural + & Semi.Sum + ) + +-- | Create a 'BL' from a 'Text'. +fromText :: Text -> BL +fromText t = t & textToBytesUtf8 & fromByteString + +-- | Parser for a netencode value. +netencodeParser :: Atto.Parser T +netencodeParser = T <$> go + where + go = Fix <$> netencodeParserF go + +-- | Parser for one level of a netencode value. Requires a parser for the recursion. 
+netencodeParserF :: Atto.Parser rec -> Atto.Parser (TF rec) +netencodeParserF inner = do + typeTag <- Atto.Char.anyChar + case typeTag of + 't' -> Text <$> textParser + 'b' -> Bytes <$> bytesParser + 'u' -> unitParser + '<' -> Sum <$> tagParser + '{' -> Record <$> recordParser + '[' -> List <$> listParser + 'n' -> naturalParser + 'i' -> I6 <$> intParser + c -> fail ([c] <> " is not a valid netencode tag") + where + bytesParser = do + len <- boundedDecimalFail Atto.<?> "bytes is missing a digit specifying the length" + _ <- Atto.Char.char ':' Atto.<?> "bytes did not have : after length" + bytes' <- Atto.take len + _ <- Atto.Char.char ',' Atto.<?> "bytes did not end with ," + pure bytes' + + textParser = do + len <- boundedDecimalFail Atto.<?> "text is missing a digit specifying the length" + _ <- Atto.Char.char ':' Atto.<?> "text did not have : after length" + text' <- + Atto.take len <&> bytesToTextUtf8 >>= \case + Left err -> fail [fmt|cannot decode text as utf8: {err & prettyError}|] + Right t -> pure t + _ <- Atto.Char.char ',' Atto.<?> "text did not end with ," + pure text' + + unitParser = do + _ <- Atto.Char.char ',' Atto.<?> "unit did not end with ," + pure $ Unit + + tagParser = do + len <- boundedDecimalFail Atto.<?> "tag is missing a digit specifying the length" + _ <- Atto.Char.char ':' Atto.<?> "tag did not have : after length" + tagTag <- + Atto.take len <&> bytesToTextUtf8 >>= \case + Left err -> fail [fmt|cannot decode tag key as utf8: {err & prettyError}|] + Right t -> pure t + _ <- Atto.Char.char '|' Atto.<?> "tag was missing the key/value separator (|)" + tagVal <- inner + pure $ Tag {..} + + recordParser = do + -- TODO: the record does not use its inner length because we are descending into the inner parsers. + -- This is a smell! In theory it can be used to skip parsing the whole inner keys. 
+ _len <- boundedDecimalFail Atto.<?> "record is missing a digit specifying the length" + _ <- Atto.Char.char ':' Atto.<?> "record did not have : after length" + record' <- + many (Atto.Char.char '<' >> tagParser) <&> nonEmpty >>= \case + Nothing -> fail "record is not allowed to have 0 elements" + Just tags -> + pure $ + tags + <&> (\t -> (t.tagTag, t.tagVal)) + -- later keys are preferred if they are duplicates, according to the standard + & NEMap.fromList + _ <- Atto.Char.char '}' Atto.<?> "record did not end with }" + pure record' + + listParser = do + -- TODO: the list does not use its inner length because we are descending into the inner parsers. + -- This is a smell! In theory it can be used to skip parsing the whole inner keys. + _len <- boundedDecimalFail Atto.<?> "list is missing a digit specifying the length" + _ <- Atto.Char.char ':' Atto.<?> "list did not have : after length" + -- TODO: allow empty lists? + list' <- many inner + _ <- Atto.Char.char ']' Atto.<?> "list did not end with ]" + pure list' + + intParser = do + let p :: forall parseSize. (Bounded parseSize, Integral parseSize) => (Integer -> Atto.Parser Int64) + p n = do + _ <- Atto.Char.char ':' Atto.<?> [fmt|i{n & show} did not have : after length|] + isNegative <- Atto.option False (Atto.Char.char '-' <&> \_c -> True) + int <- + boundedDecimal @parseSize >>= \case + Nothing -> fail [fmt|cannot parse into i{n & show}, the number is too big (would overflow)|] + Just i -> + pure $ + if isNegative + then -- TODO: this should alread be done in the decimal parser, @minBound@ cannot be parsed cause it’s one more than @(-maxBound)@! + (-i) + else i + _ <- Atto.Char.char ',' Atto.<?> [fmt|i{n & show} did not end with ,|] + pure $ fromIntegral @parseSize @Int64 int + digit <- Atto.Char.digit + case digit of + -- TODO: separate parser for i1 and i2 that makes sure the boundaries are right! 
+ '1' -> p @Int8 1 + '2' -> p @Int8 2 + '3' -> p @Int8 3 + '4' -> p @Int16 4 + '5' -> p @Int32 5 + '6' -> p @Int64 6 + '7' -> fail [fmt|i parser only supports numbers up to size 6, was 7|] + '8' -> fail [fmt|i parser only supports numbers up to size 6, was 8|] + '9' -> fail [fmt|i parser only supports numbers up to size 6, was 9|] + o -> fail [fmt|i number with length {o & show} not possible|] + + naturalParser = do + let p :: forall parseSize finalSize. (Bounded parseSize, Integral parseSize, Num finalSize) => (Integer -> Atto.Parser finalSize) + p n = do + _ <- Atto.Char.char ':' Atto.<?> [fmt|n{n & show} did not have : after length|] + int <- + boundedDecimal @parseSize >>= \case + Nothing -> fail [fmt|cannot parse into n{n & show}, the number is too big (would overflow)|] + Just i -> pure i + + _ <- Atto.Char.char ',' Atto.<?> [fmt|n{n & show} did not end with ,|] + pure $ fromIntegral @parseSize @finalSize int + let b n = do + _ <- Atto.Char.char ':' Atto.<?> [fmt|n{n & show} did not have : after length|] + bool <- + (Atto.Char.char '0' >> pure False) + <|> (Atto.Char.char '1' >> pure True) + _ <- Atto.Char.char ',' Atto.<?> [fmt|n{n & show} did not end with ,|] + pure bool + + digit <- Atto.Char.digit + case digit of + -- TODO: separate parser for n1 and n2 that makes sure the boundaries are right! + '1' -> N1 <$> b 1 + '2' -> N3 <$> p @Word8 @Word8 2 + '3' -> N3 <$> p @Word8 @Word8 3 + '4' -> N6 <$> p @Word16 @Word64 4 + '5' -> N6 <$> p @Word32 @Word64 5 + '6' -> N6 <$> p @Word64 @Word64 6 + '7' -> fail [fmt|n parser only supports numbers up to size 6, was 7|] + '8' -> fail [fmt|n parser only supports numbers up to size 6, was 8|] + '9' -> fail [fmt|n parser only supports numbers up to size 6, was 9|] + o -> fail [fmt|n number with length {o & show} not possible|] + +-- | Parser for a bounded decimal that does not overflow the decimal. +-- +-- via https://www.extrema.is/blog/2021/10/20/parsing-bounded-integers +boundedDecimal :: forall a. 
(Bounded a, Integral a) => Atto.Parser (Maybe a) +boundedDecimal = do + i :: Integer <- decimal + pure $ + if (i :: Integer) > fromIntegral (maxBound :: a) + then Nothing + else Just $ fromIntegral i + where + -- Copied from @Attoparsec.Text@ and adjusted to bytestring + decimal :: (Integral a2) => Atto.Parser a2 + decimal = ByteString.foldl' step 0 <$> Atto.Char.takeWhile1 Atto.Char.isDigit + where + step a c = a * 10 + fromIntegral (c - 48) +{-# SPECIALIZE boundedDecimal :: Atto.Parser (Maybe Int) #-} +{-# SPECIALIZE boundedDecimal :: Atto.Parser (Maybe Int64) #-} +{-# SPECIALIZE boundedDecimal :: Atto.Parser (Maybe Word8) #-} +{-# SPECIALIZE boundedDecimal :: Atto.Parser (Maybe Word64) #-} + +-- | 'boundedDecimal', but fail the parser if the decimal overflows. +boundedDecimalFail :: Atto.Parser Int +boundedDecimalFail = + boundedDecimal >>= \case + Nothing -> fail "decimal out of range" + Just a -> pure a + +-- | Hedgehog generator for a netencode value. +genNetencode :: Hedge.MonadGen m => m T +genNetencode = + Gen.recursive + Gen.choice + [ -- these are bundled into one Gen, so that scalar elements get chosen less frequently, and the generator produces nicely nested examples + Gen.frequency + [ (1, pure unit), + (1, n1 <$> Gen.bool), + (1, n3 <$> Gen.element [0, 1, 5]), + (1, n6 <$> Gen.element [0, 1, 5]), + (1, i6 <$> Gen.element [-1, 1, 5]), + (2, text <$> Gen.text (Range.linear 1 10) Gen.lower), + (2, bytes <$> Gen.bytes (Range.linear 1 10)) + ] + ] + [ do + key <- Gen.text (Range.linear 3 10) Gen.lower + val <- genNetencode + pure $ tag key val, + record + <$> ( let k = Gen.text (Range.linear 3 10) Gen.lower + v = genNetencode + in NEMap.insertMap + <$> k + <*> v + <*> ( (Gen.map (Range.linear 0 3)) $ + (,) <$> k <*> v + ) + ) + ] + +-- | Hedgehog property: encoding a netencode value and parsing it again returns the same result. 
+prop_netencodeRoundtrip :: Hedge.Property +prop_netencodeRoundtrip = Hedge.property $ do + enc <- Hedge.forAll genNetencode + ( Atto.parseOnly + netencodeParser + ( netencodeEncodeStable enc + & Builder.toLazyByteString + & toStrictBytes + ) + ) + Hedge.=== (Right enc) diff --git a/users/Profpatsch/netencode/Netencode/Parse.hs b/users/Profpatsch/netencode/Netencode/Parse.hs new file mode 100644 index 000000000000..e55eedf568db --- /dev/null +++ b/users/Profpatsch/netencode/Netencode/Parse.hs @@ -0,0 +1,103 @@ +{-# LANGUAGE QuasiQuotes #-} + +module Netencode.Parse where + +import Control.Category qualified +import Control.Selective (Selective) +import Data.Error.Tree +import Data.Fix (Fix (..)) +import Data.Functor.Compose +import Data.List qualified as List +import Data.Map.NonEmpty (NEMap) +import Data.Map.NonEmpty qualified as NEMap +import Data.Semigroupoid qualified as Semigroupiod +import Data.Semigroupoid qualified as Semigroupoid +import Data.Text qualified as Text +import Label +import Netencode qualified +import PossehlAnalyticsPrelude +import Prelude hiding (log) + +newtype Parse from to + = -- TODO: the way @Context = [Text]@ has to be forwarded to everything is kinda shitty. + -- This is essentially just a difference list, and can probably be treated as a function in the output? + Parse (([Text], from) -> Validation (NonEmpty ErrorTree) ([Text], to)) + deriving + (Functor, Applicative, Selective) + via ( Compose + ( Compose + ((->) ([Text], from)) + (Validation (NonEmpty ErrorTree)) + ) + ((,) [Text]) + ) + +instance Semigroupoid Parse where + o p2 p1 = Parse $ \from -> case runParse' p1 from of + Failure err -> Failure err + Success to1 -> runParse' p2 to1 + +instance Category Parse where + (.) 
= Semigroupoid.o + id = Parse $ \t -> Success t + +runParse :: Error -> Parse from to -> from -> Either ErrorTree to +runParse errMsg parser t = + (["$"], t) + & runParse' parser + <&> snd + & first (nestedMultiError errMsg) + & validationToEither + +runParse' :: Parse from to -> ([Text], from) -> Validation (NonEmpty ErrorTree) ([Text], to) +runParse' (Parse f) from = f from + +parseEither :: (([Text], from) -> Either ErrorTree ([Text], to)) -> Parse from to +parseEither f = Parse $ \from -> f from & eitherToListValidation + +tAs :: (Netencode.TF (Fix Netencode.TF) -> Either ([Text] -> ErrorTree) to) -> Parse Netencode.T to +tAs f = parseEither ((\(context, Netencode.T (Fix tf)) -> f tf & bimap ($ context) (context,))) + +key :: Text -> Parse (NEMap Text to) to +key name = parseEither $ \(context, rec) -> + rec + & NEMap.lookup name + & annotate (errorTreeContext (showContext context) [fmt|Key "{name}" does not exist|]) + <&> (addContext name context,) + +showContext :: [Text] -> Text +showContext context = context & List.reverse & Text.intercalate "." 
+ +addContext :: a -> [a] -> [a] +addContext = (:) + +asText :: Parse Netencode.T Text +asText = tAs $ \case + Netencode.Text t -> pure t + other -> typeError "of text" other + +asBytes :: Parse Netencode.T ByteString +asBytes = tAs $ \case + Netencode.Bytes b -> pure b + other -> typeError "of bytes" other + +asRecord :: Parse Netencode.T (NEMap Text (Netencode.T)) +asRecord = tAs $ \case + Netencode.Record rec -> pure (rec <&> Netencode.T) + other -> typeError "a record" other + +typeError :: Text -> Netencode.TF ignored -> (Either ([Text] -> ErrorTree) b) +typeError should is = do + let otherS = is <&> (\_ -> ("…" :: String)) & show + Left $ \context -> errorTreeContext (showContext context) [fmt|Value is not {should}, but a {otherS}|] + +orThrowParseError :: + Parse (Either Error to) to +orThrowParseError = Parse $ \case + (context, Left err) -> + err + & singleError + & errorTreeContext (showContext context) + & singleton + & Failure + (context, Right to) -> Success (context, to) diff --git a/users/Profpatsch/netencode/README.md b/users/Profpatsch/netencode/README.md new file mode 100644 index 000000000000..3538a110a678 --- /dev/null +++ b/users/Profpatsch/netencode/README.md @@ -0,0 +1,133 @@ +# netencode 0.1-unreleased + +[bencode][] and [netstring][]-inspired pipe format that should be trivial to generate correctly in every context (only requires a `byte_length()` and a `printf()`), easy to parse (100 lines of code or less), mostly human-decipherable for easy debugging, and support nested record and sum types. + + +## scalars + +Scalars have the format `[type prefix][size]:[value],`. + +where size is a natural number without leading zeroes. + +### unit + +The unit (`u`) has only one value. + +* The unit is: `u,` + +### numbers + +Naturals (`n`) and Integers (`i`), with a maximum size in bits. + +Bit sizes are specified in 2^n increments, 1 to 9 (`n1`..`n9`, `i1`..`i9`). 
+ +* Natural `1234` that fits in 32 bits (2^5): `n5:1234,` +* Integer `-42` that fits in 8 bits (2^3): `i3:-42,` +* Integer `23` that fits in 64 bits (2^6): `i6:23,` +* Integer `-1` that fits in 512 bits (2^9): `i9:-1,` +* Natural `0` that fits in 1 bit (2^1): `n1:0,` + +An implementation can define the biggest numbers it supports, and has to throw an error for anything bigger. It has to support everything smaller, so for example if you support up to i6/n6, you have to support 1–6 as well. An implementation could support up to the current architecture’s wordsize for example. + +Floats are not supported, you can implement fixed-size decimals or ratios using integers. + +### booleans + +A boolean is represented as `n1`. + +* `n1:0,`: false + +* `n1:1,`: true + +TODO: should we add `f,` and `t,`? + +### text + +Text (`t`) that *must* be encoded as UTF-8, starting with its length in bytes: + +* The string `hello world` (11 bytes): `t11:hello world,` +* The string `今日は` (9 bytes): `t9:今日は,` +* The string `:,` (2 bytes): `t2::,,` +* The empty string `` (0 bytes): `t0:,` + +### binary + +Arbitrary binary strings (`b`) that can contain any data, starting with its length in bytes. + +* The ASCII string `hello world` as binary data (11 bytes): `b11:hello world,` +* The empty binary string (0 bytes): `b0:,` +* The bytestring with `^D` (1 byte): `b1:,` + +Since the binary strings are length-prefixed, they can contain `\0` and no escaping is required. Care has to be taken in languages with `\0`-terminated bytestrings. + +Use text (`t`) if you have utf-8 encoded data. + +## tagged values + +### tags + +A tag (`<`) gives a value a name. The tag is UTF-8 encoded, starting with its length in bytes and proceeding with the value. + +* The tag `foo` (3 bytes) tagging the text `hello` (5 bytes): `<3:foo|t5:hello,` +* The tag `` (0 bytes) tagging the 8-bit integer 0: `<0:|i3:0,` + +### records (products/records), also maps + +A record (`{`) is a concatenation of tags (`<`). 
It needs to be closed with `}`. + +If tag names repeat the *earlier* ones should be ignored. +Using the last tag corresponds with the way most languages handle converting a list of tuples to Maps, by using a for-loop and Map.insert without checking the contents first. Otherwise you’d have to reverse the list first or remember which keys you already inserted. + +Ordering of tags in a record does not matter. + +Similar to text, records start with the length of their *whole encoded content*, in bytes. This makes it possible to treat their contents as opaque bytestrings. + +* There is no empty record. (TODO: make the empty record the unit type, remove `u,`?) +* A record with one empty field, `foo`: `{9:<3:foo|u,}` +* A record with two fields, `foo` and `x`: `{21:<3:foo|u,<1:x|t3:baz,}` +* The same record: `{21:<1:x|t3:baz,<3:foo|u,}` +* The same record (earlier occurrences of fields are ignored): `{28:<1:x|u,<1:x|t3:baz,<3:foo|u,}` + +### sums (tagged unions) + +Simply a tagged value. The tag marker `<` indicates it is a sum if it appears outside of a record. + +## lists + +A list (`[`) imposes an ordering on a sequence of values. It needs to be closed with `]`. Values in it are simply concatenated. + +Similar to records, lists start with the length of their whole encoded content. + +* The empty list: `[0:]` +* The list with one element, the string `foo`: `[7:t3:foo,]` +* The list with text `foo` followed by i3 `-42`: `[14:t3:foo,i3:-42,]` +* The list with `Some` and `None` tags: `[35:<4:Some|t3:foo,<4:None|u,<4:None|u,]` + +## parser security considerations + +The length field is a decimal number that is not length-restricted, +meaning an attacker could give an infinitely long length (or extremely long) +thus overflowing your parser if you are not careful. + +You should thus put a practical length limit to the length of length fields, +which implicitly enforces a length limit on how long the value itself can be. + +Start by defining a max value length in bytes. 
+Then count the number of decimals in that number. + +So if your max length is 1024 bytes, your length field can be a maximum `count_digits(1024) == 4` bytes long. + +Thus, if you restrict your parser to a length field of 4 bytes, +it should also never parse anything longer than 1024 bytes for the value +(plus 1 byte for the type tag, 4 bytes for the length, and 2 bytes for the separator & ending character). + +## motivation + +TODO + +## guarantees + +TODO: do I want unique representation (bijection like bencode?) This would put more restrictions on the generator, like sorting records in lexicographic order, but would make it possible to compare without decoding + + +[bencode]: https://en.wikipedia.org/wiki/Bencode +[netstring]: https://en.wikipedia.org/wiki/Netstring diff --git a/users/Profpatsch/netencode/default.nix b/users/Profpatsch/netencode/default.nix new file mode 100644 index 000000000000..6e7dce489a81 --- /dev/null +++ b/users/Profpatsch/netencode/default.nix @@ -0,0 +1,184 @@ +{ depot, pkgs, lib, ... }: + +let + netencode-rs = depot.nix.writers.rustSimpleLib + { + name = "netencode"; + dependencies = [ + depot.third_party.rust-crates.nom + depot.users.Profpatsch.execline.exec-helpers + ]; + } + (builtins.readFile ./netencode.rs); + + netencode-hs = pkgs.haskellPackages.mkDerivation { + pname = "netencode"; + version = "0.1.0"; + + src = depot.users.Profpatsch.exactSource ./. 
[ + ./netencode.cabal + ./Netencode.hs + ./Netencode/Parse.hs + ]; + + libraryHaskellDepends = [ + pkgs.haskellPackages.hedgehog + pkgs.haskellPackages.nonempty-containers + pkgs.haskellPackages.deriving-compat + pkgs.haskellPackages.data-fix + pkgs.haskellPackages.bytestring + pkgs.haskellPackages.attoparsec + pkgs.haskellPackages.pa-prelude + pkgs.haskellPackages.pa-label + pkgs.haskellPackages.pa-error-tree + ]; + + isLibrary = true; + license = lib.licenses.mit; + + + }; + + gen = import ./gen.nix { inherit lib; }; + + pretty-rs = depot.nix.writers.rustSimpleLib + { + name = "netencode-pretty"; + dependencies = [ + netencode-rs + ]; + } + (builtins.readFile ./pretty.rs); + + pretty = depot.nix.writers.rustSimple + { + name = "netencode-pretty"; + dependencies = [ + netencode-rs + pretty-rs + depot.users.Profpatsch.execline.exec-helpers + ]; + } '' + extern crate netencode; + extern crate netencode_pretty; + extern crate exec_helpers; + + fn main() { + let (_, prog) = exec_helpers::args_for_exec("netencode-pretty", 0); + let t = netencode::t_from_stdin_or_die_user_error("netencode-pretty"); + match netencode_pretty::Pretty::from_u(t.to_u()).print_multiline(&mut std::io::stdout()) { + Ok(()) => {}, + Err(err) => exec_helpers::die_temporary("netencode-pretty", format!("could not write to stdout: {}", err)) + } + } + ''; + + netencode-mustache = depot.nix.writers.rustSimple + { + name = "netencode_mustache"; + dependencies = [ + depot.users.Profpatsch.arglib.netencode.rust + netencode-rs + depot.third_party.rust-crates.mustache + ]; + } + (builtins.readFile ./netencode-mustache.rs); + + + record-get = depot.nix.writers.rustSimple + { + name = "record-get"; + dependencies = [ + netencode-rs + depot.users.Profpatsch.execline.exec-helpers + ]; + } '' + extern crate netencode; + extern crate exec_helpers; + use netencode::{encode, dec}; + use netencode::dec::{Decoder, DecodeError}; + + fn main() { + let args = exec_helpers::args("record-get", 1); + let field = match 
std::str::from_utf8(&args[0]) { + Ok(f) => f, + Err(_e) => exec_helpers::die_user_error("record-get", format!("The field name needs to be valid unicode")) + }; + let t = netencode::t_from_stdin_or_die_user_error("record-get"); + match (dec::RecordDot {field, inner: dec::AnyU }).dec(t.to_u()) { + Ok(u) => encode(&mut std::io::stdout(), &u).expect("encoding to stdout failed"), + Err(DecodeError(err)) => exec_helpers::die_user_error("record-get", err) + } + } + ''; + + record-splice-env = depot.nix.writers.rustSimple + { + name = "record-splice-env"; + dependencies = [ + netencode-rs + depot.users.Profpatsch.execline.exec-helpers + ]; + } '' + extern crate netencode; + extern crate exec_helpers; + use netencode::dec::{Record, Try, ScalarAsBytes, Decoder, DecodeError}; + + fn main() { + let t = netencode::t_from_stdin_or_die_user_error("record-splice-env"); + let (_, prog) = exec_helpers::args_for_exec("record-splice-env", 0); + match Record(Try(ScalarAsBytes)).dec(t.to_u()) { + Ok(map) => { + exec_helpers::exec_into_args( + "record-splice-env", + prog, + // some elements can’t be decoded as scalars, so just ignore them + map.into_iter().filter_map(|(k, v)| v.map(|v2| (k, v2))) + ); + }, + Err(DecodeError(err)) => exec_helpers::die_user_error("record-splice-env", err), + } + } + ''; + + env-splice-record = depot.nix.writers.rustSimple + { + name = "env-splice-record"; + dependencies = [ + netencode-rs + depot.users.Profpatsch.execline.exec-helpers + ]; + } '' + extern crate netencode; + extern crate exec_helpers; + use netencode::{T}; + use std::os::unix::ffi::OsStringExt; + + fn main() { + exec_helpers::no_args("env-splice-record"); + let mut res = std::collections::HashMap::new(); + for (key, val) in std::env::vars_os() { + match (String::from_utf8(key.into_vec()), String::from_utf8(val.into_vec())) { + (Ok(k), Ok(v)) => { let _ = res.insert(k, T::Text(v)); }, + // same as in record-splice-env, we ignore non-utf8 variables + (_, _) => {}, + } + } + 
netencode::encode(&mut std::io::stdout(), &T::Record(res).to_u()).unwrap() + } + ''; + +in +depot.nix.readTree.drvTargets { + inherit + netencode-rs + netencode-hs + pretty-rs + pretty + netencode-mustache + record-get + record-splice-env + env-splice-record + gen + ; +} diff --git a/users/Profpatsch/netencode/gen.nix b/users/Profpatsch/netencode/gen.nix new file mode 100644 index 000000000000..efc9629ca0df --- /dev/null +++ b/users/Profpatsch/netencode/gen.nix @@ -0,0 +1,73 @@ +{ lib }: +let + + netstring = tag: suffix: s: + "${tag}${toString (builtins.stringLength s)}:${s}${suffix}"; + + unit = "u,"; + + n1 = b: if b then "n1:1," else "n1:0,"; + + n = i: n: "n${toString i}:${toString n},"; + i = i: n: "i${toString i}:${toString n},"; + + n3 = n 3; + n6 = n 6; + n7 = n 7; + + i3 = i 3; + i6 = i 6; + i7 = i 7; + + text = netstring "t" ","; + binary = netstring "b" ","; + + tag = key: val: netstring "<" "|" key + val; + + concatStrings = builtins.concatStringsSep ""; + + record = lokv: netstring "{" "}" + (concatStrings (map ({ key, val }: tag key val) lokv)); + + list = l: netstring "[" "]" (concatStrings l); + + dwim = val: + let + match = { + "bool" = n1; + "int" = i6; + "string" = text; + "set" = attrs: + # it could be a derivation, then just return the path + if attrs.type or "" == "derivation" then text "${attrs}" + else + record (lib.mapAttrsToList + (k: v: { + key = k; + val = dwim v; + }) + attrs); + "list" = l: list (map dwim l); + }; + in + match.${builtins.typeOf val} val; + +in +{ + inherit + unit + n1 + n3 + n6 + n7 + i3 + i6 + i7 + text + binary + tag + record + list + dwim + ; +} diff --git a/users/Profpatsch/netencode/netencode-mustache.rs b/users/Profpatsch/netencode/netencode-mustache.rs new file mode 100644 index 000000000000..73ed5be1ded2 --- /dev/null +++ b/users/Profpatsch/netencode/netencode-mustache.rs @@ -0,0 +1,52 @@ +extern crate arglib_netencode; +extern crate mustache; +extern crate netencode; + +use mustache::Data; +use netencode::T; 
+use std::collections::HashMap; +use std::io::Read; +use std::os::unix::ffi::OsStrExt; + +fn netencode_to_mustache_data_dwim(t: T) -> Data { + match t { + // TODO: good idea? + T::Unit => Data::Null, + T::N1(b) => Data::Bool(b), + T::N3(u) => Data::String(u.to_string()), + T::N6(u) => Data::String(u.to_string()), + T::N7(u) => Data::String(u.to_string()), + T::I3(i) => Data::String(i.to_string()), + T::I6(i) => Data::String(i.to_string()), + T::I7(i) => Data::String(i.to_string()), + T::Text(s) => Data::String(s), + T::Binary(b) => unimplemented!(), + T::Sum(tag) => unimplemented!(), + T::Record(xs) => Data::Map( + xs.into_iter() + .map(|(key, val)| (key, netencode_to_mustache_data_dwim(val))) + .collect::<HashMap<_, _>>(), + ), + T::List(xs) => Data::Vec( + xs.into_iter() + .map(|x| netencode_to_mustache_data_dwim(x)) + .collect::<Vec<_>>(), + ), + } +} + +pub fn from_stdin() -> () { + let data = netencode_to_mustache_data_dwim(arglib_netencode::arglib_netencode( + "netencode-mustache", + Some(std::ffi::OsStr::new("TEMPLATE_DATA")), + )); + let mut stdin = String::new(); + std::io::stdin().read_to_string(&mut stdin).unwrap(); + mustache::compile_str(&stdin) + .and_then(|templ| templ.render_data(&mut std::io::stdout(), &data)) + .unwrap() +} + +pub fn main() { + from_stdin() +} diff --git a/users/Profpatsch/netencode/netencode.cabal b/users/Profpatsch/netencode/netencode.cabal new file mode 100644 index 000000000000..7bff4487bbc1 --- /dev/null +++ b/users/Profpatsch/netencode/netencode.cabal @@ -0,0 +1,74 @@ +cabal-version: 3.0 +name: netencode +version: 0.1.0.0 +author: Profpatsch +maintainer: mail@profpatsch.de + + +common common-options + ghc-options: + -Wall + -Wno-type-defaults + -Wunused-packages + -Wredundant-constraints + -fwarn-missing-deriving-strategies + + -- See https://downloads.haskell.org/ghc/latest/docs/users_guide/exts.html + -- for a description of all these extensions + default-extensions: + -- Infer Applicative instead of Monad where possible + 
ApplicativeDo + + -- Allow literal strings to be Text + OverloadedStrings + + -- Syntactic sugar improvements + LambdaCase + MultiWayIf + + -- Makes the (deprecated) usage of * instead of Data.Kind.Type an error + NoStarIsType + + -- Convenient and crucial to deal with ambiguous field names, commonly + -- known as RecordDotSyntax + OverloadedRecordDot + + -- does not export record fields as functions, use OverloadedRecordDot to access instead + NoFieldSelectors + + -- Record punning + RecordWildCards + + -- Improved Deriving + DerivingStrategies + DerivingVia + + -- Type-level strings + DataKinds + + -- to enable the `type` keyword in import lists (ormolu uses this automatically) + ExplicitNamespaces + + default-language: GHC2021 + + +library + import: common-options + exposed-modules: + Netencode, + Netencode.Parse + + build-depends: + base >=4.15 && <5, + pa-prelude, + pa-label, + pa-error-tree, + hedgehog, + nonempty-containers, + deriving-compat, + data-fix, + bytestring, + attoparsec, + text, + semigroupoids, + selective diff --git a/users/Profpatsch/netencode/netencode.rs b/users/Profpatsch/netencode/netencode.rs new file mode 100644 index 000000000000..34a8fcef0990 --- /dev/null +++ b/users/Profpatsch/netencode/netencode.rs @@ -0,0 +1,969 @@ +extern crate exec_helpers; +extern crate nom; + +use std::collections::HashMap; +use std::fmt::{Debug, Display}; +use std::io::{Read, Write}; + +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum T { + // Unit + Unit, + // Boolean + N1(bool), + // Naturals + N3(u8), + N6(u64), + N7(u128), + // Integers + I3(i8), + I6(i64), + I7(i128), + // Text + // TODO: make into &str + Text(String), + // TODO: rename to Bytes + Binary(Vec<u8>), + // Tags + // TODO: make into &str + // TODO: rename to Tag + Sum(Tag<String, T>), + // TODO: make into &str + Record(HashMap<String, T>), + List(Vec<T>), +} + +impl T { + pub fn to_u<'a>(&'a self) -> U<'a> { + match self { + T::Unit => U::Unit, + T::N1(b) => U::N1(*b), + T::N3(u) => U::N3(*u), 
+ T::N6(u) => U::N6(*u), + T::N7(u) => U::N7(*u), + T::I3(i) => U::I3(*i), + T::I6(i) => U::I6(*i), + T::I7(i) => U::I7(*i), + T::Text(t) => U::Text(t.as_str()), + T::Binary(v) => U::Binary(v), + T::Sum(Tag { tag, val }) => U::Sum(Tag { + tag: tag.as_str(), + val: Box::new(val.to_u()), + }), + T::Record(map) => U::Record(map.iter().map(|(k, v)| (k.as_str(), v.to_u())).collect()), + T::List(l) => U::List(l.iter().map(|v| v.to_u()).collect::<Vec<U<'a>>>()), + } + } + + pub fn encode<'a>(&'a self) -> Vec<u8> { + match self { + // TODO: don’t go via U, inefficient + o => o.to_u().encode(), + } + } +} + +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum U<'a> { + Unit, + // Boolean + N1(bool), + // Naturals + N3(u8), + N6(u64), + N7(u128), + // Integers + I3(i8), + I6(i64), + I7(i128), + // Text + Text(&'a str), + Binary(&'a [u8]), + // TODO: the U-recursion we do here means we can’t be breadth-lazy anymore + // like we originally planned; maybe we want to go `U<'a>` → `&'a [u8]` again? + // Tags + // TODO: rename to Tag + Sum(Tag<&'a str, U<'a>>), + Record(HashMap<&'a str, U<'a>>), + List(Vec<U<'a>>), +} + +impl<'a> U<'a> { + pub fn encode(&self) -> Vec<u8> { + let mut c = std::io::Cursor::new(vec![]); + encode(&mut c, self); + c.into_inner() + } + + pub fn to_t(&self) -> T { + match self { + U::Unit => T::Unit, + U::N1(b) => T::N1(*b), + U::N3(u) => T::N3(*u), + U::N6(u) => T::N6(*u), + U::N7(u) => T::N7(*u), + U::I3(i) => T::I3(*i), + U::I6(i) => T::I6(*i), + U::I7(i) => T::I7(*i), + U::Text(t) => T::Text((*t).to_owned()), + U::Binary(v) => T::Binary((*v).to_owned()), + U::Sum(Tag { tag, val }) => T::Sum(Tag { + tag: (*tag).to_owned(), + val: Box::new(val.to_t()), + }), + U::Record(map) => T::Record( + map.iter() + .map(|(k, v)| ((*k).to_owned(), v.to_t())) + .collect::<HashMap<String, T>>(), + ), + U::List(l) => T::List(l.iter().map(|v| v.to_t()).collect::<Vec<T>>()), + } + } +} + +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct Tag<S, A> { + // TODO: make into 
&str + pub tag: S, + pub val: Box<A>, +} + +impl<S, A> Tag<S, A> { + fn map<F, B>(self, f: F) -> Tag<S, B> + where + F: Fn(A) -> B, + { + Tag { + tag: self.tag, + val: Box::new(f(*self.val)), + } + } +} + +fn encode_tag<W: Write>(w: &mut W, tag: &str, val: &U) -> std::io::Result<()> { + write!(w, "<{}:{}|", tag.len(), tag)?; + encode(w, val)?; + Ok(()) +} + +pub fn encode<W: Write>(w: &mut W, u: &U) -> std::io::Result<()> { + match u { + U::Unit => write!(w, "u,"), + U::N1(b) => { + if *b { + write!(w, "n1:1,") + } else { + write!(w, "n1:0,") + } + } + U::N3(n) => write!(w, "n3:{},", n), + U::N6(n) => write!(w, "n6:{},", n), + U::N7(n) => write!(w, "n7:{},", n), + U::I3(i) => write!(w, "i3:{},", i), + U::I6(i) => write!(w, "i6:{},", i), + U::I7(i) => write!(w, "i7:{},", i), + U::Text(s) => { + write!(w, "t{}:", s.len()); + w.write_all(s.as_bytes()); + write!(w, ",") + } + U::Binary(s) => { + write!(w, "b{}:", s.len()); + w.write_all(&s); + write!(w, ",") + } + U::Sum(Tag { tag, val }) => encode_tag(w, tag, val), + U::Record(m) => { + let mut c = std::io::Cursor::new(vec![]); + for (k, v) in m { + encode_tag(&mut c, k, v)?; + } + write!(w, "{{{}:", c.get_ref().len())?; + w.write_all(c.get_ref())?; + write!(w, "}}") + } + U::List(l) => { + let mut c = std::io::Cursor::new(vec![]); + for u in l { + encode(&mut c, u)?; + } + write!(w, "[{}:", c.get_ref().len())?; + w.write_all(c.get_ref())?; + write!(w, "]") + } + } +} + +pub fn text(s: String) -> T { + T::Text(s) +} + +pub fn t_from_stdin_or_die_user_error<'a>(prog_name: &'_ str) -> T { + match t_from_stdin_or_die_user_error_with_rest(prog_name, &vec![]) { + None => exec_helpers::die_user_error(prog_name, "stdin was empty"), + Some((rest, t)) => { + if rest.is_empty() { + t + } else { + exec_helpers::die_user_error( + prog_name, + format!( + "stdin contained some soup after netencode value: {:?}", + String::from_utf8_lossy(&rest) + ), + ) + } + } + } +} + +/// Read a netencode value from stdin incrementally, return 
bytes that could not be read.
/// Nothing if there was nothing to read from stdin & no initial_bytes were provided.
/// These can be passed back as `initial_bytes` if more values should be read.
pub fn t_from_stdin_or_die_user_error_with_rest<'a>(
    prog_name: &'_ str,
    initial_bytes: &[u8],
) -> Option<(Vec<u8>, T)> {
    // Read stdin in 4k chunks; after every chunk, retry the streaming parser.
    let mut chonker = Chunkyboi::new(std::io::stdin().lock(), 4096);
    // The vec to pass to the parser on each step
    let mut parser_vec: Vec<u8> = initial_bytes.to_vec();
    // set once stdin is exhausted (EOF); a still-incomplete parse is then fatal
    let mut was_empty: bool = false;
    loop {
        match chonker.next() {
            None => {
                if parser_vec.is_empty() {
                    return None;
                } else {
                    was_empty = true
                }
            }
            Some(Err(err)) => exec_helpers::die_temporary(
                prog_name,
                &format!("could not read from stdin: {:?}", err),
            ),
            Some(Ok(mut new_bytes)) => parser_vec.append(&mut new_bytes),
        }

        match parse::t_t(&parser_vec) {
            Ok((rest, t)) => return Some((rest.to_owned(), t)),
            // BUGFIX: this arm previously read `Incomplete(Needed)`, which did
            // not match against the `nom::Needed` enum at all — it silently
            // bound the payload to a fresh variable named `Needed` (shadowing
            // the type name). The wildcard states the actual intent: any
            // incomplete parse, regardless of how many bytes are needed.
            Err(nom::Err::Incomplete(_)) => {
                if was_empty {
                    exec_helpers::die_user_error(
                        prog_name,
                        &format!(
                            "unable to parse netencode from stdin, input incomplete: {:?}",
                            parser_vec
                        ),
                    );
                }
                // read more from stdin and try parsing again
                continue;
            }
            Err(err) => exec_helpers::die_user_error(
                prog_name,
                &format!("unable to parse netencode from stdin: {:?}", err),
            ),
        }
    }
}

// iter helper
// TODO: put into its own module
/// Wraps a `Read` and yields its contents in chunks of at most
/// `chunksize` bytes.
struct Chunkyboi<T> {
    inner: T,
    buf: Vec<u8>,
}

impl<R: Read> Chunkyboi<R> {
    /// Create a new chunked reader with an internal buffer of `chunksize` bytes.
    fn new(inner: R, chunksize: usize) -> Self {
        let buf = vec![0; chunksize];
        Chunkyboi { inner, buf }
    }
}

impl<R: Read> Iterator for Chunkyboi<R> {
    type Item = std::io::Result<Vec<u8>>;

    fn next(&mut self) -> Option<std::io::Result<Vec<u8>>> {
        match self.inner.read(&mut self.buf) {
            // a 0-byte read signals EOF and ends the iterator
            Ok(0) => None,
            Ok(read) => {
                // clone a new buffer so we can reuse the internal one
                Some(Ok(self.buf[..read].to_owned()))
            }
            Err(err) => Some(Err(err)),
        }
    }
}

+pub mod parse { + use super::{Tag, T, U}; + + use std::collections::HashMap; + use std::ops::Neg; + use std::str::FromStr; + + use nom::branch::alt; + use nom::bytes::streaming::{tag, take}; + use nom::character::streaming::{char, digit1}; + use nom::combinator::{flat_map, map, map_parser, map_res, opt}; + use nom::error::{context, ErrorKind, ParseError}; + use nom::sequence::tuple; + use nom::IResult; + + fn unit_t(s: &[u8]) -> IResult<&[u8], ()> { + let (s, _) = context("unit", tag("u,"))(s)?; + Ok((s, ())) + } + + fn usize_t(s: &[u8]) -> IResult<&[u8], usize> { + context( + "usize", + map_res(map_res(digit1, |n| std::str::from_utf8(n)), |s| { + s.parse::<usize>() + }), + )(s) + } + + fn sized(begin: char, end: char) -> impl Fn(&[u8]) -> IResult<&[u8], &[u8]> { + move |s: &[u8]| { + // This is the point where we check the descriminator; + // if the beginning char does not match, we can immediately return. + let (s, _) = char(begin)(s)?; + let (s, (len, _)) = tuple((usize_t, char(':')))(s)?; + let (s, (res, _)) = tuple((take(len), char(end)))(s)?; + Ok((s, res)) + } + } + + fn uint_t<'a, I: FromStr + 'a>(t: &'static str) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], I> { + move |s: &'a [u8]| { + let (s, (_, _, int, _)) = tuple(( + tag(t.as_bytes()), + char(':'), + map_res(map_res(digit1, |n: &[u8]| std::str::from_utf8(n)), |s| { + s.parse::<I>() + }), + char(','), + ))(s)?; + Ok((s, int)) + } + } + + fn bool_t<'a>() -> impl Fn(&'a [u8]) -> IResult<&'a [u8], bool> { + context( + "bool", + alt((map(tag("n1:0,"), |_| false), map(tag("n1:1,"), |_| true))), + ) + } + + fn int_t<'a, I: FromStr + Neg<Output = I>>( + t: &'static str, + ) -> impl Fn(&'a [u8]) -> IResult<&[u8], I> { + context(t, move |s: &'a [u8]| { + let (s, (_, _, neg, int, _)) = tuple(( + tag(t.as_bytes()), + char(':'), + opt(char('-')), + map_res(map_res(digit1, |n: &[u8]| std::str::from_utf8(n)), |s| { + s.parse::<I>() + }), + char(','), + ))(s)?; + let res = match neg { + Some(_) => -int, + None => int, 
+ }; + Ok((s, res)) + }) + } + + fn tag_t(s: &[u8]) -> IResult<&[u8], Tag<String, T>> { + // recurses into the main parser + map(tag_g(t_t), |Tag { tag, val }| Tag { + tag: tag.to_string(), + val, + })(s) + } + + fn tag_g<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], Tag<&'a str, O>> + where + P: Fn(&'a [u8]) -> IResult<&'a [u8], O>, + { + move |s: &[u8]| { + let (s, tag) = sized('<', '|')(s)?; + let (s, val) = inner(s)?; + Ok(( + s, + Tag { + tag: std::str::from_utf8(tag) + .map_err(|_| nom::Err::Failure((s, ErrorKind::Char)))?, + val: Box::new(val), + }, + )) + } + } + + /// parse text scalar (`t5:hello,`) + fn text(s: &[u8]) -> IResult<&[u8], T> { + let (s, res) = text_g(s)?; + Ok((s, T::Text(res.to_string()))) + } + + fn text_g(s: &[u8]) -> IResult<&[u8], &str> { + let (s, res) = sized('t', ',')(s)?; + Ok(( + s, + std::str::from_utf8(res).map_err(|_| nom::Err::Failure((s, ErrorKind::Char)))?, + )) + } + + fn binary<'a>() -> impl Fn(&'a [u8]) -> IResult<&'a [u8], T> { + map(binary_g(), |b| T::Binary(b.to_owned())) + } + + fn binary_g() -> impl Fn(&[u8]) -> IResult<&[u8], &[u8]> { + sized('b', ',') + } + + fn list_t(s: &[u8]) -> IResult<&[u8], Vec<T>> { + list_g(t_t)(s) + } + + /// Wrap the inner parser of an `many0`/`fold_many0`, so that the parser + /// is not called when the `s` is empty already, preventing it from + /// returning `Incomplete` on streaming parsing. 
+ fn inner_no_empty_string<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], O> + where + O: Clone, + P: Fn(&'a [u8]) -> IResult<&'a [u8], O>, + { + move |s: &'a [u8]| { + if s.is_empty() { + // This is a bit hacky, `many0` considers the inside done + // when a parser returns `Err::Error`, ignoring the actual error content + Err(nom::Err::Error((s, nom::error::ErrorKind::Many0))) + } else { + inner(s) + } + } + } + + fn list_g<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], Vec<O>> + where + O: Clone, + P: Fn(&'a [u8]) -> IResult<&'a [u8], O>, + { + map_parser( + sized('[', ']'), + nom::multi::many0(inner_no_empty_string(inner)), + ) + } + + fn record_t<'a>(s: &'a [u8]) -> IResult<&'a [u8], HashMap<String, T>> { + let (s, r) = record_g(t_t)(s)?; + Ok(( + s, + r.into_iter() + .map(|(k, v)| (k.to_string(), v)) + .collect::<HashMap<_, _>>(), + )) + } + + fn record_g<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], HashMap<&'a str, O>> + where + O: Clone, + P: Fn(&'a [u8]) -> IResult<&'a [u8], O>, + { + move |s: &'a [u8]| { + let (s, map) = map_parser( + sized('{', '}'), + nom::multi::fold_many0( + inner_no_empty_string(tag_g(&inner)), + HashMap::new(), + |mut acc: HashMap<_, _>, Tag { tag, mut val }| { + // ignore earlier tags with the same name + // according to netencode spec + let _ = acc.insert(tag, *val); + acc + }, + ), + )(s)?; + if map.is_empty() { + // records must not be empty, according to the spec + Err(nom::Err::Failure((s, nom::error::ErrorKind::Many1))) + } else { + Ok((s, map)) + } + } + } + + pub fn u_u(s: &[u8]) -> IResult<&[u8], U> { + alt(( + map(text_g, U::Text), + map(binary_g(), U::Binary), + map(unit_t, |()| U::Unit), + map(tag_g(u_u), |t| U::Sum(t)), + map(list_g(u_u), U::List), + map(record_g(u_u), U::Record), + map(bool_t(), |u| U::N1(u)), + map(uint_t("n3"), |u| U::N3(u)), + map(uint_t("n6"), |u| U::N6(u)), + map(uint_t("n7"), |u| U::N7(u)), + map(int_t("i3"), |u| U::I3(u)), + map(int_t("i6"), |u| 
U::I6(u)), + map(int_t("i7"), |u| U::I7(u)), + // less common + map(uint_t("n2"), |u| U::N3(u)), + map(uint_t("n4"), |u| U::N6(u)), + map(uint_t("n5"), |u| U::N6(u)), + map(int_t("i1"), |u| U::I3(u)), + map(int_t("i2"), |u| U::I3(u)), + map(int_t("i4"), |u| U::I6(u)), + map(int_t("i5"), |u| U::I6(u)), + // TODO: 8, 9 not supported + ))(s) + } + + pub fn t_t(s: &[u8]) -> IResult<&[u8], T> { + alt(( + text, + binary(), + map(unit_t, |_| T::Unit), + map(tag_t, |t| T::Sum(t)), + map(list_t, |l| T::List(l)), + map(record_t, |p| T::Record(p)), + map(bool_t(), |u| T::N1(u)), + // 8, 64 and 128 bit + map(uint_t("n3"), |u| T::N3(u)), + map(uint_t("n6"), |u| T::N6(u)), + map(uint_t("n7"), |u| T::N7(u)), + map(int_t("i3"), |u| T::I3(u)), + map(int_t("i6"), |u| T::I6(u)), + map(int_t("i7"), |u| T::I7(u)), + // less common + map(uint_t("n2"), |u| T::N3(u)), + map(uint_t("n4"), |u| T::N6(u)), + map(uint_t("n5"), |u| T::N6(u)), + map(int_t("i1"), |u| T::I3(u)), + map(int_t("i2"), |u| T::I3(u)), + map(int_t("i4"), |u| T::I6(u)), + map(int_t("i5"), |u| T::I6(u)), + // TODO: 8, 9 not supported + ))(s) + } + + #[cfg(test)] + mod tests { + use super::*; + + #[test] + fn test_parse_unit_t() { + assert_eq!(unit_t("u,".as_bytes()), Ok(("".as_bytes(), ()))); + } + + #[test] + fn test_parse_bool_t() { + assert_eq!(bool_t()("n1:0,".as_bytes()), Ok(("".as_bytes(), false))); + assert_eq!(bool_t()("n1:1,".as_bytes()), Ok(("".as_bytes(), true))); + } + + #[test] + fn test_parse_usize_t() { + assert_eq!(usize_t("32foo".as_bytes()), Ok(("foo".as_bytes(), 32))); + } + + #[test] + fn test_parse_int_t() { + assert_eq!( + uint_t::<u8>("n3")("n3:42,abc".as_bytes()), + Ok(("abc".as_bytes(), 42)) + ); + assert_eq!( + uint_t::<u8>("n3")("n3:1024,abc".as_bytes()), + Err(nom::Err::Error(( + "1024,abc".as_bytes(), + nom::error::ErrorKind::MapRes + ))) + ); + assert_eq!( + int_t::<i64>("i6")("i6:-23,abc".as_bytes()), + Ok(("abc".as_bytes(), -23)) + ); + assert_eq!( + 
int_t::<i128>("i3")("i3:0,:abc".as_bytes()), + Ok((":abc".as_bytes(), 0)) + ); + assert_eq!( + uint_t::<u8>("n7")("n7:09,".as_bytes()), + Ok(("".as_bytes(), 9)) + ); + // assert_eq!( + // length("c"), + // Err(nom::Err::Error(("c", nom::error::ErrorKind::Digit))) + // ); + // assert_eq!( + // length(":"), + // Err(nom::Err::Error((":", nom::error::ErrorKind::Digit))) + // ); + } + + #[test] + fn test_parse_text() { + assert_eq!( + text("t5:hello,".as_bytes()), + Ok(("".as_bytes(), T::Text("hello".to_owned()))), + "{}", + r"t5:hello," + ); + assert_eq!( + text("t4:fo".as_bytes()), + // The content of the text should be 4 long + Err(nom::Err::Incomplete(nom::Needed::Size(4))), + "{}", + r"t4:fo," + ); + assert_eq!( + text("t9:今日は,".as_bytes()), + Ok(("".as_bytes(), T::Text("今日は".to_owned()))), + "{}", + r"t9:今日は," + ); + } + + #[test] + fn test_parse_binary() { + assert_eq!( + binary()("b5:hello,".as_bytes()), + Ok(("".as_bytes(), T::Binary(Vec::from("hello".to_owned())))), + "{}", + r"b5:hello," + ); + assert_eq!( + binary()("b4:fo".as_bytes()), + // The content of the byte should be 4 long + Err(nom::Err::Incomplete(nom::Needed::Size(4))), + "{}", + r"b4:fo," + ); + assert_eq!( + binary()("b4:foob".as_bytes()), + // The content is 4 bytes now, but the finishing , is missing + Err(nom::Err::Incomplete(nom::Needed::Size(1))), + "{}", + r"b4:fo," + ); + assert_eq!( + binary()("b9:今日は,".as_bytes()), + Ok(("".as_bytes(), T::Binary(Vec::from("今日は".as_bytes())))), + "{}", + r"b9:今日は," + ); + } + + #[test] + fn test_list() { + assert_eq!( + list_t("[0:]".as_bytes()), + Ok(("".as_bytes(), vec![])), + "{}", + r"[0:]" + ); + assert_eq!( + list_t("[6:u,u,u,]".as_bytes()), + Ok(("".as_bytes(), vec![T::Unit, T::Unit, T::Unit,])), + "{}", + r"[6:u,u,u,]" + ); + assert_eq!( + list_t("[15:u,[7:t3:foo,]u,]".as_bytes()), + Ok(( + "".as_bytes(), + vec![T::Unit, T::List(vec![T::Text("foo".to_owned())]), T::Unit,] + )), + "{}", + r"[15:u,[7:t3:foo,]u,]" + ); + } + + #[test] + fn 
test_record() { + assert_eq!( + record_t("{21:<1:a|u,<1:b|u,<1:c|u,}".as_bytes()), + Ok(( + "".as_bytes(), + vec![ + ("a".to_owned(), T::Unit), + ("b".to_owned(), T::Unit), + ("c".to_owned(), T::Unit), + ] + .into_iter() + .collect::<HashMap<String, T>>() + )), + "{}", + r"{21:<1:a|u,<1:b|u,<1:c|u,}" + ); + // duplicated keys are ignored (first is taken) + assert_eq!( + record_t("{25:<1:a|u,<1:b|u,<1:a|i1:-1,}".as_bytes()), + Ok(( + "".as_bytes(), + vec![("a".to_owned(), T::I3(-1)), ("b".to_owned(), T::Unit),] + .into_iter() + .collect::<HashMap<_, _>>() + )), + "{}", + r"{25:<1:a|u,<1:b|u,<1:a|i1:-1,}" + ); + // empty records are not allowed + assert_eq!( + record_t("{0:}".as_bytes()), + Err(nom::Err::Failure(( + "".as_bytes(), + nom::error::ErrorKind::Many1 + ))), + "{}", + r"{0:}" + ); + } + + #[test] + fn test_parse() { + assert_eq!( + t_t("n3:255,".as_bytes()), + Ok(("".as_bytes(), T::N3(255))), + "{}", + r"n3:255," + ); + assert_eq!( + t_t("t6:halloo,".as_bytes()), + Ok(("".as_bytes(), T::Text("halloo".to_owned()))), + "{}", + r"t6:halloo," + ); + assert_eq!( + t_t("<3:foo|t6:halloo,".as_bytes()), + Ok(( + "".as_bytes(), + T::Sum(Tag { + tag: "foo".to_owned(), + val: Box::new(T::Text("halloo".to_owned())) + }) + )), + "{}", + r"<3:foo|t6:halloo," + ); + // { a: Unit + // , foo: List <A: Unit | B: List i3> } + assert_eq!( + t_t("{52:<1:a|u,<3:foo|[33:<1:A|u,<1:A|n1:1,<1:B|[7:i3:127,]]}".as_bytes()), + Ok(( + "".as_bytes(), + T::Record( + vec![ + ("a".to_owned(), T::Unit), + ( + "foo".to_owned(), + T::List(vec![ + T::Sum(Tag { + tag: "A".to_owned(), + val: Box::new(T::Unit) + }), + T::Sum(Tag { + tag: "A".to_owned(), + val: Box::new(T::N1(true)) + }), + T::Sum(Tag { + tag: "B".to_owned(), + val: Box::new(T::List(vec![T::I3(127)])) + }), + ]) + ) + ] + .into_iter() + .collect::<HashMap<String, T>>() + ) + )), + "{}", + r"{52:<1:a|u,<3:foo|[33:<1:A|u,<1:A|n1:1,<1:B|[7:i3:127,]]}" + ); + } + } +} + +pub mod dec { + use super::*; + use std::collections::HashMap; + 
+ pub struct DecodeError(pub String); + + pub trait Decoder<'a> { + type A; + fn dec(&self, u: U<'a>) -> Result<Self::A, DecodeError>; + } + + /// Any netencode, as `T`. + #[derive(Clone, Copy)] + pub struct AnyT; + /// Any netencode, as `U`. + #[derive(Clone, Copy)] + pub struct AnyU; + + impl<'a> Decoder<'a> for AnyT { + type A = T; + fn dec(&self, u: U<'a>) -> Result<Self::A, DecodeError> { + Ok(u.to_t()) + } + } + + impl<'a> Decoder<'a> for AnyU { + type A = U<'a>; + fn dec(&self, u: U<'a>) -> Result<Self::A, DecodeError> { + Ok(u) + } + } + + /// A text + #[derive(Clone, Copy)] + pub struct Text; + + /// A bytestring + // TODO: rename to Bytes + #[derive(Clone, Copy)] + pub struct Binary; + + impl<'a> Decoder<'a> for Text { + type A = &'a str; + fn dec(&self, u: U<'a>) -> Result<Self::A, DecodeError> { + match u { + U::Text(t) => Ok(t), + other => Err(DecodeError(format!("Cannot decode {:?} into Text", other))), + } + } + } + + impl<'a> Decoder<'a> for Binary { + type A = &'a [u8]; + fn dec(&self, u: U<'a>) -> Result<Self::A, DecodeError> { + match u { + U::Binary(b) => Ok(b), + other => Err(DecodeError(format!( + "Cannot decode {:?} into Binary", + other + ))), + } + } + } + + /// Any scalar, converted to bytes. 
+ #[derive(Clone, Copy)] + pub struct ScalarAsBytes; + + impl<'a> Decoder<'a> for ScalarAsBytes { + type A = Vec<u8>; + fn dec(&self, u: U<'a>) -> Result<Self::A, DecodeError> { + match u { + U::N3(u) => Ok(format!("{}", u).into_bytes()), + U::N6(u) => Ok(format!("{}", u).into_bytes()), + U::N7(u) => Ok(format!("{}", u).into_bytes()), + U::I3(i) => Ok(format!("{}", i).into_bytes()), + U::I6(i) => Ok(format!("{}", i).into_bytes()), + U::I7(i) => Ok(format!("{}", i).into_bytes()), + U::Text(t) => Ok(t.as_bytes().to_owned()), + U::Binary(b) => Ok(b.to_owned()), + o => Err(DecodeError(format!("Cannot decode {:?} into scalar", o))), + } + } + } + + /// A map of Ts (TODO: rename to map) + #[derive(Clone, Copy)] + pub struct Record<T>(pub T); + + impl<'a, Inner> Decoder<'a> for Record<Inner> + where + Inner: Decoder<'a>, + { + type A = HashMap<&'a str, Inner::A>; + fn dec(&self, u: U<'a>) -> Result<Self::A, DecodeError> { + match u { + U::Record(map) => map + .into_iter() + .map(|(k, v)| self.0.dec(v).map(|v2| (k, v2))) + .collect::<Result<Self::A, _>>(), + o => Err(DecodeError(format!("Cannot decode {:?} into record", o))), + } + } + } + + /// Assume a record and project out the field with the given name and type. + #[derive(Clone, Copy)] + pub struct RecordDot<'a, T> { + pub field: &'a str, + pub inner: T, + } + + impl<'a, Inner> Decoder<'a> for RecordDot<'_, Inner> + where + Inner: Decoder<'a> + Clone, + { + type A = Inner::A; + fn dec(&self, u: U<'a>) -> Result<Self::A, DecodeError> { + match Record(self.inner.clone()).dec(u) { + Ok(mut map) => match map.remove(self.field) { + Some(inner) => Ok(inner), + None => Err(DecodeError(format!( + "Cannot find `{}` in record map", + self.field + ))), + }, + Err(err) => Err(err), + } + } + } + + /// Equals one of the listed `A`s exactly, after decoding. 
+ #[derive(Clone)] + pub struct OneOf<T, A> { + pub inner: T, + pub list: Vec<A>, + } + + impl<'a, Inner> Decoder<'a> for OneOf<Inner, Inner::A> + where + Inner: Decoder<'a>, + Inner::A: Display + Debug + PartialEq, + { + type A = Inner::A; + fn dec(&self, u: U<'a>) -> Result<Self::A, DecodeError> { + match self.inner.dec(u) { + Ok(inner) => match self.list.iter().any(|x| x.eq(&inner)) { + true => Ok(inner), + false => Err(DecodeError(format!( + "{} is not one of {:?}", + inner, self.list + ))), + }, + Err(err) => Err(err), + } + } + } + + /// Try decoding as `T`. + #[derive(Clone)] + pub struct Try<T>(pub T); + + impl<'a, Inner> Decoder<'a> for Try<Inner> + where + Inner: Decoder<'a>, + { + type A = Option<Inner::A>; + fn dec(&self, u: U<'a>) -> Result<Self::A, DecodeError> { + match self.0.dec(u) { + Ok(inner) => Ok(Some(inner)), + Err(err) => Ok(None), + } + } + } +} diff --git a/users/Profpatsch/netencode/pretty.rs b/users/Profpatsch/netencode/pretty.rs new file mode 100644 index 000000000000..935c3d4a8a17 --- /dev/null +++ b/users/Profpatsch/netencode/pretty.rs @@ -0,0 +1,163 @@ +extern crate netencode; + +use netencode::{Tag, T, U}; + +pub enum Pretty { + Single { + r#type: char, + length: String, + val: String, + trailer: char, + }, + Tag { + r#type: char, + length: String, + key: String, + inner: char, + val: Box<Pretty>, + }, + Multi { + r#type: char, + length: String, + vals: Vec<Pretty>, + trailer: char, + }, +} + +impl Pretty { + pub fn from_u<'a>(u: U<'a>) -> Pretty { + match u { + U::Unit => Self::scalar('u', "", ""), + U::N1(b) => Self::scalar('n', "1:", if b { "1" } else { "0" }), + U::N3(n) => Self::scalar('n', "3:", n), + U::N6(n) => Self::scalar('n', "6:", n), + U::N7(n) => Self::scalar('n', "7:", n), + U::I3(i) => Self::scalar('i', "3:", i), + U::I6(i) => Self::scalar('i', "6:", i), + U::I7(i) => Self::scalar('i', "7:", i), + U::Text(s) => Pretty::Single { + r#type: 't', + length: format!("{}:", s.len()), + val: s.to_string(), + trailer: ',', + 
}, + U::Binary(s) => Pretty::Single { + r#type: 'b', + length: format!("{}:", s.len()), + // For pretty printing we want the string to be visible obviously. + // Instead of not supporting binary, let’s use lossy conversion. + val: String::from_utf8_lossy(s).into_owned(), + trailer: ',', + }, + U::Sum(Tag { tag, val }) => Self::pretty_tag(tag, Self::from_u(*val)), + U::Record(m) => Pretty::Multi { + r#type: '{', + // TODO: we are losing the size here, should we recompute it? Keep it? + length: String::from(""), + vals: m + .into_iter() + .map(|(k, v)| Self::pretty_tag(k, Self::from_u(v))) + .collect(), + trailer: '}', + }, + U::List(l) => Pretty::Multi { + r#type: '[', + // TODO: we are losing the size here, should we recompute it? Keep it? + length: String::from(""), + vals: l.into_iter().map(|v| Self::from_u(v)).collect(), + trailer: ']', + }, + } + } + + fn scalar<D>(r#type: char, length: &str, d: D) -> Pretty + where + D: std::fmt::Display, + { + Pretty::Single { + r#type, + length: length.to_string(), + val: format!("{}", d), + trailer: ',', + } + } + + fn pretty_tag(tag: &str, val: Pretty) -> Pretty { + Pretty::Tag { + r#type: '<', + length: format!("{}:", tag.len()), + key: tag.to_string(), + inner: '|', + val: Box::new(val), + } + } + + pub fn print_multiline<W>(&self, mut w: &mut W) -> std::io::Result<()> + where + W: std::io::Write, + { + Self::go(&mut w, self, 0, true); + write!(w, "\n") + } + + fn go<W>(mut w: &mut W, p: &Pretty, depth: usize, is_newline: bool) -> std::io::Result<()> + where + W: std::io::Write, + { + const full: usize = 4; + const half: usize = 2; + let i = &vec![b' '; depth * full]; + let iandhalf = &vec![b' '; depth * full + half]; + let (i, iandhalf) = unsafe { + ( + std::str::from_utf8_unchecked(i), + std::str::from_utf8_unchecked(iandhalf), + ) + }; + if is_newline { + write!(&mut w, "{}", i); + } + match p { + Pretty::Single { + r#type, + length, + val, + trailer, + } => write!(&mut w, "{} {}{}", r#type, val, trailer), + 
Pretty::Tag { + r#type, + length, + key, + inner, + val, + } => { + write!(&mut w, "{} {} {}", r#type, key, inner)?; + Self::go::<W>(&mut w, val, depth, false) + } + // if the length is 0 or 1, we print on one line, + // only if there’s more than one element we split the resulting value. + // we never break lines on arbitrary column sizes, since that is just silly. + Pretty::Multi { + r#type, + length, + vals, + trailer, + } => match vals.len() { + 0 => write!(&mut w, "{} {}", r#type, trailer), + 1 => { + write!(&mut w, "{} ", r#type); + Self::go::<W>(&mut w, &vals[0], depth, false)?; + write!(&mut w, "{}", trailer) + } + more => { + write!(&mut w, "\n{}{} \n", iandhalf, r#type)?; + for v in vals { + Self::go::<W>(&mut w, v, depth + 1, true)?; + write!(&mut w, "\n")?; + } + write!(&mut w, "{}{}", iandhalf, trailer) + } + }, + } + } +} |