about summary refs log tree commit diff
diff options
context:
space:
mode:
authorsterni <sternenseemann@systemli.org>2021-03-04T21·22+0100
committersterni <sternenseemann@systemli.org>2021-03-05T11·07+0000
commitb810c46a45c0bbf52b8a896d6d5d37f79f027d9f (patch)
tree60241864c4abb42dd8e61e32da5f0421faddb84f
parent5ae1d3fd7b363b6aab71b94626055ea91675f90d (diff)
feat(users/sterni/nix/utf8): pure nix utf-8 decoder r/2270
users.sterni.nix.utf8 implements UTF-8 decoding in pure nix. We
implement the decoding as a simple state machine which is fed one byte
at a time. Decoding whole strings is possible by subsequently calling
step. This is done in decode which uses builtins.foldl' to get around
recursion restrictions and a neat trick using builtins.deepSeq puck
showed me limiting the size of the thunks in a foldl' (which can also
cause a stack overflow).

This makes decoding arbitrarily large UTF-8 files into codepoints using
nix theoretically possible, but it is not really practical: Decoding a
36KB LaTeX file I had lying around takes ~160s on my laptop.

Change-Id: Iab8c973dac89074ec280b4880a7408e0b3d19bc7
Reviewed-on: https://cl.tvl.fyi/c/depot/+/2590
Tested-by: BuildkiteCI
Reviewed-by: sterni <sternenseemann@systemli.org>
-rw-r--r--users/sterni/nix/int/default.nix3
-rw-r--r--users/sterni/nix/utf8/default.nix208
-rw-r--r--users/sterni/nix/utf8/tests/default.nix121
3 files changed, 332 insertions, 0 deletions
diff --git a/users/sterni/nix/int/default.nix b/users/sterni/nix/int/default.nix
index cd46fe7864..b315757127 100644
--- a/users/sterni/nix/int/default.nix
+++ b/users/sterni/nix/int/default.nix
@@ -97,6 +97,8 @@ let
   # i. e. they truncate towards 0
   mod = a: b: let res = a / b; in a - (res * b);
 
+  inRange = a: b: x: x >= a && x <= b;
+
 in {
   inherit
     maxBound
@@ -117,5 +119,6 @@ in {
     bitXor
     toHex
     fromHex
+    inRange
     ;
 }
diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix
new file mode 100644
index 0000000000..713f1f57cb
--- /dev/null
+++ b/users/sterni/nix/utf8/default.nix
@@ -0,0 +1,208 @@
+{ depot, lib, ... }:
+
+let
+
+  # TODO(sterni): encode
+
+  inherit (depot.users.sterni.nix)
+    char
+    flow
+    fun
+    int
+    string
+    util
+    ;
+
+  /* (Internal) function to determine the amount
+     bytes left in a UTF-8 byte sequence from the
+     first byte.
+
+     This function will throw if the given first
+     byte is ill-formed, but will not detect all
+     cases of ill-formed-ness.
+
+     Based on table 3-6. from The Unicode Standard,
+     Version 13.0, section 3.9.
+
+     Type: integer -> integer
+  */
+  byteCount = i: flow.cond [
+    [ (int.bitAnd i 128 == 0)   1 ]
+    [ (int.bitAnd i 224 == 192) 2 ]
+    [ (int.bitAnd i 240 == 224) 3 ]
+    [ (int.bitAnd i 248 == 240) 4 ]
+    [ true (builtins.throw "Ill-formed first byte ${int.toHex i}") ]
+  ];
+
+  /* (Internal) function to check if a given byte in
+     an UTF-8 byte sequence is well-formed.
+
+     Based on table 3-7. from The Unicode Standard,
+     Version 13.0, section 3.9.
+
+     Throws if the first byte is invalid.
+
+     Type: integer -> integer -> (integer -> bool)
+  */
+  wellFormedByte =
+    # first byte's integer value
+    first:
+    # byte position as an index starting with 0
+    pos:
+      let
+        defaultRange = int.inRange 128 191;
+      in
+        # The first byte is either ASCII which requires no checks
+        # or we automatically check it when we check the subsequent
+        # bytes. The downside is that this may generate bad error
+        # messages in very rare cases.
+        if pos == 0
+        then lib.const true
+        else if pos > 1 # 3rd and 4th byte have only one validity rule
+        then defaultRange
+        else assert pos == 1; flow.switch first [
+          [ (int.inRange 194 223) defaultRange          ] # C2..DF
+          [ 224                   (int.inRange 160 191) ] # E0
+          [ (int.inRange 225 236) defaultRange          ] # E1..EC
+          [ 237                   (int.inRange 128 159) ] # ED
+          [ (int.inRange 238 239) defaultRange          ] # EE..EF
+          [ 240                   (int.inRange 144 191) ] # F0
+          [ (int.inRange 241 243) defaultRange          ] # F1..F3
+          [ 244                   (int.inRange 128 143) ] # F4
+          [
+            (fun.const true)
+            (builtins.throw "Invalid first byte ${int.toHex first}")
+          ]
+        ];
+
+  /* Iteration step for decoding an UTF-8 byte sequence.
+     It decodes incrementally, i. e. it has to be fed
+     one byte at a time and then returns either a
+     new state or a final result.
+
+     If the resulting attribute set contains the attribute
+     result, it is finished and the decoded codepoint is
+     contained in that attribute. In all other cases,
+     pass the returned set to step again along with
+     a new byte. The initial state to pass is the empty
+     set.
+
+     Extra attributes are always passed through, so you
+     can pass extra state. Be sure not to use result,
+     pos, code, first or count.
+
+     This function will throw with a fairly detailed
+     message if it encounters ill-formed bytes.
+
+     The implementation is based on The Unicode Standard,
+     Version 13.0, section 3.9, especially table 3-6.
+
+     Type: { ... } -> string -> ({ result :: integer, ... } | { ... })
+
+     Example: utf8.step {} "f"
+     => { result = 102; }
+  */
+  step = { pos ? 0, code ? 0, ... }@args: byte:
+    let
+      value = char.ord byte;
+      # first byte is context for well-formed-ness
+      first = args.first or value;
+      count = args.count or (byteCount first);
+      newCode =
+        if count == 1
+        then int.bitAnd 127 first # ascii character
+        else # multi byte UTF-8 sequence
+          let
+            # Calculate the bitmask for extracting the
+            # codepoint data in the current byte.
+            # If the codepoint is not ASCII, the bits
+            # used for codepoint data differ depending
+            # on the byte position and overall byte
+            # count. The first byte always ignores
+            # the (count + 1) most significant bits.
+            # For all subsequent bytes, the 2 most
+            # significant bits need to be ignored.
+            # See also table 3-6.
+            mask =
+              if pos == 0
+              then int.exp 2 (8 - (count + 1)) - 1
+              else 63;
+            # UTF-8 uses the 6 least significant bits in all
+            # subsequent bytes after the first one. Therefore
+            # We can determine the amount we need to shift
+            # the current value by the amount of bytes left.
+            offset = (count - (pos + 1)) * 6;
+          in
+            code + (int.bitShiftL (int.bitAnd mask value) offset);
+      illFormedMsg =
+        "Ill-formed byte ${int.toHex value} at position ${toString pos} in ${toString count} byte UTF-8 sequence";
+    in
+      if !(wellFormedByte first pos value) then builtins.throw illFormedMsg
+      else if pos + 1 == count
+      then (builtins.removeAttrs args [ # allow extra state being passed through
+        "count"
+        "code"
+        "pos"
+        "first"
+      ]) // { result = newCode; }
+      else (builtins.removeAttrs args [ "result" ]) // {
+        inherit count first;
+        code = newCode;
+        pos  = pos + 1;
+      };
+
+  /* Decode an UTF-8 string into a list of codepoints.
+
+     Throws if the string is ill-formed UTF-8.
+
+     Type: string -> [ integer ]
+  */
+  # TODO(sterni): option to fallback to replacement char instead of failure
+  decode = s:
+    let
+      iter = { codes ? [], ... }@args: byte:
+        let
+          res = step args byte;
+        in
+          # foldl' forceValues the calculate value only at the end
+          # this makes the thunk grow large enough to cause a stack
+          # overflow with sufficiently large strings. To avoid this
+          # we always deepSeq the result which also keeps memory
+          # usage of decode reasonable.
+          builtins.deepSeq res
+            (if res ? result
+            then res // {
+              codes = codes ++ [ res.result ];
+            }
+            else res);
+      iterResult =
+        builtins.foldl' iter {} (string.toChars s);
+      earlyEndMsg =
+        if iterResult ? count && iterResult ? pos
+        then "Missing ${toString (with iterResult; count - pos)} bytes at end of input"
+        else "Unexpected end of input";
+    in
+      if iterResult ? result
+      then iterResult.codes
+      else builtins.throw earlyEndMsg;
+
+  /* Decodes an UTF-8 string, but doesn't throw on error.
+     Instead it returns null.
+
+     Type: string -> ( [ integer ] | null)
+  */
+  decodeSafe = s:
+    let
+      res = builtins.tryEval (decode s);
+    in
+      if res.success
+      then res.value
+      else null;
+
+in {
+  inherit
+    decode
+    decodeSafe
+    step
+    ;
+}
diff --git a/users/sterni/nix/utf8/tests/default.nix b/users/sterni/nix/utf8/tests/default.nix
new file mode 100644
index 0000000000..d9d8ae7710
--- /dev/null
+++ b/users/sterni/nix/utf8/tests/default.nix
@@ -0,0 +1,121 @@
+{ depot, lib, ... }:
+
+let
+
+  inherit (depot.third_party)
+    runCommandLocal
+    ;
+
+  inherit (depot.nix.runTestsuite)
+    runTestsuite
+    it
+    assertEq
+    assertThrows
+    assertDoesNotThrow
+    ;
+
+  inherit (depot.users.Profpatsch.writers)
+    rustSimple
+    ;
+
+  inherit (depot.users.sterni.nix)
+    int
+    utf8
+    string
+    char
+    ;
+
+  rustDecoder = rustSimple {
+    name = "utf8-decode";
+  } ''
+    use std::io::{self, Read};
+    fn main() -> std::io::Result<()> {
+      let mut buffer = String::new();
+      io::stdin().read_to_string(&mut buffer)?;
+
+      print!("[ ");
+
+      for c in buffer.chars() {
+        print!("{} ", u32::from(c));
+      }
+
+      print!("]");
+
+      Ok(())
+    }
+  '';
+
+  rustDecode = s:
+    let
+      expr = runCommandLocal "${s}-decoded" {} ''
+        printf '%s' ${lib.escapeShellArg s} | ${rustDecoder} > $out
+      '';
+    in import expr;
+
+  hexDecode = l:
+    utf8.decode (string.fromBytes (builtins.map int.fromHex l));
+
+  testFailures = it "checks UTF-8 decoding failures" [
+    (assertThrows "emtpy bytestring throws" (utf8.decode ""))
+    (assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ]))
+    # examples from The Unicode Standard
+    (assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ]))
+    (assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ]))
+    (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ])
+  ];
+
+  testAscii = it "checks decoding of ascii strings"
+    (builtins.map (s: assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\""
+      (string.toBytes s) (utf8.decode s)) [
+        "foo bar"
+        "hello\nworld"
+        "carriage\r\nreturn"
+        "1238398494829304 []<><>({})[]!!)"
+        (string.take 127 char.allChars)
+      ]);
+
+  randomUnicode = [
+    "🥰👨‍👨‍👧‍👦🐈‍⬛👩🏽‍🦰"
+    # https://kermitproject.org/utf8.html
+    "ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ"
+    "An preost wes on leoden, Laȝamon was ihoten"
+    "Sîne klâwen durh die wolken sint geslagen,"
+    "Τὴ γλῶσσα μοῦ ἔδωσαν ἑλληνικὴ"
+    "На берегу пустынных волн"
+    "ვეპხის ტყაოსანი შოთა რუსთაველი"
+    "யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம், "
+    "ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು "
+  ];
+
+  # https://kermitproject.org/utf8.html
+  glassSentences = [
+    "Euro Symbol: €."
+    "Greek: Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα."
+    "Íslenska / Icelandic: Ég get etið gler án þess að meiða mig."
+    "Polish: Mogę jeść szkło, i mi nie szkodzi."
+    "Romanian: Pot să mănânc sticlă și ea nu mă rănește."
+    "Ukrainian: Я можу їсти шкло, й воно мені не пошкодить."
+    "Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։"
+    "Georgian: მინას ვჭამ და არა მტკივა."
+    "Hindi: मैं काँच खा सकता हूँ, मुझे उस से कोई पीडा नहीं होती."
+    "Hebrew(2): אני יכול לאכול זכוכית וזה לא מזיק לי."
+    "Yiddish(2): איך קען עסן גלאָז און עס טוט מיר נישט װײ."
+    "Arabic(2): أنا قادر على أكل الزجاج و هذا لا يؤلمني."
+    "Japanese: 私はガラスを食べられます。それは私を傷つけません。"
+    "Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ "
+  ];
+
+  testDecoding = it "checks decoding of UTF-8 strings against Rust's String"
+    (builtins.map
+      (s: assertEq "Decoding of “${s}” is correct" (utf8.decode s) (rustDecode s))
+      (lib.flatten [
+        glassSentences
+        randomUnicode
+      ]));
+
+in
+  runTestsuite "nix.utf8" [
+    testFailures
+    testAscii
+    testDecoding
+  ]