diff options
author | sterni <sternenseemann@systemli.org> | 2021-03-04T21·22+0100 |
---|---|---|
committer | sterni <sternenseemann@systemli.org> | 2021-03-05T11·07+0000 |
commit | b810c46a45c0bbf52b8a896d6d5d37f79f027d9f (patch) | |
tree | 60241864c4abb42dd8e61e32da5f0421faddb84f | |
parent | 5ae1d3fd7b363b6aab71b94626055ea91675f90d (diff) |
feat(users/sterni/nix/utf8): pure nix utf-8 decoder r/2270
users.sterni.nix.utf8 implements UTF-8 decoding in pure nix. We implement the decoding as a simple state machine which is fed one byte at a time. Decoding whole strings is possible by subsequently calling step. This is done in decode which uses builtins.foldl' to get around recursion restrictions and a neat trick using builtins.deepSeq puck showed me limiting the size of the thunks in a foldl' (which can also cause a stack overflow). This makes decoding arbitrarily large UTF-8 files into codepoints using nix theoretically possible, but it is not really practical: Decoding a 36KB LaTeX file I had lying around takes ~160s on my laptop. Change-Id: Iab8c973dac89074ec280b4880a7408e0b3d19bc7 Reviewed-on: https://cl.tvl.fyi/c/depot/+/2590 Tested-by: BuildkiteCI Reviewed-by: sterni <sternenseemann@systemli.org>
-rw-r--r-- | users/sterni/nix/int/default.nix | 3 | ||||
-rw-r--r-- | users/sterni/nix/utf8/default.nix | 208 | ||||
-rw-r--r-- | users/sterni/nix/utf8/tests/default.nix | 121 |
3 files changed, 332 insertions, 0 deletions
diff --git a/users/sterni/nix/int/default.nix b/users/sterni/nix/int/default.nix index cd46fe7864d1..b3157571272f 100644 --- a/users/sterni/nix/int/default.nix +++ b/users/sterni/nix/int/default.nix @@ -97,6 +97,8 @@ let # i. e. they truncate towards 0 mod = a: b: let res = a / b; in a - (res * b); + inRange = a: b: x: x >= a && x <= b; + in { inherit maxBound @@ -117,5 +119,6 @@ in { bitXor toHex fromHex + inRange ; } diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix new file mode 100644 index 000000000000..713f1f57cbe6 --- /dev/null +++ b/users/sterni/nix/utf8/default.nix @@ -0,0 +1,208 @@ +{ depot, lib, ... }: + +let + + # TODO(sterni): encode + + inherit (depot.users.sterni.nix) + char + flow + fun + int + string + util + ; + + /* (Internal) function to determine the amount + bytes left in a UTF-8 byte sequence from the + first byte. + + This function will throw if the given first + byte is ill-formed, but will not detect all + cases of ill-formed-ness. + + Based on table 3-6. from The Unicode Standard, + Version 13.0, section 3.9. + + Type: integer -> integer + */ + byteCount = i: flow.cond [ + [ (int.bitAnd i 128 == 0) 1 ] + [ (int.bitAnd i 224 == 192) 2 ] + [ (int.bitAnd i 240 == 224) 3 ] + [ (int.bitAnd i 248 == 240) 4 ] + [ true (builtins.throw "Ill-formed first byte ${int.toHex i}") ] + ]; + + /* (Internal) function to check if a given byte in + an UTF-8 byte sequence is well-formed. + + Based on table 3-7. from The Unicode Standard, + Version 13.0, section 3.9. + + Throws if the first byte is invalid. + + Type: integer -> integer -> (integer -> bool) + */ + wellFormedByte = + # first byte's integer value + first: + # byte position as an index starting with 0 + pos: + let + defaultRange = int.inRange 128 191; + in + # The first byte is either ASCII which requires no checks + # or we automatically check it when we check the subsequent + # bytes. The downside is that this may generate bad error + # messages in very rare cases. + if pos == 0 + then lib.const true + else if pos > 1 # 3rd and 4th byte have only one validity rule + then defaultRange + else assert pos == 1; flow.switch first [ + [ (int.inRange 194 223) defaultRange ] # C2..DF + [ 224 (int.inRange 160 191) ] # E0 + [ (int.inRange 225 236) defaultRange ] # E1..EC + [ 237 (int.inRange 128 159) ] # ED + [ (int.inRange 238 239) defaultRange ] # EE..EF + [ 240 (int.inRange 144 191) ] # F0 + [ (int.inRange 241 243) defaultRange ] # F1..F3 + [ 244 (int.inRange 128 143) ] # F4 + [ + (fun.const true) + (builtins.throw "Invalid first byte ${int.toHex first}") + ] + ]; + + /* Iteration step for decoding an UTF-8 byte sequence. + It decodes incrementally, i. e. it has to be fed + one byte at a time and then returns either a + new state or a final result. + + If the resulting attribute set contains the attribute + result, it is finished and the decoded codepoint is + contained in that attribute. In all other cases, + pass the returned set to step again along with + a new byte. The initial state to pass is the empty + set. + + Extra attributes are always passed through, so you + can pass extra state. Be sure not to use result, + pos, code, first or count. + + This function will throw with a fairly detailed + message if it encounters ill-formed bytes. + + The implementation is based on The Unicode Standard, + Version 13.0, section 3.9, especially table 3-6. + + Type: { ... } -> string -> ({ result :: integer, ... } | { ... }) + + Example: utf8.step {} "f" + => { result = 102; } + */ + step = { pos ? 0, code ? 0, ... }@args: byte: + let + value = char.ord byte; + # first byte is context for well-formed-ness + first = args.first or value; + count = args.count or (byteCount first); + newCode = + if count == 1 + then int.bitAnd 127 first # ascii character + else # multi byte UTF-8 sequence + let + # Calculate the bitmask for extracting the + # codepoint data in the current byte. + # If the codepoint is not ASCII, the bits + # used for codepoint data differ depending + # on the byte position and overall byte + # count. The first byte always ignores + # the (count + 1) most significant bits. + # For all subsequent bytes, the 2 most + # significant bits need to be ignored. + # See also table 3-6. + mask = + if pos == 0 + then int.exp 2 (8 - (count + 1)) - 1 + else 63; + # UTF-8 uses the 6 least significant bits in all + # subsequent bytes after the first one. Therefore + # We can determine the amount we need to shift + # the current value by the amount of bytes left. + offset = (count - (pos + 1)) * 6; + in + code + (int.bitShiftL (int.bitAnd mask value) offset); + illFormedMsg = + "Ill-formed byte ${int.toHex value} at position ${toString pos} in ${toString count} byte UTF-8 sequence"; + in + if !(wellFormedByte first pos value) then builtins.throw illFormedMsg + else if pos + 1 == count + then (builtins.removeAttrs args [ # allow extra state being passed through + "count" + "code" + "pos" + "first" + ]) // { result = newCode; } + else (builtins.removeAttrs args [ "result" ]) // { + inherit count first; + code = newCode; + pos = pos + 1; + }; + + /* Decode an UTF-8 string into a list of codepoints. + + Throws if the string is ill-formed UTF-8. + + Type: string -> [ integer ] + */ + # TODO(sterni): option to fallback to replacement char instead of failure + decode = s: + let + iter = { codes ? [], ... }@args: byte: + let + res = step args byte; + in + # foldl' forceValues the calculate value only at the end + # this makes the thunk grow large enough to cause a stack + # overflow with sufficiently large strings. To avoid this + # we always deepSeq the result which also keeps memory + # usage of decode reasonable. + builtins.deepSeq res + (if res ? result + then res // { + codes = codes ++ [ res.result ]; + } + else res); + iterResult = + builtins.foldl' iter {} (string.toChars s); + earlyEndMsg = + if iterResult ? count && iterResult ? pos + then "Missing ${toString (with iterResult; count - pos)} bytes at end of input" + else "Unexpected end of input"; + in + if iterResult ? result + then iterResult.codes + else builtins.throw earlyEndMsg; + + /* Decodes an UTF-8 string, but doesn't throw on error. + Instead it returns null. + + Type: string -> ( [ integer ] | null) + */ + decodeSafe = s: + let + res = builtins.tryEval (decode s); + in + if res.success + then res.value + else null; + +in { + inherit + decode + decodeSafe + step + ; +} diff --git a/users/sterni/nix/utf8/tests/default.nix b/users/sterni/nix/utf8/tests/default.nix new file mode 100644 index 000000000000..d9d8ae77105d --- /dev/null +++ b/users/sterni/nix/utf8/tests/default.nix @@ -0,0 +1,121 @@ +{ depot, lib, ... }: + +let + + inherit (depot.third_party) + runCommandLocal + ; + + inherit (depot.nix.runTestsuite) + runTestsuite + it + assertEq + assertThrows + assertDoesNotThrow + ; + + inherit (depot.users.Profpatsch.writers) + rustSimple + ; + + inherit (depot.users.sterni.nix) + int + utf8 + string + char + ; + + rustDecoder = rustSimple { + name = "utf8-decode"; + } '' + use std::io::{self, Read}; + fn main() -> std::io::Result<()> { + let mut buffer = String::new(); + io::stdin().read_to_string(&mut buffer)?; + + print!("[ "); + + for c in buffer.chars() { + print!("{} ", u32::from(c)); + } + + print!("]"); + + Ok(()) + } + ''; + + rustDecode = s: + let + expr = runCommandLocal "${s}-decoded" {} '' + printf '%s' ${lib.escapeShellArg s} | ${rustDecoder} > $out + ''; + in import expr; + + hexDecode = l: + utf8.decode (string.fromBytes (builtins.map int.fromHex l)); + + testFailures = it "checks UTF-8 decoding failures" [ + (assertThrows "emtpy bytestring throws" (utf8.decode "")) + (assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ])) + # examples from The Unicode Standard + (assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ])) + (assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ])) + (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ]) + ]; + + testAscii = it "checks decoding of ascii strings" + (builtins.map (s: assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\"" + (string.toBytes s) (utf8.decode s)) [ + "foo bar" + "hello\nworld" + "carriage\r\nreturn" + "1238398494829304 []<><>({})[]!!)" + (string.take 127 char.allChars) + ]); + + randomUnicode = [ + "🥰👨👨👧👦🐈⬛👩🏽🦰" + # https://kermitproject.org/utf8.html + "ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ" + "An preost wes on leoden, Laȝamon was ihoten" + "Sîne klâwen durh die wolken sint geslagen," + "Τὴ γλῶσσα μοῦ ἔδωσαν ἑλληνικὴ" + "На берегу пустынных волн" + "ვეპხის ტყაოსანი შოთა რუსთაველი" + "யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம், " + "ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು " + ]; + + # https://kermitproject.org/utf8.html + glassSentences = [ + "Euro Symbol: €." + "Greek: Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα." + "Íslenska / Icelandic: Ég get etið gler án þess að meiða mig." + "Polish: Mogę jeść szkło, i mi nie szkodzi." + "Romanian: Pot să mănânc sticlă și ea nu mă rănește." + "Ukrainian: Я можу їсти шкло, й воно мені не пошкодить." + "Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։" + "Georgian: მინას ვჭამ და არა მტკივა." + "Hindi: मैं काँच खा सकता हूँ, मुझे उस से कोई पीडा नहीं होती." + "Hebrew(2): אני יכול לאכול זכוכית וזה לא מזיק לי." + "Yiddish(2): איך קען עסן גלאָז און עס טוט מיר נישט װײ." + "Arabic(2): أنا قادر على أكل الزجاج و هذا لا يؤلمني." + "Japanese: 私はガラスを食べられます。それは私を傷つけません。" + "Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ " + ]; + + testDecoding = it "checks decoding of UTF-8 strings against Rust's String" + (builtins.map + (s: assertEq "Decoding of “${s}” is correct" (utf8.decode s) (rustDecode s)) + (lib.flatten [ + glassSentences + randomUnicode + ])); + +in + runTestsuite "nix.utf8" [ + testFailures + testAscii + testDecoding + ] |