diff options
Diffstat (limited to 'users/sterni/nix/utf8')
-rw-r--r-- | users/sterni/nix/utf8/default.nix | 313 | ||||
-rw-r--r-- | users/sterni/nix/utf8/tests/default.nix | 141 |
2 files changed, 454 insertions, 0 deletions
diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix new file mode 100644 index 000000000000..270da934b6a6 --- /dev/null +++ b/users/sterni/nix/utf8/default.nix @@ -0,0 +1,313 @@ +{ depot, lib, ... }: + +let + + inherit (depot.users.sterni.nix) + char + flow + fun + int + string + util + ; + + /* (Internal) function to determine the amount + bytes left in a UTF-8 byte sequence from the + first byte. + + This function will throw if the given first + byte is ill-formed, but will not detect all + cases of ill-formed-ness. + + Based on table 3-6. from The Unicode Standard, + Version 13.0, section 3.9. + + Type: integer -> integer + */ + byteCount = i: flow.cond [ + [ (int.bitAnd i 128 == 0) 1 ] + [ (int.bitAnd i 224 == 192) 2 ] + [ (int.bitAnd i 240 == 224) 3 ] + [ (int.bitAnd i 248 == 240) 4 ] + [ true (builtins.throw "Ill-formed first byte ${int.toHex i}") ] + ]; + + /* (Internal) function to check if a given byte in + an UTF-8 byte sequence is well-formed. + + Based on table 3-7. from The Unicode Standard, + Version 13.0, section 3.9. + + Type: integer -> integer -> integer -> bool + */ + wellFormedByte = + # first byte's integer value + first: + # byte position as an index starting with 0 + pos: + let + defaultRange = int.inRange 128 191; + + secondBytePredicate = flow.switch first [ + [ (int.inRange 194 223) defaultRange ] # C2..DF + [ 224 (int.inRange 160 191) ] # E0 + [ (int.inRange 225 236) defaultRange ] # E1..EC + [ 237 (int.inRange 128 159) ] # ED + [ (int.inRange 238 239) defaultRange ] # EE..EF + [ 240 (int.inRange 144 191) ] # F0 + [ (int.inRange 241 243) defaultRange ] # F1..F3 + [ 244 (int.inRange 128 143) ] # F4 + [ (fun.const true) null ] + ]; + + firstBytePredicate = byte: assert first == byte; + first < 128 || secondBytePredicate != null; + in + # Either ASCII or in one of the byte ranges of Table 3-6. + if pos == 0 then firstBytePredicate + # return predicate according to Table 3-6. + else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate + # 3rd and 4th byte have only one validity rule + else defaultRange; + + /* Iteration step for decoding an UTF-8 byte sequence. + It decodes incrementally, i. e. it has to be fed + one byte at a time and then returns either a + new state or a final result. + + If the resulting attribute set contains the attribute + result, it is finished and the decoded codepoint is + contained in that attribute. In all other cases, + pass the returned set to step again along with + a new byte. The initial state to pass is the empty + set. + + Extra attributes are always passed through, so you + can pass extra state. Be sure not to use result, + pos, code, first or count. + + This function will throw with a fairly detailed + message if it encounters ill-formed bytes. + + The implementation is based on The Unicode Standard, + Version 13.0, section 3.9, especially table 3-6. + + Type: { ... } -> string -> ({ result :: integer, ... } | { ... }) + + Example: utf8.step {} "f" + => { result = 102; } + */ + step = { pos ? 0, code ? 0, ... }@args: byte: + let + value = char.ord byte; + # first byte is context for well-formed-ness + first = args.first or value; + count = args.count or (byteCount first); + newCode = + if count == 1 + then int.bitAnd 127 first # ascii character + else # multi byte UTF-8 sequence + let + # Calculate the bitmask for extracting the + # codepoint data in the current byte. + # If the codepoint is not ASCII, the bits + # used for codepoint data differ depending + # on the byte position and overall byte + # count. The first byte always ignores + # the (count + 1) most significant bits. + # For all subsequent bytes, the 2 most + # significant bits need to be ignored. + # See also table 3-6. + mask = + if pos == 0 + then int.exp 2 (8 - (count + 1)) - 1 + else 63; + # UTF-8 uses the 6 least significant bits in all + # subsequent bytes after the first one. Therefore + # We can determine the amount we need to shift + # the current value by the amount of bytes left. + offset = (count - (pos + 1)) * 6; + in + code + (int.bitShiftL (int.bitAnd mask value) offset); + illFormedMsg = + "Ill-formed byte ${int.toHex value} at position ${toString pos} in ${toString count} byte UTF-8 sequence"; + in + if !(wellFormedByte first pos value) then builtins.throw illFormedMsg + else if pos + 1 == count + then (builtins.removeAttrs args [ # allow extra state being passed through + "count" + "code" + "pos" + "first" + ]) // { result = newCode; } + else (builtins.removeAttrs args [ "result" ]) // { + inherit count first; + code = newCode; + pos = pos + 1; + }; + + /* Decode an UTF-8 string into a list of codepoints. + + Throws if the string is ill-formed UTF-8. + + Type: string -> [ integer ] + */ + # TODO(sterni): option to fallback to replacement char instead of failure + decode = s: + let + stringLength = builtins.stringLength s; + iterResult = builtins.genericClosure { + startSet = [ + { + key = "start"; + stringIndex = -1; + state = {}; + codepoint = null; + } + ]; + operator = { state, stringIndex, ... }: + let + # updated values for current iteration step + newIndex = stringIndex + 1; + newState = step state (builtins.substring newIndex 1 s); + in lib.optional (newIndex < stringLength) { + # unique keys to make genericClosure happy + key = toString newIndex; + # carryover state for the next step + stringIndex = newIndex; + state = newState; + # actual payload for later, steps with value null are filtered out + codepoint = newState.result or null; + }; + }; + in + # extract all steps that yield a code point into a list + builtins.map (v: v.codepoint) ( + builtins.filter ( + { codepoint, stringIndex, state, ... }: + + let + # error message in case we are missing bytes at the end of input + earlyEndMsg = + if state ? count && state ? pos + then "Missing ${toString (with state; count - pos)} bytes at end of input" + else "Unexpected end of input"; + in + + # filter out all iteration steps without a codepoint value + codepoint != null + # if we are at the iteration step of a non-empty input string, throw + # an error if no codepoint was returned, as it indicates an incomplete + # UTF-8 sequence. + || (stringLength > 0 && stringIndex == stringLength - 1 && throw earlyEndMsg) + + ) iterResult + ); + + /* Pretty prints a Unicode codepoint in the U+<HEX> notation. + + Type: integer -> string + */ + formatCodepoint = cp: "U+" + string.fit { + width = 4; + char = "0"; + } (int.toHex cp); + + encodeCodepoint = cp: + let + # Find the amount of bytes needed to encode the given codepoint. + # Note that this doesn't check if the Unicode codepoint is allowed, + # but rather allows all theoretically UTF-8-encodeable ones. + count = flow.switch cp [ + [ (int.inRange 0 127) 1 ] # 00000000 0xxxxxxx + [ (int.inRange 128 2047) 2 ] # 00000yyy yyxxxxxx + [ (int.inRange 2048 65535) 3 ] # zzzzyyyy yyxxxxxx + [ (int.inRange 65536 1114111) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx, + # capped at U+10FFFF + + [ (fun.const true) (builtins.throw invalidCodepointMsg) ] + ]; + + invalidCodepointMsg = "${formatCodepoint cp} is not a Unicode codepoint"; + + # Extract the bit ranges x, y, z and u from the given codepoint + # according to Table 3-6. from The Unicode Standard, Version 13.0, + # section 3.9. u is split into uh and ul since they are used in + # different bytes in the end. + components = lib.mapAttrs (_: { mask, offset }: + int.bitAnd (int.bitShiftR cp offset) mask + ) { + x = { + mask = if count > 1 then 63 else 127; + offset = 0; + }; + y = { + mask = if count > 2 then 63 else 31; + offset = 6; + }; + z = { + mask = 15; + offset = 12; + }; + # u which belongs into the second byte + ul = { + mask = 3; + offset = 16; + }; + # u which belongs into the first byte + uh = { + mask = 7; + offset = 18; + }; + }; + inherit (components) x y z ul uh; + + # Finally construct the byte sequence for the given codepoint. This is + # usually done by using the component and adding a few bits as a prefix + # which depends on the length of the sequence. The longer the sequence, + # the further back each component is pushed. To simplify this, we + # always construct a 4 element list and take the last `count` elements. + # Thanks to laziness the bogus values created by this are never evaluated. + # + # Based on table 3-6. from The Unicode Standard, + # Version 13.0, section 3.9. + bytes = lib.sublist (4 - count) count [ + # 11110uuu + (uh + 240) + # 10uuzzzz or 1110zzzz + (z + (if count > 3 then 128 + int.bitShiftL ul 4 else 224)) + # 10yyyyyy or 110yyyyy + (y + (if count > 2 then 128 else 192)) + # 10xxxxxx or 0xxxxxxx + (x + (if count > 1 then 128 else 0)) + ]; + + firstByte = builtins.head bytes; + + unableToEncodeMessage = "Can't encode ${formatCodepoint cp} as UTF-8"; + + in string.fromBytes ( + builtins.genList (i: + let + byte = builtins.elemAt bytes i; + in + if wellFormedByte firstByte i byte + then byte + else builtins.throw unableToEncodeMessage + ) count + ); + + /* Encode a list of Unicode codepoints into an UTF-8 string. + + Type: [ integer ] -> string + */ + encode = lib.concatMapStrings encodeCodepoint; + +in { + inherit + encode + decode + step + formatCodepoint + ; +} diff --git a/users/sterni/nix/utf8/tests/default.nix b/users/sterni/nix/utf8/tests/default.nix new file mode 100644 index 000000000000..ddcd34208a6d --- /dev/null +++ b/users/sterni/nix/utf8/tests/default.nix @@ -0,0 +1,141 @@ +{ depot, pkgs, lib, ... }: + +let + + inherit (pkgs) + runCommandLocal + ; + + inherit (depot.nix.runTestsuite) + runTestsuite + it + assertEq + assertThrows + assertDoesNotThrow + ; + + inherit (depot.nix.writers) + rustSimple + ; + + inherit (depot.users.sterni.nix) + int + utf8 + string + char + ; + + rustDecoder = rustSimple { + name = "utf8-decode"; + } '' + use std::io::{self, Read}; + fn main() -> std::io::Result<()> { + let mut buffer = String::new(); + io::stdin().read_to_string(&mut buffer)?; + + print!("[ "); + + for c in buffer.chars() { + print!("{} ", u32::from(c)); + } + + print!("]"); + + Ok(()) + } + ''; + + rustDecode = s: + let + expr = runCommandLocal "${s}-decoded" {} '' + printf '%s' ${lib.escapeShellArg s} | ${rustDecoder} > $out + ''; + in import expr; + + hexDecode = l: + utf8.decode (string.fromBytes (builtins.map int.fromHex l)); + + hexEncode = l: utf8.encode (builtins.map int.fromHex l); + + testFailures = it "checks UTF-8 decoding failures" ([ + (assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ])) + # examples from The Unicode Standard + (assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ])) + (assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ])) + (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ]) + (assertThrows "Codepoint out of range: 0xFFFFFF" (hexEncode [ "FFFFFF" ])) + (assertThrows "Codepoint out of range: -0x02" (hexEncode [ "-02" ])) + ] ++ builtins.genList (i: + let + cp = i + int.fromHex "D800"; + in + assertThrows "Can't encode UTF-16 reserved characters: ${utf8.formatCodepoint cp}" + (utf8.encode [ cp ]) + ) (int.fromHex "07FF")); + + testAscii = it "checks decoding of ascii strings" + (builtins.map (s: assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\"" + (string.toBytes s) (utf8.decode s)) [ + "foo bar" + "hello\nworld" + "carriage\r\nreturn" + "1238398494829304 []<><>({})[]!!)" + (string.take 127 char.allChars) + ]); + + randomUnicode = [ + "" # empty string should yield empty list + "🥰👨👨👧👦🐈⬛👩🏽🦰" + # https://kermitproject.org/utf8.html + "ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ" + "An preost wes on leoden, Laȝamon was ihoten" + "Sîne klâwen durh die wolken sint geslagen," + "Τὴ γλῶσσα μοῦ ἔδωσαν ἑλληνικὴ" + "На берегу пустынных волн" + "ვეპხის ტყაოსანი შოთა რუსთაველი" + "யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம், " + "ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು " + ]; + + # https://kermitproject.org/utf8.html + glassSentences = [ + "Euro Symbol: €." + "Greek: Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα." + "Íslenska / Icelandic: Ég get etið gler án þess að meiða mig." + "Polish: Mogę jeść szkło, i mi nie szkodzi." + "Romanian: Pot să mănânc sticlă și ea nu mă rănește." + "Ukrainian: Я можу їсти шкло, й воно мені не пошкодить." + "Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։" + "Georgian: მინას ვჭამ და არა მტკივა." + "Hindi: मैं काँच खा सकता हूँ, मुझे उस से कोई पीडा नहीं होती." + "Hebrew(2): אני יכול לאכול זכוכית וזה לא מזיק לי." + "Yiddish(2): איך קען עסן גלאָז און עס טוט מיר נישט װײ." + "Arabic(2): أنا قادر على أكل الزجاج و هذا لا يؤلمني." + "Japanese: 私はガラスを食べられます。それは私を傷つけません。" + "Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ " + ]; + + testDecoding = it "checks decoding of UTF-8 strings against Rust's String" + (builtins.map + (s: assertEq "Decoding of “${s}” is correct" (utf8.decode s) (rustDecode s)) + (lib.flatten [ + glassSentences + randomUnicode + ])); + + testDecodingEncoding = it "checks that decoding and then encoding forms an identity" + (builtins.map + (s: assertEq "Decoding and then encoding “${s}” yields itself" + (utf8.encode (utf8.decode s)) s) + (lib.flatten [ + glassSentences + randomUnicode + ])); + +in + runTestsuite "nix.utf8" [ + testFailures + testAscii + testDecoding + testDecodingEncoding + ] |