diff options
author | sterni <sternenseemann@systemli.org> | 2021-11-23T18·58+0100 |
---|---|---|
committer | sterni <sternenseemann@systemli.org> | 2021-11-25T11·15+0100 |
commit | 750ef6c6934209262cf6dfc00139ca32d08f442d (patch) | |
tree | c04ca5740b35425bf3b8fe86a490dad374e1140b /users | |
parent | 8dc54f89cdaf2e029230adbd14242ba0db6832ab (diff) |
feat(sterni/nix/utf8): check if codepoint valid/encodeable r/3094
* Enforce the U+0000 to U+10FFFF range in `count` and throw an error if the given codepoint exceeds the range (encoding U+0000 won't work of course, but this is Nix's fault…).
* Check if the produced bytes are well formed and output an error if not. This indicates that the codepoint can't be encoded as UTF-8, like U+D800 which is reserved for UTF-16.

Change-Id: I18336e527484580f28cbfe784d51718ee15c5477
Diffstat (limited to 'users')
-rw-r--r-- | users/sterni/nix/utf8/default.nix | 32 | ||||
-rw-r--r-- | users/sterni/nix/utf8/tests/default.nix | 14 |
2 files changed, 42 insertions, 4 deletions
diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix index 0c6e7d940083..270da934b6a6 100644 --- a/users/sterni/nix/utf8/default.nix +++ b/users/sterni/nix/utf8/default.nix @@ -204,6 +204,15 @@ let ) iterResult ); + /* Pretty prints a Unicode codepoint in the U+<HEX> notation. + + Type: integer -> string + */ + formatCodepoint = cp: "U+" + string.fit { + width = 4; + char = "0"; + } (int.toHex cp); + encodeCodepoint = cp: let # Find the amount of bytes needed to encode the given codepoint. @@ -213,9 +222,14 @@ let [ (int.inRange 0 127) 1 ] # 00000000 0xxxxxxx [ (int.inRange 128 2047) 2 ] # 00000yyy yyxxxxxx [ (int.inRange 2048 65535) 3 ] # zzzzyyyy yyxxxxxx - [ (int.inRange 65536 2097151) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx + [ (int.inRange 65536 1114111) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx, + # capped at U+10FFFF + + [ (fun.const true) (builtins.throw invalidCodepointMsg) ] ]; + invalidCodepointMsg = "${formatCodepoint cp} is not a Unicode codepoint"; + # Extract the bit ranges x, y, z and u from the given codepoint # according to Table 3-6. from The Unicode Standard, Version 13.0, # section 3.9. u is split into uh and ul since they are used in @@ -268,7 +282,20 @@ let (x + (if count > 1 then 128 else 0)) ]; - in string.fromBytes bytes; + firstByte = builtins.head bytes; + + unableToEncodeMessage = "Can't encode ${formatCodepoint cp} as UTF-8"; + + in string.fromBytes ( + builtins.genList (i: + let + byte = builtins.elemAt bytes i; + in + if wellFormedByte firstByte i byte + then byte + else builtins.throw unableToEncodeMessage + ) count + ); /* Encode a list of Unicode codepoints into an UTF-8 string. 
@@ -281,5 +308,6 @@ in { encode decode step + formatCodepoint ; } diff --git a/users/sterni/nix/utf8/tests/default.nix b/users/sterni/nix/utf8/tests/default.nix index fdc0b067156f..ddcd34208a6d 100644 --- a/users/sterni/nix/utf8/tests/default.nix +++ b/users/sterni/nix/utf8/tests/default.nix @@ -55,13 +55,23 @@ let hexDecode = l: utf8.decode (string.fromBytes (builtins.map int.fromHex l)); - testFailures = it "checks UTF-8 decoding failures" [ + hexEncode = l: utf8.encode (builtins.map int.fromHex l); + + testFailures = it "checks UTF-8 decoding failures" ([ (assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ])) # examples from The Unicode Standard (assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ])) (assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ])) (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ]) - ]; + (assertThrows "Codepoint out of range: 0xFFFFFF" (hexEncode [ "FFFFFF" ])) + (assertThrows "Codepoint out of range: -0x02" (hexEncode [ "-02" ])) + ] ++ builtins.genList (i: + let + cp = i + int.fromHex "D800"; + in + assertThrows "Can't encode UTF-16 reserved characters: ${utf8.formatCodepoint cp}" + (utf8.encode [ cp ]) + ) (int.fromHex "07FF")); testAscii = it "checks decoding of ascii strings" (builtins.map (s: assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\"" |