From 750ef6c6934209262cf6dfc00139ca32d08f442d Mon Sep 17 00:00:00 2001 From: sterni Date: Tue, 23 Nov 2021 19:58:15 +0100 Subject: feat(sterni/nix/utf8): check if codepoint valid/encodeable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Enforce the U+0000 to U+10FFFF range in `count` and throw an error if the given codepoint exceeds the range (encoding U+0000 won't work of course, but this is Nix's fault…). * Check if the produced bytes are well formed and output an error if not. This indicates that the codepoint can't be encoded as UTF-8, like U+D800 which is reserved for UTF-16. Change-Id: I18336e527484580f28cbfe784d51718ee15c5477 --- users/sterni/nix/utf8/tests/default.nix | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'users/sterni/nix/utf8/tests') diff --git a/users/sterni/nix/utf8/tests/default.nix b/users/sterni/nix/utf8/tests/default.nix index fdc0b067156f..ddcd34208a6d 100644 --- a/users/sterni/nix/utf8/tests/default.nix +++ b/users/sterni/nix/utf8/tests/default.nix @@ -55,13 +55,23 @@ let hexDecode = l: utf8.decode (string.fromBytes (builtins.map int.fromHex l)); - testFailures = it "checks UTF-8 decoding failures" [ + hexEncode = l: utf8.encode (builtins.map int.fromHex l); + + testFailures = it "checks UTF-8 decoding failures" ([ (assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ])) # examples from The Unicode Standard (assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ])) (assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ])) (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ]) - ]; + (assertThrows "Codepoint out of range: 0xFFFFFF" (hexEncode [ "FFFFFF" ])) + (assertThrows "Codepoint out of range: -0x02" (hexEncode [ "-02" ])) + ] ++ builtins.genList (i: + let + cp = i + int.fromHex "D800"; + in + assertThrows "Can't encode UTF-16 reserved characters: ${utf8.formatCodepoint cp}" + (utf8.encode [ cp ]) + ) (int.fromHex "07FF")); testAscii = it "checks decoding of ascii strings" (builtins.map (s: assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\"" -- cgit 1.4.1