about summary refs log tree commit diff
path: root/users/sterni/nix/utf8/tests/default.nix
diff options
context:
space:
mode:
author sterni <sternenseemann@systemli.org> 2021-11-23T18·58+0100
committer sterni <sternenseemann@systemli.org> 2021-11-25T11·15+0100
commit 750ef6c6934209262cf6dfc00139ca32d08f442d (patch)
tree c04ca5740b35425bf3b8fe86a490dad374e1140b /users/sterni/nix/utf8/tests/default.nix
parent 8dc54f89cdaf2e029230adbd14242ba0db6832ab (diff)
feat(sterni/nix/utf8): check if codepoint valid/encodeable r/3094
* Enforce the U+0000 to U+10FFFF range in `count` and throw an error if
  the given codepoint exceeds the range (encoding U+0000 won't work of
  course, but this is Nix's fault…).

* Check if the produced bytes are well formed and output an error if
  not. This indicates that the codepoint can't be encoded as UTF-8, like
  U+D800 which is reserved for UTF-16.

Change-Id: I18336e527484580f28cbfe784d51718ee15c5477
Diffstat (limited to '')
-rw-r--r--users/sterni/nix/utf8/tests/default.nix14
1 files changed, 12 insertions, 2 deletions
diff --git a/users/sterni/nix/utf8/tests/default.nix b/users/sterni/nix/utf8/tests/default.nix
index fdc0b06715..ddcd34208a 100644
--- a/users/sterni/nix/utf8/tests/default.nix
+++ b/users/sterni/nix/utf8/tests/default.nix
@@ -55,13 +55,23 @@ let
   hexDecode = l:
     utf8.decode (string.fromBytes (builtins.map int.fromHex l));
 
-  testFailures = it "checks UTF-8 decoding failures" [
+  hexEncode = l: utf8.encode (builtins.map int.fromHex l); # hex strings -> ints -> utf8.encode
+
+  testFailures = it "checks UTF-8 decoding failures" ([
     (assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ]))
     # examples from The Unicode Standard
     (assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ]))
     (assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ]))
     (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ])
-  ];
+    (assertThrows "Codepoint out of range: 0xFFFFFF" (hexEncode [ "FFFFFF" ]))
+    (assertThrows "Codepoint out of range: -0x02" (hexEncode [ "-02" ]))
+  ] ++ builtins.genList (i: # one assertion per surrogate code point, offset from U+D800
+    let
+      cp = i + int.fromHex "D800";
+    in
+      assertThrows "Can't encode UTF-16 reserved characters: ${utf8.formatCodepoint cp}"
+        (utf8.encode [ cp ])
+  ) (int.fromHex "0800")); # genList yields i = 0..n-1, so 0x800 entries cover U+D800..U+DFFF inclusive
 
   testAscii = it "checks decoding of ascii strings"
     (builtins.map (s: assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\""