From 750ef6c6934209262cf6dfc00139ca32d08f442d Mon Sep 17 00:00:00 2001
From: sterni <sternenseemann@systemli.org>
Date: Tue, 23 Nov 2021 19:58:15 +0100
Subject: feat(sterni/nix/utf8): check if codepoint valid/encodeable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Enforce the U+0000 to U+10FFFF range in `count` and throw an error if
  the given codepoint lies outside the range (encoding U+0000 won't work
  of course, but this is Nix's fault…).

* Check if the produced bytes are well formed and output an error if
  not. This indicates that the codepoint can't be encoded as UTF-8, like
  the surrogates U+D800 to U+DFFF, which are reserved for UTF-16.

Change-Id: I18336e527484580f28cbfe784d51718ee15c5477
---
 users/sterni/nix/utf8/default.nix       | 32 ++++++++++++++++++++++++++++++--
 users/sterni/nix/utf8/tests/default.nix | 14 ++++++++++++--
 2 files changed, 42 insertions(+), 4 deletions(-)

(limited to 'users')
diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix
index 0c6e7d940083..270da934b6a6 100644
--- a/users/sterni/nix/utf8/default.nix
+++ b/users/sterni/nix/utf8/default.nix
@@ -204,6 +204,15 @@ let
       ) iterResult
     );
 
+  /* Pretty prints a Unicode codepoint in the U+<HEX> notation.
+
+     Type: integer -> string
+  */
+  formatCodepoint = cp: "U+" + string.fit {
+    width = 4;   # presumably the minimum number of hex digits — confirm against string.fit
+    char = "0";  # fill character handed to string.fit
+  } (int.toHex cp); # hex rendering of cp is produced by int.toHex before fitting
+
   encodeCodepoint = cp:
     let
       # Find the amount of bytes needed to encode the given codepoint.
@@ -213,9 +222,14 @@ let
         [ (int.inRange 0 127)         1 ] # 00000000 0xxxxxxx
         [ (int.inRange 128 2047)      2 ] # 00000yyy yyxxxxxx
         [ (int.inRange 2048 65535)    3 ] # zzzzyyyy yyxxxxxx
-        [ (int.inRange 65536 2097151) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx
+        [ (int.inRange 65536 1114111) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx,
+                                          # capped at U+10FFFF
+
+        [ (fun.const true) (builtins.throw invalidCodepointMsg) ]
       ];
 
+      invalidCodepointMsg = "${formatCodepoint cp} is not a Unicode codepoint";
+
       # Extract the bit ranges x, y, z and u from the given codepoint
       # according to Table 3-6. from The Unicode Standard, Version 13.0,
       # section 3.9. u is split into uh and ul since they are used in
@@ -268,7 +282,20 @@ let
         (x + (if count > 1 then 128 else 0))
       ];
 
-    in string.fromBytes bytes;
+      firstByte = builtins.head bytes;
+
+      unableToEncodeMessage = "Can't encode ${formatCodepoint cp} as UTF-8";
+
+    in string.fromBytes (
+      builtins.genList (i:
+        let
+          byte = builtins.elemAt bytes i;
+        in
+          if wellFormedByte firstByte i byte
+          then byte
+          else builtins.throw unableToEncodeMessage
+      ) count
+    );
 
   /* Encode a list of Unicode codepoints into an UTF-8 string.
 
@@ -281,5 +308,6 @@ in {
     encode
     decode
     step
+    formatCodepoint
     ;
 }
diff --git a/users/sterni/nix/utf8/tests/default.nix b/users/sterni/nix/utf8/tests/default.nix
index fdc0b067156f..ddcd34208a6d 100644
--- a/users/sterni/nix/utf8/tests/default.nix
+++ b/users/sterni/nix/utf8/tests/default.nix
@@ -55,13 +55,23 @@ let
   hexDecode = l:
     utf8.decode (string.fromBytes (builtins.map int.fromHex l));
 
-  testFailures = it "checks UTF-8 decoding failures" [
+  hexEncode = l: utf8.encode (builtins.map int.fromHex l); # hex strings -> integers -> utf8.encode
+
+  testFailures = it "checks UTF-8 decoding failures" ([
     (assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ]))
     # examples from The Unicode Standard
     (assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ]))
     (assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ]))
     (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ])
-  ];
+    (assertThrows "Codepoint out of range: 0xFFFFFF" (hexEncode [ "FFFFFF" ]))
+    (assertThrows "Codepoint out of range: -0x02" (hexEncode [ "-02" ]))
+  ] ++ builtins.genList (i:
+    let
+      cp = i + int.fromHex "D800";
+    in
+      assertThrows "Can't encode UTF-16 reserved characters: ${utf8.formatCodepoint cp}"
+        (utf8.encode [ cp ])
+  ) (int.fromHex "0800")); # 0x800 entries: genList yields i = 0 .. n-1, i.e. U+D800 through U+DFFF inclusive
 
   testAscii = it "checks decoding of ascii strings"
     (builtins.map (s: assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\""
-- 
cgit 1.4.1