about summary refs log tree commit diff
diff options
context:
space:
mode:
authorsterni <sternenseemann@systemli.org>2021-11-23T18·23+0100
committersterni <sternenseemann@systemli.org>2021-11-25T11·15+0100
commit87a0aaa77dd94a5a83e4cc0d00e06528d5ce8edc (patch)
tree30b8cc68cba18b3c06c851af2d095e9d69f0cb23
parent9370ea5e3303359a9c25417b0b8210d174e167e5 (diff)
feat(sterni/nix/utf8): implement UTF-8 encoding r/3091
This implementation is still a bit rough as it doesn't check if the
produced string is valid UTF-8 which may happen if an invalid Unicode
codepoint is passed.

Change-Id: Ibaa91dafa8937142ef704a175efe967b62e3ee7b
-rw-r--r--users/sterni/nix/utf8/default.nix75
-rw-r--r--users/sterni/nix/utf8/tests/default.nix10
2 files changed, 83 insertions, 2 deletions
diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix
index 99947c5a8c..b3185d9743 100644
--- a/users/sterni/nix/utf8/default.nix
+++ b/users/sterni/nix/utf8/default.nix
@@ -2,8 +2,6 @@
 
 let
 
-  # TODO(sterni): encode
-
   inherit (depot.users.sterni.nix)
     char
     flow
@@ -209,8 +207,81 @@ let
       ) iterResult
     );
 
+  encodeCodepoint = cp:
+    let
+      # Find the amount of bytes needed to encode the given codepoint.
+      # Note that this doesn't check if the Unicode codepoint is allowed,
+      # but rather allows all theoretically UTF-8-encodeable ones.
+      count = flow.switch cp [
+        [ (int.inRange 0 127)         1 ] # 00000000 0xxxxxxx
+        [ (int.inRange 128 2047)      2 ] # 00000yyy yyxxxxxx
+        [ (int.inRange 2048 65535)    3 ] # zzzzyyyy yyxxxxxx
+        [ (int.inRange 65536 2097151) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx
+      ];
+
+      # Extract the bit ranges x, y, z and u from the given codepoint
+      # according to Table 3-6. from The Unicode Standard, Version 13.0,
+      # section 3.9. u is split into uh and ul since they are used in
+      # different bytes in the end.
+      components = lib.mapAttrs (_: { mask, offset }:
+        int.bitAnd (int.bitShiftR cp offset) mask
+      ) {
+        x = {
+          mask = if count > 1 then 63 else 127;
+          offset = 0;
+        };
+        y = {
+          mask = if count > 2 then 63 else 31;
+          offset = 6;
+        };
+        z = {
+          mask = 15;
+          offset = 12;
+        };
+        # u which belongs into the second byte
+        ul = {
+          mask = 3;
+          offset = 16;
+        };
+        # u which belongs into the first byte
+        uh = {
+          mask = 7;
+          offset = 18;
+        };
+      };
+      inherit (components) x y z ul uh;
+
+      # Finally construct the byte sequence for the given codepoint. This is
+      # usually done by using the component and adding a few bits as a prefix
+      # which depends on the length of the sequence. The longer the sequence,
+      # the further back each component is pushed. To simplify this, we
+      # always construct a 4 element list and take the last `count` elements.
+      # Thanks to laziness the bogus values created by this are never evaluated.
+      #
+      # Based on table 3-6. from The Unicode Standard,
+      # Version 13.0, section 3.9.
+      bytes = lib.sublist (4 - count) count [
+        # 11110uuu
+        (uh + 240)
+        # 10uuzzzz or 1110zzzz
+        (z + (if count > 3 then 128 + int.bitShiftL ul 4 else 224))
+        # 10yyyyyy or 110yyyyy
+        (y + (if count > 2 then 128 else 192))
+        # 10xxxxxx or 0xxxxxxx
+        (x + (if count > 1 then 128 else 0))
+      ];
+
+    in string.fromBytes bytes;
+
+  /* Encode a list of Unicode codepoints into an UTF-8 string.
+
+     Type: [ integer ] -> string
+  */
+  encode = lib.concatMapStrings encodeCodepoint;
+
 in {
   inherit
+    encode
     decode
     step
     ;
diff --git a/users/sterni/nix/utf8/tests/default.nix b/users/sterni/nix/utf8/tests/default.nix
index 2f8054fad6..fdc0b06715 100644
--- a/users/sterni/nix/utf8/tests/default.nix
+++ b/users/sterni/nix/utf8/tests/default.nix
@@ -113,9 +113,19 @@ let
         randomUnicode
       ]));
 
+  testDecodingEncoding = it "checks that decoding and then encoding forms an identity"
+    (builtins.map
+      (s: assertEq "Decoding and then encoding “${s}” yields itself"
+        (utf8.encode (utf8.decode s)) s)
+      (lib.flatten [
+        glassSentences
+        randomUnicode
+      ]));
+
 in
   runTestsuite "nix.utf8" [
     testFailures
     testAscii
     testDecoding
+    testDecodingEncoding
   ]