From 87a0aaa77dd94a5a83e4cc0d00e06528d5ce8edc Mon Sep 17 00:00:00 2001 From: sterni Date: Tue, 23 Nov 2021 19:23:54 +0100 Subject: feat(sterni/nix/utf8): implement UTF-8 encoding This implementation is still a bit rough as it doesn't check if the produced string is valid UTF-8 which may happen if an invalid Unicode codepoint is passed. Change-Id: Ibaa91dafa8937142ef704a175efe967b62e3ee7b --- users/sterni/nix/utf8/default.nix | 75 ++++++++++++++++++++++++++++++++- users/sterni/nix/utf8/tests/default.nix | 10 +++++ 2 files changed, 83 insertions(+), 2 deletions(-) (limited to 'users/sterni/nix/utf8') diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix index 99947c5a8c2c..b3185d9743b5 100644 --- a/users/sterni/nix/utf8/default.nix +++ b/users/sterni/nix/utf8/default.nix @@ -2,8 +2,6 @@ let - # TODO(sterni): encode - inherit (depot.users.sterni.nix) char flow @@ -209,8 +207,81 @@ let ) iterResult ); + encodeCodepoint = cp: + let + # Find the number of bytes needed to encode the given codepoint. + # Note that this doesn't check if the Unicode codepoint is allowed, + # but rather allows all theoretically UTF-8-encodable ones (0 to 2097151). + count = flow.switch cp [ + [ (int.inRange 0 127) 1 ] # 00000000 0xxxxxxx + [ (int.inRange 128 2047) 2 ] # 00000yyy yyxxxxxx + [ (int.inRange 2048 65535) 3 ] # zzzzyyyy yyxxxxxx + [ (int.inRange 65536 2097151) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx + ]; + + # Extract the bit ranges x, y, z and u from the given codepoint + # according to Table 3-6 of The Unicode Standard, Version 13.0, + # section 3.9. u is split into uh and ul since its parts end up in + # different bytes. 
+ components = lib.mapAttrs (_: { mask, offset }: + int.bitAnd (int.bitShiftR cp offset) mask + ) { + x = { + mask = if count > 1 then 63 else 127; + offset = 0; + }; + y = { + mask = if count > 2 then 63 else 31; + offset = 6; + }; + z = { + mask = 15; + offset = 12; + }; + # low two bits of u, which belong in the second byte + ul = { + mask = 3; + offset = 16; + }; + # high three bits of u, which belong in the first byte + uh = { + mask = 7; + offset = 18; + }; + }; + inherit (components) x y z ul uh; + + # Finally construct the byte sequence for the given codepoint. This is + # usually done by taking a component and adding a few prefix bits + # which depend on the length of the sequence. The longer the sequence, + # the further back each component is pushed. To simplify this, we + # always construct a 4 element list and take the last `count` elements. + # Thanks to laziness the bogus values created by this are never evaluated. + # + # Based on Table 3-6 of The Unicode Standard, + # Version 13.0, section 3.9. + bytes = lib.sublist (4 - count) count [ + # 11110uuu + (uh + 240) + # 10uuzzzz or 1110zzzz + (z + (if count > 3 then 128 + int.bitShiftL ul 4 else 224)) + # 10yyyyyy or 110yyyyy + (y + (if count > 2 then 128 else 192)) + # 10xxxxxx or 0xxxxxxx + (x + (if count > 1 then 128 else 0)) + ]; + + in string.fromBytes bytes; + + /* Encode a list of Unicode codepoints into a UTF-8 string. 
+ + Type: [ integer ] -> string + */ + encode = lib.concatMapStrings encodeCodepoint; + in { inherit + encode decode step ; diff --git a/users/sterni/nix/utf8/tests/default.nix b/users/sterni/nix/utf8/tests/default.nix index 2f8054fad6d9..fdc0b067156f 100644 --- a/users/sterni/nix/utf8/tests/default.nix +++ b/users/sterni/nix/utf8/tests/default.nix @@ -113,9 +113,19 @@ let randomUnicode ])); + testDecodingEncoding = it "checks that decoding and then encoding forms an identity" + (builtins.map + (s: assertEq "Decoding and then encoding “${s}” yields itself" + (utf8.encode (utf8.decode s)) s) + (lib.flatten [ + glassSentences + randomUnicode + ])); + in runTestsuite "nix.utf8" [ testFailures testAscii testDecoding + testDecodingEncoding ] -- cgit 1.4.1