about summary refs log tree commit diff
path: root/users/sterni/nix/utf8
diff options
context:
space:
mode:
Diffstat (limited to 'users/sterni/nix/utf8')
-rw-r--r--users/sterni/nix/utf8/default.nix200
-rw-r--r--users/sterni/nix/utf8/tests/default.nix57
2 files changed, 138 insertions, 119 deletions
diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix
index 270da934b6a6..71c846c0421e 100644
--- a/users/sterni/nix/utf8/default.nix
+++ b/users/sterni/nix/utf8/default.nix
@@ -25,7 +25,7 @@ let
      Type: integer -> integer
   */
   byteCount = i: flow.cond [
-    [ (int.bitAnd i 128 == 0)   1 ]
+    [ (int.bitAnd i 128 == 0) 1 ]
     [ (int.bitAnd i 224 == 192) 2 ]
     [ (int.bitAnd i 240 == 224) 3 ]
     [ (int.bitAnd i 248 == 240) 4 ]
@@ -45,30 +45,30 @@ let
     first:
     # byte position as an index starting with 0
     pos:
-      let
-        defaultRange = int.inRange 128 191;
-
-        secondBytePredicate = flow.switch first [
-          [ (int.inRange 194 223) defaultRange          ] # C2..DF
-          [ 224                   (int.inRange 160 191) ] # E0
-          [ (int.inRange 225 236) defaultRange          ] # E1..EC
-          [ 237                   (int.inRange 128 159) ] # ED
-          [ (int.inRange 238 239) defaultRange          ] # EE..EF
-          [ 240                   (int.inRange 144 191) ] # F0
-          [ (int.inRange 241 243) defaultRange          ] # F1..F3
-          [ 244                   (int.inRange 128 143) ] # F4
-          [ (fun.const true)      null                  ]
-        ];
+    let
+      defaultRange = int.inRange 128 191;
+
+      secondBytePredicate = flow.switch first [
+        [ (int.inRange 194 223) defaultRange ] # C2..DF
+        [ 224 (int.inRange 160 191) ] # E0
+        [ (int.inRange 225 236) defaultRange ] # E1..EC
+        [ 237 (int.inRange 128 159) ] # ED
+        [ (int.inRange 238 239) defaultRange ] # EE..EF
+        [ 240 (int.inRange 144 191) ] # F0
+        [ (int.inRange 241 243) defaultRange ] # F1..F3
+        [ 244 (int.inRange 128 143) ] # F4
+        [ (fun.const true) null ]
+      ];
 
-        firstBytePredicate = byte: assert first == byte;
-          first < 128 || secondBytePredicate != null;
-      in
-        # Either ASCII or in one of the byte ranges of Table 3-6.
-        if pos == 0 then firstBytePredicate
-        # return predicate according to Table 3-6.
-        else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate
-        # 3rd and 4th byte have only one validity rule
-        else defaultRange;
+      firstBytePredicate = byte: assert first == byte;
+        first < 128 || secondBytePredicate != null;
+    in
+    # Either ASCII or in one of the byte ranges of Table 3-6.
+    if pos == 0 then firstBytePredicate
+    # return predicate according to Table 3-6.
+    else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate
+    # 3rd and 4th byte have only one validity rule
+    else defaultRange;
 
   /* Iteration step for decoding an UTF-8 byte sequence.
      It decodes incrementally, i. e. it has to be fed
@@ -128,23 +128,24 @@ let
             # the current value by the amount of bytes left.
             offset = (count - (pos + 1)) * 6;
           in
-            code + (int.bitShiftL (int.bitAnd mask value) offset);
+          code + (int.bitShiftL (int.bitAnd mask value) offset);
       illFormedMsg =
         "Ill-formed byte ${int.toHex value} at position ${toString pos} in ${toString count} byte UTF-8 sequence";
     in
-      if !(wellFormedByte first pos value) then builtins.throw illFormedMsg
-      else if pos + 1 == count
-      then (builtins.removeAttrs args [ # allow extra state being passed through
-        "count"
-        "code"
-        "pos"
-        "first"
-      ]) // { result = newCode; }
-      else (builtins.removeAttrs args [ "result" ]) // {
-        inherit count first;
-        code = newCode;
-        pos  = pos + 1;
-      };
+    if !(wellFormedByte first pos value) then builtins.throw illFormedMsg
+    else if pos + 1 == count
+    then (builtins.removeAttrs args [
+      # allow extra state being passed through
+      "count"
+      "code"
+      "pos"
+      "first"
+    ]) // { result = newCode; }
+    else (builtins.removeAttrs args [ "result" ]) // {
+      inherit count first;
+      code = newCode;
+      pos = pos + 1;
+    };
 
   /* Decode an UTF-8 string into a list of codepoints.
 
@@ -161,7 +162,7 @@ let
           {
             key = "start";
             stringIndex = -1;
-            state = {};
+            state = { };
             codepoint = null;
           }
         ];
@@ -170,7 +171,8 @@ let
             # updated values for current iteration step
             newIndex = stringIndex + 1;
             newState = step state (builtins.substring newIndex 1 s);
-          in lib.optional (newIndex < stringLength) {
+          in
+          lib.optional (newIndex < stringLength) {
             # unique keys to make genericClosure happy
             key = toString newIndex;
             # carryover state for the next step
@@ -183,35 +185,39 @@ let
     in
     # extract all steps that yield a code point into a list
     builtins.map (v: v.codepoint) (
-      builtins.filter (
-        { codepoint, stringIndex, state, ... }:
-
-        let
-          # error message in case we are missing bytes at the end of input
-          earlyEndMsg =
-            if state ? count && state ? pos
-            then "Missing ${toString (with state; count - pos)} bytes at end of input"
-            else "Unexpected end of input";
-        in
-
-        # filter out all iteration steps without a codepoint value
-        codepoint != null
+      builtins.filter
+        (
+          { codepoint, stringIndex, state, ... }:
+
+          let
+            # error message in case we are missing bytes at the end of input
+            earlyEndMsg =
+              if state ? count && state ? pos
+              then "Missing ${toString (with state; count - pos)} bytes at end of input"
+              else "Unexpected end of input";
+          in
+
+          # filter out all iteration steps without a codepoint value
+          codepoint != null
           # if we are at the iteration step of a non-empty input string, throw
           # an error if no codepoint was returned, as it indicates an incomplete
           # UTF-8 sequence.
           || (stringLength > 0 && stringIndex == stringLength - 1 && throw earlyEndMsg)
 
-      ) iterResult
+        )
+        iterResult
     );
 
   /* Pretty prints a Unicode codepoint in the U+<HEX> notation.
 
      Type: integer -> string
   */
-  formatCodepoint = cp: "U+" + string.fit {
-    width = 4;
-    char = "0";
-  } (int.toHex cp);
+  formatCodepoint = cp: "U+" + string.fit
+    {
+      width = 4;
+      char = "0";
+    }
+    (int.toHex cp);
 
   encodeCodepoint = cp:
     let
@@ -219,11 +225,11 @@ let
       # Note that this doesn't check if the Unicode codepoint is allowed,
       # but rather allows all theoretically UTF-8-encodeable ones.
       count = flow.switch cp [
-        [ (int.inRange 0 127)         1 ] # 00000000 0xxxxxxx
-        [ (int.inRange 128 2047)      2 ] # 00000yyy yyxxxxxx
-        [ (int.inRange 2048 65535)    3 ] # zzzzyyyy yyxxxxxx
+        [ (int.inRange 0 127) 1 ] # 00000000 0xxxxxxx
+        [ (int.inRange 128 2047) 2 ] # 00000yyy yyxxxxxx
+        [ (int.inRange 2048 65535) 3 ] # zzzzyyyy yyxxxxxx
         [ (int.inRange 65536 1114111) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx,
-                                          # capped at U+10FFFF
+        # capped at U+10FFFF
 
         [ (fun.const true) (builtins.throw invalidCodepointMsg) ]
       ];
@@ -234,32 +240,34 @@ let
       # according to Table 3-6. from The Unicode Standard, Version 13.0,
       # section 3.9. u is split into uh and ul since they are used in
       # different bytes in the end.
-      components = lib.mapAttrs (_: { mask, offset }:
-        int.bitAnd (int.bitShiftR cp offset) mask
-      ) {
-        x = {
-          mask = if count > 1 then 63 else 127;
-          offset = 0;
-        };
-        y = {
-          mask = if count > 2 then 63 else 31;
-          offset = 6;
-        };
-        z = {
-          mask = 15;
-          offset = 12;
-        };
-        # u which belongs into the second byte
-        ul = {
-          mask = 3;
-          offset = 16;
-        };
-        # u which belongs into the first byte
-        uh = {
-          mask = 7;
-          offset = 18;
+      components = lib.mapAttrs
+        (_: { mask, offset }:
+          int.bitAnd (int.bitShiftR cp offset) mask
+        )
+        {
+          x = {
+            mask = if count > 1 then 63 else 127;
+            offset = 0;
+          };
+          y = {
+            mask = if count > 2 then 63 else 31;
+            offset = 6;
+          };
+          z = {
+            mask = 15;
+            offset = 12;
+          };
+          # u which belongs into the second byte
+          ul = {
+            mask = 3;
+            offset = 16;
+          };
+          # u which belongs into the first byte
+          uh = {
+            mask = 7;
+            offset = 18;
+          };
         };
-      };
       inherit (components) x y z ul uh;
 
       # Finally construct the byte sequence for the given codepoint. This is
@@ -286,15 +294,18 @@ let
 
       unableToEncodeMessage = "Can't encode ${formatCodepoint cp} as UTF-8";
 
-    in string.fromBytes (
-      builtins.genList (i:
-        let
-          byte = builtins.elemAt bytes i;
-        in
+    in
+    string.fromBytes (
+      builtins.genList
+        (i:
+          let
+            byte = builtins.elemAt bytes i;
+          in
           if wellFormedByte firstByte i byte
           then byte
           else builtins.throw unableToEncodeMessage
-      ) count
+        )
+        count
     );
 
   /* Encode a list of Unicode codepoints into an UTF-8 string.
@@ -303,7 +314,8 @@ let
   */
   encode = lib.concatMapStrings encodeCodepoint;
 
-in {
+in
+{
   inherit
     encode
     decode
diff --git a/users/sterni/nix/utf8/tests/default.nix b/users/sterni/nix/utf8/tests/default.nix
index ddcd34208a6d..40783eab2421 100644
--- a/users/sterni/nix/utf8/tests/default.nix
+++ b/users/sterni/nix/utf8/tests/default.nix
@@ -25,9 +25,10 @@ let
     char
     ;
 
-  rustDecoder = rustSimple {
-    name = "utf8-decode";
-  } ''
+  rustDecoder = rustSimple
+    {
+      name = "utf8-decode";
+    } ''
     use std::io::{self, Read};
     fn main() -> std::io::Result<()> {
       let mut buffer = String::new();
@@ -47,10 +48,11 @@ let
 
   rustDecode = s:
     let
-      expr = runCommandLocal "${s}-decoded" {} ''
+      expr = runCommandLocal "${s}-decoded" { } ''
         printf '%s' ${lib.escapeShellArg s} | ${rustDecoder} > $out
       '';
-    in import expr;
+    in
+    import expr;
 
   hexDecode = l:
     utf8.decode (string.fromBytes (builtins.map int.fromHex l));
@@ -65,23 +67,27 @@ let
     (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ])
     (assertThrows "Codepoint out of range: 0xFFFFFF" (hexEncode [ "FFFFFF" ]))
     (assertThrows "Codepoint out of range: -0x02" (hexEncode [ "-02" ]))
-  ] ++ builtins.genList (i:
-    let
-      cp = i + int.fromHex "D800";
-    in
+  ] ++ builtins.genList
+    (i:
+      let
+        cp = i + int.fromHex "D800";
+      in
       assertThrows "Can't encode UTF-16 reserved characters: ${utf8.formatCodepoint cp}"
         (utf8.encode [ cp ])
-  ) (int.fromHex "07FF"));
+    )
+    (int.fromHex "07FF"));
 
   testAscii = it "checks decoding of ascii strings"
-    (builtins.map (s: assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\""
-      (string.toBytes s) (utf8.decode s)) [
-        "foo bar"
-        "hello\nworld"
-        "carriage\r\nreturn"
-        "1238398494829304 []<><>({})[]!!)"
-        (string.take 127 char.allChars)
-      ]);
+    (builtins.map
+      (s: assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\""
+        (string.toBytes s)
+        (utf8.decode s)) [
+      "foo bar"
+      "hello\nworld"
+      "carriage\r\nreturn"
+      "1238398494829304 []<><>({})[]!!)"
+      (string.take 127 char.allChars)
+    ]);
 
   randomUnicode = [
     "" # empty string should yield empty list
@@ -126,16 +132,17 @@ let
   testDecodingEncoding = it "checks that decoding and then encoding forms an identity"
     (builtins.map
       (s: assertEq "Decoding and then encoding “${s}” yields itself"
-        (utf8.encode (utf8.decode s)) s)
+        (utf8.encode (utf8.decode s))
+        s)
       (lib.flatten [
         glassSentences
         randomUnicode
       ]));
 
 in
-  runTestsuite "nix.utf8" [
-    testFailures
-    testAscii
-    testDecoding
-    testDecodingEncoding
-  ]
+runTestsuite "nix.utf8" [
+  testFailures
+  testAscii
+  testDecoding
+  testDecodingEncoding
+]