about summary refs log tree commit diff
path: root/users/sterni/nix/utf8/default.nix
diff options
context:
space:
mode:
Diffstat (limited to 'users/sterni/nix/utf8/default.nix')
-rw-r--r--users/sterni/nix/utf8/default.nix200
1 files changed, 106 insertions, 94 deletions
diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix
index 270da934b6..71c846c042 100644
--- a/users/sterni/nix/utf8/default.nix
+++ b/users/sterni/nix/utf8/default.nix
@@ -25,7 +25,7 @@ let
      Type: integer -> integer
   */
   byteCount = i: flow.cond [
-    [ (int.bitAnd i 128 == 0)   1 ]
+    [ (int.bitAnd i 128 == 0) 1 ]
     [ (int.bitAnd i 224 == 192) 2 ]
     [ (int.bitAnd i 240 == 224) 3 ]
     [ (int.bitAnd i 248 == 240) 4 ]
@@ -45,30 +45,30 @@ let
     first:
     # byte position as an index starting with 0
     pos:
-      let
-        defaultRange = int.inRange 128 191;
-
-        secondBytePredicate = flow.switch first [
-          [ (int.inRange 194 223) defaultRange          ] # C2..DF
-          [ 224                   (int.inRange 160 191) ] # E0
-          [ (int.inRange 225 236) defaultRange          ] # E1..EC
-          [ 237                   (int.inRange 128 159) ] # ED
-          [ (int.inRange 238 239) defaultRange          ] # EE..EF
-          [ 240                   (int.inRange 144 191) ] # F0
-          [ (int.inRange 241 243) defaultRange          ] # F1..F3
-          [ 244                   (int.inRange 128 143) ] # F4
-          [ (fun.const true)      null                  ]
-        ];
+    let
+      defaultRange = int.inRange 128 191;
+
+      secondBytePredicate = flow.switch first [
+        [ (int.inRange 194 223) defaultRange ] # C2..DF
+        [ 224 (int.inRange 160 191) ] # E0
+        [ (int.inRange 225 236) defaultRange ] # E1..EC
+        [ 237 (int.inRange 128 159) ] # ED
+        [ (int.inRange 238 239) defaultRange ] # EE..EF
+        [ 240 (int.inRange 144 191) ] # F0
+        [ (int.inRange 241 243) defaultRange ] # F1..F3
+        [ 244 (int.inRange 128 143) ] # F4
+        [ (fun.const true) null ]
+      ];
 
-        firstBytePredicate = byte: assert first == byte;
-          first < 128 || secondBytePredicate != null;
-      in
-        # Either ASCII or in one of the byte ranges of Table 3-6.
-        if pos == 0 then firstBytePredicate
-        # return predicate according to Table 3-6.
-        else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate
-        # 3rd and 4th byte have only one validity rule
-        else defaultRange;
+      firstBytePredicate = byte: assert first == byte;
+        first < 128 || secondBytePredicate != null;
+    in
+    # Either ASCII or in one of the byte ranges of Table 3-6.
+    if pos == 0 then firstBytePredicate
+    # return predicate according to Table 3-6.
+    else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate
+    # 3rd and 4th byte have only one validity rule
+    else defaultRange;
 
   /* Iteration step for decoding an UTF-8 byte sequence.
      It decodes incrementally, i. e. it has to be fed
@@ -128,23 +128,24 @@ let
             # the current value by the amount of bytes left.
             offset = (count - (pos + 1)) * 6;
           in
-            code + (int.bitShiftL (int.bitAnd mask value) offset);
+          code + (int.bitShiftL (int.bitAnd mask value) offset);
       illFormedMsg =
         "Ill-formed byte ${int.toHex value} at position ${toString pos} in ${toString count} byte UTF-8 sequence";
     in
-      if !(wellFormedByte first pos value) then builtins.throw illFormedMsg
-      else if pos + 1 == count
-      then (builtins.removeAttrs args [ # allow extra state being passed through
-        "count"
-        "code"
-        "pos"
-        "first"
-      ]) // { result = newCode; }
-      else (builtins.removeAttrs args [ "result" ]) // {
-        inherit count first;
-        code = newCode;
-        pos  = pos + 1;
-      };
+    if !(wellFormedByte first pos value) then builtins.throw illFormedMsg
+    else if pos + 1 == count
+    then (builtins.removeAttrs args [
+      # allow extra state being passed through
+      "count"
+      "code"
+      "pos"
+      "first"
+    ]) // { result = newCode; }
+    else (builtins.removeAttrs args [ "result" ]) // {
+      inherit count first;
+      code = newCode;
+      pos = pos + 1;
+    };
 
   /* Decode an UTF-8 string into a list of codepoints.
 
@@ -161,7 +162,7 @@ let
           {
             key = "start";
             stringIndex = -1;
-            state = {};
+            state = { };
             codepoint = null;
           }
         ];
@@ -170,7 +171,8 @@ let
             # updated values for current iteration step
             newIndex = stringIndex + 1;
             newState = step state (builtins.substring newIndex 1 s);
-          in lib.optional (newIndex < stringLength) {
+          in
+          lib.optional (newIndex < stringLength) {
             # unique keys to make genericClosure happy
             key = toString newIndex;
             # carryover state for the next step
@@ -183,35 +185,39 @@ let
     in
     # extract all steps that yield a code point into a list
     builtins.map (v: v.codepoint) (
-      builtins.filter (
-        { codepoint, stringIndex, state, ... }:
-
-        let
-          # error message in case we are missing bytes at the end of input
-          earlyEndMsg =
-            if state ? count && state ? pos
-            then "Missing ${toString (with state; count - pos)} bytes at end of input"
-            else "Unexpected end of input";
-        in
-
-        # filter out all iteration steps without a codepoint value
-        codepoint != null
+      builtins.filter
+        (
+          { codepoint, stringIndex, state, ... }:
+
+          let
+            # error message in case we are missing bytes at the end of input
+            earlyEndMsg =
+              if state ? count && state ? pos
+              then "Missing ${toString (with state; count - pos)} bytes at end of input"
+              else "Unexpected end of input";
+          in
+
+          # filter out all iteration steps without a codepoint value
+          codepoint != null
           # if we are at the iteration step of a non-empty input string, throw
           # an error if no codepoint was returned, as it indicates an incomplete
           # UTF-8 sequence.
           || (stringLength > 0 && stringIndex == stringLength - 1 && throw earlyEndMsg)
 
-      ) iterResult
+        )
+        iterResult
     );
 
   /* Pretty prints a Unicode codepoint in the U+<HEX> notation.
 
      Type: integer -> string
   */
-  formatCodepoint = cp: "U+" + string.fit {
-    width = 4;
-    char = "0";
-  } (int.toHex cp);
+  formatCodepoint = cp: "U+" + string.fit
+    {
+      width = 4;
+      char = "0";
+    }
+    (int.toHex cp);
 
   encodeCodepoint = cp:
     let
@@ -219,11 +225,11 @@ let
       # Note that this doesn't check if the Unicode codepoint is allowed,
       # but rather allows all theoretically UTF-8-encodeable ones.
       count = flow.switch cp [
-        [ (int.inRange 0 127)         1 ] # 00000000 0xxxxxxx
-        [ (int.inRange 128 2047)      2 ] # 00000yyy yyxxxxxx
-        [ (int.inRange 2048 65535)    3 ] # zzzzyyyy yyxxxxxx
+        [ (int.inRange 0 127) 1 ] # 00000000 0xxxxxxx
+        [ (int.inRange 128 2047) 2 ] # 00000yyy yyxxxxxx
+        [ (int.inRange 2048 65535) 3 ] # zzzzyyyy yyxxxxxx
         [ (int.inRange 65536 1114111) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx,
-                                          # capped at U+10FFFF
+        # capped at U+10FFFF
 
         [ (fun.const true) (builtins.throw invalidCodepointMsg) ]
       ];
@@ -234,32 +240,34 @@ let
       # according to Table 3-6. from The Unicode Standard, Version 13.0,
       # section 3.9. u is split into uh and ul since they are used in
       # different bytes in the end.
-      components = lib.mapAttrs (_: { mask, offset }:
-        int.bitAnd (int.bitShiftR cp offset) mask
-      ) {
-        x = {
-          mask = if count > 1 then 63 else 127;
-          offset = 0;
-        };
-        y = {
-          mask = if count > 2 then 63 else 31;
-          offset = 6;
-        };
-        z = {
-          mask = 15;
-          offset = 12;
-        };
-        # u which belongs into the second byte
-        ul = {
-          mask = 3;
-          offset = 16;
-        };
-        # u which belongs into the first byte
-        uh = {
-          mask = 7;
-          offset = 18;
+      components = lib.mapAttrs
+        (_: { mask, offset }:
+          int.bitAnd (int.bitShiftR cp offset) mask
+        )
+        {
+          x = {
+            mask = if count > 1 then 63 else 127;
+            offset = 0;
+          };
+          y = {
+            mask = if count > 2 then 63 else 31;
+            offset = 6;
+          };
+          z = {
+            mask = 15;
+            offset = 12;
+          };
+          # u which belongs into the second byte
+          ul = {
+            mask = 3;
+            offset = 16;
+          };
+          # u which belongs into the first byte
+          uh = {
+            mask = 7;
+            offset = 18;
+          };
         };
-      };
       inherit (components) x y z ul uh;
 
       # Finally construct the byte sequence for the given codepoint. This is
@@ -286,15 +294,18 @@ let
 
       unableToEncodeMessage = "Can't encode ${formatCodepoint cp} as UTF-8";
 
-    in string.fromBytes (
-      builtins.genList (i:
-        let
-          byte = builtins.elemAt bytes i;
-        in
+    in
+    string.fromBytes (
+      builtins.genList
+        (i:
+          let
+            byte = builtins.elemAt bytes i;
+          in
           if wellFormedByte firstByte i byte
           then byte
           else builtins.throw unableToEncodeMessage
-      ) count
+        )
+        count
     );
 
   /* Encode a list of Unicode codepoints into an UTF-8 string.
@@ -303,7 +314,8 @@ let
   */
   encode = lib.concatMapStrings encodeCodepoint;
 
-in {
+in
+{
   inherit
     encode
     decode