diff options
Diffstat (limited to 'users/sterni/nix/utf8/default.nix')
-rw-r--r-- | users/sterni/nix/utf8/default.nix | 200 |
1 files changed, 106 insertions, 94 deletions
diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix index 270da934b6a6..71c846c0421e 100644 --- a/users/sterni/nix/utf8/default.nix +++ b/users/sterni/nix/utf8/default.nix @@ -25,7 +25,7 @@ let Type: integer -> integer */ byteCount = i: flow.cond [ - [ (int.bitAnd i 128 == 0) 1 ] + [ (int.bitAnd i 128 == 0) 1 ] [ (int.bitAnd i 224 == 192) 2 ] [ (int.bitAnd i 240 == 224) 3 ] [ (int.bitAnd i 248 == 240) 4 ] @@ -45,30 +45,30 @@ let first: # byte position as an index starting with 0 pos: - let - defaultRange = int.inRange 128 191; - - secondBytePredicate = flow.switch first [ - [ (int.inRange 194 223) defaultRange ] # C2..DF - [ 224 (int.inRange 160 191) ] # E0 - [ (int.inRange 225 236) defaultRange ] # E1..EC - [ 237 (int.inRange 128 159) ] # ED - [ (int.inRange 238 239) defaultRange ] # EE..EF - [ 240 (int.inRange 144 191) ] # F0 - [ (int.inRange 241 243) defaultRange ] # F1..F3 - [ 244 (int.inRange 128 143) ] # F4 - [ (fun.const true) null ] - ]; + let + defaultRange = int.inRange 128 191; + + secondBytePredicate = flow.switch first [ + [ (int.inRange 194 223) defaultRange ] # C2..DF + [ 224 (int.inRange 160 191) ] # E0 + [ (int.inRange 225 236) defaultRange ] # E1..EC + [ 237 (int.inRange 128 159) ] # ED + [ (int.inRange 238 239) defaultRange ] # EE..EF + [ 240 (int.inRange 144 191) ] # F0 + [ (int.inRange 241 243) defaultRange ] # F1..F3 + [ 244 (int.inRange 128 143) ] # F4 + [ (fun.const true) null ] + ]; - firstBytePredicate = byte: assert first == byte; - first < 128 || secondBytePredicate != null; - in - # Either ASCII or in one of the byte ranges of Table 3-6. - if pos == 0 then firstBytePredicate - # return predicate according to Table 3-6. - else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate - # 3rd and 4th byte have only one validity rule - else defaultRange; + firstBytePredicate = byte: assert first == byte; + first < 128 || secondBytePredicate != null; + in + # Either ASCII or in one of the byte ranges of Table 3-6. + if pos == 0 then firstBytePredicate + # return predicate according to Table 3-6. + else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate + # 3rd and 4th byte have only one validity rule + else defaultRange; /* Iteration step for decoding an UTF-8 byte sequence. It decodes incrementally, i. e. it has to be fed @@ -128,23 +128,24 @@ let # the current value by the amount of bytes left. offset = (count - (pos + 1)) * 6; in - code + (int.bitShiftL (int.bitAnd mask value) offset); + code + (int.bitShiftL (int.bitAnd mask value) offset); illFormedMsg = "Ill-formed byte ${int.toHex value} at position ${toString pos} in ${toString count} byte UTF-8 sequence"; in - if !(wellFormedByte first pos value) then builtins.throw illFormedMsg - else if pos + 1 == count - then (builtins.removeAttrs args [ # allow extra state being passed through - "count" - "code" - "pos" - "first" - ]) // { result = newCode; } - else (builtins.removeAttrs args [ "result" ]) // { - inherit count first; - code = newCode; - pos = pos + 1; - }; + if !(wellFormedByte first pos value) then builtins.throw illFormedMsg + else if pos + 1 == count + then (builtins.removeAttrs args [ + # allow extra state being passed through + "count" + "code" + "pos" + "first" + ]) // { result = newCode; } + else (builtins.removeAttrs args [ "result" ]) // { + inherit count first; + code = newCode; + pos = pos + 1; + }; /* Decode an UTF-8 string into a list of codepoints. @@ -161,7 +162,7 @@ let { key = "start"; stringIndex = -1; - state = {}; + state = { }; codepoint = null; } ]; @@ -170,7 +171,8 @@ let # updated values for current iteration step newIndex = stringIndex + 1; newState = step state (builtins.substring newIndex 1 s); - in lib.optional (newIndex < stringLength) { + in + lib.optional (newIndex < stringLength) { # unique keys to make genericClosure happy key = toString newIndex; # carryover state for the next step @@ -183,35 +185,39 @@ let in # extract all steps that yield a code point into a list builtins.map (v: v.codepoint) ( - builtins.filter ( - { codepoint, stringIndex, state, ... }: - - let - # error message in case we are missing bytes at the end of input - earlyEndMsg = - if state ? count && state ? pos - then "Missing ${toString (with state; count - pos)} bytes at end of input" - else "Unexpected end of input"; - in - - # filter out all iteration steps without a codepoint value - codepoint != null + builtins.filter + ( + { codepoint, stringIndex, state, ... }: + + let + # error message in case we are missing bytes at the end of input + earlyEndMsg = + if state ? count && state ? pos + then "Missing ${toString (with state; count - pos)} bytes at end of input" + else "Unexpected end of input"; + in + + # filter out all iteration steps without a codepoint value + codepoint != null # if we are at the iteration step of a non-empty input string, throw # an error if no codepoint was returned, as it indicates an incomplete # UTF-8 sequence. || (stringLength > 0 && stringIndex == stringLength - 1 && throw earlyEndMsg) - ) iterResult + ) + iterResult ); /* Pretty prints a Unicode codepoint in the U+<HEX> notation. Type: integer -> string */ - formatCodepoint = cp: "U+" + string.fit { - width = 4; - char = "0"; - } (int.toHex cp); + formatCodepoint = cp: "U+" + string.fit + { + width = 4; + char = "0"; + } + (int.toHex cp); encodeCodepoint = cp: let @@ -219,11 +225,11 @@ let # Note that this doesn't check if the Unicode codepoint is allowed, # but rather allows all theoretically UTF-8-encodeable ones. count = flow.switch cp [ - [ (int.inRange 0 127) 1 ] # 00000000 0xxxxxxx - [ (int.inRange 128 2047) 2 ] # 00000yyy yyxxxxxx - [ (int.inRange 2048 65535) 3 ] # zzzzyyyy yyxxxxxx + [ (int.inRange 0 127) 1 ] # 00000000 0xxxxxxx + [ (int.inRange 128 2047) 2 ] # 00000yyy yyxxxxxx + [ (int.inRange 2048 65535) 3 ] # zzzzyyyy yyxxxxxx [ (int.inRange 65536 1114111) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx, - # capped at U+10FFFF + # capped at U+10FFFF [ (fun.const true) (builtins.throw invalidCodepointMsg) ] ]; @@ -234,32 +240,34 @@ let # according to Table 3-6. from The Unicode Standard, Version 13.0, # section 3.9. u is split into uh and ul since they are used in # different bytes in the end. - components = lib.mapAttrs (_: { mask, offset }: - int.bitAnd (int.bitShiftR cp offset) mask - ) { - x = { - mask = if count > 1 then 63 else 127; - offset = 0; - }; - y = { - mask = if count > 2 then 63 else 31; - offset = 6; - }; - z = { - mask = 15; - offset = 12; - }; - # u which belongs into the second byte - ul = { - mask = 3; - offset = 16; - }; - # u which belongs into the first byte - uh = { - mask = 7; - offset = 18; + components = lib.mapAttrs + (_: { mask, offset }: + int.bitAnd (int.bitShiftR cp offset) mask + ) + { + x = { + mask = if count > 1 then 63 else 127; + offset = 0; + }; + y = { + mask = if count > 2 then 63 else 31; + offset = 6; + }; + z = { + mask = 15; + offset = 12; + }; + # u which belongs into the second byte + ul = { + mask = 3; + offset = 16; + }; + # u which belongs into the first byte + uh = { + mask = 7; + offset = 18; + }; }; - }; inherit (components) x y z ul uh; # Finally construct the byte sequence for the given codepoint. This is @@ -286,15 +294,18 @@ let unableToEncodeMessage = "Can't encode ${formatCodepoint cp} as UTF-8"; - in string.fromBytes ( - builtins.genList (i: - let - byte = builtins.elemAt bytes i; - in + in + string.fromBytes ( + builtins.genList + (i: + let + byte = builtins.elemAt bytes i; + in if wellFormedByte firstByte i byte then byte else builtins.throw unableToEncodeMessage - ) count + ) + count ); /* Encode a list of Unicode codepoints into an UTF-8 string. @@ -303,7 +314,8 @@ let */ encode = lib.concatMapStrings encodeCodepoint; -in { +in +{ inherit encode decode |