diff options
Diffstat (limited to 'users/sterni/nix/utf8/default.nix')
-rw-r--r-- | users/sterni/nix/utf8/default.nix | 208 |
1 files changed, 208 insertions, 0 deletions
diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix new file mode 100644 index 000000000000..713f1f57cbe6 --- /dev/null +++ b/users/sterni/nix/utf8/default.nix @@ -0,0 +1,208 @@ +{ depot, lib, ... }: + +let + + # TODO(sterni): encode + + inherit (depot.users.sterni.nix) + char + flow + fun + int + string + util + ; + + /* (Internal) function to determine the amount + bytes left in a UTF-8 byte sequence from the + first byte. + + This function will throw if the given first + byte is ill-formed, but will not detect all + cases of ill-formed-ness. + + Based on table 3-6. from The Unicode Standard, + Version 13.0, section 3.9. + + Type: integer -> integer + */ + byteCount = i: flow.cond [ + [ (int.bitAnd i 128 == 0) 1 ] + [ (int.bitAnd i 224 == 192) 2 ] + [ (int.bitAnd i 240 == 224) 3 ] + [ (int.bitAnd i 248 == 240) 4 ] + [ true (builtins.throw "Ill-formed first byte ${int.toHex i}") ] + ]; + + /* (Internal) function to check if a given byte in + an UTF-8 byte sequence is well-formed. + + Based on table 3-7. from The Unicode Standard, + Version 13.0, section 3.9. + + Throws if the first byte is invalid. + + Type: integer -> integer -> (integer -> bool) + */ + wellFormedByte = + # first byte's integer value + first: + # byte position as an index starting with 0 + pos: + let + defaultRange = int.inRange 128 191; + in + # The first byte is either ASCII which requires no checks + # or we automatically check it when we check the subsequent + # bytes. The downside is that this may generate bad error + # messages in very rare cases. + if pos == 0 + then lib.const true + else if pos > 1 # 3rd and 4th byte have only one validity rule + then defaultRange + else assert pos == 1; flow.switch first [ + [ (int.inRange 194 223) defaultRange ] # C2..DF + [ 224 (int.inRange 160 191) ] # E0 + [ (int.inRange 225 236) defaultRange ] # E1..EC + [ 237 (int.inRange 128 159) ] # ED + [ (int.inRange 238 239) defaultRange ] # EE..EF + [ 240 (int.inRange 144 191) ] # F0 + [ (int.inRange 241 243) defaultRange ] # F1..F3 + [ 244 (int.inRange 128 143) ] # F4 + [ + (fun.const true) + (builtins.throw "Invalid first byte ${int.toHex first}") + ] + ]; + + /* Iteration step for decoding an UTF-8 byte sequence. + It decodes incrementally, i. e. it has to be fed + one byte at a time and then returns either a + new state or a final result. + + If the resulting attribute set contains the attribute + result, it is finished and the decoded codepoint is + contained in that attribute. In all other cases, + pass the returned set to step again along with + a new byte. The initial state to pass is the empty + set. + + Extra attributes are always passed through, so you + can pass extra state. Be sure not to use result, + pos, code, first or count. + + This function will throw with a fairly detailed + message if it encounters ill-formed bytes. + + The implementation is based on The Unicode Standard, + Version 13.0, section 3.9, especially table 3-6. + + Type: { ... } -> string -> ({ result :: integer, ... } | { ... }) + + Example: utf8.step {} "f" + => { result = 102; } + */ + step = { pos ? 0, code ? 0, ... }@args: byte: + let + value = char.ord byte; + # first byte is context for well-formed-ness + first = args.first or value; + count = args.count or (byteCount first); + newCode = + if count == 1 + then int.bitAnd 127 first # ascii character + else # multi byte UTF-8 sequence + let + # Calculate the bitmask for extracting the + # codepoint data in the current byte. + # If the codepoint is not ASCII, the bits + # used for codepoint data differ depending + # on the byte position and overall byte + # count. The first byte always ignores + # the (count + 1) most significant bits. + # For all subsequent bytes, the 2 most + # significant bits need to be ignored. + # See also table 3-6. + mask = + if pos == 0 + then int.exp 2 (8 - (count + 1)) - 1 + else 63; + # UTF-8 uses the 6 least significant bits in all + # subsequent bytes after the first one. Therefore + # We can determine the amount we need to shift + # the current value by the amount of bytes left. + offset = (count - (pos + 1)) * 6; + in + code + (int.bitShiftL (int.bitAnd mask value) offset); + illFormedMsg = + "Ill-formed byte ${int.toHex value} at position ${toString pos} in ${toString count} byte UTF-8 sequence"; + in + if !(wellFormedByte first pos value) then builtins.throw illFormedMsg + else if pos + 1 == count + then (builtins.removeAttrs args [ # allow extra state being passed through + "count" + "code" + "pos" + "first" + ]) // { result = newCode; } + else (builtins.removeAttrs args [ "result" ]) // { + inherit count first; + code = newCode; + pos = pos + 1; + }; + + /* Decode an UTF-8 string into a list of codepoints. + + Throws if the string is ill-formed UTF-8. + + Type: string -> [ integer ] + */ + # TODO(sterni): option to fallback to replacement char instead of failure + decode = s: + let + iter = { codes ? [], ... }@args: byte: + let + res = step args byte; + in + # foldl' forceValues the calculate value only at the end + # this makes the thunk grow large enough to cause a stack + # overflow with sufficiently large strings. To avoid this + # we always deepSeq the result which also keeps memory + # usage of decode reasonable. + builtins.deepSeq res + (if res ? result + then res // { + codes = codes ++ [ res.result ]; + } + else res); + iterResult = + builtins.foldl' iter {} (string.toChars s); + earlyEndMsg = + if iterResult ? count && iterResult ? pos + then "Missing ${toString (with iterResult; count - pos)} bytes at end of input" + else "Unexpected end of input"; + in + if iterResult ? result + then iterResult.codes + else builtins.throw earlyEndMsg; + + /* Decodes an UTF-8 string, but doesn't throw on error. + Instead it returns null. + + Type: string -> ( [ integer ] | null) + */ + decodeSafe = s: + let + res = builtins.tryEval (decode s); + in + if res.success + then res.value + else null; + +in { + inherit + decode + decodeSafe + step + ; +} |