diff options
Diffstat (limited to 'users/sterni/nix/utf8')
-rw-r--r-- | users/sterni/nix/utf8/default.nix | 69 |
1 files changed, 46 insertions, 23 deletions
diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix index 713f1f57cbe6..c89263cd8fbf 100644 --- a/users/sterni/nix/utf8/default.nix +++ b/users/sterni/nix/utf8/default.nix @@ -160,31 +160,54 @@ let # TODO(sterni): option to fallback to replacement char instead of failure decode = s: let - iter = { codes ? [], ... }@args: byte: + stringLength = builtins.stringLength s; + iterResult = builtins.genericClosure { + startSet = [ + { + key = "start"; + stringIndex = -1; + state = {}; + codepoint = null; + } + ]; + operator = { state, stringIndex, ... }: + let + # updated values for current iteration step + newIndex = stringIndex + 1; + newState = step state (builtins.substring newIndex 1 s); + in lib.optional (newIndex < stringLength) { + # unique keys to make genericClosure happy + key = toString newIndex; + # carryover state for the next step + stringIndex = newIndex; + state = newState; + # actual payload for later, steps with value null are filtered out + codepoint = newState.result or null; + }; + }; + in + # extract all steps that yield a code point into a list + builtins.map (v: v.codepoint) ( + builtins.filter ( + { codepoint, stringIndex, state, ... }: + let - res = step args byte; + # error message in case we are missing bytes at the end of input + earlyEndMsg = + if state ? count && state ? pos + then "Missing ${toString (with state; count - pos)} bytes at end of input" + else "Unexpected end of input"; in - # foldl' forceValues the calculate value only at the end - # this makes the thunk grow large enough to cause a stack - # overflow with sufficiently large strings. To avoid this - # we always deepSeq the result which also keeps memory - # usage of decode reasonable. - builtins.deepSeq res - (if res ? result - then res // { - codes = codes ++ [ res.result ]; - } - else res); - iterResult = - builtins.foldl' iter {} (string.toChars s); - earlyEndMsg = - if iterResult ? count && iterResult ? pos - then "Missing ${toString (with iterResult; count - pos)} bytes at end of input" - else "Unexpected end of input"; - in - if iterResult ? result - then iterResult.codes - else builtins.throw earlyEndMsg; + + # filter out all iteration steps without a codepoint value + codepoint != null + # if we are at the iteration step of the input string, throw + # an error if no codepoint was returned, as it indicates an incomplete + # UTF-8 sequence. + || (stringIndex == stringLength - 1 && throw earlyEndMsg) + + ) iterResult + ); /* Decodes an UTF-8 string, but doesn't throw on error. Instead it returns null. |