diff options
author | sterni <sternenseemann@systemli.org> | 2021-11-22T21·06+0100 |
---|---|---|
committer | sterni <sternenseemann@systemli.org> | 2021-11-23T13·22+0100 |
commit | 8615322bc81dfe106aa158b95567300372ea469d (patch) | |
tree | 83fe1e4eebe2df260183e4c298738b07a98537d5 | |
parent | a2be05faa4043c225581090846c52718812fac8f (diff) |
refactor(sterni/nix/utf8): use genericClosure for decoding iteration r/3085
builtins.genericClosure is a quite powerful (and undocumented) Nix primop: It repeatedly applies a function to values it produces and collects them into a list. Additionally individual results can be identified via a key attribute. Since genericClosure only ever creates a single list value internally, we can eliminate a huge performance bottleneck when building a list in a recursive algorithm: list concatenation. Because Nix needs to copy the entire chunk of memory used internally to represent the list, building big lists one element at a time grinds Nix to a halt. After rewriting decode using genericClosure decoding the LaTeX source of my 20 page term paper now takes 2s instead of 14min. Change-Id: I33847e4e7dd95d7f4d78ac83eb0d74a9867bfe80
-rw-r--r-- | users/sterni/nix/utf8/default.nix | 69 |
1 files changed, 46 insertions, 23 deletions
diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix index 713f1f57cbe6..c89263cd8fbf 100644 --- a/users/sterni/nix/utf8/default.nix +++ b/users/sterni/nix/utf8/default.nix @@ -160,31 +160,54 @@ let # TODO(sterni): option to fallback to replacement char instead of failure decode = s: let - iter = { codes ? [], ... }@args: byte: + stringLength = builtins.stringLength s; + iterResult = builtins.genericClosure { + startSet = [ + { + key = "start"; + stringIndex = -1; + state = {}; + codepoint = null; + } + ]; + operator = { state, stringIndex, ... }: + let + # updated values for current iteration step + newIndex = stringIndex + 1; + newState = step state (builtins.substring newIndex 1 s); + in lib.optional (newIndex < stringLength) { + # unique keys to make genericClosure happy + key = toString newIndex; + # carryover state for the next step + stringIndex = newIndex; + state = newState; + # actual payload for later, steps with value null are filtered out + codepoint = newState.result or null; + }; + }; + in + # extract all steps that yield a code point into a list + builtins.map (v: v.codepoint) ( + builtins.filter ( + { codepoint, stringIndex, state, ... }: + let - res = step args byte; + # error message in case we are missing bytes at the end of input + earlyEndMsg = + if state ? count && state ? pos + then "Missing ${toString (with state; count - pos)} bytes at end of input" + else "Unexpected end of input"; in - # foldl' forceValues the calculate value only at the end - # this makes the thunk grow large enough to cause a stack - # overflow with sufficiently large strings. To avoid this - # we always deepSeq the result which also keeps memory - # usage of decode reasonable. - builtins.deepSeq res - (if res ? result - then res // { - codes = codes ++ [ res.result ]; - } - else res); - iterResult = - builtins.foldl' iter {} (string.toChars s); - earlyEndMsg = - if iterResult ? count && iterResult ? pos - then "Missing ${toString (with iterResult; count - pos)} bytes at end of input" - else "Unexpected end of input"; - in - if iterResult ? result - then iterResult.codes - else builtins.throw earlyEndMsg; + + # filter out all iteration steps without a codepoint value + codepoint != null + # if we are at the iteration step of the input string, throw + # an error if no codepoint was returned, as it indicates an incomplete + # UTF-8 sequence. + || (stringIndex == stringLength - 1 && throw earlyEndMsg) + + ) iterResult + ); /* Decodes an UTF-8 string, but doesn't throw on error. Instead it returns null. |