about summary refs log tree commit diff
diff options
context:
space:
mode:
authorsterni <sternenseemann@systemli.org>2021-11-22T21·06+0100
committersterni <sternenseemann@systemli.org>2021-11-23T13·22+0100
commit8615322bc81dfe106aa158b95567300372ea469d (patch)
tree83fe1e4eebe2df260183e4c298738b07a98537d5
parenta2be05faa4043c225581090846c52718812fac8f (diff)
refactor(sterni/nix/utf8): use genericClosure for decoding iteration r/3085
builtins.genericClosure is a quite powerful (and undocumented) Nix
primop: It repeatedly applies a function to values it produces and
collects them into a list. Additionally individual results can be
identified via a key attribute.

Since genericClosure only ever creates a single list value internally,
we can eliminate a huge performance bottleneck when building a list in a
recursive algorithm: list concatenation. Because Nix needs to copy the
entire chunk of memory used internally to represent the list, building
big lists one element at a time grinds Nix to a halt.

After rewriting decode using genericClosure decoding the LaTeX source
of my 20 page term paper now takes 2s instead of 14min.

Change-Id: I33847e4e7dd95d7f4d78ac83eb0d74a9867bfe80
-rw-r--r--users/sterni/nix/utf8/default.nix69
1 files changed, 46 insertions, 23 deletions
diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix
index 713f1f57cb..c89263cd8f 100644
--- a/users/sterni/nix/utf8/default.nix
+++ b/users/sterni/nix/utf8/default.nix
@@ -160,31 +160,54 @@ let
   # TODO(sterni): option to fallback to replacement char instead of failure
   decode = s:
     let
-      iter = { codes ? [], ... }@args: byte:
+      stringLength = builtins.stringLength s;
+      iterResult = builtins.genericClosure {
+        startSet = [
+          {
+            key = "start";
+            stringIndex = -1;
+            state = {};
+            codepoint = null;
+          }
+        ];
+        operator = { state, stringIndex, ... }:
+          let
+            # updated values for current iteration step
+            newIndex = stringIndex + 1;
+            newState = step state (builtins.substring newIndex 1 s);
+          in lib.optional (newIndex < stringLength) {
+            # unique keys to make genericClosure happy
+            key = toString newIndex;
+            # carryover state for the next step
+            stringIndex = newIndex;
+            state = newState;
+            # actual payload for later, steps with value null are filtered out
+            codepoint = newState.result or null;
+          };
+      };
+    in
+    # extract all steps that yield a code point into a list
+    builtins.map (v: v.codepoint) (
+      builtins.filter (
+        { codepoint, stringIndex, state, ... }:
+
         let
-          res = step args byte;
+          # error message in case we are missing bytes at the end of input
+          earlyEndMsg =
+            if state ? count && state ? pos
+            then "Missing ${toString (with state; count - pos)} bytes at end of input"
+            else "Unexpected end of input";
         in
-          # foldl' forceValues the calculate value only at the end
-          # this makes the thunk grow large enough to cause a stack
-          # overflow with sufficiently large strings. To avoid this
-          # we always deepSeq the result which also keeps memory
-          # usage of decode reasonable.
-          builtins.deepSeq res
-            (if res ? result
-            then res // {
-              codes = codes ++ [ res.result ];
-            }
-            else res);
-      iterResult =
-        builtins.foldl' iter {} (string.toChars s);
-      earlyEndMsg =
-        if iterResult ? count && iterResult ? pos
-        then "Missing ${toString (with iterResult; count - pos)} bytes at end of input"
-        else "Unexpected end of input";
-    in
-      if iterResult ? result
-      then iterResult.codes
-      else builtins.throw earlyEndMsg;
+
+        # filter out all iteration steps without a codepoint value
+        codepoint != null
+          # if we are at the iteration step of the input string, throw
+          # an error if no codepoint was returned, as it indicates an incomplete
+          # UTF-8 sequence.
+          || (stringIndex == stringLength - 1 && throw earlyEndMsg)
+
+      ) iterResult
+    );
 
   /* Decodes an UTF-8 string, but doesn't throw on error.
      Instead it returns null.