refactor(sterni/nix/utf8): let wellFormedByte check first byte r/3092

Previously we would check the first byte only when trying to figure out the predicate for the second byte. If the first byte was invalid, we'd then throw with a helpful error message. However, this made wellFormedByte a very weird function. At the expense of doing the same check twice, we now check the first byte when it is first passed and always return a boolean.

Change-Id: I32ab6051c844711849e5b4a115e2511b53682baa
author: sterni <sternenseemann@systemli.org> 2021-11-23T18·35+0100
committer: sterni <sternenseemann@systemli.org> 2021-11-25T11·15+0100
commit: 0e9c770972afe787565c7ba5475ff8398f807f20 (patch)
tree: 4e8fadfd563ee664e783e837c83700cc40350352
parent: 87a0aaa77dd94a5a83e4cc0d00e06528d5ce8edc (diff)
1 files changed, 14 insertions, 17 deletions
diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix
index b3185d9743..0c6e7d9400 100644
--- a/users/sterni/nix/utf8/default.nix
+++ b/users/sterni/nix/utf8/default.nix
@@ -38,9 +38,7 @@ let
      Based on table 3-7. from The Unicode Standard,
      Version 13.0, section 3.9.
 
-     Throws if the first byte is invalid.
-
-     Type: integer -> integer -> (integer -> bool)
+     Type: integer -> integer -> integer -> bool
   */
   wellFormedByte =
     # first byte's integer value
@@ -49,16 +47,8 @@ let
     pos:
       let
         defaultRange = int.inRange 128 191;
-      in
-        # The first byte is either ASCII which requires no checks
-        # or we automatically check it when we check the subsequent
-        # bytes. The downside is that this may generate bad error
-        # messages in very rare cases.
-        if pos == 0
-        then lib.const true
-        else if pos > 1 # 3rd and 4th byte have only one validity rule
-        then defaultRange
-        else assert pos == 1; flow.switch first [
+
+        secondBytePredicate = flow.switch first [
           [ (int.inRange 194 223) defaultRange          ] # C2..DF
           [ 224                   (int.inRange 160 191) ] # E0
           [ (int.inRange 225 236) defaultRange          ] # E1..EC
@@ -67,12 +57,19 @@ let
           [ 240                   (int.inRange 144 191) ] # F0
           [ (int.inRange 241 243) defaultRange          ] # F1..F3
           [ 244                   (int.inRange 128 143) ] # F4
-          [
-            (fun.const true)
-            (builtins.throw "Invalid first byte ${int.toHex first}")
-          ]
+          [ (fun.const true)      null                  ]
         ];
 
+        firstBytePredicate = byte: assert first == byte;
+          first < 128 || secondBytePredicate != null;
+      in
+        # Either ASCII or in one of the byte ranges of Table 3-7.
+        if pos == 0 then firstBytePredicate
+        # return predicate according to Table 3-7.
+        else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate
+        # 3rd and 4th byte have only one validity rule
+        else defaultRange;
+
   /* Iteration step for decoding an UTF-8 byte sequence.
      It decodes incrementally, i. e. it has to be fed
      one byte at a time and then returns either a
author	sterni <sternenseemann@systemli.org>	2021-11-23T18·35+0100
committer	sterni <sternenseemann@systemli.org>	2021-11-25T11·15+0100
commit	0e9c770972afe787565c7ba5475ff8398f807f20 (patch)
tree	4e8fadfd563ee664e783e837c83700cc40350352
parent	87a0aaa77dd94a5a83e4cc0d00e06528d5ce8edc (diff)