From 0e9c770972afe787565c7ba5475ff8398f807f20 Mon Sep 17 00:00:00 2001
From: sterni <sternenseemann@systemli.org>
Date: Tue, 23 Nov 2021 19:35:16 +0100
Subject: refactor(sterni/nix/utf8): let wellFormedByte check first byte

Previously we would check the first byte only when trying to figure out
the predicate for the second byte. If the first byte was invalid, we'd
then throw with a helpful error message. However, this made
wellFormedByte a very weird function: depending on its input it would
either return a predicate or throw.

At the expense of doing the same check twice, we now check the first
byte when it is first passed and always return a boolean.

Change-Id: I32ab6051c844711849e5b4a115e2511b53682baa
---
 users/sterni/nix/utf8/default.nix | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

(limited to 'users')

diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix
index b3185d9743b5..0c6e7d940083 100644
--- a/users/sterni/nix/utf8/default.nix
+++ b/users/sterni/nix/utf8/default.nix
@@ -38,9 +38,7 @@ let
      Based on table 3-7. from The Unicode Standard,
      Version 13.0, section 3.9.
 
-     Throws if the first byte is invalid.
-
-     Type: integer -> integer -> (integer -> bool)
+     Type: integer -> integer -> integer -> bool
   */
   wellFormedByte =
     # first byte's integer value
@@ -49,16 +47,8 @@ let
     pos:
       let
         defaultRange = int.inRange 128 191;
-      in
-        # The first byte is either ASCII which requires no checks
-        # or we automatically check it when we check the subsequent
-        # bytes. The downside is that this may generate bad error
-        # messages in very rare cases.
-        if pos == 0
-        then lib.const true
-        else if pos > 1 # 3rd and 4th byte have only one validity rule
-        then defaultRange
-        else assert pos == 1; flow.switch first [
+
+        secondBytePredicate = flow.switch first [
           [ (int.inRange 194 223) defaultRange          ] # C2..DF
           [ 224                   (int.inRange 160 191) ] # E0
           [ (int.inRange 225 236) defaultRange          ] # E1..EC
@@ -67,12 +57,19 @@ let
           [ 240                   (int.inRange 144 191) ] # F0
           [ (int.inRange 241 243) defaultRange          ] # F1..F3
           [ 244                   (int.inRange 128 143) ] # F4
-          [
-            (fun.const true)
-            (builtins.throw "Invalid first byte ${int.toHex first}")
-          ]
+          [ (fun.const true)      null                  ]
         ];
 
+        firstBytePredicate = byte: assert first == byte;
+          first < 128 || secondBytePredicate != null;
+      in
+        # Either ASCII or in one of the byte ranges of Table 3-7.
+        if pos == 0 then firstBytePredicate
+        # return predicate according to Table 3-7.
+        else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate
+        # 3rd and 4th byte have only one validity rule
+        else defaultRange;
+
   /* Iteration step for decoding an UTF-8 byte sequence.
      It decodes incrementally, i. e. it has to be fed
      one byte at a time and then returns either a
-- 
cgit 1.4.1