From 0e9c770972afe787565c7ba5475ff8398f807f20 Mon Sep 17 00:00:00 2001 From: sterni Date: Tue, 23 Nov 2021 19:35:16 +0100 Subject: refactor(sterni/nix/utf8): let wellFormedByte check first byte Previously we would check the first byte only when trying to figure out the predicate for the second byte. If the first byte was invalid, we'd then throw with a helpful error message. However this made wellFormedByte a very weird function. At the expense of doing the same check twice, we now check the first byte, when it is first passed, and always return a boolean. Change-Id: I32ab6051c844711849e5b4a115e2511b53682baa --- users/sterni/nix/utf8/default.nix | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'users/sterni/nix/utf8/default.nix') diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix index b3185d9743..0c6e7d9400 100644 --- a/users/sterni/nix/utf8/default.nix +++ b/users/sterni/nix/utf8/default.nix @@ -38,9 +38,7 @@ let Based on table 3-7. from The Unicode Standard, Version 13.0, section 3.9. - Throws if the first byte is invalid. - - Type: integer -> integer -> (integer -> bool) + Type: integer -> integer -> integer -> bool */ wellFormedByte = # first byte's integer value @@ -49,16 +47,8 @@ let pos: let defaultRange = int.inRange 128 191; - in - # The first byte is either ASCII which requires no checks - # or we automatically check it when we check the subsequent - # bytes. The downside is that this may generate bad error - # messages in very rare cases. - if pos == 0 - then lib.const true - else if pos > 1 # 3rd and 4th byte have only one validity rule - then defaultRange - else assert pos == 1; flow.switch first [ + + secondBytePredicate = flow.switch first [ [ (int.inRange 194 223) defaultRange ] # C2..DF [ 224 (int.inRange 160 191) ] # E0 [ (int.inRange 225 236) defaultRange ] # E1..EC @@ -67,12 +57,19 @@ let [ 240 (int.inRange 144 191) ] # F0 [ (int.inRange 241 243) defaultRange ] # F1..F3 [ 244 (int.inRange 128 143) ] # F4 - [ - (fun.const true) - (builtins.throw "Invalid first byte ${int.toHex first}") - ] + [ (fun.const true) null ] ]; + firstBytePredicate = byte: assert first == byte; + first < 128 || secondBytePredicate != null; + in + # Either ASCII or in one of the byte ranges of Table 3-6. + if pos == 0 then firstBytePredicate + # return predicate according to Table 3-6. + else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate + # 3rd and 4th byte have only one validity rule + else defaultRange; + /* Iteration step for decoding an UTF-8 byte sequence. It decodes incrementally, i. e. it has to be fed one byte at a time and then returns either a -- cgit 1.4.1