From aa122cbae78ce97d60c0c98ba14df753d97e40b1 Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Sun, 30 Jan 2022 19:06:58 +0300 Subject: style: format entire depot with nixpkgs-fmt This CL can be used to compare the style of nixpkgs-fmt against other formatters (nixpkgs, alejandra). Change-Id: I87c6abff6bcb546b02ead15ad0405f81e01b6d9e Reviewed-on: https://cl.tvl.fyi/c/depot/+/4397 Tested-by: BuildkiteCI Reviewed-by: sterni Reviewed-by: lukegb Reviewed-by: wpcarro Reviewed-by: Profpatsch Reviewed-by: kanepyork Reviewed-by: tazjin Reviewed-by: cynthia Reviewed-by: edef Reviewed-by: eta Reviewed-by: grfn --- users/sterni/nix/utf8/default.nix | 200 +++++++++++++++++--------------- users/sterni/nix/utf8/tests/default.nix | 57 +++++---- 2 files changed, 138 insertions(+), 119 deletions(-) (limited to 'users/sterni/nix/utf8') diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix index 270da934b6a6..71c846c0421e 100644 --- a/users/sterni/nix/utf8/default.nix +++ b/users/sterni/nix/utf8/default.nix @@ -25,7 +25,7 @@ let Type: integer -> integer */ byteCount = i: flow.cond [ - [ (int.bitAnd i 128 == 0) 1 ] + [ (int.bitAnd i 128 == 0) 1 ] [ (int.bitAnd i 224 == 192) 2 ] [ (int.bitAnd i 240 == 224) 3 ] [ (int.bitAnd i 248 == 240) 4 ] @@ -45,30 +45,30 @@ let first: # byte position as an index starting with 0 pos: - let - defaultRange = int.inRange 128 191; - - secondBytePredicate = flow.switch first [ - [ (int.inRange 194 223) defaultRange ] # C2..DF - [ 224 (int.inRange 160 191) ] # E0 - [ (int.inRange 225 236) defaultRange ] # E1..EC - [ 237 (int.inRange 128 159) ] # ED - [ (int.inRange 238 239) defaultRange ] # EE..EF - [ 240 (int.inRange 144 191) ] # F0 - [ (int.inRange 241 243) defaultRange ] # F1..F3 - [ 244 (int.inRange 128 143) ] # F4 - [ (fun.const true) null ] - ]; + let + defaultRange = int.inRange 128 191; + + secondBytePredicate = flow.switch first [ + [ (int.inRange 194 223) defaultRange ] # C2..DF + [ 224 (int.inRange 160 191) ] # E0 + [ (int.inRange 225 236) defaultRange ] # E1..EC + [ 237 (int.inRange 128 159) ] # ED + [ (int.inRange 238 239) defaultRange ] # EE..EF + [ 240 (int.inRange 144 191) ] # F0 + [ (int.inRange 241 243) defaultRange ] # F1..F3 + [ 244 (int.inRange 128 143) ] # F4 + [ (fun.const true) null ] + ]; - firstBytePredicate = byte: assert first == byte; - first < 128 || secondBytePredicate != null; - in - # Either ASCII or in one of the byte ranges of Table 3-6. - if pos == 0 then firstBytePredicate - # return predicate according to Table 3-6. - else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate - # 3rd and 4th byte have only one validity rule - else defaultRange; + firstBytePredicate = byte: assert first == byte; + first < 128 || secondBytePredicate != null; + in + # Either ASCII or in one of the byte ranges of Table 3-6. + if pos == 0 then firstBytePredicate + # return predicate according to Table 3-6. + else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate + # 3rd and 4th byte have only one validity rule + else defaultRange; /* Iteration step for decoding an UTF-8 byte sequence. It decodes incrementally, i. e. it has to be fed @@ -128,23 +128,24 @@ let # the current value by the amount of bytes left. offset = (count - (pos + 1)) * 6; in - code + (int.bitShiftL (int.bitAnd mask value) offset); + code + (int.bitShiftL (int.bitAnd mask value) offset); illFormedMsg = "Ill-formed byte ${int.toHex value} at position ${toString pos} in ${toString count} byte UTF-8 sequence"; in - if !(wellFormedByte first pos value) then builtins.throw illFormedMsg - else if pos + 1 == count - then (builtins.removeAttrs args [ # allow extra state being passed through - "count" - "code" - "pos" - "first" - ]) // { result = newCode; } - else (builtins.removeAttrs args [ "result" ]) // { - inherit count first; - code = newCode; - pos = pos + 1; - }; + if !(wellFormedByte first pos value) then builtins.throw illFormedMsg + else if pos + 1 == count + then (builtins.removeAttrs args [ + # allow extra state being passed through + "count" + "code" + "pos" + "first" + ]) // { result = newCode; } + else (builtins.removeAttrs args [ "result" ]) // { + inherit count first; + code = newCode; + pos = pos + 1; + }; /* Decode an UTF-8 string into a list of codepoints. @@ -161,7 +162,7 @@ let { key = "start"; stringIndex = -1; - state = {}; + state = { }; codepoint = null; } ]; @@ -170,7 +171,8 @@ let # updated values for current iteration step newIndex = stringIndex + 1; newState = step state (builtins.substring newIndex 1 s); - in lib.optional (newIndex < stringLength) { + in + lib.optional (newIndex < stringLength) { # unique keys to make genericClosure happy key = toString newIndex; # carryover state for the next step @@ -183,35 +185,39 @@ let in # extract all steps that yield a code point into a list builtins.map (v: v.codepoint) ( - builtins.filter ( - { codepoint, stringIndex, state, ... }: - - let - # error message in case we are missing bytes at the end of input - earlyEndMsg = - if state ? count && state ? pos - then "Missing ${toString (with state; count - pos)} bytes at end of input" - else "Unexpected end of input"; - in - - # filter out all iteration steps without a codepoint value - codepoint != null + builtins.filter + ( + { codepoint, stringIndex, state, ... }: + + let + # error message in case we are missing bytes at the end of input + earlyEndMsg = + if state ? count && state ? pos + then "Missing ${toString (with state; count - pos)} bytes at end of input" + else "Unexpected end of input"; + in + + # filter out all iteration steps without a codepoint value + codepoint != null # if we are at the iteration step of a non-empty input string, throw # an error if no codepoint was returned, as it indicates an incomplete # UTF-8 sequence. || (stringLength > 0 && stringIndex == stringLength - 1 && throw earlyEndMsg) - ) iterResult + ) + iterResult ); /* Pretty prints a Unicode codepoint in the U+ notation. Type: integer -> string */ - formatCodepoint = cp: "U+" + string.fit { - width = 4; - char = "0"; - } (int.toHex cp); + formatCodepoint = cp: "U+" + string.fit + { + width = 4; + char = "0"; + } + (int.toHex cp); encodeCodepoint = cp: let @@ -219,11 +225,11 @@ let # Note that this doesn't check if the Unicode codepoint is allowed, # but rather allows all theoretically UTF-8-encodeable ones. count = flow.switch cp [ - [ (int.inRange 0 127) 1 ] # 00000000 0xxxxxxx - [ (int.inRange 128 2047) 2 ] # 00000yyy yyxxxxxx - [ (int.inRange 2048 65535) 3 ] # zzzzyyyy yyxxxxxx + [ (int.inRange 0 127) 1 ] # 00000000 0xxxxxxx + [ (int.inRange 128 2047) 2 ] # 00000yyy yyxxxxxx + [ (int.inRange 2048 65535) 3 ] # zzzzyyyy yyxxxxxx [ (int.inRange 65536 1114111) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx, - # capped at U+10FFFF + # capped at U+10FFFF [ (fun.const true) (builtins.throw invalidCodepointMsg) ] ]; @@ -234,32 +240,34 @@ let # according to Table 3-6. from The Unicode Standard, Version 13.0, # section 3.9. u is split into uh and ul since they are used in # different bytes in the end. - components = lib.mapAttrs (_: { mask, offset }: - int.bitAnd (int.bitShiftR cp offset) mask - ) { - x = { - mask = if count > 1 then 63 else 127; - offset = 0; - }; - y = { - mask = if count > 2 then 63 else 31; - offset = 6; - }; - z = { - mask = 15; - offset = 12; - }; - # u which belongs into the second byte - ul = { - mask = 3; - offset = 16; - }; - # u which belongs into the first byte - uh = { - mask = 7; - offset = 18; + components = lib.mapAttrs + (_: { mask, offset }: + int.bitAnd (int.bitShiftR cp offset) mask + ) + { + x = { + mask = if count > 1 then 63 else 127; + offset = 0; + }; + y = { + mask = if count > 2 then 63 else 31; + offset = 6; + }; + z = { + mask = 15; + offset = 12; + }; + # u which belongs into the second byte + ul = { + mask = 3; + offset = 16; + }; + # u which belongs into the first byte + uh = { + mask = 7; + offset = 18; + }; }; - }; inherit (components) x y z ul uh; # Finally construct the byte sequence for the given codepoint. This is @@ -286,15 +294,18 @@ let unableToEncodeMessage = "Can't encode ${formatCodepoint cp} as UTF-8"; - in string.fromBytes ( - builtins.genList (i: - let - byte = builtins.elemAt bytes i; - in + in + string.fromBytes ( + builtins.genList + (i: + let + byte = builtins.elemAt bytes i; + in if wellFormedByte firstByte i byte then byte else builtins.throw unableToEncodeMessage - ) count + ) + count ); /* Encode a list of Unicode codepoints into an UTF-8 string. @@ -303,7 +314,8 @@ let */ encode = lib.concatMapStrings encodeCodepoint; -in { +in +{ inherit encode decode diff --git a/users/sterni/nix/utf8/tests/default.nix b/users/sterni/nix/utf8/tests/default.nix index ddcd34208a6d..40783eab2421 100644 --- a/users/sterni/nix/utf8/tests/default.nix +++ b/users/sterni/nix/utf8/tests/default.nix @@ -25,9 +25,10 @@ let char ; - rustDecoder = rustSimple { - name = "utf8-decode"; - } '' + rustDecoder = rustSimple + { + name = "utf8-decode"; + } '' use std::io::{self, Read}; fn main() -> std::io::Result<()> { let mut buffer = String::new(); @@ -47,10 +48,11 @@ let rustDecode = s: let - expr = runCommandLocal "${s}-decoded" {} '' + expr = runCommandLocal "${s}-decoded" { } '' printf '%s' ${lib.escapeShellArg s} | ${rustDecoder} > $out ''; - in import expr; + in + import expr; hexDecode = l: utf8.decode (string.fromBytes (builtins.map int.fromHex l)); @@ -65,23 +67,27 @@ let (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ]) (assertThrows "Codepoint out of range: 0xFFFFFF" (hexEncode [ "FFFFFF" ])) (assertThrows "Codepoint out of range: -0x02" (hexEncode [ "-02" ])) - ] ++ builtins.genList (i: - let - cp = i + int.fromHex "D800"; - in + ] ++ builtins.genList + (i: + let + cp = i + int.fromHex "D800"; + in assertThrows "Can't encode UTF-16 reserved characters: ${utf8.formatCodepoint cp}" (utf8.encode [ cp ]) - ) (int.fromHex "07FF")); + ) + (int.fromHex "07FF")); testAscii = it "checks decoding of ascii strings" - (builtins.map (s: assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\"" - (string.toBytes s) (utf8.decode s)) [ - "foo bar" - "hello\nworld" - "carriage\r\nreturn" - "1238398494829304 []<><>({})[]!!)" - (string.take 127 char.allChars) - ]); + (builtins.map + (s: assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\"" + (string.toBytes s) + (utf8.decode s)) [ + "foo bar" + "hello\nworld" + "carriage\r\nreturn" + "1238398494829304 []<><>({})[]!!)" + (string.take 127 char.allChars) + ]); randomUnicode = [ "" # empty string should yield empty list @@ -126,16 +132,17 @@ let testDecodingEncoding = it "checks that decoding and then encoding forms an identity" (builtins.map (s: assertEq "Decoding and then encoding “${s}” yields itself" - (utf8.encode (utf8.decode s)) s) + (utf8.encode (utf8.decode s)) + s) (lib.flatten [ glassSentences randomUnicode ])); in - runTestsuite "nix.utf8" [ - testFailures - testAscii - testDecoding - testDecodingEncoding - ] +runTestsuite "nix.utf8" [ + testFailures + testAscii + testDecoding + testDecodingEncoding +] -- cgit 1.4.1