# Test suite for the UTF-8 helpers exposed as `depot.users.sterni.nix.utf8`.
#
# The suite checks four things:
#   * ill-formed / out-of-range input makes `utf8.decode` / `utf8.encode` throw,
#   * decoding pure ASCII yields the same values as `string.toBytes`,
#   * decoding agrees with a small Rust reference program,
#   * `utf8.encode (utf8.decode s)` returns `s` again.
{ depot, pkgs, lib, ... }:

let
  inherit (pkgs)
    runCommandLocal
    ;

  inherit (depot.nix.runTestsuite)
    runTestsuite
    it
    assertEq
    assertThrows
    assertDoesNotThrow
    ;

  inherit (depot.nix.writers)
    rustSimple
    ;

  inherit (depot.users.sterni.nix)
    int
    utf8
    string
    char
    ;

  # Reference decoder: reads all of stdin into a Rust `String` and prints
  # every `char` as a `u32`, wrapped in `[ ... ]` and separated by spaces.
  # That output is a Nix list literal, which is what lets `rustDecode`
  # below simply `import` it.
  rustDecoder = rustSimple
    {
      name = "utf8-decode";
    } ''
    use std::io::{self, Read};
    fn main() -> std::io::Result<()> {
      let mut buffer = String::new();
      io::stdin().read_to_string(&mut buffer)?;

      print!("[ ");

      for c in buffer.chars() {
        print!("{} ", u32::from(c));
      }

      print!("]");

      Ok(())
    }
  '';

  # Decode the string `s` with the Rust reference decoder.
  #
  # `s` is piped (shell-escaped, via `printf '%s'`) into `rustDecoder`
  # inside a `runCommandLocal` derivation; the output file is then imported
  # as a Nix expression, i.e. the result is whatever list the decoder printed.
  rustDecode = s:
    let
      expr = runCommandLocal "${s}-decoded" { } ''
        printf '%s' ${lib.escapeShellArg s} | ${rustDecoder} > $out
      '';
    in
    import expr;

  # Build a string from the list of hex-encoded bytes `l` and run it
  # through `utf8.decode`.
  hexDecode = l:
    utf8.decode (string.fromBytes (builtins.map int.fromHex l));

  # Interpret the list of hex strings `l` as integers (code points) and run
  # them through `utf8.encode`.
  hexEncode = l: utf8.encode (builtins.map int.fromHex l);

  testFailures = it "checks UTF-8 decoding failures" ([
    (assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ]))
    # examples from The Unicode Standard
    (assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ]))
    (assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ]))
    (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ])
    (assertThrows "Codepoint out of range: 0xFFFFFF" (hexEncode [ "FFFFFF" ]))
    (assertThrows "Codepoint out of range: -0x02" (hexEncode [ "-02" ]))
  ] ++ builtins.genList
    (i:
      let
        cp = i + int.fromHex "D800";
      in
      assertThrows "Can't encode UTF-16 reserved characters: ${utf8.formatCodepoint cp}"
        (utf8.encode [ cp ])
    )
    # The surrogate block U+D800..U+DFFF holds 0x800 code points and genList
    # produces the indices 0 .. n-1, so the count has to be 0x800 for the
    # last surrogate (U+DFFF) to be covered as well.
    (int.fromHex "0800"));

  # For ASCII-only input every byte is its own code point, so the raw bytes
  # of the string are the expected decoding result.
  testAscii = it "checks decoding of ascii strings"
    (builtins.map
      (s: assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\""
        (string.toBytes s)
        (utf8.decode s))
      [
        "foo bar"
        "hello\nworld"
        "carriage\r\nreturn"
        "1238398494829304 []<><>({})[]!!)"
        (string.take 127 char.allChars)
      ]);

  # Assorted sample strings used by the decoding tests below.
  # Note: the "An preost ..." sample contains an embedded line break, which
  # is why it spans two source lines.
  randomUnicode = [
    "" # empty string should yield empty list
    "🥰👨‍👨‍👧‍👦🐈‍⬛👩🏽‍🦰"
    # https://kermitproject.org/utf8.html
    "ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ"
    "An preost wes on leoden, 
Laȝamon was ihoten"
    "Sîne klâwen durh die wolken sint geslagen,"
    "Τὴ γλῶσσα μοῦ ἔδωσαν ἑλληνικὴ"
    "На берегу пустынных волн"
    "ვეპხის ტყაოსანი შოთა რუსთაველი"
    "யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம், "
    "ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು "
  ];

  # https://kermitproject.org/utf8.html
  glassSentences = [
    "Euro Symbol: €."
    "Greek: Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα."
    "Íslenska / Icelandic: Ég get etið gler án þess að meiða mig."
    "Polish: Mogę jeść szkło, i mi nie szkodzi."
    "Romanian: Pot să mănânc sticlă și ea nu mă rănește."
    "Ukrainian: Я можу їсти шкло, й воно мені не пошкодить."
    "Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։"
    "Georgian: მინას ვჭამ და არა მტკივა."
    "Hindi: मैं काँच खा सकता हूँ, मुझे उस से कोई पीडा नहीं होती."
    "Hebrew(2): אני יכול לאכול זכוכית וזה לא מזיק לי."
    "Yiddish(2): איך קען עסן גלאָז און עס טוט מיר נישט װײ."
    "Arabic(2): أنا قادر على أكل الزجاج و هذا لا يؤلمني."
    "Japanese: 私はガラスを食べられます。それは私を傷つけません。"
    "Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ "
  ];

  # Cross-check `utf8.decode` against the Rust reference decoder for every
  # sample string.
  testDecoding = it "checks decoding of UTF-8 strings against Rust's String"
    (builtins.map
      (s: assertEq "Decoding of “${s}” is correct" (utf8.decode s) (rustDecode s))
      (lib.flatten [
        glassSentences
        randomUnicode
      ]));

  # Round trip: encoding the decoded code points must give back the input.
  testDecodingEncoding = it "checks that decoding and then encoding forms an identity"
    (builtins.map
      (s: assertEq "Decoding and then encoding “${s}” yields itself"
        (utf8.encode (utf8.decode s))
        s)
      (lib.flatten [
        glassSentences
        randomUnicode
      ]));

in
runTestsuite "nix.utf8" [
  testFailures
  testAscii
  testDecoding
  testDecodingEncoding
]