about summary refs log tree commit diff
path: root/users/sterni/nix/utf8/tests/default.nix
diff options
context:
space:
mode:
Diffstat (limited to 'users/sterni/nix/utf8/tests/default.nix')
-rw-r--r--users/sterni/nix/utf8/tests/default.nix141
1 files changed, 141 insertions, 0 deletions
diff --git a/users/sterni/nix/utf8/tests/default.nix b/users/sterni/nix/utf8/tests/default.nix
new file mode 100644
index 000000000000..ddcd34208a6d
--- /dev/null
+++ b/users/sterni/nix/utf8/tests/default.nix
@@ -0,0 +1,141 @@
+{ depot, pkgs, lib, ... }:
+
+let
+
+  inherit (pkgs)
+    runCommandLocal
+    ;
+
+  inherit (depot.nix.runTestsuite)
+    runTestsuite
+    it
+    assertEq
+    assertThrows
+    assertDoesNotThrow
+    ;
+
+  inherit (depot.nix.writers)
+    rustSimple
+    ;
+
+  inherit (depot.users.sterni.nix)
+    int
+    utf8
+    string
+    char
+    ;
+
+  rustDecoder = rustSimple {
+    name = "utf8-decode";
+  } ''
+    use std::io::{self, Read};
+    fn main() -> std::io::Result<()> {
+      let mut buffer = String::new();
+      io::stdin().read_to_string(&mut buffer)?;
+
+      print!("[ ");
+
+      for c in buffer.chars() {
+        print!("{} ", u32::from(c));
+      }
+
+      print!("]");
+
+      Ok(())
+    }
+  '';
+
+  rustDecode = s:
+    let
+      expr = runCommandLocal "${s}-decoded" {} ''
+        printf '%s' ${lib.escapeShellArg s} | ${rustDecoder} > $out
+      '';
+    in import expr;
+
+  hexDecode = l:
+    utf8.decode (string.fromBytes (builtins.map int.fromHex l));
+
+  hexEncode = l: utf8.encode (builtins.map int.fromHex l);
+
+  testFailures = it "checks UTF-8 decoding failures" ([
+    (assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ]))
+    # examples from The Unicode Standard
+    (assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ]))
+    (assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ]))
+    (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ])
+    (assertThrows "Codepoint out of range: 0xFFFFFF" (hexEncode [ "FFFFFF" ]))
+    (assertThrows "Codepoint out of range: -0x02" (hexEncode [ "-02" ]))
+  ] ++ builtins.genList (i:
+    let
+      cp = i + int.fromHex "D800";
+    in
+      assertThrows "Can't encode UTF-16 reserved characters: ${utf8.formatCodepoint cp}"
+        (utf8.encode [ cp ])
+  ) (int.fromHex "07FF"));
+
+  testAscii = it "checks decoding of ascii strings"
+    (builtins.map (s: assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\""
+      (string.toBytes s) (utf8.decode s)) [
+        "foo bar"
+        "hello\nworld"
+        "carriage\r\nreturn"
+        "1238398494829304 []<><>({})[]!!)"
+        (string.take 127 char.allChars)
+      ]);
+
+  randomUnicode = [
+    "" # empty string should yield empty list
+    "🥰👨‍👨‍👧‍👦🐈‍⬛👩🏽‍🦰"
+    # https://kermitproject.org/utf8.html
+    "ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ"
+    "An preost wes on leoden, Laȝamon was ihoten"
+    "Sîne klâwen durh die wolken sint geslagen,"
+    "Τὴ γλῶσσα μοῦ ἔδωσαν ἑλληνικὴ"
+    "На берегу пустынных волн"
+    "ვეპხის ტყაოსანი შოთა რუსთაველი"
+    "யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம், "
+    "ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು "
+  ];
+
+  # https://kermitproject.org/utf8.html
+  glassSentences = [
+    "Euro Symbol: €."
+    "Greek: Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα."
+    "Íslenska / Icelandic: Ég get etið gler án þess að meiða mig."
+    "Polish: Mogę jeść szkło, i mi nie szkodzi."
+    "Romanian: Pot să mănânc sticlă și ea nu mă rănește."
+    "Ukrainian: Я можу їсти шкло, й воно мені не пошкодить."
+    "Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։"
+    "Georgian: მინას ვჭამ და არა მტკივა."
+    "Hindi: मैं काँच खा सकता हूँ, मुझे उस से कोई पीडा नहीं होती."
+    "Hebrew(2): אני יכול לאכול זכוכית וזה לא מזיק לי."
+    "Yiddish(2): איך קען עסן גלאָז און עס טוט מיר נישט װײ."
+    "Arabic(2): أنا قادر على أكل الزجاج و هذا لا يؤلمني."
+    "Japanese: 私はガラスを食べられます。それは私を傷つけません。"
+    "Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ "
+  ];
+
+  testDecoding = it "checks decoding of UTF-8 strings against Rust's String"
+    (builtins.map
+      (s: assertEq "Decoding of “${s}” is correct" (utf8.decode s) (rustDecode s))
+      (lib.flatten [
+        glassSentences
+        randomUnicode
+      ]));
+
+  testDecodingEncoding = it "checks that decoding and then encoding forms an identity"
+    (builtins.map
+      (s: assertEq "Decoding and then encoding “${s}” yields itself"
+        (utf8.encode (utf8.decode s)) s)
+      (lib.flatten [
+        glassSentences
+        randomUnicode
+      ]));
+
+in
+  runTestsuite "nix.utf8" [
+    testFailures
+    testAscii
+    testDecoding
+    testDecodingEncoding
+  ]