about summary refs log tree commit diff
path: root/users/sterni/nix/utf8/tests/default.nix
blob: ddcd34208a6d26f5cdc311c1916a9153a0ccee79 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
{ depot, pkgs, lib, ... }:

let

  inherit (pkgs)
    runCommandLocal
    ;

  inherit (depot.nix.runTestsuite)
    runTestsuite
    it
    assertEq
    assertThrows
    assertDoesNotThrow
    ;

  inherit (depot.nix.writers)
    rustSimple
    ;

  inherit (depot.users.sterni.nix)
    int
    utf8
    string
    char
    ;

  # Reference decoder built from the inline Rust program below: it reads all
  # of stdin as a UTF-8 string and prints the codepoint of every `char`,
  # separated by spaces and wrapped in `[ ` … `]`, so that the output has the
  # syntax of a Nix list of integers.
  rustDecoder = rustSimple {
    name = "utf8-decode";
  } ''
    use std::io::{self, Read};

    fn main() -> io::Result<()> {
      // Fails (and therefore exits non-zero) if stdin is not valid UTF-8.
      let mut input = String::new();
      io::stdin().read_to_string(&mut input)?;

      // Every codepoint is followed by a single space, so an empty input
      // renders as "[ ]".
      let body: String = input
        .chars()
        .map(|c| format!("{} ", u32::from(c)))
        .collect();

      print!("[ {}]", body);

      Ok(())
    }
  '';

  # Decode the string `s` with the Rust reference decoder: a local derivation
  # pipes `s` into `rustDecoder`, and its output (a Nix list literal) is then
  # imported as a Nix value (import-from-derivation).
  #
  # NOTE(review): `s` is interpolated verbatim into the derivation name —
  # confirm that `runCommandLocal` copes with names containing spaces and
  # non-ASCII characters (and with the empty string).
  rustDecode = s:
    import (runCommandLocal "${s}-decoded" {} ''
      printf '%s' ${lib.escapeShellArg s} | ${rustDecoder} > $out
    '');

  # Run `utf8.decode` on a byte sequence given as a list of hex strings,
  # e.g. `[ "C0" "AF" ]`. Each element is parsed with `int.fromHex` and the
  # resulting bytes are assembled into a string via `string.fromBytes`.
  hexDecode = l:
    let
      bytes = builtins.map int.fromHex l;
      raw = string.fromBytes bytes;
    in
    utf8.decode raw;

  # Run `utf8.encode` on a list of codepoints given as hex strings; each
  # element is parsed with `int.fromHex` first.
  hexEncode = l:
    let
      codepoints = builtins.map int.fromHex l;
    in
    utf8.encode codepoints;

  # Negative tests: inputs that `utf8.decode` / `utf8.encode` are expected to
  # reject by throwing, plus one well-formed four byte sequence as a control.
  testFailures =
    let
      # Inclusive bounds of the surrogate block reserved for UTF-16.
      surrogateFirst = int.fromHex "D800";
      surrogateLast = int.fromHex "DFFF";

      # `builtins.genList f n` calls `f` with 0 .. n - 1, so the length has to
      # be the size of the inclusive range (0x800). Using 0x7FF here would stop
      # at U+DFFE and leave the last surrogate, U+DFFF, untested.
      surrogateCount = surrogateLast - surrogateFirst + 1;
    in
    it "checks UTF-8 decoding failures" ([
      (assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ]))
      # examples from The Unicode Standard
      (assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ]))
      (assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ]))
      (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ])
      (assertThrows "Codepoint out of range: 0xFFFFFF" (hexEncode [ "FFFFFF" ]))
      (assertThrows "Codepoint out of range: -0x02" (hexEncode [ "-02" ]))
    ] ++ builtins.genList (i:
      let
        cp = surrogateFirst + i;
      in
        # One assertion per surrogate codepoint: encoding it must throw.
        assertThrows "Can't encode UTF-16 reserved characters: ${utf8.formatCodepoint cp}"
          (utf8.encode [ cp ])
    ) surrogateCount);

  # For ASCII-only strings, `utf8.decode` has to yield the same list as
  # `string.toBytes`.
  testAscii =
    let
      asciiSamples = [
        "foo bar"
        "hello\nworld"
        "carriage\r\nreturn"
        "1238398494829304 []<><>({})[]!!)"
        (string.take 127 char.allChars)
      ];

      # Builds the assertion for a single sample string.
      sameAsBytes = s:
        assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\""
          (string.toBytes s)
          (utf8.decode s);
    in
    it "checks decoding of ascii strings"
      (builtins.map sameAsBytes asciiSamples);

  # Assorted non-ASCII sample strings (plus the empty string). Used as input
  # by both `testDecoding` and `testDecodingEncoding`.
  randomUnicode = [
    "" # empty string should yield empty list
    "🥰👨‍👨‍👧‍👦🐈‍⬛👩🏽‍🦰"
    # https://kermitproject.org/utf8.html
    "ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ"
    "An preost wes on leoden, Laȝamon was ihoten"
    "Sîne klâwen durh die wolken sint geslagen,"
    "Τὴ γλῶσσα μοῦ ἔδωσαν ἑλληνικὴ"
    "На берегу пустынных волн"
    "ვეპხის ტყაოსანი შოთა რუსთაველი"
    "யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம், "
    "ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು "
  ];

  # Sample sentences in a variety of scripts, each prefixed with a label for
  # its language. Used as input by both `testDecoding` and
  # `testDecodingEncoding`.
  # Source: https://kermitproject.org/utf8.html
  glassSentences = [
    "Euro Symbol: €."
    "Greek: Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα."
    "Íslenska / Icelandic: Ég get etið gler án þess að meiða mig."
    "Polish: Mogę jeść szkło, i mi nie szkodzi."
    "Romanian: Pot să mănânc sticlă și ea nu mă rănește."
    "Ukrainian: Я можу їсти шкло, й воно мені не пошкодить."
    "Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։"
    "Georgian: მინას ვჭამ და არა მტკივა."
    "Hindi: मैं काँच खा सकता हूँ, मुझे उस से कोई पीडा नहीं होती."
    "Hebrew(2): אני יכול לאכול זכוכית וזה לא מזיק לי."
    "Yiddish(2): איך קען עסן גלאָז און עס טוט מיר נישט װײ."
    "Arabic(2): أنا قادر على أكل الزجاج و هذا لا يؤلمني."
    "Japanese: 私はガラスを食べられます。それは私を傷つけません。"
    "Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ "
  ];

  # Cross-check: for every sample string, `utf8.decode` has to agree with the
  # codepoints reported by the Rust reference decoder (`rustDecode`).
  testDecoding =
    let
      samples = glassSentences ++ randomUnicode;

      # Builds the assertion for a single sample string.
      matchesRust = s:
        assertEq "Decoding of “${s}” is correct"
          (utf8.decode s)
          (rustDecode s);
    in
    it "checks decoding of UTF-8 strings against Rust's String"
      (builtins.map matchesRust samples);

  # Round trip: for every sample string, feeding the result of `utf8.decode`
  # into `utf8.encode` has to reproduce the original string.
  testDecodingEncoding =
    let
      samples = glassSentences ++ randomUnicode;

      # Builds the assertion for a single sample string.
      roundTrips = s:
        assertEq "Decoding and then encoding “${s}” yields itself"
          (utf8.encode (utf8.decode s))
          s;
    in
    it "checks that decoding and then encoding forms an identity"
      (builtins.map roundTrips samples);

in
  # Result of this file: all test groups defined above, bundled into a single
  # test suite named "nix.utf8".
  runTestsuite "nix.utf8" [
    testFailures
    testAscii
    testDecoding
    testDecodingEncoding
  ]