corp/russian/data-import/src/main.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288

//! This program imports Russian language data from OpenCorpora
//! ("Открытый корпус") and OpenRussian into a SQLite database that
//! can be used for [//corp/russian][corp-russian] projects.
//!
//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian
//!
//! Ideally, running this on intact dumps should yield a fully
//! functional SQLite database compatible with all other tools
//! consuming it.
//!
//! ## OpenCorpora format
//!
//! The format used is partially documented on the [OpenCorpora
//! website][format-docs]. This seems to be a slightly outdated
//! format, however, hence some information about what the format
//! seems to be today.
//!
//! [format-docs]: http://opencorpora.org/?page=export
//!
//! The format is an XML file, which has several categories of data,
//! each with their own schema:
//!
//! * `grammemes`: These define units of grammar. They're *likely* pretty
//!   static, and we'll *likely* want to map them into a custom set of
//!   (simpler) categories.
//!
//!   They form some kind of internal hierarchy, where some of them have a
//!   `parent` attribute set to some other grammemes `name`.
//!
//!   There's a ridiculous number of these.
//!
//! * `restrictions`: Unclear, not documented on the page. They describe
//!   something about the relationship between grammemes.
//!
//! * `lemmata`: this lists the actual lemmas, as well as all their
//!   included morphological variants
//!
//!   Each lemma has an `id` attribute uniquely identifying its dictionary
//!   form, as well as a number of sub-elements:
//!
//!   * the `l` attribute contains the lemma itself
//!   * the `f` attributes contain morphological variations
//!
//!   Each of these sub elements again contains a number of `g` elements,
//!   which refer to the IDs of grammems in their `v` attributes.
//!
//! * `<link_types>` These list possible "relationships between lemmas",
//!   basically just assigning them IDs and names. There's only 27 of
//!   these.
//!
//! * `<links>`: Using the types defined above, this establishes links
//!   between lemmas that have some kind of relationship.
//!
//!   For example, a relationship `cardinal/ordinal` might be established
//!   between the lemmas "два" and "второй".
//!
//! ## OpenRussian format
//!
//! The [OpenRussian](https://en.openrussian.org/dictionary) project
//! lets users export its database as a set of CSV-files. For our
//! purposes, we download the files using `<tab>` separators.
//!
//! Whereas OpenCorpora opts for a flat structure with a "tag" system
//! (through its flexible grammemes), OpenRussian has a fixed pre-hoc
//! structure into which it sorts some words with their morphologies.
//! The OpenRussian database is much smaller as of January 2023 (~1.7
//! million words vs. >5 million for OpenCorpora), but some of the
//! information is much more practically useful.
//!
//! Two very important bits of information OpenRussian has are accent
//! marks (most tables containing actual words have a normal form
//! containing and accent mark, and a "bare" form without) and
//! translations into English and German.
//!
//! The full dump includes the following tables (and some more):
//!
//! * `words`: List of lemmas in the corpus, with various bits of
//!    metadata as well as hand-written notes.
//!
//! * `adjectives`: Contains IDs for words that are adjectives.
//!
//! * `nouns`: IDs for words that are nouns; and noun metadata (e.g.
//!   gender, declinability)
//!
//! * `verbs`: IDs of words that are verbs, including their aspect and
//!   "partnered" verb in the other aspect
//!
//! * `words_forms`: Contains all morphed variants of the lemmas from
//!   `words`, including information about their grammeme, and accent
//!   marks.
//!
//! * `words_rels`: Contains relations between words, containing
//!   information like "synonyms" or general relation between words.
//!
//! * `translations`: Contains translations tagged by target language,
//!   as well as examples and (occasionally) additional information.
//!
//! These tables also contain something, but have not been analysed
//! yet:
//!
//! * `expressions_words`
//! * `sentences`
//! * `sentences_translations`
//! * `sentences_words`

use log::{error, info};
use rusqlite::{Connection, Result};
use std::env;
use std::fmt::Display;
use std::fs::File;
use std::io::BufReader;

mod db_setup;
mod oc_parser;
mod or_parser;

struct Args {
    output: String,
    or_input: String,
    oc_input: String,
}

impl Args {
    fn populated(&self) -> bool {
        !(self.output.is_empty() || self.or_input.is_empty() || self.oc_input.is_empty())
    }
}

fn usage(binary_name: &str) {
    bail(format!(
        "usage: {} --output <output-file> --or-input <or-input> --oc-input <oc-input>",
        binary_name
    ));
}

fn parse_args() -> Args {
    let mut args_iter = env::args();
    let binary_name = args_iter.next().unwrap();

    let mut args = Args {
        output: "".into(),
        or_input: env::var("OPENRUSSIAN_DATA").unwrap_or_default(),
        oc_input: env::var("OPENCORPORA_DATA").unwrap_or_default(),
    };

    loop {
        if args.populated() {
            break;
        }

        while let Some(arg) = args_iter.next() {
            match arg.as_str() {
                "--output" => {
                    args.output = args_iter.next().unwrap();
                }

                "--or-input" => {
                    args.or_input = args_iter.next().unwrap();
                }

                "--oc-input" => {
                    args.oc_input = args_iter.next().unwrap();
                }

                _ => usage(&binary_name),
            }
        }
    }

    if args.output.is_empty() || args.or_input.is_empty() || args.oc_input.is_empty() {
        usage(&binary_name);
    }

    args
}

fn open_corpora(conn: &Connection, args: &Args) {
    let input_file = File::open(&args.oc_input).ensure("failed to open input file");
    let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
    db_setup::initial_oc_schema(&conn);

    let mut tx = conn
        .unchecked_transaction()
        .ensure("failed to start transaction");

    let mut count = 0;

    while let Some(elem) = parser.next_element() {
        // commit every 1000 things
        if count % 1000 == 0 {
            tx.commit().ensure("transaction failed");
            tx = conn
                .unchecked_transaction()
                .ensure("failed to start new transaction");
            info!("transaction committed at watermark {}", count);
        }

        db_setup::insert_oc_element(&tx, elem);

        count += 1;
    }

    tx.commit().ensure("final OpenCorpora commit failed");

    info!("finished OpenCorpora import");
}

fn open_russian(conn: &Connection, args: &Args) {
    let parser = or_parser::OpenRussianParser::new(&args.or_input);

    db_setup::initial_or_schema(conn);

    {
        let tx = conn
            .unchecked_transaction()
            .ensure("failed to start transaction");

        db_setup::insert_or_words(&tx, parser.words());
        tx.commit().ensure("OpenRussian words commit failed");
    }

    {
        let tx = conn
            .unchecked_transaction()
            .ensure("failed to start transaction");

        db_setup::insert_or_word_forms(&tx, parser.words_forms());
        tx.commit().ensure("OpenRussian word forms commit failed");
    }

    info!("finished OpenRussian import");
}

fn main() {
    env_logger::builder()
        .filter_level(log::LevelFilter::Info)
        .init();

    let args = parse_args();

    info!("output path: {}", args.output);
    info!("OpenCorpora input path: {}", args.oc_input);
    info!("OpenRussian input path: {}", args.or_input);

    let conn = Connection::open(&args.output).ensure("failed to open DB connection");

    open_corpora(&conn, &args);
    open_russian(&conn, &args);

    // afterwards:
    // add actual IDs to grammemes
    // properly reference keys internally
    // add foreign key constraint on lemma_grammemes.grammeme
}

/// It's like `expect`, but through `log::error`.
trait Ensure<T> {
    fn ensure<S: Into<String>>(self, msg: S) -> T;
}

impl<T, E: Display> Ensure<T> for Result<T, E> {
    fn ensure<S: Into<String>>(self, msg: S) -> T {
        match self {
            Ok(x) => x,
            Err(err) => {
                error!("{}: {}", msg.into(), err);
                std::process::exit(1);
            }
        }
    }
}

impl<T> Ensure<T> for Option<T> {
    fn ensure<S: Into<String>>(self, msg: S) -> T {
        match self {
            Some(x) => x,
            None => {
                error!("{}", msg.into());
                std::process::exit(1);
            }
        }
    }
}

fn bail<S: Into<String>>(msg: S) -> ! {
    error!("{}", msg.into());
    std::process::exit(1);
}