about summary refs log tree commit diff
path: root/corp/russian/data-import/src/main.rs
diff options
context:
space:
mode:
Diffstat (limited to 'corp/russian/data-import/src/main.rs')
-rw-r--r--corp/russian/data-import/src/main.rs298
1 files changed, 298 insertions, 0 deletions
diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs
new file mode 100644
index 0000000000..21da48e8d8
--- /dev/null
+++ b/corp/russian/data-import/src/main.rs
@@ -0,0 +1,298 @@
+//! This program imports Russian language data from OpenCorpora
+//! ("Открытый корпус") and OpenRussian into a SQLite database that
+//! can be used for [//corp/russian][corp-russian] projects.
+//!
+//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian
+//!
+//! Ideally, running this on intact dumps should yield a fully
+//! functional SQLite database compatible with all other tools
+//! consuming it.
+//!
+//! ## OpenCorpora format
+//!
+//! The format used is partially documented on the [OpenCorpora
+//! website][format-docs]. This seems to be a slightly outdated
+//! format, however, hence some information about what the format
+//! seems to be today.
+//!
+//! [format-docs]: http://opencorpora.org/?page=export
+//!
+//! The format is an XML file, which has several categories of data,
+//! each with their own schema:
+//!
+//! * `grammemes`: These define units of grammar. They're *likely* pretty
+//!   static, and we'll *likely* want to map them into a custom set of
+//!   (simpler) categories.
+//!
+//!   They form some kind of internal hierarchy, where some of them have a
+//!   `parent` attribute set to some other grammeme's `name`.
+//!
+//!   There's a ridiculous number of these.
+//!
+//! * `restrictions`: Unclear, not documented on the page. They describe
+//!   something about the relationship between grammemes.
+//!
+//! * `lemmata`: this lists the actual lemmas, as well as all their
+//!   included morphological variants
+//!
+//!   Each lemma has an `id` attribute uniquely identifying its dictionary
+//!   form, as well as a number of sub-elements:
+//!
+//!   * the `l` element contains the lemma itself
+//!   * the `f` elements contain morphological variations
+//!
+//!   Each of these sub-elements again contains a number of `g` elements,
+//!   which refer to the IDs of grammemes in their `v` attributes.
+//!
+//! * `<link_types>` These list possible "relationships between lemmas",
+//!   basically just assigning them IDs and names. There's only 27 of
+//!   these.
+//!
+//! * `<links>`: Using the types defined above, this establishes links
+//!   between lemmas that have some kind of relationship.
+//!
+//!   For example, a relationship `cardinal/ordinal` might be established
+//!   between the lemmas "два" and "второй".
+//!
+//! ## OpenRussian format
+//!
+//! The [OpenRussian](https://en.openrussian.org/dictionary) project
+//! lets users export its database as a set of CSV-files. For our
+//! purposes, we download the files using `<tab>` separators.
+//!
+//! Whereas OpenCorpora opts for a flat structure with a "tag" system
+//! (through its flexible grammemes), OpenRussian has a fixed pre-hoc
+//! structure into which it sorts some words with their morphologies.
+//! The OpenRussian database is much smaller as of January 2023 (~1.7
+//! million words vs. >5 million for OpenCorpora), but some of the
+//! information is much more practically useful.
+//!
+//! Two very important bits of information OpenRussian has are accent
+//! marks (most tables containing actual words have a normal form
+//! containing an accent mark, and a "bare" form without) and
+//! translations into English and German.
+//!
+//! The full dump includes the following tables (and some more):
+//!
+//! * `words`: List of lemmas in the corpus, with various bits of
+//!    metadata as well as hand-written notes.
+//!
+//! * `adjectives`: Contains IDs for words that are adjectives.
+//!
+//! * `nouns`: IDs for words that are nouns; and noun metadata (e.g.
+//!   gender, declinability)
+//!
+//! * `verbs`: IDs of words that are verbs, including their aspect and
+//!   "partnered" verb in the other aspect
+//!
+//! * `words_forms`: Contains all morphed variants of the lemmas from
+//!   `words`, including information about their grammeme, and accent
+//!   marks.
+//!
+//! * `words_rels`: Contains relations between words, containing
+//!   information like "synonyms" or general relation between words.
+//!
+//! * `translations`: Contains translations tagged by target language,
+//!   as well as examples and (occasionally) additional information.
+//!
+//! These tables also contain something, but have not been analysed
+//! yet:
+//!
+//! * `expressions_words`
+//! * `sentences`
+//! * `sentences_translations`
+//! * `sentences_words`
+
+use log::{error, info};
+use rusqlite::{Connection, Result};
+use std::env;
+use std::fmt::Display;
+use std::fs::File;
+use std::io::BufReader;
+
+mod db_setup;
+mod mappings;
+mod oc_parser;
+mod or_parser;
+
+/// Paths the importer works with, collected by `parse_args`.
+struct Args {
+    /// Value of `--output`; opened as the SQLite database in `main`.
+    output: String,
+    /// Value of `--or-input`, initially taken from `$OPENRUSSIAN_DATA`.
+    or_input: String,
+    /// Value of `--oc-input`, initially taken from `$OPENCORPORA_DATA`.
+    oc_input: String,
+}
+
+impl Args {
+    /// Returns `true` only when none of the three paths is empty.
+    fn populated(&self) -> bool {
+        let fields = [&self.output, &self.or_input, &self.oc_input];
+        fields.iter().all(|field| !field.is_empty())
+    }
+}
+
+/// Reports the expected command-line invocation through `bail`, which
+/// logs the text as an error and exits the process with status 1.
+fn usage(binary_name: &str) {
+    let message = format!(
+        "usage: {} --output <output-file> --or-input <or-input> --oc-input <oc-input>",
+        binary_name
+    );
+
+    bail(message)
+}
+
+/// Builds the importer's [`Args`] from the environment and command line.
+///
+/// `or_input` and `oc_input` start out as the values of the
+/// `OPENRUSSIAN_DATA` and `OPENCORPORA_DATA` environment variables (empty
+/// if unset); `--output`, `--or-input` and `--oc-input` on the command
+/// line overwrite the respective field.
+///
+/// An unknown flag, a flag without a value, or any field still being
+/// empty after parsing results in a call to `usage`, which terminates the
+/// process.
+fn parse_args() -> Args {
+    let mut args_iter = env::args();
+    let binary_name = args_iter
+        .next()
+        .unwrap_or_else(|| String::from("data-import"));
+
+    let mut args = Args {
+        output: "".into(),
+        or_input: env::var("OPENRUSSIAN_DATA").unwrap_or_default(),
+        oc_input: env::var("OPENCORPORA_DATA").unwrap_or_default(),
+    };
+
+    // A single pass over the arguments is all that is needed. The previous
+    // outer `loop` never terminated when a required argument was missing,
+    // because the exhausted iterator could not populate it any more.
+    //
+    // `while let` (rather than `for`) because the flag's value is pulled
+    // from the same iterator inside the loop body.
+    while let Some(arg) = args_iter.next() {
+        let target = match arg.as_str() {
+            "--output" => &mut args.output,
+            "--or-input" => &mut args.or_input,
+            "--oc-input" => &mut args.oc_input,
+            _ => {
+                usage(&binary_name);
+                continue;
+            }
+        };
+
+        // A trailing flag without a value is a usage error, not a panic.
+        match args_iter.next() {
+            Some(value) => *target = value,
+            None => usage(&binary_name),
+        }
+    }
+
+    if !args.populated() {
+        usage(&binary_name);
+    }
+
+    args
+}
+
+/// Imports the OpenCorpora dump at `args.oc_input` into `conn`.
+///
+/// Sets up the OpenCorpora schema, then feeds every element yielded by
+/// the parser to `db_setup::insert_oc_element`. Inserts are batched into
+/// transactions of 1000 elements each; any failure to open the input or
+/// to start/commit a transaction is logged and terminates the process
+/// (see `Ensure`).
+fn open_corpora(conn: &Connection, args: &Args) {
+    let input_file = File::open(&args.oc_input).ensure("failed to open input file");
+    let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
+    db_setup::initial_oc_schema(conn);
+
+    let mut tx = conn
+        .unchecked_transaction()
+        .ensure("failed to start transaction");
+
+    let mut count = 0;
+
+    while let Some(elem) = parser.next_element() {
+        db_setup::insert_oc_element(&tx, elem);
+        count += 1;
+
+        // Commit every 1000 things. Checking *after* the insert avoids
+        // committing (and logging) an empty transaction before the first
+        // element has been written.
+        if count % 1000 == 0 {
+            tx.commit().ensure("transaction failed");
+            tx = conn
+                .unchecked_transaction()
+                .ensure("failed to start new transaction");
+            info!("transaction committed at watermark {}", count);
+        }
+    }
+
+    // Flush whatever is left over from the last, partially filled batch.
+    tx.commit().ensure("final OpenCorpora commit failed");
+
+    info!("finished OpenCorpora import");
+}
+
+/// Imports the OpenRussian data found at `args.or_input` into `conn`.
+///
+/// After creating the OpenRussian schema, the words, word forms and
+/// translations are inserted one after another, each inside its own
+/// transaction. A failure to start or commit a transaction is logged and
+/// terminates the process (see `Ensure`).
+fn open_russian(conn: &Connection, args: &Args) {
+    let parser = or_parser::OpenRussianParser::new(&args.or_input);
+
+    db_setup::initial_or_schema(conn);
+
+    // Step 1: words
+    let words_tx = conn
+        .unchecked_transaction()
+        .ensure("failed to start transaction");
+    db_setup::insert_or_words(&words_tx, parser.words());
+    words_tx.commit().ensure("OpenRussian words commit failed");
+
+    // Step 2: word forms
+    let forms_tx = conn
+        .unchecked_transaction()
+        .ensure("failed to start transaction");
+    db_setup::insert_or_word_forms(&forms_tx, parser.words_forms());
+    forms_tx
+        .commit()
+        .ensure("OpenRussian word forms commit failed");
+
+    // Step 3: translations
+    let translations_tx = conn
+        .unchecked_transaction()
+        .ensure("failed to start transaction");
+    db_setup::insert_or_translations(&translations_tx, parser.translations());
+    translations_tx
+        .commit()
+        .ensure("OpenRussian translations commit failed");
+
+    info!("finished OpenRussian import");
+}
+
+/// Entry point: sets up logging at `Info` level, parses the arguments,
+/// opens the output database and runs the OpenCorpora import followed by
+/// the OpenRussian import.
+fn main() {
+    env_logger::builder().filter_level(log::LevelFilter::Info).init();
+
+    let config = parse_args();
+
+    info!("output path: {}", config.output);
+    info!("OpenCorpora input path: {}", config.oc_input);
+    info!("OpenRussian input path: {}", config.or_input);
+
+    let db = Connection::open(&config.output).ensure("failed to open DB connection");
+
+    open_corpora(&db, &config);
+    open_russian(&db, &config);
+
+    // Still to be done after the imports:
+    //
+    // * add actual IDs to grammemes
+    // * properly reference keys internally
+    // * add foreign key constraint on lemma_grammemes.grammeme
+}
+
+/// It's like `expect`, but through `log::error`.
+///
+/// `ensure` unwraps the success value of `self`. In the failure case the
+/// implementations in this file log `msg` via `error!` and exit the
+/// process with status 1 instead of panicking.
+trait Ensure<T> {
+    /// Returns the contained value, or logs `msg` and exits the process.
+    fn ensure<S: Into<String>>(self, msg: S) -> T;
+}
+
+/// For results, the error's `Display` output is appended to the message
+/// (as `"<msg>: <err>"`) before the process exits with status 1.
+impl<T, E: Display> Ensure<T> for Result<T, E> {
+    fn ensure<S: Into<String>>(self, msg: S) -> T {
+        self.unwrap_or_else(|err| {
+            error!("{}: {}", msg.into(), err);
+            std::process::exit(1)
+        })
+    }
+}
+
+/// For options, `None` logs just the message and exits the process with
+/// status 1.
+impl<T> Ensure<T> for Option<T> {
+    fn ensure<S: Into<String>>(self, msg: S) -> T {
+        if let Some(value) = self {
+            return value;
+        }
+
+        error!("{}", msg.into());
+        std::process::exit(1)
+    }
+}
+
+/// Logs `msg` as an error and terminates the process with status 1.
+/// Never returns.
+fn bail<S: Into<String>>(msg: S) -> ! {
+    let text: String = msg.into();
+    error!("{}", text);
+    std::process::exit(1)
+}