about summary refs log tree commit diff
path: root/corp/russian/data-import/src
diff options
context:
space:
mode:
Diffstat (limited to 'corp/russian/data-import/src')
-rw-r--r--corp/russian/data-import/src/db_setup.rs298
-rw-r--r--corp/russian/data-import/src/main.rs298
-rw-r--r--corp/russian/data-import/src/mappings.rs185
-rw-r--r--corp/russian/data-import/src/oc_parser.rs470
-rw-r--r--corp/russian/data-import/src/or_parser.rs105
5 files changed, 1356 insertions, 0 deletions
diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs
new file mode 100644
index 0000000000..c9fb517386
--- /dev/null
+++ b/corp/russian/data-import/src/db_setup.rs
@@ -0,0 +1,298 @@
+//! This module prepares the database layout.
+//!
+//! The XML import may be in an arbitrary order, so importing data is
+//! a multi-step process where we first set up schemas matching the
+//! data layout, import the data, and then modify the schema to
+//! introduce things like foreign key constraints between tables that
+//! represent relations.
+
+use super::Ensure;
+use crate::oc_parser::*;
+use crate::or_parser;
+use log::{debug, info};
+use rusqlite::Connection;
+
+/// Creates the tables that mirror the structure of the OpenCorpora data.
+///
+/// All tables are created in one batch. A failure of that batch is
+/// reported through `ensure`.
+pub fn initial_oc_schema(conn: &Connection) {
+    // NOTE(review): `oc_word_grammemes.word` references `oc_words(ROWID)`.
+    // SQLite's foreign key documentation requires the parent key to be a
+    // named column rather than the rowid -- confirm this behaves as
+    // intended when foreign key enforcement is enabled.
+    let schema = r#"
+-- table for plain import of grammemes from XML
+CREATE TABLE oc_grammemes (
+    name TEXT PRIMARY KEY,
+    parent TEXT,
+    alias TEXT,
+    description TEXT
+) STRICT;
+
+-- table for plain import of lemmas (*not* their variations!)
+CREATE TABLE oc_lemmas (
+    id INTEGER PRIMARY KEY,
+    lemma TEXT NOT NULL
+) STRICT;
+
+-- table for relationship between grammemes and lemmas
+CREATE TABLE oc_lemma_grammemes (
+    lemma INTEGER,
+    grammeme TEXT NOT NULL,
+    FOREIGN KEY(lemma) REFERENCES oc_lemmas(id)
+) STRICT;
+
+-- table for all words, i.e. including variations of lemmata
+CREATE TABLE oc_words (
+    lemma INTEGER NOT NULL,
+    word TEXT NOT NULL,
+    FOREIGN KEY(lemma) REFERENCES oc_lemmas(id)
+) STRICT;
+
+-- table for relationship between words and grammemes
+CREATE TABLE oc_word_grammemes (
+    word INTEGER NOT NULL,
+    grammeme TEXT NOT NULL,
+    FOREIGN KEY(word) REFERENCES oc_words(ROWID)
+) STRICT;
+
+-- table for link types
+CREATE TABLE oc_link_types (
+  id INTEGER PRIMARY KEY,
+  name TEXT
+) STRICT;
+
+-- table for links between lemmata
+CREATE TABLE oc_links (
+  id INTEGER PRIMARY KEY,
+  link_type INTEGER NOT NULL,
+  from_lemma INTEGER NOT NULL,
+  to_lemma INTEGER NOT NULL,
+  FOREIGN KEY(link_type) REFERENCES oc_link_types(id),
+  FOREIGN KEY(from_lemma) REFERENCES oc_lemmas(id),
+  FOREIGN KEY(to_lemma) REFERENCES oc_lemmas(id)
+) STRICT;
+
+"#;
+
+    conn.execute_batch(schema)
+        .ensure("setting up OpenCorpora table schema failed");
+
+    info!("set up initial table schema for OpenCorpora import");
+}
+
+/// Stores one parsed OpenCorpora element in the table matching its kind.
+///
+/// Grammemes, link types and links are written directly by this
+/// function; lemmas are handed to `insert_lemma`. Database errors are
+/// reported through `ensure`.
+pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
+    match elem {
+        OcElement::Lemma(lemma) => insert_lemma(conn, lemma),
+
+        OcElement::Grammeme(g) => {
+            let sql =
+                "INSERT INTO oc_grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)";
+            let row = (&g.name, &g.parent, &g.alias, &g.description);
+
+            conn.execute(sql, row).ensure("failed to insert grammeme");
+            debug!("inserted grammeme {}", g.name);
+        }
+
+        OcElement::LinkType(link_type) => {
+            let sql = "INSERT INTO oc_link_types (id, name) VALUES (?1, ?2)";
+
+            conn.execute(sql, (&link_type.id, &link_type.name))
+                .ensure("failed to insert link type");
+            info!("inserted link type {}", link_type.name);
+        }
+
+        OcElement::Link(link) => {
+            let sql =
+                "INSERT INTO oc_links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, ?4)";
+            let row = (&link.id, &link.link_type, &link.from, &link.to);
+
+            // the statement goes through the connection's statement cache
+            conn.prepare_cached(sql)
+                .ensure("failed to prepare link statement")
+                .execute(row)
+                .ensure("failed to insert link");
+            debug!("inserted link {}", link.id);
+        }
+    }
+}
+
+/// Inserts a single lemma into the initial table structure.
+///
+/// Besides the `oc_lemmas` row itself this also writes:
+///
+/// * one `oc_lemma_grammemes` row per grammeme of the lemma,
+/// * one `oc_words` row per morphological variation, and
+/// * one `oc_word_grammemes` row per grammeme of each variation,
+///   keyed by the rowid of the word that was just inserted.
+///
+/// Every failure (preparing or executing a statement) is reported
+/// through `ensure`.
+fn insert_lemma(conn: &Connection, lemma: Lemma) {
+    // insert the lemma itself
+    let mut stmt = conn
+        .prepare_cached("INSERT INTO oc_lemmas (id, lemma) VALUES (?1, ?2)")
+        .ensure("failed to prepare statement");
+
+    stmt.execute((&lemma.id, &lemma.lemma.word))
+        .ensure("failed to insert lemma");
+
+    // followed by its relations to the grammemes set
+    let mut stmt = conn
+        .prepare_cached("INSERT INTO oc_lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)")
+        .ensure("failed to prepare statement");
+
+    for grammeme in lemma.grammemes {
+        stmt.execute((&lemma.id, grammeme))
+            .ensure("failed to insert grammeme<>lemma relationship");
+    }
+
+    // followed by all of its variations ...
+    let mut word_insert = conn
+        .prepare_cached("INSERT INTO oc_words (lemma, word) VALUES (?1, ?2)")
+        .ensure("failed to prepare word statement");
+
+    let mut word_grammeme = conn
+        .prepare_cached("INSERT INTO oc_word_grammemes (word, grammeme) VALUES (?1, ?2)")
+        .ensure("failed to prepare word<>grammeme statement");
+
+    for variation in lemma.variations {
+        // insert the word itself and get its rowid; `oc_words` has no
+        // explicit id column, so the rowid is the only handle on the row
+        word_insert
+            .execute((&lemma.id, &variation.word))
+            .ensure("failed to insert word");
+        let row_id = conn.last_insert_rowid();
+
+        // then insert its grammeme links
+        for grammeme in variation.grammemes {
+            word_grammeme
+                .execute((row_id, grammeme))
+                .ensure("failed to insert word<>grammeme link");
+        }
+    }
+
+    debug!("inserted lemma {}", lemma.id);
+}
+
+/// Sets up an initial schema for the OpenRussian data.
+///
+/// Creates three tables in one batch: `or_words`, `or_words_forms` and
+/// `or_translations`. The latter two reference `or_words(id)` through
+/// their `word_id` column. A failure of the batch is reported through
+/// `ensure`.
+pub fn initial_or_schema(conn: &Connection) {
+    conn.execute_batch(
+        r#"
+CREATE TABLE or_words (
+    id INTEGER PRIMARY KEY,
+    bare TEXT NOT NULL,
+    accented TEXT,
+    derived_from_word_id INTEGER,
+    rank INTEGER,
+    word_type TEXT,
+    level TEXT
+) STRICT;
+
+CREATE TABLE or_words_forms (
+    id INTEGER PRIMARY KEY,
+    word_id INTEGER NOT NULL,
+    form_type TEXT,
+    position TEXT,
+    form TEXT,
+    form_bare TEXT,
+    FOREIGN KEY(word_id) REFERENCES or_words(id)
+) STRICT;
+
+CREATE TABLE or_translations (
+    id INTEGER PRIMARY KEY,
+    word_id INTEGER NOT NULL,
+    translation TEXT,
+    example_ru TEXT,
+    example_tl TEXT,
+    info TEXT,
+    FOREIGN KEY(word_id) REFERENCES or_words(id)
+) STRICT;
+"#,
+    )
+    .ensure("setting up OpenRussian table schema failed");
+
+    info!("set up initial table schema for OpenRussian import");
+}
+
+/// Writes every word yielded by `words` into the `or_words` table and
+/// logs how many rows were inserted.
+///
+/// Statement preparation and insert failures are reported through
+/// `ensure`.
+pub fn insert_or_words<I: Iterator<Item = or_parser::Word>>(conn: &Connection, words: I) {
+    let sql = "
+INSERT INTO or_words (id, bare, accented, derived_from_word_id, rank, word_type, level)
+VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)
+";
+    let mut insert = conn
+        .prepare_cached(sql)
+        .ensure("failed to prepare OR words statement");
+
+    // the accumulator counts the inserted rows
+    let inserted = words.fold(0, |done, word| {
+        let row = (
+            word.id,
+            word.bare,
+            word.accented,
+            word.derived_from_word_id,
+            word.rank,
+            word.word_type,
+            word.level,
+        );
+        insert.execute(row).ensure("failed to insert OR word");
+        done + 1
+    });
+
+    info!("inserted {} OpenRussian words", inserted);
+}
+
+/// Writes every word form yielded by `forms` into the `or_words_forms`
+/// table and logs how many rows were inserted.
+///
+/// Statement preparation and insert failures are reported through
+/// `ensure`.
+pub fn insert_or_word_forms<I: Iterator<Item = or_parser::WordForm>>(conn: &Connection, forms: I) {
+    let sql = "
+INSERT INTO or_words_forms (id, word_id, form_type, position, form, form_bare)
+VALUES (?1, ?2, ?3, ?4, ?5, ?6)
+";
+    let mut insert = conn
+        .prepare_cached(sql)
+        .ensure("failed to prepare OR word forms statement");
+
+    // the accumulator counts the inserted rows
+    let inserted = forms.fold(0, |done, form| {
+        let row = (
+            form.id,
+            form.word_id,
+            form.form_type,
+            form.position,
+            form.form,
+            form.form_bare,
+        );
+        insert.execute(row).ensure("failed to insert OR word form");
+        done + 1
+    });
+
+    info!("inserted {} OpenRussian word forms", inserted);
+}
+
+/// Writes the English translations yielded by `translations` into the
+/// `or_translations` table and logs how many rows were inserted.
+///
+/// Entries whose `lang` is not `"en"` are skipped and not counted.
+/// Statement preparation and insert failures are reported through
+/// `ensure`.
+pub fn insert_or_translations<I: Iterator<Item = or_parser::Translation>>(
+    conn: &Connection,
+    translations: I,
+) {
+    let sql = "INSERT INTO or_translations (id, word_id, translation, example_ru, example_tl, info)
+             VALUES (?1, ?2, ?3, ?4, ?5, ?6)";
+    let mut insert = conn
+        .prepare_cached(sql)
+        .ensure("failed to prepare OR translation statement");
+
+    let inserted = translations
+        .filter(|tl| tl.lang == "en")
+        .fold(0, |done, tl| {
+            let row = (
+                tl.id,
+                tl.word_id,
+                tl.tl,
+                tl.example_ru,
+                tl.example_tl,
+                tl.info,
+            );
+            insert.execute(row).ensure("failed to insert OR translation");
+            done + 1
+        });
+
+    info!("inserted {} OpenRussian translations", inserted);
+}
diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs
new file mode 100644
index 0000000000..21da48e8d8
--- /dev/null
+++ b/corp/russian/data-import/src/main.rs
@@ -0,0 +1,298 @@
+//! This program imports Russian language data from OpenCorpora
+//! ("Открытый корпус") and OpenRussian into a SQLite database that
+//! can be used for [//corp/russian][corp-russian] projects.
+//!
+//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian
+//!
+//! Ideally, running this on intact dumps should yield a fully
+//! functional SQLite database compatible with all other tools
+//! consuming it.
+//!
+//! ## OpenCorpora format
+//!
+//! The format used is partially documented on the [OpenCorpora
+//! website][format-docs]. This seems to be a slightly outdated
+//! format, however, hence some information about what the format
+//! seems to be today.
+//!
+//! [format-docs]: http://opencorpora.org/?page=export
+//!
+//! The format is an XML file, which has several categories of data,
+//! each with their own schema:
+//!
+//! * `grammemes`: These define units of grammar. They're *likely* pretty
+//!   static, and we'll *likely* want to map them into a custom set of
+//!   (simpler) categories.
+//!
+//!   They form some kind of internal hierarchy, where some of them have a
+//!   `parent` attribute set to some other grammeme's `name`.
+//!
+//!   There's a ridiculous number of these.
+//!
+//! * `restrictions`: Unclear, not documented on the page. They describe
+//!   something about the relationship between grammemes.
+//!
+//! * `lemmata`: this lists the actual lemmas, as well as all their
+//!   included morphological variants
+//!
+//!   Each lemma has an `id` attribute uniquely identifying its dictionary
+//!   form, as well as a number of sub-elements:
+//!
+//!   * the `l` element contains the lemma itself
+//!   * the `f` elements contain morphological variations
+//!
+//!   Each of these sub-elements again contains a number of `g` elements,
+//!   which refer to the IDs of grammemes in their `v` attributes.
+//!
+//! * `<link_types>` These list possible "relationships between lemmas",
+//!   basically just assigning them IDs and names. There's only 27 of
+//!   these.
+//!
+//! * `<links>`: Using the types defined above, this establishes links
+//!   between lemmas that have some kind of relationship.
+//!
+//!   For example, a relationship `cardinal/ordinal` might be established
+//!   between the lemmas "два" and "второй".
+//!
+//! ## OpenRussian format
+//!
+//! The [OpenRussian](https://en.openrussian.org/dictionary) project
+//! lets users export its database as a set of CSV-files. For our
+//! purposes, we download the files using `<tab>` separators.
+//!
+//! Whereas OpenCorpora opts for a flat structure with a "tag" system
+//! (through its flexible grammemes), OpenRussian has a fixed pre-hoc
+//! structure into which it sorts some words with their morphologies.
+//! The OpenRussian database is much smaller as of January 2023 (~1.7
+//! million words vs. >5 million for OpenCorpora), but some of the
+//! information is much more practically useful.
+//!
+//! Two very important bits of information OpenRussian has are accent
+//! marks (most tables containing actual words have a normal form
+//! containing an accent mark, and a "bare" form without) and
+//! translations into English and German.
+//!
+//! The full dump includes the following tables (and some more):
+//!
+//! * `words`: List of lemmas in the corpus, with various bits of
+//!    metadata as well as hand-written notes.
+//!
+//! * `adjectives`: Contains IDs for words that are adjectives.
+//!
+//! * `nouns`: IDs for words that are nouns; and noun metadata (e.g.
+//!   gender, declinability)
+//!
+//! * `verbs`: IDs of words that are verbs, including their aspect and
+//!   "partnered" verb in the other aspect
+//!
+//! * `words_forms`: Contains all morphed variants of the lemmas from
+//!   `words`, including information about their grammeme, and accent
+//!   marks.
+//!
+//! * `words_rels`: Contains relations between words, containing
+//!   information like "synonyms" or general relation between words.
+//!
+//! * `translations`: Contains translations tagged by target language,
+//!   as well as examples and (occasionally) additional information.
+//!
+//! These tables also contain something, but have not been analysed
+//! yet:
+//!
+//! * `expressions_words`
+//! * `sentences`
+//! * `sentences_translations`
+//! * `sentences_words`
+
+use log::{error, info};
+use rusqlite::{Connection, Result};
+use std::env;
+use std::fmt::Display;
+use std::fs::File;
+use std::io::BufReader;
+
+mod db_setup;
+mod mappings;
+mod oc_parser;
+mod or_parser;
+
+/// Settings of the importer, filled from the environment and the
+/// command line.
+struct Args {
+    /// Path of the SQLite database that is opened for writing.
+    output: String,
+    /// Location of the OpenRussian data.
+    or_input: String,
+    /// Path of the OpenCorpora input file.
+    oc_input: String,
+}
+
+impl Args {
+    /// Returns `true` when none of the three settings is empty.
+    fn populated(&self) -> bool {
+        [&self.output, &self.or_input, &self.oc_input]
+            .iter()
+            .all(|value| !value.is_empty())
+    }
+}
+
+/// Reports the command-line synopsis for `binary_name` through `bail`,
+/// which logs it and terminates the process.
+fn usage(binary_name: &str) {
+    let synopsis = format!(
+        "usage: {} --output <output-file> --or-input <or-input> --oc-input <oc-input>",
+        binary_name
+    );
+
+    bail(synopsis);
+}
+
+/// Builds the program settings from the environment and the command
+/// line.
+///
+/// The OpenRussian and OpenCorpora inputs default to the values of the
+/// `OPENRUSSIAN_DATA` and `OPENCORPORA_DATA` environment variables. The
+/// flags `--output`, `--or-input` and `--oc-input` (each followed by a
+/// value) set or override the respective setting.
+///
+/// An unknown flag, a flag without a value, or a setting that is still
+/// empty after all arguments were read leads to the usage message and
+/// process termination via `usage`.
+fn parse_args() -> Args {
+    let mut args_iter = env::args();
+    let binary_name = args_iter.next().unwrap_or_default();
+
+    let mut args = Args {
+        output: String::new(),
+        or_input: env::var("OPENRUSSIAN_DATA").unwrap_or_default(),
+        oc_input: env::var("OPENCORPORA_DATA").unwrap_or_default(),
+    };
+
+    // Arguments are consumed in flag/value pairs. The check for missing
+    // settings runs exactly once, after the arguments are exhausted, so
+    // an incomplete command line ends in the usage message instead of a
+    // busy loop.
+    while let Some(flag) = args_iter.next() {
+        match (flag.as_str(), args_iter.next()) {
+            ("--output", Some(value)) => args.output = value,
+            ("--or-input", Some(value)) => args.or_input = value,
+            ("--oc-input", Some(value)) => args.oc_input = value,
+            // unknown flag, or a known flag that is missing its value
+            _ => usage(&binary_name),
+        }
+    }
+
+    if !args.populated() {
+        usage(&binary_name);
+    }
+
+    args
+}
+
+/// Imports the OpenCorpora file named by `args.oc_input` into `conn`.
+///
+/// The schema is created first. Elements are then inserted one by one,
+/// and the running transaction is committed and replaced whenever the
+/// number of elements handled so far is a multiple of 1000 (which
+/// includes the very first element). A final commit follows the last
+/// element.
+fn open_corpora(conn: &Connection, args: &Args) {
+    let source = File::open(&args.oc_input).ensure("failed to open input file");
+    let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(source));
+    db_setup::initial_oc_schema(conn);
+
+    let mut tx = conn
+        .unchecked_transaction()
+        .ensure("failed to start transaction");
+
+    // `seen` is the number of elements inserted before the current one
+    let elements = std::iter::from_fn(|| parser.next_element());
+    for (seen, elem) in elements.enumerate() {
+        if seen % 1000 == 0 {
+            tx.commit().ensure("transaction failed");
+            tx = conn
+                .unchecked_transaction()
+                .ensure("failed to start new transaction");
+            info!("transaction committed at watermark {}", seen);
+        }
+
+        db_setup::insert_oc_element(&tx, elem);
+    }
+
+    tx.commit().ensure("final OpenCorpora commit failed");
+
+    info!("finished OpenCorpora import");
+}
+
+/// Imports the OpenRussian data located at `args.or_input` into `conn`.
+///
+/// After the schema is created, words, word forms and translations are
+/// imported in that order, each inside its own transaction.
+fn open_russian(conn: &Connection, args: &Args) {
+    let parser = or_parser::OpenRussianParser::new(&args.or_input);
+
+    db_setup::initial_or_schema(conn);
+
+    // words
+    let words_tx = conn
+        .unchecked_transaction()
+        .ensure("failed to start transaction");
+    db_setup::insert_or_words(&words_tx, parser.words());
+    words_tx.commit().ensure("OpenRussian words commit failed");
+
+    // word forms
+    let forms_tx = conn
+        .unchecked_transaction()
+        .ensure("failed to start transaction");
+    db_setup::insert_or_word_forms(&forms_tx, parser.words_forms());
+    forms_tx
+        .commit()
+        .ensure("OpenRussian word forms commit failed");
+
+    // translations
+    let translations_tx = conn
+        .unchecked_transaction()
+        .ensure("failed to start transaction");
+    db_setup::insert_or_translations(&translations_tx, parser.translations());
+    translations_tx
+        .commit()
+        .ensure("OpenRussian translations commit failed");
+
+    info!("finished OpenRussian import");
+}
+
+/// Entry point: sets up logging at `Info` level, reads the settings,
+/// opens the output database and runs the OpenCorpora import followed
+/// by the OpenRussian import.
+fn main() {
+    let mut logger = env_logger::builder();
+    logger.filter_level(log::LevelFilter::Info);
+    logger.init();
+
+    let args = parse_args();
+
+    info!("output path: {}", args.output);
+    info!("OpenCorpora input path: {}", args.oc_input);
+    info!("OpenRussian input path: {}", args.or_input);
+
+    let db_path = &args.output;
+    let conn = Connection::open(db_path).ensure("failed to open DB connection");
+
+    open_corpora(&conn, &args);
+    open_russian(&conn, &args);
+
+    // Still to be done after the import:
+    // - add actual IDs to grammemes
+    // - properly reference keys internally
+    // - add foreign key constraint on lemma_grammemes.grammeme
+}
+
+/// It's like `expect`, but through `log::error`.
+///
+/// The implementations in this file log the message with `error!` and
+/// end the process with exit status 1 instead of panicking.
+trait Ensure<T> {
+    /// Returns the contained value, or reports `msg` as an error when
+    /// there is none.
+    fn ensure<S: Into<String>>(self, msg: S) -> T;
+}
+
+impl<T, E: Display> Ensure<T> for Result<T, E> {
+    /// Unwraps the `Ok` value. On `Err`, logs `"<msg>: <error>"` at
+    /// error level and exits the process with status 1.
+    fn ensure<S: Into<String>>(self, msg: S) -> T {
+        self.unwrap_or_else(|err| {
+            error!("{}: {}", msg.into(), err);
+            std::process::exit(1)
+        })
+    }
+}
+
+impl<T> Ensure<T> for Option<T> {
+    /// Unwraps the `Some` value. On `None`, logs `msg` at error level
+    /// and exits the process with status 1.
+    fn ensure<S: Into<String>>(self, msg: S) -> T {
+        self.unwrap_or_else(|| {
+            error!("{}", msg.into());
+            std::process::exit(1)
+        })
+    }
+}
+
+fn bail<S: Into<String>>(msg: S) -> ! {
+    error!("{}", msg.into());
+    std::process::exit(1);
+}
diff --git a/corp/russian/data-import/src/mappings.rs b/corp/russian/data-import/src/mappings.rs
new file mode 100644
index 0000000000..985088a566
--- /dev/null
+++ b/corp/russian/data-import/src/mappings.rs
@@ -0,0 +1,185 @@
+//! Manual mapping of some data structures in OC/OR corpora.
+
+/// Maps the *names* of OpenRussian word types (the `word_type` field
+/// in the `or_words` table) to the *set* of OpenCorpora grammemes
+/// commonly attached to lemmata of this type in OC.
+///
+/// Each entry is a `(word type, grammemes)` pair. Some word types just
+/// don't map over, and are omitted. Many words also have an empty word
+/// type.
+pub const WORD_TYPES_GRAMMEME_MAP: &[(&str, &[&str])] = &[
+    ("adjective", &["ADJF"]),
+    ("adverb", &["ADVB"]),
+    ("noun", &["NOUN"]),
+    ("verb", &["INFN"]), // or "VERB" ...
+];
+
+/// Maps the *names* of OpenRussian grammemes (the `form_type` fields
+/// in the `or_words_forms` table) to the *set* of OpenCorpora
+/// grammemes attached to their corresponding lemma in the `oc_lemmas`
+/// table.
+///
+/// This *only* includes grammatical information about the lemma of
+/// the word (such as whether it is a verb or other type), but *not*
+/// information about the specific instance of the word (such as its
+/// gender).
+///
+/// Correctly corresponding these requires use of all mapping tables.
+pub const FORMS_LEMMATA_GRAMMEME_MAP: &[(&str, &[&str])] = &[
+    ("ru_adj_comparative", &["COMP"]),
+    ("ru_adj_superlative", &["ADJF", "Supr"]),
+    ("ru_adj_f_acc", &["ADJF"]),
+    ("ru_adj_f_dat", &["ADJF"]),
+    ("ru_adj_f_gen", &["ADJF"]),
+    ("ru_adj_f_inst", &["ADJF"]),
+    ("ru_adj_f_nom", &["ADJF"]),
+    ("ru_adj_f_prep", &["ADJF"]),
+    ("ru_adj_m_acc", &["ADJF"]),
+    ("ru_adj_m_dat", &["ADJF"]),
+    ("ru_adj_m_gen", &["ADJF"]),
+    ("ru_adj_m_inst", &["ADJF"]),
+    ("ru_adj_m_nom", &["ADJF"]),
+    ("ru_adj_m_prep", &["ADJF"]),
+    ("ru_adj_n_acc", &["ADJF"]),
+    ("ru_adj_n_dat", &["ADJF"]),
+    ("ru_adj_n_gen", &["ADJF"]),
+    ("ru_adj_n_inst", &["ADJF"]),
+    ("ru_adj_n_nom", &["ADJF"]),
+    ("ru_adj_n_prep", &["ADJF"]),
+    ("ru_adj_pl_acc", &["ADJF"]),
+    ("ru_adj_pl_dat", &["ADJF"]),
+    ("ru_adj_pl_gen", &["ADJF"]),
+    ("ru_adj_pl_inst", &["ADJF"]),
+    ("ru_adj_pl_nom", &["ADJF"]),
+    ("ru_adj_pl_prep", &["ADJF"]),
+    ("ru_adj_short_f", &["ADJS"]),
+    ("ru_adj_short_m", &["ADJS"]),
+    ("ru_adj_short_n", &["ADJS"]),
+    ("ru_adj_short_pl", &["ADJS"]),
+    ("ru_noun_pl_acc", &["NOUN"]),
+    ("ru_noun_pl_dat", &["NOUN"]),
+    ("ru_noun_pl_gen", &["NOUN"]),
+    ("ru_noun_pl_inst", &["NOUN"]),
+    ("ru_noun_pl_nom", &["NOUN"]),
+    ("ru_noun_pl_prep", &["NOUN"]),
+    ("ru_noun_sg_acc", &["NOUN"]),
+    ("ru_noun_sg_dat", &["NOUN"]),
+    ("ru_noun_sg_gen", &["NOUN"]),
+    ("ru_noun_sg_inst", &["NOUN"]),
+    ("ru_noun_sg_nom", &["NOUN"]),
+    ("ru_noun_sg_prep", &["NOUN"]),
+    ("ru_verb_gerund_past", &["GRND"]),
+    ("ru_verb_gerund_present", &["GRND"]),
+    ("ru_verb_imperative_pl", &["VERB"]),
+    ("ru_verb_imperative_sg", &["VERB"]),
+    ("ru_verb_past_f", &["VERB"]),
+    ("ru_verb_past_m", &["VERB"]),
+    ("ru_verb_past_n", &["VERB"]),
+    ("ru_verb_past_pl", &["VERB"]),
+    ("ru_verb_presfut_pl1", &["VERB"]),
+    ("ru_verb_presfut_pl2", &["VERB"]),
+    ("ru_verb_presfut_pl3", &["VERB"]),
+    ("ru_verb_presfut_sg1", &["VERB"]),
+    ("ru_verb_presfut_sg2", &["VERB"]),
+    ("ru_verb_presfut_sg3", &["VERB"]),
+    (
+        "ru_base",
+        &[ /* nothing consistent, except often 'Fixd' */ ],
+    ),
+    ("ru_verb_participle_active_past", &["PRTF", "past", "actv"]),
+    (
+        "ru_verb_participle_active_present",
+        &["PRTF", "pres", "actv"],
+    ),
+    // OpenCorpora spells the passive voice grammeme `pssv`.
+    (
+        "ru_verb_participle_passive_past",
+        &["PRTF", "past", "pssv"],
+    ),
+    (
+        "ru_verb_participle_passive_present",
+        &["PRTF", "pres", "pssv"],
+    ),
+];
+
+/// Maps the *names* of OpenRussian grammemes (the `form_type` fields
+/// in the `or_words_forms` table) to the *set* of OpenCorpora
+/// grammemes attached to their corresponding words in the `oc_words`
+/// table.
+///
+/// This includes grammatical information about the "instance" of the
+/// word (such as its gender), but *not* the higher-level type
+/// information about its lemma.
+///
+/// Correctly corresponding these requires use of all mapping tables.
+pub const FORMS_WORDS_GRAMMEME_MAP: &[(&str, &[&str])] = &[
+    ("ru_adj_comparative", &["Cmp2"]),
+    ("ru_adj_f_acc", &["femn", "sing", "accs"]),
+    ("ru_adj_f_dat", &["femn", "sing", "datv"]),
+    ("ru_adj_f_gen", &["femn", "sing", "gent"]),
+    ("ru_adj_f_inst", &["femn", "sing", "ablt"]),
+    ("ru_adj_f_nom", &["femn", "sing", "nomn"]),
+    ("ru_adj_f_prep", &["femn", "sing", "loct"]),
+    ("ru_adj_m_acc", &["masc", "sing", "accs"]),
+    ("ru_adj_m_dat", &["masc", "sing", "datv"]),
+    ("ru_adj_m_gen", &["masc", "sing", "gent"]),
+    ("ru_adj_m_inst", &["masc", "sing", "ablt"]),
+    ("ru_adj_m_nom", &["masc", "sing", "nomn"]),
+    ("ru_adj_m_prep", &["masc", "sing", "loct"]),
+    ("ru_adj_n_acc", &["neut", "sing", "accs"]),
+    ("ru_adj_n_dat", &["neut", "sing", "datv"]),
+    ("ru_adj_n_gen", &["neut", "sing", "gent"]),
+    ("ru_adj_n_inst", &["neut", "sing", "ablt"]),
+    ("ru_adj_n_nom", &["neut", "sing", "nomn"]),
+    ("ru_adj_n_prep", &["neut", "sing", "loct"]),
+    ("ru_adj_pl_acc", &["plur", "accs"]),
+    ("ru_adj_pl_dat", &["plur", "datv"]),
+    ("ru_adj_pl_gen", &["plur", "gent"]),
+    ("ru_adj_pl_inst", &["plur", "ablt"]),
+    ("ru_adj_pl_nom", &["plur", "nomn"]),
+    ("ru_adj_pl_prep", &["plur", "loct"]),
+    ("ru_adj_short_f", &["femn", "sing"]),
+    ("ru_adj_short_m", &["masc", "sing"]),
+    ("ru_adj_short_n", &["neut", "sing"]),
+    ("ru_adj_short_pl", &["plur"]),
+    ("ru_noun_pl_acc", &["plur", "accs"]),
+    ("ru_noun_pl_dat", &["plur", "datv"]),
+    ("ru_noun_pl_gen", &["plur", "gent"]),
+    ("ru_noun_pl_inst", &["plur", "ablt"]),
+    ("ru_noun_pl_nom", &["plur", "nomn"]),
+    ("ru_noun_pl_prep", &["plur", "loct"]),
+    ("ru_noun_sg_acc", &["sing", "accs"]),
+    ("ru_noun_sg_dat", &["sing", "datv"]),
+    ("ru_noun_sg_gen", &["sing", "gent"]),
+    ("ru_noun_sg_inst", &["sing", "ablt"]),
+    ("ru_noun_sg_nom", &["sing", "nomn"]),
+    ("ru_noun_sg_prep", &["sing", "loct"]),
+    ("ru_verb_gerund_past", &["past", "V-sh"]),
+    ("ru_verb_imperative_pl", &["plur", "impr"]),
+    ("ru_verb_imperative_sg", &["sing", "impr"]),
+    ("ru_verb_past_f", &["femn", "sing", "past"]),
+    ("ru_verb_past_m", &["masc", "sing", "past"]),
+    ("ru_verb_past_n", &["neut", "sing", "past"]),
+    ("ru_verb_past_pl", &["plur", "past"]),
+    // these also contain "pres" or "futr", depending on the verb.
+    ("ru_verb_presfut_pl1", &["plur", "1per"]),
+    ("ru_verb_presfut_pl2", &["plur", "2per"]),
+    ("ru_verb_presfut_pl3", &["plur", "3per"]),
+    ("ru_verb_presfut_sg1", &["sing", "1per"]),
+    ("ru_verb_presfut_sg2", &["sing", "2per"]),
+    ("ru_verb_presfut_sg3", &["sing", "3per"]),
+    // Unclear items, probably only useful tags on lemmata
+    (
+        "ru_verb_gerund_present",
+        &["pres" /* prob. something missing? */],
+    ),
+    (
+        "ru_adj_superlative",
+        &[/* TODO: unclear, random list of grammemes?! */],
+    ),
+    ("ru_base", &[/* TODO: unclear */]),
+    // These have no useful tags in the forms table, only gender &
+    // case tagging.
+    ("ru_verb_participle_active_past", &[]),
+    ("ru_verb_participle_active_present", &[]),
+    ("ru_verb_participle_passive_past", &[]),
+    ("ru_verb_participle_passive_present", &[]),
+];
diff --git a/corp/russian/data-import/src/oc_parser.rs b/corp/russian/data-import/src/oc_parser.rs
new file mode 100644
index 0000000000..8103ebd923
--- /dev/null
+++ b/corp/russian/data-import/src/oc_parser.rs
@@ -0,0 +1,470 @@
+use super::{bail, Ensure};
+use log::{info, warn};
+use std::str::FromStr;
+use xml::attribute::OwnedAttribute;
+use xml::name::OwnedName;
+use xml::reader::XmlEvent;
+use xml::EventReader;
+
+/// A grammeme definition parsed from a single `<grammeme>` element.
+#[derive(Default, Debug)]
+pub struct Grammeme {
+    /// Value of the element's `parent` attribute; `None` if the
+    /// attribute is absent or empty.
+    pub parent: Option<String>,
+    /// Text content of the `<name>` child tag.
+    pub name: String,
+    /// Text content of the `<alias>` child tag.
+    pub alias: String,
+    /// Text content of the `<description>` child tag.
+    pub description: String,
+}
+
+/// Single form of a word (either its lemma, or the variations).
+#[derive(Debug, Default)]
+pub struct Variation {
+    /// The word itself, taken from the `t` attribute of the `<l>` or
+    /// `<f>` tag.
+    pub word: String,
+    /// Grammeme names (the `v` attributes of `<g>` tags) associated
+    /// with this particular form.
+    pub grammemes: Vec<String>,
+}
+
+/// A lemma parsed from a single `<lemma>` element, together with all
+/// of its variations.
+#[derive(Debug, Default)]
+pub struct Lemma {
+    /// Numeric value of the element's `id` attribute.
+    pub id: u64,
+    /// The lemma's own form. The parser in this file only fills in
+    /// its `word`; grammemes found outside of a variation are stored
+    /// in `grammemes` below instead.
+    pub lemma: Variation,
+    /// Grammemes encountered while not inside a variation (`<f>`).
+    pub grammemes: Vec<String>,
+    /// One entry per `<f>` tag, in document order.
+    pub variations: Vec<Variation>,
+}
+
+/// A link type parsed from a single `<type>` element.
+#[derive(Debug, Default)]
+pub struct LinkType {
+    /// Numeric value of the element's `id` attribute.
+    pub id: u64,
+    /// Text content of the `<type>` element.
+    pub name: String,
+}
+
+/// A link parsed from the attributes of a single `<link>` element.
+///
+/// Each field is read from the attribute noted next to it; attributes
+/// that are missing leave the field at its default value of `0`.
+#[derive(Debug, Default)]
+pub struct Link {
+    pub id: u64,   // link itself (`id` attribute)
+    pub from: u64, // lemma (`from` attribute)
+    pub to: u64,   // lemma (`to` attribute)
+    /// Value of the `type` attribute.
+    pub link_type: u64,
+}
+
+/// A single element yielded by `OpenCorporaParser::next_element`, one
+/// variant per kind of entry the parser actively handles.
+#[derive(Debug)]
+pub enum OcElement {
+    /// Entry from the `grammemes` section.
+    Grammeme(Grammeme),
+    /// Entry from the `lemmata` section.
+    Lemma(Lemma),
+    /// Entry from the `link_types` section.
+    LinkType(LinkType),
+    /// Entry from the `links` section.
+    Link(Link),
+}
+
+/// Tracks which part of the document the parser is currently in.
+#[derive(Debug, PartialEq)]
+enum ParserState {
+    /// Parser is not parsing any particular section and waiting for a
+    /// start tag instead.
+    Init,
+
+    /// Parser is parsing grammemes.
+    Grammemes,
+
+    /// Parser is parsing lemmata.
+    Lemmata,
+
+    /// Parser is inside a lemma's actual lemma.
+    Lemma,
+
+    /// Parser is parsing a morphological variation of a lemma.
+    Variation,
+
+    /// Parser is parsing link types.
+    LinkTypes,
+
+    /// Parser is parsing links.
+    Links,
+
+    /// Parser has seen the end of the document and nothing more is
+    /// available.
+    Ended,
+}
+
+/// Pull parser yielding `OcElement`s from an XML document read from
+/// `R`.
+pub struct OpenCorporaParser<R: std::io::Read> {
+    /// Underlying XML event reader.
+    reader: EventReader<R>,
+    /// Section (or nested element) the parser is currently inside.
+    state: ParserState,
+}
+
+/// Classification of a top-level section name, as returned by
+/// `section_state`.
+#[derive(PartialEq)]
+enum SectionState {
+    /// Actively interested in parsing this section.
+    Active,
+
+    /// Section is known, but currently ignored.
+    Inactive,
+
+    /// Section is unknown (probably a bug).
+    Unknown,
+}
+
+/// Classify a tag name as an active, inactive or unknown section.
+fn section_state(section: &str) -> SectionState {
+    // Sections for which the parser yields elements.
+    const ACTIVE: [&str; 4] = ["grammemes", "lemmata", "link_types", "links"];
+
+    if ACTIVE.contains(&section) {
+        SectionState::Active
+    } else if section == "restrictions" {
+        SectionState::Inactive
+    } else {
+        SectionState::Unknown
+    }
+}
+
+impl<R: std::io::Read> OpenCorporaParser<R> {
+    /// Create a parser reading XML from `reader`.
+    ///
+    /// The XML reader is configured to trim whitespace, and the parser
+    /// starts out in the `Init` state.
+    pub fn new(reader: R) -> Self {
+        let reader = EventReader::new_with_config(
+            reader,
+            xml::ParserConfig::new().trim_whitespace(true),
+        );
+        let state = ParserState::Init;
+
+        Self { reader, state }
+    }
+
+    /// Pull an `OcElement` out of the parser. Returns `None` if the
+    /// parser stream has ended.
+    ///
+    /// Section start/end tags only switch the parser state; the first
+    /// start tag seen *inside* an active section is handed to the
+    /// matching `parse_*` method and its result is returned.
+    /// Unexpected events are reported through `bail`.
+    pub fn next_element(&mut self) -> Option<OcElement> {
+        if self.state == ParserState::Ended {
+            return None;
+        }
+
+        // Pull the next element to determine what context to enter
+        // next. Note that the order of the arms below matters: the
+        // guarded section arms must come before the generic
+        // `StartElement` / `EndElement` arms.
+        loop {
+            match &self.next() {
+                // no-op events that do not affect parser state
+                XmlEvent::Comment(_)
+                | XmlEvent::Whitespace(_)
+                | XmlEvent::ProcessingInstruction { .. }
+                | XmlEvent::StartDocument { .. } => continue,
+                // the root <dictionary> tag itself carries no data
+                XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name }
+                    if name.local_name == "dictionary" =>
+                {
+                    continue
+                }
+
+                // end of the file, nothing more to return
+                XmlEvent::EndDocument => {
+                    self.state = ParserState::Ended;
+                    return None;
+                }
+
+                // some sections are skipped
+                //
+                // NOTE(review): this arm also matches the *end* tag of
+                // an inactive section, in which case `skip_section`
+                // would go looking for another end tag. Presumably
+                // unreachable, since `skip_section` consumes the end
+                // tag belonging to the start tag - confirm.
+                XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name }
+                    if section_state(&name.local_name) == SectionState::Inactive =>
+                {
+                    info!("skipping {} section", name.local_name);
+                    self.skip_section(&name.local_name);
+                }
+
+                // active section events start specific parser states ...
+                XmlEvent::StartElement { name, .. }
+                    if section_state(&name.local_name) == SectionState::Active =>
+                {
+                    self.state = match name.local_name.as_str() {
+                        "grammemes" => ParserState::Grammemes,
+                        "lemmata" => ParserState::Lemmata,
+                        "link_types" => ParserState::LinkTypes,
+                        "links" => ParserState::Links,
+                        // the guard above only lets the four names
+                        // listed in `section_state` through
+                        _ => unreachable!(),
+                    };
+                }
+
+                // ... or end them
+                XmlEvent::EndElement { name, .. }
+                    if section_state(&name.local_name) == SectionState::Active =>
+                {
+                    // TODO: assert that the right section ended
+                    self.state = ParserState::Init;
+                }
+
+                // actual beginning of an actual element, dispatch accordingly
+                event @ XmlEvent::StartElement {
+                    name, attributes, ..
+                } => match &self.state {
+                    ParserState::Grammemes => {
+                        return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes)))
+                    }
+
+                    ParserState::Lemmata => {
+                        return Some(OcElement::Lemma(self.parse_lemma(name, attributes)))
+                    }
+
+                    ParserState::LinkTypes => {
+                        return Some(OcElement::LinkType(self.parse_link_type(name, attributes)))
+                    }
+
+                    ParserState::Links if name.local_name == "link" => {
+                        return Some(OcElement::Link(self.parse_link(attributes)))
+                    }
+
+                    // a start tag outside of any section
+                    ParserState::Init | ParserState::Ended => bail(format!(
+                        "parser received an unexpected start element while in state {:?}: {:?}",
+                        self.state, event
+                    )),
+
+                    // `Lemma` / `Variation` (which are only set while
+                    // `parse_lemma` is running), and also `Links` when
+                    // the element is not named `link`
+                    other => bail(format!(
+                        "next_element() called while parser was in state {:?}",
+                        other
+                    )),
+                },
+
+                // finally, events that indicate a bug if they're
+                // encountered here
+                event @ XmlEvent::EndElement { .. }
+                | event @ XmlEvent::CData(_)
+                | event @ XmlEvent::Characters(_) => {
+                    bail(format!("unexpected XML event: {:?}", event))
+                }
+            }
+        }
+    }
+
+    /// Skip a section by advancing the parser state until we see an
+    /// end element for the skipped section.
+    ///
+    /// All events up to and including the end tag named `section` are
+    /// discarded. If the document ends before such an end tag is
+    /// seen, this is reported through `bail` instead of looping
+    /// forever.
+    fn skip_section(&mut self, section: &str) {
+        loop {
+            match self.next() {
+                XmlEvent::EndElement { name } if name.local_name == section => return,
+
+                // Once the end of the document is reached the reader
+                // keeps returning `EndDocument` on every call, so
+                // without this arm a missing end tag would make this
+                // loop spin forever.
+                XmlEvent::EndDocument => bail(format!(
+                    "document ended while skipping <{}>",
+                    section
+                )),
+
+                _ => continue,
+            }
+        }
+    }
+
+    /// Pull the next raw event from the underlying XML reader.
+    ///
+    /// The reader's result is unwrapped through `ensure` with the
+    /// message "XML parsing failed".
+    fn next(&mut self) -> XmlEvent {
+        let event = self.reader.next();
+        event.ensure("XML parsing failed")
+    }
+
+    /// Parse a tag that should have plain string content.
+    ///
+    /// Must be called right after the start tag of `tag_name` has been
+    /// consumed. Reads events up to and including the matching end tag
+    /// and returns the text found in between (empty if there was
+    /// none). Anything other than text, comments or whitespace is
+    /// reported through `bail`.
+    fn parse_string(&mut self, tag_name: &str) -> String {
+        let mut out = String::new();
+
+        loop {
+            match self.next() {
+                // ignore irrelevant things
+                XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue,
+
+                // Accumulate the content. The text of one element may
+                // arrive as several `Characters` events (e.g. when it
+                // is interrupted by a comment), so append instead of
+                // overwriting to avoid dropping the earlier chunks.
+                XmlEvent::Characters(content) => {
+                    out.push_str(&content);
+                }
+
+                // expect the end of the element
+                XmlEvent::EndElement { name } if name.local_name == tag_name => return out,
+
+                // fail on everything unexpected
+                event => bail(format!(
+                    "unexpected element while parsing <{}>: {:?}",
+                    tag_name, event
+                )),
+            }
+        }
+    }
+
+    /// Parse a single `<grammeme>` tag.
+    ///
+    /// `name` and `attributes` belong to the already consumed start
+    /// tag. Reads events up to and including the closing
+    /// `</grammeme>`, filling in the `<name>`, `<alias>` and
+    /// `<description>` children along the way.
+    fn parse_grammeme(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Grammeme {
+        if name.local_name != "grammeme" {
+            bail(format!(
+                "expected to parse a grammeme, but found <{}>",
+                name.local_name
+            ));
+        }
+
+        // Only a non-empty `parent` attribute counts; searching from
+        // the back keeps the "last one wins" behaviour of a plain loop.
+        let parent = attributes
+            .iter()
+            .rev()
+            .find(|attr| attr.name.local_name == "parent" && !attr.value.is_empty())
+            .map(|attr| attr.value.clone());
+
+        let mut grammeme = Grammeme {
+            parent,
+            ..Grammeme::default()
+        };
+
+        loop {
+            let event = self.next();
+
+            match &event {
+                // nothing of interest
+                XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => {}
+
+                // one of the known child tags, or something unexpected
+                XmlEvent::StartElement { name: child, .. } => match child.local_name.as_str() {
+                    "name" => grammeme.name = self.parse_string("name"),
+                    "alias" => grammeme.alias = self.parse_string("alias"),
+                    "description" => grammeme.description = self.parse_string("description"),
+                    _ => bail(format!(
+                        "unexpected element while parsing <grammeme>: {:?}",
+                        event
+                    )),
+                },
+
+                // the grammeme is complete
+                XmlEvent::EndElement { name: closed } if closed.local_name == "grammeme" => {
+                    return grammeme;
+                }
+
+                // anything else should not show up here
+                _ => bail(format!(
+                    "unexpected element while parsing <grammeme>: {:?}",
+                    event
+                )),
+            }
+        }
+    }
+
+    /// Parse a single `<lemma>` tag, including its `<l>` form, its
+    /// `<f>` variations and all `<g>` grammeme associations.
+    ///
+    /// `name` and `attributes` belong to the already consumed start
+    /// tag. While running, the parser state is `Lemma`, or
+    /// `Variation` while inside an `<f>` tag; it is reset to
+    /// `Lemmata` once the closing `</lemma>` has been consumed.
+    fn parse_lemma(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Lemma {
+        if name.local_name != "lemma" {
+            bail(format!(
+                "expected to parse a lemma, but found <{}>",
+                name.local_name
+            ));
+        }
+
+        self.state = ParserState::Lemma;
+        let mut lemma = Lemma::default();
+
+        for attr in attributes {
+            if attr.name.local_name == "id" {
+                lemma.id = u64::from_str(&attr.value).ensure("failed to parse lemma ID");
+            }
+        }
+
+        loop {
+            match self.next() {
+                // <lemma> has ended
+                XmlEvent::EndElement { name } if name.local_name == "lemma" => {
+                    self.state = ParserState::Lemmata;
+                    return lemma;
+                }
+
+                // actual lemma content
+                XmlEvent::StartElement {
+                    name, attributes, ..
+                } => {
+                    match name.local_name.as_str() {
+                        // beginning to parse the lemma itself; the
+                        // word is stored in the `t` attribute
+                        "l" => {
+                            lemma.lemma.word = attributes
+                                .into_iter()
+                                .find(|attr| attr.name.local_name == "t")
+                                .map(|attr| attr.value)
+                                .ensure(format!("lemma {} had no actual word", lemma.id));
+                        }
+
+                        // parsing a lemma variation
+                        "f" => {
+                            self.state = ParserState::Variation;
+
+                            let word = attributes
+                                .into_iter()
+                                .find(|attr| attr.name.local_name == "t")
+                                .map(|attr| attr.value)
+                                .ensure(format!(
+                                    "variation of lemma {} had no actual word",
+                                    lemma.id
+                                ));
+
+                            // grammemes are added as the nested <g>
+                            // tags are encountered
+                            lemma.variations.push(Variation {
+                                word,
+                                grammemes: vec![],
+                            });
+                        }
+
+                        // parse a grammeme association
+                        "g" => {
+                            let grammeme = attributes
+                                .into_iter()
+                                .find(|attr| attr.name.local_name == "v")
+                                .map(|attr| attr.value)
+                                .ensure(format!(
+                                    "grammeme association in lemma {} missing ID",
+                                    lemma.id
+                                ));
+
+                            // the current state decides whom the
+                            // grammeme belongs to
+                            match self.state {
+                                // outside of any <f>: attach it to the
+                                // lemma as a whole
+                                ParserState::Lemma => {
+                                    lemma.grammemes.push(grammeme);
+                                }
+
+                                // inside an <f>: attach it to the most
+                                // recently started variation
+                                ParserState::Variation => {
+                                    lemma
+                                        .variations
+                                        .last_mut()
+                                        .ensure("variations should be non-empty")
+                                        .grammemes
+                                        .push(grammeme);
+                                }
+
+                                _ => bail(format!("invalid parser state: encountered grammeme association while in {:?}", self.state)),
+                            }
+                        }
+
+                        other => bail(format!("unexpected element while parsing lemma: {other}")),
+                    };
+                }
+
+                // end tags of nested elements; </lemma> itself is
+                // handled by the first arm
+                XmlEvent::EndElement { name } => match name.local_name.as_str() {
+                    "l" if self.state == ParserState::Lemma => continue,
+                    "f" if self.state == ParserState::Variation => {
+                        // the variation is complete, back to the lemma
+                        self.state = ParserState::Lemma;
+                        continue;
+                    }
+                    "g" => continue,
+                    other => bail(format!(
+                        "unexpected </{other}> while parsing lemma {}",
+                        lemma.id
+                    )),
+                },
+
+                // every other kind of event is ignored
+                _ => continue,
+            }
+        }
+    }
+
+    /// Parse a single `<type>` tag of the link types section.
+    ///
+    /// `name` and `attributes` belong to the already consumed start
+    /// tag; the ID comes from the `id` attribute and the name from
+    /// the tag's text content.
+    fn parse_link_type(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> LinkType {
+        if name.local_name != "type" {
+            bail(format!(
+                "expected to parse a link type, but found <{}>",
+                name.local_name
+            ));
+        }
+
+        // Every `id` attribute is parsed and the last one wins; without
+        // any the ID stays at its default.
+        let id = attributes
+            .iter()
+            .filter(|attr| attr.name.local_name == "id")
+            .map(|attr| u64::from_str(&attr.value).ensure("failed to parse link type ID"))
+            .last()
+            .unwrap_or_default();
+
+        let type_name = self.parse_string("type");
+
+        LinkType {
+            id,
+            name: type_name,
+        }
+    }
+
+    /// Parse a single `<link>` tag from the attributes of its
+    /// already consumed start tag.
+    ///
+    /// Unknown attributes are logged as a warning and otherwise
+    /// ignored.
+    fn parse_link(&mut self, attributes: &[OwnedAttribute]) -> Link {
+        let mut link = Link::default();
+
+        for attr in attributes {
+            // Select the field this attribute belongs to ...
+            let field = match attr.name.local_name.as_str() {
+                "id" => &mut link.id,
+                "from" => &mut link.from,
+                "to" => &mut link.to,
+                "type" => &mut link.link_type,
+
+                other => {
+                    warn!("unexpected attribute {} on <link>", other);
+                    continue;
+                }
+            };
+
+            // ... and only then parse the value, so that the values of
+            // unknown attributes are never looked at.
+            *field = u64::from_str(&attr.value).ensure("failed to parse link field");
+        }
+
+        // <link> elements are empty, so the end tag should follow
+        // immediately; consume it.
+        self.skip_section("link");
+
+        link
+    }
+}
diff --git a/corp/russian/data-import/src/or_parser.rs b/corp/russian/data-import/src/or_parser.rs
new file mode 100644
index 0000000000..8bfc61dbef
--- /dev/null
+++ b/corp/russian/data-import/src/or_parser.rs
@@ -0,0 +1,105 @@
+//! Parser for the OpenRussian data format.
+//!
+//! Note that when exporting OpenRussian data from the project you
+//! have to choose an encoding. We choose tab-separated CSV files, as
+//! tabs have a very low probability of actually appearing in the
+//! input data and this skips some potential encoding issues.
+
+use super::Ensure;
+use serde::Deserialize;
+use std::fs::File;
+use std::io::BufReader;
+use std::path::PathBuf;
+
+/// A word from the `words` table.
+///
+/// Deserialized from the rows of `words.csv`. Fields whose meaning is
+/// not yet known are kept as plain strings.
+#[derive(Debug, Deserialize)]
+pub struct Word {
+    pub id: usize,
+    pub position: String, // TODO: unknown
+    pub bare: String,     // TODO: unknown
+    pub accented: String, // TODO: unknown
+    pub derived_from_word_id: Option<usize>,
+    pub rank: Option<usize>,
+    pub disabled: String,     // TODO: unknown
+    pub audio: String,        // TODO: unknown
+    pub usage_en: String,     // TODO: unknown
+    pub usage_de: String,     // TODO: unknown
+    pub number_value: String, // TODO: unknown
+
+    // `type` is a Rust keyword, hence the rename.
+    #[serde(rename = "type")]
+    pub word_type: String, // TODO: unknown
+
+    pub level: String,      // TODO: unknown
+    pub created_at: String, // TODO: unknown
+}
+
+/// A word form from the `words_forms` table.
+///
+/// Deserialized from the rows of `words_forms.csv`.
+#[derive(Debug, Deserialize)]
+pub struct WordForm {
+    pub id: usize,
+    // presumably refers to `Word::id` - TODO confirm
+    pub word_id: usize,
+    pub form_type: String,
+    pub position: String,
+    pub form: String,
+    pub form_bare: String,
+}
+
+/// A translation from the `translations` table.
+///
+/// Deserialized from the rows of `translations.csv`.
+#[derive(Debug, Deserialize)]
+pub struct Translation {
+    pub id: usize,
+    pub lang: String,
+    // presumably refers to `Word::id` - TODO confirm
+    pub word_id: usize,
+    pub position: String,
+    pub tl: String, // unknown
+    pub example_ru: String,
+    pub example_tl: String,
+    pub info: String,
+}
+
+/// Entry point for reading the exported OpenRussian tables.
+pub struct OpenRussianParser {
+    /// Directory containing the exported `*.csv` files.
+    or_directory: PathBuf,
+}
+
+/// Boxed iterator over items of type `T`, as returned by the table
+/// accessors of `OpenRussianParser`.
+pub type DynIter<T> = Box<dyn Iterator<Item = T>>;
+
+impl OpenRussianParser {
+    /// Create a parser which reads the exported tables from the
+    /// directory at `path`. No files are opened until one of the
+    /// table accessors is called.
+    pub fn new<P: Into<PathBuf>>(path: P) -> Self {
+        OpenRussianParser {
+            or_directory: path.into(),
+        }
+    }
+
+    /// Iterate over the rows of `words.csv`.
+    pub fn words(&self) -> DynIter<Word> {
+        self.parser_for("words.csv")
+    }
+
+    /// Iterate over the rows of `words_forms.csv`.
+    pub fn words_forms(&self) -> DynIter<WordForm> {
+        self.parser_for("words_forms.csv")
+    }
+
+    /// Iterate over the rows of `translations.csv`.
+    pub fn translations(&self) -> DynIter<Translation> {
+        self.parser_for("translations.csv")
+    }
+
+    /// Open `file_name` inside the OpenRussian directory as a
+    /// tab-separated CSV file and return an iterator deserializing
+    /// each row into a `T`.
+    ///
+    /// Failure to open the file, or to deserialize a row, is handled
+    /// through `ensure`.
+    fn parser_for<T: serde::de::DeserializeOwned + 'static>(
+        &self,
+        file_name: &str,
+    ) -> Box<dyn Iterator<Item = T>> {
+        let path = self.or_directory.join(file_name);
+
+        // Name the file that actually failed to open; this helper is
+        // shared by all tables, not just `words.csv`.
+        let file = File::open(&path).ensure(format!("failed to open {}", path.display()));
+
+        let reader = csv::ReaderBuilder::new()
+            .delimiter(b'\t')
+            .from_reader(BufReader::new(file));
+
+        Box::new(reader.into_deserialize().map(|result| {
+            result.ensure(format!(
+                "failed to deserialize {}",
+                std::any::type_name::<T>()
+            ))
+        }))
+    }
+}