diff options
Diffstat (limited to 'corp/russian/data-import/src')
-rw-r--r-- | corp/russian/data-import/src/db_setup.rs | 298 | ||||
-rw-r--r-- | corp/russian/data-import/src/main.rs | 298 | ||||
-rw-r--r-- | corp/russian/data-import/src/mappings.rs | 185 | ||||
-rw-r--r-- | corp/russian/data-import/src/oc_parser.rs | 470 | ||||
-rw-r--r-- | corp/russian/data-import/src/or_parser.rs | 105 |
5 files changed, 1356 insertions, 0 deletions
diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs new file mode 100644 index 0000000000..c9fb517386 --- /dev/null +++ b/corp/russian/data-import/src/db_setup.rs @@ -0,0 +1,298 @@ +//! This module prepares the database layout. +//! +//! The XML import may be in an arbitrary order, so importing data is +//! a multi-step process where we first set up schemas matching the +//! data layout, import the data, and then modify the schema to +//! introduce things like foreign key constraints between tables that +//! represent relations. + +use super::Ensure; +use crate::oc_parser::*; +use crate::or_parser; +use log::{debug, info}; +use rusqlite::Connection; + +/// Sets up an initial schema which matches the OpenCorpora data. +pub fn initial_oc_schema(conn: &Connection) { + conn.execute_batch( + r#" +-- table for plain import of grammemes from XML +CREATE TABLE oc_grammemes ( + name TEXT PRIMARY KEY, + parent TEXT, + alias TEXT, + description TEXT +) STRICT; + +-- table for plain import of lemmas (*not* their variations!) +CREATE TABLE oc_lemmas ( + id INTEGER PRIMARY KEY, + lemma TEXT NOT NULL +) STRICT; + +-- table for relationship between grammemes and lemmas +CREATE TABLE oc_lemma_grammemes ( + lemma INTEGER, + grammeme TEXT NOT NULL, + FOREIGN KEY(lemma) REFERENCES oc_lemmas(id) +) STRICT; + +-- table for all words, i.e. 
including variations of lemmata +CREATE TABLE oc_words ( + lemma INTEGER NOT NULL, + word TEXT NOT NULL, + FOREIGN KEY(lemma) REFERENCES oc_lemmas(id) +) STRICT; + +-- table for relationship between words and grammemes +CREATE TABLE oc_word_grammemes ( + word INTEGER NOT NULL, + grammeme TEXT NOT NULL, + FOREIGN KEY(word) REFERENCES oc_words(ROWID) +) STRICT; + +-- table for link types +CREATE TABLE oc_link_types ( + id INTEGER PRIMARY KEY, + name TEXT +) STRICT; + +-- table for links between lemmata +CREATE TABLE oc_links ( + id INTEGER PRIMARY KEY, + link_type INTEGER NOT NULL, + from_lemma INTEGER NOT NULL, + to_lemma INTEGER NOT NULL, + FOREIGN KEY(link_type) REFERENCES oc_link_types(id), + FOREIGN KEY(from_lemma) REFERENCES oc_lemmas(id), + FOREIGN KEY(to_lemma) REFERENCES oc_lemmas(id) +) STRICT; + +"#, + ) + .ensure("setting up OpenCorpora table schema failed"); + + info!("set up initial table schema for OpenCorpora import"); +} + +/// Inserts a single OpenCorpora element into the initial table structure. 
+pub fn insert_oc_element(conn: &Connection, elem: OcElement) { + match elem { + OcElement::Grammeme(grammeme) => { + conn.execute( + "INSERT INTO oc_grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)", + ( + &grammeme.name, + &grammeme.parent, + &grammeme.alias, + &grammeme.description, + ), + ) + .ensure("failed to insert grammeme"); + + debug!("inserted grammeme {}", grammeme.name); + } + + OcElement::Lemma(lemma) => insert_lemma(conn, lemma), + + OcElement::LinkType(lt) => { + conn.execute( + "INSERT INTO oc_link_types (id, name) VALUES (?1, ?2)", + (<.id, <.name), + ) + .ensure("failed to insert link type"); + + info!("inserted link type {}", lt.name); + } + + OcElement::Link(link) => { + let mut stmt = conn + .prepare_cached( + "INSERT INTO oc_links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, ?4)", + ) + .ensure("failed to prepare link statement"); + + stmt.execute((&link.id, &link.link_type, &link.from, &link.to)) + .ensure("failed to insert link"); + + debug!("inserted link {}", link.id); + } + } +} + +/// Insert a single lemma into the initial structure. This is somewhat +/// involved because it also establishes a bunch of relations. +fn insert_lemma(conn: &Connection, lemma: Lemma) { + // insert the lemma itself + let mut stmt = conn + .prepare_cached("INSERT INTO oc_lemmas (id, lemma) VALUES (?1, ?2)") + .ensure("failed to prepare statement"); + + stmt.execute((&lemma.id, &lemma.lemma.word)) + .ensure("failed to insert grammeme"); + + // followed by its relations to the grammemes set + let mut stmt = conn + .prepare_cached("INSERT INTO oc_lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)") + .ensure("failed to prepare statement"); + + for grammeme in lemma.grammemes { + stmt.execute((&lemma.id, grammeme)) + .ensure("failed to insert grammeme<>lemma relationship"); + } + + // followed by all of its variations ... 
+ let mut word_insert = conn + .prepare_cached("INSERT INTO oc_words (lemma, word) VALUES (?1, ?2)") + .unwrap(); + + let mut word_grammeme = conn + .prepare_cached("INSERT INTO oc_word_grammemes (word, grammeme) VALUES (?1, ?2)") + .unwrap(); + + for variation in lemma.variations { + // insert the word itself and get its rowid + word_insert + .execute((&lemma.id, &variation.word)) + .ensure("failed to insert word"); + let row_id = conn.last_insert_rowid(); + + // then insert its grammeme links + for grammeme in variation.grammemes { + word_grammeme + .execute((row_id, grammeme)) + .ensure("failed to insert word<>grammeme link"); + } + } + + debug!("inserted lemma {}", lemma.id); +} + +/// Sets up an initial schema for the OpenRussian data. +pub fn initial_or_schema(conn: &Connection) { + conn.execute_batch( + r#" +CREATE TABLE or_words ( + id INTEGER PRIMARY KEY, + bare TEXT NOT NULL, + accented TEXT, + derived_from_word_id INTEGER, + rank INTEGER, + word_type TEXT, + level TEXT +) STRICT; + +CREATE TABLE or_words_forms ( + id INTEGER PRIMARY KEY, + word_id INTEGER NOT NULL, + form_type TEXT, + position TEXT, + form TEXT, + form_bare TEXT, + FOREIGN KEY(word_id) REFERENCES words(id) +) STRICT; + +CREATE TABLE or_translations ( + id INTEGER PRIMARY KEY, + word_id INTEGER NOT NULL, + translation TEXT, + example_ru TEXT, + example_tl TEXT, + info TEXT, + FOREIGN KEY(word_id) REFERENCES words(id) +) STRICT; +"#, + ) + .ensure("setting up OpenRussian table schema failed"); + + info!("set up initial table schema for OpenRussian import"); +} + +pub fn insert_or_words<I: Iterator<Item = or_parser::Word>>(conn: &Connection, words: I) { + let mut stmt = conn + .prepare_cached( + " +INSERT INTO or_words (id, bare, accented, derived_from_word_id, rank, word_type, level) +VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7) +", + ) + .ensure("failed to prepare OR words statement"); + let mut count = 0; + + for word in words { + stmt.execute(( + word.id, + word.bare, + word.accented, + 
word.derived_from_word_id, + word.rank, + word.word_type, + word.level, + )) + .ensure("failed to insert OR word"); + count += 1; + } + + info!("inserted {} OpenRussian words", count); +} + +pub fn insert_or_word_forms<I: Iterator<Item = or_parser::WordForm>>(conn: &Connection, forms: I) { + let mut stmt = conn + .prepare_cached( + " +INSERT INTO or_words_forms (id, word_id, form_type, position, form, form_bare) +VALUES (?1, ?2, ?3, ?4, ?5, ?6) +", + ) + .ensure("failed to prepare OR word forms statement"); + let mut count = 0; + + for form in forms { + stmt.execute(( + form.id, + form.word_id, + form.form_type, + form.position, + form.form, + form.form_bare, + )) + .ensure("failed to insert OR word form"); + count += 1; + } + + info!("inserted {} OpenRussian word forms", count); +} + +pub fn insert_or_translations<I: Iterator<Item = or_parser::Translation>>( + conn: &Connection, + translations: I, +) { + let mut stmt = conn + .prepare_cached( + "INSERT INTO or_translations (id, word_id, translation, example_ru, example_tl, info) + VALUES (?1, ?2, ?3, ?4, ?5, ?6)", + ) + .ensure("failed to prepare OR translation statement"); + + let mut count = 0; + + for tl in translations { + if tl.lang != "en" { + continue; + } + + stmt.execute(( + tl.id, + tl.word_id, + tl.tl, + tl.example_ru, + tl.example_tl, + tl.info, + )) + .ensure("failed to insert OR translation"); + + count += 1; + } + + info!("inserted {} OpenRussian translations", count); +} diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs new file mode 100644 index 0000000000..21da48e8d8 --- /dev/null +++ b/corp/russian/data-import/src/main.rs @@ -0,0 +1,298 @@ +//! This program imports Russian language data from OpenCorpora +//! ("Открытый корпус") and OpenRussian into a SQLite database that +//! can be used for [//corp/russian][corp-russian] projects. +//! +//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian +//! +//! 
Ideally, running this on intact dumps should yield a fully +//! functional SQLite database compatible with all other tools +//! consuming it. +//! +//! ## OpenCorpora format +//! +//! The format used is partially documented on the [OpenCorpora +//! website][format-docs]. This seems to be a slightly outdated +//! format, however, hence some information about what the format +//! seems to be today. +//! +//! [format-docs]: http://opencorpora.org/?page=export +//! +//! The format is an XML file, which has several categories of data, +//! each with their own schema: +//! +//! * `grammemes`: These define units of grammar. They're *likely* pretty +//! static, and we'll *likely* want to map them into a custom set of +//! (simpler) categories. +//! +//! They form some kind of internal hierarchy, where some of them have a +//! `parent` attribute set to some other grammeme's `name`. +//! +//! There's a ridiculous number of these. +//! +//! * `restrictions`: Unclear, not documented on the page. They describe +//! something about the relationship between grammemes. +//! +//! * `lemmata`: this lists the actual lemmas, as well as all their +//! included morphological variants +//! +//! Each lemma has an `id` attribute uniquely identifying its dictionary +//! form, as well as a number of sub-elements: +//! +//! * the `l` element contains the lemma itself +//! * the `f` elements contain morphological variations +//! +//! Each of these sub elements again contains a number of `g` elements, +//! which refer to the IDs of grammemes in their `v` attributes. +//! +//! * `<link_types>`: These list possible "relationships between lemmas", +//! basically just assigning them IDs and names. There's only 27 of +//! these. +//! +//! * `<links>`: Using the types defined above, this establishes links +//! between lemmas that have some kind of relationship. +//! +//! For example, a relationship `cardinal/ordinal` might be established +//! between the lemmas "два" and "второй". +//! +//! 
## OpenRussian format +//! +//! The [OpenRussian](https://en.openrussian.org/dictionary) project +//! lets users export its database as a set of CSV-files. For our +//! purposes, we download the files using `<tab>` separators. +//! +//! Whereas OpenCorpora opts for a flat structure with a "tag" system +//! (through its flexible grammemes), OpenRussian has a fixed pre-hoc +//! structure into which it sorts some words with their morphologies. +//! The OpenRussian database is much smaller as of January 2023 (~1.7 +//! million words vs. >5 million for OpenCorpora), but some of the +//! information is much more practically useful. +//! +//! Two very important bits of information OpenRussian has are accent +//! marks (most tables containing actual words have a normal form +//! containing an accent mark, and a "bare" form without) and +//! translations into English and German. +//! +//! The full dump includes the following tables (and some more): +//! +//! * `words`: List of lemmas in the corpus, with various bits of +//! metadata as well as hand-written notes. +//! +//! * `adjectives`: Contains IDs for words that are adjectives. +//! +//! * `nouns`: IDs for words that are nouns; and noun metadata (e.g. +//! gender, declinability) +//! +//! * `verbs`: IDs of words that are verbs, including their aspect and +//! "partnered" verb in the other aspect +//! +//! * `words_forms`: Contains all morphed variants of the lemmas from +//! `words`, including information about their grammeme, and accent +//! marks. +//! +//! * `words_rels`: Contains relations between words, containing +//! information like "synonyms" or general relation between words. +//! +//! * `translations`: Contains translations tagged by target language, +//! as well as examples and (occasionally) additional information. +//! +//! These tables also contain something, but have not been analysed +//! yet: +//! +//! * `expressions_words` +//! * `sentences` +//! * `sentences_translations` +//! 
* `sentences_words` + +use log::{error, info}; +use rusqlite::{Connection, Result}; +use std::env; +use std::fmt::Display; +use std::fs::File; +use std::io::BufReader; + +mod db_setup; +mod mappings; +mod oc_parser; +mod or_parser; + +struct Args { + output: String, + or_input: String, + oc_input: String, +} + +impl Args { + fn populated(&self) -> bool { + !(self.output.is_empty() || self.or_input.is_empty() || self.oc_input.is_empty()) + } +} + +fn usage(binary_name: &str) { + bail(format!( + "usage: {} --output <output-file> --or-input <or-input> --oc-input <oc-input>", + binary_name + )); +} + +fn parse_args() -> Args { + let mut args_iter = env::args(); + let binary_name = args_iter.next().unwrap(); + + let mut args = Args { + output: "".into(), + or_input: env::var("OPENRUSSIAN_DATA").unwrap_or_default(), + oc_input: env::var("OPENCORPORA_DATA").unwrap_or_default(), + }; + + loop { + if args.populated() { + break; + } + + while let Some(arg) = args_iter.next() { + match arg.as_str() { + "--output" => { + args.output = args_iter.next().unwrap(); + } + + "--or-input" => { + args.or_input = args_iter.next().unwrap(); + } + + "--oc-input" => { + args.oc_input = args_iter.next().unwrap(); + } + + _ => usage(&binary_name), + } + } + } + + if args.output.is_empty() || args.or_input.is_empty() || args.oc_input.is_empty() { + usage(&binary_name); + } + + args +} + +fn open_corpora(conn: &Connection, args: &Args) { + let input_file = File::open(&args.oc_input).ensure("failed to open input file"); + let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file)); + db_setup::initial_oc_schema(&conn); + + let mut tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + + let mut count = 0; + + while let Some(elem) = parser.next_element() { + // commit every 1000 things + if count % 1000 == 0 { + tx.commit().ensure("transaction failed"); + tx = conn + .unchecked_transaction() + .ensure("failed to start new transaction"); + 
info!("transaction committed at watermark {}", count); + } + + db_setup::insert_oc_element(&tx, elem); + + count += 1; + } + + tx.commit().ensure("final OpenCorpora commit failed"); + + info!("finished OpenCorpora import"); +} + +fn open_russian(conn: &Connection, args: &Args) { + let parser = or_parser::OpenRussianParser::new(&args.or_input); + + db_setup::initial_or_schema(conn); + + { + let tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + + db_setup::insert_or_words(&tx, parser.words()); + tx.commit().ensure("OpenRussian words commit failed"); + } + + { + let tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + + db_setup::insert_or_word_forms(&tx, parser.words_forms()); + tx.commit().ensure("OpenRussian word forms commit failed"); + } + + { + let tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + + db_setup::insert_or_translations(&tx, parser.translations()); + tx.commit().ensure("OpenRussian translations commit failed"); + } + + info!("finished OpenRussian import"); +} + +fn main() { + env_logger::builder() + .filter_level(log::LevelFilter::Info) + .init(); + + let args = parse_args(); + + info!("output path: {}", args.output); + info!("OpenCorpora input path: {}", args.oc_input); + info!("OpenRussian input path: {}", args.or_input); + + let conn = Connection::open(&args.output).ensure("failed to open DB connection"); + + open_corpora(&conn, &args); + open_russian(&conn, &args); + + // afterwards: + // add actual IDs to grammemes + // properly reference keys internally + // add foreign key constraint on lemma_grammemes.grammeme +} + +/// It's like `expect`, but through `log::error`. 
+trait Ensure<T> { + fn ensure<S: Into<String>>(self, msg: S) -> T; +} + +impl<T, E: Display> Ensure<T> for Result<T, E> { + fn ensure<S: Into<String>>(self, msg: S) -> T { + match self { + Ok(x) => x, + Err(err) => { + error!("{}: {}", msg.into(), err); + std::process::exit(1); + } + } + } +} + +impl<T> Ensure<T> for Option<T> { + fn ensure<S: Into<String>>(self, msg: S) -> T { + match self { + Some(x) => x, + None => { + error!("{}", msg.into()); + std::process::exit(1); + } + } + } +} + +fn bail<S: Into<String>>(msg: S) -> ! { + error!("{}", msg.into()); + std::process::exit(1); +} diff --git a/corp/russian/data-import/src/mappings.rs b/corp/russian/data-import/src/mappings.rs new file mode 100644 index 0000000000..985088a566 --- /dev/null +++ b/corp/russian/data-import/src/mappings.rs @@ -0,0 +1,185 @@ +//! Manual mapping of some data structures in OC/OR corpora. + +/// Maps the *names* of OpenRussian word types (the `word_type` field +/// in the `or_words` table) to the *set* of OpenCorpora grammemes +/// commonly attached to lemmata of this type in OC. +/// +/// Some word types just don't map over, and are omitted. Many words +/// also have an empty word type. +pub const WORD_TYPES_GRAMMEME_MAP: &'static [(&'static str, &'static [&'static str])] = &[ + ("adjective", &["ADJF"]), + ("adverb", &["ADVB"]), + ("noun", &["NOUN"]), + ("verb", &["INFN"]), // or "VERB" ... +]; + +/// Maps the *names* of OpenRussian grammemes (the `form_type` fields +/// in the `or_word_forms` table) to the *set* of OpenCorpora +/// grammemes attached to them corresponding lemma in the `oc_lemmas` +/// table. +/// +/// This *only* includes grammatical information about the lemma of +/// the word (such as whether it is a verb or other type), but *not* +/// information about the specific instance of the word (such as its +/// gender). +/// +/// Correctly corresponding these requires use of all mapping tables. 
+pub const FORMS_LEMMATA_GRAMMEME_MAP: &'static [(&'static str, &'static [&'static str])] = &[ + ("ru_adj_comparative", &["COMP"]), + ("ru_adj_superlative", &["ADJF", "Supr"]), + ("ru_adj_f_acc", &["ADJF"]), + ("ru_adj_f_dat", &["ADJF"]), + ("ru_adj_f_gen", &["ADJF"]), + ("ru_adj_f_inst", &["ADJF"]), + ("ru_adj_f_nom", &["ADJF"]), + ("ru_adj_f_prep", &["ADJF"]), + ("ru_adj_m_acc", &["ADJF"]), + ("ru_adj_m_dat", &["ADJF"]), + ("ru_adj_m_gen", &["ADJF"]), + ("ru_adj_m_inst", &["ADJF"]), + ("ru_adj_m_nom", &["ADJF"]), + ("ru_adj_m_prep", &["ADJF"]), + ("ru_adj_n_acc", &["ADJF"]), + ("ru_adj_n_dat", &["ADJF"]), + ("ru_adj_n_gen", &["ADJF"]), + ("ru_adj_n_inst", &["ADJF"]), + ("ru_adj_n_nom", &["ADJF"]), + ("ru_adj_n_prep", &["ADJF"]), + ("ru_adj_pl_acc", &["ADJF"]), + ("ru_adj_pl_dat", &["ADJF"]), + ("ru_adj_pl_gen", &["ADJF"]), + ("ru_adj_pl_inst", &["ADJF"]), + ("ru_adj_pl_nom", &["ADJF"]), + ("ru_adj_pl_prep", &["ADJF"]), + ("ru_adj_short_f", &["ADJS"]), + ("ru_adj_short_m", &["ADJS"]), + ("ru_adj_short_n", &["ADJS"]), + ("ru_adj_short_pl", &["ADJS"]), + ("ru_noun_pl_acc", &["NOUN"]), + ("ru_noun_pl_dat", &["NOUN"]), + ("ru_noun_pl_gen", &["NOUN"]), + ("ru_noun_pl_inst", &["NOUN"]), + ("ru_noun_pl_nom", &["NOUN"]), + ("ru_noun_pl_prep", &["NOUN"]), + ("ru_noun_sg_acc", &["NOUN"]), + ("ru_noun_sg_dat", &["NOUN"]), + ("ru_noun_sg_gen", &["NOUN"]), + ("ru_noun_sg_inst", &["NOUN"]), + ("ru_noun_sg_nom", &["NOUN"]), + ("ru_noun_sg_prep", &["NOUN"]), + ("ru_verb_gerund_past", &["GRND"]), + ("ru_verb_gerund_present", &["GRND"]), + ("ru_verb_imperative_pl", &["VERB"]), + ("ru_verb_imperative_sg", &["VERB"]), + ("ru_verb_past_f", &["VERB"]), + ("ru_verb_past_m", &["VERB"]), + ("ru_verb_past_n", &["VERB"]), + ("ru_verb_past_pl", &["VERB"]), + ("ru_verb_presfut_pl1", &["VERB"]), + ("ru_verb_presfut_pl2", &["VERB"]), + ("ru_verb_presfut_pl3", &["VERB"]), + ("ru_verb_presfut_sg1", &["VERB"]), + ("ru_verb_presfut_sg2", &["VERB"]), + ("ru_verb_presfut_sg3", &["VERB"]), + ( + 
"ru_base", + &[ /* nothing consistent, except often 'Fixd' */ ], + ), + ("ru_verb_participle_active_past", &["PRTF", "past", "actv"]), + ( + "ru_verb_participle_active_present", + &["PRTF", "pres", "actv"], + ), + ( + "ru_verb_participle_passive_past", + &["PRTF", "past", "passv"], + ), + ( + "ru_verb_participle_passive_present", + &["PRTF", "pres", "passv"], + ), +]; + +/// Maps the *names* of OpenRussian grammemes (the `form_type` fields +/// in the `or_word_forms` table) to the *set* of OpenCorpora +/// grammemes attached to them corresponding words in the `oc_words` +/// table. +/// +/// This includes grammatical information about the "instance" of the +/// word (such as its gender), but *not* the higher-level type +/// information about its lemma. +/// +/// Correctly corresponding these requires use of all mapping tables. +pub const FORMS_WORDS_GRAMMEME_MAP: &'static [(&'static str, &'static [&'static str])] = &[ + ("ru_adj_comparative", &["Cmp2"]), + ("ru_adj_f_acc", &["femn", "sing", "accs"]), + ("ru_adj_f_dat", &["femn", "sing", "datv"]), + ("ru_adj_f_gen", &["femn", "sing", "gent"]), + ("ru_adj_f_inst", &["femn", "sing", "ablt"]), + ("ru_adj_f_nom", &["femn", "sing", "nomn"]), + ("ru_adj_f_prep", &["femn", "sing", "loct"]), + ("ru_adj_m_acc", &["masc", "sing", "accs"]), + ("ru_adj_m_dat", &["masc", "sing", "datv"]), + ("ru_adj_m_gen", &["masc", "sing", "gent"]), + ("ru_adj_m_inst", &["masc", "sing", "ablt"]), + ("ru_adj_m_nom", &["masc", "sing", "nomn"]), + ("ru_adj_m_prep", &["masc", "sing", "loct"]), + ("ru_adj_n_acc", &["neut", "sing", "accs"]), + ("ru_adj_n_dat", &["neut", "sing", "datv"]), + ("ru_adj_n_gen", &["neut", "sing", "gent"]), + ("ru_adj_n_inst", &["neut", "sing", "ablt"]), + ("ru_adj_n_nom", &["neut", "sing", "nomn"]), + ("ru_adj_n_prep", &["neut", "sing", "loct"]), + ("ru_adj_pl_acc", &["plur", "accs"]), + ("ru_adj_pl_dat", &["plur", "datv"]), + ("ru_adj_pl_gen", &["plur", "gent"]), + ("ru_adj_pl_inst", &["plur", "ablt"]), + 
("ru_adj_pl_nom", &["plur", "nomn"]), + ("ru_adj_pl_prep", &["plur", "loct"]), + ("ru_adj_short_f", &["femn", "sing"]), + ("ru_adj_short_m", &["masc", "sing"]), + ("ru_adj_short_n", &["neut", "sing"]), + ("ru_adj_short_pl", &["plur"]), + ("ru_noun_pl_acc", &["plur", "accs"]), + ("ru_noun_pl_dat", &["plur", "datv"]), + ("ru_noun_pl_gen", &["plur", "gent"]), + ("ru_noun_pl_inst", &["plur", "ablt"]), + ("ru_noun_pl_nom", &["plur", "nomn"]), + ("ru_noun_pl_prep", &["plur", "loct"]), + ("ru_noun_sg_acc", &["sing", "accs"]), + ("ru_noun_sg_dat", &["sing", "datv"]), + ("ru_noun_sg_gen", &["sing", "gent"]), + ("ru_noun_sg_inst", &["sing", "ablt"]), + ("ru_noun_sg_nom", &["sing", "nomn"]), + ("ru_noun_sg_prep", &["sing", "loct"]), + ("ru_verb_gerund_past", &["past", "V-sh"]), + ("ru_verb_imperative_pl", &["plur", "impr"]), + ("ru_verb_imperative_sg", &["sing", "impr"]), + ("ru_verb_past_f", &["femn", "sing", "past"]), + ("ru_verb_past_m", &["masc", "sing", "past"]), + ("ru_verb_past_n", &["neut", "sing", "past"]), + ("ru_verb_past_pl", &["plur", "past"]), + // these also contain "pres" or "futr", depending on the verb. + ("ru_verb_presfut_pl1", &["plur", "1per"]), + ("ru_verb_presfut_pl2", &["plur", "2per"]), + ("ru_verb_presfut_pl3", &["plur", "3per"]), + ("ru_verb_presfut_sg1", &["sing", "1per"]), + ("ru_verb_presfut_sg2", &["sing", "2per"]), + ("ru_verb_presfut_sg3", &["sing", "3per"]), + // Unclear items, probably only useful tags on lemmata + ( + "ru_verb_gerund_present", + &["pres" /* prob. something missing? */], + ), + ( + "ru_adj_superlative", + &[/* TODO: unclear, random list of grammemes?! */], + ), + ("ru_base", &[/* TODO: unclear */]), + // These have no useful tags in the forms table, only gender & + // case tagging. 
+ ("ru_verb_participle_active_past", &[]), + ("ru_verb_participle_active_present", &[]), + ("ru_verb_participle_passive_past", &[]), + ("ru_verb_participle_passive_present", &[]), +]; diff --git a/corp/russian/data-import/src/oc_parser.rs b/corp/russian/data-import/src/oc_parser.rs new file mode 100644 index 0000000000..8103ebd923 --- /dev/null +++ b/corp/russian/data-import/src/oc_parser.rs @@ -0,0 +1,470 @@ +use super::{bail, Ensure}; +use log::{info, warn}; +use std::str::FromStr; +use xml::attribute::OwnedAttribute; +use xml::name::OwnedName; +use xml::reader::XmlEvent; +use xml::EventReader; + +#[derive(Default, Debug)] +pub struct Grammeme { + pub parent: Option<String>, + pub name: String, + pub alias: String, + pub description: String, +} + +/// Single form of a word (either its lemma, or the variations). +#[derive(Debug, Default)] +pub struct Variation { + pub word: String, + pub grammemes: Vec<String>, +} + +#[derive(Debug, Default)] +pub struct Lemma { + pub id: u64, + pub lemma: Variation, + pub grammemes: Vec<String>, + pub variations: Vec<Variation>, +} + +#[derive(Debug, Default)] +pub struct LinkType { + pub id: u64, + pub name: String, +} + +#[derive(Debug, Default)] +pub struct Link { + pub id: u64, // link itself + pub from: u64, // lemma + pub to: u64, // lemma + pub link_type: u64, +} + +#[derive(Debug)] +pub enum OcElement { + Grammeme(Grammeme), + Lemma(Lemma), + LinkType(LinkType), + Link(Link), +} + +#[derive(Debug, PartialEq)] +enum ParserState { + /// Parser is not parsing any particular section and waiting for a + /// start tag instead. + Init, + + /// Parser is parsing grammemes. + Grammemes, + + /// Parser is parsing lemmata. + Lemmata, + + /// Parser is inside a lemma's actual lemma. + Lemma, + + /// Parser is parsing a morphological variation of a lemma. + Variation, + + /// Parser is parsing link types. + LinkTypes, + + /// Parser is parsing links. 
+ Links, + + /// Parser has seen the end of the line and nothing more is + /// available. + Ended, +} + +pub struct OpenCorporaParser<R: std::io::Read> { + reader: EventReader<R>, + state: ParserState, +} + +#[derive(PartialEq)] +enum SectionState { + /// Actively interested in parsing this section. + Active, + + /// Section is known, but currently ignored. + Inactive, + + /// Section is unknown (probably a bug). + Unknown, +} + +fn section_state(section: &str) -> SectionState { + match section { + "grammemes" | "lemmata" | "link_types" | "links" => SectionState::Active, + "restrictions" => SectionState::Inactive, + _ => SectionState::Unknown, + } +} + +impl<R: std::io::Read> OpenCorporaParser<R> { + pub fn new(reader: R) -> Self { + let config = xml::ParserConfig::new().trim_whitespace(true); + let reader = EventReader::new_with_config(reader, config); + + Self { + reader, + state: ParserState::Init, + } + } + + /// Pull an `OcElement` out of the parser. Returns `None` if the + /// parser stream has ended. + pub fn next_element(&mut self) -> Option<OcElement> { + if self.state == ParserState::Ended { + return None; + } + + // Pull the next element to determine what context to enter + // next. + loop { + match &self.next() { + // no-op events that do not affect parser state + XmlEvent::Comment(_) + | XmlEvent::Whitespace(_) + | XmlEvent::ProcessingInstruction { .. } + | XmlEvent::StartDocument { .. } => continue, + XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name } + if name.local_name == "dictionary" => + { + continue + } + + // end of the file, nothing more to return + XmlEvent::EndDocument => { + self.state = ParserState::Ended; + return None; + } + + // some sections are skipped + XmlEvent::StartElement { name, .. 
} | XmlEvent::EndElement { name } + if section_state(&name.local_name) == SectionState::Inactive => + { + info!("skipping {} section", name.local_name); + self.skip_section(&name.local_name); + } + + // active section events start specific parser states ... + XmlEvent::StartElement { name, .. } + if section_state(&name.local_name) == SectionState::Active => + { + self.state = match name.local_name.as_str() { + "grammemes" => ParserState::Grammemes, + "lemmata" => ParserState::Lemmata, + "link_types" => ParserState::LinkTypes, + "links" => ParserState::Links, + _ => unreachable!(), + }; + } + + // ... or end them + XmlEvent::EndElement { name, .. } + if section_state(&name.local_name) == SectionState::Active => + { + // TODO: assert that the right section ended + self.state = ParserState::Init; + } + + // actual beginning of an actual element, dispatch accordingly + event @ XmlEvent::StartElement { + name, attributes, .. + } => match &self.state { + ParserState::Grammemes => { + return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes))) + } + + ParserState::Lemmata => { + return Some(OcElement::Lemma(self.parse_lemma(name, attributes))) + } + + ParserState::LinkTypes => { + return Some(OcElement::LinkType(self.parse_link_type(name, attributes))) + } + + ParserState::Links if name.local_name == "link" => { + return Some(OcElement::Link(self.parse_link(attributes))) + } + + ParserState::Init | ParserState::Ended => bail(format!( + "parser received an unexpected start element while in state {:?}: {:?}", + self.state, event + )), + + other => bail(format!( + "next_element() called while parser was in state {:?}", + other + )), + }, + + // finally, events that indicate a bug if they're + // encountered here + event @ XmlEvent::EndElement { .. 
}
                | event @ XmlEvent::CData(_)
                | event @ XmlEvent::Characters(_) => {
                    bail(format!("unexpected XML event: {:?}", event))
                }
            }
        }
    }

    /// Skip a section by advancing the parser state until we see an
    /// end element for the skipped section.
    ///
    /// The first end element whose local name equals `section` ends
    /// the skip, so this is only suitable for sections that do not
    /// contain a nested element of the same name.
    fn skip_section(&mut self, section: &str) {
        loop {
            match self.next() {
                XmlEvent::EndElement { name } if name.local_name == section => return,

                // Without this guard nothing would ever terminate the
                // loop if the end tag is missing.
                // NOTE(review): this assumes the reader keeps yielding
                // its terminal event once the document has ended -
                // confirm against the xml-rs version in use.
                XmlEvent::EndDocument => bail(format!(
                    "reached end of document while skipping <{}>",
                    section
                )),

                _ => continue,
            }
        }
    }

    /// Pull the next event from the underlying XML reader, failing
    /// through `Ensure` if the reader reports an error.
    fn next(&mut self) -> XmlEvent {
        self.reader.next().ensure("XML parsing failed")
    }

    /// Parse a tag that should have plain string content.
    ///
    /// Consumes events up to and including the end element named
    /// `tag_name` and returns the text seen in between. Comments and
    /// whitespace are ignored; any other event is a fatal error.
    fn parse_string(&mut self, tag_name: &str) -> String {
        let mut out = String::new();

        loop {
            match self.next() {
                // ignore irrelevant things
                XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue,

                // Accumulate the content. Text may arrive as several
                // `Characters` events (e.g. when it is interrupted by
                // a comment), and overwriting would silently drop the
                // earlier pieces.
                XmlEvent::Characters(content) => out.push_str(&content),

                // expect the end of the element
                XmlEvent::EndElement { name } if name.local_name == tag_name => return out,

                // fail on everything unexpected
                event => bail(format!(
                    "unexpected element while parsing <{}>: {:?}",
                    tag_name, event
                )),
            }
        }
    }

    /// Take the value of the first attribute whose local name is
    /// `key`, if there is one.
    fn attribute_value(attributes: Vec<OwnedAttribute>, key: &str) -> Option<String> {
        attributes
            .into_iter()
            .find(|attr| attr.name.local_name == key)
            .map(|attr| attr.value)
    }

    /// Parse a single `<grammeme>` tag.
    ///
    /// `name` and `attributes` belong to the already consumed start
    /// element. A non-empty `parent` attribute is stored as the
    /// grammeme's parent; the `<name>`, `<alias>` and `<description>`
    /// children are read as plain strings. Parsing stops at the
    /// closing `</grammeme>`; any other element is a fatal error.
    fn parse_grammeme(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Grammeme {
        if name.local_name != "grammeme" {
            bail(format!(
                "expected to parse a grammeme, but found <{}>",
                name.local_name
            ));
        }

        let mut grammeme = Grammeme::default();

        for attr in attributes {
            if attr.name.local_name == "parent" && !attr.value.is_empty() {
                grammeme.parent = Some(attr.value.clone());
            }
        }

        loop {
            match self.next() {
                // ignore irrelevant things
                XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue,

                // expect known tags
                XmlEvent::StartElement { name, .. } if name.local_name == "name" => {
                    grammeme.name = self.parse_string("name");
                }

                XmlEvent::StartElement { name, .. } if name.local_name == "alias" => {
                    grammeme.alias = self.parse_string("alias");
                }

                XmlEvent::StartElement { name, .. } if name.local_name == "description" => {
                    grammeme.description = self.parse_string("description");
                }

                // handle end of the grammeme
                XmlEvent::EndElement { name } if name.local_name == "grammeme" => break,

                // fail on everything unexpected
                event => bail(format!(
                    "unexpected element while parsing <grammeme>: {:?}",
                    event
                )),
            }
        }

        grammeme
    }

    /// Parse a single `<lemma>` tag, including its variations.
    ///
    /// `name` and `attributes` belong to the already consumed start
    /// element; the `id` attribute becomes the lemma ID. Inside the
    /// lemma, `<l t="...">` carries the lemma word, each `<f t="...">`
    /// starts a new variation, and `<g v="...">` attaches a grammeme
    /// to whichever of the two is currently open. `self.state` tracks
    /// that: it is `Lemma` while in `<l>`, `Variation` while in `<f>`,
    /// and is reset to `Lemmata` once `</lemma>` has been consumed.
    fn parse_lemma(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Lemma {
        if name.local_name != "lemma" {
            bail(format!(
                "expected to parse a lemma, but found <{}>",
                name.local_name
            ));
        }

        self.state = ParserState::Lemma;
        let mut lemma = Lemma::default();

        for attr in attributes {
            if attr.name.local_name == "id" {
                lemma.id = u64::from_str(&attr.value).ensure("failed to parse lemma ID");
            }
        }

        loop {
            match self.next() {
                // <lemma> has ended
                XmlEvent::EndElement { name } if name.local_name == "lemma" => {
                    self.state = ParserState::Lemmata;
                    return lemma;
                }

                // actual lemma content
                XmlEvent::StartElement {
                    name, attributes, ..
                } => {
                    match name.local_name.as_str() {
                        // beginning to parse the lemma itself
                        "l" => {
                            lemma.lemma.word = Self::attribute_value(attributes, "t")
                                .ensure(format!("lemma {} had no actual word", lemma.id));
                        }

                        // parsing a lemma variation
                        "f" => {
                            self.state = ParserState::Variation;

                            let word = Self::attribute_value(attributes, "t").ensure(format!(
                                "variation of lemma {} had no actual word",
                                lemma.id
                            ));

                            lemma.variations.push(Variation {
                                word,
                                grammemes: vec![],
                            });
                        }

                        // parse a grammeme association
                        "g" => {
                            let grammeme =
                                Self::attribute_value(attributes, "v").ensure(format!(
                                    "grammeme association in lemma {} missing ID",
                                    lemma.id
                                ));

                            // the grammeme belongs to whatever is
                            // currently open: the lemma or a variation
                            match self.state {
                                ParserState::Lemma => {
                                    lemma.grammemes.push(grammeme);
                                }

                                ParserState::Variation => {
                                    lemma
                                        .variations
                                        .last_mut()
                                        .ensure("variations should be non-empty")
                                        .grammemes
                                        .push(grammeme);
                                }

                                _ => bail(format!(
                                    "invalid parser state: encountered grammeme association while in {:?}",
                                    self.state
                                )),
                            }
                        }

                        other => bail(format!("unexpected element while parsing lemma: {other}")),
                    };
                }

                XmlEvent::EndElement { name } => match name.local_name.as_str() {
                    "l" if self.state == ParserState::Lemma => continue,
                    "f" if self.state == ParserState::Variation => {
                        // variation is done, back to the lemma level
                        self.state = ParserState::Lemma;
                        continue;
                    }
                    "g" => continue,
                    other => bail(format!(
                        "unexpected </{other}> while parsing lemma {}",
                        lemma.id
                    )),
                },

                _ => continue,
            }
        }
    }

    /// Parse a single link `<type>` tag.
    ///
    /// `name` and `attributes` belong to the already consumed start
    /// element. The `id` attribute becomes the link type ID and the
    /// text content of the tag becomes its name.
    fn parse_link_type(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> LinkType {
        if name.local_name != "type" {
            bail(format!(
                "expected to parse a link type, but found <{}>",
                name.local_name
            ));
        }

        let mut link_type = LinkType::default();

        for attr in attributes {
            if attr.name.local_name == "id" {
                link_type.id = u64::from_str(&attr.value).ensure("failed to parse link type ID");
            }
        }

        link_type.name = self.parse_string("type");
        link_type
    }

    /// Parse a single `<link>` tag.
    ///
    /// All data lives in the `id`, `from`, `to` and `type` attributes
    /// of the already consumed start element, each of which must be
    /// an unsigned integer. Unknown attributes are logged and skipped.
    fn parse_link(&mut self, attributes: &[OwnedAttribute]) -> Link {
        let mut link = Link::default();

        for attr in attributes {
            // only evaluated for attributes we know, so unknown ones
            // with non-numeric values do not abort the import
            let i_val = || u64::from_str(&attr.value).ensure("failed to parse link field");

            match attr.name.local_name.as_str() {
                "id" => link.id = i_val(),
                "from" => link.from = i_val(),
                "to" => link.to = i_val(),
                "type" => link.link_type = i_val(),

                other => {
                    warn!("unexpected attribute {} on <link>", other);
                }
            }
        }

        // expect the end of the <link> element, though since these
        // are empty it should be immediate.
        self.skip_section("link");

        link
    }
}

// diff --git a/corp/russian/data-import/src/or_parser.rs b/corp/russian/data-import/src/or_parser.rs
// new file mode 100644
// index 0000000000..8bfc61dbef
// --- /dev/null
// +++ b/corp/russian/data-import/src/or_parser.rs
// @@ -0,0 +1,105 @@

//! Parser for the OpenRussian data format.
//!
//! Note that when exporting OpenRussian data from the project you
//! have to choose an encoding. We choose tab-separated CSV files, as
//! tabs have a very low probability of actually appearing in the
//! input data and this skips some potential encoding issues.

use super::Ensure;
use serde::Deserialize;
use std::fs::File;
use std::io::BufReader;
use std::path::PathBuf;

/// A word from the `words` table.
#[derive(Debug, Deserialize)]
pub struct Word {
    pub id: usize,
    pub position: String, // TODO: unknown
    pub bare: String,     // TODO: unknown
    pub accented: String, // TODO: unknown
    pub derived_from_word_id: Option<usize>,
    pub rank: Option<usize>,
    pub disabled: String,     // TODO: unknown
    pub audio: String,        // TODO: unknown
    pub usage_en: String,     // TODO: unknown
    pub usage_de: String,     // TODO: unknown
    pub number_value: String, // TODO: unknown

    #[serde(rename = "type")]
    pub word_type: String, // TODO: unknown

    pub level: String,      // TODO: unknown
    pub created_at: String, // TODO: unknown
}

/// A word form from the `words_forms` table.
#[derive(Debug, Deserialize)]
pub struct WordForm {
    pub id: usize,
    pub word_id: usize,
    pub form_type: String,
    pub position: String,
    pub form: String,
    pub form_bare: String,
}

/// A translation from the `translations` table.
#[derive(Debug, Deserialize)]
pub struct Translation {
    pub id: usize,
    pub lang: String,
    pub word_id: usize,
    pub position: String,
    pub tl: String, // unknown
    pub example_ru: String,
    pub example_tl: String,
    pub info: String,
}

/// Reads the tab-separated OpenRussian export files (`words.csv`,
/// `words_forms.csv`, `translations.csv`) from a single directory.
pub struct OpenRussianParser {
    or_directory: PathBuf,
}

/// Boxed iterator over the deserialised rows of one export file.
pub type DynIter<T> = Box<dyn Iterator<Item = T>>;

impl OpenRussianParser {
    /// Create a parser reading from the export directory `path`.
    ///
    /// No file is opened until one of the iterator methods is called.
    pub fn new<P: Into<PathBuf>>(path: P) -> Self {
        OpenRussianParser {
            or_directory: path.into(),
        }
    }

    /// Iterate over all rows of `words.csv`.
    pub fn words(&self) -> DynIter<Word> {
        self.parser_for("words.csv")
    }

    /// Iterate over all rows of `words_forms.csv`.
    pub fn words_forms(&self) -> DynIter<WordForm> {
        self.parser_for("words_forms.csv")
    }

    /// Iterate over all rows of `translations.csv`.
    pub fn translations(&self) -> DynIter<Translation> {
        self.parser_for("translations.csv")
    }

    /// Open `file_name` inside the export directory and return an
    /// iterator deserialising each tab-separated row into a `T`.
    ///
    /// Failure to open the file, or to deserialise any row, is fatal
    /// and reported through `Ensure`.
    fn parser_for<T: serde::de::DeserializeOwned + 'static>(&self, file_name: &str) -> DynIter<T> {
        let path = self.or_directory.join(file_name);

        // name the file that actually failed, not always words.csv
        let file = File::open(&path).ensure(format!("failed to open {}", path.display()));

        let reader = csv::ReaderBuilder::new()
            .delimiter(b'\t')
            .from_reader(BufReader::new(file));

        Box::new(reader.into_deserialize().map(|result| {
            result.ensure(format!(
                "failed to deserialize {}",
                std::any::type_name::<T>()
            ))
        }))
    }
}