//! This program imports Russian language data from OpenCorpora
//! ("Открытый корпус") and OpenRussian into a SQLite database that
//! can be used for [//corp/russian][corp-russian] projects.
//!
//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian
//!
//! Ideally, running this on intact dumps should yield a fully
//! functional SQLite database compatible with all other tools
//! consuming it.
//!
//! ## OpenCorpora format
//!
//! The format used is partially documented on the [OpenCorpora
//! website][format-docs]. This seems to be a slightly outdated
//! format, however, hence some information about what the format
//! seems to be today.
//!
//! [format-docs]: http://opencorpora.org/?page=export
//!
//! The format is an XML file, which has several categories of data,
//! each with their own schema:
//!
//! * `grammemes`: These define units of grammar. They're *likely* pretty
//!   static, and we'll *likely* want to map them into a custom set of
//!   (simpler) categories.
//!
//!   They form some kind of internal hierarchy, where some of them have a
//!   `parent` attribute set to some other grammeme's `name`.
//!
//!   There's a ridiculous number of these.
//!
//! * `restrictions`: Unclear, not documented on the page. They describe
//!   something about the relationship between grammemes.
//!
//! * `lemmata`: this lists the actual lemmas, as well as all their
//!   included morphological variants
//!
//!   Each lemma has an `id` attribute uniquely identifying its dictionary
//!   form, as well as a number of sub-elements:
//!
//!   * the `l` element contains the lemma itself
//!   * the `f` elements contain morphological variations
//!
//!   Each of these sub elements again contains a number of `g` elements,
//!   which refer to the IDs of grammemes in their `v` attributes.
//!
//! * `link_types`: These list possible "relationships between lemmas",
//!   basically just assigning them IDs and names. There's only 27 of
//!   these.
//!
//! * `links`: Using the types defined above, this establishes links
//!   between lemmas that have some kind of relationship.
//!
//!   For example, a relationship `cardinal/ordinal` might be established
//!   between the lemmas "два" and "второй".
//!
//! ## OpenRussian format
//!
//! The [OpenRussian](https://en.openrussian.org/dictionary) project
//! lets users export its database as a set of CSV-files. For our
//! purposes, we download the files using `<tab>` separators.
//!
//! Whereas OpenCorpora opts for a flat structure with a "tag" system
//! (through its flexible grammemes), OpenRussian has a fixed pre-hoc
//! structure into which it sorts some words with their morphologies.
//! The OpenRussian database is much smaller as of January 2023 (~1.7
//! million words vs. >5 million for OpenCorpora), but some of the
//! information is much more practically useful.
//!
//! Two very important bits of information OpenRussian has are accent
//! marks (most tables containing actual words have a normal form
//! containing an accent mark, and a "bare" form without) and
//! translations into English and German.
//!
//! The full dump includes the following tables (and some more):
//!
//! * `words`: List of lemmas in the corpus, with various bits of
//!   metadata as well as hand-written notes.
//!
//! * `adjectives`: Contains IDs for words that are adjectives.
//!
//! * `nouns`: IDs for words that are nouns; and noun metadata (e.g.
//!   gender, declinability)
//!
//! * `verbs`: IDs of words that are verbs, including their aspect and
//!   "partnered" verb in the other aspect
//!
//! * `words_forms`: Contains all morphed variants of the lemmas from
//!   `words`, including information about their grammeme, and accent
//!   marks.
//!
//! * `words_rels`: Contains relations between words, containing
//!   information like "synonyms" or general relation between words.
//!
//! * `translations`: Contains translations tagged by target language,
as well as examples and (occasionally) additional information. //! //! These tables also contain something, but have not been analysed //! yet: //! //! * `expressions_words` //! * `sentences` //! * `sentences_translations` //! * `sentences_words` use log::{error, info}; use rusqlite::{Connection, Result}; use std::env; use std::fmt::Display; use std::fs::File; use std::io::BufReader; mod db_setup; mod oc_parser; mod or_parser; struct Args { output: String, or_input: String, oc_input: String, } impl Args { fn populated(&self) -> bool { !(self.output.is_empty() || self.or_input.is_empty() || self.oc_input.is_empty()) } } fn usage(binary_name: &str) { bail(format!( "usage: {} --output --or-input --oc-input ", binary_name )); } fn parse_args() -> Args { let mut args_iter = env::args(); let binary_name = args_iter.next().unwrap(); let mut args = Args { output: "".into(), or_input: env::var("OPENRUSSIAN_DATA").unwrap_or_default(), oc_input: env::var("OPENCORPORA_DATA").unwrap_or_default(), }; loop { if args.populated() { break; } while let Some(arg) = args_iter.next() { match arg.as_str() { "--output" => { args.output = args_iter.next().unwrap(); } "--or-input" => { args.or_input = args_iter.next().unwrap(); } "--oc-input" => { args.oc_input = args_iter.next().unwrap(); } _ => usage(&binary_name), } } } if args.output.is_empty() || args.or_input.is_empty() || args.oc_input.is_empty() { usage(&binary_name); } args } fn open_corpora(conn: &Connection, args: &Args) { let input_file = File::open(&args.oc_input).ensure("failed to open input file"); let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file)); db_setup::initial_oc_schema(&conn); let mut tx = conn .unchecked_transaction() .ensure("failed to start transaction"); let mut count = 0; while let Some(elem) = parser.next_element() { // commit every 1000 things if count % 1000 == 0 { tx.commit().ensure("transaction failed"); tx = conn .unchecked_transaction() .ensure("failed to start new 
transaction"); info!("transaction committed at watermark {}", count); } db_setup::insert_oc_element(&tx, elem); count += 1; } tx.commit().ensure("final OpenCorpora commit failed"); info!("finished OpenCorpora import"); } fn open_russian(conn: &Connection, args: &Args) { let parser = or_parser::OpenRussianParser::new(&args.or_input); db_setup::initial_or_schema(conn); let tx = conn .unchecked_transaction() .ensure("failed to start transaction"); db_setup::insert_or_words(&tx, parser.words()); tx.commit().ensure("OpenRussian words commit failed"); info!("finished OpenRussian import"); } fn main() { env_logger::builder() .filter_level(log::LevelFilter::Info) .init(); let args = parse_args(); info!("output path: {}", args.output); info!("OpenCorpora input path: {}", args.oc_input); info!("OpenRussian input path: {}", args.or_input); let conn = Connection::open(&args.output).ensure("failed to open DB connection"); open_corpora(&conn, &args); open_russian(&conn, &args); // afterwards: // add actual IDs to grammemes // properly reference keys internally // add foreign key constraint on lemma_grammemes.grammeme } /// It's like `expect`, but through `log::error`. trait Ensure { fn ensure>(self, msg: S) -> T; } impl Ensure for Result { fn ensure>(self, msg: S) -> T { match self { Ok(x) => x, Err(err) => { error!("{}: {}", msg.into(), err); std::process::exit(1); } } } } impl Ensure for Option { fn ensure>(self, msg: S) -> T { match self { Some(x) => x, None => { error!("{}", msg.into()); std::process::exit(1); } } } } fn bail>(msg: S) -> ! { error!("{}", msg.into()); std::process::exit(1); }