diff options
author | Vincent Ambo <mail@tazj.in> | 2023-01-20T10·31+0300 |
---|---|---|
committer | tazjin <tazjin@tvl.su> | 2023-01-21T17·49+0000 |
commit | 429c0d00c4cd07ea90c85bf1ec2f2c742d970420 (patch) | |
tree | 85103dcdf8b7c9d30552dfc97321ad99d77ff2e3 /corp/russian/data-import/src | |
parent | ee0c0ee95103fa10e227a1976149d20e6944001c (diff) |
feat(corp/data-import): add import of OpenRussian 'words' table r/5729
This is actually the lemmata table of this corpus, not the forms of all words (they're in a separate table).

Change-Id: I89a2c2817ccce840f47406fa2a636f4ed3f49154
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7893
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Diffstat (limited to 'corp/russian/data-import/src')
-rw-r--r-- | corp/russian/data-import/src/db_setup.rs | 51 | ||||
-rw-r--r-- | corp/russian/data-import/src/main.rs | 126 | ||||
-rw-r--r-- | corp/russian/data-import/src/or_parser.rs | 73 |
3 files changed, 223 insertions, 27 deletions
diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs index 3f0fa0ff638d..5fe64717ad9b 100644 --- a/corp/russian/data-import/src/db_setup.rs +++ b/corp/russian/data-import/src/db_setup.rs @@ -8,6 +8,7 @@ use super::{bail, Ensure}; use crate::oc_parser::*; +use crate::or_parser; use log::{debug, info}; use rusqlite::Connection; @@ -69,7 +70,7 @@ CREATE TABLE oc_links ( "#, ) - .ensure("setting up initial table schema failed"); + .ensure("setting up OpenCorpora table schema failed"); info!("set up initial table schema for OpenCorpora import"); } @@ -166,3 +167,51 @@ fn insert_lemma(conn: &Connection, lemma: Lemma) { debug!("inserted lemma {}", lemma.id); } + +/// Sets up an initial schema for the OpenRussian data. +pub fn initial_or_schema(conn: &Connection) { + conn.execute_batch( + r#" +CREATE TABLE or_words ( + id INTEGER PRIMARY KEY, + bare TEXT NOT NULL, + accented TEXT, + derived_from_word_id INTEGER, + rank TEXT, + word_type TEXT, + level TEXT +) STRICT; +"#, + ) + .ensure("setting up OpenRussian table schema failed"); + + info!("set up initial table schema for OpenRussian import"); +} + +pub fn insert_or_words<I: Iterator<Item = or_parser::Word>>(conn: &Connection, words: I) { + let mut stmt = conn + .prepare_cached( + " +INSERT INTO or_words (id, bare, accented, derived_from_word_id, rank, word_type, level) +VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7) +", + ) + .ensure("failed to prepare OR words statement"); + let mut count = 0; + + for word in words { + stmt.execute(( + word.id, + word.bare, + word.accented, + word.derived_from_word_id, + word.rank, + word.word_type, + word.level, + )) + .ensure("failed to insert OR word"); + count += 1; + } + + info!("inserted {} OpenRussian words", count); +} diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs index 21d4209991c5..11387539ab84 100644 --- a/corp/russian/data-import/src/main.rs +++ b/corp/russian/data-import/src/main.rs @@ -1,6 +1,6 @@ 
-//! This program imports Russian language data from OpenCorpora and -//! OpenRussian ("Открытый корпус") into a SQLite database that can be -//! used for [//corp/russian][corp-russian] projects. +//! This program imports Russian language data from OpenCorpora +//! ("Открытый корпус") and OpenRussian into a SQLite database that +//! can be used for [//corp/russian][corp-russian] projects. //! //! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian //! @@ -112,42 +112,77 @@ use std::io::BufReader; mod db_setup; mod oc_parser; +mod or_parser; -fn main() { - env_logger::builder() - .filter_level(log::LevelFilter::Info) - .init(); +struct Args { + output: String, + or_input: String, + oc_input: String, +} - let (input_path, output_path) = { - let mut args = env::args().collect::<Vec<_>>(); +impl Args { + fn populated(&self) -> bool { + !(self.output.is_empty() || self.or_input.is_empty() || self.oc_input.is_empty()) + } +} - if args.len() != 3 { - bail(format!( - "usage: {} <input-file> <output-file>", - args.first().map(String::as_str).unwrap_or("data-import") - )); - } +fn usage(binary_name: &str) { + bail(format!( + "usage: {} --output <output-file> --or-input <or-input> --oc-input <oc-input>", + binary_name + )); +} + +fn parse_args() -> Args { + let mut args_iter = env::args(); + let binary_name = args_iter.next().unwrap(); - (args.remove(1), args.remove(1)) + let mut args = Args { + output: "".into(), + or_input: env::var("OPENRUSSIAN_DATA").unwrap_or_default(), + oc_input: env::var("OPENCORPORA_DATA").unwrap_or_default(), }; - info!("reading from {input_path}; writing output to {output_path}"); - let input_file = File::open(input_path).ensure("failed to open input file"); + loop { + if args.populated() { + break; + } - let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file)); + while let Some(arg) = args_iter.next() { + match arg.as_str() { + "--output" => { + args.output = args_iter.next().unwrap(); + } - let conn = 
Connection::open(output_path).ensure("failed to open DB connection"); + "--or-input" => { + args.or_input = args_iter.next().unwrap(); + } - db_setup::initial_oc_schema(&conn); + "--oc-input" => { + args.oc_input = args_iter.next().unwrap(); + } - // afterwards: - // add actual IDs to grammemes - // properly reference keys internally - // add foreign key constraint on lemma_grammemes.grammeme + _ => usage(&binary_name), + } + } + } + + if args.output.is_empty() || args.or_input.is_empty() || args.oc_input.is_empty() { + usage(&binary_name); + } + + args +} + +fn open_corpora(conn: &Connection, args: &Args) { + let input_file = File::open(&args.oc_input).ensure("failed to open input file"); + let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file)); + db_setup::initial_oc_schema(&conn); let mut tx = conn .unchecked_transaction() .ensure("failed to start transaction"); + let mut count = 0; while let Some(elem) = parser.next_element() { @@ -165,7 +200,46 @@ fn main() { count += 1; } - tx.commit().ensure("final commit failed"); + tx.commit().ensure("final OpenCorpora commit failed"); + + info!("finished OpenCorpora import"); +} + +fn open_russian(conn: &Connection, args: &Args) { + let parser = or_parser::OpenRussianParser::new(&args.or_input); + + db_setup::initial_or_schema(conn); + + let tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + + db_setup::insert_or_words(&tx, parser.words()); + tx.commit().ensure("OpenRussian words commit failed"); + + info!("finished OpenRussian import"); +} + +fn main() { + env_logger::builder() + .filter_level(log::LevelFilter::Info) + .init(); + + let args = parse_args(); + + info!("output path: {}", args.output); + info!("OpenCorpora input path: {}", args.oc_input); + info!("OpenRussian input path: {}", args.or_input); + + let conn = Connection::open(&args.output).ensure("failed to open DB connection"); + + open_corpora(&conn, &args); + open_russian(&conn, &args); + + // afterwards: 
+ // add actual IDs to grammemes + // properly reference keys internally + // add foreign key constraint on lemma_grammemes.grammeme } /// It's like `expect`, but through `log::error`. diff --git a/corp/russian/data-import/src/or_parser.rs b/corp/russian/data-import/src/or_parser.rs new file mode 100644 index 000000000000..c11896f6bac0 --- /dev/null +++ b/corp/russian/data-import/src/or_parser.rs @@ -0,0 +1,73 @@ +//! Parser for the OpenRussian data format. +//! +//! Note that when exporting OpenRussian data from the project you +//! have to choose an encoding. We choose tab-separated CSV files, as +//! tabs have a very low probability of actually appearing in the +//! input data and this skips some potential encoding issues. + +use super::Ensure; +use serde::Deserialize; +use std::fs::File; +use std::io::BufReader; +use std::path::PathBuf; + +/// A word from the `words` table. +#[derive(Debug, Deserialize)] +pub struct Word { + pub id: usize, + pub position: String, // TODO: unknown + pub bare: String, // TODO: unknown + pub accented: String, // TODO: unknown + pub derived_from_word_id: Option<usize>, + pub rank: String, // TODO: unknown + pub disabled: String, // TODO: unknown + pub audio: String, // TODO: unknown + pub usage_en: String, // TODO: unknown + pub usage_de: String, // TODO: unknown + pub number_value: String, // TODO: unknown + + #[serde(rename = "type")] + pub word_type: String, // TODO: unknown + + pub level: String, // TODO: unknown + pub created_at: String, // TODO: unknown +} + +pub struct OpenRussianParser { + or_directory: PathBuf, +} + +pub type DynIter<T> = Box<dyn Iterator<Item = T>>; + +impl OpenRussianParser { + pub fn new<P: Into<PathBuf>>(path: P) -> Self { + OpenRussianParser { + or_directory: path.into(), + } + } + + pub fn words(&self) -> DynIter<Word> { + self.parser_for("words.csv") + } + + fn parser_for<T: serde::de::DeserializeOwned + 'static>( + &self, + file_name: &str, + ) -> Box<dyn Iterator<Item = T>> { + let mut path = 
self.or_directory.clone(); + path.push(file_name); + + let reader = csv::ReaderBuilder::new() + .delimiter(b'\t') + .from_reader(BufReader::new( + File::open(&path).ensure("failed to open words.csv"), + )); + + Box::new(reader.into_deserialize().map(|result| { + result.ensure(format!( + "failed to deserialize {}", + std::any::type_name::<T>() + )) + })) + } +} |