From 429c0d00c4cd07ea90c85bf1ec2f2c742d970420 Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Fri, 20 Jan 2023 13:31:12 +0300 Subject: feat(corp/data-import): add import of OpenRussian 'words' table This is actually the lemmata table of this corpus, not the forms of all words (they're in a separate table). Change-Id: I89a2c2817ccce840f47406fa2a636f4ed3f49154 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7893 Reviewed-by: tazjin Tested-by: BuildkiteCI --- corp/russian/data-import/src/main.rs | 126 +++++++++++++++++++++++++++-------- 1 file changed, 100 insertions(+), 26 deletions(-) (limited to 'corp/russian/data-import/src/main.rs') diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs index 21d4209991c5..11387539ab84 100644 --- a/corp/russian/data-import/src/main.rs +++ b/corp/russian/data-import/src/main.rs @@ -1,6 +1,6 @@ -//! This program imports Russian language data from OpenCorpora and -//! OpenRussian ("Открытый корпус") into a SQLite database that can be -//! used for [//corp/russian][corp-russian] projects. +//! This program imports Russian language data from OpenCorpora +//! ("Открытый корпус") and OpenRussian into a SQLite database that +//! can be used for [//corp/russian][corp-russian] projects. //! //! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian //! @@ -112,42 +112,77 @@ use std::io::BufReader; mod db_setup; mod oc_parser; +mod or_parser; -fn main() { - env_logger::builder() - .filter_level(log::LevelFilter::Info) - .init(); +struct Args { + output: String, + or_input: String, + oc_input: String, +} - let (input_path, output_path) = { - let mut args = env::args().collect::>(); +impl Args { + fn populated(&self) -> bool { + !(self.output.is_empty() || self.or_input.is_empty() || self.oc_input.is_empty()) + } +} - if args.len() != 3 { - bail(format!( - "usage: {} ", - args.first().map(String::as_str).unwrap_or("data-import") - )); - } +fn usage(binary_name: &str) { + bail(format!( + "usage: {} --output --or-input --oc-input ", + binary_name + )); +} + +fn parse_args() -> Args { + let mut args_iter = env::args(); + let binary_name = args_iter.next().unwrap(); - (args.remove(1), args.remove(1)) + let mut args = Args { + output: "".into(), + or_input: env::var("OPENRUSSIAN_DATA").unwrap_or_default(), + oc_input: env::var("OPENCORPORA_DATA").unwrap_or_default(), }; - info!("reading from {input_path}; writing output to {output_path}"); - let input_file = File::open(input_path).ensure("failed to open input file"); + loop { + if args.populated() { + break; + } - let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file)); + while let Some(arg) = args_iter.next() { + match arg.as_str() { + "--output" => { + args.output = args_iter.next().unwrap(); + } - let conn = Connection::open(output_path).ensure("failed to open DB connection"); + "--or-input" => { + args.or_input = args_iter.next().unwrap(); + } - db_setup::initial_oc_schema(&conn); + "--oc-input" => { + args.oc_input = args_iter.next().unwrap(); + } - // afterwards: - // add actual IDs to grammemes - // properly reference keys internally - // add foreign key constraint on lemma_grammemes.grammeme + _ => usage(&binary_name), + } + } + } + + if args.output.is_empty() || args.or_input.is_empty() || args.oc_input.is_empty() { + usage(&binary_name); + } + + args +} + +fn open_corpora(conn: &Connection, args: &Args) { + let input_file = File::open(&args.oc_input).ensure("failed to open input file"); + let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file)); + db_setup::initial_oc_schema(&conn); let mut tx = conn .unchecked_transaction() .ensure("failed to start transaction"); + let mut count = 0; while let Some(elem) = parser.next_element() { @@ -165,7 +200,46 @@ fn main() { count += 1; } - tx.commit().ensure("final commit failed"); + tx.commit().ensure("final OpenCorpora commit failed"); + + info!("finished OpenCorpora import"); +} + +fn open_russian(conn: &Connection, args: &Args) { + let parser = or_parser::OpenRussianParser::new(&args.or_input); + + db_setup::initial_or_schema(conn); + + let tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + + db_setup::insert_or_words(&tx, parser.words()); + tx.commit().ensure("OpenRussian words commit failed"); + + info!("finished OpenRussian import"); +} + +fn main() { + env_logger::builder() + .filter_level(log::LevelFilter::Info) + .init(); + + let args = parse_args(); + + info!("output path: {}", args.output); + info!("OpenCorpora input path: {}", args.oc_input); + info!("OpenRussian input path: {}", args.or_input); + + let conn = Connection::open(&args.output).ensure("failed to open DB connection"); + + open_corpora(&conn, &args); + open_russian(&conn, &args); + + // afterwards: + // add actual IDs to grammemes + // properly reference keys internally + // add foreign key constraint on lemma_grammemes.grammeme } /// It's like `expect`, but through `log::error`. -- cgit 1.4.1