diff options
author | Vincent Ambo <mail@tazj.in> | 2023-01-20T10·31+0300 |
---|---|---|
committer | tazjin <tazjin@tvl.su> | 2023-01-21T17·49+0000 |
commit | 429c0d00c4cd07ea90c85bf1ec2f2c742d970420 (patch) | |
tree | 85103dcdf8b7c9d30552dfc97321ad99d77ff2e3 /corp/russian | |
parent | ee0c0ee95103fa10e227a1976149d20e6944001c (diff) |
feat(corp/data-import): add import of OpenRussian 'words' table r/5729
This is actually the lemmata table of this corpus, not the forms of all words (they're in a separate table). Change-Id: I89a2c2817ccce840f47406fa2a636f4ed3f49154 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7893 Reviewed-by: tazjin <tazjin@tvl.su> Tested-by: BuildkiteCI
Diffstat (limited to 'corp/russian')
-rw-r--r-- | corp/russian/data-import/Cargo.lock | 115 | ||||
-rw-r--r-- | corp/russian/data-import/Cargo.toml | 2 | ||||
-rw-r--r-- | corp/russian/data-import/default.nix | 11 | ||||
-rw-r--r-- | corp/russian/data-import/src/db_setup.rs | 51 | ||||
-rw-r--r-- | corp/russian/data-import/src/main.rs | 126 | ||||
-rw-r--r-- | corp/russian/data-import/src/or_parser.rs | 73 |
6 files changed, 348 insertions, 30 deletions
diff --git a/corp/russian/data-import/Cargo.lock b/corp/russian/data-import/Cargo.lock index 125b62d43e90..cd85e058108f 100644 --- a/corp/russian/data-import/Cargo.lock +++ b/corp/russian/data-import/Cargo.lock @@ -29,6 +29,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] name = "cc" version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -41,12 +53,36 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] name = "data-import" version = "0.1.0" dependencies = [ + "csv", "env_logger", "log", "rusqlite", + "serde", "xml-rs", ] @@ -163,6 +199,18 @@ dependencies = [ ] [[package]] +name = "itoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] name = "libc" version = "0.2.139" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -212,6 +260,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" [[package]] +name = "proc-macro2" +version = "1.0.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +dependencies = [ + "proc-macro2", +] + +[[package]] name = "regex" version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -223,6 +289,12 @@ dependencies = [ ] [[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" + +[[package]] name = "regex-syntax" version = "0.6.28" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -257,12 +329,49 @@ dependencies = [ ] [[package]] +name = "ryu" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde" + +[[package]] +name = "serde" +version = "1.0.152" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.152" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] name = "smallvec" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] +name = "syn" +version = "1.0.107" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] name = "termcolor" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -272,6 +381,12 @@ dependencies = [ ] [[package]] +name = "unicode-ident" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" + +[[package]] name = "vcpkg" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" diff --git a/corp/russian/data-import/Cargo.toml b/corp/russian/data-import/Cargo.toml index b43f829f37b0..1aae2e830578 100644 --- a/corp/russian/data-import/Cargo.toml +++ b/corp/russian/data-import/Cargo.toml @@ -6,9 +6,11 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +csv = "1.1" env_logger = "0.10.0" log = "0.4.17" rusqlite = "0.28" +serde = { version = "1.0", features = ["derive"] } xml-rs = "0.8" [profile.release-with-debug] diff --git a/corp/russian/data-import/default.nix b/corp/russian/data-import/default.nix index cf358874dce6..6aa8ad6aa3d8 100644 --- a/corp/russian/data-import/default.nix +++ b/corp/russian/data-import/default.nix @@ -19,6 +19,9 @@ let ${pkgs.bzip2}/bin/bunzip2 -k -c ${openCorporaArchive} > $out ''; + # mirrored input data from OpenRussian, as of 2023-01-17. + # + # This data is licensed under CC-BY-SA. openRussianArchive = pkgs.fetchzip { name = "openrussian-20230117"; url = "https://tazj.in/blobs/openrussian-20230117.tar.xz"; @@ -43,8 +46,10 @@ lib.fix (self: depot.third_party.naersk.buildPackage { inherit shell openCorpora; # target that actually builds an entire database - database = pkgs.runCommand "tvl-russian-db.sqlite" { } '' - ${self}/bin/data-import ${openCorpora} $out - ''; + database = pkgs.runCommand "tvl-russian-db.sqlite" + { + OPENCORPORA_DATA = openCorpora; + OPENRUSSIAN_DATA = openRussianArchive; + } "${self}/bin/data-import --output $out"; }; }) diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs index 3f0fa0ff638d..5fe64717ad9b 100644 --- a/corp/russian/data-import/src/db_setup.rs +++ b/corp/russian/data-import/src/db_setup.rs @@ -8,6 +8,7 @@ use super::{bail, Ensure}; use crate::oc_parser::*; +use crate::or_parser; use log::{debug, info}; use rusqlite::Connection; @@ -69,7 +70,7 @@ CREATE TABLE oc_links ( "#, ) - .ensure("setting up initial table schema failed"); + .ensure("setting up OpenCorpora table schema failed"); info!("set up initial table schema for OpenCorpora import"); } @@ -166,3 +167,51 @@ fn insert_lemma(conn: &Connection, lemma: Lemma) { debug!("inserted lemma {}", lemma.id); } + +/// Sets up an initial schema for the OpenRussian data. +pub fn initial_or_schema(conn: &Connection) { + conn.execute_batch( + r#" +CREATE TABLE or_words ( + id INTEGER PRIMARY KEY, + bare TEXT NOT NULL, + accented TEXT, + derived_from_word_id INTEGER, + rank TEXT, + word_type TEXT, + level TEXT +) STRICT; +"#, + ) + .ensure("setting up OpenRussian table schema failed"); + + info!("set up initial table schema for OpenRussian import"); +} + +pub fn insert_or_words<I: Iterator<Item = or_parser::Word>>(conn: &Connection, words: I) { + let mut stmt = conn + .prepare_cached( + " +INSERT INTO or_words (id, bare, accented, derived_from_word_id, rank, word_type, level) +VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7) +", + ) + .ensure("failed to prepare OR words statement"); + let mut count = 0; + + for word in words { + stmt.execute(( + word.id, + word.bare, + word.accented, + word.derived_from_word_id, + word.rank, + word.word_type, + word.level, + )) + .ensure("failed to insert OR word"); + count += 1; + } + + info!("inserted {} OpenRussian words", count); +} diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs index 21d4209991c5..11387539ab84 100644 --- a/corp/russian/data-import/src/main.rs +++ b/corp/russian/data-import/src/main.rs @@ -1,6 +1,6 @@ -//! This program imports Russian language data from OpenCorpora and -//! OpenRussian ("Открытый корпус") into a SQLite database that can be -//! used for [//corp/russian][corp-russian] projects. +//! This program imports Russian language data from OpenCorpora +//! ("Открытый корпус") and OpenRussian into a SQLite database that +//! can be used for [//corp/russian][corp-russian] projects. //! //! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian //! @@ -112,42 +112,77 @@ use std::io::BufReader; mod db_setup; mod oc_parser; +mod or_parser; -fn main() { - env_logger::builder() - .filter_level(log::LevelFilter::Info) - .init(); +struct Args { + output: String, + or_input: String, + oc_input: String, +} - let (input_path, output_path) = { - let mut args = env::args().collect::<Vec<_>>(); +impl Args { + fn populated(&self) -> bool { + !(self.output.is_empty() || self.or_input.is_empty() || self.oc_input.is_empty()) + } +} - if args.len() != 3 { - bail(format!( - "usage: {} <input-file> <output-file>", - args.first().map(String::as_str).unwrap_or("data-import") - )); - } +fn usage(binary_name: &str) { + bail(format!( + "usage: {} --output <output-file> --or-input <or-input> --oc-input <oc-input>", + binary_name + )); +} + +fn parse_args() -> Args { + let mut args_iter = env::args(); + let binary_name = args_iter.next().unwrap(); - (args.remove(1), args.remove(1)) + let mut args = Args { + output: "".into(), + or_input: env::var("OPENRUSSIAN_DATA").unwrap_or_default(), + oc_input: env::var("OPENCORPORA_DATA").unwrap_or_default(), }; - info!("reading from {input_path}; writing output to {output_path}"); - let input_file = File::open(input_path).ensure("failed to open input file"); + loop { + if args.populated() { + break; + } - let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file)); + while let Some(arg) = args_iter.next() { + match arg.as_str() { + "--output" => { + args.output = args_iter.next().unwrap(); + } - let conn = Connection::open(output_path).ensure("failed to open DB connection"); + "--or-input" => { + args.or_input = args_iter.next().unwrap(); + } - db_setup::initial_oc_schema(&conn); + "--oc-input" => { + args.oc_input = args_iter.next().unwrap(); + } - // afterwards: - // add actual IDs to grammemes - // properly reference keys internally - // add foreign key constraint on lemma_grammemes.grammeme + _ => usage(&binary_name), + } + } + } + + if args.output.is_empty() || args.or_input.is_empty() || args.oc_input.is_empty() { + usage(&binary_name); + } + + args +} + +fn open_corpora(conn: &Connection, args: &Args) { + let input_file = File::open(&args.oc_input).ensure("failed to open input file"); + let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file)); + db_setup::initial_oc_schema(&conn); let mut tx = conn .unchecked_transaction() .ensure("failed to start transaction"); + let mut count = 0; while let Some(elem) = parser.next_element() { @@ -165,7 +200,46 @@ fn main() { count += 1; } - tx.commit().ensure("final commit failed"); + tx.commit().ensure("final OpenCorpora commit failed"); + + info!("finished OpenCorpora import"); +} + +fn open_russian(conn: &Connection, args: &Args) { + let parser = or_parser::OpenRussianParser::new(&args.or_input); + + db_setup::initial_or_schema(conn); + + let tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + + db_setup::insert_or_words(&tx, parser.words()); + tx.commit().ensure("OpenRussian words commit failed"); + + info!("finished OpenRussian import"); +} + +fn main() { + env_logger::builder() + .filter_level(log::LevelFilter::Info) + .init(); + + let args = parse_args(); + + info!("output path: {}", args.output); + info!("OpenCorpora input path: {}", args.oc_input); + info!("OpenRussian input path: {}", args.or_input); + + let conn = Connection::open(&args.output).ensure("failed to open DB connection"); + + open_corpora(&conn, &args); + open_russian(&conn, &args); + + // afterwards: + // add actual IDs to grammemes + // properly reference keys internally + // add foreign key constraint on lemma_grammemes.grammeme } /// It's like `expect`, but through `log::error`. diff --git a/corp/russian/data-import/src/or_parser.rs b/corp/russian/data-import/src/or_parser.rs new file mode 100644 index 000000000000..c11896f6bac0 --- /dev/null +++ b/corp/russian/data-import/src/or_parser.rs @@ -0,0 +1,73 @@ +//! Parser for the OpenRussian data format. +//! +//! Note that when exporting OpenRussian data from the project you +//! have to choose an encoding. We choose tab-separated CSV files, as +//! tabs have a very low probability of actually appearing in the +//! input data and this skips some potential encoding issues. + +use super::Ensure; +use serde::Deserialize; +use std::fs::File; +use std::io::BufReader; +use std::path::PathBuf; + +/// A word from the `words` table. +#[derive(Debug, Deserialize)] +pub struct Word { + pub id: usize, + pub position: String, // TODO: unknown + pub bare: String, // TODO: unknown + pub accented: String, // TODO: unknown + pub derived_from_word_id: Option<usize>, + pub rank: String, // TODO: unknown + pub disabled: String, // TODO: unknown + pub audio: String, // TODO: unknown + pub usage_en: String, // TODO: unknown + pub usage_de: String, // TODO: unknown + pub number_value: String, // TODO: unknown + + #[serde(rename = "type")] + pub word_type: String, // TODO: unknown + + pub level: String, // TODO: unknown + pub created_at: String, // TODO: unknown +} + +pub struct OpenRussianParser { + or_directory: PathBuf, +} + +pub type DynIter<T> = Box<dyn Iterator<Item = T>>; + +impl OpenRussianParser { + pub fn new<P: Into<PathBuf>>(path: P) -> Self { + OpenRussianParser { + or_directory: path.into(), + } + } + + pub fn words(&self) -> DynIter<Word> { + self.parser_for("words.csv") + } + + fn parser_for<T: serde::de::DeserializeOwned + 'static>( + &self, + file_name: &str, + ) -> Box<dyn Iterator<Item = T>> { + let mut path = self.or_directory.clone(); + path.push(file_name); + + let reader = csv::ReaderBuilder::new() + .delimiter(b'\t') + .from_reader(BufReader::new( + File::open(&path).ensure("failed to open words.csv"), + )); + + Box::new(reader.into_deserialize().map(|result| { + result.ensure(format!( + "failed to deserialize {}", + std::any::type_name::<T>() + )) + })) + } +} |