diff options
author | Vincent Ambo <mail@tazj.in> | 2023-01-21T18·17+0300 |
---|---|---|
committer | tazjin <tazjin@tvl.su> | 2023-01-22T16·13+0000 |
commit | ed8dd4acd71161893a06dd25567852b1855ac1ab (patch) | |
tree | 5dd998379e20c6601ef734b2fd15ef99a65ef3c6 /corp | |
parent | 2b308c64b94a14592f928a7d2511fc74c8846eb3 (diff) |
feat(corp/data-import): add import of OR 'translations' table r/5732
The original dataset contains translations into different languages, but only the English ones are imported here. Note that translations are for lemmata only.

Change-Id: Ifb9c32c25fda44c38ad899efca9d205c520c0fa3
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7895
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Diffstat (limited to 'corp')
-rw-r--r-- | corp/russian/data-import/src/db_setup.rs | 44 | ||||
-rw-r--r-- | corp/russian/data-import/src/main.rs | 9 | ||||
-rw-r--r-- | corp/russian/data-import/src/or_parser.rs | 17 |
3 files changed, 70 insertions, 0 deletions
diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs index 4644edf09424..21cfb2a4652e 100644 --- a/corp/russian/data-import/src/db_setup.rs +++ b/corp/russian/data-import/src/db_setup.rs @@ -191,6 +191,16 @@ CREATE TABLE or_words_forms ( form_bare TEXT, FOREIGN KEY(word_id) REFERENCES words(id) ) STRICT; + +CREATE TABLE or_translations ( + id INTEGER PRIMARY KEY, + word_id INTEGER NOT NULL, + translation TEXT, + example_ru TEXT, + example_tl TEXT, + info TEXT, + FOREIGN KEY(word_id) REFERENCES words(id) +) STRICT; "#, ) .ensure("setting up OpenRussian table schema failed"); @@ -252,3 +262,37 @@ VALUES (?1, ?2, ?3, ?4, ?5, ?6) info!("inserted {} OpenRussian word forms", count); } + +pub fn insert_or_translations<I: Iterator<Item = or_parser::Translation>>( + conn: &Connection, + translations: I, +) { + let mut stmt = conn + .prepare_cached( + "INSERT INTO or_translations (id, word_id, translation, example_ru, example_tl, info) + VALUES (?1, ?2, ?3, ?4, ?5, ?6)", + ) + .ensure("failed to prepare OR translation statement"); + + let mut count = 0; + + for tl in translations { + if tl.lang != "en" { + continue; + } + + stmt.execute(( + tl.id, + tl.word_id, + tl.tl, + tl.example_ru, + tl.example_tl, + tl.info, + )) + .ensure("failed to insert OR translation"); + + count += 1; + } + + info!("inserted {} OpenRussian translations", count); +} diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs index 18bc0238e7e0..95a38e6e93d1 100644 --- a/corp/russian/data-import/src/main.rs +++ b/corp/russian/data-import/src/main.rs @@ -228,6 +228,15 @@ fn open_russian(conn: &Connection, args: &Args) { tx.commit().ensure("OpenRussian word forms commit failed"); } + { + let tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + + db_setup::insert_or_translations(&tx, parser.translations()); + tx.commit().ensure("OpenRussian translations commit failed"); + } + info!("finished 
OpenRussian import"); } diff --git a/corp/russian/data-import/src/or_parser.rs b/corp/russian/data-import/src/or_parser.rs index 28e4f14d3169..eace850c24eb 100644 --- a/corp/russian/data-import/src/or_parser.rs +++ b/corp/russian/data-import/src/or_parser.rs @@ -44,6 +44,19 @@ pub struct WordForm { pub form_bare: String, } +/// A translation from the `translations` table. +#[derive(Debug, Deserialize)] +pub struct Translation { + pub id: usize, + pub lang: String, + pub word_id: usize, + pub position: String, + pub tl: String, // unknown + pub example_ru: String, + pub example_tl: String, + pub info: String, +} + pub struct OpenRussianParser { or_directory: PathBuf, } @@ -65,6 +78,10 @@ impl OpenRussianParser { self.parser_for("words_forms.csv") } + pub fn translations(&self) -> DynIter<Translation> { + self.parser_for("translations.csv") + } + fn parser_for<T: serde::de::DeserializeOwned + 'static>( &self, file_name: &str, |