about summary refs log tree commit diff
path: root/corp
diff options
context:
space:
mode:
authorVincent Ambo <mail@tazj.in>2023-01-21T18·17+0300
committertazjin <tazjin@tvl.su>2023-01-22T16·13+0000
commited8dd4acd71161893a06dd25567852b1855ac1ab (patch)
tree5dd998379e20c6601ef734b2fd15ef99a65ef3c6 /corp
parent2b308c64b94a14592f928a7d2511fc74c8846eb3 (diff)
feat(corp/data-import): add import of OR 'translations' table r/5732
The original dataset contains translations into different languages,
but only the English ones are imported here.

Note that translations are for lemmata only.

Change-Id: Ifb9c32c25fda44c38ad899efca9d205c520c0fa3
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7895
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Diffstat (limited to 'corp')
-rw-r--r--corp/russian/data-import/src/db_setup.rs44
-rw-r--r--corp/russian/data-import/src/main.rs9
-rw-r--r--corp/russian/data-import/src/or_parser.rs17
3 files changed, 70 insertions, 0 deletions
diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs
index 4644edf09424..21cfb2a4652e 100644
--- a/corp/russian/data-import/src/db_setup.rs
+++ b/corp/russian/data-import/src/db_setup.rs
@@ -191,6 +191,16 @@ CREATE TABLE or_words_forms (
     form_bare TEXT,
     FOREIGN KEY(word_id) REFERENCES words(id)
 ) STRICT;
+
+CREATE TABLE or_translations (
+    id INTEGER PRIMARY KEY,
+    word_id INTEGER NOT NULL,
+    translation TEXT,
+    example_ru TEXT,
+    example_tl TEXT,
+    info TEXT,
+    FOREIGN KEY(word_id) REFERENCES words(id)
+) STRICT;
 "#,
     )
     .ensure("setting up OpenRussian table schema failed");
@@ -252,3 +262,37 @@ VALUES (?1, ?2, ?3, ?4, ?5, ?6)
 
     info!("inserted {} OpenRussian word forms", count);
 }
+
+pub fn insert_or_translations<I: Iterator<Item = or_parser::Translation>>(
+    conn: &Connection,
+    translations: I,
+) { // writes the English rows of `translations` into the `or_translations` table
+    let mut stmt = conn
+        .prepare_cached( // prepared once here, executed once per row in the loop below
+            "INSERT INTO or_translations (id, word_id, translation, example_ru, example_tl, info)
+             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
+        )
+        .ensure("failed to prepare OR translation statement");
+
+    let mut count = 0; // rows actually inserted; skipped non-English rows are not counted
+
+    for tl in translations {
+        if tl.lang != "en" { // only English translations are imported
+            continue;
+        }
+
+        stmt.execute(( // tuple order must match the column list of the INSERT above
+            tl.id,
+            tl.word_id,
+            tl.tl, // ?3 -> `translation` column
+            tl.example_ru,
+            tl.example_tl,
+            tl.info,
+        ))
+        .ensure("failed to insert OR translation"); // NOTE(review): `ensure` presumably aborts on error - confirm
+
+        count += 1;
+    }
+
+    info!("inserted {} OpenRussian translations", count);
+}
diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs
index 18bc0238e7e0..95a38e6e93d1 100644
--- a/corp/russian/data-import/src/main.rs
+++ b/corp/russian/data-import/src/main.rs
@@ -228,6 +228,15 @@ fn open_russian(conn: &Connection, args: &Args) {
         tx.commit().ensure("OpenRussian word forms commit failed");
     }
 
+    {
+        let tx = conn
+            .unchecked_transaction()
+            .ensure("failed to start transaction");
+
+        db_setup::insert_or_translations(&tx, parser.translations());
+        tx.commit().ensure("OpenRussian translations commit failed");
+    }
+
     info!("finished OpenRussian import");
 }
 
diff --git a/corp/russian/data-import/src/or_parser.rs b/corp/russian/data-import/src/or_parser.rs
index 28e4f14d3169..eace850c24eb 100644
--- a/corp/russian/data-import/src/or_parser.rs
+++ b/corp/russian/data-import/src/or_parser.rs
@@ -44,6 +44,19 @@ pub struct WordForm {
     pub form_bare: String,
 }
 
+/// A row of the OpenRussian `translations` table, as read from `translations.csv`.
+#[derive(Debug, Deserialize)]
+pub struct Translation {
+    pub id: usize,
+    pub lang: String, // language tag; the importer keeps only rows where this is "en"
+    pub word_id: usize, // stored as `or_translations.word_id`, a foreign key to `words(id)`
+    pub position: String, // deserialized, but not written to the database by the importer
+    pub tl: String, // translation text; stored in the `translation` column of `or_translations`
+    pub example_ru: String,
+    pub example_tl: String,
+    pub info: String,
+}
+
 pub struct OpenRussianParser {
     or_directory: PathBuf,
 }
@@ -65,6 +78,10 @@ impl OpenRussianParser {
         self.parser_for("words_forms.csv")
     }
 
+    pub fn translations(&self) -> DynIter<Translation> { // iterator over `Translation` rows
+        self.parser_for("translations.csv") // NOTE(review): presumably resolved under `or_directory` - confirm in `parser_for`
+    }
+
     fn parser_for<T: serde::de::DeserializeOwned + 'static>(
         &self,
         file_name: &str,