about summary refs log tree commit diff
path: root/corp
diff options
context:
space:
mode:
author Vincent Ambo <mail@tazj.in> 2023-01-21T15·00+0300
committer tazjin <tazjin@tvl.su> 2023-01-21T17·49+0000
commit 8eeb5d3bccf831681b2cad5c3b322e6a08f596df (patch)
tree 4e02105c8ab39a4466b29e98c16aa9e2c69ac9b4 /corp
parent 429c0d00c4cd07ea90c85bf1ec2f2c742d970420 (diff)
feat(corp/data-import): add import of OR 'words_forms' table r/5730
This is the full morphological set table for all the words from the
lemmata table (although they don't call it that).

Change-Id: I6f5be673c5f59f11e36bd8c8c935844a7d4fd170
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7894
Tested-by: BuildkiteCI
Reviewed-by: tazjin <tazjin@tvl.su>
Diffstat (limited to 'corp')
-rw-r--r-- corp/russian/data-import/src/db_setup.rs 39
-rw-r--r-- corp/russian/data-import/src/main.rs 21
-rw-r--r-- corp/russian/data-import/src/or_parser.rs 15
3 files changed, 69 insertions, 6 deletions
diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs
index 5fe64717ad..4644edf094 100644
--- a/corp/russian/data-import/src/db_setup.rs
+++ b/corp/russian/data-import/src/db_setup.rs
@@ -6,7 +6,7 @@
 //! introduce things like foreign key constraints between tables that
 //! represent relations.
 
-use super::{bail, Ensure};
+use super::Ensure;
 use crate::oc_parser::*;
 use crate::or_parser;
 use log::{debug, info};
@@ -181,6 +181,16 @@ CREATE TABLE or_words (
     word_type TEXT,
     level TEXT
 ) STRICT;
+
+CREATE TABLE or_words_forms (
+    id INTEGER PRIMARY KEY,
+    word_id INTEGER NOT NULL,
+    form_type TEXT,
+    position TEXT,
+    form TEXT,
+    form_bare TEXT,
+    FOREIGN KEY(word_id) REFERENCES words(id)
+) STRICT;
 "#,
     )
     .ensure("setting up OpenRussian table schema failed");
@@ -215,3 +225,30 @@ VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)
 
     info!("inserted {} OpenRussian words", count);
 }
+
+pub fn insert_or_word_forms<I: Iterator<Item = or_parser::WordForm>>(conn: &Connection, forms: I) {
+    let mut stmt = conn
+        .prepare_cached(
+            "
+INSERT INTO or_words_forms (id, word_id, form_type, position, form, form_bare)
+VALUES (?1, ?2, ?3, ?4, ?5, ?6)
+",
+        )
+        .ensure("failed to prepare OR word forms statement");
+    let mut count = 0;
+
+    for form in forms {
+        stmt.execute((
+            form.id,
+            form.word_id,
+            form.form_type,
+            form.position,
+            form.form,
+            form.form_bare,
+        ))
+        .ensure("failed to insert OR word form");
+        count += 1;
+    }
+
+    info!("inserted {} OpenRussian word forms", count);
+}
diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs
index 11387539ab..18bc0238e7 100644
--- a/corp/russian/data-import/src/main.rs
+++ b/corp/russian/data-import/src/main.rs
@@ -210,12 +210,23 @@ fn open_russian(conn: &Connection, args: &Args) {
 
     db_setup::initial_or_schema(conn);
 
-    let tx = conn
-        .unchecked_transaction()
-        .ensure("failed to start transaction");
+    {
+        let tx = conn
+            .unchecked_transaction()
+            .ensure("failed to start transaction");
+
+        db_setup::insert_or_words(&tx, parser.words());
+        tx.commit().ensure("OpenRussian words commit failed");
+    }
+
+    {
+        let tx = conn
+            .unchecked_transaction()
+            .ensure("failed to start transaction");
 
-    db_setup::insert_or_words(&tx, parser.words());
-    tx.commit().ensure("OpenRussian words commit failed");
+        db_setup::insert_or_word_forms(&tx, parser.words_forms());
+        tx.commit().ensure("OpenRussian word forms commit failed");
+    }
 
     info!("finished OpenRussian import");
 }
diff --git a/corp/russian/data-import/src/or_parser.rs b/corp/russian/data-import/src/or_parser.rs
index c11896f6ba..28e4f14d31 100644
--- a/corp/russian/data-import/src/or_parser.rs
+++ b/corp/russian/data-import/src/or_parser.rs
@@ -33,6 +33,17 @@ pub struct Word {
     pub created_at: String, // TODO: unknown
 }
 
+/// A word form from the `words_forms` table.
+#[derive(Debug, Deserialize)]
+pub struct WordForm {
+    pub id: usize,
+    pub word_id: usize,
+    pub form_type: String,
+    pub position: String,
+    pub form: String,
+    pub form_bare: String,
+}
+
 pub struct OpenRussianParser {
     or_directory: PathBuf,
 }
@@ -50,6 +61,10 @@ impl OpenRussianParser {
         self.parser_for("words.csv")
     }
 
+    pub fn words_forms(&self) -> DynIter<WordForm> {
+        self.parser_for("words_forms.csv")
+    }
+
     fn parser_for<T: serde::de::DeserializeOwned + 'static>(
         &self,
         file_name: &str,