From 8eeb5d3bccf831681b2cad5c3b322e6a08f596df Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Sat, 21 Jan 2023 18:00:16 +0300 Subject: feat(corp/data-import): add import of OR 'words_forms' table This is the full morphological set table for all the words from the lemmata table, although they don't call it that. Change-Id: I6f5be673c5f59f11e36bd8c8c935844a7d4fd170 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7894 Tested-by: BuildkiteCI Reviewed-by: tazjin --- corp/russian/data-import/src/db_setup.rs | 39 ++++++++++++++++++++++++++++++- corp/russian/data-import/src/main.rs | 21 +++++++++++++---- corp/russian/data-import/src/or_parser.rs | 15 ++++++++++++ 3 files changed, 69 insertions(+), 6 deletions(-) (limited to 'corp/russian/data-import') diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs index 5fe64717ad9b..4644edf09424 100644 --- a/corp/russian/data-import/src/db_setup.rs +++ b/corp/russian/data-import/src/db_setup.rs @@ -6,7 +6,7 @@ //! introduce things like foreign key constraints between tables that //! represent relations. 
-use super::{bail, Ensure}; +use super::Ensure; use crate::oc_parser::*; use crate::or_parser; use log::{debug, info}; @@ -181,6 +181,16 @@ CREATE TABLE or_words ( word_type TEXT, level TEXT ) STRICT; + +CREATE TABLE or_words_forms ( + id INTEGER PRIMARY KEY, + word_id INTEGER NOT NULL, + form_type TEXT, + position TEXT, + form TEXT, + form_bare TEXT, + FOREIGN KEY(word_id) REFERENCES words(id) +) STRICT; "#, ) .ensure("setting up OpenRussian table schema failed"); @@ -215,3 +225,30 @@ VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7) info!("inserted {} OpenRussian words", count); } + +pub fn insert_or_word_forms>(conn: &Connection, forms: I) { + let mut stmt = conn + .prepare_cached( + " +INSERT INTO or_words_forms (id, word_id, form_type, position, form, form_bare) +VALUES (?1, ?2, ?3, ?4, ?5, ?6) +", + ) + .ensure("failed to prepare OR word forms statement"); + let mut count = 0; + + for form in forms { + stmt.execute(( + form.id, + form.word_id, + form.form_type, + form.position, + form.form, + form.form_bare, + )) + .ensure("failed to insert OR word form"); + count += 1; + } + + info!("inserted {} OpenRussian word forms", count); +} diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs index 11387539ab84..18bc0238e7e0 100644 --- a/corp/russian/data-import/src/main.rs +++ b/corp/russian/data-import/src/main.rs @@ -210,12 +210,23 @@ fn open_russian(conn: &Connection, args: &Args) { db_setup::initial_or_schema(conn); - let tx = conn - .unchecked_transaction() - .ensure("failed to start transaction"); + { + let tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + + db_setup::insert_or_words(&tx, parser.words()); + tx.commit().ensure("OpenRussian words commit failed"); + } + + { + let tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); - db_setup::insert_or_words(&tx, parser.words()); - tx.commit().ensure("OpenRussian words commit failed"); + db_setup::insert_or_word_forms(&tx, 
parser.words_forms()); + tx.commit().ensure("OpenRussian word forms commit failed"); + } info!("finished OpenRussian import"); } diff --git a/corp/russian/data-import/src/or_parser.rs b/corp/russian/data-import/src/or_parser.rs index c11896f6bac0..28e4f14d3169 100644 --- a/corp/russian/data-import/src/or_parser.rs +++ b/corp/russian/data-import/src/or_parser.rs @@ -33,6 +33,17 @@ pub struct Word { pub created_at: String, // TODO: unknown } +/// A word form from the `words_forms` table. +#[derive(Debug, Deserialize)] +pub struct WordForm { + pub id: usize, + pub word_id: usize, + pub form_type: String, + pub position: String, + pub form: String, + pub form_bare: String, +} + pub struct OpenRussianParser { or_directory: PathBuf, } @@ -50,6 +61,10 @@ impl OpenRussianParser { self.parser_for("words.csv") } + pub fn words_forms(&self) -> DynIter { + self.parser_for("words_forms.csv") + } + fn parser_for( &self, file_name: &str, -- cgit 1.4.1