diff options
author | Vincent Ambo <mail@tazj.in> | 2023-01-20T10·31+0300 |
---|---|---|
committer | tazjin <tazjin@tvl.su> | 2023-01-21T17·49+0000 |
commit | 429c0d00c4cd07ea90c85bf1ec2f2c742d970420 (patch) | |
tree | 85103dcdf8b7c9d30552dfc97321ad99d77ff2e3 /corp/russian/data-import/src/db_setup.rs | |
parent | ee0c0ee95103fa10e227a1976149d20e6944001c (diff) |
feat(corp/data-import): add import of OpenRussian 'words' table r/5729
This is actually the lemmata table of this corpus, not the forms of all words (they're in a separate table).

Change-Id: I89a2c2817ccce840f47406fa2a636f4ed3f49154
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7893
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Diffstat (limited to 'corp/russian/data-import/src/db_setup.rs')
-rw-r--r-- | corp/russian/data-import/src/db_setup.rs | 51 |
1 files changed, 50 insertions, 1 deletions
diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs
index 3f0fa0ff63..5fe64717ad 100644
--- a/corp/russian/data-import/src/db_setup.rs
+++ b/corp/russian/data-import/src/db_setup.rs
@@ -8,6 +8,7 @@
 
 use super::{bail, Ensure};
 use crate::oc_parser::*;
+use crate::or_parser;
 use log::{debug, info};
 use rusqlite::Connection;
 
@@ -69,7 +70,7 @@ CREATE TABLE oc_links (
 
 "#,
     )
-    .ensure("setting up initial table schema failed");
+    .ensure("setting up OpenCorpora table schema failed");
 
     info!("set up initial table schema for OpenCorpora import");
 }
@@ -166,3 +167,51 @@ fn insert_lemma(conn: &Connection, lemma: Lemma) {
 
     debug!("inserted lemma {}", lemma.id);
 }
+
+/// Sets up an initial schema for the OpenRussian data.
+pub fn initial_or_schema(conn: &Connection) {
+    conn.execute_batch(
+        r#"
+CREATE TABLE or_words (
+    id INTEGER PRIMARY KEY,
+    bare TEXT NOT NULL,
+    accented TEXT,
+    derived_from_word_id INTEGER,
+    rank TEXT,
+    word_type TEXT,
+    level TEXT
+) STRICT;
+"#,
+    )
+    .ensure("setting up OpenRussian table schema failed");
+
+    info!("set up initial table schema for OpenRussian import");
+}
+
+pub fn insert_or_words<I: Iterator<Item = or_parser::Word>>(conn: &Connection, words: I) {
+    let mut stmt = conn
+        .prepare_cached(
+            "
+INSERT INTO or_words (id, bare, accented, derived_from_word_id, rank, word_type, level)
+VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)
+",
+        )
+        .ensure("failed to prepare OR words statement");
+    let mut count = 0;
+
+    for word in words {
+        stmt.execute((
+            word.id,
+            word.bare,
+            word.accented,
+            word.derived_from_word_id,
+            word.rank,
+            word.word_type,
+            word.level,
+        ))
+        .ensure("failed to insert OR word");
+        count += 1;
+    }
+
+    info!("inserted {} OpenRussian words", count);
+}