about summary refs log tree commit diff
path: root/corp/russian/data-import/src/db_setup.rs
diff options
context:
space:
mode:
authorVincent Ambo <mail@tazj.in>2023-01-18T11·52+0300
committertazjin <tazjin@tvl.su>2023-01-18T15·44+0000
commit6986aa5824ba6ae23b1363fede13c2df5ea0e770 (patch)
treeb0582427803ac6bb9c445fb947bd5092586ab7a3 /corp/russian/data-import/src/db_setup.rs
parent0196555f07d7295a40aefd5aec266f3932efbb2b (diff)
feat(corp/data-import): insert OpenCorpora data into SQLite r/5688
This is an initial and kind of dumb table structure, but there's some
massaging that needs to be done before this makes more sense.

Change-Id: I441288b684ef86be507099bcc4ebf984598789c8
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7861
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Diffstat (limited to 'corp/russian/data-import/src/db_setup.rs')
-rw-r--r--corp/russian/data-import/src/db_setup.rs128
1 files changed, 128 insertions, 0 deletions
diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs
new file mode 100644
index 000000000000..1a9e2dd87913
--- /dev/null
+++ b/corp/russian/data-import/src/db_setup.rs
@@ -0,0 +1,128 @@
+//! This module prepares the database layout.
+//!
+//! The XML import may be in an arbitrary order, so importing data is
+//! a multi-step process where we first set up schemas matching the
+//! data layout, import the data, and then modify the schema to
+//! introduce things like foreign key constraints between tables that
+//! represent relations.
+
+use super::{bail, Ensure};
+use crate::oc_parser::*;
+use log::{debug, info};
+use rusqlite::Connection;
+
+/// Sets up an initial schema which matches the OpenCorpora data.
+pub fn initial_schema(conn: &Connection) {
+    conn.execute_batch(
+        r#"
+-- table for plain import of grammemes from XML
+CREATE TABLE grammemes (
+    name TEXT PRIMARY KEY,
+    parent TEXT,
+    alias TEXT,
+    description TEXT
+) STRICT;
+
+-- table for plain import of lemmas (*not* their variations!)
+CREATE TABLE lemmas (
+    id INTEGER PRIMARY KEY,
+    lemma TEXT NOT NULL
+) STRICT;
+
+-- table for relationship between grammemes and lemmas
+CREATE TABLE lemma_grammemes (
+    lemma INTEGER,
+    grammeme TEXT NOT NULL,
+    FOREIGN KEY(lemma) REFERENCES lemmas(id)
+) STRICT;
+
+-- table for all words, i.e. including variations of lemmata
+CREATE TABLE words (
+    lemma INTEGER NOT NULL,
+    word TEXT NOT NULL,
+    FOREIGN KEY(lemma) REFERENCES lemmas(id)
+) STRICT;
+
+-- table for relationship between words and grammemes
+CREATE TABLE word_grammemes (
+    word INTEGER NOT NULL,
+    grammeme TEXT NOT NULL,
+    FOREIGN KEY(word) REFERENCES words(ROWID)
+) STRICT;
+
+"#,
+    )
+    .ensure("setting up initial table schema failed");
+
+    info!("set up initial table schema for OpenCorpora import");
+}
+
+/// Inserts a single OpenCorpora element into the initial table structure.
+pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
+    match elem {
+        OcElement::Grammeme(grammeme) => {
+            conn.execute(
+                "INSERT INTO grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)",
+                (
+                    &grammeme.name,
+                    &grammeme.parent,
+                    &grammeme.alias,
+                    &grammeme.description,
+                ),
+            )
+            .ensure("failed to insert grammeme");
+
+            debug!("inserted grammeme {}", grammeme.name);
+        }
+
+        OcElement::Lemma(lemma) => insert_lemma(conn, lemma),
+    }
+}
+
+/// Insert a single lemma into the initial structure. This is somewhat
+/// involved because it also establishes a bunch of relations.
+fn insert_lemma(conn: &Connection, lemma: Lemma) {
+    // insert the lemma itself
+    let mut stmt = conn
+        .prepare_cached("INSERT INTO lemmas (id, lemma) VALUES (?1, ?2)")
+        .ensure("failed to prepare statement");
+
+    stmt.execute((&lemma.id, &lemma.lemma.word))
+        .ensure("failed to insert grammeme");
+
+    // followed by its relations to the grammemes set
+    let mut stmt = conn
+        .prepare_cached("INSERT INTO lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)")
+        .ensure("failed to prepare statement");
+
+    for grammeme in lemma.grammemes {
+        stmt.execute((&lemma.id, grammeme))
+            .ensure("failed to insert grammeme<>lemma relationship");
+    }
+
+    // followed by all of its variations ...
+    let mut word_insert = conn
+        .prepare_cached("INSERT INTO words (lemma, word) VALUES (?1, ?2)")
+        .unwrap();
+
+    let mut word_grammeme = conn
+        .prepare_cached("INSERT INTO word_grammemes (word, grammeme) VALUES (?1, ?2)")
+        .unwrap();
+
+    for variation in lemma.variations {
+        // insert the word itself and get its rowid
+        word_insert
+            .execute((&lemma.id, &variation.word))
+            .ensure("failed to insert word");
+        let row_id = conn.last_insert_rowid();
+
+        // then insert its grammeme links
+        for grammeme in variation.grammemes {
+            word_grammeme
+                .execute((row_id, grammeme))
+                .ensure("failed to insert word<>grammeme link");
+        }
+    }
+
+    debug!("inserted lemma {}", lemma.id);
+}