about summary refs log tree commit diff
path: root/corp/russian/data-import/src
diff options
context:
space:
mode:
authorVincent Ambo <mail@tazj.in>2023-01-18T11·52+0300
committertazjin <tazjin@tvl.su>2023-01-18T15·44+0000
commit6986aa5824ba6ae23b1363fede13c2df5ea0e770 (patch)
treeb0582427803ac6bb9c445fb947bd5092586ab7a3 /corp/russian/data-import/src
parent0196555f07d7295a40aefd5aec266f3932efbb2b (diff)
feat(corp/data-import): insert OpenCorpora data into SQLite r/5688
This is an initial and kind of dumb table structure, but there's some
massaging that needs to be done before this makes more sense.

Change-Id: I441288b684ef86be507099bcc4ebf984598789c8
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7861
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Diffstat (limited to 'corp/russian/data-import/src')
-rw-r--r--corp/russian/data-import/src/db_setup.rs128
-rw-r--r--corp/russian/data-import/src/main.rs36
2 files changed, 155 insertions, 9 deletions
diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs
new file mode 100644
index 000000000000..1a9e2dd87913
--- /dev/null
+++ b/corp/russian/data-import/src/db_setup.rs
@@ -0,0 +1,128 @@
+//! This module prepares the database layout.
+//!
+//! The XML import may be in an arbitrary order, so importing data is
+//! a multi-step process where we first set up schemas matching the
+//! data layout, import the data, and then modify the schema to
+//! introduce things like foreign key constraints between tables that
+//! represent relations.
+
+use super::{bail, Ensure};
+use crate::oc_parser::*;
+use log::{debug, info};
+use rusqlite::Connection;
+
+/// Sets up an initial schema which matches the OpenCorpora data.
+pub fn initial_schema(conn: &Connection) {
+    conn.execute_batch(
+        r#"
+-- table for plain import of grammemes from XML
+CREATE TABLE grammemes (
+    name TEXT PRIMARY KEY,
+    parent TEXT,
+    alias TEXT,
+    description TEXT
+) STRICT;
+
+-- table for plain import of lemmas (*not* their variations!)
+CREATE TABLE lemmas (
+    id INTEGER PRIMARY KEY,
+    lemma TEXT NOT NULL
+) STRICT;
+
+-- table for relationship between grammemes and lemmas
+CREATE TABLE lemma_grammemes (
+    lemma INTEGER,
+    grammeme TEXT NOT NULL,
+    FOREIGN KEY(lemma) REFERENCES lemmas(id)
+) STRICT;
+
+-- table for all words, i.e. including variations of lemmata
+CREATE TABLE words (
+    lemma INTEGER NOT NULL,
+    word TEXT NOT NULL,
+    FOREIGN KEY(lemma) REFERENCES lemmas(id)
+) STRICT;
+
+-- table for relationship between words and grammemes
+CREATE TABLE word_grammemes (
+    word INTEGER NOT NULL,
+    grammeme TEXT NOT NULL,
+    FOREIGN KEY(word) REFERENCES words(ROWID)
+) STRICT;
+
+"#,
+    )
+    .ensure("setting up initial table schema failed");
+
+    info!("set up initial table schema for OpenCorpora import");
+}
+
+/// Inserts a single OpenCorpora element into the initial table structure.
+pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
+    match elem {
+        OcElement::Grammeme(grammeme) => {
+            conn.execute(
+                "INSERT INTO grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)",
+                (
+                    &grammeme.name,
+                    &grammeme.parent,
+                    &grammeme.alias,
+                    &grammeme.description,
+                ),
+            )
+            .ensure("failed to insert grammeme");
+
+            debug!("inserted grammeme {}", grammeme.name);
+        }
+
+        OcElement::Lemma(lemma) => insert_lemma(conn, lemma),
+    }
+}
+
+/// Insert a single lemma into the initial structure. This is somewhat
+/// involved because it also establishes a bunch of relations.
+fn insert_lemma(conn: &Connection, lemma: Lemma) {
+    // insert the lemma itself
+    let mut stmt = conn
+        .prepare_cached("INSERT INTO lemmas (id, lemma) VALUES (?1, ?2)")
+        .ensure("failed to prepare statement");
+
+    stmt.execute((&lemma.id, &lemma.lemma.word))
+        .ensure("failed to insert grammeme");
+
+    // followed by its relations to the grammemes set
+    let mut stmt = conn
+        .prepare_cached("INSERT INTO lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)")
+        .ensure("failed to prepare statement");
+
+    for grammeme in lemma.grammemes {
+        stmt.execute((&lemma.id, grammeme))
+            .ensure("failed to insert grammeme<>lemma relationship");
+    }
+
+    // followed by all of its variations ...
+    let mut word_insert = conn
+        .prepare_cached("INSERT INTO words (lemma, word) VALUES (?1, ?2)")
+        .unwrap();
+
+    let mut word_grammeme = conn
+        .prepare_cached("INSERT INTO word_grammemes (word, grammeme) VALUES (?1, ?2)")
+        .unwrap();
+
+    for variation in lemma.variations {
+        // insert the word itself and get its rowid
+        word_insert
+            .execute((&lemma.id, &variation.word))
+            .ensure("failed to insert word");
+        let row_id = conn.last_insert_rowid();
+
+        // then insert its grammeme links
+        for grammeme in variation.grammemes {
+            word_grammeme
+                .execute((row_id, grammeme))
+                .ensure("failed to insert word<>grammeme link");
+        }
+    }
+
+    debug!("inserted lemma {}", lemma.id);
+}
diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs
index 9f2f5089a603..502351cf9de8 100644
--- a/corp/russian/data-import/src/main.rs
+++ b/corp/russian/data-import/src/main.rs
@@ -55,11 +55,13 @@
 //!   between the lemmas "два" and "второй".
 
 use log::{error, info};
+use rusqlite::{Connection, Result};
 use std::env;
 use std::fmt::Display;
 use std::fs::File;
-use std::io::{BufReader, BufWriter, Write};
+use std::io::BufReader;
 
+mod db_setup;
 mod oc_parser;
 
 fn main() {
@@ -77,18 +79,34 @@ fn main() {
 
     let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
 
-    let mut out = BufWriter::new(std::io::stdout().lock());
+    let conn = Connection::open("out.db").ensure("failed to open DB connection");
+
+    db_setup::initial_schema(&conn);
+
+    // afterwards:
+    // add actual IDs to grammemes
+    // properly reference keys internally
+    // add foreign key constraint on lemma_grammemes.grammeme
+
+    let mut tx = conn
+        .unchecked_transaction()
+        .ensure("failed to start transaction");
+    let mut count = 0;
 
     while let Some(elem) = parser.next_element() {
-        if let oc_parser::OcElement::Lemma(lemma) = elem {
-            if lemma.lemma.word == "тяжёлый" {
-                writeln!(out, "{:?}", lemma).ensure("writing output failed");
-                break;
-            }
+        // commit every 1000 things
+        if count % 1000 == 0 {
+            tx.commit().ensure("transaction failed");
+            tx = conn
+                .unchecked_transaction()
+                .ensure("failed to start new transaction");
+            info!("transaction committed at watermark {}", count);
         }
-    }
 
-    out.flush().ensure("flushing the out buffer failed");
+        db_setup::insert_oc_element(&tx, elem);
+
+        count += 1;
+    }
 }
 
 /// It's like `expect`, but through `log::error`.