From 6986aa5824ba6ae23b1363fede13c2df5ea0e770 Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Wed, 18 Jan 2023 14:52:53 +0300 Subject: feat(corp/data-import): insert OpenCorpora data into SQLite This is an initial and kind of dumb table structure, but there's some massaging that needs to be done before this makes more sense. Change-Id: I441288b684ef86be507099bcc4ebf984598789c8 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7861 Reviewed-by: tazjin Tested-by: BuildkiteCI --- corp/russian/data-import/src/db_setup.rs | 128 +++++++++++++++++++++++++++++++ corp/russian/data-import/src/main.rs | 36 ++++++--- 2 files changed, 155 insertions(+), 9 deletions(-) create mode 100644 corp/russian/data-import/src/db_setup.rs (limited to 'corp/russian') diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs new file mode 100644 index 0000000000..1a9e2dd879 --- /dev/null +++ b/corp/russian/data-import/src/db_setup.rs @@ -0,0 +1,128 @@ +//! This module prepares the database layout. +//! +//! The XML import may be in an arbitrary order, so importing data is +//! a multi-step process where we first set up schemas matching the +//! data layout, import the data, and then modify the schema to +//! introduce things like foreign key constraints between tables that +//! represent relations. + +use super::{bail, Ensure}; +use crate::oc_parser::*; +use log::{debug, info}; +use rusqlite::Connection; + +/// Sets up an initial schema which matches the OpenCorpora data. +pub fn initial_schema(conn: &Connection) { + conn.execute_batch( + r#" +-- table for plain import of grammemes from XML +CREATE TABLE grammemes ( + name TEXT PRIMARY KEY, + parent TEXT, + alias TEXT, + description TEXT +) STRICT; + +-- table for plain import of lemmas (*not* their variations!) +CREATE TABLE lemmas ( + id INTEGER PRIMARY KEY, + lemma TEXT NOT NULL +) STRICT; + +-- table for relationship between grammemes and lemmas +CREATE TABLE lemma_grammemes ( + lemma INTEGER, + grammeme TEXT NOT NULL, + FOREIGN KEY(lemma) REFERENCES lemmas(id) +) STRICT; + +-- table for all words, i.e. including variations of lemmata +CREATE TABLE words ( + lemma INTEGER NOT NULL, + word TEXT NOT NULL, + FOREIGN KEY(lemma) REFERENCES lemmas(id) +) STRICT; + +-- table for relationship between words and grammemes +CREATE TABLE word_grammemes ( + word INTEGER NOT NULL, + grammeme TEXT NOT NULL, + FOREIGN KEY(word) REFERENCES words(ROWID) +) STRICT; + +"#, + ) + .ensure("setting up initial table schema failed"); + + info!("set up initial table schema for OpenCorpora import"); +} + +/// Inserts a single OpenCorpora element into the initial table structure. +pub fn insert_oc_element(conn: &Connection, elem: OcElement) { + match elem { + OcElement::Grammeme(grammeme) => { + conn.execute( + "INSERT INTO grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)", + ( + &grammeme.name, + &grammeme.parent, + &grammeme.alias, + &grammeme.description, + ), + ) + .ensure("failed to insert grammeme"); + + debug!("inserted grammeme {}", grammeme.name); + } + + OcElement::Lemma(lemma) => insert_lemma(conn, lemma), + } +} + +/// Insert a single lemma into the initial structure. This is somewhat +/// involved because it also establishes a bunch of relations. +fn insert_lemma(conn: &Connection, lemma: Lemma) { + // insert the lemma itself + let mut stmt = conn + .prepare_cached("INSERT INTO lemmas (id, lemma) VALUES (?1, ?2)") + .ensure("failed to prepare statement"); + + stmt.execute((&lemma.id, &lemma.lemma.word)) + .ensure("failed to insert grammeme"); + + // followed by its relations to the grammemes set + let mut stmt = conn + .prepare_cached("INSERT INTO lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)") + .ensure("failed to prepare statement"); + + for grammeme in lemma.grammemes { + stmt.execute((&lemma.id, grammeme)) + .ensure("failed to insert grammeme<>lemma relationship"); + } + + // followed by all of its variations ... + let mut word_insert = conn + .prepare_cached("INSERT INTO words (lemma, word) VALUES (?1, ?2)") + .unwrap(); + + let mut word_grammeme = conn + .prepare_cached("INSERT INTO word_grammemes (word, grammeme) VALUES (?1, ?2)") + .unwrap(); + + for variation in lemma.variations { + // insert the word itself and get its rowid + word_insert + .execute((&lemma.id, &variation.word)) + .ensure("failed to insert word"); + let row_id = conn.last_insert_rowid(); + + // then insert its grammeme links + for grammeme in variation.grammemes { + word_grammeme + .execute((row_id, grammeme)) + .ensure("failed to insert word<>grammeme link"); + } + } + + debug!("inserted lemma {}", lemma.id); +} diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs index 9f2f5089a6..502351cf9d 100644 --- a/corp/russian/data-import/src/main.rs +++ b/corp/russian/data-import/src/main.rs @@ -55,11 +55,13 @@ //! between the lemmas "два" and "второй". use log::{error, info}; +use rusqlite::{Connection, Result}; use std::env; use std::fmt::Display; use std::fs::File; -use std::io::{BufReader, BufWriter, Write}; +use std::io::BufReader; +mod db_setup; mod oc_parser; fn main() { @@ -77,18 +79,34 @@ fn main() { let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file)); - let mut out = BufWriter::new(std::io::stdout().lock()); + let conn = Connection::open("out.db").ensure("failed to open DB connection"); + + db_setup::initial_schema(&conn); + + // afterwards: + // add actual IDs to grammemes + // properly reference keys internally + // add foreign key constraint on lemma_grammemes.grammeme + + let mut tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + let mut count = 0; while let Some(elem) = parser.next_element() { - if let oc_parser::OcElement::Lemma(lemma) = elem { - if lemma.lemma.word == "тяжёлый" { - writeln!(out, "{:?}", lemma).ensure("writing output failed"); - break; - } + // commit every 1000 things + if count % 1000 == 0 { + tx.commit().ensure("transaction failed"); + tx = conn + .unchecked_transaction() + .ensure("failed to start new transaction"); + info!("transaction committed at watermark {}", count); } - } - out.flush().ensure("flushing the out buffer failed"); + db_setup::insert_oc_element(&tx, elem); + + count += 1; + } } /// It's like `expect`, but through `log::error`. -- cgit 1.4.1