//! This module prepares the database layout. //! //! The XML import may be in an arbitrary order, so importing data is //! a multi-step process where we first set up schemas matching the //! data layout, import the data, and then modify the schema to //! introduce things like foreign key constraints between tables that //! represent relations. use super::{bail, Ensure}; use crate::oc_parser::*; use log::{debug, info}; use rusqlite::Connection; /// Sets up an initial schema which matches the OpenCorpora data. pub fn initial_oc_schema(conn: &Connection) { conn.execute_batch( r#" -- table for plain import of grammemes from XML CREATE TABLE oc_grammemes ( name TEXT PRIMARY KEY, parent TEXT, alias TEXT, description TEXT ) STRICT; -- table for plain import of lemmas (*not* their variations!) CREATE TABLE oc_lemmas ( id INTEGER PRIMARY KEY, lemma TEXT NOT NULL ) STRICT; -- table for relationship between grammemes and lemmas CREATE TABLE oc_lemma_grammemes ( lemma INTEGER, grammeme TEXT NOT NULL, FOREIGN KEY(lemma) REFERENCES oc_lemmas(id) ) STRICT; -- table for all words, i.e. including variations of lemmata CREATE TABLE oc_words ( lemma INTEGER NOT NULL, word TEXT NOT NULL, FOREIGN KEY(lemma) REFERENCES oc_lemmas(id) ) STRICT; -- table for relationship between words and grammemes CREATE TABLE oc_word_grammemes ( word INTEGER NOT NULL, grammeme TEXT NOT NULL, FOREIGN KEY(word) REFERENCES oc_words(ROWID) ) STRICT; -- table for link types CREATE TABLE oc_link_types ( id INTEGER PRIMARY KEY, name TEXT ) STRICT; -- table for links between lemmata CREATE TABLE oc_links ( id INTEGER PRIMARY KEY, link_type INTEGER NOT NULL, from_lemma INTEGER NOT NULL, to_lemma INTEGER NOT NULL, FOREIGN KEY(link_type) REFERENCES oc_link_types(id), FOREIGN KEY(from_lemma) REFERENCES oc_lemmas(id), FOREIGN KEY(to_lemma) REFERENCES oc_lemmas(id) ) STRICT; "#, ) .ensure("setting up initial table schema failed"); info!("set up initial table schema for OpenCorpora import"); } /// Inserts a single OpenCorpora element into the initial table structure. pub fn insert_oc_element(conn: &Connection, elem: OcElement) { match elem { OcElement::Grammeme(grammeme) => { conn.execute( "INSERT INTO oc_grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)", ( &grammeme.name, &grammeme.parent, &grammeme.alias, &grammeme.description, ), ) .ensure("failed to insert grammeme"); debug!("inserted grammeme {}", grammeme.name); } OcElement::Lemma(lemma) => insert_lemma(conn, lemma), OcElement::LinkType(lt) => { conn.execute( "INSERT INTO oc_link_types (id, name) VALUES (?1, ?2)", (<.id, <.name), ) .ensure("failed to insert link type"); info!("inserted link type {}", lt.name); } OcElement::Link(link) => { let mut stmt = conn .prepare_cached( "INSERT INTO oc_links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, ?4)", ) .ensure("failed to prepare link statement"); stmt.execute((&link.id, &link.link_type, &link.from, &link.to)) .ensure("failed to insert link"); debug!("inserted link {}", link.id); } } } /// Insert a single lemma into the initial structure. This is somewhat /// involved because it also establishes a bunch of relations. fn insert_lemma(conn: &Connection, lemma: Lemma) { // insert the lemma itself let mut stmt = conn .prepare_cached("INSERT INTO oc_lemmas (id, lemma) VALUES (?1, ?2)") .ensure("failed to prepare statement"); stmt.execute((&lemma.id, &lemma.lemma.word)) .ensure("failed to insert grammeme"); // followed by its relations to the grammemes set let mut stmt = conn .prepare_cached("INSERT INTO oc_lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)") .ensure("failed to prepare statement"); for grammeme in lemma.grammemes { stmt.execute((&lemma.id, grammeme)) .ensure("failed to insert grammeme<>lemma relationship"); } // followed by all of its variations ... let mut word_insert = conn .prepare_cached("INSERT INTO oc_words (lemma, word) VALUES (?1, ?2)") .unwrap(); let mut word_grammeme = conn .prepare_cached("INSERT INTO oc_word_grammemes (word, grammeme) VALUES (?1, ?2)") .unwrap(); for variation in lemma.variations { // insert the word itself and get its rowid word_insert .execute((&lemma.id, &variation.word)) .ensure("failed to insert word"); let row_id = conn.last_insert_rowid(); // then insert its grammeme links for grammeme in variation.grammemes { word_grammeme .execute((row_id, grammeme)) .ensure("failed to insert word<>grammeme link"); } } debug!("inserted lemma {}", lemma.id); }