From db26825eecacb22b60abebf2879bf1420493b8c5 Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Thu, 19 Jan 2023 00:20:21 +0300 Subject: chore(corp/data-import): namespace tables for OpenCorpora data I'm changing strategies to importing both OC and another dataset before continuing to normalise the data, as it might be easier to do in a set of table-constructing queries inside of SQLite with all raw data in place. Change-Id: I26b41af80586fc1bfd8e26a6be20579068a82507 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7872 Autosubmit: tazjin Reviewed-by: tazjin Tested-by: BuildkiteCI --- corp/russian/data-import/src/db_setup.rs | 42 ++++++++++++++++---------------- corp/russian/data-import/src/main.rs | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) (limited to 'corp/russian') diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs index 5959ad09e5..3f0fa0ff63 100644 --- a/corp/russian/data-import/src/db_setup.rs +++ b/corp/russian/data-import/src/db_setup.rs @@ -12,11 +12,11 @@ use log::{debug, info}; use rusqlite::Connection; /// Sets up an initial schema which matches the OpenCorpora data. -pub fn initial_schema(conn: &Connection) { +pub fn initial_oc_schema(conn: &Connection) { conn.execute_batch( r#" -- table for plain import of grammemes from XML -CREATE TABLE grammemes ( +CREATE TABLE oc_grammemes ( name TEXT PRIMARY KEY, parent TEXT, alias TEXT, @@ -24,47 +24,47 @@ CREATE TABLE grammemes ( ) STRICT; -- table for plain import of lemmas (*not* their variations!) -CREATE TABLE lemmas ( +CREATE TABLE oc_lemmas ( id INTEGER PRIMARY KEY, lemma TEXT NOT NULL ) STRICT; -- table for relationship between grammemes and lemmas -CREATE TABLE lemma_grammemes ( +CREATE TABLE oc_lemma_grammemes ( lemma INTEGER, grammeme TEXT NOT NULL, - FOREIGN KEY(lemma) REFERENCES lemmas(id) + FOREIGN KEY(lemma) REFERENCES oc_lemmas(id) ) STRICT; -- table for all words, i.e. 
including variations of lemmata -CREATE TABLE words ( +CREATE TABLE oc_words ( lemma INTEGER NOT NULL, word TEXT NOT NULL, - FOREIGN KEY(lemma) REFERENCES lemmas(id) + FOREIGN KEY(lemma) REFERENCES oc_lemmas(id) ) STRICT; -- table for relationship between words and grammemes -CREATE TABLE word_grammemes ( +CREATE TABLE oc_word_grammemes ( word INTEGER NOT NULL, grammeme TEXT NOT NULL, - FOREIGN KEY(word) REFERENCES words(ROWID) + FOREIGN KEY(word) REFERENCES oc_words(ROWID) ) STRICT; -- table for link types -CREATE TABLE link_types ( +CREATE TABLE oc_link_types ( id INTEGER PRIMARY KEY, name TEXT ) STRICT; -- table for links between lemmata -CREATE TABLE links ( +CREATE TABLE oc_links ( id INTEGER PRIMARY KEY, link_type INTEGER NOT NULL, from_lemma INTEGER NOT NULL, to_lemma INTEGER NOT NULL, - FOREIGN KEY(link_type) REFERENCES link_types(id), - FOREIGN KEY(from_lemma) REFERENCES lemmas(id), - FOREIGN KEY(to_lemma) REFERENCES lemmas(id) + FOREIGN KEY(link_type) REFERENCES oc_link_types(id), + FOREIGN KEY(from_lemma) REFERENCES oc_lemmas(id), + FOREIGN KEY(to_lemma) REFERENCES oc_lemmas(id) ) STRICT; "#, @@ -79,7 +79,7 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) { match elem { OcElement::Grammeme(grammeme) => { conn.execute( - "INSERT INTO grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)", + "INSERT INTO oc_grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)", ( &grammeme.name, &grammeme.parent, @@ -96,7 +96,7 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) { OcElement::LinkType(lt) => { conn.execute( - "INSERT INTO link_types (id, name) VALUES (?1, ?2)", + "INSERT INTO oc_link_types (id, name) VALUES (?1, ?2)", (&lt.id, &lt.name), ) .ensure("failed to insert link type"); @@ -107,7 +107,7 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) { OcElement::Link(link) => { let mut stmt = conn .prepare_cached( - "INSERT INTO links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, 
?4)", + "INSERT INTO oc_links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, ?4)", ) .ensure("failed to prepare link statement"); @@ -124,7 +124,7 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) { fn insert_lemma(conn: &Connection, lemma: Lemma) { // insert the lemma itself let mut stmt = conn - .prepare_cached("INSERT INTO lemmas (id, lemma) VALUES (?1, ?2)") + .prepare_cached("INSERT INTO oc_lemmas (id, lemma) VALUES (?1, ?2)") .ensure("failed to prepare statement"); stmt.execute((&lemma.id, &lemma.lemma.word)) @@ -132,7 +132,7 @@ fn insert_lemma(conn: &Connection, lemma: Lemma) { // followed by its relations to the grammemes set let mut stmt = conn - .prepare_cached("INSERT INTO lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)") + .prepare_cached("INSERT INTO oc_lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)") .ensure("failed to prepare statement"); for grammeme in lemma.grammemes { @@ -142,11 +142,11 @@ fn insert_lemma(conn: &Connection, lemma: Lemma) { // followed by all of its variations ... let mut word_insert = conn - .prepare_cached("INSERT INTO words (lemma, word) VALUES (?1, ?2)") + .prepare_cached("INSERT INTO oc_words (lemma, word) VALUES (?1, ?2)") .unwrap(); let mut word_grammeme = conn - .prepare_cached("INSERT INTO word_grammemes (word, grammeme) VALUES (?1, ?2)") + .prepare_cached("INSERT INTO oc_word_grammemes (word, grammeme) VALUES (?1, ?2)") .unwrap(); for variation in lemma.variations { diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs index 8a76c3823e..85e89a905b 100644 --- a/corp/russian/data-import/src/main.rs +++ b/corp/russian/data-import/src/main.rs @@ -89,7 +89,7 @@ fn main() { let conn = Connection::open(output_path).ensure("failed to open DB connection"); - db_setup::initial_schema(&conn); + db_setup::initial_oc_schema(&conn); // afterwards: // add actual IDs to grammemes -- cgit 1.4.1