about summary refs log tree commit diff
path: root/corp/russian/data-import/src/db_setup.rs
blob: c9fb517386517235f8453a67feee14b916d7cea4 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
//! This module prepares the database layout.
//!
//! The XML import may be in an arbitrary order, so importing data is
//! a multi-step process where we first set up schemas matching the
//! data layout, import the data, and then modify the schema to
//! introduce things like foreign key constraints between tables that
//! represent relations.

use super::Ensure;
use crate::oc_parser::*;
use crate::or_parser;
use log::{debug, info};
use rusqlite::Connection;

/// Sets up an initial schema which matches the OpenCorpora data.
pub fn initial_oc_schema(conn: &Connection) {
    conn.execute_batch(
        r#"
-- table for plain import of grammemes from XML
CREATE TABLE oc_grammemes (
    name TEXT PRIMARY KEY,
    parent TEXT,
    alias TEXT,
    description TEXT
) STRICT;

-- table for plain import of lemmas (*not* their variations!)
CREATE TABLE oc_lemmas (
    id INTEGER PRIMARY KEY,
    lemma TEXT NOT NULL
) STRICT;

-- table for relationship between grammemes and lemmas
CREATE TABLE oc_lemma_grammemes (
    lemma INTEGER,
    grammeme TEXT NOT NULL,
    FOREIGN KEY(lemma) REFERENCES oc_lemmas(id)
) STRICT;

-- table for all words, i.e. including variations of lemmata
CREATE TABLE oc_words (
    lemma INTEGER NOT NULL,
    word TEXT NOT NULL,
    FOREIGN KEY(lemma) REFERENCES oc_lemmas(id)
) STRICT;

-- table for relationship between words and grammemes
CREATE TABLE oc_word_grammemes (
    word INTEGER NOT NULL,
    grammeme TEXT NOT NULL,
    FOREIGN KEY(word) REFERENCES oc_words(ROWID)
) STRICT;

-- table for link types
CREATE TABLE oc_link_types (
  id INTEGER PRIMARY KEY,
  name TEXT
) STRICT;

-- table for links between lemmata
CREATE TABLE oc_links (
  id INTEGER PRIMARY KEY,
  link_type INTEGER NOT NULL,
  from_lemma INTEGER NOT NULL,
  to_lemma INTEGER NOT NULL,
  FOREIGN KEY(link_type) REFERENCES oc_link_types(id),
  FOREIGN KEY(from_lemma) REFERENCES oc_lemmas(id),
  FOREIGN KEY(to_lemma) REFERENCES oc_lemmas(id)
) STRICT;

"#,
    )
    .ensure("setting up OpenCorpora table schema failed");

    info!("set up initial table schema for OpenCorpora import");
}

/// Inserts a single OpenCorpora element into the initial table structure.
pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
    match elem {
        OcElement::Grammeme(grammeme) => {
            conn.execute(
                "INSERT INTO oc_grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)",
                (
                    &grammeme.name,
                    &grammeme.parent,
                    &grammeme.alias,
                    &grammeme.description,
                ),
            )
            .ensure("failed to insert grammeme");

            debug!("inserted grammeme {}", grammeme.name);
        }

        OcElement::Lemma(lemma) => insert_lemma(conn, lemma),

        OcElement::LinkType(lt) => {
            conn.execute(
                "INSERT INTO oc_link_types (id, name) VALUES (?1, ?2)",
                (&lt.id, &lt.name),
            )
            .ensure("failed to insert link type");

            info!("inserted link type {}", lt.name);
        }

        OcElement::Link(link) => {
            let mut stmt = conn
                .prepare_cached(
                    "INSERT INTO oc_links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, ?4)",
                )
                .ensure("failed to prepare link statement");

            stmt.execute((&link.id, &link.link_type, &link.from, &link.to))
                .ensure("failed to insert link");

            debug!("inserted link {}", link.id);
        }
    }
}

/// Insert a single lemma into the initial structure. This is somewhat
/// involved because it also establishes a bunch of relations.
fn insert_lemma(conn: &Connection, lemma: Lemma) {
    // insert the lemma itself
    let mut stmt = conn
        .prepare_cached("INSERT INTO oc_lemmas (id, lemma) VALUES (?1, ?2)")
        .ensure("failed to prepare statement");

    stmt.execute((&lemma.id, &lemma.lemma.word))
        .ensure("failed to insert grammeme");

    // followed by its relations to the grammemes set
    let mut stmt = conn
        .prepare_cached("INSERT INTO oc_lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)")
        .ensure("failed to prepare statement");

    for grammeme in lemma.grammemes {
        stmt.execute((&lemma.id, grammeme))
            .ensure("failed to insert grammeme<>lemma relationship");
    }

    // followed by all of its variations ...
    let mut word_insert = conn
        .prepare_cached("INSERT INTO oc_words (lemma, word) VALUES (?1, ?2)")
        .unwrap();

    let mut word_grammeme = conn
        .prepare_cached("INSERT INTO oc_word_grammemes (word, grammeme) VALUES (?1, ?2)")
        .unwrap();

    for variation in lemma.variations {
        // insert the word itself and get its rowid
        word_insert
            .execute((&lemma.id, &variation.word))
            .ensure("failed to insert word");
        let row_id = conn.last_insert_rowid();

        // then insert its grammeme links
        for grammeme in variation.grammemes {
            word_grammeme
                .execute((row_id, grammeme))
                .ensure("failed to insert word<>grammeme link");
        }
    }

    debug!("inserted lemma {}", lemma.id);
}

/// Sets up an initial schema for the OpenRussian data.
pub fn initial_or_schema(conn: &Connection) {
    conn.execute_batch(
        r#"
CREATE TABLE or_words (
    id INTEGER PRIMARY KEY,
    bare TEXT NOT NULL,
    accented TEXT,
    derived_from_word_id INTEGER,
    rank INTEGER,
    word_type TEXT,
    level TEXT
) STRICT;

CREATE TABLE or_words_forms (
    id INTEGER PRIMARY KEY,
    word_id INTEGER NOT NULL,
    form_type TEXT,
    position TEXT,
    form TEXT,
    form_bare TEXT,
    FOREIGN KEY(word_id) REFERENCES words(id)
) STRICT;

CREATE TABLE or_translations (
    id INTEGER PRIMARY KEY,
    word_id INTEGER NOT NULL,
    translation TEXT,
    example_ru TEXT,
    example_tl TEXT,
    info TEXT,
    FOREIGN KEY(word_id) REFERENCES words(id)
) STRICT;
"#,
    )
    .ensure("setting up OpenRussian table schema failed");

    info!("set up initial table schema for OpenRussian import");
}

pub fn insert_or_words<I: Iterator<Item = or_parser::Word>>(conn: &Connection, words: I) {
    let mut stmt = conn
        .prepare_cached(
            "
INSERT INTO or_words (id, bare, accented, derived_from_word_id, rank, word_type, level)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)
",
        )
        .ensure("failed to prepare OR words statement");
    let mut count = 0;

    for word in words {
        stmt.execute((
            word.id,
            word.bare,
            word.accented,
            word.derived_from_word_id,
            word.rank,
            word.word_type,
            word.level,
        ))
        .ensure("failed to insert OR word");
        count += 1;
    }

    info!("inserted {} OpenRussian words", count);
}

pub fn insert_or_word_forms<I: Iterator<Item = or_parser::WordForm>>(conn: &Connection, forms: I) {
    let mut stmt = conn
        .prepare_cached(
            "
INSERT INTO or_words_forms (id, word_id, form_type, position, form, form_bare)
VALUES (?1, ?2, ?3, ?4, ?5, ?6)
",
        )
        .ensure("failed to prepare OR word forms statement");
    let mut count = 0;

    for form in forms {
        stmt.execute((
            form.id,
            form.word_id,
            form.form_type,
            form.position,
            form.form,
            form.form_bare,
        ))
        .ensure("failed to insert OR word form");
        count += 1;
    }

    info!("inserted {} OpenRussian word forms", count);
}

pub fn insert_or_translations<I: Iterator<Item = or_parser::Translation>>(
    conn: &Connection,
    translations: I,
) {
    let mut stmt = conn
        .prepare_cached(
            "INSERT INTO or_translations (id, word_id, translation, example_ru, example_tl, info)
             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
        )
        .ensure("failed to prepare OR translation statement");

    let mut count = 0;

    for tl in translations {
        if tl.lang != "en" {
            continue;
        }

        stmt.execute((
            tl.id,
            tl.word_id,
            tl.tl,
            tl.example_ru,
            tl.example_tl,
            tl.info,
        ))
        .ensure("failed to insert OR translation");

        count += 1;
    }

    info!("inserted {} OpenRussian translations", count);
}