about summary refs log tree commit diff
path: root/corp/russian/data-import/src/main.rs
blob: 70eb0a56dd285bd57aac42cea070abe2e101a784 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
//! This program imports Russian language data from OpenCorpora
//! ("Открытый корпус") into a SQLite database that can be used for
//! [//corp/russian][corp-russian] projects.
//!
//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian
//!
//! Ideally, running this on an OpenCorpora dump should yield a fully
//! functional SQLite database compatible with all other tools
//! consuming it.
//!
//! ## OpenCorpora format
//!
//! The format used is partially documented on the [OpenCorpora
//! website][format-docs]. This seems to be a slightly outdated
//! format, however, hence some information about what the format
//! seems to be today.
//!
//! [format-docs]: http://opencorpora.org/?page=export
//!
//! The format is an XML file, which has several categories of data,
//! each with their own schema:
//!
//! * `grammemes`: These define units of grammar. They're *likely* pretty
//!   static, and we'll *likely* want to map them into a custom set of
//!   (simpler) categories.
//!
//!   They form some kind of internal hierarchy, where some of them have a
//!   `parent` attribute set to some other grammeme's `name`.
//!
//!   There's a ridiculous number of these.
//!
//! * `restrictions`: Unclear, not documented on the page. They describe
//!   something about the relationship between grammemes.
//!
//! * `lemmata`: this lists the actual lemmas, as well as all their
//!   included morphological variants
//!
//!   Each lemma has an `id` attribute uniquely identifying its dictionary
//!   form, as well as a number of sub-elements:
//!
//!   * the `l` element contains the lemma itself
//!   * the `f` elements contain morphological variations
//!
//!   Each of these sub-elements again contains a number of `g` elements,
//!   which refer to the IDs of grammemes in their `v` attributes.
//!
//! * `<link_types>`: These list possible "relationships between lemmas",
//!   basically just assigning them IDs and names. There's only 27 of
//!   these.
//!
//! * `<links>`: Using the types defined above, this establishes links
//!   between lemmas that have some kind of relationship.
//!
//!   For example, a relationship `cardinal/ordinal` might be established
//!   between the lemmas "два" and "второй".

use log::{error, info};
use rusqlite::{Connection, Result};
use std::env;
use std::fmt::Display;
use std::fs::File;
use std::io::BufReader;

mod db_setup;
mod oc_parser;

/// Entry point: imports an OpenCorpora dump into the SQLite database `out.db`.
///
/// The input filename is read from the first command-line argument. The
/// parsed elements are inserted in batches, with a commit after every 1000
/// elements so that no single transaction grows without bound.
///
/// Failures detected here (missing argument, unreadable input file, failure
/// to open the database or to start/commit a transaction) are logged through
/// `log::error` and terminate the process with exit status 1.
fn main() {
    env_logger::builder()
        .filter_level(log::LevelFilter::Info)
        .init();

    // Argument 0 is the program name; the input path is argument 1.
    let input_path = env::args()
        .nth(1)
        .ensure("must specify the input filename as the only argument");

    info!("reading from {input_path}");
    let input_file = File::open(input_path).ensure("failed to open input file");

    let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));

    let conn = Connection::open("out.db").ensure("failed to open DB connection");

    db_setup::initial_schema(&conn);

    // afterwards:
    // add actual IDs to grammemes
    // properly reference keys internally
    // add foreign key constraint on lemma_grammemes.grammeme

    let mut tx = conn
        .unchecked_transaction()
        .ensure("failed to start transaction");
    let mut count = 0;

    while let Some(elem) = parser.next_element() {
        // Commit after every 1000 inserted elements. The `count > 0` guard
        // avoids committing an empty transaction (and logging a meaningless
        // "watermark 0") before anything has been inserted.
        if count > 0 && count % 1000 == 0 {
            tx.commit().ensure("transaction failed");
            tx = conn
                .unchecked_transaction()
                .ensure("failed to start new transaction");
            info!("transaction committed at watermark {}", count);
        }

        db_setup::insert_oc_element(&tx, elem);

        count += 1;
    }

    // Flush whatever is left in the last (possibly partial) batch.
    tx.commit().ensure("final commit failed");
}

/// Unwrap-or-abort helper, comparable to `expect` but reporting through
/// `log::error`.
///
/// The implementations in this file log the supplied message and then exit
/// the process with status 1 instead of panicking.
trait Ensure<T> {
    /// Yields the wrapped value; `msg` describes the failure for the log
    /// output when no value is available.
    fn ensure<S>(self, msg: S) -> T
    where
        S: Into<String>;
}

impl<T, E: Display> Ensure<T> for Result<T, E> {
    fn ensure<S: Into<String>>(self, msg: S) -> T {
        match self {
            Ok(x) => x,
            Err(err) => {
                error!("{}: {}", msg.into(), err);
                std::process::exit(1);
            }
        }
    }
}

/// `Ensure` for optional values.
impl<T> Ensure<T> for Option<T> {
    /// Returns the `Some` value. On `None`, logs `msg` at error level and
    /// exits the process with status 1.
    fn ensure<S: Into<String>>(self, msg: S) -> T {
        if let Some(value) = self {
            return value;
        }

        error!("{}", msg.into());
        std::process::exit(1)
    }
}

fn bail<S: Into<String>>(msg: S) -> ! {
    error!("{}", msg.into());
    std::process::exit(1);
}