1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
//! This program imports Russian language data from OpenCorpora
//! ("Открытый корпус") into a SQLite database that can be used for
//! [//corp/russian][corp-russian] projects.
//!
//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian
//!
//! Ideally, running this on an OpenCorpora dump should yield a fully
//! functional SQLite database compatible with all other tools
//! consuming it.
//!
//! ## OpenCorpora format
//!
//! The format used is partially documented on the [OpenCorpora
//! website][format-docs]. That documentation seems to be slightly
//! outdated, however, so what follows describes the format as it
//! appears to be today.
//!
//! [format-docs]: http://opencorpora.org/?page=export
//!
//! The format is an XML file, which has several categories of data,
//! each with their own schema:
//!
//! * `grammemes`: These define units of grammar. They're *likely* pretty
//! static, and we'll *likely* want to map them into a custom set of
//! (simpler) categories.
//!
//! They form some kind of internal hierarchy, where some of them have a
//! `parent` attribute set to some other grammeme's `name`.
//!
//! There's a ridiculous number of these.
//!
//! * `restrictions`: Unclear, not documented on the page. They describe
//! something about the relationship between grammemes.
//!
//! * `lemmata`: this lists the actual lemmas, as well as all their
//! included morphological variants
//!
//! Each lemma has an `id` attribute uniquely identifying its dictionary
//! form, as well as a number of sub-elements:
//!
//! * the `l` element contains the lemma itself
//! * the `f` elements contain morphological variations
//!
//! Each of these sub-elements again contains a number of `g` elements,
//! which refer to the IDs of grammemes in their `v` attributes.
//!
//! * `<link_types>` These list possible "relationships between lemmas",
//! basically just assigning them IDs and names. There are only 27 of
//! these.
//!
//! * `<links>`: Using the types defined above, this establishes links
//! between lemmas that have some kind of relationship.
//!
//! For example, a relationship `cardinal/ordinal` might be established
//! between the lemmas "два" and "второй".
use log::{error, info};
use std::env;
use std::fmt::Display;
use std::fs::File;
use std::io::{BufReader, BufWriter, Write};
mod oc_parser;
/// Program entry point.
///
/// Takes the path of an OpenCorpora dump from the first command-line
/// argument, streams the file through `oc_parser::OpenCorporaParser`, and
/// pulls elements until it finds a lemma whose word is "тяжёлый". That
/// lemma is written to stdout in its `Debug` representation and scanning
/// stops; if no such lemma is found, nothing is printed.
///
/// Every failure (missing argument, unreadable file, write or flush error)
/// goes through `Ensure::ensure`, which logs the message via `log::error`
/// and exits the process with status 1.
fn main() {
    // Log at `Info` level by default so the progress message below is shown.
    env_logger::builder()
        .filter_level(log::LevelFilter::Info)
        .init();

    // Index 0 is the program name; index 1 is the first real argument.
    // Additional arguments are not inspected.
    let input_path = env::args()
        .nth(1)
        .ensure("must specify the input filename as the only argument");

    info!("reading from {input_path}");
    let input_file = File::open(input_path).ensure("failed to open input file");

    let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));

    // Lock stdout once and buffer writes instead of locking per write.
    let mut out = BufWriter::new(std::io::stdout().lock());

    while let Some(elem) = parser.next_element() {
        if let oc_parser::OcElement::Lemma(lemma) = elem {
            if lemma.lemma.word == "тяжёлый" {
                writeln!(out, "{:?}", lemma).ensure("writing output failed");
                // Only the first match is of interest.
                break;
            }
        }
    }

    // Flush explicitly so a write error is reported rather than being
    // silently dropped when the buffer goes out of scope.
    out.flush().ensure("flushing the out buffer failed");
}
/// Unwrapping helper in the spirit of `expect`, except that the failure
/// message is reported through `log::error` instead of a panic.
///
/// `T` is the value produced on success; `msg` is the text to report when
/// there is no value to produce.
trait Ensure<T> {
    fn ensure<S>(self, msg: S) -> T
    where
        S: Into<String>;
}
/// `Result` flavour of [`Ensure`]: yields the `Ok` value, or logs
/// `"<msg>: <error>"` at error level and exits the process with status 1.
impl<T, E> Ensure<T> for Result<T, E>
where
    E: Display,
{
    fn ensure<S>(self, msg: S) -> T
    where
        S: Into<String>,
    {
        // The closure only runs on `Err`, so `msg` is left unconverted on
        // the success path.
        self.unwrap_or_else(|err| {
            error!("{}: {}", msg.into(), err);
            std::process::exit(1)
        })
    }
}
/// `Option` flavour of [`Ensure`]: yields the contained value, or logs
/// `msg` at error level and exits the process with status 1 on `None`.
impl<T> Ensure<T> for Option<T> {
    fn ensure<S>(self, msg: S) -> T
    where
        S: Into<String>,
    {
        // Happy path first; everything below handles `None`.
        if let Some(value) = self {
            return value;
        }

        error!("{}", msg.into());
        std::process::exit(1)
    }
}
/// Logs `msg` at error level and terminates the process with exit status 1.
///
/// Never returns.
fn bail<S>(msg: S) -> !
where
    S: Into<String>,
{
    error!("{}", msg.into());
    std::process::exit(1)
}
|