diff options
author | Vincent Ambo <mail@tazj.in> | 2023-01-17T21·36+0300 |
---|---|---|
committer | tazjin <tazjin@tvl.su> | 2023-01-18T01·10+0000 |
commit | ee7616d9563eabf2ae01927bc9d37ccf3e3b3325 (patch) | |
tree | ac43dc06b1f191308182897bd46726f5e9d41783 /corp/russian/data-import/src/main.rs | |
parent | 032ab16bbbd318704be71af7b569624ddab24802 (diff) |
feat(corp/russian/data-import): new OpenCorpora data import tool r/5683
Adds the beginning of a tool which can import OpenCorpora data into a SQLite database. This is quite a lot of toil and there's probably a better way to do this, but overall becoming this intimately familiar with the data structures is quite helpful for understanding what I can/can't do with only this dataset. Change-Id: Ieab33a8ce07ea4ac87917b9c8132226bbc6523b1 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7859 Reviewed-by: tazjin <tazjin@tvl.su> Tested-by: BuildkiteCI
Diffstat (limited to 'corp/russian/data-import/src/main.rs')
-rw-r--r-- | corp/russian/data-import/src/main.rs | 126 |
1 files changed, 126 insertions, 0 deletions
diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs new file mode 100644 index 000000000000..336cc3d14f9f --- /dev/null +++ b/corp/russian/data-import/src/main.rs @@ -0,0 +1,126 @@ +//! This program imports Russian language data from OpenCorpora +//! ("Открытый корпус") into a SQLite database that can be used for +//! [//corp/russian][corp-russian] projects. +//! +//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian +//! +//! Ideally, running this on an OpenCorpora dump should yield a fully +//! functional SQLite database compatible with all other tools +//! consuming it. +//! +//! ## OpenCorpora format +//! +//! The format used is partially documented on the [OpenCorpora +//! website][format-docs]. This seems to be a slightly outdated +//! format, however, hence some information about what the format +//! seems to be today. +//! +//! [format-docs]: http://opencorpora.org/?page=export +//! +//! The format is an XML file, which has several categories of data, +//! each with their own schema: +//! +//! * `grammemes`: These define units of grammar. They're *likely* pretty +//! static, and we'll *likely* want to map them into a custom set of +//! (simpler) categories. +//! +//! They form some kind of internal hierarchy, where some of them have a +//! `parent` attribute set to some other grammeme's `name`. +//! +//! There's a ridiculous number of these. +//! +//! * `restrictions`: Unclear, not documented on the page. They describe +//! something about the relationship between grammemes. +//! +//! * `lemmata`: this lists the actual lemmas, as well as all their +//! included morphological variants +//! +//! Each lemma has an `id` attribute uniquely identifying its dictionary +//! form, as well as a number of sub-elements: +//! +//! * the `l` element contains the lemma itself +//! * the `f` elements contain morphological variations +//! +//! Each of these sub elements again contains a number of `g` elements, +//! 
which refer to the IDs of grammems in their `v` attributes. +//! +//! * `<link_types>` These list possible "relationships between lemmas", +//! basically just assigning them IDs and names. There's only 27 of +//! these. +//! +//! * `<links>`: Using the types defined above, this establishes links +//! between lemmas that have some kind of relationship. +//! +//! For example, a relationship `cardinal/ordinal` might be established +//! between the lemmas "два" and "второй". + +use log::{error, info}; +use std::env; +use std::fmt::Display; +use std::fs::File; +use std::io::{BufReader, BufWriter, Write}; + +mod oc_parser; + +fn main() { + env_logger::builder() + .filter_level(log::LevelFilter::Info) + .init(); + + let input_path = env::args() + .skip(1) + .next() + .ensure("must specify the input filename as the only argument"); + + info!("reading from {input_path}"); + let input_file = File::open(input_path).ensure("failed to open input file"); + + let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file)); + + let mut out = BufWriter::new(std::io::stdout().lock()); + + while let Some(elem) = parser.next_element() { + match elem { + oc_parser::OcElement::Grammeme(g) => { + writeln!(out, "{:?}", g).ensure("writing element failed") + } + oc_parser::OcElement::Lemma(_) => continue, + } + } + + out.flush().ensure("flushing the out buffer failed"); +} + +/// It's like `expect`, but through `log::error`. +trait Ensure<T> { + fn ensure<S: Into<String>>(self, msg: S) -> T; +} + +impl<T, E: Display> Ensure<T> for Result<T, E> { + fn ensure<S: Into<String>>(self, msg: S) -> T { + match self { + Ok(x) => x, + Err(err) => { + error!("{}: {}", msg.into(), err); + std::process::exit(1); + } + } + } +} + +impl<T> Ensure<T> for Option<T> { + fn ensure<S: Into<String>>(self, msg: S) -> T { + match self { + Some(x) => x, + None => { + error!("{}", msg.into()); + std::process::exit(1); + } + } + } +} + +fn bail<S: Into<String>>(msg: S) -> ! 
{ + error!("{}", msg.into()); + std::process::exit(1); +} |