feat(corp/russian/data-import): new OpenCorpora data import tool r/5683

Adds the beginning of a tool which can import OpenCorpora data into a SQLite database. This is quite a lot of toil and there's probably a better way to do this, but overall becoming this intimately familiar with the data structures is quite helpful for understanding what I can/can't do with only this dataset. Change-Id: Ieab33a8ce07ea4ac87917b9c8132226bbc6523b1 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7859 Reviewed-by: tazjin <tazjin@tvl.su> Tested-by: BuildkiteCI
author: Vincent Ambo <mail@tazj.in> 2023-01-17T21·36+0300
committer: tazjin <tazjin@tvl.su> 2023-01-18T01·10+0000
commit: ee7616d9563eabf2ae01927bc9d37ccf3e3b3325 (patch)
tree: ac43dc06b1f191308182897bd46726f5e9d41783 /corp/russian/data-import/src
parent: 032ab16bbbd318704be71af7b569624ddab24802 (diff)
2 files changed, 388 insertions, 0 deletions
diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs
new file mode 100644
index 000000000000..336cc3d14f9f
--- /dev/null
+++ b/corp/russian/data-import/src/main.rs
@@ -0,0 +1,126 @@
+//! This program imports Russian language data from OpenCorpora
+//! ("Открытый корпус") into a SQLite database that can be used for
+//! [//corp/russian][corp-russian] projects.
+//!
+//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian
+//!
+//! Ideally, running this on an OpenCorpora dump should yield a fully
+//! functional SQLite database compatible with all other tools
+//! consuming it.
+//!
+//! ## OpenCorpora format
+//!
+//! The format used is partially documented on the [OpenCorpora
+//! website][format-docs]. This seems to be a slightly outdated
+//! format, however, hence some information about what the format
+//! seems to be today.
+//!
+//! [format-docs]: http://opencorpora.org/?page=export
+//!
+//! The format is an XML file, which has several categories of data,
+//! each with their own schema:
+//!
+//! * `grammemes`: These define units of grammar. They're *likely* pretty
+//!   static, and we'll *likely* want to map them into a custom set of
+//!   (simpler) categories.
+//!
+//!   They form some kind of internal hierarchy, where some of them have a
+//!   `parent` attribute set to some other grammemes `name`.
+//!
+//!   There's a ridiculous number of these.
+//!
+//! * `restrictions`: Unclear, not documented on the page. They describe
+//!   something about the relationship between grammemes.
+//!
+//! * `lemmata`: this lists the actual lemmas, as well as all their
+//!   included morphological variants
+//!
+//!   Each lemma has an `id` attribute uniquely identifying its dictionary
+//!   form, as well as a number of sub-elements:
+//!
+//!   * the `l` attribute contains the lemma itself
+//!   * the `f` attributes contain morphological variations
+//!
+//!   Each of these sub elements again contains a number of `g` elements,
+//!   which refer to the IDs of grammems in their `v` attributes.
+//!
+//! * `<link_types>` These list possible "relationships between lemmas",
+//!   basically just assigning them IDs and names. There's only 27 of
+//!   these.
+//!
+//! * `<links>`: Using the types defined above, this establishes links
+//!   between lemmas that have some kind of relationship.
+//!
+//!   For example, a relationship `cardinal/ordinal` might be established
+//!   between the lemmas "два" and "второй".
+
+use log::{error, info};
+use std::env;
+use std::fmt::Display;
+use std::fs::File;
+use std::io::{BufReader, BufWriter, Write};
+
+mod oc_parser;
+
+fn main() {
+    env_logger::builder()
+        .filter_level(log::LevelFilter::Info)
+        .init();
+
+    let input_path = env::args()
+        .skip(1)
+        .next()
+        .ensure("must specify the input filename as the only argument");
+
+    info!("reading from {input_path}");
+    let input_file = File::open(input_path).ensure("failed to open input file");
+
+    let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
+
+    let mut out = BufWriter::new(std::io::stdout().lock());
+
+    while let Some(elem) = parser.next_element() {
+        match elem {
+            oc_parser::OcElement::Grammeme(g) => {
+                writeln!(out, "{:?}", g).ensure("writing element failed")
+            }
+            oc_parser::OcElement::Lemma(_) => continue,
+        }
+    }
+
+    out.flush().ensure("flushing the out buffer failed");
+}
+
+/// It's like `expect`, but through `log::error`.
+trait Ensure<T> {
+    fn ensure<S: Into<String>>(self, msg: S) -> T;
+}
+
+impl<T, E: Display> Ensure<T> for Result<T, E> {
+    fn ensure<S: Into<String>>(self, msg: S) -> T {
+        match self {
+            Ok(x) => x,
+            Err(err) => {
+                error!("{}: {}", msg.into(), err);
+                std::process::exit(1);
+            }
+        }
+    }
+}
+
+impl<T> Ensure<T> for Option<T> {
+    fn ensure<S: Into<String>>(self, msg: S) -> T {
+        match self {
+            Some(x) => x,
+            None => {
+                error!("{}", msg.into());
+                std::process::exit(1);
+            }
+        }
+    }
+}
+
+fn bail<S: Into<String>>(msg: S) -> ! {
+    error!("{}", msg.into());
+    std::process::exit(1);
+}
diff --git a/corp/russian/data-import/src/oc_parser.rs b/corp/russian/data-import/src/oc_parser.rs
new file mode 100644
index 000000000000..c7a9b8247f64
--- /dev/null
+++ b/corp/russian/data-import/src/oc_parser.rs
@@ -0,0 +1,262 @@
+use super::{bail, Ensure};
+use log::info;
+use xml::attribute::OwnedAttribute;
+use xml::name::OwnedName;
+use xml::reader::XmlEvent;
+use xml::EventReader;
+
+#[derive(Default, Debug)]
+pub struct Grammeme {
+    parent: Option<String>,
+    name: String,
+    alias: String,
+    description: String,
+}
+
+#[derive(Debug)]
+pub struct Lemma {}
+
+#[derive(Debug)]
+pub enum OcElement {
+    Grammeme(Grammeme),
+    Lemma(Lemma),
+}
+
+#[derive(Debug, PartialEq)]
+enum ParserState {
+    /// Parser is not parsing any particular section and waiting for a
+    /// start tag instead.
+    Init,
+
+    /// Parser is parsing grammemes.
+    Grammemes,
+
+    /// Parser is parsing lemmata.
+    Lemmata,
+
+    /// Parser has seen the end of the line and nothing more is
+    /// available.
+    Ended,
+}
+
+pub struct OpenCorporaParser<R: std::io::Read> {
+    reader: EventReader<R>,
+    state: ParserState,
+}
+
+#[derive(PartialEq)]
+enum SectionState {
+    /// Actively interested in parsing this section.
+    Active,
+
+    /// Section is known, but currently ignored.
+    Inactive,
+
+    /// Section is unknown (probably a bug).
+    Unknown,
+}
+
+fn section_state(section: &str) -> SectionState {
+    match section {
+        "grammemes" | "lemmata" => SectionState::Active,
+        "restrictions" | "link_types" | "links" => SectionState::Inactive,
+        _ => SectionState::Unknown,
+    }
+}
+
+impl<R: std::io::Read> OpenCorporaParser<R> {
+    pub fn new(reader: R) -> Self {
+        let config = xml::ParserConfig::new().trim_whitespace(true);
+        let reader = EventReader::new_with_config(reader, config);
+
+        Self {
+            reader,
+            state: ParserState::Init,
+        }
+    }
+
+    /// Pull an `OcElement` out of the parser. Returns `None` if the
+    /// parser stream has ended.
+    pub fn next_element(&mut self) -> Option<OcElement> {
+        if self.state == ParserState::Ended {
+            return None;
+        }
+
+        // Pull the next element to determine what context to enter
+        // next.
+        loop {
+            match &self.next() {
+                // no-op events that do not affect parser state
+                XmlEvent::Comment(_)
+                | XmlEvent::Whitespace(_)
+                | XmlEvent::ProcessingInstruction { .. }
+                | XmlEvent::StartDocument { .. } => continue,
+                XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name }
+                    if name.local_name == "dictionary" =>
+                {
+                    continue
+                }
+
+                // end of the file, nothing more to return
+                XmlEvent::EndDocument => {
+                    self.state = ParserState::Ended;
+                    return None;
+                }
+
+                // some sections are skipped
+                XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name }
+                    if section_state(&name.local_name) == SectionState::Inactive =>
+                {
+                    info!("skipping {} section", name.local_name);
+                    self.skip_section(&name.local_name);
+                }
+
+                // active section events start specific parser states ...
+                XmlEvent::StartElement { name, .. }
+                    if section_state(&name.local_name) == SectionState::Active =>
+                {
+                    self.state = match name.local_name.as_str() {
+                        "grammemes" => ParserState::Grammemes,
+                        "lemmata" => ParserState::Lemmata,
+                        _ => unreachable!(),
+                    };
+                }
+
+                // ... or end them
+                XmlEvent::EndElement { name, .. }
+                    if section_state(&name.local_name) == SectionState::Active =>
+                {
+                    // TODO: assert that the right section ended
+                    self.state = ParserState::Init;
+                }
+
+                // actual beginning of an actual element, dispatch accordingly
+                event @ XmlEvent::StartElement {
+                    name, attributes, ..
+                } => match self.state {
+                    ParserState::Grammemes => {
+                        return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes)))
+                    }
+                    ParserState::Lemmata => {
+                        return Some(OcElement::Lemma(self.parse_lemma(name, attributes)))
+                    }
+
+                    ParserState::Init | ParserState::Ended => bail(format!(
+                        "parser received an unexpected start element while in state {:?}: {:?}",
+                        self.state, event
+                    )),
+                },
+
+                // finally, events that indicate a bug if they're
+                // encountered here
+                event @ XmlEvent::EndElement { .. }
+                | event @ XmlEvent::CData(_)
+                | event @ XmlEvent::Characters(_) => {
+                    bail(format!("unexpected XML event: {:?}", event))
+                }
+            }
+        }
+    }
+
+    /// Skip a section by advancing the parser state until we see an
+    /// end element for the skipped section.
+    fn skip_section(&mut self, section: &str) {
+        loop {
+            match self.next() {
+                XmlEvent::EndElement { name } if name.local_name == section => return,
+                _ => continue,
+            }
+        }
+    }
+
+    fn next(&mut self) -> XmlEvent {
+        self.reader.next().ensure("XML parsing failed")
+    }
+
+    /// Parse a tag that should have plain string content.
+    fn parse_string(&mut self, tag_name: &str) -> String {
+        let mut out = String::new();
+
+        loop {
+            match self.next() {
+                // ignore irrelevant things
+                XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue,
+
+                // set the content
+                XmlEvent::Characters(content) => {
+                    out = content;
+                }
+
+                // expect the end of the element
+                XmlEvent::EndElement { name } if name.local_name == tag_name => return out,
+
+                // fail on everything unexpected
+                event => bail(format!(
+                    "unexpected element while parsing <{}>: {:?}",
+                    tag_name, event
+                )),
+            }
+        }
+    }
+
+    fn parse_grammeme(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Grammeme {
+        if name.local_name != "grammeme" {
+            bail(format!(
+                "expected to parse a grammeme, but found <{}>",
+                name.local_name
+            ));
+        }
+
+        let mut grammeme = Grammeme::default();
+
+        for attr in attributes {
+            if attr.name.local_name == "parent" && !attr.value.is_empty() {
+                grammeme.parent = Some(attr.value.clone());
+            }
+        }
+
+        loop {
+            match self.next() {
+                // ignore irrelevant things
+                XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue,
+
+                // expect known tags
+                XmlEvent::StartElement { name, .. } if name.local_name == "name" => {
+                    grammeme.name = self.parse_string("name");
+                }
+
+                XmlEvent::StartElement { name, .. } if name.local_name == "alias" => {
+                    grammeme.alias = self.parse_string("alias");
+                }
+
+                XmlEvent::StartElement { name, .. } if name.local_name == "description" => {
+                    grammeme.description = self.parse_string("description");
+                }
+
+                // handle end of the grammeme
+                XmlEvent::EndElement { name } if name.local_name == "grammeme" => break,
+
+                // fail on everything unexpected
+                event => bail(format!(
+                    "unexpected element while parsing <grammeme>: {:?}",
+                    event
+                )),
+            }
+        }
+
+        grammeme
+    }
+
+    fn parse_lemma(&mut self, name: &OwnedName, _attributes: &[OwnedAttribute]) -> Lemma {
+        if name.local_name != "lemma" {
+            bail(format!(
+                "expected to parse a lemma, but found <{}>",
+                name.local_name
+            ));
+        }
+
+        self.skip_section("lemma");
+
+        Lemma {}
+    }
+}
author	Vincent Ambo <mail@tazj.in>	2023-01-17T21·36+0300
committer	tazjin <tazjin@tvl.su>	2023-01-18T01·10+0000
commit	ee7616d9563eabf2ae01927bc9d37ccf3e3b3325 (patch)
tree	ac43dc06b1f191308182897bd46726f5e9d41783 /corp/russian/data-import/src
parent	032ab16bbbd318704be71af7b569624ddab24802 (diff)