From ee7616d9563eabf2ae01927bc9d37ccf3e3b3325 Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Wed, 18 Jan 2023 00:36:41 +0300 Subject: feat(corp/russian/data-import): new OpenCorpora data import tool Adds the beginning of a tool which can import OpenCorpora data into a SQLite database. This is quite a lot of toil and there's probably a better way to do this, but overall becoming this intimately familiar with the data structures is quite helpful for understanding what I can/can't do with only this dataset. Change-Id: Ieab33a8ce07ea4ac87917b9c8132226bbc6523b1 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7859 Reviewed-by: tazjin Tested-by: BuildkiteCI --- corp/russian/data-import/src/main.rs | 126 ++++++++++++++ corp/russian/data-import/src/oc_parser.rs | 262 ++++++++++++++++++++++++++++++ 2 files changed, 388 insertions(+) create mode 100644 corp/russian/data-import/src/main.rs create mode 100644 corp/russian/data-import/src/oc_parser.rs (limited to 'corp/russian/data-import/src') diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs new file mode 100644 index 000000000000..336cc3d14f9f --- /dev/null +++ b/corp/russian/data-import/src/main.rs @@ -0,0 +1,126 @@ +//! This program imports Russian language data from OpenCorpora +//! ("Открытый корпус") into a SQLite database that can be used for +//! [//corp/russian][corp-russian] projects. +//! +//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian +//! +//! Ideally, running this on an OpenCorpora dump should yield a fully +//! functional SQLite database compatible with all other tools +//! consuming it. +//! +//! ## OpenCorpora format +//! +//! The format used is partially documented on the [OpenCorpora +//! website][format-docs]. This seems to be a slightly outdated +//! format, however, hence some information about what the format +//! seems to be today. +//! +//! [format-docs]: http://opencorpora.org/?page=export +//! +//! 
The format is an XML file, which has several categories of data, +//! each with their own schema: +//! +//! * `grammemes`: These define units of grammar. They're *likely* pretty +//! static, and we'll *likely* want to map them into a custom set of +//! (simpler) categories. +//! +//! They form some kind of internal hierarchy, where some of them have a +//! `parent` attribute set to some other grammeme's `name`. +//! +//! There's a ridiculous number of these. +//! +//! * `restrictions`: Unclear, not documented on the page. They describe +//! something about the relationship between grammemes. +//! +//! * `lemmata`: this lists the actual lemmas, as well as all their +//! included morphological variants. +//! +//! Each lemma has an `id` attribute uniquely identifying its dictionary +//! form, as well as a number of sub-elements: +//! +//! * the `l` element contains the lemma itself +//! * the `f` elements contain morphological variations +//! +//! Each of these sub-elements again contains a number of `g` elements, +//! which refer to the IDs of grammemes in their `v` attributes. +//! +//! * `link_types`: These list possible "relationships between lemmas", +//! basically just assigning them IDs and names. There are only 27 of +//! these. +//! +//! * `links`: Using the types defined above, this establishes links +//! between lemmas that have some kind of relationship. +//! +//! For example, a relationship `cardinal/ordinal` might be established +//! between the lemmas "два" and "второй". 
use log::{error, info};
use std::env;
use std::fmt::Display;
use std::fs::File;
use std::io::{BufReader, BufWriter, Write};

mod oc_parser;

/// Reads the OpenCorpora dump named by the first command-line argument
/// and writes the debug representation of every parsed grammeme to
/// stdout, one per line. Lemmas are parsed but not printed.
///
/// Any failure is logged through `log::error` and terminates the
/// process with exit code 1.
fn main() {
    env_logger::builder()
        .filter_level(log::LevelFilter::Info)
        .init();

    // Index 0 is the program name, so the input path is argument 1.
    let input_path = env::args()
        .nth(1)
        .ensure("must specify the input filename as the only argument");

    info!("reading from {input_path}");
    let input_file = File::open(input_path).ensure("failed to open input file");

    let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));

    // Lock stdout once and buffer the writes instead of locking per line.
    let mut out = BufWriter::new(std::io::stdout().lock());

    while let Some(elem) = parser.next_element() {
        match elem {
            oc_parser::OcElement::Grammeme(g) => {
                writeln!(out, "{:?}", g).ensure("writing element failed")
            }
            oc_parser::OcElement::Lemma(_) => continue,
        }
    }

    out.flush().ensure("flushing the out buffer failed");
}

/// It's like `expect`, but through `log::error`.
///
/// `ensure` returns the contained value, or logs `msg` and exits the
/// process with status 1 if there is none.
trait Ensure<T> {
    fn ensure<S: Into<String>>(self, msg: S) -> T;
}

/// For `Result`, the error value is appended to the logged message.
impl<T, E: Display> Ensure<T> for Result<T, E> {
    fn ensure<S: Into<String>>(self, msg: S) -> T {
        match self {
            Ok(x) => x,
            Err(err) => {
                error!("{}: {}", msg.into(), err);
                std::process::exit(1);
            }
        }
    }
}

/// For `Option`, only the message is logged.
impl<T> Ensure<T> for Option<T> {
    fn ensure<S: Into<String>>(self, msg: S) -> T {
        match self {
            Some(x) => x,
            None => {
                error!("{}", msg.into());
                std::process::exit(1);
            }
        }
    }
}

/// Logs `msg` as an error and exits the process with status 1.
fn bail<S: Into<String>>(msg: S) -> ! {
    error!("{}", msg.into());
    std::process::exit(1);
}

// diff --git a/corp/russian/data-import/src/oc_parser.rs b/corp/russian/data-import/src/oc_parser.rs
// new file mode 100644
// index 000000000000..c7a9b8247f64
// --- /dev/null
// +++ b/corp/russian/data-import/src/oc_parser.rs
// @@ -0,0 +1,262 @@

use super::{bail, Ensure};
use log::info;
use xml::attribute::OwnedAttribute;
use xml::name::OwnedName;
use xml::reader::XmlEvent;
use xml::EventReader;

/// A single `<grammeme>` entry from the `grammemes` section.
#[derive(Default, Debug)]
pub struct Grammeme {
    /// Value of the `parent` attribute; `None` if absent or empty.
    parent: Option<String>,
    /// Text of the `<name>` child element.
    name: String,
    /// Text of the `<alias>` child element.
    alias: String,
    /// Text of the `<description>` child element.
    description: String,
}

/// A `<lemma>` entry. Currently a placeholder without any data.
#[derive(Debug)]
pub struct Lemma {}

/// The kinds of elements the parser hands out.
#[derive(Debug)]
pub enum OcElement {
    Grammeme(Grammeme),
    Lemma(Lemma),
}

#[derive(Debug, PartialEq)]
enum ParserState {
    /// Parser is not parsing any particular section and waiting for a
    /// start tag instead.
    Init,

    /// Parser is parsing grammemes.
    Grammemes,

    /// Parser is parsing lemmata.
    Lemmata,

    /// Parser has seen the end of the line and nothing more is
    /// available.
    Ended,
}

/// Pull-style parser over an OpenCorpora XML dump read from `R`.
pub struct OpenCorporaParser<R: std::io::Read> {
    reader: EventReader<R>,
    state: ParserState,
}

#[derive(PartialEq)]
enum SectionState {
    /// Actively interested in parsing this section.
    Active,

    /// Section is known, but currently ignored.
    Inactive,

    /// Section is unknown (probably a bug).
    Unknown,
}

/// Classifies a tag name as an active, inactive or unknown section.
fn section_state(section: &str) -> SectionState {
    match section {
        "grammemes" | "lemmata" => SectionState::Active,
        "restrictions" | "link_types" | "links" => SectionState::Inactive,
        _ => SectionState::Unknown,
    }
}

impl<R: std::io::Read> OpenCorporaParser<R> {
    /// Creates a parser over `reader`, configured to trim whitespace,
    /// starting in the `Init` state.
    pub fn new(reader: R) -> Self {
        let config = xml::ParserConfig::new().trim_whitespace(true);
        let reader = EventReader::new_with_config(reader, config);

        Self {
            reader,
            state: ParserState::Init,
        }
    }

    /// Pull an `OcElement` out of the parser. Returns `None` if the
    /// parser stream has ended.
+ pub fn next_element(&mut self) -> Option { + if self.state == ParserState::Ended { + return None; + } + + // Pull the next element to determine what context to enter + // next. + loop { + match &self.next() { + // no-op events that do not affect parser state + XmlEvent::Comment(_) + | XmlEvent::Whitespace(_) + | XmlEvent::ProcessingInstruction { .. } + | XmlEvent::StartDocument { .. } => continue, + XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name } + if name.local_name == "dictionary" => + { + continue + } + + // end of the file, nothing more to return + XmlEvent::EndDocument => { + self.state = ParserState::Ended; + return None; + } + + // some sections are skipped + XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name } + if section_state(&name.local_name) == SectionState::Inactive => + { + info!("skipping {} section", name.local_name); + self.skip_section(&name.local_name); + } + + // active section events start specific parser states ... + XmlEvent::StartElement { name, .. } + if section_state(&name.local_name) == SectionState::Active => + { + self.state = match name.local_name.as_str() { + "grammemes" => ParserState::Grammemes, + "lemmata" => ParserState::Lemmata, + _ => unreachable!(), + }; + } + + // ... or end them + XmlEvent::EndElement { name, .. } + if section_state(&name.local_name) == SectionState::Active => + { + // TODO: assert that the right section ended + self.state = ParserState::Init; + } + + // actual beginning of an actual element, dispatch accordingly + event @ XmlEvent::StartElement { + name, attributes, .. 
+ } => match self.state { + ParserState::Grammemes => { + return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes))) + } + ParserState::Lemmata => { + return Some(OcElement::Lemma(self.parse_lemma(name, attributes))) + } + + ParserState::Init | ParserState::Ended => bail(format!( + "parser received an unexpected start element while in state {:?}: {:?}", + self.state, event + )), + }, + + // finally, events that indicate a bug if they're + // encountered here + event @ XmlEvent::EndElement { .. } + | event @ XmlEvent::CData(_) + | event @ XmlEvent::Characters(_) => { + bail(format!("unexpected XML event: {:?}", event)) + } + } + } + } + + /// Skip a section by advancing the parser state until we see an + /// end element for the skipped section. + fn skip_section(&mut self, section: &str) { + loop { + match self.next() { + XmlEvent::EndElement { name } if name.local_name == section => return, + _ => continue, + } + } + } + + fn next(&mut self) -> XmlEvent { + self.reader.next().ensure("XML parsing failed") + } + + /// Parse a tag that should have plain string content. 
+ fn parse_string(&mut self, tag_name: &str) -> String { + let mut out = String::new(); + + loop { + match self.next() { + // ignore irrelevant things + XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue, + + // set the content + XmlEvent::Characters(content) => { + out = content; + } + + // expect the end of the element + XmlEvent::EndElement { name } if name.local_name == tag_name => return out, + + // fail on everything unexpected + event => bail(format!( + "unexpected element while parsing <{}>: {:?}", + tag_name, event + )), + } + } + } + + fn parse_grammeme(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Grammeme { + if name.local_name != "grammeme" { + bail(format!( + "expected to parse a grammeme, but found <{}>", + name.local_name + )); + } + + let mut grammeme = Grammeme::default(); + + for attr in attributes { + if attr.name.local_name == "parent" && !attr.value.is_empty() { + grammeme.parent = Some(attr.value.clone()); + } + } + + loop { + match self.next() { + // ignore irrelevant things + XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue, + + // expect known tags + XmlEvent::StartElement { name, .. } if name.local_name == "name" => { + grammeme.name = self.parse_string("name"); + } + + XmlEvent::StartElement { name, .. } if name.local_name == "alias" => { + grammeme.alias = self.parse_string("alias"); + } + + XmlEvent::StartElement { name, .. 
} if name.local_name == "description" => { + grammeme.description = self.parse_string("description"); + } + + // handle end of the grammeme + XmlEvent::EndElement { name } if name.local_name == "grammeme" => break, + + // fail on everything unexpected + event => bail(format!( + "unexpected element while parsing : {:?}", + event + )), + } + } + + grammeme + } + + fn parse_lemma(&mut self, name: &OwnedName, _attributes: &[OwnedAttribute]) -> Lemma { + if name.local_name != "lemma" { + bail(format!( + "expected to parse a lemma, but found <{}>", + name.local_name + )); + } + + self.skip_section("lemma"); + + Lemma {} + } +} -- cgit 1.4.1