diff options
author | Vincent Ambo <mail@tazj.in> | 2023-01-17T21·36+0300 |
---|---|---|
committer | tazjin <tazjin@tvl.su> | 2023-01-18T01·10+0000 |
commit | ee7616d9563eabf2ae01927bc9d37ccf3e3b3325 (patch) | |
tree | ac43dc06b1f191308182897bd46726f5e9d41783 /corp/russian | |
parent | 032ab16bbbd318704be71af7b569624ddab24802 (diff) |
feat(corp/russian/data-import): new OpenCorpora data import tool r/5683
Adds the beginning of a tool which can import OpenCorpora data into a SQLite database. This is quite a lot of toil and there's probably a better way to do this, but overall becoming this intimately familiar with the data structures is quite helpful for understanding what I can/can't do with only this dataset. Change-Id: Ieab33a8ce07ea4ac87917b9c8132226bbc6523b1 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7859 Reviewed-by: tazjin <tazjin@tvl.su> Tested-by: BuildkiteCI
Diffstat (limited to 'corp/russian')
-rw-r--r-- | corp/russian/data-import/.gitignore | 2 | ||||
-rw-r--r-- | corp/russian/data-import/Cargo.lock | 384 | ||||
-rw-r--r-- | corp/russian/data-import/Cargo.toml | 16 | ||||
-rw-r--r-- | corp/russian/data-import/default.nix | 39 | ||||
-rw-r--r-- | corp/russian/data-import/src/main.rs | 126 | ||||
-rw-r--r-- | corp/russian/data-import/src/oc_parser.rs | 262 |
6 files changed, 829 insertions, 0 deletions
diff --git a/corp/russian/data-import/.gitignore b/corp/russian/data-import/.gitignore new file mode 100644 index 000000000000..e918c641ce99 --- /dev/null +++ b/corp/russian/data-import/.gitignore @@ -0,0 +1,2 @@ +target/ +all_events.txt diff --git a/corp/russian/data-import/Cargo.lock b/corp/russian/data-import/Cargo.lock new file mode 100644 index 000000000000..125b62d43e90 --- /dev/null +++ b/corp/russian/data-import/Cargo.lock @@ -0,0 +1,384 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "ahash" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +dependencies = [ + "getrandom", + "once_cell", + "version_check", +] + +[[package]] +name = "aho-corasick" +version = "0.7.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +dependencies = [ + "memchr", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "cc" +version = "1.0.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a20104e2335ce8a659d6dd92a51a767a0c062599c73b343fd152cb401e828c3d" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "data-import" +version = "0.1.0" +dependencies = [ + "env_logger", + "log", + "rusqlite", + "xml-rs", +] + +[[package]] +name = "env_logger" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0" +dependencies = [ + 
"humantime", + "is-terminal", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "errno" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", + "libc", + "winapi", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "fallible-iterator" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "getrandom" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +dependencies = [ + "ahash", +] + +[[package]] +name = "hashlink" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69fe1fcf8b4278d860ad0548329f892a3631fb63f82574df68275f34cdbe0ffa" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "hermit-abi" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" +dependencies = [ + "libc", +] + +[[package]] +name = "humantime" +version = "2.1.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "io-lifetimes" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "is-terminal" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189" +dependencies = [ + "hermit-abi", + "io-lifetimes", + "rustix", + "windows-sys", +] + +[[package]] +name = "libc" +version = "0.2.139" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" + +[[package]] +name = "libsqlite3-sys" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29f835d03d717946d28b1d1ed632eb6f0e24a299388ee623d0c23118d3e8a7fa" +dependencies = [ + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linux-raw-sys" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" + +[[package]] +name = "log" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "once_cell" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" + +[[package]] +name = "pkg-config" +version 
= "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" + +[[package]] +name = "regex" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" + +[[package]] +name = "rusqlite" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01e213bc3ecb39ac32e81e51ebe31fd888a940515173e3a18a35f8c6e896422a" +dependencies = [ + "bitflags", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "smallvec", +] + +[[package]] +name = "rustix" +version = "0.36.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4feacf7db682c6c329c4ede12649cd36ecab0f3be5b7d74e6a20304725db4549" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "smallvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" + +[[package]] +name = "termcolor" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" + +[[package]] +name = "windows_i686_gnu" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" + +[[package]] +name = "windows_i686_msvc" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" + +[[package]] +name = "xml-rs" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3" diff --git a/corp/russian/data-import/Cargo.toml b/corp/russian/data-import/Cargo.toml new file mode 100644 index 000000000000..b43f829f37b0 --- /dev/null +++ b/corp/russian/data-import/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "data-import" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +env_logger = "0.10.0" +log = "0.4.17" +rusqlite = "0.28" +xml-rs = "0.8" + +[profile.release-with-debug] +inherits = "release" +debug = true diff --git a/corp/russian/data-import/default.nix 
b/corp/russian/data-import/default.nix new file mode 100644 index 000000000000..b4cdc50c25c1 --- /dev/null +++ b/corp/russian/data-import/default.nix @@ -0,0 +1,39 @@ +{ depot, pkgs, ... }: + +let + buildInputs = with pkgs; [ + sqlite + pkg-config + ]; + + # mirrored input data from OpenCorpora, as of 2023-01-17. + # + # This data is licensed under CC-BY-SA. + inputDataArchive = pkgs.fetchurl { + name = "dict.opcorpora.xml.bz"; + url = "https://tazj.in/blobs/dict.opcorpora.xml.bz2"; + sha256 = "04n5g43fkfc93z6xlwf2qfdrfdfl562pc2ivdb3cbbbsy56gkqg6"; + }; + + inputData = pkgs.runCommand "dict.opcorpora.xml" { } '' + ${pkgs.bzip2}/bin/bunzip2 -k -c ${inputDataArchive} > $out + ''; + + # development shell with native deps + shell = pkgs.mkShell { + inherit buildInputs; + + # make OPENCORPORA_DATA available in the environment + OPENCORPORA_DATA = inputData; + }; +in +depot.third_party.naersk.buildPackage { + src = depot.third_party.gitignoreSource ./.; + inherit buildInputs; + + passthru = { + inherit shell; + + + }; +} diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs new file mode 100644 index 000000000000..336cc3d14f9f --- /dev/null +++ b/corp/russian/data-import/src/main.rs @@ -0,0 +1,126 @@ +//! This program imports Russian language data from OpenCorpora +//! ("Открытый корпус") into a SQLite database that can be used for +//! [//corp/russian][corp-russian] projects. +//! +//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian +//! +//! Ideally, running this on an OpenCorpora dump should yield a fully +//! functional SQLite database compatible with all other tools +//! consuming it. +//! +//! ## OpenCorpora format +//! +//! The format used is partially documented on the [OpenCorpora +//! website][format-docs]. This seems to be a slightly outdated +//! format, however, hence some information about what the format +//! seems to be today. +//! +//! [format-docs]: http://opencorpora.org/?page=export +//! +//! 
The format is an XML file, which has several categories of data, +//! each with their own schema: +//! +//! * `grammemes`: These define units of grammar. They're *likely* pretty +//! static, and we'll *likely* want to map them into a custom set of +//! (simpler) categories. +//! +//! They form some kind of internal hierarchy, where some of them have a +//! `parent` attribute set to some other grammeme's `name`. +//! +//! There's a ridiculous number of these. +//! +//! * `restrictions`: Unclear, not documented on the page. They describe +//! something about the relationship between grammemes. +//! +//! * `lemmata`: this lists the actual lemmas, as well as all their +//! included morphological variants. +//! +//! Each lemma has an `id` attribute uniquely identifying its dictionary +//! form, as well as a number of sub-elements: +//! +//! * the `l` element contains the lemma itself +//! * the `f` elements contain morphological variations +//! +//! Each of these sub-elements again contains a number of `g` elements, +//! which refer to the IDs of grammemes in their `v` attributes. +//! +//! * `<link_types>`: These list possible "relationships between lemmas", +//! basically just assigning them IDs and names. There are only 27 of +//! these. +//! +//! * `<links>`: Using the types defined above, this establishes links +//! between lemmas that have some kind of relationship. +//! +//! For example, a relationship `cardinal/ordinal` might be established +//! between the lemmas "два" and "второй". 
+ +use log::{error, info}; +use std::env; +use std::fmt::Display; +use std::fs::File; +use std::io::{BufReader, BufWriter, Write}; + +mod oc_parser; + +fn main() { + env_logger::builder() + .filter_level(log::LevelFilter::Info) + .init(); + + let input_path = env::args() + .skip(1) + .next() + .ensure("must specify the input filename as the only argument"); + + info!("reading from {input_path}"); + let input_file = File::open(input_path).ensure("failed to open input file"); + + let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file)); + + let mut out = BufWriter::new(std::io::stdout().lock()); + + while let Some(elem) = parser.next_element() { + match elem { + oc_parser::OcElement::Grammeme(g) => { + writeln!(out, "{:?}", g).ensure("writing element failed") + } + oc_parser::OcElement::Lemma(_) => continue, + } + } + + out.flush().ensure("flushing the out buffer failed"); +} + +/// It's like `expect`, but through `log::error`. +trait Ensure<T> { + fn ensure<S: Into<String>>(self, msg: S) -> T; +} + +impl<T, E: Display> Ensure<T> for Result<T, E> { + fn ensure<S: Into<String>>(self, msg: S) -> T { + match self { + Ok(x) => x, + Err(err) => { + error!("{}: {}", msg.into(), err); + std::process::exit(1); + } + } + } +} + +impl<T> Ensure<T> for Option<T> { + fn ensure<S: Into<String>>(self, msg: S) -> T { + match self { + Some(x) => x, + None => { + error!("{}", msg.into()); + std::process::exit(1); + } + } + } +} + +fn bail<S: Into<String>>(msg: S) -> ! 
{ + error!("{}", msg.into()); + std::process::exit(1); +} diff --git a/corp/russian/data-import/src/oc_parser.rs b/corp/russian/data-import/src/oc_parser.rs new file mode 100644 index 000000000000..c7a9b8247f64 --- /dev/null +++ b/corp/russian/data-import/src/oc_parser.rs @@ -0,0 +1,262 @@ +use super::{bail, Ensure}; +use log::info; +use xml::attribute::OwnedAttribute; +use xml::name::OwnedName; +use xml::reader::XmlEvent; +use xml::EventReader; + +#[derive(Default, Debug)] +pub struct Grammeme { + parent: Option<String>, + name: String, + alias: String, + description: String, +} + +#[derive(Debug)] +pub struct Lemma {} + +#[derive(Debug)] +pub enum OcElement { + Grammeme(Grammeme), + Lemma(Lemma), +} + +#[derive(Debug, PartialEq)] +enum ParserState { + /// Parser is not parsing any particular section and waiting for a + /// start tag instead. + Init, + + /// Parser is parsing grammemes. + Grammemes, + + /// Parser is parsing lemmata. + Lemmata, + + /// Parser has seen the end of the line and nothing more is + /// available. + Ended, +} + +pub struct OpenCorporaParser<R: std::io::Read> { + reader: EventReader<R>, + state: ParserState, +} + +#[derive(PartialEq)] +enum SectionState { + /// Actively interested in parsing this section. + Active, + + /// Section is known, but currently ignored. + Inactive, + + /// Section is unknown (probably a bug). + Unknown, +} + +fn section_state(section: &str) -> SectionState { + match section { + "grammemes" | "lemmata" => SectionState::Active, + "restrictions" | "link_types" | "links" => SectionState::Inactive, + _ => SectionState::Unknown, + } +} + +impl<R: std::io::Read> OpenCorporaParser<R> { + pub fn new(reader: R) -> Self { + let config = xml::ParserConfig::new().trim_whitespace(true); + let reader = EventReader::new_with_config(reader, config); + + Self { + reader, + state: ParserState::Init, + } + } + + /// Pull an `OcElement` out of the parser. Returns `None` if the + /// parser stream has ended. 
+ pub fn next_element(&mut self) -> Option<OcElement> { + if self.state == ParserState::Ended { + return None; + } + + // Pull the next element to determine what context to enter + // next. + loop { + match &self.next() { + // no-op events that do not affect parser state + XmlEvent::Comment(_) + | XmlEvent::Whitespace(_) + | XmlEvent::ProcessingInstruction { .. } + | XmlEvent::StartDocument { .. } => continue, + XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name } + if name.local_name == "dictionary" => + { + continue + } + + // end of the file, nothing more to return + XmlEvent::EndDocument => { + self.state = ParserState::Ended; + return None; + } + + // some sections are skipped + XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name } + if section_state(&name.local_name) == SectionState::Inactive => + { + info!("skipping {} section", name.local_name); + self.skip_section(&name.local_name); + } + + // active section events start specific parser states ... + XmlEvent::StartElement { name, .. } + if section_state(&name.local_name) == SectionState::Active => + { + self.state = match name.local_name.as_str() { + "grammemes" => ParserState::Grammemes, + "lemmata" => ParserState::Lemmata, + _ => unreachable!(), + }; + } + + // ... or end them + XmlEvent::EndElement { name, .. } + if section_state(&name.local_name) == SectionState::Active => + { + // TODO: assert that the right section ended + self.state = ParserState::Init; + } + + // actual beginning of an actual element, dispatch accordingly + event @ XmlEvent::StartElement { + name, attributes, .. 
+ } => match self.state { + ParserState::Grammemes => { + return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes))) + } + ParserState::Lemmata => { + return Some(OcElement::Lemma(self.parse_lemma(name, attributes))) + } + + ParserState::Init | ParserState::Ended => bail(format!( + "parser received an unexpected start element while in state {:?}: {:?}", + self.state, event + )), + }, + + // finally, events that indicate a bug if they're + // encountered here + event @ XmlEvent::EndElement { .. } + | event @ XmlEvent::CData(_) + | event @ XmlEvent::Characters(_) => { + bail(format!("unexpected XML event: {:?}", event)) + } + } + } + } + + /// Skip a section by advancing the parser state until we see an + /// end element for the skipped section. + fn skip_section(&mut self, section: &str) { + loop { + match self.next() { + XmlEvent::EndElement { name } if name.local_name == section => return, + _ => continue, + } + } + } + + fn next(&mut self) -> XmlEvent { + self.reader.next().ensure("XML parsing failed") + } + + /// Parse a tag that should have plain string content. 
+ fn parse_string(&mut self, tag_name: &str) -> String { + let mut out = String::new(); + + loop { + match self.next() { + // ignore irrelevant things + XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue, + + // set the content + XmlEvent::Characters(content) => { + out = content; + } + + // expect the end of the element + XmlEvent::EndElement { name } if name.local_name == tag_name => return out, + + // fail on everything unexpected + event => bail(format!( + "unexpected element while parsing <{}>: {:?}", + tag_name, event + )), + } + } + } + + fn parse_grammeme(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Grammeme { + if name.local_name != "grammeme" { + bail(format!( + "expected to parse a grammeme, but found <{}>", + name.local_name + )); + } + + let mut grammeme = Grammeme::default(); + + for attr in attributes { + if attr.name.local_name == "parent" && !attr.value.is_empty() { + grammeme.parent = Some(attr.value.clone()); + } + } + + loop { + match self.next() { + // ignore irrelevant things + XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue, + + // expect known tags + XmlEvent::StartElement { name, .. } if name.local_name == "name" => { + grammeme.name = self.parse_string("name"); + } + + XmlEvent::StartElement { name, .. } if name.local_name == "alias" => { + grammeme.alias = self.parse_string("alias"); + } + + XmlEvent::StartElement { name, .. 
} if name.local_name == "description" => { + grammeme.description = self.parse_string("description"); + } + + // handle end of the grammeme + XmlEvent::EndElement { name } if name.local_name == "grammeme" => break, + + // fail on everything unexpected + event => bail(format!( + "unexpected element while parsing <grammeme>: {:?}", + event + )), + } + } + + grammeme + } + + fn parse_lemma(&mut self, name: &OwnedName, _attributes: &[OwnedAttribute]) -> Lemma { + if name.local_name != "lemma" { + bail(format!( + "expected to parse a lemma, but found <{}>", + name.local_name + )); + } + + self.skip_section("lemma"); + + Lemma {} + } +} |