diff options
author | Vincent Ambo <mail@tazj.in> | 2023-01-18T00·22+0300 |
---|---|---|
committer | tazjin <tazjin@tvl.su> | 2023-01-18T01·10+0000 |
commit | 485c3cc912a5713a22cd655c0e35d77d686e3ccc (patch) | |
tree | 83f6b7550f02917461582c2918f0a29fde1fc33c /corp/russian/data-import/src | |
parent | ee7616d9563eabf2ae01927bc9d37ccf3e3b3325 (diff) |
feat(corp/data-import): parse lemmas from OpenCorpora dump r/5684
Change-Id: I1e4efcfc8e555f61578b563411d5e6ed9590d8e8
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7860
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Diffstat (limited to 'corp/russian/data-import/src')
-rw-r--r-- | corp/russian/data-import/src/main.rs | 8 | ||||
-rw-r--r-- | corp/russian/data-import/src/oc_parser.rs | 141 |
2 files changed, 135 insertions, 14 deletions
diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs index 336cc3d14f9f..9f2f5089a603 100644 --- a/corp/russian/data-import/src/main.rs +++ b/corp/russian/data-import/src/main.rs @@ -80,11 +80,11 @@ fn main() { let mut out = BufWriter::new(std::io::stdout().lock()); while let Some(elem) = parser.next_element() { - match elem { - oc_parser::OcElement::Grammeme(g) => { - writeln!(out, "{:?}", g).ensure("writing element failed") + if let oc_parser::OcElement::Lemma(lemma) = elem { + if lemma.lemma.word == "тяжёлый" { + writeln!(out, "{:?}", lemma).ensure("writing output failed"); + break; } - oc_parser::OcElement::Lemma(_) => continue, } } diff --git a/corp/russian/data-import/src/oc_parser.rs b/corp/russian/data-import/src/oc_parser.rs index c7a9b8247f64..148c528313c1 100644 --- a/corp/russian/data-import/src/oc_parser.rs +++ b/corp/russian/data-import/src/oc_parser.rs @@ -1,5 +1,6 @@ use super::{bail, Ensure}; use log::info; +use std::str::FromStr; use xml::attribute::OwnedAttribute; use xml::name::OwnedName; use xml::reader::XmlEvent; @@ -7,14 +8,26 @@ use xml::EventReader; #[derive(Default, Debug)] pub struct Grammeme { - parent: Option<String>, - name: String, - alias: String, - description: String, + pub parent: Option<String>, + pub name: String, + pub alias: String, + pub description: String, } -#[derive(Debug)] -pub struct Lemma {} +/// Single form of a word (either its lemma, or the variations). +#[derive(Debug, Default)] +pub struct Variation { + pub word: String, + pub grammemes: Vec<String>, +} + +#[derive(Debug, Default)] +pub struct Lemma { + pub id: u64, + pub lemma: Variation, + pub grammemes: Vec<String>, + pub variations: Vec<Variation>, +} #[derive(Debug)] pub enum OcElement { @@ -34,6 +47,12 @@ enum ParserState { /// Parser is parsing lemmata. Lemmata, + /// Parser is inside a lemma's actual lemma. + Lemma, + + /// Parser is parsing a morphological variation of a lemma. 
+ Variation, + /// Parser has seen the end of the line and nothing more is /// available. Ended, @@ -133,7 +152,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> { // actual beginning of an actual element, dispatch accordingly event @ XmlEvent::StartElement { name, attributes, .. - } => match self.state { + } => match &self.state { ParserState::Grammemes => { return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes))) } @@ -145,6 +164,11 @@ impl<R: std::io::Read> OpenCorporaParser<R> { "parser received an unexpected start element while in state {:?}: {:?}", self.state, event )), + + other => bail(format!( + "next_element() called while parser was in state {:?}", + other + )), }, // finally, events that indicate a bug if they're @@ -199,6 +223,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> { } } + /// Parse a single `<grammeme>` tag. fn parse_grammeme(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Grammeme { if name.local_name != "grammeme" { bail(format!( @@ -247,7 +272,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> { grammeme } - fn parse_lemma(&mut self, name: &OwnedName, _attributes: &[OwnedAttribute]) -> Lemma { + fn parse_lemma(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Lemma { if name.local_name != "lemma" { bail(format!( "expected to parse a lemma, but found <{}>", @@ -255,8 +280,104 @@ impl<R: std::io::Read> OpenCorporaParser<R> { )); } - self.skip_section("lemma"); + self.state = ParserState::Lemma; + let mut lemma = Lemma::default(); - Lemma {} + for attr in attributes { + if attr.name.local_name == "id" { + lemma.id = u64::from_str(&attr.value).ensure("failed to parse lemma ID"); + } + } + + loop { + match self.next() { + // <lemma> has ended + XmlEvent::EndElement { name } if name.local_name == "lemma" => { + self.state = ParserState::Lemmata; + return lemma; + } + + // actual lemma content + XmlEvent::StartElement { + name, attributes, .. 
+ } => { + match name.local_name.as_str() { + // beginning to parse the lemma itself + "l" => { + lemma.lemma.word = attributes + .into_iter() + .find(|attr| attr.name.local_name == "t") + .map(|attr| attr.value) + .ensure(format!("lemma {} had no actual word", lemma.id)); + } + + // parsing a lemma variation + "f" => { + self.state = ParserState::Variation; + + let word = attributes + .into_iter() + .find(|attr| attr.name.local_name == "t") + .map(|attr| attr.value) + .ensure(format!( + "variation of lemma {} had no actual word", + lemma.id + )); + + lemma.variations.push(Variation { + word, + grammemes: vec![], + }); + } + + // parse a grammeme association + "g" => { + let grammeme = attributes + .into_iter() + .find(|attr| attr.name.local_name == "v") + .map(|attr| attr.value) + .ensure(format!( + "grammeme association in lemma {} missing ID", + lemma.id + )); + + match self.state { + ParserState::Lemma => { + lemma.grammemes.push(grammeme); + } + + ParserState::Variation => { + lemma + .variations + .last_mut() + .ensure("variations should be non-empty") + .grammemes + .push(grammeme); + } + + _ => bail(format!("invalid parser state: encountered grammeme association while in {:?}", self.state)), + } + } + + other => bail(format!("unexpected element while parsing lemma: {other}")), + }; + } + + XmlEvent::EndElement { name } => match name.local_name.as_str() { + "l" if self.state == ParserState::Lemma => continue, + "f" if self.state == ParserState::Variation => { + self.state = ParserState::Lemma; + continue; + } + "g" => continue, + other => bail(format!( + "unexpected </{other}> while parsing lemma {}", + lemma.id + )), + }, + + _ => continue, + } + } } } |