From 485c3cc912a5713a22cd655c0e35d77d686e3ccc Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Wed, 18 Jan 2023 03:22:53 +0300 Subject: feat(corp/data-import): parse lemmas from OpenCorpora dump Change-Id: I1e4efcfc8e555f61578b563411d5e6ed9590d8e8 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7860 Reviewed-by: tazjin Tested-by: BuildkiteCI --- corp/russian/data-import/src/main.rs | 8 +- corp/russian/data-import/src/oc_parser.rs | 141 +++++++++++++++++++++++++++--- 2 files changed, 135 insertions(+), 14 deletions(-) diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs index 336cc3d14f9f..9f2f5089a603 100644 --- a/corp/russian/data-import/src/main.rs +++ b/corp/russian/data-import/src/main.rs @@ -80,11 +80,11 @@ fn main() { let mut out = BufWriter::new(std::io::stdout().lock()); while let Some(elem) = parser.next_element() { - match elem { - oc_parser::OcElement::Grammeme(g) => { - writeln!(out, "{:?}", g).ensure("writing element failed") + if let oc_parser::OcElement::Lemma(lemma) = elem { + if lemma.lemma.word == "тяжёлый" { + writeln!(out, "{:?}", lemma).ensure("writing output failed"); + break; } - oc_parser::OcElement::Lemma(_) => continue, } } diff --git a/corp/russian/data-import/src/oc_parser.rs b/corp/russian/data-import/src/oc_parser.rs index c7a9b8247f64..148c528313c1 100644 --- a/corp/russian/data-import/src/oc_parser.rs +++ b/corp/russian/data-import/src/oc_parser.rs @@ -1,5 +1,6 @@ use super::{bail, Ensure}; use log::info; +use std::str::FromStr; use xml::attribute::OwnedAttribute; use xml::name::OwnedName; use xml::reader::XmlEvent; @@ -7,14 +8,26 @@ use xml::EventReader; #[derive(Default, Debug)] pub struct Grammeme { - parent: Option<String>, - name: String, - alias: String, - description: String, + pub parent: Option<String>, + pub name: String, + pub alias: String, + pub description: String, } -#[derive(Debug)] -pub struct Lemma {} +/// Single form of a word (either its lemma, or the variations). 
+#[derive(Debug, Default)] +pub struct Variation { + pub word: String, + pub grammemes: Vec<String>, +} + +#[derive(Debug, Default)] +pub struct Lemma { + pub id: u64, + pub lemma: Variation, + pub grammemes: Vec<String>, + pub variations: Vec<Variation>, +} #[derive(Debug)] pub enum OcElement { @@ -34,6 +47,12 @@ enum ParserState { /// Parser is parsing lemmata. Lemmata, + /// Parser is inside a lemma's actual lemma. + Lemma, + + /// Parser is parsing a morphological variation of a lemma. + Variation, + /// Parser has seen the end of the line and nothing more is /// available. Ended, @@ -133,7 +152,7 @@ impl OpenCorporaParser { // actual beginning of an actual element, dispatch accordingly event @ XmlEvent::StartElement { name, attributes, .. - } => match self.state { + } => match &self.state { ParserState::Grammemes => { return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes))) } @@ -145,6 +164,11 @@ impl OpenCorporaParser { "parser received an unexpected start element while in state {:?}: {:?}", self.state, event )), + + other => bail(format!( + "next_element() called while parser was in state {:?}", + other + )), }, // finally, events that indicate a bug if they're @@ -199,6 +223,7 @@ impl OpenCorporaParser { } } + /// Parse a single `<grammeme>` tag. 
+ fn parse_grammeme(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Grammeme { if name.local_name != "grammeme" { bail(format!( @@ -247,7 +272,7 @@ impl OpenCorporaParser { grammeme } - fn parse_lemma(&mut self, name: &OwnedName, _attributes: &[OwnedAttribute]) -> Lemma { + fn parse_lemma(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Lemma { if name.local_name != "lemma" { bail(format!( "expected to parse a lemma, but found <{}>", @@ -255,8 +280,104 @@ impl OpenCorporaParser { )); } - self.skip_section("lemma"); + self.state = ParserState::Lemma; + let mut lemma = Lemma::default(); - Lemma {} + for attr in attributes { + if attr.name.local_name == "id" { + lemma.id = u64::from_str(&attr.value).ensure("failed to parse lemma ID"); + } + } + + loop { + match self.next() { + // <lemma> has ended + XmlEvent::EndElement { name } if name.local_name == "lemma" => { + self.state = ParserState::Lemmata; + return lemma; + } + + // actual lemma content + XmlEvent::StartElement { + name, attributes, .. 
+ } => { + match name.local_name.as_str() { + // beginning to parse the lemma itself + "l" => { + lemma.lemma.word = attributes + .into_iter() + .find(|attr| attr.name.local_name == "t") + .map(|attr| attr.value) + .ensure(format!("lemma {} had no actual word", lemma.id)); + } + + // parsing a lemma variation + "f" => { + self.state = ParserState::Variation; + + let word = attributes + .into_iter() + .find(|attr| attr.name.local_name == "t") + .map(|attr| attr.value) + .ensure(format!( + "variation of lemma {} had no actual word", + lemma.id + )); + + lemma.variations.push(Variation { + word, + grammemes: vec![], + }); + } + + // parse a grammeme association + "g" => { + let grammeme = attributes + .into_iter() + .find(|attr| attr.name.local_name == "v") + .map(|attr| attr.value) + .ensure(format!( + "grammeme association in lemma {} missing ID", + lemma.id + )); + + match self.state { + ParserState::Lemma => { + lemma.grammemes.push(grammeme); + } + + ParserState::Variation => { + lemma + .variations + .last_mut() + .ensure("variations should be non-empty") + .grammemes + .push(grammeme); + } + + _ => bail(format!("invalid parser state: encountered grammeme association while in {:?}", self.state)), + } + } + + other => bail(format!("unexpected element while parsing lemma: {other}")), + }; + } + + XmlEvent::EndElement { name } => match name.local_name.as_str() { + "l" if self.state == ParserState::Lemma => continue, + "f" if self.state == ParserState::Variation => { + self.state = ParserState::Lemma; + continue; + } + "g" => continue, + other => bail(format!( + "unexpected </{other}> while parsing lemma {}", + lemma.id + )), + }, + + _ => continue, + } + } } } -- cgit 1.4.1