about summary refs log tree commit diff
path: root/corp/russian/data-import/src
diff options
context:
space:
mode:
author: Vincent Ambo <mail@tazj.in> 2023-01-18T00·22+0300
committer: tazjin <tazjin@tvl.su> 2023-01-18T01·10+0000
commit: 485c3cc912a5713a22cd655c0e35d77d686e3ccc (patch)
tree: 83f6b7550f02917461582c2918f0a29fde1fc33c /corp/russian/data-import/src
parent: ee7616d9563eabf2ae01927bc9d37ccf3e3b3325 (diff)
feat(corp/data-import): parse lemmas from OpenCorpora dump r/5684
Change-Id: I1e4efcfc8e555f61578b563411d5e6ed9590d8e8
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7860
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Diffstat (limited to 'corp/russian/data-import/src')
-rw-r--r-- corp/russian/data-import/src/main.rs 8
-rw-r--r-- corp/russian/data-import/src/oc_parser.rs 141
2 files changed, 135 insertions, 14 deletions
diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs
index 336cc3d14f9f..9f2f5089a603 100644
--- a/corp/russian/data-import/src/main.rs
+++ b/corp/russian/data-import/src/main.rs
@@ -80,11 +80,11 @@ fn main() {
     let mut out = BufWriter::new(std::io::stdout().lock());
 
     while let Some(elem) = parser.next_element() {
-        match elem {
-            oc_parser::OcElement::Grammeme(g) => {
-                writeln!(out, "{:?}", g).ensure("writing element failed")
+        if let oc_parser::OcElement::Lemma(lemma) = elem {
+            if lemma.lemma.word == "тяжёлый" {
+                writeln!(out, "{:?}", lemma).ensure("writing output failed");
+                break;
             }
-            oc_parser::OcElement::Lemma(_) => continue,
         }
     }
 
diff --git a/corp/russian/data-import/src/oc_parser.rs b/corp/russian/data-import/src/oc_parser.rs
index c7a9b8247f64..148c528313c1 100644
--- a/corp/russian/data-import/src/oc_parser.rs
+++ b/corp/russian/data-import/src/oc_parser.rs
@@ -1,5 +1,6 @@
 use super::{bail, Ensure};
 use log::info;
+use std::str::FromStr;
 use xml::attribute::OwnedAttribute;
 use xml::name::OwnedName;
 use xml::reader::XmlEvent;
@@ -7,14 +8,26 @@ use xml::EventReader;
 
 #[derive(Default, Debug)]
 pub struct Grammeme {
-    parent: Option<String>,
-    name: String,
-    alias: String,
-    description: String,
+    pub parent: Option<String>,
+    pub name: String,
+    pub alias: String,
+    pub description: String,
 }
 
-#[derive(Debug)]
-pub struct Lemma {}
+/// Single form of a word (either its lemma, or the variations).
+#[derive(Debug, Default)]
+pub struct Variation {
+    pub word: String,
+    pub grammemes: Vec<String>,
+}
+
+#[derive(Debug, Default)]
+pub struct Lemma {
+    pub id: u64,
+    pub lemma: Variation,
+    pub grammemes: Vec<String>,
+    pub variations: Vec<Variation>,
+}
 
 #[derive(Debug)]
 pub enum OcElement {
@@ -34,6 +47,12 @@ enum ParserState {
     /// Parser is parsing lemmata.
     Lemmata,
 
+    /// Parser is inside a lemma's actual lemma.
+    Lemma,
+
+    /// Parser is parsing a morphological variation of a lemma.
+    Variation,
+
     /// Parser has seen the end of the line and nothing more is
     /// available.
     Ended,
@@ -133,7 +152,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
                 // actual beginning of an actual element, dispatch accordingly
                 event @ XmlEvent::StartElement {
                     name, attributes, ..
-                } => match self.state {
+                } => match &self.state {
                     ParserState::Grammemes => {
                         return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes)))
                     }
@@ -145,6 +164,11 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
                         "parser received an unexpected start element while in state {:?}: {:?}",
                         self.state, event
                     )),
+
+                    other => bail(format!(
+                        "next_element() called while parser was in state {:?}",
+                        other
+                    )),
                 },
 
                 // finally, events that indicate a bug if they're
@@ -199,6 +223,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
         }
     }
 
+    /// Parse a single `<grammeme>` tag.
     fn parse_grammeme(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Grammeme {
         if name.local_name != "grammeme" {
             bail(format!(
@@ -247,7 +272,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
         grammeme
     }
 
-    fn parse_lemma(&mut self, name: &OwnedName, _attributes: &[OwnedAttribute]) -> Lemma {
+    fn parse_lemma(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Lemma {
         if name.local_name != "lemma" {
             bail(format!(
                 "expected to parse a lemma, but found <{}>",
@@ -255,8 +280,104 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
             ));
         }
 
-        self.skip_section("lemma");
+        self.state = ParserState::Lemma;
+        let mut lemma = Lemma::default();
 
-        Lemma {}
+        for attr in attributes {
+            if attr.name.local_name == "id" {
+                lemma.id = u64::from_str(&attr.value).ensure("failed to parse lemma ID");
+            }
+        }
+
+        loop {
+            match self.next() {
+                // <lemma> has ended
+                XmlEvent::EndElement { name } if name.local_name == "lemma" => {
+                    self.state = ParserState::Lemmata;
+                    return lemma;
+                }
+
+                // actual lemma content
+                XmlEvent::StartElement {
+                    name, attributes, ..
+                } => {
+                    match name.local_name.as_str() {
+                        // beginning to parse the lemma itself
+                        "l" => {
+                            lemma.lemma.word = attributes
+                                .into_iter()
+                                .find(|attr| attr.name.local_name == "t")
+                                .map(|attr| attr.value)
+                                .ensure(format!("lemma {} had no actual word", lemma.id));
+                        }
+
+                        // parsing a lemma variation
+                        "f" => {
+                            self.state = ParserState::Variation;
+
+                            let word = attributes
+                                .into_iter()
+                                .find(|attr| attr.name.local_name == "t")
+                                .map(|attr| attr.value)
+                                .ensure(format!(
+                                    "variation of lemma {} had no actual word",
+                                    lemma.id
+                                ));
+
+                            lemma.variations.push(Variation {
+                                word,
+                                grammemes: vec![],
+                            });
+                        }
+
+                        // parse a grammeme association
+                        "g" => {
+                            let grammeme = attributes
+                                .into_iter()
+                                .find(|attr| attr.name.local_name == "v")
+                                .map(|attr| attr.value)
+                                .ensure(format!(
+                                    "grammeme association in lemma {} missing ID",
+                                    lemma.id
+                                ));
+
+                            match self.state {
+                                ParserState::Lemma => {
+                                    lemma.grammemes.push(grammeme);
+                                }
+
+                                ParserState::Variation => {
+                                    lemma
+                                        .variations
+                                        .last_mut()
+                                        .ensure("variations should be non-empty")
+                                        .grammemes
+                                        .push(grammeme);
+                                }
+
+                                _ => bail(format!("invalid parser state: encountered grammeme association while in {:?}", self.state)),
+                            }
+                        }
+
+                        other => bail(format!("unexpected element while parsing lemma: {other}")),
+                    };
+                }
+
+                XmlEvent::EndElement { name } => match name.local_name.as_str() {
+                    "l" if self.state == ParserState::Lemma => continue,
+                    "f" if self.state == ParserState::Variation => {
+                        self.state = ParserState::Lemma;
+                        continue;
+                    }
+                    "g" => continue,
+                    other => bail(format!(
+                        "unexpected </{other}> while parsing lemma {}",
+                        lemma.id
+                    )),
+                },
+
+                _ => continue,
+            }
+        }
     }
 }