about summary refs log tree commit diff
path: root/corp/russian
diff options
context:
space:
mode:
author Vincent Ambo <mail@tazj.in> 2023-01-18T12·48+0300
committer tazjin <tazjin@tvl.su> 2023-01-18T15·44+0000
commit dc55ea320101282f200c0117ebdf7033f3bae927 (patch)
tree dfa3b408d5929185ba13af5456eb4df1e8dc7154 /corp/russian
parent 3f0b1d8e0b518c6f4684f65ae421f71864176d99 (diff)
feat(corp/data-import): parse and import link types r/5690
Change-Id: Iae01d1dc6894117dc693b4690d8bc79861212ae6
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7863
Tested-by: BuildkiteCI
Reviewed-by: tazjin <tazjin@tvl.su>
Diffstat (limited to 'corp/russian')
-rw-r--r-- corp/russian/data-import/src/db_setup.rs 16
-rw-r--r-- corp/russian/data-import/src/oc_parser.rs 40
2 files changed, 54 insertions, 2 deletions
diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs
index 1a9e2dd879..d85374dfa8 100644
--- a/corp/russian/data-import/src/db_setup.rs
+++ b/corp/russian/data-import/src/db_setup.rs
@@ -50,6 +50,12 @@ CREATE TABLE word_grammemes (
     FOREIGN KEY(word) REFERENCES words(ROWID)
 ) STRICT;
 
+-- table for link types
+CREATE TABLE link_types (
+  id INTEGER PRIMARY KEY,
+  name TEXT
+) STRICT;
+
 "#,
     )
     .ensure("setting up initial table schema failed");
@@ -76,6 +82,16 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
         }
 
         OcElement::Lemma(lemma) => insert_lemma(conn, lemma),
+
+        OcElement::LinkType(lt) => {
+            conn.execute(
+                "INSERT INTO link_types (id, name) VALUES (?1, ?2)",
+                (&lt.id, &lt.name),
+            )
+            .ensure("failed to insert link type");
+
+            info!("inserted link type {}", lt.name);
+        }
     }
 }
 
diff --git a/corp/russian/data-import/src/oc_parser.rs b/corp/russian/data-import/src/oc_parser.rs
index 148c528313..faf71883ca 100644
--- a/corp/russian/data-import/src/oc_parser.rs
+++ b/corp/russian/data-import/src/oc_parser.rs
@@ -29,10 +29,17 @@ pub struct Lemma {
     pub variations: Vec<Variation>,
 }
 
+#[derive(Debug, Default)]
+pub struct LinkType {
+    pub id: u64,
+    pub name: String,
+}
+
 #[derive(Debug)]
 pub enum OcElement {
     Grammeme(Grammeme),
     Lemma(Lemma),
+    LinkType(LinkType),
 }
 
 #[derive(Debug, PartialEq)]
@@ -53,6 +60,9 @@ enum ParserState {
     /// Parser is parsing a morphological variation of a lemma.
     Variation,
 
+    /// Parser is parsing link types.
+    LinkTypes,
+
     /// Parser has seen the end of the line and nothing more is
     /// available.
     Ended,
@@ -77,8 +87,8 @@ enum SectionState {
 
 fn section_state(section: &str) -> SectionState {
     match section {
-        "grammemes" | "lemmata" => SectionState::Active,
-        "restrictions" | "link_types" | "links" => SectionState::Inactive,
+        "grammemes" | "lemmata" | "link_types" => SectionState::Active,
+        "restrictions" | "links" => SectionState::Inactive,
         _ => SectionState::Unknown,
     }
 }
@@ -137,6 +147,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
                     self.state = match name.local_name.as_str() {
                         "grammemes" => ParserState::Grammemes,
                         "lemmata" => ParserState::Lemmata,
+                        "link_types" => ParserState::LinkTypes,
                         _ => unreachable!(),
                     };
                 }
@@ -156,10 +167,15 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
                     ParserState::Grammemes => {
                         return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes)))
                     }
+
                     ParserState::Lemmata => {
                         return Some(OcElement::Lemma(self.parse_lemma(name, attributes)))
                     }
 
+                    ParserState::LinkTypes => {
+                        return Some(OcElement::LinkType(self.parse_link_type(name, attributes)))
+                    }
+
                     ParserState::Init | ParserState::Ended => bail(format!(
                         "parser received an unexpected start element while in state {:?}: {:?}",
                         self.state, event
@@ -380,4 +396,24 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
             }
         }
     }
+
+    fn parse_link_type(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> LinkType {
+        if name.local_name != "type" {
+            bail(format!(
+                "expected to parse a link type, but found <{}>",
+                name.local_name
+            ));
+        }
+
+        let mut link_type = LinkType::default();
+
+        for attr in attributes {
+            if attr.name.local_name == "id" {
+                link_type.id = u64::from_str(&attr.value).ensure("failed to parse link type ID");
+            }
+        }
+
+        link_type.name = self.parse_string("type");
+        link_type
+    }
 }