about summary refs log tree commit diff
path: root/corp
diff options
context:
space:
mode:
author Vincent Ambo <mail@tazj.in> 2023-01-18T15·23+0300
committer tazjin <tazjin@tvl.su> 2023-01-18T15·44+0000
commit 476e312c06213e53757349a93cc7f855889bc61c (patch)
tree de6770c6a8a806909ea69424294e783559c3de97 /corp
parent dc55ea320101282f200c0117ebdf7033f3bae927 (diff)
feat(corp/data-import): parse and import links r/5691
Change-Id: Iebdbc8f884f28064d7b00b8f8808b5030fa3d05c
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7864
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Diffstat (limited to 'corp')
-rw-r--r-- corp/russian/data-import/src/db_setup.rs 24
-rw-r--r-- corp/russian/data-import/src/oc_parser.rs 57
2 files changed, 78 insertions, 3 deletions
diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs
index d85374dfa8dd..5959ad09e5d6 100644
--- a/corp/russian/data-import/src/db_setup.rs
+++ b/corp/russian/data-import/src/db_setup.rs
@@ -56,6 +56,17 @@ CREATE TABLE link_types (
   name TEXT
 ) STRICT;
 
+-- table for links between lemmata
+CREATE TABLE links (
+  id INTEGER PRIMARY KEY,
+  link_type INTEGER NOT NULL,
+  from_lemma INTEGER NOT NULL,
+  to_lemma INTEGER NOT NULL,
+  FOREIGN KEY(link_type) REFERENCES link_types(id),
+  FOREIGN KEY(from_lemma) REFERENCES lemmas(id),
+  FOREIGN KEY(to_lemma) REFERENCES lemmas(id)
+) STRICT;
+
 "#,
     )
     .ensure("setting up initial table schema failed");
@@ -92,6 +103,19 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
 
             info!("inserted link type {}", lt.name);
         }
+
+        OcElement::Link(link) => {
+            let mut stmt = conn
+                .prepare_cached(
+                    "INSERT INTO links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, ?4)",
+                )
+                .ensure("failed to prepare link statement");
+
+            stmt.execute((&link.id, &link.link_type, &link.from, &link.to))
+                .ensure("failed to insert link");
+
+            debug!("inserted link {}", link.id);
+        }
     }
 }
 
diff --git a/corp/russian/data-import/src/oc_parser.rs b/corp/russian/data-import/src/oc_parser.rs
index faf71883ca7d..8103ebd92369 100644
--- a/corp/russian/data-import/src/oc_parser.rs
+++ b/corp/russian/data-import/src/oc_parser.rs
@@ -1,5 +1,5 @@
 use super::{bail, Ensure};
-use log::info;
+use log::{info, warn};
 use std::str::FromStr;
 use xml::attribute::OwnedAttribute;
 use xml::name::OwnedName;
@@ -35,11 +35,20 @@ pub struct LinkType {
     pub name: String,
 }
 
+#[derive(Debug, Default)]
+pub struct Link {
+    pub id: u64,   // ID of the link itself (stored as links.id)
+    pub from: u64, // ID of the lemma the link starts from (links.from_lemma)
+    pub to: u64,   // ID of the lemma the link points to (links.to_lemma)
+    pub link_type: u64, // ID of the link's type (links.link_type)
+}
+
 #[derive(Debug)]
 pub enum OcElement {
     Grammeme(Grammeme),
     Lemma(Lemma),
     LinkType(LinkType),
+    Link(Link),
 }
 
 #[derive(Debug, PartialEq)]
@@ -63,6 +72,9 @@ enum ParserState {
     /// Parser is parsing link types.
     LinkTypes,
 
+    /// Parser is parsing links.
+    Links,
+
     /// Parser has seen the end of the line and nothing more is
     /// available.
     Ended,
@@ -87,8 +99,8 @@ enum SectionState {
 
 fn section_state(section: &str) -> SectionState {
     match section {
-        "grammemes" | "lemmata" | "link_types" => SectionState::Active,
-        "restrictions" | "links" => SectionState::Inactive,
+        "grammemes" | "lemmata" | "link_types" | "links" => SectionState::Active,
+        "restrictions" => SectionState::Inactive,
         _ => SectionState::Unknown,
     }
 }
@@ -148,6 +160,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
                         "grammemes" => ParserState::Grammemes,
                         "lemmata" => ParserState::Lemmata,
                         "link_types" => ParserState::LinkTypes,
+                        "links" => ParserState::Links,
                         _ => unreachable!(),
                     };
                 }
@@ -176,6 +189,10 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
                         return Some(OcElement::LinkType(self.parse_link_type(name, attributes)))
                     }
 
+                    ParserState::Links if name.local_name == "link" => {
+                        return Some(OcElement::Link(self.parse_link(attributes)))
+                    }
+
                     ParserState::Init | ParserState::Ended => bail(format!(
                         "parser received an unexpected start element while in state {:?}: {:?}",
                         self.state, event
@@ -416,4 +433,38 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
         link_type.name = self.parse_string("type");
         link_type
     }
+
+    fn parse_link(&mut self, attributes: &[OwnedAttribute]) -> Link {
+        let mut link = Link::default(); // fields with no matching attribute stay at 0
+
+        for attr in attributes {
+            let i_val = || u64::from_str(&attr.value).ensure("failed to parse link field");
+
+            match attr.name.local_name.as_str() {
+                "id" => {
+                    link.id = i_val();
+                }
+                "from" => {
+                    link.from = i_val();
+                }
+                "to" => {
+                    link.to = i_val();
+                }
+                "type" => {
+                    link.link_type = i_val();
+                }
+
+                other => {
+                    warn!("unexpected attribute {} on <link>", other);
+                    continue; // unknown attributes are logged and otherwise ignored
+                }
+            }
+        }
+
+        // expect the end of the <link> element; since <link> elements
+        // are empty, this should happen immediately.
+        self.skip_section("link");
+
+        link
+    }
 }