From 476e312c06213e53757349a93cc7f855889bc61c Mon Sep 17 00:00:00 2001
From: Vincent Ambo
Date: Wed, 18 Jan 2023 18:23:16 +0300
Subject: feat(corp/data-import): parse and import links

Change-Id: Iebdbc8f884f28064d7b00b8f8808b5030fa3d05c
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7864
Reviewed-by: tazjin
Tested-by: BuildkiteCI
---
 corp/russian/data-import/src/db_setup.rs  | 24 +++++++++++++
 corp/russian/data-import/src/oc_parser.rs | 57 +++++++++++++++++++++++++++++--
 2 files changed, 78 insertions(+), 3 deletions(-)

(limited to 'corp/russian/data-import')

diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs
index d85374dfa8dd..5959ad09e5d6 100644
--- a/corp/russian/data-import/src/db_setup.rs
+++ b/corp/russian/data-import/src/db_setup.rs
@@ -56,6 +56,17 @@ CREATE TABLE link_types (
     name TEXT
 ) STRICT;
 
+-- table for links between lemmata
+CREATE TABLE links (
+    id INTEGER PRIMARY KEY,
+    link_type INTEGER NOT NULL,
+    from_lemma INTEGER NOT NULL,
+    to_lemma INTEGER NOT NULL,
+    FOREIGN KEY(link_type) REFERENCES link_types(id),
+    FOREIGN KEY(from_lemma) REFERENCES lemmas(id),
+    FOREIGN KEY(to_lemma) REFERENCES lemmas(id)
+) STRICT;
+
 "#,
     )
     .ensure("setting up initial table schema failed");
@@ -92,6 +103,19 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
 
             info!("inserted link type {}", lt.name);
         }
+
+        OcElement::Link(link) => {
+            let mut stmt = conn
+                .prepare_cached(
+                    "INSERT INTO links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, ?4)",
+                )
+                .ensure("failed to prepare link statement");
+
+            stmt.execute((&link.id, &link.link_type, &link.from, &link.to))
+                .ensure("failed to insert link");
+
+            debug!("inserted link {}", link.id);
+        }
     }
 }
 
diff --git a/corp/russian/data-import/src/oc_parser.rs b/corp/russian/data-import/src/oc_parser.rs
index faf71883ca7d..8103ebd92369 100644
--- a/corp/russian/data-import/src/oc_parser.rs
+++ b/corp/russian/data-import/src/oc_parser.rs
@@ -1,5 +1,5 @@
 use super::{bail, Ensure};
-use log::info;
+use log::{info, warn};
 use std::str::FromStr;
 use xml::attribute::OwnedAttribute;
 use xml::name::OwnedName;
@@ -35,11 +35,20 @@ pub struct LinkType {
     pub name: String,
 }
 
+#[derive(Debug, Default)]
+pub struct Link {
+    pub id: u64,   // link itself
+    pub from: u64, // lemma
+    pub to: u64,   // lemma
+    pub link_type: u64,
+}
+
 #[derive(Debug)]
 pub enum OcElement {
     Grammeme(Grammeme),
     Lemma(Lemma),
     LinkType(LinkType),
+    Link(Link),
 }
 
 #[derive(Debug, PartialEq)]
@@ -63,6 +72,9 @@ enum ParserState {
     /// Parser is parsing link types.
     LinkTypes,
 
+    /// Parser is parsing links.
+    Links,
+
     /// Parser has seen the end of the line and nothing more is
     /// available.
     Ended,
@@ -87,8 +99,8 @@ enum SectionState {
 
 fn section_state(section: &str) -> SectionState {
     match section {
-        "grammemes" | "lemmata" | "link_types" => SectionState::Active,
-        "restrictions" | "links" => SectionState::Inactive,
+        "grammemes" | "lemmata" | "link_types" | "links" => SectionState::Active,
+        "restrictions" => SectionState::Inactive,
         _ => SectionState::Unknown,
     }
 }
@@ -148,6 +160,7 @@ impl OpenCorporaParser {
                         "grammemes" => ParserState::Grammemes,
                         "lemmata" => ParserState::Lemmata,
                         "link_types" => ParserState::LinkTypes,
+                        "links" => ParserState::Links,
                         _ => unreachable!(),
                     };
                 }
@@ -176,6 +189,10 @@ impl OpenCorporaParser {
                     return Some(OcElement::LinkType(self.parse_link_type(name, attributes)))
                 }
 
+                ParserState::Links if name.local_name == "link" => {
+                    return Some(OcElement::Link(self.parse_link(attributes)))
+                }
+
                 ParserState::Init | ParserState::Ended => bail(format!(
                     "parser received an unexpected start element while in state {:?}: {:?}",
                     self.state, event
@@ -416,4 +433,38 @@ impl OpenCorporaParser {
         link_type.name = self.parse_string("type");
         link_type
     }
+
+    fn parse_link(&mut self, attributes: &[OwnedAttribute]) -> Link {
+        let mut link = Link::default();
+
+        for attr in attributes {
+            let i_val = || u64::from_str(&attr.value).ensure("failed to parse link field");
+
+            match attr.name.local_name.as_str() {
+                "id" => {
+                    link.id = i_val();
+                }
+                "from" => {
+                    link.from = i_val();
+                }
+                "to" => {
+                    link.to = i_val();
+                }
+                "type" => {
+                    link.link_type = i_val();
+                }
+
+                other => {
+                    warn!("unexpected attribute {} on <link>", other);
+                    continue;
+                }
+            }
+        }
+
+        // expect the end of the element, though since these
+        // are empty it should be immediate.
+        self.skip_section("link");
+
+        link
+    }
 }
-- 
cgit 1.4.1