From dc55ea320101282f200c0117ebdf7033f3bae927 Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Wed, 18 Jan 2023 15:48:33 +0300 Subject: feat(corp/data-import): parse and import link types Change-Id: Iae01d1dc6894117dc693b4690d8bc79861212ae6 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7863 Tested-by: BuildkiteCI Reviewed-by: tazjin --- corp/russian/data-import/src/db_setup.rs | 16 +++++++++++++ corp/russian/data-import/src/oc_parser.rs | 40 +++++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 2 deletions(-) (limited to 'corp/russian') diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs index 1a9e2dd879..d85374dfa8 100644 --- a/corp/russian/data-import/src/db_setup.rs +++ b/corp/russian/data-import/src/db_setup.rs @@ -50,6 +50,12 @@ CREATE TABLE word_grammemes ( FOREIGN KEY(word) REFERENCES words(ROWID) ) STRICT; +-- table for link types +CREATE TABLE link_types ( + id INTEGER PRIMARY KEY, + name TEXT +) STRICT; + "#, ) .ensure("setting up initial table schema failed"); @@ -76,6 +82,16 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) { } OcElement::Lemma(lemma) => insert_lemma(conn, lemma), + + OcElement::LinkType(lt) => { + conn.execute( + "INSERT INTO link_types (id, name) VALUES (?1, ?2)", + (&lt.id, &lt.name), + ) + .ensure("failed to insert link type"); + + info!("inserted link type {}", lt.name); + } } } diff --git a/corp/russian/data-import/src/oc_parser.rs b/corp/russian/data-import/src/oc_parser.rs index 148c528313..faf71883ca 100644 --- a/corp/russian/data-import/src/oc_parser.rs +++ b/corp/russian/data-import/src/oc_parser.rs @@ -29,10 +29,17 @@ pub struct Lemma { pub variations: Vec, } +#[derive(Debug, Default)] +pub struct LinkType { + pub id: u64, + pub name: String, +} + #[derive(Debug)] pub enum OcElement { Grammeme(Grammeme), Lemma(Lemma), + LinkType(LinkType), } #[derive(Debug, PartialEq)] @@ -53,6 +60,9 @@ enum ParserState { /// Parser is parsing a morphological variation of a 
lemma. Variation, + /// Parser is parsing link types. + LinkTypes, + /// Parser has seen the end of the line and nothing more is /// available. Ended, @@ -77,8 +87,8 @@ enum SectionState { fn section_state(section: &str) -> SectionState { match section { - "grammemes" | "lemmata" => SectionState::Active, - "restrictions" | "link_types" | "links" => SectionState::Inactive, + "grammemes" | "lemmata" | "link_types" => SectionState::Active, + "restrictions" | "links" => SectionState::Inactive, _ => SectionState::Unknown, } } @@ -137,6 +147,7 @@ impl OpenCorporaParser { self.state = match name.local_name.as_str() { "grammemes" => ParserState::Grammemes, "lemmata" => ParserState::Lemmata, + "link_types" => ParserState::LinkTypes, _ => unreachable!(), }; } @@ -156,10 +167,15 @@ impl OpenCorporaParser { ParserState::Grammemes => { return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes))) } + ParserState::Lemmata => { return Some(OcElement::Lemma(self.parse_lemma(name, attributes))) } + ParserState::LinkTypes => { + return Some(OcElement::LinkType(self.parse_link_type(name, attributes))) + } + ParserState::Init | ParserState::Ended => bail(format!( "parser received an unexpected start element while in state {:?}: {:?}", self.state, event @@ -380,4 +396,24 @@ impl OpenCorporaParser { } } } + + fn parse_link_type(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> LinkType { + if name.local_name != "type" { + bail(format!( + "expected to parse a link type, but found <{}>", + name.local_name + )); + } + + let mut link_type = LinkType::default(); + + for attr in attributes { + if attr.name.local_name == "id" { + link_type.id = u64::from_str(&attr.value).ensure("failed to parse link type ID"); + } + } + + link_type.name = self.parse_string("type"); + link_type + } } -- cgit 1.4.1