use super::{bail, Ensure};
use log::{info, warn};
use std::str::FromStr;
use xml::attribute::OwnedAttribute;
use xml::name::OwnedName;
use xml::reader::XmlEvent;
use xml::EventReader;
/// A grammeme definition, as read from a `<grammeme>` element.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct Grammeme {
    /// Value of the `parent` attribute; `None` when the attribute is
    /// missing or empty.
    pub parent: Option<String>,
    /// Text content of the `<name>` child element.
    pub name: String,
    /// Text content of the `<alias>` child element.
    pub alias: String,
    /// Text content of the `<description>` child element.
    pub description: String,
}
/// Single form of a word (either its lemma, or the variations).
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct Variation {
    /// The word form itself (the `t` attribute of `<l>` or `<f>`).
    pub word: String,
    /// Grammeme identifiers attached to this form (the `v` attributes of
    /// the `<g>` elements seen while the form was being parsed).
    pub grammemes: Vec<String>,
}

/// A dictionary entry: the lemma itself plus all of its variations.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct Lemma {
    /// Numeric `id` attribute of the `<lemma>` element.
    pub id: u64,
    /// The lemma form. The parser in this file only fills in `word`;
    /// the lemma's grammemes are stored in [`Lemma::grammemes`] instead.
    pub lemma: Variation,
    /// Grammeme identifiers collected from `<g>` elements that were not
    /// inside an `<f>` element.
    pub grammemes: Vec<String>,
    /// One entry per `<f>` element, in document order.
    pub variations: Vec<Variation>,
}
/// A kind of link between lemmata, as read from a `<type>` element.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct LinkType {
    /// Numeric `id` attribute of the `<type>` element.
    pub id: u64,
    /// Text content of the `<type>` element.
    pub name: String,
}
/// A typed link between two lemmata, as read from a `<link>` element.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct Link {
    /// ID of the link itself (`id` attribute).
    pub id: u64,
    /// ID of the lemma the link starts at (`from` attribute).
    pub from: u64,
    /// ID of the lemma the link points to (`to` attribute).
    pub to: u64,
    /// Value of the `type` attribute (presumably a [`LinkType`] id).
    pub link_type: u64,
}
/// One top-level item produced by `OpenCorporaParser::next_element`.
#[derive(Debug)]
pub enum OcElement {
/// A grammeme definition, produced while inside `<grammemes>`.
Grammeme(Grammeme),
/// A lemma with its variations, produced while inside `<lemmata>`.
Lemma(Lemma),
/// A link type definition, produced while inside `<link_types>`.
LinkType(LinkType),
/// A link between two lemmata, produced while inside `<links>`.
Link(Link),
}
/// Position of the parser within the document; decides how
/// `next_element` dispatches the next start tag it sees.
#[derive(Debug, PartialEq)]
enum ParserState {
/// Parser is not parsing any particular section and waiting for a
/// start tag instead.
Init,
/// Parser is parsing grammemes.
Grammemes,
/// Parser is parsing lemmata.
Lemmata,
/// Parser is inside a lemma's actual lemma.
Lemma,
/// Parser is parsing a morphological variation of a lemma.
Variation,
/// Parser is parsing link types.
LinkTypes,
/// Parser is parsing links.
Links,
/// Parser has seen the end of the document and nothing more is
/// available.
Ended,
}
/// Pull parser that turns an XML stream into a sequence of
/// [`OcElement`]s, handed out one at a time by `next_element`.
pub struct OpenCorporaParser<R: std::io::Read> {
/// Underlying XML event source.
reader: EventReader<R>,
/// Section (or nested lemma element) the parser is currently inside.
state: ParserState,
}
/// Classification of a section tag name, as returned by `section_state`.
#[derive(PartialEq)]
enum SectionState {
/// Actively interested in parsing this section.
Active,
/// Section is known, but currently ignored.
Inactive,
/// Section is unknown (probably a bug).
Unknown,
}
/// Classify a tag name as a section the parser handles, a section it
/// knowingly skips, or something it does not recognise at all.
fn section_state(section: &str) -> SectionState {
    // Sections whose contents are turned into `OcElement`s.
    if matches!(section, "grammemes" | "lemmata" | "link_types" | "links") {
        return SectionState::Active;
    }
    // Known section that is deliberately ignored.
    if section == "restrictions" {
        SectionState::Inactive
    } else {
        SectionState::Unknown
    }
}
impl<R: std::io::Read> OpenCorporaParser<R> {
    /// Create a parser reading XML from `reader`.
    ///
    /// The XML reader is configured with `trim_whitespace(true)` and the
    /// parser starts out in [`ParserState::Init`].
    pub fn new(reader: R) -> Self {
        let config = xml::ParserConfig::new().trim_whitespace(true);
        let reader = EventReader::new_with_config(reader, config);
        Self {
            reader,
            state: ParserState::Init,
        }
    }

    /// Pull an `OcElement` out of the parser. Returns `None` if the
    /// parser stream has ended.
    ///
    /// Section start/end tags only switch the parser state; the first
    /// start tag *inside* an active section is handed to the matching
    /// `parse_*` method and its result is returned. Anything unexpected
    /// is reported through `bail`.
    pub fn next_element(&mut self) -> Option<OcElement> {
        if self.state == ParserState::Ended {
            return None;
        }
        // Pull the next element to determine what context to enter
        // next.
        loop {
            match &self.next() {
                // no-op events that do not affect parser state
                XmlEvent::Comment(_)
                | XmlEvent::Whitespace(_)
                | XmlEvent::ProcessingInstruction { .. }
                | XmlEvent::StartDocument { .. } => continue,
                // the root element carries no information of its own
                XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name }
                    if name.local_name == "dictionary" =>
                {
                    continue
                }
                // end of the file, nothing more to return
                XmlEvent::EndDocument => {
                    self.state = ParserState::Ended;
                    return None;
                }
                // Some sections are skipped. Only a *start* tag may begin
                // a skip: `skip_section` consumes the matching end tag, so
                // a stray end tag seen here has nothing left to skip and
                // is reported as an unexpected event below instead.
                XmlEvent::StartElement { name, .. }
                    if section_state(&name.local_name) == SectionState::Inactive =>
                {
                    info!("skipping {} section", name.local_name);
                    self.skip_section(&name.local_name);
                }
                // active section events start specific parser states ...
                XmlEvent::StartElement { name, .. }
                    if section_state(&name.local_name) == SectionState::Active =>
                {
                    self.state = match name.local_name.as_str() {
                        "grammemes" => ParserState::Grammemes,
                        "lemmata" => ParserState::Lemmata,
                        "link_types" => ParserState::LinkTypes,
                        "links" => ParserState::Links,
                        _ => unreachable!(),
                    };
                }
                // ... or end them
                XmlEvent::EndElement { name, .. }
                    if section_state(&name.local_name) == SectionState::Active =>
                {
                    // TODO: assert that the right section ended
                    self.state = ParserState::Init;
                }
                // actual beginning of an actual element, dispatch accordingly
                event @ XmlEvent::StartElement {
                    name, attributes, ..
                } => match &self.state {
                    ParserState::Grammemes => {
                        return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes)))
                    }
                    ParserState::Lemmata => {
                        return Some(OcElement::Lemma(self.parse_lemma(name, attributes)))
                    }
                    ParserState::LinkTypes => {
                        return Some(OcElement::LinkType(self.parse_link_type(name, attributes)))
                    }
                    ParserState::Links if name.local_name == "link" => {
                        return Some(OcElement::Link(self.parse_link(attributes)))
                    }
                    // something other than <link> inside <links>
                    ParserState::Links => bail(format!(
                        "expected to parse a link, but found <{}>",
                        name.local_name
                    )),
                    ParserState::Init | ParserState::Ended => bail(format!(
                        "parser received an unexpected start element while in state {:?}: {:?}",
                        self.state, event
                    )),
                    // these states only exist while `parse_lemma` is running
                    ParserState::Lemma | ParserState::Variation => bail(format!(
                        "next_element() called while parser was in state {:?}",
                        self.state
                    )),
                },
                // finally, events that indicate a bug if they're
                // encountered here
                event @ XmlEvent::EndElement { .. }
                | event @ XmlEvent::CData(_)
                | event @ XmlEvent::Characters(_) => {
                    bail(format!("unexpected XML event: {:?}", event))
                }
            }
        }
    }

    /// Skip a section by advancing the parser state until we see the
    /// end element that closes the skipped section.
    ///
    /// Must be called right after the section's start tag was consumed.
    /// Nested elements with the same name are balanced, and reaching the
    /// end of the document is reported through `bail` rather than being
    /// polled forever.
    fn skip_section(&mut self, section: &str) {
        // Open elements named `section`, including the one being skipped.
        let mut depth: usize = 1;
        loop {
            match self.next() {
                XmlEvent::StartElement { name, .. } if name.local_name == section => {
                    depth += 1;
                }
                XmlEvent::EndElement { name } if name.local_name == section => {
                    depth -= 1;
                    if depth == 0 {
                        return;
                    }
                }
                XmlEvent::EndDocument => {
                    bail(format!("document ended while skipping <{}>", section));
                }
                _ => continue,
            }
        }
    }

    /// Fetch the next XML event; a reader error is reported through
    /// `ensure` with the message "XML parsing failed".
    fn next(&mut self) -> XmlEvent {
        self.reader.next().ensure("XML parsing failed")
    }

    /// Take ownership of the value of the attribute named `key`, if any.
    fn take_attribute(attributes: Vec<OwnedAttribute>, key: &str) -> Option<String> {
        attributes
            .into_iter()
            .find(|attr| attr.name.local_name == key)
            .map(|attr| attr.value)
    }

    /// Parse a tag that should have plain string content.
    ///
    /// Consumes events up to and including the end tag `tag_name` and
    /// returns the text seen in between (empty if there was none).
    fn parse_string(&mut self, tag_name: &str) -> String {
        let mut out = String::new();
        loop {
            match self.next() {
                // ignore irrelevant things
                XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue,
                // Collect the content. Text can arrive in several chunks
                // (e.g. split by a comment), so append instead of
                // overwriting; the common single-chunk case just moves.
                XmlEvent::Characters(content) => {
                    if out.is_empty() {
                        out = content;
                    } else {
                        out.push_str(&content);
                    }
                }
                // expect the end of the element
                XmlEvent::EndElement { name } if name.local_name == tag_name => return out,
                // fail on everything unexpected
                event => bail(format!(
                    "unexpected element while parsing <{}>: {:?}",
                    tag_name, event
                )),
            }
        }
    }

    /// Parse a single `<grammeme>` tag.
    ///
    /// `name` and `attributes` belong to the already consumed start tag.
    /// Reads the `<name>`, `<alias>` and `<description>` children up to
    /// the closing `</grammeme>`.
    fn parse_grammeme(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Grammeme {
        if name.local_name != "grammeme" {
            bail(format!(
                "expected to parse a grammeme, but found <{}>",
                name.local_name
            ));
        }
        let mut grammeme = Grammeme::default();
        for attr in attributes {
            // an empty parent attribute is treated like a missing one
            if attr.name.local_name == "parent" && !attr.value.is_empty() {
                grammeme.parent = Some(attr.value.clone());
            }
        }
        loop {
            match self.next() {
                // ignore irrelevant things
                XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue,
                // expect known tags
                XmlEvent::StartElement { name, .. } if name.local_name == "name" => {
                    grammeme.name = self.parse_string("name");
                }
                XmlEvent::StartElement { name, .. } if name.local_name == "alias" => {
                    grammeme.alias = self.parse_string("alias");
                }
                XmlEvent::StartElement { name, .. } if name.local_name == "description" => {
                    grammeme.description = self.parse_string("description");
                }
                // handle end of the grammeme
                XmlEvent::EndElement { name } if name.local_name == "grammeme" => break,
                // fail on everything unexpected
                event => bail(format!(
                    "unexpected element while parsing <grammeme>: {:?}",
                    event
                )),
            }
        }
        grammeme
    }

    /// Parse a single `<lemma>` tag, including its `<l>`, `<f>` and `<g>`
    /// children.
    ///
    /// While running, the parser state is `Lemma` (outside any `<f>`) or
    /// `Variation` (inside an `<f>`); it is reset to `Lemmata` when the
    /// closing `</lemma>` is reached. A `<g>` seen in `Lemma` state is
    /// recorded on the lemma, one seen in `Variation` state on the most
    /// recent variation.
    fn parse_lemma(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Lemma {
        if name.local_name != "lemma" {
            bail(format!(
                "expected to parse a lemma, but found <{}>",
                name.local_name
            ));
        }
        self.state = ParserState::Lemma;
        let mut lemma = Lemma::default();
        for attr in attributes {
            if attr.name.local_name == "id" {
                lemma.id = u64::from_str(&attr.value).ensure("failed to parse lemma ID");
            }
        }
        loop {
            match self.next() {
                // <lemma> has ended
                XmlEvent::EndElement { name } if name.local_name == "lemma" => {
                    self.state = ParserState::Lemmata;
                    return lemma;
                }
                // actual lemma content
                XmlEvent::StartElement {
                    name, attributes, ..
                } => {
                    match name.local_name.as_str() {
                        // beginning to parse the lemma itself
                        "l" => {
                            lemma.lemma.word = Self::take_attribute(attributes, "t")
                                .ensure(format!("lemma {} had no actual word", lemma.id));
                        }
                        // parsing a lemma variation
                        "f" => {
                            self.state = ParserState::Variation;
                            let word = Self::take_attribute(attributes, "t").ensure(format!(
                                "variation of lemma {} had no actual word",
                                lemma.id
                            ));
                            lemma.variations.push(Variation {
                                word,
                                grammemes: vec![],
                            });
                        }
                        // parse a grammeme association
                        "g" => {
                            let grammeme = Self::take_attribute(attributes, "v").ensure(format!(
                                "grammeme association in lemma {} missing ID",
                                lemma.id
                            ));
                            match self.state {
                                ParserState::Lemma => {
                                    lemma.grammemes.push(grammeme);
                                }
                                ParserState::Variation => {
                                    lemma
                                        .variations
                                        .last_mut()
                                        .ensure("variations should be non-empty")
                                        .grammemes
                                        .push(grammeme);
                                }
                                _ => bail(format!("invalid parser state: encountered grammeme association while in {:?}", self.state)),
                            }
                        }
                        other => bail(format!("unexpected element while parsing lemma: {other}")),
                    };
                }
                XmlEvent::EndElement { name } => match name.local_name.as_str() {
                    "l" if self.state == ParserState::Lemma => continue,
                    // leaving a variation puts us back at lemma level
                    "f" if self.state == ParserState::Variation => {
                        self.state = ParserState::Lemma;
                        continue;
                    }
                    "g" => continue,
                    other => bail(format!(
                        "unexpected </{other}> while parsing lemma {}",
                        lemma.id
                    )),
                },
                _ => continue,
            }
        }
    }

    /// Parse a single `<type>` tag (a link type): the `id` attribute plus
    /// the element's text content as the name.
    fn parse_link_type(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> LinkType {
        if name.local_name != "type" {
            bail(format!(
                "expected to parse a link type, but found <{}>",
                name.local_name
            ));
        }
        let mut link_type = LinkType::default();
        for attr in attributes {
            if attr.name.local_name == "id" {
                link_type.id = u64::from_str(&attr.value).ensure("failed to parse link type ID");
            }
        }
        link_type.name = self.parse_string("type");
        link_type
    }

    /// Parse a single `<link>` tag from the attributes of its start tag.
    ///
    /// `id`, `from`, `to` and `type` are read as integers; any other
    /// attribute is logged with a warning and ignored.
    fn parse_link(&mut self, attributes: &[OwnedAttribute]) -> Link {
        let mut link = Link::default();
        for attr in attributes {
            // parsed lazily so unknown attributes never hit the integer parser
            let i_val = || u64::from_str(&attr.value).ensure("failed to parse link field");
            match attr.name.local_name.as_str() {
                "id" => {
                    link.id = i_val();
                }
                "from" => {
                    link.from = i_val();
                }
                "to" => {
                    link.to = i_val();
                }
                "type" => {
                    link.link_type = i_val();
                }
                other => {
                    warn!("unexpected attribute {} on <link>", other);
                    continue;
                }
            }
        }
        // expect the end of the <link> element, though since these
        // are empty it should be immediate.
        self.skip_section("link");
        link
    }
}