about summary refs log blame commit diff
path: root/corp/russian/data-import/src/oc_parser.rs
blob: 8103ebd92369b46d7af0de92303a0b9430e13a9d (plain) (tree)
1
2
3
4
5
6
7
8
9
10
                          
                      
                      






                                   



                               

 













                                                                
 





                         







                                 



                       
                       
               













                                                                      





                                                               


                                     


                                























                                                               

                                                                                 

























































                                                                                   
                                                               
                                                      














                                                                                 
                                        


                                                                                               
 



                                                                                         



                                                                                                



                                                                                 



                                                                                                




                                                                               





















































                                                                                           
                                        















































                                                                                               
                                                                                         






                                                            

                                         
 































































































                                                                                                                                       
     



















                                                                                                 

































                                                                                           
 
use super::{bail, Ensure};
use log::{info, warn};
use std::str::FromStr;
use xml::attribute::OwnedAttribute;
use xml::name::OwnedName;
use xml::reader::XmlEvent;
use xml::EventReader;

/// A grammeme definition, as read from a `<grammeme>` element.
#[derive(Default, Debug)]
pub struct Grammeme {
    /// Value of the element's `parent` attribute; `None` when the
    /// attribute is missing or empty.
    pub parent: Option<String>,
    /// Text content of the `<name>` child element.
    pub name: String,
    /// Text content of the `<alias>` child element.
    pub alias: String,
    /// Text content of the `<description>` child element.
    pub description: String,
}

/// Single form of a word (either its lemma, or the variations).
#[derive(Debug, Default)]
pub struct Variation {
    /// The word form itself, taken from the `t` attribute of an `<l>`
    /// or `<f>` element.
    pub word: String,
    /// Values of the `v` attributes of the `<g>` elements seen while
    /// inside an `<f>` element. The parser in this file does not fill
    /// this for the lemma form: grammemes seen outside of `<f>` are
    /// stored on the enclosing `Lemma` instead.
    pub grammemes: Vec<String>,
}

/// A dictionary entry, as read from a `<lemma>` element.
#[derive(Debug, Default)]
pub struct Lemma {
    /// Value of the element's `id` attribute, parsed as an integer.
    /// Keeps its default value if the attribute is absent.
    pub id: u64,
    /// The lemma form itself; its word comes from the `<l>` child.
    pub lemma: Variation,
    /// Grammemes (`v` attributes of `<g>` elements) encountered while
    /// the parser was not inside an `<f>` element.
    pub grammemes: Vec<String>,
    /// One entry per `<f>` child element, in document order.
    pub variations: Vec<Variation>,
}

/// A kind of link between lemmata, as read from a `<type>` element
/// of the link types section.
#[derive(Debug, Default)]
pub struct LinkType {
    /// Value of the element's `id` attribute, parsed as an integer.
    pub id: u64,
    /// Text content of the `<type>` element.
    pub name: String,
}

/// A link between two lemmata, as read from the attributes of a
/// `<link>` element. Each field is the integer value of the attribute
/// named in its comment; attributes that are absent leave the field at
/// its default value.
#[derive(Debug, Default)]
pub struct Link {
    pub id: u64,   // `id` attribute: the link itself
    pub from: u64, // `from` attribute: a lemma
    pub to: u64,   // `to` attribute: a lemma
    // `type` attribute; presumably the `id` of a `LinkType` (verify
    // against the consumers of this struct).
    pub link_type: u64,
}

/// One complete element pulled out of the dictionary by
/// `OpenCorporaParser::next_element`. The variant reflects the section
/// the element was found in.
#[derive(Debug)]
pub enum OcElement {
    /// An element of the `grammemes` section.
    Grammeme(Grammeme),
    /// An element of the `lemmata` section.
    Lemma(Lemma),
    /// An element of the `link_types` section.
    LinkType(LinkType),
    /// An element of the `links` section.
    Link(Link),
}

/// Position of the parser within the dictionary document.
#[derive(Debug, PartialEq)]
enum ParserState {
    /// Parser is not parsing any particular section and waiting for a
    /// start tag instead.
    Init,

    /// Parser is parsing grammemes.
    Grammemes,

    /// Parser is parsing lemmata.
    Lemmata,

    /// Parser is inside a `<lemma>` element, but not inside one of
    /// its `<f>` variations.
    Lemma,

    /// Parser is parsing a morphological variation of a lemma, i.e.
    /// it is inside an `<f>` element.
    Variation,

    /// Parser is parsing link types.
    LinkTypes,

    /// Parser is parsing links.
    Links,

    /// Parser has seen the end of the document and nothing more is
    /// available.
    Ended,
}

/// Pull parser that turns an XML dictionary stream into `OcElement`s,
/// one element per call to `next_element`.
pub struct OpenCorporaParser<R: std::io::Read> {
    /// Source of raw XML events.
    reader: EventReader<R>,
    /// Section (or sub-element) the parser is currently inside.
    state: ParserState,
}

/// How the parser treats a top-level section of the dictionary.
#[derive(PartialEq)]
enum SectionState {
    /// Section is one the parser wants to read.
    Active,

    /// Section is recognised, but deliberately ignored for now.
    Inactive,

    /// Section name is not recognised (probably a bug).
    Unknown,
}

/// Classify a section by the local name of its tag.
fn section_state(section: &str) -> SectionState {
    const ACTIVE_SECTIONS: [&str; 4] = ["grammemes", "lemmata", "link_types", "links"];

    if ACTIVE_SECTIONS.iter().any(|known| *known == section) {
        return SectionState::Active;
    }

    if section == "restrictions" {
        SectionState::Inactive
    } else {
        SectionState::Unknown
    }
}

impl<R: std::io::Read> OpenCorporaParser<R> {
    /// Create a parser over `reader`. The underlying XML event reader
    /// is configured with `trim_whitespace(true)`, and the parser
    /// starts out in the `Init` state.
    pub fn new(reader: R) -> Self {
        let events = EventReader::new_with_config(
            reader,
            xml::ParserConfig::new().trim_whitespace(true),
        );

        Self {
            reader: events,
            state: ParserState::Init,
        }
    }

    /// Pull an `OcElement` out of the parser. Returns `None` if the
    /// parser stream has ended.
    ///
    /// Events are consumed until one complete element of an active
    /// section (`grammemes`, `lemmata`, `link_types`, `links`) has
    /// been parsed. Start and end tags of those sections only switch
    /// the parser state, the `dictionary` wrapper is ignored, and
    /// inactive sections are skipped wholesale. Anything else that is
    /// unexpected is reported through `bail`.
    pub fn next_element(&mut self) -> Option<OcElement> {
        if self.state == ParserState::Ended {
            return None;
        }

        // Pull the next element to determine what context to enter
        // next.
        loop {
            match &self.next() {
                // no-op events that do not affect parser state
                XmlEvent::Comment(_)
                | XmlEvent::Whitespace(_)
                | XmlEvent::ProcessingInstruction { .. }
                | XmlEvent::StartDocument { .. } => continue,
                XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name }
                    if name.local_name == "dictionary" =>
                {
                    continue
                }

                // end of the file, nothing more to return
                XmlEvent::EndDocument => {
                    self.state = ParserState::Ended;
                    return None;
                }

                // some sections are skipped
                //
                // Only the *start* of such a section triggers a skip.
                // Its end tag is consumed by `skip_section`, so an end
                // tag showing up here is stray; skipping from it would
                // scan to the end of the document looking for a
                // closing tag that never comes. It is left to the
                // "unexpected XML event" arm at the bottom instead.
                XmlEvent::StartElement { name, .. }
                    if section_state(&name.local_name) == SectionState::Inactive =>
                {
                    info!("skipping {} section", name.local_name);
                    self.skip_section(&name.local_name);
                }

                // active section events start specific parser states ...
                XmlEvent::StartElement { name, .. }
                    if section_state(&name.local_name) == SectionState::Active =>
                {
                    self.state = match name.local_name.as_str() {
                        "grammemes" => ParserState::Grammemes,
                        "lemmata" => ParserState::Lemmata,
                        "link_types" => ParserState::LinkTypes,
                        "links" => ParserState::Links,
                        // the guard above only admits the four names
                        // that `section_state` reports as active
                        _ => unreachable!(),
                    };
                }

                // ... or end them
                XmlEvent::EndElement { name, .. }
                    if section_state(&name.local_name) == SectionState::Active =>
                {
                    // TODO: assert that the right section ended
                    self.state = ParserState::Init;
                }

                // actual beginning of an actual element, dispatch accordingly
                event @ XmlEvent::StartElement {
                    name, attributes, ..
                } => match &self.state {
                    ParserState::Grammemes => {
                        return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes)))
                    }

                    ParserState::Lemmata => {
                        return Some(OcElement::Lemma(self.parse_lemma(name, attributes)))
                    }

                    ParserState::LinkTypes => {
                        return Some(OcElement::LinkType(self.parse_link_type(name, attributes)))
                    }

                    ParserState::Links if name.local_name == "link" => {
                        return Some(OcElement::Link(self.parse_link(attributes)))
                    }

                    ParserState::Init | ParserState::Ended => bail(format!(
                        "parser received an unexpected start element while in state {:?}: {:?}",
                        self.state, event
                    )),

                    // Remaining cases: the sub-states `Lemma` and
                    // `Variation`, and `Links` with an element that is
                    // not `<link>`.
                    other => bail(format!(
                        "next_element() called while parser was in state {:?}",
                        other
                    )),
                },

                // finally, events that indicate a bug if they're
                // encountered here
                event @ XmlEvent::EndElement { .. }
                | event @ XmlEvent::CData(_)
                | event @ XmlEvent::Characters(_) => {
                    bail(format!("unexpected XML event: {:?}", event))
                }
            }
        }
    }

    /// Skip a section by advancing the parser state until we see an
    /// end element for the skipped section.
    fn skip_section(&mut self, section: &str) {
        loop {
            match self.next() {
                XmlEvent::EndElement { name } if name.local_name == section => return,
                _ => continue,
            }
        }
    }

    /// Pull the next raw XML event from the underlying reader. A
    /// reader error is handed to `ensure` together with the message
    /// "XML parsing failed".
    fn next(&mut self) -> XmlEvent {
        let pulled = self.reader.next();
        pulled.ensure("XML parsing failed")
    }

    /// Parse a tag that should have plain string content.
    ///
    /// Must be called right after the start element of `tag_name` has
    /// been consumed. Reads events up to and including the matching
    /// end element and returns the text found in between (empty if
    /// there was none). Comments and whitespace are ignored; any other
    /// event is reported through `bail`.
    fn parse_string(&mut self, tag_name: &str) -> String {
        let mut out = String::new();

        loop {
            match self.next() {
                // ignore irrelevant things
                XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue,

                // Accumulate the content. The text of one element can
                // arrive as several character events (for example when
                // a comment sits in the middle of it), so append
                // instead of overwriting what was already collected.
                XmlEvent::Characters(content) => {
                    out.push_str(&content);
                }

                // expect the end of the element
                XmlEvent::EndElement { name } if name.local_name == tag_name => return out,

                // fail on everything unexpected
                event => bail(format!(
                    "unexpected element while parsing <{}>: {:?}",
                    tag_name, event
                )),
            }
        }
    }

    /// Parse a single `<grammeme>` tag.
    ///
    /// `name` and `attributes` belong to the start element that was
    /// already consumed by the caller. The `parent` attribute is kept
    /// only when non-empty; the `<name>`, `<alias>` and
    /// `<description>` children supply the remaining fields. Any
    /// other content is reported through `bail`.
    fn parse_grammeme(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Grammeme {
        if name.local_name != "grammeme" {
            bail(format!(
                "expected to parse a grammeme, but found <{}>",
                name.local_name
            ));
        }

        let mut result = Grammeme::default();

        // Searching from the back keeps the last matching attribute,
        // should there ever be more than one.
        result.parent = attributes
            .iter()
            .rev()
            .find(|attr| attr.name.local_name == "parent" && !attr.value.is_empty())
            .map(|attr| attr.value.clone());

        loop {
            let event = self.next();

            // Work out which child element (if any) just opened.
            let child = match &event {
                XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue,
                XmlEvent::EndElement { name } if name.local_name == "grammeme" => break,
                XmlEvent::StartElement { name, .. } => Some(name.local_name.as_str()),
                _ => None,
            };

            match child {
                Some("name") => result.name = self.parse_string("name"),
                Some("alias") => result.alias = self.parse_string("alias"),
                Some("description") => result.description = self.parse_string("description"),

                // unknown child elements and every other event
                _ => bail(format!(
                    "unexpected element while parsing <grammeme>: {:?}",
                    event
                )),
            }
        }

        result
    }

    fn parse_lemma(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Lemma {
        if name.local_name != "lemma" {
            bail(format!(
                "expected to parse a lemma, but found <{}>",
                name.local_name
            ));
        }

        self.state = ParserState::Lemma;
        let mut lemma = Lemma::default();

        for attr in attributes {
            if attr.name.local_name == "id" {
                lemma.id = u64::from_str(&attr.value).ensure("failed to parse lemma ID");
            }
        }

        loop {
            match self.next() {
                // <lemma> has ended
                XmlEvent::EndElement { name } if name.local_name == "lemma" => {
                    self.state = ParserState::Lemmata;
                    return lemma;
                }

                // actual lemma content
                XmlEvent::StartElement {
                    name, attributes, ..
                } => {
                    match name.local_name.as_str() {
                        // beginning to parse the lemma itself
                        "l" => {
                            lemma.lemma.word = attributes
                                .into_iter()
                                .find(|attr| attr.name.local_name == "t")
                                .map(|attr| attr.value)
                                .ensure(format!("lemma {} had no actual word", lemma.id));
                        }

                        // parsing a lemma variation
                        "f" => {
                            self.state = ParserState::Variation;

                            let word = attributes
                                .into_iter()
                                .find(|attr| attr.name.local_name == "t")
                                .map(|attr| attr.value)
                                .ensure(format!(
                                    "variation of lemma {} had no actual word",
                                    lemma.id
                                ));

                            lemma.variations.push(Variation {
                                word,
                                grammemes: vec![],
                            });
                        }

                        // parse a grammeme association
                        "g" => {
                            let grammeme = attributes
                                .into_iter()
                                .find(|attr| attr.name.local_name == "v")
                                .map(|attr| attr.value)
                                .ensure(format!(
                                    "grammeme association in lemma {} missing ID",
                                    lemma.id
                                ));

                            match self.state {
                                ParserState::Lemma => {
                                    lemma.grammemes.push(grammeme);
                                }

                                ParserState::Variation => {
                                    lemma
                                        .variations
                                        .last_mut()
                                        .ensure("variations should be non-empty")
                                        .grammemes
                                        .push(grammeme);
                                }

                                _ => bail(format!("invalid parser state: encountered grammeme association while in {:?}", self.state)),
                            }
                        }

                        other => bail(format!("unexpected element while parsing lemma: {other}")),
                    };
                }

                XmlEvent::EndElement { name } => match name.local_name.as_str() {
                    "l" if self.state == ParserState::Lemma => continue,
                    "f" if self.state == ParserState::Variation => {
                        self.state = ParserState::Lemma;
                        continue;
                    }
                    "g" => continue,
                    other => bail(format!(
                        "unexpected </{other}> while parsing lemma {}",
                        lemma.id
                    )),
                },

                _ => continue,
            }
        }
    }

    fn parse_link_type(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> LinkType {
        if name.local_name != "type" {
            bail(format!(
                "expected to parse a link type, but found <{}>",
                name.local_name
            ));
        }

        let mut link_type = LinkType::default();

        for attr in attributes {
            if attr.name.local_name == "id" {
                link_type.id = u64::from_str(&attr.value).ensure("failed to parse link type ID");
            }
        }

        link_type.name = self.parse_string("type");
        link_type
    }

    /// Build a `Link` from the attributes of a `<link>` start element
    /// and consume the rest of that element.
    ///
    /// The `id`, `from`, `to` and `type` attributes are parsed as
    /// integers; any other attribute is logged with a warning and
    /// left unparsed.
    fn parse_link(&mut self, attributes: &[OwnedAttribute]) -> Link {
        let mut parsed = Link::default();

        for attr in attributes {
            let key = attr.name.local_name.as_str();

            // Choose the field this attribute fills before parsing, so
            // that values of unknown attributes are never parsed.
            let slot = match key {
                "id" => &mut parsed.id,
                "from" => &mut parsed.from,
                "to" => &mut parsed.to,
                "type" => &mut parsed.link_type,
                _ => {
                    warn!("unexpected attribute {} on <link>", key);
                    continue;
                }
            };

            *slot = u64::from_str(&attr.value).ensure("failed to parse link field");
        }

        // <link> elements are expected to be empty, so the matching
        // end element should follow immediately.
        self.skip_section("link");

        parsed
    }
}