From 80723b708d6edc44be00b61fb02260800101dcbb Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Wed, 25 Jan 2023 01:29:03 +0300 Subject: feat(corp/data-import): map OC lemma grammemes to OR form types Change-Id: Ie804d185269336b0d9fe417754e5e795918e65b8 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7923 Autosubmit: tazjin Reviewed-by: tazjin Tested-by: BuildkiteCI --- corp/russian/data-import/src/mappings.rs | 118 +++++++++++++++++++++++++++---- 1 file changed, 103 insertions(+), 15 deletions(-) (limited to 'corp/russian/data-import/src') diff --git a/corp/russian/data-import/src/mappings.rs b/corp/russian/data-import/src/mappings.rs index 5d8d4f3352fb..8a581ff86ba8 100644 --- a/corp/russian/data-import/src/mappings.rs +++ b/corp/russian/data-import/src/mappings.rs @@ -1,16 +1,103 @@ //! Manual mapping of some data structures in OC/OR corpora. -/// This maps the *names* of OpenRussian grammemes (the set of -/// `form_type` fields in the `word_forms` table) to the *names* -/// of OpenCorpora grammemes. +/// Maps the *names* of OpenRussian grammemes (the `form_type` fields +/// in the `or_word_forms` table) to the *set* of OpenCorpora +/// grammemes attached to them corresponding lemma in the `oc_lemmas` +/// table. /// -/// The names of the OR grammemes are much easier to understand in -/// general, as the OC ones seem to have strange acronyms in them, -/// however the OC ones are much more structured. +/// This *only* includes grammatical information about the lemma of +/// the word (such as whether it is a verb or other type), but *not* +/// information about the specific instance of the word (such as its +/// gender). /// -/// As these forms map to the word_forms table they lack the forms -/// attached to the lemmata. -pub const FORM_TYPES_GRAMMEMES: &'static [(&'static str, &'static [&'static str])] = &[ +/// Correctly corresponding these requires use of all mapping tables. 
+pub const FORMS_LEMMATA_GRAMMEME_MAP: &'static [(&'static str, &'static [&'static str])] = &[ + ("ru_adj_comparative", &["COMP"]), + ("ru_adj_superlative", &["ADJF", "Supr"]), + ("ru_adj_f_acc", &["ADJF"]), + ("ru_adj_f_dat", &["ADJF"]), + ("ru_adj_f_gen", &["ADJF"]), + ("ru_adj_f_inst", &["ADJF"]), + ("ru_adj_f_nom", &["ADJF"]), + ("ru_adj_f_prep", &["ADJF"]), + ("ru_adj_m_acc", &["ADJF"]), + ("ru_adj_m_dat", &["ADJF"]), + ("ru_adj_m_gen", &["ADJF"]), + ("ru_adj_m_inst", &["ADJF"]), + ("ru_adj_m_nom", &["ADJF"]), + ("ru_adj_m_prep", &["ADJF"]), + ("ru_adj_n_acc", &["ADJF"]), + ("ru_adj_n_dat", &["ADJF"]), + ("ru_adj_n_gen", &["ADJF"]), + ("ru_adj_n_inst", &["ADJF"]), + ("ru_adj_n_nom", &["ADJF"]), + ("ru_adj_n_prep", &["ADJF"]), + ("ru_adj_pl_acc", &["ADJF"]), + ("ru_adj_pl_dat", &["ADJF"]), + ("ru_adj_pl_gen", &["ADJF"]), + ("ru_adj_pl_inst", &["ADJF"]), + ("ru_adj_pl_nom", &["ADJF"]), + ("ru_adj_pl_prep", &["ADJF"]), + ("ru_adj_short_f", &["ADJS"]), + ("ru_adj_short_m", &["ADJS"]), + ("ru_adj_short_n", &["ADJS"]), + ("ru_adj_short_pl", &["ADJS"]), + ("ru_noun_pl_acc", &["NOUN"]), + ("ru_noun_pl_dat", &["NOUN"]), + ("ru_noun_pl_gen", &["NOUN"]), + ("ru_noun_pl_inst", &["NOUN"]), + ("ru_noun_pl_nom", &["NOUN"]), + ("ru_noun_pl_prep", &["NOUN"]), + ("ru_noun_sg_acc", &["NOUN"]), + ("ru_noun_sg_dat", &["NOUN"]), + ("ru_noun_sg_gen", &["NOUN"]), + ("ru_noun_sg_inst", &["NOUN"]), + ("ru_noun_sg_nom", &["NOUN"]), + ("ru_noun_sg_prep", &["NOUN"]), + ("ru_verb_gerund_past", &["GRND"]), + ("ru_verb_gerund_present", &["GRND"]), + ("ru_verb_imperative_pl", &["VERB"]), + ("ru_verb_imperative_sg", &["VERB"]), + ("ru_verb_past_f", &["VERB"]), + ("ru_verb_past_m", &["VERB"]), + ("ru_verb_past_n", &["VERB"]), + ("ru_verb_past_pl", &["VERB"]), + ("ru_verb_presfut_pl1", &["VERB"]), + ("ru_verb_presfut_pl2", &["VERB"]), + ("ru_verb_presfut_pl3", &["VERB"]), + ("ru_verb_presfut_sg1", &["VERB"]), + ("ru_verb_presfut_sg2", &["VERB"]), + ("ru_verb_presfut_sg3", &["VERB"]), + ( + 
"ru_base", + &[ /* nothing consistent, except often 'Fixd' */ ], + ), + ("ru_verb_participle_active_past", &["PRTF", "past", "actv"]), + ( + "ru_verb_participle_active_present", + &["PRTF", "pres", "actv"], + ), + ( + "ru_verb_participle_passive_past", + &["PRTF", "past", "pssv"], + ), + ( + "ru_verb_participle_passive_present", + &["PRTF", "pres", "pssv"], + ), +]; + +/// Maps the *names* of OpenRussian grammemes (the `form_type` fields +/// in the `or_word_forms` table) to the *set* of OpenCorpora +/// grammemes attached to their corresponding words in the `oc_words` +/// table. +/// +/// This includes grammatical information about the "instance" of the +/// word (such as its gender), but *not* the higher-level type +/// information about its lemma. +/// +/// Correctly corresponding these requires use of all mapping tables. +pub const FORMS_WORDS_GRAMMEME_MAP: &'static [(&'static str, &'static [&'static str])] = &[ ("ru_adj_comparative", &["Cmp2"]), ("ru_adj_f_acc", &["femn", "sing", "accs"]), ("ru_adj_f_dat", &["femn", "sing", "datv"]), @@ -59,12 +146,13 @@ pub const FORM_TYPES_GRAMMEMES: &'static [(&'static str, &'static [&'static str] ("ru_verb_past_m", &["masc", "sing", "past"]), ("ru_verb_past_n", &["neut", "sing", "past"]), ("ru_verb_past_pl", &["plur", "past"]), - ("ru_verb_presfut_pl1", &["plur", "1per", "pres"]), - ("ru_verb_presfut_pl2", &["plur", "2per", "pres"]), - ("ru_verb_presfut_pl3", &["plur", "3per", "pres"]), - ("ru_verb_presfut_sg1", &["sing", "1per", "pres"]), - ("ru_verb_presfut_sg2", &["sing", "2per", "pres"]), - ("ru_verb_presfut_sg3", &["sing", "3per", "pres"]), + // these also contain "pres" or "futr", depending on the verb. 
+ ("ru_verb_presfut_pl1", &["plur", "1per"]), + ("ru_verb_presfut_pl2", &["plur", "2per"]), + ("ru_verb_presfut_pl3", &["plur", "3per"]), + ("ru_verb_presfut_sg1", &["sing", "1per"]), + ("ru_verb_presfut_sg2", &["sing", "2per"]), + ("ru_verb_presfut_sg3", &["sing", "3per"]), // Unclear items, probably only useful tags on lemmata ( "ru_verb_gerund_present", -- cgit 1.4.1