From 192dac5a749edece1b5b3fb0b8acb92819df22e0 Mon Sep 17 00:00:00 2001
From: Vincent Ambo
Date: Wed, 25 Jan 2023 01:36:35 +0300
Subject: feat(corp/data-import): map OR word types to sets of OC grammemes

Change-Id: I674f3a66fcd65314431a2ebd747e3830aa2dd7a1
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7924
Tested-by: BuildkiteCI
Reviewed-by: tazjin
Autosubmit: tazjin
---
 corp/russian/data-import/src/mappings.rs | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'corp/russian/data-import')

diff --git a/corp/russian/data-import/src/mappings.rs b/corp/russian/data-import/src/mappings.rs
index 8a581ff86ba8..985088a56628 100644
--- a/corp/russian/data-import/src/mappings.rs
+++ b/corp/russian/data-import/src/mappings.rs
@@ -1,5 +1,18 @@
 //! Manual mapping of some data structures in OC/OR corpora.
 
+/// Maps the *names* of OpenRussian word types (the `word_type` field
+/// in the `or_words` table) to the *set* of OpenCorpora grammemes
+/// commonly attached to lemmata of this type in OC.
+///
+/// Some word types just don't map over, and are omitted. Many words
+/// also have an empty word type.
+pub const WORD_TYPES_GRAMMEME_MAP: &'static [(&'static str, &'static [&'static str])] = &[
+    ("adjective", &["ADJF"]),
+    ("adverb", &["ADVB"]),
+    ("noun", &["NOUN"]),
+    ("verb", &["INFN"]), // or "VERB" ...
+];
+
 /// Maps the *names* of OpenRussian grammemes (the `form_type` fields
 /// in the `or_word_forms` table) to the *set* of OpenCorpora
 /// grammemes attached to them corresponding lemma in the `oc_lemmas`
-- 
cgit 1.4.1