diff options
author | Vincent Ambo <mail@tazj.in> | 2023-01-21T15·00+0300 |
---|---|---|
committer | tazjin <tazjin@tvl.su> | 2023-01-21T17·49+0000 |
commit | 8eeb5d3bccf831681b2cad5c3b322e6a08f596df (patch) | |
tree | 4e02105c8ab39a4466b29e98c16aa9e2c69ac9b4 /corp/russian/data-import/src/or_parser.rs | |
parent | 429c0d00c4cd07ea90c85bf1ec2f2c742d970420 (diff) |
feat(corp/data-import): add import of OR 'words_forms' table r/5730
This is the full morphological set table for all the words from the lemmata table, although they don't call it that.

Change-Id: I6f5be673c5f59f11e36bd8c8c935844a7d4fd170
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7894
Tested-by: BuildkiteCI
Reviewed-by: tazjin <tazjin@tvl.su>
Diffstat (limited to '')
-rw-r--r-- | corp/russian/data-import/src/or_parser.rs | 15 |
1 files changed, 15 insertions, 0 deletions
```diff
diff --git a/corp/russian/data-import/src/or_parser.rs b/corp/russian/data-import/src/or_parser.rs
index c11896f6bac0..28e4f14d3169 100644
--- a/corp/russian/data-import/src/or_parser.rs
+++ b/corp/russian/data-import/src/or_parser.rs
@@ -33,6 +33,17 @@ pub struct Word {
     pub created_at: String, // TODO: unknown
 }
 
+/// A word form from the `words_forms` table.
+#[derive(Debug, Deserialize)]
+pub struct WordForm {
+    pub id: usize,
+    pub word_id: usize,
+    pub form_type: String,
+    pub position: String,
+    pub form: String,
+    pub form_bare: String,
+}
+
 pub struct OpenRussianParser {
     or_directory: PathBuf,
 }
@@ -50,6 +61,10 @@ impl OpenRussianParser {
         self.parser_for("words.csv")
     }
 
+    pub fn words_forms(&self) -> DynIter<WordForm> {
+        self.parser_for("words_forms.csv")
+    }
+
     fn parser_for<T: serde::de::DeserializeOwned + 'static>(
         &self,
         file_name: &str,
```