about summary refs log tree commit diff
path: root/corp/russian/data-import/src/or_parser.rs
diff options
context:
space:
mode:
authorVincent Ambo <mail@tazj.in>2023-01-21T15·00+0300
committertazjin <tazjin@tvl.su>2023-01-21T17·49+0000
commit8eeb5d3bccf831681b2cad5c3b322e6a08f596df (patch)
tree4e02105c8ab39a4466b29e98c16aa9e2c69ac9b4 /corp/russian/data-import/src/or_parser.rs
parent429c0d00c4cd07ea90c85bf1ec2f2c742d970420 (diff)
feat(corp/data-import): add import of OR 'words_forms' table r/5730
This table holds the full set of morphological forms for all the words
in the lemmata table (which OpenRussian does not call by that name).

Change-Id: I6f5be673c5f59f11e36bd8c8c935844a7d4fd170
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7894
Tested-by: BuildkiteCI
Reviewed-by: tazjin <tazjin@tvl.su>
Diffstat (limited to '')
-rw-r--r--corp/russian/data-import/src/or_parser.rs15
1 files changed, 15 insertions, 0 deletions
diff --git a/corp/russian/data-import/src/or_parser.rs b/corp/russian/data-import/src/or_parser.rs
index c11896f6bac0..28e4f14d3169 100644
--- a/corp/russian/data-import/src/or_parser.rs
+++ b/corp/russian/data-import/src/or_parser.rs
@@ -33,6 +33,17 @@ pub struct Word {
     pub created_at: String, // TODO: unknown
 }
 
+/// A word form from the `words_forms` table.
+#[derive(Debug, Deserialize)]
+pub struct WordForm {
+    pub id: usize,
+    pub word_id: usize,
+    pub form_type: String,
+    pub position: String,
+    pub form: String,
+    pub form_bare: String,
+}
+
 pub struct OpenRussianParser {
     or_directory: PathBuf,
 }
@@ -50,6 +61,10 @@ impl OpenRussianParser {
         self.parser_for("words.csv")
     }
 
+    pub fn words_forms(&self) -> DynIter<WordForm> {
+        self.parser_for("words_forms.csv")
+    }
+
     fn parser_for<T: serde::de::DeserializeOwned + 'static>(
         &self,
         file_name: &str,