about summary refs log tree commit diff
path: root/corp/russian/data-import/src/or_parser.rs
diff options
context:
space:
mode:
Diffstat (limited to 'corp/russian/data-import/src/or_parser.rs')
-rw-r--r--corp/russian/data-import/src/or_parser.rs105
1 files changed, 105 insertions, 0 deletions
diff --git a/corp/russian/data-import/src/or_parser.rs b/corp/russian/data-import/src/or_parser.rs
new file mode 100644
index 0000000000..8bfc61dbef
--- /dev/null
+++ b/corp/russian/data-import/src/or_parser.rs
@@ -0,0 +1,105 @@
+//! Parser for the OpenRussian data format.
+//!
+//! Note that when exporting OpenRussian data from the project you
+//! have to choose an encoding. We choose tab-separated CSV files, as
+//! tabs have a very low probability of actually appearing in the
+//! input data and this skips some potential encoding issues.
+
+use super::Ensure;
+use serde::Deserialize;
+use std::fs::File;
+use std::io::BufReader;
+use std::path::PathBuf;
+
+/// A word from the `words` table.
+#[derive(Debug, Deserialize)]
+pub struct Word {
+    pub id: usize,
+    pub position: String, // TODO: unknown
+    pub bare: String,     // TODO: unknown
+    pub accented: String, // TODO: unknown
+    pub derived_from_word_id: Option<usize>,
+    pub rank: Option<usize>,
+    pub disabled: String,     // TODO: unknown
+    pub audio: String,        // TODO: unknown
+    pub usage_en: String,     // TODO: unknown
+    pub usage_de: String,     // TODO: unknown
+    pub number_value: String, // TODO: unknown
+
+    #[serde(rename = "type")]
+    pub word_type: String, // TODO: unknown
+
+    pub level: String,      // TODO: unknown
+    pub created_at: String, // TODO: unknown
+}
+
+/// A word form from the `words_forms` table.
+#[derive(Debug, Deserialize)]
+pub struct WordForm {
+    pub id: usize,
+    pub word_id: usize,
+    pub form_type: String,
+    pub position: String,
+    pub form: String,
+    pub form_bare: String,
+}
+
+/// A translation from the `translations` table.
+#[derive(Debug, Deserialize)]
+pub struct Translation {
+    pub id: usize,
+    pub lang: String,
+    pub word_id: usize,
+    pub position: String,
+    pub tl: String, // unknown
+    pub example_ru: String,
+    pub example_tl: String,
+    pub info: String,
+}
+
+pub struct OpenRussianParser {
+    or_directory: PathBuf,
+}
+
+pub type DynIter<T> = Box<dyn Iterator<Item = T>>;
+
+impl OpenRussianParser {
+    pub fn new<P: Into<PathBuf>>(path: P) -> Self {
+        OpenRussianParser {
+            or_directory: path.into(),
+        }
+    }
+
+    pub fn words(&self) -> DynIter<Word> {
+        self.parser_for("words.csv")
+    }
+
+    pub fn words_forms(&self) -> DynIter<WordForm> {
+        self.parser_for("words_forms.csv")
+    }
+
+    pub fn translations(&self) -> DynIter<Translation> {
+        self.parser_for("translations.csv")
+    }
+
+    fn parser_for<T: serde::de::DeserializeOwned + 'static>(
+        &self,
+        file_name: &str,
+    ) -> Box<dyn Iterator<Item = T>> {
+        let mut path = self.or_directory.clone();
+        path.push(file_name);
+
+        let reader = csv::ReaderBuilder::new()
+            .delimiter(b'\t')
+            .from_reader(BufReader::new(
+                File::open(&path).ensure("failed to open words.csv"),
+            ));
+
+        Box::new(reader.into_deserialize().map(|result| {
+            result.ensure(format!(
+                "failed to deserialize {}",
+                std::any::type_name::<T>()
+            ))
+        }))
+    }
+}