diff options
Diffstat (limited to 'corp/russian/data-import/src/or_parser.rs')
-rw-r--r-- | corp/russian/data-import/src/or_parser.rs | 105 |
1 files changed, 105 insertions, 0 deletions
diff --git a/corp/russian/data-import/src/or_parser.rs b/corp/russian/data-import/src/or_parser.rs new file mode 100644 index 000000000000..8bfc61dbef48 --- /dev/null +++ b/corp/russian/data-import/src/or_parser.rs @@ -0,0 +1,105 @@ +//! Parser for the OpenRussian data format. +//! +//! Note that when exporting OpenRussian data from the project you +//! have to choose an encoding. We choose tab-separated CSV files, as +//! tabs have a very low probability of actually appearing in the +//! input data and this skips some potential encoding issues. + +use super::Ensure; +use serde::Deserialize; +use std::fs::File; +use std::io::BufReader; +use std::path::PathBuf; + +/// A word from the `words` table. +#[derive(Debug, Deserialize)] +pub struct Word { + pub id: usize, + pub position: String, // TODO: unknown + pub bare: String, // TODO: unknown + pub accented: String, // TODO: unknown + pub derived_from_word_id: Option<usize>, + pub rank: Option<usize>, + pub disabled: String, // TODO: unknown + pub audio: String, // TODO: unknown + pub usage_en: String, // TODO: unknown + pub usage_de: String, // TODO: unknown + pub number_value: String, // TODO: unknown + + #[serde(rename = "type")] + pub word_type: String, // TODO: unknown + + pub level: String, // TODO: unknown + pub created_at: String, // TODO: unknown +} + +/// A word form from the `words_forms` table. +#[derive(Debug, Deserialize)] +pub struct WordForm { + pub id: usize, + pub word_id: usize, + pub form_type: String, + pub position: String, + pub form: String, + pub form_bare: String, +} + +/// A translation from the `translations` table. +#[derive(Debug, Deserialize)] +pub struct Translation { + pub id: usize, + pub lang: String, + pub word_id: usize, + pub position: String, + pub tl: String, // unknown + pub example_ru: String, + pub example_tl: String, + pub info: String, +} + +pub struct OpenRussianParser { + or_directory: PathBuf, +} + +pub type DynIter<T> = Box<dyn Iterator<Item = T>>; + +impl OpenRussianParser { + pub fn new<P: Into<PathBuf>>(path: P) -> Self { + OpenRussianParser { + or_directory: path.into(), + } + } + + pub fn words(&self) -> DynIter<Word> { + self.parser_for("words.csv") + } + + pub fn words_forms(&self) -> DynIter<WordForm> { + self.parser_for("words_forms.csv") + } + + pub fn translations(&self) -> DynIter<Translation> { + self.parser_for("translations.csv") + } + + fn parser_for<T: serde::de::DeserializeOwned + 'static>( + &self, + file_name: &str, + ) -> Box<dyn Iterator<Item = T>> { + let mut path = self.or_directory.clone(); + path.push(file_name); + + let reader = csv::ReaderBuilder::new() + .delimiter(b'\t') + .from_reader(BufReader::new( + File::open(&path).ensure("failed to open words.csv"), + )); + + Box::new(reader.into_deserialize().map(|result| { + result.ensure(format!( + "failed to deserialize {}", + std::any::type_name::<T>() + )) + })) + } +} |