about summary refs log tree commit diff
diff options
context:
space:
mode:
author Vincent Ambo <mail@tazj.in> 2023-01-20T10·31+0300
committer tazjin <tazjin@tvl.su> 2023-01-21T17·49+0000
commit 429c0d00c4cd07ea90c85bf1ec2f2c742d970420 (patch)
tree 85103dcdf8b7c9d30552dfc97321ad99d77ff2e3
parent ee0c0ee95103fa10e227a1976149d20e6944001c (diff)
feat(corp/data-import): add import of OpenRussian 'words' table r/5729
The 'words' table is actually the lemmata table of the OpenRussian
corpus, not the forms of all words (those are in a separate table).

Change-Id: I89a2c2817ccce840f47406fa2a636f4ed3f49154
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7893
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
-rw-r--r-- corp/russian/data-import/Cargo.lock 115
-rw-r--r-- corp/russian/data-import/Cargo.toml 2
-rw-r--r-- corp/russian/data-import/default.nix 11
-rw-r--r-- corp/russian/data-import/src/db_setup.rs 51
-rw-r--r-- corp/russian/data-import/src/main.rs 126
-rw-r--r-- corp/russian/data-import/src/or_parser.rs 73
6 files changed, 348 insertions, 30 deletions
diff --git a/corp/russian/data-import/Cargo.lock b/corp/russian/data-import/Cargo.lock
index 125b62d43e90..cd85e058108f 100644
--- a/corp/russian/data-import/Cargo.lock
+++ b/corp/russian/data-import/Cargo.lock
@@ -29,6 +29,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
 [[package]]
+name = "bstr"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
+dependencies = [
+ "lazy_static",
+ "memchr",
+ "regex-automata",
+ "serde",
+]
+
+[[package]]
 name = "cc"
 version = "1.0.78"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -41,12 +53,36 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
 [[package]]
+name = "csv"
+version = "1.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
+dependencies = [
+ "bstr",
+ "csv-core",
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "csv-core"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
 name = "data-import"
 version = "0.1.0"
 dependencies = [
+ "csv",
  "env_logger",
  "log",
  "rusqlite",
+ "serde",
  "xml-rs",
 ]
 
@@ -163,6 +199,18 @@ dependencies = [
 ]
 
 [[package]]
+name = "itoa"
+version = "0.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
 name = "libc"
 version = "0.2.139"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -212,6 +260,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
 
 [[package]]
+name = "proc-macro2"
+version = "1.0.50"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
 name = "regex"
 version = "1.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -223,6 +289,12 @@ dependencies = [
 ]
 
 [[package]]
+name = "regex-automata"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
+
+[[package]]
 name = "regex-syntax"
 version = "0.6.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -257,12 +329,49 @@ dependencies = [
 ]
 
 [[package]]
+name = "ryu"
+version = "1.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde"
+
+[[package]]
+name = "serde"
+version = "1.0.152"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.152"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
 name = "smallvec"
 version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
 
 [[package]]
+name = "syn"
+version = "1.0.107"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
 name = "termcolor"
 version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -272,6 +381,12 @@ dependencies = [
 ]
 
 [[package]]
+name = "unicode-ident"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
+
+[[package]]
 name = "vcpkg"
 version = "0.2.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/corp/russian/data-import/Cargo.toml b/corp/russian/data-import/Cargo.toml
index b43f829f37b0..1aae2e830578 100644
--- a/corp/russian/data-import/Cargo.toml
+++ b/corp/russian/data-import/Cargo.toml
@@ -6,9 +6,11 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
+csv = "1.1"
 env_logger = "0.10.0"
 log = "0.4.17"
 rusqlite = "0.28"
+serde = { version = "1.0", features = ["derive"] }
 xml-rs = "0.8"
 
 [profile.release-with-debug]
diff --git a/corp/russian/data-import/default.nix b/corp/russian/data-import/default.nix
index cf358874dce6..6aa8ad6aa3d8 100644
--- a/corp/russian/data-import/default.nix
+++ b/corp/russian/data-import/default.nix
@@ -19,6 +19,9 @@ let
     ${pkgs.bzip2}/bin/bunzip2 -k -c ${openCorporaArchive} > $out
   '';
 
+  # mirrored input data from OpenRussian, as of 2023-01-17.
+  #
+  # This data is licensed under CC-BY-SA.
   openRussianArchive = pkgs.fetchzip {
     name = "openrussian-20230117";
     url = "https://tazj.in/blobs/openrussian-20230117.tar.xz";
@@ -43,8 +46,10 @@ lib.fix (self: depot.third_party.naersk.buildPackage {
     inherit shell openCorpora;
 
     # target that actually builds an entire database
-    database = pkgs.runCommand "tvl-russian-db.sqlite" { } ''
-      ${self}/bin/data-import ${openCorpora} $out
-    '';
+    database = pkgs.runCommand "tvl-russian-db.sqlite"
+      {
+        OPENCORPORA_DATA = openCorpora;
+        OPENRUSSIAN_DATA = openRussianArchive;
+      } "${self}/bin/data-import --output $out";
   };
 })
diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs
index 3f0fa0ff638d..5fe64717ad9b 100644
--- a/corp/russian/data-import/src/db_setup.rs
+++ b/corp/russian/data-import/src/db_setup.rs
@@ -8,6 +8,7 @@
 
 use super::{bail, Ensure};
 use crate::oc_parser::*;
+use crate::or_parser;
 use log::{debug, info};
 use rusqlite::Connection;
 
@@ -69,7 +70,7 @@ CREATE TABLE oc_links (
 
 "#,
     )
-    .ensure("setting up initial table schema failed");
+    .ensure("setting up OpenCorpora table schema failed");
 
     info!("set up initial table schema for OpenCorpora import");
 }
@@ -166,3 +167,51 @@ fn insert_lemma(conn: &Connection, lemma: Lemma) {
 
     debug!("inserted lemma {}", lemma.id);
 }
+
+/// Sets up an initial schema for the OpenRussian data.
+pub fn initial_or_schema(conn: &Connection) {
+    conn.execute_batch(
+        r#"
+CREATE TABLE or_words (
+    id INTEGER PRIMARY KEY,
+    bare TEXT NOT NULL,
+    accented TEXT,
+    derived_from_word_id INTEGER,
+    rank TEXT,
+    word_type TEXT,
+    level TEXT
+) STRICT;
+"#,
+    )
+    .ensure("setting up OpenRussian table schema failed");
+
+    info!("set up initial table schema for OpenRussian import");
+}
+
+pub fn insert_or_words<I: Iterator<Item = or_parser::Word>>(conn: &Connection, words: I) {
+    let mut stmt = conn
+        .prepare_cached(
+            "
+INSERT INTO or_words (id, bare, accented, derived_from_word_id, rank, word_type, level)
+VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)
+",
+        )
+        .ensure("failed to prepare OR words statement");
+    let mut count = 0;
+
+    for word in words {
+        stmt.execute((
+            word.id,
+            word.bare,
+            word.accented,
+            word.derived_from_word_id,
+            word.rank,
+            word.word_type,
+            word.level,
+        ))
+        .ensure("failed to insert OR word");
+        count += 1;
+    }
+
+    info!("inserted {} OpenRussian words", count);
+}
diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs
index 21d4209991c5..11387539ab84 100644
--- a/corp/russian/data-import/src/main.rs
+++ b/corp/russian/data-import/src/main.rs
@@ -1,6 +1,6 @@
-//! This program imports Russian language data from OpenCorpora and
-//! OpenRussian ("Открытый корпус") into a SQLite database that can be
-//! used for [//corp/russian][corp-russian] projects.
+//! This program imports Russian language data from OpenCorpora
+//! ("Открытый корпус") and OpenRussian into a SQLite database that
+//! can be used for [//corp/russian][corp-russian] projects.
 //!
 //! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian
 //!
@@ -112,42 +112,77 @@ use std::io::BufReader;
 
 mod db_setup;
 mod oc_parser;
+mod or_parser;
 
-fn main() {
-    env_logger::builder()
-        .filter_level(log::LevelFilter::Info)
-        .init();
+struct Args {
+    output: String,
+    or_input: String,
+    oc_input: String,
+}
 
-    let (input_path, output_path) = {
-        let mut args = env::args().collect::<Vec<_>>();
+impl Args {
+    fn populated(&self) -> bool {
+        !(self.output.is_empty() || self.or_input.is_empty() || self.oc_input.is_empty())
+    }
+}
 
-        if args.len() != 3 {
-            bail(format!(
-                "usage: {} <input-file> <output-file>",
-                args.first().map(String::as_str).unwrap_or("data-import")
-            ));
-        }
+fn usage(binary_name: &str) {
+    bail(format!(
+        "usage: {} --output <output-file> --or-input <or-input> --oc-input <oc-input>",
+        binary_name
+    ));
+}
+
+fn parse_args() -> Args {
+    let mut args_iter = env::args();
+    let binary_name = args_iter.next().unwrap();
 
-        (args.remove(1), args.remove(1))
+    let mut args = Args {
+        output: "".into(),
+        or_input: env::var("OPENRUSSIAN_DATA").unwrap_or_default(),
+        oc_input: env::var("OPENCORPORA_DATA").unwrap_or_default(),
     };
 
-    info!("reading from {input_path}; writing output to {output_path}");
-    let input_file = File::open(input_path).ensure("failed to open input file");
+    loop {
+        if args.populated() {
+            break;
+        }
 
-    let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
+        while let Some(arg) = args_iter.next() {
+            match arg.as_str() {
+                "--output" => {
+                    args.output = args_iter.next().unwrap();
+                }
 
-    let conn = Connection::open(output_path).ensure("failed to open DB connection");
+                "--or-input" => {
+                    args.or_input = args_iter.next().unwrap();
+                }
 
-    db_setup::initial_oc_schema(&conn);
+                "--oc-input" => {
+                    args.oc_input = args_iter.next().unwrap();
+                }
 
-    // afterwards:
-    // add actual IDs to grammemes
-    // properly reference keys internally
-    // add foreign key constraint on lemma_grammemes.grammeme
+                _ => usage(&binary_name),
+            }
+        }
+    }
+
+    if args.output.is_empty() || args.or_input.is_empty() || args.oc_input.is_empty() {
+        usage(&binary_name);
+    }
+
+    args
+}
+
+fn open_corpora(conn: &Connection, args: &Args) {
+    let input_file = File::open(&args.oc_input).ensure("failed to open input file");
+    let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
+    db_setup::initial_oc_schema(&conn);
 
     let mut tx = conn
         .unchecked_transaction()
         .ensure("failed to start transaction");
+
     let mut count = 0;
 
     while let Some(elem) = parser.next_element() {
@@ -165,7 +200,46 @@ fn main() {
         count += 1;
     }
 
-    tx.commit().ensure("final commit failed");
+    tx.commit().ensure("final OpenCorpora commit failed");
+
+    info!("finished OpenCorpora import");
+}
+
+fn open_russian(conn: &Connection, args: &Args) {
+    let parser = or_parser::OpenRussianParser::new(&args.or_input);
+
+    db_setup::initial_or_schema(conn);
+
+    let tx = conn
+        .unchecked_transaction()
+        .ensure("failed to start transaction");
+
+    db_setup::insert_or_words(&tx, parser.words());
+    tx.commit().ensure("OpenRussian words commit failed");
+
+    info!("finished OpenRussian import");
+}
+
+fn main() {
+    env_logger::builder()
+        .filter_level(log::LevelFilter::Info)
+        .init();
+
+    let args = parse_args();
+
+    info!("output path: {}", args.output);
+    info!("OpenCorpora input path: {}", args.oc_input);
+    info!("OpenRussian input path: {}", args.or_input);
+
+    let conn = Connection::open(&args.output).ensure("failed to open DB connection");
+
+    open_corpora(&conn, &args);
+    open_russian(&conn, &args);
+
+    // afterwards:
+    // add actual IDs to grammemes
+    // properly reference keys internally
+    // add foreign key constraint on lemma_grammemes.grammeme
 }
 
 /// It's like `expect`, but through `log::error`.
diff --git a/corp/russian/data-import/src/or_parser.rs b/corp/russian/data-import/src/or_parser.rs
new file mode 100644
index 000000000000..c11896f6bac0
--- /dev/null
+++ b/corp/russian/data-import/src/or_parser.rs
@@ -0,0 +1,73 @@
+//! Parser for the OpenRussian data format.
+//!
+//! Note that when exporting OpenRussian data from the project you
+//! have to choose an encoding. We choose tab-separated CSV files, as
+//! tabs have a very low probability of actually appearing in the
+//! input data and this skips some potential encoding issues.
+
+use super::Ensure;
+use serde::Deserialize;
+use std::fs::File;
+use std::io::BufReader;
+use std::path::PathBuf;
+
+/// A word from the `words` table.
+#[derive(Debug, Deserialize)]
+pub struct Word {
+    pub id: usize,
+    pub position: String, // TODO: unknown
+    pub bare: String,     // TODO: unknown
+    pub accented: String, // TODO: unknown
+    pub derived_from_word_id: Option<usize>,
+    pub rank: String,         // TODO: unknown
+    pub disabled: String,     // TODO: unknown
+    pub audio: String,        // TODO: unknown
+    pub usage_en: String,     // TODO: unknown
+    pub usage_de: String,     // TODO: unknown
+    pub number_value: String, // TODO: unknown
+
+    #[serde(rename = "type")]
+    pub word_type: String, // TODO: unknown
+
+    pub level: String,      // TODO: unknown
+    pub created_at: String, // TODO: unknown
+}
+
+pub struct OpenRussianParser {
+    or_directory: PathBuf,
+}
+
+pub type DynIter<T> = Box<dyn Iterator<Item = T>>;
+
+impl OpenRussianParser {
+    pub fn new<P: Into<PathBuf>>(path: P) -> Self {
+        OpenRussianParser {
+            or_directory: path.into(),
+        }
+    }
+
+    pub fn words(&self) -> DynIter<Word> {
+        self.parser_for("words.csv")
+    }
+
+    fn parser_for<T: serde::de::DeserializeOwned + 'static>(
+        &self,
+        file_name: &str,
+    ) -> Box<dyn Iterator<Item = T>> {
+        let mut path = self.or_directory.clone();
+        path.push(file_name);
+
+        let reader = csv::ReaderBuilder::new()
+            .delimiter(b'\t')
+            .from_reader(BufReader::new(
+                File::open(&path).ensure("failed to open words.csv"),
+            ));
+
+        Box::new(reader.into_deserialize().map(|result| {
+            result.ensure(format!(
+                "failed to deserialize {}",
+                std::any::type_name::<T>()
+            ))
+        }))
+    }
+}