about summary refs log tree commit diff
path: root/corp/russian/data-import/src/main.rs
diff options
context:
space:
mode:
authorVincent Ambo <mail@tazj.in>2023-01-18T11·52+0300
committertazjin <tazjin@tvl.su>2023-01-18T15·44+0000
commit6986aa5824ba6ae23b1363fede13c2df5ea0e770 (patch)
treeb0582427803ac6bb9c445fb947bd5092586ab7a3 /corp/russian/data-import/src/main.rs
parent0196555f07d7295a40aefd5aec266f3932efbb2b (diff)
feat(corp/data-import): insert OpenCorpora data into SQLite r/5688
This is an initial and kind of dumb table structure, but there's some
massaging that needs to be done before this makes more sense.

Change-Id: I441288b684ef86be507099bcc4ebf984598789c8
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7861
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Diffstat (limited to 'corp/russian/data-import/src/main.rs')
-rw-r--r--corp/russian/data-import/src/main.rs36
1 files changed, 27 insertions, 9 deletions
diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs
index 9f2f5089a6..502351cf9d 100644
--- a/corp/russian/data-import/src/main.rs
+++ b/corp/russian/data-import/src/main.rs
@@ -55,11 +55,13 @@
 //!   between the lemmas "два" and "второй".
 
 use log::{error, info};
+use rusqlite::{Connection, Result};
 use std::env;
 use std::fmt::Display;
 use std::fs::File;
-use std::io::{BufReader, BufWriter, Write};
+use std::io::BufReader;
 
+mod db_setup;
 mod oc_parser;
 
 fn main() {
@@ -77,18 +79,34 @@ fn main() {
 
     let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
 
-    let mut out = BufWriter::new(std::io::stdout().lock());
+    let conn = Connection::open("out.db").ensure("failed to open DB connection");
+
+    db_setup::initial_schema(&conn);
+
+    // afterwards:
+    // add actual IDs to grammemes
+    // properly reference keys internally
+    // add foreign key constraint on lemma_grammemes.grammeme
+
+    let mut tx = conn
+        .unchecked_transaction()
+        .ensure("failed to start transaction");
+    let mut count = 0;
 
     while let Some(elem) = parser.next_element() {
-        if let oc_parser::OcElement::Lemma(lemma) = elem {
-            if lemma.lemma.word == "тяжёлый" {
-                writeln!(out, "{:?}", lemma).ensure("writing output failed");
-                break;
-            }
+        // commit every 1000 things
+        if count % 1000 == 0 {
+            tx.commit().ensure("transaction failed");
+            tx = conn
+                .unchecked_transaction()
+                .ensure("failed to start new transaction");
+            info!("transaction committed at watermark {}", count);
         }
-    }
 
-    out.flush().ensure("flushing the out buffer failed");
+        db_setup::insert_oc_element(&tx, elem);
+
+        count += 1;
+    }
 }
 
 /// It's like `expect`, but through `log::error`.