diff options
author | Vincent Ambo <mail@tazj.in> | 2023-01-18T11·52+0300 |
---|---|---|
committer | tazjin <tazjin@tvl.su> | 2023-01-18T15·44+0000 |
commit | 6986aa5824ba6ae23b1363fede13c2df5ea0e770 (patch) | |
tree | b0582427803ac6bb9c445fb947bd5092586ab7a3 /corp/russian/data-import/src/main.rs | |
parent | 0196555f07d7295a40aefd5aec266f3932efbb2b (diff) |
feat(corp/data-import): insert OpenCorpora data into SQLite r/5688
This is an initial and kind of dumb table structure, but there's some massaging that needs to be done before this makes more sense.

Change-Id: I441288b684ef86be507099bcc4ebf984598789c8
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7861
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Diffstat (limited to 'corp/russian/data-import/src/main.rs')
-rw-r--r-- | corp/russian/data-import/src/main.rs | 36 |
1 file changed, 27 insertions, 9 deletions
diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs index 9f2f5089a603..502351cf9de8 100644 --- a/corp/russian/data-import/src/main.rs +++ b/corp/russian/data-import/src/main.rs @@ -55,11 +55,13 @@ //! between the lemmas "два" and "второй". use log::{error, info}; +use rusqlite::{Connection, Result}; use std::env; use std::fmt::Display; use std::fs::File; -use std::io::{BufReader, BufWriter, Write}; +use std::io::BufReader; +mod db_setup; mod oc_parser; fn main() { @@ -77,18 +79,34 @@ fn main() { let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file)); - let mut out = BufWriter::new(std::io::stdout().lock()); + let conn = Connection::open("out.db").ensure("failed to open DB connection"); + + db_setup::initial_schema(&conn); + + // afterwards: + // add actual IDs to grammemes + // properly reference keys internally + // add foreign key constraint on lemma_grammemes.grammeme + + let mut tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + let mut count = 0; while let Some(elem) = parser.next_element() { - if let oc_parser::OcElement::Lemma(lemma) = elem { - if lemma.lemma.word == "тяжёлый" { - writeln!(out, "{:?}", lemma).ensure("writing output failed"); - break; - } + // commit every 1000 things + if count % 1000 == 0 { + tx.commit().ensure("transaction failed"); + tx = conn + .unchecked_transaction() + .ensure("failed to start new transaction"); + info!("transaction committed at watermark {}", count); } - } - out.flush().ensure("flushing the out buffer failed"); + db_setup::insert_oc_element(&tx, elem); + + count += 1; + } } /// It's like `expect`, but through `log::error`. |