about summary refs log tree commit diff
path: root/corp/russian/data-import/default.nix
diff options
context:
space:
mode:
author: Vincent Ambo <mail@tazj.in> 2023-01-17T21·36+0300
committer: tazjin <tazjin@tvl.su> 2023-01-18T01·10+0000
commit: ee7616d9563eabf2ae01927bc9d37ccf3e3b3325 (patch)
tree: ac43dc06b1f191308182897bd46726f5e9d41783 /corp/russian/data-import/default.nix
parent: 032ab16bbbd318704be71af7b569624ddab24802 (diff)
feat(corp/russian/data-import): new OpenCorpora data import tool r/5683
Adds the beginning of a tool which can import OpenCorpora data into a
SQLite database. This is quite a lot of toil and there's probably a
better way to do this, but overall becoming this intimately familiar
with the data structures is quite helpful for understanding what I
can/can't do with only this dataset.

Change-Id: Ieab33a8ce07ea4ac87917b9c8132226bbc6523b1
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7859
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Diffstat (limited to 'corp/russian/data-import/default.nix')
-rw-r--r-- corp/russian/data-import/default.nix 39
1 files changed, 39 insertions, 0 deletions
diff --git a/corp/russian/data-import/default.nix b/corp/russian/data-import/default.nix
new file mode 100644
index 000000000000..b4cdc50c25c1
--- /dev/null
+++ b/corp/russian/data-import/default.nix
@@ -0,0 +1,39 @@
+{ depot, pkgs, ... }:
+
+let
+  buildInputs = with pkgs; [
+    sqlite
+    pkg-config
+  ];
+
+  # Mirrored input data from OpenCorpora, as of 2023-01-17; licensed under CC-BY-SA.
+  # NOTE(review): the store name ends in ".bz" while the URL ends in ".bz2" —
+  # confirm that this difference is intentional.
+  inputDataArchive = pkgs.fetchurl {
+    name = "dict.opcorpora.xml.bz";
+    url = "https://tazj.in/blobs/dict.opcorpora.xml.bz2";
+    sha256 = "04n5g43fkfc93z6xlwf2qfdrfdfl562pc2ivdb3cbbbsy56gkqg6";
+  };
+
+  inputData = pkgs.runCommand "dict.opcorpora.xml" { } ''
+    ${pkgs.bzip2}/bin/bunzip2 -k -c ${inputDataArchive} > $out
+  '';
+
+  # Development shell that reuses the same buildInputs as the package build below.
+  shell = pkgs.mkShell {
+    inherit buildInputs;
+
+    # Expose the decompressed OpenCorpora XML (inputData) as OPENCORPORA_DATA.
+    OPENCORPORA_DATA = inputData;
+  };
+in
+depot.third_party.naersk.buildPackage {
+  src = depot.third_party.gitignoreSource ./.;
+  inherit buildInputs;
+
+  passthru = {
+    # Only the development shell is exposed through passthru.
+    inherit shell;
+
+  };
+}