diff options
Diffstat (limited to 'corp/russian/data-import')
-rw-r--r-- | corp/russian/data-import/default.nix | 23 |
1 files changed, 15 insertions, 8 deletions
diff --git a/corp/russian/data-import/default.nix b/corp/russian/data-import/default.nix
index c2fc1bf1cb5e..cf358874dce6 100644
--- a/corp/russian/data-import/default.nix
+++ b/corp/russian/data-import/default.nix
@@ -9,22 +9,29 @@ let
   # mirrored input data from OpenCorpora, as of 2023-01-17.
   #
   # This data is licensed under CC-BY-SA.
-  inputDataArchive = pkgs.fetchurl {
+  openCorporaArchive = pkgs.fetchurl {
     name = "dict.opcorpora.xml.bz";
-    url = "https://tazj.in/blobs/dict.opcorpora.xml.bz2";
+    url = "https://tazj.in/blobs/opencorpora-20230117.xml.bz2";
     sha256 = "04n5g43fkfc93z6xlwf2qfdrfdfl562pc2ivdb3cbbbsy56gkqg6";
   };
 
-  inputData = pkgs.runCommand "dict.opcorpora.xml" { } ''
-    ${pkgs.bzip2}/bin/bunzip2 -k -c ${inputDataArchive} > $out
+  openCorpora = pkgs.runCommand "dict.opcorpora.xml" { } ''
+    ${pkgs.bzip2}/bin/bunzip2 -k -c ${openCorporaArchive} > $out
   '';
 
+  openRussianArchive = pkgs.fetchzip {
+    name = "openrussian-20230117";
+    url = "https://tazj.in/blobs/openrussian-20230117.tar.xz";
+    sha256 = "06jl7i23cx58a0n2626hb82xlzimixvnxp7lxdw0g664kv9bmw25";
+  };
+
   # development shell with native deps
   shell = pkgs.mkShell {
     inherit buildInputs;
 
-    # make OPENCORPORA_DATA available in the environment
-    OPENCORPORA_DATA = inputData;
+    # make datasets available in the environment
+    OPENCORPORA_DATA = openCorpora;
+    OPENRUSSIAN_DATA = openRussianArchive;
   };
 in
 
@@ -33,11 +40,11 @@ lib.fix (self: depot.third_party.naersk.buildPackage {
   inherit buildInputs;
 
   passthru = depot.nix.readTree.drvTargets {
-    inherit shell inputData;
+    inherit shell openCorpora;
 
     # target that actually builds an entire database
     database = pkgs.runCommand "tvl-russian-db.sqlite" { } ''
-      ${self}/bin/data-import ${inputData} $out
+      ${self}/bin/data-import ${openCorpora} $out
     '';
   };
 })