diff options
Diffstat (limited to 'corp/russian')
-rw-r--r-- | corp/russian/README.md | 9 | ||||
-rw-r--r-- | corp/russian/data-import/.gitignore | 2 | ||||
-rw-r--r-- | corp/russian/data-import/Cargo.lock | 499 | ||||
-rw-r--r-- | corp/russian/data-import/Cargo.toml | 18 | ||||
-rw-r--r-- | corp/russian/data-import/default.nix | 55 | ||||
-rw-r--r-- | corp/russian/data-import/src/db_setup.rs | 298 | ||||
-rw-r--r-- | corp/russian/data-import/src/main.rs | 298 | ||||
-rw-r--r-- | corp/russian/data-import/src/mappings.rs | 185 | ||||
-rw-r--r-- | corp/russian/data-import/src/oc_parser.rs | 470 | ||||
-rw-r--r-- | corp/russian/data-import/src/or_parser.rs | 105 | ||||
-rw-r--r-- | corp/russian/predlozhnik/.gitignore | 3 | ||||
-rw-r--r-- | corp/russian/predlozhnik/Cargo.lock | 471 | ||||
-rw-r--r-- | corp/russian/predlozhnik/Cargo.toml | 12 | ||||
-rw-r--r-- | corp/russian/predlozhnik/default.nix | 52 | ||||
-rw-r--r-- | corp/russian/predlozhnik/index.css | 29 | ||||
-rw-r--r-- | corp/russian/predlozhnik/index.html | 24 | ||||
-rw-r--r-- | corp/russian/predlozhnik/src/main.rs | 345 |
17 files changed, 2875 insertions, 0 deletions
diff --git a/corp/russian/README.md b/corp/russian/README.md new file mode 100644 index 000000000000..23c3d594c8de --- /dev/null +++ b/corp/russian/README.md @@ -0,0 +1,9 @@ +//corp/russian +============== + +This folder contains TVL corp projects related to the Russian +language, such as the code powering +[Предложник](https://predlozhnik.ru). + +Unless otherwise specified, all rights to these projects are reserved +by ООО "ТВЛ". diff --git a/corp/russian/data-import/.gitignore b/corp/russian/data-import/.gitignore new file mode 100644 index 000000000000..e918c641ce99 --- /dev/null +++ b/corp/russian/data-import/.gitignore @@ -0,0 +1,2 @@ +target/ +all_events.txt diff --git a/corp/russian/data-import/Cargo.lock b/corp/russian/data-import/Cargo.lock new file mode 100644 index 000000000000..cd85e058108f --- /dev/null +++ b/corp/russian/data-import/Cargo.lock @@ -0,0 +1,499 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "ahash" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +dependencies = [ + "getrandom", + "once_cell", + "version_check", +] + +[[package]] +name = "aho-corasick" +version = "0.7.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +dependencies = [ + "memchr", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "cc" 
+version = "1.0.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a20104e2335ce8a659d6dd92a51a767a0c062599c73b343fd152cb401e828c3d" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "data-import" +version = "0.1.0" +dependencies = [ + "csv", + "env_logger", + "log", + "rusqlite", + "serde", + "xml-rs", +] + +[[package]] +name = "env_logger" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0" +dependencies = [ + "humantime", + "is-terminal", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "errno" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", + "libc", + "winapi", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "fallible-iterator" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "getrandom" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +dependencies = [ + "ahash", +] + +[[package]] +name = "hashlink" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69fe1fcf8b4278d860ad0548329f892a3631fb63f82574df68275f34cdbe0ffa" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "hermit-abi" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" +dependencies = [ + "libc", +] + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "io-lifetimes" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "is-terminal" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189" +dependencies = [ + "hermit-abi", + "io-lifetimes", + "rustix", + "windows-sys", +] + +[[package]] +name = "itoa" 
+version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.139" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" + +[[package]] +name = "libsqlite3-sys" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29f835d03d717946d28b1d1ed632eb6f0e24a299388ee623d0c23118d3e8a7fa" +dependencies = [ + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linux-raw-sys" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" + +[[package]] +name = "log" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "once_cell" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" + +[[package]] +name = "pkg-config" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" + +[[package]] +name = "proc-macro2" +version = "1.0.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" + +[[package]] +name = "regex-syntax" +version = "0.6.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" + +[[package]] +name = "rusqlite" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01e213bc3ecb39ac32e81e51ebe31fd888a940515173e3a18a35f8c6e896422a" +dependencies = [ + "bitflags", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "smallvec", +] + +[[package]] +name = "rustix" +version = "0.36.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4feacf7db682c6c329c4ede12649cd36ecab0f3be5b7d74e6a20304725db4549" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "ryu" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde" + +[[package]] +name = "serde" +version = "1.0.152" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.152" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "smallvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" + +[[package]] +name = "syn" +version = "1.0.107" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "termcolor" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "unicode-ident" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "winapi" +version = "0.3.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" + +[[package]] +name = "windows_i686_gnu" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" + +[[package]] +name = "windows_i686_msvc" +version = "0.42.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" + +[[package]] +name = "xml-rs" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3" diff --git a/corp/russian/data-import/Cargo.toml b/corp/russian/data-import/Cargo.toml new file mode 100644 index 000000000000..1aae2e830578 --- /dev/null +++ b/corp/russian/data-import/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "data-import" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +csv = "1.1" +env_logger = "0.10.0" +log = "0.4.17" +rusqlite = "0.28" +serde = { version = "1.0", features = ["derive"] } +xml-rs = "0.8" + +[profile.release-with-debug] +inherits = "release" +debug = true diff --git a/corp/russian/data-import/default.nix b/corp/russian/data-import/default.nix new file mode 100644 index 000000000000..6aa8ad6aa3d8 --- /dev/null +++ b/corp/russian/data-import/default.nix @@ -0,0 +1,55 @@ +{ depot, lib, pkgs, ... }: + +let + buildInputs = with pkgs; [ + sqlite + pkg-config + ]; + + # mirrored input data from OpenCorpora, as of 2023-01-17. + # + # This data is licensed under CC-BY-SA. 
+ openCorporaArchive = pkgs.fetchurl { + name = "dict.opcorpora.xml.bz"; + url = "https://tazj.in/blobs/opencorpora-20230117.xml.bz2"; + sha256 = "04n5g43fkfc93z6xlwf2qfdrfdfl562pc2ivdb3cbbbsy56gkqg6"; + }; + + openCorpora = pkgs.runCommand "dict.opcorpora.xml" { } '' + ${pkgs.bzip2}/bin/bunzip2 -k -c ${openCorporaArchive} > $out + ''; + + # mirrored input data from OpenRussian, as of 2023-01-17. + # + # This data is licensed under CC-BY-SA. + openRussianArchive = pkgs.fetchzip { + name = "openrussian-20230117"; + url = "https://tazj.in/blobs/openrussian-20230117.tar.xz"; + sha256 = "06jl7i23cx58a0n2626hb82xlzimixvnxp7lxdw0g664kv9bmw25"; + }; + + # development shell with native deps + shell = pkgs.mkShell { + inherit buildInputs; + + # make datasets available in the environment + OPENCORPORA_DATA = openCorpora; + OPENRUSSIAN_DATA = openRussianArchive; + }; + +in +lib.fix (self: depot.third_party.naersk.buildPackage { + src = depot.third_party.gitignoreSource ./.; + inherit buildInputs; + + passthru = depot.nix.readTree.drvTargets { + inherit shell openCorpora; + + # target that actually builds an entire database + database = pkgs.runCommand "tvl-russian-db.sqlite" + { + OPENCORPORA_DATA = openCorpora; + OPENRUSSIAN_DATA = openRussianArchive; + } "${self}/bin/data-import --output $out"; + }; +}) diff --git a/corp/russian/data-import/src/db_setup.rs b/corp/russian/data-import/src/db_setup.rs new file mode 100644 index 000000000000..c9fb51738651 --- /dev/null +++ b/corp/russian/data-import/src/db_setup.rs @@ -0,0 +1,298 @@ +//! This module prepares the database layout. +//! +//! The XML import may be in an arbitrary order, so importing data is +//! a multi-step process where we first set up schemas matching the +//! data layout, import the data, and then modify the schema to +//! introduce things like foreign key constraints between tables that +//! represent relations. 
+ +use super::Ensure; +use crate::oc_parser::*; +use crate::or_parser; +use log::{debug, info}; +use rusqlite::Connection; + +/// Sets up an initial schema which matches the OpenCorpora data. +pub fn initial_oc_schema(conn: &Connection) { + conn.execute_batch( + r#" +-- table for plain import of grammemes from XML +CREATE TABLE oc_grammemes ( + name TEXT PRIMARY KEY, + parent TEXT, + alias TEXT, + description TEXT +) STRICT; + +-- table for plain import of lemmas (*not* their variations!) +CREATE TABLE oc_lemmas ( + id INTEGER PRIMARY KEY, + lemma TEXT NOT NULL +) STRICT; + +-- table for relationship between grammemes and lemmas +CREATE TABLE oc_lemma_grammemes ( + lemma INTEGER, + grammeme TEXT NOT NULL, + FOREIGN KEY(lemma) REFERENCES oc_lemmas(id) +) STRICT; + +-- table for all words, i.e. including variations of lemmata; `id` aliases the rowid so that it can be a foreign key target +CREATE TABLE oc_words ( + id INTEGER PRIMARY KEY, + lemma INTEGER NOT NULL, + word TEXT NOT NULL, + FOREIGN KEY(lemma) REFERENCES oc_lemmas(id) +) STRICT; + +-- table for relationship between words and grammemes +CREATE TABLE oc_word_grammemes ( + word INTEGER NOT NULL, + grammeme TEXT NOT NULL, + FOREIGN KEY(word) REFERENCES oc_words(id) +) STRICT; + +-- table for link types +CREATE TABLE oc_link_types ( + id INTEGER PRIMARY KEY, + name TEXT +) STRICT; + +-- table for links between lemmata +CREATE TABLE oc_links ( + id INTEGER PRIMARY KEY, + link_type INTEGER NOT NULL, + from_lemma INTEGER NOT NULL, + to_lemma INTEGER NOT NULL, + FOREIGN KEY(link_type) REFERENCES oc_link_types(id), + FOREIGN KEY(from_lemma) REFERENCES oc_lemmas(id), + FOREIGN KEY(to_lemma) REFERENCES oc_lemmas(id) +) STRICT; +"#, + ) + .ensure("setting up OpenCorpora table schema failed"); + + info!("set up initial table schema for OpenCorpora import"); +} + +/// Inserts a single OpenCorpora element into the initial table structure.
+pub fn insert_oc_element(conn: &Connection, elem: OcElement) { + match elem { + OcElement::Grammeme(grammeme) => { + conn.execute( + "INSERT INTO oc_grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)", + ( + &grammeme.name, + &grammeme.parent, + &grammeme.alias, + &grammeme.description, + ), + ) + .ensure("failed to insert grammeme"); + + debug!("inserted grammeme {}", grammeme.name); + } + + OcElement::Lemma(lemma) => insert_lemma(conn, lemma), + + OcElement::LinkType(lt) => { + conn.execute( + "INSERT INTO oc_link_types (id, name) VALUES (?1, ?2)", + (&lt.id, &lt.name), + ) + .ensure("failed to insert link type"); + + info!("inserted link type {}", lt.name); + } + + OcElement::Link(link) => { + let mut stmt = conn + .prepare_cached( + "INSERT INTO oc_links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, ?4)", + ) + .ensure("failed to prepare link statement"); + + stmt.execute((&link.id, &link.link_type, &link.from, &link.to)) + .ensure("failed to insert link"); + + debug!("inserted link {}", link.id); + } + } +} + +/// Insert a single lemma into the initial structure. This is somewhat +/// involved because it also establishes a bunch of relations. +fn insert_lemma(conn: &Connection, lemma: Lemma) { + // insert the lemma itself + let mut stmt = conn + .prepare_cached("INSERT INTO oc_lemmas (id, lemma) VALUES (?1, ?2)") + .ensure("failed to prepare statement"); + + stmt.execute((&lemma.id, &lemma.lemma.word)) + .ensure("failed to insert lemma"); + + // followed by its relations to the grammemes set + let mut stmt = conn + .prepare_cached("INSERT INTO oc_lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)") + .ensure("failed to prepare statement"); + + for grammeme in lemma.grammemes { + stmt.execute((&lemma.id, grammeme)) + .ensure("failed to insert grammeme<>lemma relationship"); + } + + // followed by all of its variations ...
+ let mut word_insert = conn + .prepare_cached("INSERT INTO oc_words (lemma, word) VALUES (?1, ?2)") + .ensure("failed to prepare word statement"); + + let mut word_grammeme = conn + .prepare_cached("INSERT INTO oc_word_grammemes (word, grammeme) VALUES (?1, ?2)") + .ensure("failed to prepare word<>grammeme statement"); + + for variation in lemma.variations { + // insert the word itself and get its rowid + word_insert + .execute((&lemma.id, &variation.word)) + .ensure("failed to insert word"); + let row_id = conn.last_insert_rowid(); + + // then insert its grammeme links + for grammeme in variation.grammemes { + word_grammeme + .execute((row_id, grammeme)) + .ensure("failed to insert word<>grammeme link"); + } + } + + debug!("inserted lemma {}", lemma.id); +} + +/// Sets up an initial schema for the OpenRussian data. +pub fn initial_or_schema(conn: &Connection) { + conn.execute_batch( + r#" +CREATE TABLE or_words ( + id INTEGER PRIMARY KEY, + bare TEXT NOT NULL, + accented TEXT, + derived_from_word_id INTEGER, + rank INTEGER, + word_type TEXT, + level TEXT +) STRICT; + +CREATE TABLE or_words_forms ( + id INTEGER PRIMARY KEY, + word_id INTEGER NOT NULL, + form_type TEXT, + position TEXT, + form TEXT, + form_bare TEXT, + FOREIGN KEY(word_id) REFERENCES or_words(id) +) STRICT; + +CREATE TABLE or_translations ( + id INTEGER PRIMARY KEY, + word_id INTEGER NOT NULL, + translation TEXT, + example_ru TEXT, + example_tl TEXT, + info TEXT, + FOREIGN KEY(word_id) REFERENCES or_words(id) +) STRICT; +"#, + ) + .ensure("setting up OpenRussian table schema failed"); + + info!("set up initial table schema for OpenRussian import"); +} + +pub fn insert_or_words<I: Iterator<Item = or_parser::Word>>(conn: &Connection, words: I) { + let mut stmt = conn + .prepare_cached( + " +INSERT INTO or_words (id, bare, accented, derived_from_word_id, rank, word_type, level) +VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7) +", + ) + .ensure("failed to prepare OR words statement"); + let mut count = 0; + + for word in words { + stmt.execute(( + word.id, + word.bare, + word.accented,
+ word.derived_from_word_id, + word.rank, + word.word_type, + word.level, + )) + .ensure("failed to insert OR word"); + count += 1; + } + + info!("inserted {} OpenRussian words", count); +} + +pub fn insert_or_word_forms<I: Iterator<Item = or_parser::WordForm>>(conn: &Connection, forms: I) { + let mut stmt = conn + .prepare_cached( + " +INSERT INTO or_words_forms (id, word_id, form_type, position, form, form_bare) +VALUES (?1, ?2, ?3, ?4, ?5, ?6) +", + ) + .ensure("failed to prepare OR word forms statement"); + let mut count = 0; + + for form in forms { + stmt.execute(( + form.id, + form.word_id, + form.form_type, + form.position, + form.form, + form.form_bare, + )) + .ensure("failed to insert OR word form"); + count += 1; + } + + info!("inserted {} OpenRussian word forms", count); +} + +pub fn insert_or_translations<I: Iterator<Item = or_parser::Translation>>( + conn: &Connection, + translations: I, +) { + let mut stmt = conn + .prepare_cached( + "INSERT INTO or_translations (id, word_id, translation, example_ru, example_tl, info) + VALUES (?1, ?2, ?3, ?4, ?5, ?6)", + ) + .ensure("failed to prepare OR translation statement"); + + let mut count = 0; + + for tl in translations { + if tl.lang != "en" { + continue; + } + + stmt.execute(( + tl.id, + tl.word_id, + tl.tl, + tl.example_ru, + tl.example_tl, + tl.info, + )) + .ensure("failed to insert OR translation"); + + count += 1; + } + + info!("inserted {} OpenRussian translations", count); +} diff --git a/corp/russian/data-import/src/main.rs b/corp/russian/data-import/src/main.rs new file mode 100644 index 000000000000..21da48e8d8f4 --- /dev/null +++ b/corp/russian/data-import/src/main.rs @@ -0,0 +1,298 @@ +//! This program imports Russian language data from OpenCorpora +//! ("Открытый корпус") and OpenRussian into a SQLite database that +//! can be used for [//corp/russian][corp-russian] projects. +//! +//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian +//! +//!
Ideally, running this on intact dumps should yield a fully +//! functional SQLite database compatible with all other tools +//! consuming it. +//! +//! ## OpenCorpora format +//! +//! The format used is partially documented on the [OpenCorpora +//! website][format-docs]. This seems to be a slightly outdated +//! format, however, hence some information about what the format +//! seems to be today. +//! +//! [format-docs]: http://opencorpora.org/?page=export +//! +//! The format is an XML file, which has several categories of data, +//! each with their own schema: +//! +//! * `grammemes`: These define units of grammar. They're *likely* pretty +//! static, and we'll *likely* want to map them into a custom set of +//! (simpler) categories. +//! +//! They form some kind of internal hierarchy, where some of them have a +//! `parent` attribute set to some other grammeme's `name`. +//! +//! There's a ridiculous number of these. +//! +//! * `restrictions`: Unclear, not documented on the page. They describe +//! something about the relationship between grammemes. +//! +//! * `lemmata`: this lists the actual lemmas, as well as all their +//! included morphological variants +//! +//! Each lemma has an `id` attribute uniquely identifying its dictionary +//! form, as well as a number of sub-elements: +//! +//! * the `l` element contains the lemma itself +//! * the `f` elements contain morphological variations +//! +//! Each of these sub-elements again contains a number of `g` elements, +//! which refer to the IDs of grammemes in their `v` attributes. +//! +//! * `<link_types>`: These list possible "relationships between lemmas", +//! basically just assigning them IDs and names. There's only 27 of +//! these. +//! +//! * `<links>`: Using the types defined above, this establishes links +//! between lemmas that have some kind of relationship. +//! +//! For example, a relationship `cardinal/ordinal` might be established +//! between the lemmas "два" and "второй". +//! +//!
## OpenRussian format +//! +//! The [OpenRussian](https://en.openrussian.org/dictionary) project +//! lets users export its database as a set of CSV files. For our +//! purposes, we download the files using `<tab>` separators. +//! +//! Whereas OpenCorpora opts for a flat structure with a "tag" system +//! (through its flexible grammemes), OpenRussian has a fixed pre-hoc +//! structure into which it sorts some words with their morphologies. +//! The OpenRussian database is much smaller as of January 2023 (~1.7 +//! million words vs. >5 million for OpenCorpora), but some of the +//! information is much more practically useful. +//! +//! Two very important bits of information OpenRussian has are accent +//! marks (most tables containing actual words have a normal form +//! containing an accent mark, and a "bare" form without) and +//! translations into English and German. +//! +//! The full dump includes the following tables (and some more): +//! +//! * `words`: List of lemmas in the corpus, with various bits of +//! metadata as well as hand-written notes. +//! +//! * `adjectives`: Contains IDs for words that are adjectives. +//! +//! * `nouns`: IDs for words that are nouns; and noun metadata (e.g. +//! gender, declinability) +//! +//! * `verbs`: IDs of words that are verbs, including their aspect and +//! "partnered" verb in the other aspect +//! +//! * `words_forms`: Contains all morphed variants of the lemmas from +//! `words`, including information about their grammeme, and accent +//! marks. +//! +//! * `words_rels`: Contains relations between words, containing +//! information like "synonyms" or general relation between words. +//! +//! * `translations`: Contains translations tagged by target language, +//! as well as examples and (occasionally) additional information. +//! +//! These tables also contain something, but have not been analysed +//! yet: +//! +//! * `expressions_words` +//! * `sentences` +//! * `sentences_translations` +//!
* `sentences_words` + +use log::{error, info}; +use rusqlite::{Connection, Result}; +use std::env; +use std::fmt::Display; +use std::fs::File; +use std::io::BufReader; + +mod db_setup; +mod mappings; +mod oc_parser; +mod or_parser; + +struct Args { + output: String, + or_input: String, + oc_input: String, +} + +impl Args { + fn populated(&self) -> bool { + !(self.output.is_empty() || self.or_input.is_empty() || self.oc_input.is_empty()) + } +} + +fn usage(binary_name: &str) { + bail(format!( + "usage: {} --output <output-file> --or-input <or-input> --oc-input <oc-input>", + binary_name + )); +} + +/// Parses the command line into [`Args`]. +/// +/// Input paths default to the `OPENRUSSIAN_DATA` and `OPENCORPORA_DATA` +/// environment variables and can be overridden by flags. Logs an error +/// and exits if a flag is unknown or lacks its value, or if any of the +/// three paths is still empty once all arguments have been read. +fn parse_args() -> Args { + let mut args_iter = env::args(); + let binary_name = args_iter.next().unwrap_or_else(|| "data-import".into()); + + let mut args = Args { + output: "".into(), + or_input: env::var("OPENRUSSIAN_DATA").unwrap_or_default(), + oc_input: env::var("OPENCORPORA_DATA").unwrap_or_default(), + }; + + while let Some(arg) = args_iter.next() { + match arg.as_str() { + "--output" => { + args.output = args_iter.next().ensure("--output requires a value"); + } + + "--or-input" => { + args.or_input = args_iter.next().ensure("--or-input requires a value"); + } + + "--oc-input" => { + args.oc_input = args_iter.next().ensure("--oc-input requires a value"); + } + + _ => usage(&binary_name), + } + } + + if !args.populated() { + usage(&binary_name); + } + + args +} + +fn open_corpora(conn: &Connection, args: &Args) { + let input_file = File::open(&args.oc_input).ensure("failed to open input file"); + let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file)); + db_setup::initial_oc_schema(conn); + + let mut tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + + let mut count = 0; + + while let Some(elem) = parser.next_element() { + // commit every 1000 things (but not the still-empty transaction at 0) + if count > 0 && count % 1000 == 0 { + tx.commit().ensure("transaction failed"); + tx = conn + .unchecked_transaction() + .ensure("failed to start new transaction");
+ info!("transaction committed at watermark {}", count); + } + + db_setup::insert_oc_element(&tx, elem); + + count += 1; + } + + tx.commit().ensure("final OpenCorpora commit failed"); + + info!("finished OpenCorpora import"); +} + +fn open_russian(conn: &Connection, args: &Args) { + let parser = or_parser::OpenRussianParser::new(&args.or_input); + + db_setup::initial_or_schema(conn); + + { + let tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + + db_setup::insert_or_words(&tx, parser.words()); + tx.commit().ensure("OpenRussian words commit failed"); + } + + { + let tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + + db_setup::insert_or_word_forms(&tx, parser.words_forms()); + tx.commit().ensure("OpenRussian word forms commit failed"); + } + + { + let tx = conn + .unchecked_transaction() + .ensure("failed to start transaction"); + + db_setup::insert_or_translations(&tx, parser.translations()); + tx.commit().ensure("OpenRussian translations commit failed"); + } + + info!("finished OpenRussian import"); +} + +fn main() { + env_logger::builder() + .filter_level(log::LevelFilter::Info) + .init(); + + let args = parse_args(); + + info!("output path: {}", args.output); + info!("OpenCorpora input path: {}", args.oc_input); + info!("OpenRussian input path: {}", args.or_input); + + let conn = Connection::open(&args.output).ensure("failed to open DB connection"); + + open_corpora(&conn, &args); + open_russian(&conn, &args); + + // afterwards: + // add actual IDs to grammemes + // properly reference keys internally + // add foreign key constraint on lemma_grammemes.grammeme +} + +/// It's like `expect`, but through `log::error`.
+trait Ensure<T> { + fn ensure<S: Into<String>>(self, msg: S) -> T; +} + +impl<T, E: Display> Ensure<T> for Result<T, E> { + fn ensure<S: Into<String>>(self, msg: S) -> T { + match self { + Ok(x) => x, + Err(err) => { + error!("{}: {}", msg.into(), err); + std::process::exit(1); + } + } + } +} + +impl<T> Ensure<T> for Option<T> { + fn ensure<S: Into<String>>(self, msg: S) -> T { + match self { + Some(x) => x, + None => { + error!("{}", msg.into()); + std::process::exit(1); + } + } + } +} + +fn bail<S: Into<String>>(msg: S) -> ! { + error!("{}", msg.into()); + std::process::exit(1); +} diff --git a/corp/russian/data-import/src/mappings.rs b/corp/russian/data-import/src/mappings.rs new file mode 100644 index 000000000000..985088a56628 --- /dev/null +++ b/corp/russian/data-import/src/mappings.rs @@ -0,0 +1,185 @@ +//! Manual mapping of some data structures in OC/OR corpora. + +/// Maps the *names* of OpenRussian word types (the `word_type` field +/// in the `or_words` table) to the *set* of OpenCorpora grammemes +/// commonly attached to lemmata of this type in OC. +/// +/// Some word types just don't map over, and are omitted. Many words +/// also have an empty word type. +pub const WORD_TYPES_GRAMMEME_MAP: &'static [(&'static str, &'static [&'static str])] = &[ + ("adjective", &["ADJF"]), + ("adverb", &["ADVB"]), + ("noun", &["NOUN"]), + ("verb", &["INFN"]), // or "VERB" ... +]; + +/// Maps the *names* of OpenRussian grammemes (the `form_type` fields +/// in the `or_word_forms` table) to the *set* of OpenCorpora +/// grammemes attached to their corresponding lemma in the `oc_lemmas` +/// table. +/// +/// This *only* includes grammatical information about the lemma of +/// the word (such as whether it is a verb or other type), but *not* +/// information about the specific instance of the word (such as its +/// gender). +/// +/// Correctly corresponding these requires use of all mapping tables. 
+pub const FORMS_LEMMATA_GRAMMEME_MAP: &'static [(&'static str, &'static [&'static str])] = &[ + ("ru_adj_comparative", &["COMP"]), + ("ru_adj_superlative", &["ADJF", "Supr"]), + ("ru_adj_f_acc", &["ADJF"]), + ("ru_adj_f_dat", &["ADJF"]), + ("ru_adj_f_gen", &["ADJF"]), + ("ru_adj_f_inst", &["ADJF"]), + ("ru_adj_f_nom", &["ADJF"]), + ("ru_adj_f_prep", &["ADJF"]), + ("ru_adj_m_acc", &["ADJF"]), + ("ru_adj_m_dat", &["ADJF"]), + ("ru_adj_m_gen", &["ADJF"]), + ("ru_adj_m_inst", &["ADJF"]), + ("ru_adj_m_nom", &["ADJF"]), + ("ru_adj_m_prep", &["ADJF"]), + ("ru_adj_n_acc", &["ADJF"]), + ("ru_adj_n_dat", &["ADJF"]), + ("ru_adj_n_gen", &["ADJF"]), + ("ru_adj_n_inst", &["ADJF"]), + ("ru_adj_n_nom", &["ADJF"]), + ("ru_adj_n_prep", &["ADJF"]), + ("ru_adj_pl_acc", &["ADJF"]), + ("ru_adj_pl_dat", &["ADJF"]), + ("ru_adj_pl_gen", &["ADJF"]), + ("ru_adj_pl_inst", &["ADJF"]), + ("ru_adj_pl_nom", &["ADJF"]), + ("ru_adj_pl_prep", &["ADJF"]), + ("ru_adj_short_f", &["ADJS"]), + ("ru_adj_short_m", &["ADJS"]), + ("ru_adj_short_n", &["ADJS"]), + ("ru_adj_short_pl", &["ADJS"]), + ("ru_noun_pl_acc", &["NOUN"]), + ("ru_noun_pl_dat", &["NOUN"]), + ("ru_noun_pl_gen", &["NOUN"]), + ("ru_noun_pl_inst", &["NOUN"]), + ("ru_noun_pl_nom", &["NOUN"]), + ("ru_noun_pl_prep", &["NOUN"]), + ("ru_noun_sg_acc", &["NOUN"]), + ("ru_noun_sg_dat", &["NOUN"]), + ("ru_noun_sg_gen", &["NOUN"]), + ("ru_noun_sg_inst", &["NOUN"]), + ("ru_noun_sg_nom", &["NOUN"]), + ("ru_noun_sg_prep", &["NOUN"]), + ("ru_verb_gerund_past", &["GRND"]), + ("ru_verb_gerund_present", &["GRND"]), + ("ru_verb_imperative_pl", &["VERB"]), + ("ru_verb_imperative_sg", &["VERB"]), + ("ru_verb_past_f", &["VERB"]), + ("ru_verb_past_m", &["VERB"]), + ("ru_verb_past_n", &["VERB"]), + ("ru_verb_past_pl", &["VERB"]), + ("ru_verb_presfut_pl1", &["VERB"]), + ("ru_verb_presfut_pl2", &["VERB"]), + ("ru_verb_presfut_pl3", &["VERB"]), + ("ru_verb_presfut_sg1", &["VERB"]), + ("ru_verb_presfut_sg2", &["VERB"]), + ("ru_verb_presfut_sg3", &["VERB"]), + ( + 
"ru_base", + &[ /* nothing consistent, except often 'Fixd' */ ], + ), + ("ru_verb_participle_active_past", &["PRTF", "past", "actv"]), + ( + "ru_verb_participle_active_present", + &["PRTF", "pres", "actv"], + ), + ( + "ru_verb_participle_passive_past", + &["PRTF", "past", "passv"], + ), + ( + "ru_verb_participle_passive_present", + &["PRTF", "pres", "passv"], + ), +]; + +/// Maps the *names* of OpenRussian grammemes (the `form_type` fields +/// in the `or_word_forms` table) to the *set* of OpenCorpora +/// grammemes attached to them corresponding words in the `oc_words` +/// table. +/// +/// This includes grammatical information about the "instance" of the +/// word (such as its gender), but *not* the higher-level type +/// information about its lemma. +/// +/// Correctly corresponding these requires use of all mapping tables. +pub const FORMS_WORDS_GRAMMEME_MAP: &'static [(&'static str, &'static [&'static str])] = &[ + ("ru_adj_comparative", &["Cmp2"]), + ("ru_adj_f_acc", &["femn", "sing", "accs"]), + ("ru_adj_f_dat", &["femn", "sing", "datv"]), + ("ru_adj_f_gen", &["femn", "sing", "gent"]), + ("ru_adj_f_inst", &["femn", "sing", "ablt"]), + ("ru_adj_f_nom", &["femn", "sing", "nomn"]), + ("ru_adj_f_prep", &["femn", "sing", "loct"]), + ("ru_adj_m_acc", &["masc", "sing", "accs"]), + ("ru_adj_m_dat", &["masc", "sing", "datv"]), + ("ru_adj_m_gen", &["masc", "sing", "gent"]), + ("ru_adj_m_inst", &["masc", "sing", "ablt"]), + ("ru_adj_m_nom", &["masc", "sing", "nomn"]), + ("ru_adj_m_prep", &["masc", "sing", "loct"]), + ("ru_adj_n_acc", &["neut", "sing", "accs"]), + ("ru_adj_n_dat", &["neut", "sing", "datv"]), + ("ru_adj_n_gen", &["neut", "sing", "gent"]), + ("ru_adj_n_inst", &["neut", "sing", "ablt"]), + ("ru_adj_n_nom", &["neut", "sing", "nomn"]), + ("ru_adj_n_prep", &["neut", "sing", "loct"]), + ("ru_adj_pl_acc", &["plur", "accs"]), + ("ru_adj_pl_dat", &["plur", "datv"]), + ("ru_adj_pl_gen", &["plur", "gent"]), + ("ru_adj_pl_inst", &["plur", "ablt"]), + 
("ru_adj_pl_nom", &["plur", "nomn"]), + ("ru_adj_pl_prep", &["plur", "loct"]), + ("ru_adj_short_f", &["femn", "sing"]), + ("ru_adj_short_m", &["masc", "sing"]), + ("ru_adj_short_n", &["neut", "sing"]), + ("ru_adj_short_pl", &["plur"]), + ("ru_noun_pl_acc", &["plur", "accs"]), + ("ru_noun_pl_dat", &["plur", "datv"]), + ("ru_noun_pl_gen", &["plur", "gent"]), + ("ru_noun_pl_inst", &["plur", "ablt"]), + ("ru_noun_pl_nom", &["plur", "nomn"]), + ("ru_noun_pl_prep", &["plur", "loct"]), + ("ru_noun_sg_acc", &["sing", "accs"]), + ("ru_noun_sg_dat", &["sing", "datv"]), + ("ru_noun_sg_gen", &["sing", "gent"]), + ("ru_noun_sg_inst", &["sing", "ablt"]), + ("ru_noun_sg_nom", &["sing", "nomn"]), + ("ru_noun_sg_prep", &["sing", "loct"]), + ("ru_verb_gerund_past", &["past", "V-sh"]), + ("ru_verb_imperative_pl", &["plur", "impr"]), + ("ru_verb_imperative_sg", &["sing", "impr"]), + ("ru_verb_past_f", &["femn", "sing", "past"]), + ("ru_verb_past_m", &["masc", "sing", "past"]), + ("ru_verb_past_n", &["neut", "sing", "past"]), + ("ru_verb_past_pl", &["plur", "past"]), + // these also contain "pres" or "futr", depending on the verb. + ("ru_verb_presfut_pl1", &["plur", "1per"]), + ("ru_verb_presfut_pl2", &["plur", "2per"]), + ("ru_verb_presfut_pl3", &["plur", "3per"]), + ("ru_verb_presfut_sg1", &["sing", "1per"]), + ("ru_verb_presfut_sg2", &["sing", "2per"]), + ("ru_verb_presfut_sg3", &["sing", "3per"]), + // Unclear items, probably only useful tags on lemmata + ( + "ru_verb_gerund_present", + &["pres" /* prob. something missing? */], + ), + ( + "ru_adj_superlative", + &[/* TODO: unclear, random list of grammemes?! */], + ), + ("ru_base", &[/* TODO: unclear */]), + // These have no useful tags in the forms table, only gender & + // case tagging. 
+ ("ru_verb_participle_active_past", &[]), + ("ru_verb_participle_active_present", &[]), + ("ru_verb_participle_passive_past", &[]), + ("ru_verb_participle_passive_present", &[]), +]; diff --git a/corp/russian/data-import/src/oc_parser.rs b/corp/russian/data-import/src/oc_parser.rs new file mode 100644 index 000000000000..8103ebd92369 --- /dev/null +++ b/corp/russian/data-import/src/oc_parser.rs @@ -0,0 +1,470 @@ +use super::{bail, Ensure}; +use log::{info, warn}; +use std::str::FromStr; +use xml::attribute::OwnedAttribute; +use xml::name::OwnedName; +use xml::reader::XmlEvent; +use xml::EventReader; + +#[derive(Default, Debug)] +pub struct Grammeme { + pub parent: Option<String>, + pub name: String, + pub alias: String, + pub description: String, +} + +/// Single form of a word (either its lemma, or the variations). +#[derive(Debug, Default)] +pub struct Variation { + pub word: String, + pub grammemes: Vec<String>, +} + +#[derive(Debug, Default)] +pub struct Lemma { + pub id: u64, + pub lemma: Variation, + pub grammemes: Vec<String>, + pub variations: Vec<Variation>, +} + +#[derive(Debug, Default)] +pub struct LinkType { + pub id: u64, + pub name: String, +} + +#[derive(Debug, Default)] +pub struct Link { + pub id: u64, // link itself + pub from: u64, // lemma + pub to: u64, // lemma + pub link_type: u64, +} + +#[derive(Debug)] +pub enum OcElement { + Grammeme(Grammeme), + Lemma(Lemma), + LinkType(LinkType), + Link(Link), +} + +#[derive(Debug, PartialEq)] +enum ParserState { + /// Parser is not parsing any particular section and waiting for a + /// start tag instead. + Init, + + /// Parser is parsing grammemes. + Grammemes, + + /// Parser is parsing lemmata. + Lemmata, + + /// Parser is inside a lemma's actual lemma. + Lemma, + + /// Parser is parsing a morphological variation of a lemma. + Variation, + + /// Parser is parsing link types. + LinkTypes, + + /// Parser is parsing links. 
+ Links, + + /// Parser has seen the end of the line and nothing more is + /// available. + Ended, +} + +pub struct OpenCorporaParser<R: std::io::Read> { + reader: EventReader<R>, + state: ParserState, +} + +#[derive(PartialEq)] +enum SectionState { + /// Actively interested in parsing this section. + Active, + + /// Section is known, but currently ignored. + Inactive, + + /// Section is unknown (probably a bug). + Unknown, +} + +fn section_state(section: &str) -> SectionState { + match section { + "grammemes" | "lemmata" | "link_types" | "links" => SectionState::Active, + "restrictions" => SectionState::Inactive, + _ => SectionState::Unknown, + } +} + +impl<R: std::io::Read> OpenCorporaParser<R> { + pub fn new(reader: R) -> Self { + let config = xml::ParserConfig::new().trim_whitespace(true); + let reader = EventReader::new_with_config(reader, config); + + Self { + reader, + state: ParserState::Init, + } + } + + /// Pull an `OcElement` out of the parser. Returns `None` if the + /// parser stream has ended. + pub fn next_element(&mut self) -> Option<OcElement> { + if self.state == ParserState::Ended { + return None; + } + + // Pull the next element to determine what context to enter + // next. + loop { + match &self.next() { + // no-op events that do not affect parser state + XmlEvent::Comment(_) + | XmlEvent::Whitespace(_) + | XmlEvent::ProcessingInstruction { .. } + | XmlEvent::StartDocument { .. } => continue, + XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name } + if name.local_name == "dictionary" => + { + continue + } + + // end of the file, nothing more to return + XmlEvent::EndDocument => { + self.state = ParserState::Ended; + return None; + } + + // some sections are skipped + XmlEvent::StartElement { name, .. 
} | XmlEvent::EndElement { name } + if section_state(&name.local_name) == SectionState::Inactive => + { + info!("skipping {} section", name.local_name); + self.skip_section(&name.local_name); + } + + // active section events start specific parser states ... + XmlEvent::StartElement { name, .. } + if section_state(&name.local_name) == SectionState::Active => + { + self.state = match name.local_name.as_str() { + "grammemes" => ParserState::Grammemes, + "lemmata" => ParserState::Lemmata, + "link_types" => ParserState::LinkTypes, + "links" => ParserState::Links, + _ => unreachable!(), + }; + } + + // ... or end them + XmlEvent::EndElement { name, .. } + if section_state(&name.local_name) == SectionState::Active => + { + // TODO: assert that the right section ended + self.state = ParserState::Init; + } + + // actual beginning of an actual element, dispatch accordingly + event @ XmlEvent::StartElement { + name, attributes, .. + } => match &self.state { + ParserState::Grammemes => { + return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes))) + } + + ParserState::Lemmata => { + return Some(OcElement::Lemma(self.parse_lemma(name, attributes))) + } + + ParserState::LinkTypes => { + return Some(OcElement::LinkType(self.parse_link_type(name, attributes))) + } + + ParserState::Links if name.local_name == "link" => { + return Some(OcElement::Link(self.parse_link(attributes))) + } + + ParserState::Init | ParserState::Ended => bail(format!( + "parser received an unexpected start element while in state {:?}: {:?}", + self.state, event + )), + + other => bail(format!( + "next_element() called while parser was in state {:?}", + other + )), + }, + + // finally, events that indicate a bug if they're + // encountered here + event @ XmlEvent::EndElement { .. 
} + | event @ XmlEvent::CData(_) + | event @ XmlEvent::Characters(_) => { + bail(format!("unexpected XML event: {:?}", event)) + } + } + } + } + + /// Skip a section by advancing the parser state until we see an + /// end element for the skipped section. + fn skip_section(&mut self, section: &str) { + loop { + match self.next() { + XmlEvent::EndElement { name } if name.local_name == section => return, + _ => continue, + } + } + } + + fn next(&mut self) -> XmlEvent { + self.reader.next().ensure("XML parsing failed") + } + + /// Parse a tag that should have plain string content. + fn parse_string(&mut self, tag_name: &str) -> String { + let mut out = String::new(); + + loop { + match self.next() { + // ignore irrelevant things + XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue, + + // set the content + XmlEvent::Characters(content) => { + out = content; + } + + // expect the end of the element + XmlEvent::EndElement { name } if name.local_name == tag_name => return out, + + // fail on everything unexpected + event => bail(format!( + "unexpected element while parsing <{}>: {:?}", + tag_name, event + )), + } + } + } + + /// Parse a single `<grammeme>` tag. + fn parse_grammeme(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Grammeme { + if name.local_name != "grammeme" { + bail(format!( + "expected to parse a grammeme, but found <{}>", + name.local_name + )); + } + + let mut grammeme = Grammeme::default(); + + for attr in attributes { + if attr.name.local_name == "parent" && !attr.value.is_empty() { + grammeme.parent = Some(attr.value.clone()); + } + } + + loop { + match self.next() { + // ignore irrelevant things + XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue, + + // expect known tags + XmlEvent::StartElement { name, .. } if name.local_name == "name" => { + grammeme.name = self.parse_string("name"); + } + + XmlEvent::StartElement { name, .. 
} if name.local_name == "alias" => { + grammeme.alias = self.parse_string("alias"); + } + + XmlEvent::StartElement { name, .. } if name.local_name == "description" => { + grammeme.description = self.parse_string("description"); + } + + // handle end of the grammeme + XmlEvent::EndElement { name } if name.local_name == "grammeme" => break, + + // fail on everything unexpected + event => bail(format!( + "unexpected element while parsing <grammeme>: {:?}", + event + )), + } + } + + grammeme + } + + fn parse_lemma(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Lemma { + if name.local_name != "lemma" { + bail(format!( + "expected to parse a lemma, but found <{}>", + name.local_name + )); + } + + self.state = ParserState::Lemma; + let mut lemma = Lemma::default(); + + for attr in attributes { + if attr.name.local_name == "id" { + lemma.id = u64::from_str(&attr.value).ensure("failed to parse lemma ID"); + } + } + + loop { + match self.next() { + // <lemma> has ended + XmlEvent::EndElement { name } if name.local_name == "lemma" => { + self.state = ParserState::Lemmata; + return lemma; + } + + // actual lemma content + XmlEvent::StartElement { + name, attributes, .. 
+ } => { + match name.local_name.as_str() { + // beginning to parse the lemma itself + "l" => { + lemma.lemma.word = attributes + .into_iter() + .find(|attr| attr.name.local_name == "t") + .map(|attr| attr.value) + .ensure(format!("lemma {} had no actual word", lemma.id)); + } + + // parsing a lemma variation + "f" => { + self.state = ParserState::Variation; + + let word = attributes + .into_iter() + .find(|attr| attr.name.local_name == "t") + .map(|attr| attr.value) + .ensure(format!( + "variation of lemma {} had no actual word", + lemma.id + )); + + lemma.variations.push(Variation { + word, + grammemes: vec![], + }); + } + + // parse a grammeme association + "g" => { + let grammeme = attributes + .into_iter() + .find(|attr| attr.name.local_name == "v") + .map(|attr| attr.value) + .ensure(format!( + "grammeme association in lemma {} missing ID", + lemma.id + )); + + match self.state { + ParserState::Lemma => { + lemma.grammemes.push(grammeme); + } + + ParserState::Variation => { + lemma + .variations + .last_mut() + .ensure("variations should be non-empty") + .grammemes + .push(grammeme); + } + + _ => bail(format!("invalid parser state: encountered grammeme association while in {:?}", self.state)), + } + } + + other => bail(format!("unexpected element while parsing lemma: {other}")), + }; + } + + XmlEvent::EndElement { name } => match name.local_name.as_str() { + "l" if self.state == ParserState::Lemma => continue, + "f" if self.state == ParserState::Variation => { + self.state = ParserState::Lemma; + continue; + } + "g" => continue, + other => bail(format!( + "unexpected </{other}> while parsing lemma {}", + lemma.id + )), + }, + + _ => continue, + } + } + } + + fn parse_link_type(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> LinkType { + if name.local_name != "type" { + bail(format!( + "expected to parse a link type, but found <{}>", + name.local_name + )); + } + + let mut link_type = LinkType::default(); + + for attr in attributes { + if 
attr.name.local_name == "id" { + link_type.id = u64::from_str(&attr.value).ensure("failed to parse link type ID"); + } + } + + link_type.name = self.parse_string("type"); + link_type + } + + fn parse_link(&mut self, attributes: &[OwnedAttribute]) -> Link { + let mut link = Link::default(); + + for attr in attributes { + let i_val = || u64::from_str(&attr.value).ensure("failed to parse link field"); + + match attr.name.local_name.as_str() { + "id" => { + link.id = i_val(); + } + "from" => { + link.from = i_val(); + } + "to" => { + link.to = i_val(); + } + "type" => { + link.link_type = i_val(); + } + + other => { + warn!("unexpected attribute {} on <link>", other); + continue; + } + } + } + + // expect the end of the <link> element, though since these + // are empty it should be immediate. + self.skip_section("link"); + + link + } +} diff --git a/corp/russian/data-import/src/or_parser.rs b/corp/russian/data-import/src/or_parser.rs new file mode 100644 index 000000000000..8bfc61dbef48 --- /dev/null +++ b/corp/russian/data-import/src/or_parser.rs @@ -0,0 +1,105 @@ +//! Parser for the OpenRussian data format. +//! +//! Note that when exporting OpenRussian data from the project you +//! have to choose an encoding. We choose tab-separated CSV files, as +//! tabs have a very low probability of actually appearing in the +//! input data and this skips some potential encoding issues. + +use super::Ensure; +use serde::Deserialize; +use std::fs::File; +use std::io::BufReader; +use std::path::PathBuf; + +/// A word from the `words` table. 
+#[derive(Debug, Deserialize)] +pub struct Word { + pub id: usize, + pub position: String, // TODO: unknown + pub bare: String, // TODO: unknown + pub accented: String, // TODO: unknown + pub derived_from_word_id: Option<usize>, + pub rank: Option<usize>, + pub disabled: String, // TODO: unknown + pub audio: String, // TODO: unknown + pub usage_en: String, // TODO: unknown + pub usage_de: String, // TODO: unknown + pub number_value: String, // TODO: unknown + + #[serde(rename = "type")] + pub word_type: String, // TODO: unknown + + pub level: String, // TODO: unknown + pub created_at: String, // TODO: unknown +} + +/// A word form from the `words_forms` table. +#[derive(Debug, Deserialize)] +pub struct WordForm { + pub id: usize, + pub word_id: usize, + pub form_type: String, + pub position: String, + pub form: String, + pub form_bare: String, +} + +/// A translation from the `translations` table. +#[derive(Debug, Deserialize)] +pub struct Translation { + pub id: usize, + pub lang: String, + pub word_id: usize, + pub position: String, + pub tl: String, // unknown + pub example_ru: String, + pub example_tl: String, + pub info: String, +} + +pub struct OpenRussianParser { + or_directory: PathBuf, +} + +pub type DynIter<T> = Box<dyn Iterator<Item = T>>; + +impl OpenRussianParser { + pub fn new<P: Into<PathBuf>>(path: P) -> Self { + OpenRussianParser { + or_directory: path.into(), + } + } + + pub fn words(&self) -> DynIter<Word> { + self.parser_for("words.csv") + } + + pub fn words_forms(&self) -> DynIter<WordForm> { + self.parser_for("words_forms.csv") + } + + pub fn translations(&self) -> DynIter<Translation> { + self.parser_for("translations.csv") + } + + /// Opens `file_name` inside the OpenRussian directory as a + /// tab-separated CSV file and returns an iterator that + /// deserialises each row into `T`. + /// + /// Logs an error and exits the process if the file cannot be + /// opened or if a row fails to deserialise. + fn parser_for<T: serde::de::DeserializeOwned + 'static>( + &self, + file_name: &str, + ) -> Box<dyn Iterator<Item = T>> { + let path = self.or_directory.join(file_name); + + let reader = csv::ReaderBuilder::new() + .delimiter(b'\t') + .from_reader(BufReader::new( + 
File::open(&path).ensure(format!("failed to open {}", path.display())), + )); + + Box::new(reader.into_deserialize().map(|result| { + result.ensure(format!( + "failed to deserialize {}", + std::any::type_name::<T>() + )) + })) + } +} diff --git a/corp/russian/predlozhnik/.gitignore b/corp/russian/predlozhnik/.gitignore new file mode 100644 index 000000000000..58eaf3e32687 --- /dev/null +++ b/corp/russian/predlozhnik/.gitignore @@ -0,0 +1,3 @@ +/target/ +**/*.rs.bk +dist/ diff --git a/corp/russian/predlozhnik/Cargo.lock b/corp/russian/predlozhnik/Cargo.lock new file mode 100644 index 000000000000..ad183fc28671 --- /dev/null +++ b/corp/russian/predlozhnik/Cargo.lock @@ -0,0 +1,471 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "boolinator" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfa8873f51c92e232f9bac4065cddef41b714152812bfc5f7672ba16d6ef8cd9" + +[[package]] +name = "bumpalo" +version = "3.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "console_error_panic_hook" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc" +dependencies = [ + "cfg-if", + "wasm-bindgen", +] + +[[package]] +name = "gloo" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"23947965eee55e3e97a5cd142dd4c10631cc349b48cecca0ed230fd296f568cd" +dependencies = [ + "gloo-console", + "gloo-dialogs", + "gloo-events", + "gloo-file", + "gloo-render", + "gloo-storage", + "gloo-timers", + "gloo-utils", +] + +[[package]] +name = "gloo-console" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b7ce3c05debe147233596904981848862b068862e9ec3e34be446077190d3f" +dependencies = [ + "gloo-utils", + "js-sys", + "serde", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "gloo-dialogs" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67062364ac72d27f08445a46cab428188e2e224ec9e37efdba48ae8c289002e6" +dependencies = [ + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "gloo-events" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68b107f8abed8105e4182de63845afcc7b69c098b7852a813ea7462a320992fc" +dependencies = [ + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "gloo-file" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8d5564e570a38b43d78bdc063374a0c3098c4f0d64005b12f9bbe87e869b6d7" +dependencies = [ + "gloo-events", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "gloo-render" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fd9306aef67cfd4449823aadcd14e3958e0800aa2183955a309112a84ec7764" +dependencies = [ + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "gloo-storage" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d6ab60bf5dbfd6f0ed1f7843da31b41010515c745735c970e821945ca91e480" +dependencies = [ + "gloo-utils", + "js-sys", + "serde", + "serde_json", + "thiserror", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "gloo-timers" +version = "0.2.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fb7d06c1c8cc2a29bee7ec961009a0b2caa0793ee4900c2ffb348734ba1c8f9" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "gloo-utils" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40913a05c8297adca04392f707b1e73b12ba7b8eab7244a4961580b1fd34063c" +dependencies = [ + "js-sys", + "serde", + "serde_json", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "indexmap" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" +dependencies = [ + "autocfg", + "hashbrown", +] + +[[package]] +name = "itoa" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754" + +[[package]] +name = "js-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "log" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "maplit" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" + +[[package]] +name 
= "once_cell" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" + +[[package]] +name = "predlozhnik" +version = "0.1.0" +dependencies = [ + "lazy_static", + "maplit", + "wasm-bindgen", + "yew", +] + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.101", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "ryu" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" + +[[package]] +name = "scoped-tls-hkt" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2e9d7eaddb227e8fbaaa71136ae0e1e913ca159b86c7da82f3e8f0044ad3a63" + +[[package]] +name = "serde" +version = "1.0.145" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728eb6351430bccb993660dfffc5a72f91ccc1295abaa8ce19b27ebe4f75568b" +dependencies = [ + 
"serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.145" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81fa1584d3d1bcacd84c277a0dfe21f5b0f6accf4a23d04d4c6d61f1af522b4c" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.101", +] + +[[package]] +name = "serde_json" +version = "1.0.85" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "slab" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" +dependencies = [ + "autocfg", +] + +[[package]] +name = "syn" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10deb33631e3c9018b9baf9dcbbc4f737320d2b576bac10f6aefa048fa407e3e" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.101", +] + +[[package]] +name = "unicode-ident" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasm-bindgen" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ed0d4f68a3015cc185aff4db9506a015f4b96f95303897bfa23f846db54064e" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b56f625e64f3a1084ded111c4d5f477df9f8c92df113852fa5a374dbda78826" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.28", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23639446165ca5a5de86ae1d8896b737ae80319560fbaa4c2887b7da6e7ebd7d" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0162dbf37223cd2afce98f3d0785506dcb8d266223983e4b5b525859e6e182b2" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.28", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ab9b36309365056cd639da3134bf87fa8f3d86008abf99e612384a6eecd459f" + +[[package]] +name = 
"web-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "yew" +version = "0.19.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a1ccb53e57d3f7d847338cf5758befa811cabe207df07f543c06f502f9998cd" +dependencies = [ + "console_error_panic_hook", + "gloo", + "gloo-utils", + "indexmap", + "js-sys", + "scoped-tls-hkt", + "slab", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "yew-macro", +] + +[[package]] +name = "yew-macro" +version = "0.19.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fab79082b556d768d6e21811869c761893f0450e1d550a67892b9bce303b7bb" +dependencies = [ + "boolinator", + "lazy_static", + "proc-macro-error", + "proc-macro2", + "quote", + "syn 1.0.101", +] diff --git a/corp/russian/predlozhnik/Cargo.toml b/corp/russian/predlozhnik/Cargo.toml new file mode 100644 index 000000000000..4fac0bf5abbd --- /dev/null +++ b/corp/russian/predlozhnik/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "predlozhnik" +version = "0.1.0" +edition = "2021" + +[dependencies] +maplit = "1.0" +lazy_static = "1.4" +yew = "0.19" + +# needs to be in sync with nixpkgs +wasm-bindgen = "= 0.2.89" diff --git a/corp/russian/predlozhnik/default.nix b/corp/russian/predlozhnik/default.nix new file mode 100644 index 000000000000..2137be111278 --- /dev/null +++ b/corp/russian/predlozhnik/default.nix @@ -0,0 +1,52 @@ +{ lib, pkgs, ... 
}:
+
+let
+  wasmRust = pkgs.rust-bin.stable.latest.default.override { # toolchain with the WASM target
+    targets = [ "wasm32-unknown-unknown" ];
+  };
+
+  cargoToml = with builtins; fromTOML (readFile ./Cargo.toml);
+
+  wasmBindgenMatch = # Cargo.toml pin equals the wasm-bindgen CLI version in nixpkgs
+    cargoToml.dependencies.wasm-bindgen == "= ${pkgs.wasm-bindgen-cli.version}";
+
+  assertWasmBindgen = assert (lib.assertMsg wasmBindgenMatch ''
+    Due to instability in the Rust WASM ecosystem, the trunk build
+    tool enforces that the Cargo-dependency version of `wasm-bindgen`
+    MUST match the version of the CLI supplied in the environment.
+
+    This can get out of sync when nixpkgs is updated. To resolve it,
+    wasm-bindgen must be bumped in the Cargo.toml file and cargo needs
+    to be run to resolve the dependencies.
+
+    Versions of `wasm-bindgen` in Cargo.toml:
+
+      Expected: '= ${pkgs.wasm-bindgen-cli.version}'
+      Actual: '${cargoToml.dependencies.wasm-bindgen}'
+  ''); pkgs.wasm-bindgen-cli;
+
+  deps = with pkgs; [ # tools put on PATH in buildPhase
+    binaryen
+    sass
+    wasmRust
+    trunk
+    assertWasmBindgen
+  ];
+
+in
+pkgs.rustPlatform.buildRustPackage rec {
+  pname = "predlozhnik";
+  version = "canon";
+  src = lib.cleanSource ./.;
+  cargoLock.lockFile = ./Cargo.lock;
+
+  buildPhase = ''
+    export PATH=${lib.makeBinPath deps}:$PATH
+    # Create the directory that HOME points to.
+    mkdir -p .home
+    export HOME=$PWD/.home
+    trunk build --release -d $out
+  '';
+
+  dontInstall = true;
+}
diff --git a/corp/russian/predlozhnik/index.css b/corp/russian/predlozhnik/index.css
new file mode 100644
index 000000000000..3529574c4f2b
--- /dev/null
+++ b/corp/russian/predlozhnik/index.css
@@ -0,0 +1,29 @@
+body {
+  max-width: 800px;
+  margin: 40px auto;
+}
+
+#header {
+  display: flex;
+  flex-direction: column;
+}
+
+.btn.btn-ghost:disabled {
+  border-color: #9f9f9f;
+  color: #9f9f9f;
+}
+
+#predlogi,#padezhi {
+  display: flex;
+  flex-direction: row;
+  flex-wrap: wrap;
+}
+
+.btn {
+  margin: 3px;
+  flex-grow: 1;
+}
+
+.footer {
+  text-align: right;
+}
diff --git a/corp/russian/predlozhnik/index.html b/corp/russian/predlozhnik/index.html
new file mode 100644
index 000000000000..6af1adc0bfba
--- /dev/null
+++ b/corp/russian/predlozhnik/index.html
@@ -0,0 +1,24 @@
+<!DOCTYPE html>
+<html>
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <link rel="stylesheet"
+          href="https://unpkg.com/terminal.css@0.7.2/dist/terminal.min.css" />
+    <link data-trunk rel="inline" href="index.css">
+    <title>Предложник</title>
+
+    <!-- Yandex.RTB -->
+    <script>window.yaContextCb=window.yaContextCb||[]</script>
+    <script src="https://yandex.ru/ads/system/context.js" async></script>
+  </head>
+  <body>
+    <noscript>
+      <h1>Предложник</h1>
+      <p>
+        ... показывает с какими падежами употребляются предлоги в
+        русском языке. Но, к сожалению, только с помощью Javascript.
+      </p>
+    </noscript>
+  </body>
+</html>
diff --git a/corp/russian/predlozhnik/src/main.rs b/corp/russian/predlozhnik/src/main.rs
new file mode 100644
index 000000000000..56ff04808f9d
--- /dev/null
+++ b/corp/russian/predlozhnik/src/main.rs
@@ -0,0 +1,357 @@
+use yew::html::Scope;
+use yew::prelude::*;
+
+use lazy_static::lazy_static;
+use maplit::hashmap;
+use std::collections::BTreeSet;
+use std::collections::HashMap;
+
+/// The six grammatical cases of Russian.
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+enum Падеж {
+    Именительный,
+    Родительный,
+    Дательный,
+    Винительный,
+    Творительный,
+    Предложный,
+}
+
+impl Падеж {
+    /// All cases, in declaration order.
+    const ВСЕ: [Self; 6] = [
+        Self::Именительный,
+        Self::Родительный,
+        Self::Дательный,
+        Self::Винительный,
+        Self::Творительный,
+        Self::Предложный,
+    ];
+
+    /// Question words for this case; the generic example appends them to the preposition.
+    fn вопрос(&self) -> &'static str {
+        use Падеж::*;
+        match self {
+            Именительный => "кто? Что?",
+            Родительный => "кого? Чего?",
+            Дательный => "кому? Чему?",
+            Винительный => "кого? Что?",
+            Творительный => "кем? Чем?",
+            Предложный => "ком? Чём?",
+        }
+    }
+}
+
+lazy_static! {
+    // Cases each preposition can be used with.
+    static ref ПО_ПРЕДЛОГУ: HashMap<&'static str, BTreeSet<Падеж>> = {
+        use Падеж::*;
+
+        hashmap! {
+            "без" => BTreeSet::from([Родительный]),
+            "близ" => BTreeSet::from([Родительный]),
+            "в" => BTreeSet::from([Винительный, Предложный]),
+            "вместо" => BTreeSet::from([Родительный]),
+            "вне" => BTreeSet::from([Родительный]),
+            "внутри" => BTreeSet::from([Родительный]),
+            "возле" => BTreeSet::from([Родительный]),
+            "вокруг" => BTreeSet::from([Родительный]),
+            "вроде" => BTreeSet::from([Родительный]),
+            "для" => BTreeSet::from([Родительный]),
+            "до" => BTreeSet::from([Родительный]),
+            "за" => BTreeSet::from([Винительный, Творительный]),
+            "из" => BTreeSet::from([Родительный]),
+            "из-за" => BTreeSet::from([Родительный]),
+            "из-под" => BTreeSet::from([Родительный]),
+            "к" => BTreeSet::from([Дательный]),
+            "кроме" => BTreeSet::from([Родительный]),
+            "между" => BTreeSet::from([Творительный, Родительный]),
+            "на" => BTreeSet::from([Винительный, Предложный]),
+            "над" => BTreeSet::from([Творительный]),
+            "нет" => BTreeSet::from([Родительный]),
+            "о" => BTreeSet::from([Винительный, Предложный]),
+            "около" => BTreeSet::from([Родительный]),
+            "от" => BTreeSet::from([Родительный]),
+            "перед" => BTreeSet::from([Творительный]),
+            "по" => BTreeSet::from([Винительный, Дательный, Предложный]),
+            "под" => BTreeSet::from([Винительный, Творительный]),
+            "после" => BTreeSet::from([Родительный]),
+            "при" => BTreeSet::from([Предложный]),
+            "про" => BTreeSet::from([Винительный]),
+            "ради" => BTreeSet::from([Родительный]),
+            "с" => BTreeSet::from([Родительный, Винительный, Творительный]),
+            "сквозь" => BTreeSet::from([Винительный]),
+            "среди" => BTreeSet::from([Родительный]),
+            "у" => BTreeSet::from([Родительный]),
+            "через" => BTreeSet::from([Винительный]),
+        }
+    };
+    // Inverse of ПО_ПРЕДЛОГУ: the prepositions usable with each case.
+    static ref ПО_ПАДЕЖУ: HashMap<Падеж, BTreeSet<&'static str>> = {
+        let mut m = hashmap!();
+
+        for c in Падеж::ВСЕ {
+            let mut предлоги: BTreeSet<&'static str> = BTreeSet::new();
+            for (k, v) in &*ПО_ПРЕДЛОГУ {
+                if v.contains(&c) {
+                    предлоги.insert(k);
+                }
+            }
+
+            m.insert(c, предлоги);
+        }
+
+        m
+    };
+    // All cases and all prepositions, in sorted order, for the button rows.
+    static ref ПАДЕЖИ: BTreeSet<Падеж> = BTreeSet::from(Падеж::ВСЕ);
+    static ref ПРЕДЛОГИ: BTreeSet<&'static str> = ПО_ПРЕДЛОГУ.keys().copied().collect();
+}
+
+/// Returns hand-written example markup for the preposition/case pairs that need it.
+/// `None` means the generic "preposition + question words" example applies.
+fn исключение(предлог: &str, падеж: Падеж) -> Option<Html> {
+    use Падеж::*;
+
+    match (предлог, падеж) {
+        ("в", Винительный) => Some(html! {"Во что? В кого?"}),
+
+        ("о", Винительный) => Some(html! {
+            <>
+                <p>{"О кого? Обо что?"}</p>
+                <p>{"Редко используется. Например:"}</p>
+                <ul>
+                    <li>{"Удариться о притолоку."}</li>
+                    <li>{"точить о камень."}</li>
+                </ul>
+            </>
+        }),
+
+        ("между", Родительный) => Some(html! {
+            <>
+                <p>{"Между чего?"}</p>
+                <p>{"Редко используется. Только в идиомах и старой литературе:"}</p>
+                <ul>
+                    <li>{"Читаю между строк."}</li>
+                </ul>
+            </>
+        }),
+
+        _ => None,
+    }
+}
+
+/// Component messages: the user selected (`Some`) or cleared (`None`) a case or preposition.
+enum Сообщение {
+    ВыбралПадеж(Option<Падеж>),
+    ВыбралПредлог(Option<&'static str>),
+}
+
+/// Component state: the current selection; `None` means nothing is selected.
+#[derive(Default)]
+struct Модель {
+    падеж: Option<Падеж>,
+    предлог: Option<&'static str>,
+}
+
+/// What the view needs for the current selection: the cases and prepositions whose
+/// buttons stay enabled, and the example to show once both are chosen.
+struct Вывод {
+    доступные_падежи: BTreeSet<Падеж>,
+    доступные_предлоги: BTreeSet<&'static str>,
+    объяснение: Option<Html>,
+}
+
+/// Renders the example block for a case/preposition pair, preferring a hand-written exception.
+fn объясни(падеж: Падеж, предлог: &str) -> Html {
+    let иск = match исключение(предлог, падеж) {
+        Some(exp) => html! { exp },
+        None => html! { format!("{} {}", предлог, падеж.вопрос()) },
+    };
+
+    html! {
+        <div id="obyasnenie">
+            <hr/>
+            <h2>{"Пример:"}</h2>
+            {иск}
+        </div>
+    }
+}
+
+/// Narrows the enabled cases and prepositions to those compatible with the selection;
+/// the example is produced only when both a case and a preposition are selected.
+fn ограничить(м: &Модель) -> Вывод {
+    match (м.падеж, &м.предлог) {
+        (Some(пж), Some(пл)) => Вывод {
+            доступные_падежи: (*ПО_ПРЕДЛОГУ)[пл].clone(),
+            доступные_предлоги: (*ПО_ПАДЕЖУ)[&пж].clone(),
+            объяснение: Some(объясни(пж, пл)),
+        },
+
+        (Some(пж), None) => Вывод {
+            доступные_падежи: BTreeSet::from([пж]),
+            доступные_предлоги: (*ПО_ПАДЕЖУ)[&пж].clone(),
+            объяснение: None,
+        },
+
+        (None, Some(пл)) => Вывод {
+            доступные_падежи: (*ПО_ПРЕДЛОГУ)[пл].clone(),
+            доступные_предлоги: BTreeSet::from([*пл]),
+            объяснение: None,
+        },
+
+        (None, None) => Вывод {
+            доступные_падежи: ПАДЕЖИ.clone(),
+            доступные_предлоги: ПРЕДЛОГИ.clone(),
+            объяснение: None,
+        },
+    }
+}
+
+/// CSS classes for a button: solid when selected, ghost otherwise, default-coloured if disabled.
+fn класс_кнопки(выбран: bool, доступен: bool) -> String {
+    let класс = "btn ".to_string();
+    класс +
+        match (выбран, доступен) {
+            (true, _) => "btn-primary",
+            (false, true) => "btn-ghost btn-primary",
+            (false, false) => "btn-ghost btn-default",
+        }
+}
+
+/// Renders the button for one preposition; clicking the selected button clears the selection.
+fn покажи_предлог(
+    link: &Scope<Модель>,
+    м: &Модель,
+    вв: &Вывод,
+    п: &'static str,
+) -> Html {
+    let выбран = м.предлог == Some(п);
+    let доступен = вв.доступные_предлоги.contains(п);
+    let класс = класс_кнопки(выбран, доступен);
+
+    html! {
+        <button class={класс}
+                onclick={link.callback(move |_| if выбран {
+                    Сообщение::ВыбралПредлог(None)
+                } else {
+                    Сообщение::ВыбралПредлог(Some(п))
+                })}
+                disabled={!доступен}>
+            {п}
+        </button>
+    }
+}
+
+/// Renders the button for one case; clicking the selected button clears the selection.
+fn покажи_падеж(
+    link: &Scope<Модель>, м: &Модель, вв: &Вывод, п: Падеж
+) -> Html {
+    let выбран = м.падеж == Some(п);
+    let доступен = вв.доступные_падежи.contains(&п);
+    let класс = класс_кнопки(выбран, доступен);
+
+    html! {
+        <button class={класс}
+                onclick={link.callback(move |_| if выбран {
+                    Сообщение::ВыбралПадеж(None)
+                } else {
+                    Сообщение::ВыбралПадеж(Some(п))
+                })}
+                disabled={!доступен}>
+            {format!("{:?}", п)}
+        </button>
+    }
+}
+
+impl Component for Модель {
+    type Message = Сообщение;
+    type Properties = ();
+
+    fn create(_ctx: &Context<Self>) -> Self {
+        Default::default()
+    }
+
+    fn update(&mut self, _ctx: &Context<Self>, msg: Self::Message) -> bool {
+        match msg {
+            Сообщение::ВыбралПадеж(пж) => self.падеж = пж,
+            Сообщение::ВыбралПредлог(пл) => self.предлог = пл,
+        }
+
+        // Every message changes the selection, so always request a re-render.
+        true
+    }
+
+    fn view(&self, ctx: &Context<Self>) -> Html {
+        let вв = ограничить(self);
+        let link = ctx.link();
+
+        let кнопки_предлогов = ПРЕДЛОГИ
+            .iter()
+            .map(|п| покажи_предлог(link, self, &вв, п))
+            .collect::<Html>();
+
+        let кнопки_падежей = ПАДЕЖИ
+            .iter()
+            .map(|п| покажи_падеж(link, self, &вв, *п))
+            .collect::<Html>();
+
+        let объяснение = вв.объяснение.unwrap_or_else(|| html! {});
+
+        let footer = html! {
+            <footer>
+                <hr/>
+                <p class="footer">
+                    <a href="https://code.tvl.fyi/tree/corp/russian/predlozhnik">{"код"}</a>
+                    {" | "}
+                    {"сделано "}<a href="https://tvl.su">{"ООО \"ТВЛ\""}</a>
+                </p>
+            </footer>
+        };
+
+        let код_рекламы = r#"
+window.yaContextCb.push(()=>{
+  Ya.Context.AdvManager.render({
+    renderTo: 'yandex_rtb_R-A-1773485-1',
+    blockId: 'R-A-1773485-1'
+  })
+})
+"#;
+
+        let реклама = html! {
+            <div id="ad">
+                <div id="yandex_rtb_R-A-1773485-1"></div>
+                <script>{код_рекламы}</script>
+            </div>
+        };
+
+        html! {
+            <>
+                <div id="header">
+                    <h1>{"Предложник"}</h1>
+                    <p>{"... показывает с какими падежами употребляются предлоги в русском языке."}</p>
+                </div>
+
+                <h2>{"Выбирай предлог:"}</h2>
+                <div id="predlogi">
+                    {кнопки_предлогов}
+                </div>
+                <hr/>
+
+                <h2>{"Выбирай падеж:"}</h2>
+                <div id="padezhi">
+                    {кнопки_падежей}
+                </div>
+
+                {объяснение}
+                {footer}
+                {реклама}
+            </>
+        }
+    }
+}
+
+/// Starts the Yew application with `Модель` as the root component.
+fn main() {
+    yew::start_app::<Модель>();
+}