diff options
author | Vincent Ambo <mail@tazj.in> | 2023-06-18T12·15+0300 |
---|---|---|
committer | tazjin <tazjin@tvl.su> | 2023-06-18T12·43+0000 |
commit | 332a821100be288863ad0bac5f655ed512e4fd19 (patch) | |
tree | 44469c3578b4418bb368e0942edffb5b00b4d6ff /users/tazjin/tgsa | |
parent | 6678e768a08e993c586756ad1815a5fcf1f520b6 (diff) |
feat(tazjin/tgsa): replace translation backend r/6327
The GPT backend is cool, but it's also very slow, prone to request errors and quite expensive. This switches to Yandex Translate instead which for all posts that I tested seems to be totally fine. Change-Id: I5217113995b701508a83e7782eb1325957996719 Reviewed-on: https://cl.tvl.fyi/c/depot/+/8826 Tested-by: BuildkiteCI Reviewed-by: tazjin <tazjin@tvl.su>
Diffstat (limited to 'users/tazjin/tgsa')
-rw-r--r-- | users/tazjin/tgsa/Cargo.lock | 145 | ||||
-rw-r--r-- | users/tazjin/tgsa/Cargo.toml | 6 | ||||
-rw-r--r-- | users/tazjin/tgsa/src/main.rs | 47 | ||||
-rw-r--r-- | users/tazjin/tgsa/src/translate.rs | 191 |
4 files changed, 327 insertions, 62 deletions
diff --git a/users/tazjin/tgsa/Cargo.lock b/users/tazjin/tgsa/Cargo.lock index 51d11135f37f..bcea1a9831d3 100644 --- a/users/tazjin/tgsa/Cargo.lock +++ b/users/tazjin/tgsa/Cargo.lock @@ -36,6 +36,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" [[package]] +name = "base64" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d" + +[[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -130,7 +136,7 @@ dependencies = [ "proc-macro2", "quote", "smallvec", - "syn", + "syn 1.0.101", ] [[package]] @@ -140,7 +146,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dfae75de57f2b2e85e8768c3ea840fd159c8f33e2b6522c7835b7abac81be16e" dependencies = [ "quote", - "syn", + "syn 1.0.101", ] [[package]] @@ -183,7 +189,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version", - "syn", + "syn 1.0.101", ] [[package]] @@ -229,6 +235,21 @@ dependencies = [ ] [[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] name = "form_urlencoded" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -307,7 +328,7 @@ dependencies = [ "markup5ever", "proc-macro2", "quote", - "syn", + "syn 1.0.101", ] [[package]] @@ -535,6 +556,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" [[package]] +name = "openssl" +version = "0.10.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69b3f656a17a6cbc115b5c7a40c616947d213ba182135b014d6051b73ab6f019" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.18", +] + +[[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -542,11 +589,10 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.76" +version = "0.9.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5230151e44c0f05157effb743e8d517472843121cf9243e8b81393edb5acd9ce" +checksum = "c2ce0f250f34a308dcfdbb351f511359857d4ed2134ba715a4eadd46e1ffd617" dependencies = [ - "autocfg", "cc", "libc", "pkg-config", @@ -653,7 +699,7 @@ dependencies = [ "proc-macro-hack", "proc-macro2", "quote", - "syn", + "syn 1.0.101", ] [[package]] @@ -700,9 +746,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.46" +version = "1.0.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b" +checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" dependencies = [ "unicode-ident", ] @@ -715,9 +761,9 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" [[package]] name = "quote" -version = "1.0.21" +version = "1.0.28" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" dependencies = [ "proc-macro2", ] @@ -822,12 +868,27 @@ dependencies = [ ] [[package]] +name = "ring" +version = "0.16.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" +dependencies = [ + "cc", + "libc", + "once_cell", + "spin", + "untrusted", + "web-sys", + "winapi", +] + +[[package]] name = "rouille" version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18b2380c42510ef4a28b5f228a174c801e0dec590103e215e60812e2e2f34d05" dependencies = [ - "base64", + "base64 0.13.0", "chrono", "filetime", "multipart", @@ -928,6 +989,9 @@ name = "serde" version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "728eb6351430bccb993660dfffc5a72f91ccc1295abaa8ce19b27ebe4f75568b" +dependencies = [ + "serde_derive", +] [[package]] name = "serde_derive" @@ -937,7 +1001,7 @@ checksum = "81fa1584d3d1bcacd84c277a0dfe21f5b0f6accf4a23d04d4c6d61f1af522b4c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.101", ] [[package]] @@ -999,6 +1063,12 @@ dependencies = [ ] [[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + +[[package]] name = "stable_deref_trait" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1042,6 +1112,17 @@ dependencies = [ ] [[package]] +name = "syn" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] name = 
"tempfile" version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1071,8 +1152,12 @@ name = "tgsa" version = "0.1.0" dependencies = [ "anyhow", + "base64 0.21.2", "crimp", "ego-tree", + "lazy_static", + "openssl", + "ring", "rouille", "scraper", "serde", @@ -1097,15 +1182,23 @@ dependencies = [ [[package]] name = "time" -version = "0.3.15" +version = "0.3.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d634a985c4d4238ec39cacaed2e7ae552fbd3c476b552c1deac3021b7d7eaf0c" +checksum = "ea9e1b3cf1243ae005d9e74085d4d542f3125458f3a81af210d901dcd7411efd" dependencies = [ "libc", "num_threads", + "serde", + "time-core", ] [[package]] +name = "time-core" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" + +[[package]] name = "tiny_http" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1179,6 +1272,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" [[package]] +name = "untrusted" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" + +[[package]] name = "url" version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1240,7 +1339,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 1.0.101", "wasm-bindgen-shared", ] @@ -1262,7 +1361,7 @@ checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.101", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -1274,6 +1373,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f" [[package]] +name = 
"web-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" diff --git a/users/tazjin/tgsa/Cargo.toml b/users/tazjin/tgsa/Cargo.toml index 0b1529805864..8764ef652491 100644 --- a/users/tazjin/tgsa/Cargo.toml +++ b/users/tazjin/tgsa/Cargo.toml @@ -10,5 +10,9 @@ rouille = { version = "3.5", default-features = false } url = "2.3" scraper = "0.13" ego-tree = "0.6" # in tandem with 'scraper' -serde = "1.0" +serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" +ring = "0.16.20" +openssl = "0.10.54" +base64 = "0.21.2" +lazy_static = "1.4.0" diff --git a/users/tazjin/tgsa/src/main.rs b/users/tazjin/tgsa/src/main.rs index 1c7ee408abe4..cbdb272e0e3f 100644 --- a/users/tazjin/tgsa/src/main.rs +++ b/users/tazjin/tgsa/src/main.rs @@ -1,10 +1,11 @@ use anyhow::{anyhow, Context, Result}; use scraper::{Html, Selector}; -use serde_json::Value; use std::collections::HashMap; use std::sync::RwLock; use std::time::{Duration, Instant}; +mod translate; + #[derive(Clone, Debug, Eq, Hash, PartialEq)] struct TgLink { username: String, @@ -63,46 +64,6 @@ fn fetch_post(link: &TgLink, embed: bool) -> Result<String> { Ok(response.body) } -fn fetch_translation(message: &str) -> Result<String> { - let request = serde_json::json!({ - "model": "gpt-3.5-turbo", - "messages": [ - {"role": "user", "content": "Please translate the following message from a Telegram channel into English. If the post is already partially in English, please leave those bits intact as they are. 
Please respond only with the translation."}, - {"role": "user", "content": message} - ] - }); - - let response: Value = crimp::Request::post("https://api.openai.com/v1/chat/completions") - .bearer_auth(&std::env::var("OPENAPI_KEY").context("no openapi key set")?)? - .json(&request)? - .send() - .context("failed to fetch translation from openai")? - .as_json::<Value>()? - .error_for_status(|resp| { - anyhow!( - "translation request failed: {} ({})", - resp.body, - resp.status - ) - })? - .body; - - // we want choices[0].message.content, and inshallah it's the right thing. - let translation = response - .get("choices") - .ok_or_else(|| anyhow!("missing 'choices' key"))? - .get(0) - .ok_or_else(|| anyhow!("empty 'choices' or something"))? - .get("message") - .ok_or_else(|| anyhow!("missing 'message' key"))? - .get("content") - .ok_or_else(|| anyhow!("missing 'content' key"))? - .as_str() - .ok_or_else(|| anyhow!("'content' was not a string"))?; - - Ok(translation.to_string()) -} - // in some cases, posts can not be embedded, but telegram still // includes their content in metadata tags for content previews. // @@ -306,7 +267,7 @@ fn fetch_with_cache(cache: &Cache, link: &TgLink) -> Result<TgPost> { if let Some(message) = &msg.message { if link.translated { println!("translating {}#{}", link.username, link.message_id); - msg.message = Some(fetch_translation(message)?); + msg.message = Some(translate::fetch_translation(message)?); } } @@ -410,7 +371,7 @@ if you see this message and think you did the above correctly, you didn't. try again. idiot. 
it can also translate posts from russian, ukrainian or whatever other -dumb language you speak into english, by adding `/translate/`, for +dumb language you speak into english by adding `/translate/`, for example: https://tgsa.tazj.in/translate/https://t.me/strelkovii/4329 diff --git a/users/tazjin/tgsa/src/translate.rs b/users/tazjin/tgsa/src/translate.rs new file mode 100644 index 000000000000..84c74cc5b36c --- /dev/null +++ b/users/tazjin/tgsa/src/translate.rs @@ -0,0 +1,191 @@ +//! integration with yandex cloud translate api, for automatically +//! translating telegram posts. +//! +//! most of this module is concerned with handling the authentication +//! tokens for yandex cloud, as jwt signing needs to be handled +//! manually (none of the rust jwt libraries that i tried actually +//! work). + +use anyhow::{anyhow, Context, Result}; +use base64::prelude::BASE64_URL_SAFE_NO_PAD as B64; +use base64::Engine; +use lazy_static::lazy_static; +use ring::signature as sig; +use serde::Deserialize; +use serde_json::{json, Value}; +use std::sync::Mutex; +use std::time::{Duration, SystemTime}; + +/// token exchange url (exchanging a signed jwt for an iam token +/// understood by the translation service) +const TOKEN_URL: &str = "https://iam.api.cloud.yandex.net/iam/v1/tokens"; + +/// translation endpoint +const TRANSLATE_URL: &str = "https://translate.api.cloud.yandex.net/translate/v2/translate"; + +/// describes the private key as downloaded from yandex, pem-encoded. +#[derive(Deserialize)] +struct AuthorizedKey { + id: String, + service_account_id: String, + private_key: String, +} + +/// cached iam token for yandex cloud +struct Token { + token: String, + expiry: SystemTime, +} + +impl Token { + fn is_expired(&self) -> bool { + self.expiry < SystemTime::now() + } +} + +lazy_static! 
{ + static ref KEY_FILE: String = + std::env::var("YANDEX_KEY_FILE").expect("`YANDEX_KEY_FILE` variable should be set"); + static ref CACHED_TOKEN: Mutex<Token> = { + let token = refresh_token().expect("fetching initial translation token must not fail"); + Mutex::new(token) + }; +} + +/// wrap all the authentication logic below into a single function. +fn refresh_token() -> Result<Token> { + let file = std::fs::File::open(KEY_FILE.as_str())?; + let key: AuthorizedKey = serde_json::from_reader(file)?; + let jwt = sign_yandex_jwt(&key)?; + let token = fetch_iam_token(&jwt)?; + + Ok(Token { + token, + expiry: SystemTime::now() + Duration::from_secs(3600), + }) +} + +/// wrapper around the cached token that refreshes if required. +fn current_token() -> Result<String> { + let mut token = CACHED_TOKEN + .lock() + .expect("thread operating on token should never fail"); + + if token.is_expired() { + println!("refreshing translation token"); + *token = refresh_token().context("refreshing translation token")?; + } + + Ok(token.token.clone()) +} + +/// use openssl to read the pem-encoded key, as ring itself is not +/// capable of this. +fn read_pem_key(key: &AuthorizedKey) -> Result<sig::RsaKeyPair> { + let rsa = openssl::rsa::Rsa::private_key_from_pem(key.private_key.as_bytes()) + .context("parsing RSA key")?; + + let der = rsa + .private_key_to_der() + .context("encoding key as DER for ring")?; + + sig::RsaKeyPair::from_der(&der).map_err(|err| anyhow!("decoding DER key in ring: {}", err)) +} + +/// manually construct and sign the jwt required to perform the +/// iam-token key exchange with yandex. +fn sign_yandex_jwt(key: &AuthorizedKey) -> Result<String> { + let iat = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH)? 
+ .as_secs(); + + let header = json!({ + "typ": "JWT", + "alg": "PS256", + "kid": key.id, + }) + .to_string(); + + let payload = json!({ + "iss": key.service_account_id, + "aud": TOKEN_URL, + "iat": iat, + "exp": iat + 60, + }) + .to_string(); + + let unsigned = format!("{}.{}", B64.encode(header), B64.encode(payload)); + let key_pair = read_pem_key(key)?; + + let rng = ring::rand::SystemRandom::new(); + let mut signature = vec![0; key_pair.public_modulus_len()]; + key_pair + .sign( + &sig::RSA_PSS_SHA256, + &rng, + unsigned.as_bytes(), + &mut signature, + ) + .map_err(|err| anyhow!("while signing JWT: {}", err))?; + + Ok(format!("{}.{}", unsigned, B64.encode(&signature))) +} + +/// exchange the jwt for an iam token +fn fetch_iam_token(token: &str) -> Result<String> { + #[derive(Deserialize)] + #[serde(rename_all = "camelCase")] + struct TokenResponse { + iam_token: String, + } + + let response = crimp::Request::post(TOKEN_URL) + .json(&json!({ + "jwt": token, + }))? + .send()? + .error_for_status(|resp| { + anyhow::anyhow!("{} ({})", String::from_utf8_lossy(&resp.body), resp.status) + })? + .as_json::<TokenResponse>() + .context("deserialising IAM token")?; + + Ok(response.body.iam_token) +} + +pub fn fetch_translation(message: &str) -> Result<String> { + let request_body = json!({ + "folderId": "b1gq41rsbggeum4qafnh", + "texts": [ message ], + "targetLanguageCode": "en", + }); + + let response = crimp::Request::post(TRANSLATE_URL) + .bearer_auth(&current_token()?) + .context("adding 'Bearer' token")? + .json(&request_body) + .context("preparing JSON body")? + .send() + .context("failed to fetch translation from yandex")? + .error_for_status(|resp| { + anyhow!( + "translation request failed: {} ({})", + String::from_utf8_lossy(&resp.body), + resp.status + ) + })? + .as_json::<Value>()? + .body; + + let translation = response + .get("translations") + .ok_or_else(|| anyhow!("missing 'translations' key"))? + .get(0) + .ok_or_else(|| anyhow!("translations list is empty"))?
+ .get("text") + .ok_or_else(|| anyhow!("translation missing 'text' key"))? + .as_str() + .ok_or_else(|| anyhow!("'text' was not a string"))?; + + Ok(translation.to_string()) +} |