diff options
author | Vincent Ambo <mail@tazj.in> | 2023-06-18T12·15+0300 |
---|---|---|
committer | tazjin <tazjin@tvl.su> | 2023-06-18T12·43+0000 |
commit | 332a821100be288863ad0bac5f655ed512e4fd19 (patch) | |
tree | 44469c3578b4418bb368e0942edffb5b00b4d6ff /users/tazjin/tgsa/src | |
parent | 6678e768a08e993c586756ad1815a5fcf1f520b6 (diff) |
feat(tazjin/tgsa): replace translation backend r/6327
The GPT backend is cool, but it's also very slow, prone to request errors and quite expensive. This switches to Yandex Translate instead which for all posts that I tested seems to be totally fine. Change-Id: I5217113995b701508a83e7782eb1325957996719 Reviewed-on: https://cl.tvl.fyi/c/depot/+/8826 Tested-by: BuildkiteCI Reviewed-by: tazjin <tazjin@tvl.su>
Diffstat (limited to 'users/tazjin/tgsa/src')
-rw-r--r-- | users/tazjin/tgsa/src/main.rs | 47 | ||||
-rw-r--r-- | users/tazjin/tgsa/src/translate.rs | 191 |
2 files changed, 195 insertions, 43 deletions
diff --git a/users/tazjin/tgsa/src/main.rs b/users/tazjin/tgsa/src/main.rs index 1c7ee408abe4..cbdb272e0e3f 100644 --- a/users/tazjin/tgsa/src/main.rs +++ b/users/tazjin/tgsa/src/main.rs @@ -1,10 +1,11 @@ use anyhow::{anyhow, Context, Result}; use scraper::{Html, Selector}; -use serde_json::Value; use std::collections::HashMap; use std::sync::RwLock; use std::time::{Duration, Instant}; +mod translate; + #[derive(Clone, Debug, Eq, Hash, PartialEq)] struct TgLink { username: String, @@ -63,46 +64,6 @@ fn fetch_post(link: &TgLink, embed: bool) -> Result<String> { Ok(response.body) } -fn fetch_translation(message: &str) -> Result<String> { - let request = serde_json::json!({ - "model": "gpt-3.5-turbo", - "messages": [ - {"role": "user", "content": "Please translate the following message from a Telegram channel into English. If the post is already partially in English, please leave those bits intact as they are. Please respond only with the translation."}, - {"role": "user", "content": message} - ] - }); - - let response: Value = crimp::Request::post("https://api.openai.com/v1/chat/completions") - .bearer_auth(&std::env::var("OPENAPI_KEY").context("no openapi key set")?)? - .json(&request)? - .send() - .context("failed to fetch translation from openai")? - .as_json::<Value>()? - .error_for_status(|resp| { - anyhow!( - "translation request failed: {} ({})", - resp.body, - resp.status - ) - })? - .body; - - // we want choices[0].message.content, and inshallah it's the right thing. - let translation = response - .get("choices") - .ok_or_else(|| anyhow!("missing 'choices' key"))? - .get(0) - .ok_or_else(|| anyhow!("empty 'choices' or something"))? - .get("message") - .ok_or_else(|| anyhow!("missing 'message' key"))? - .get("content") - .ok_or_else(|| anyhow!("missing 'content' key"))? - .as_str() - .ok_or_else(|| anyhow!("'content' was not a string"))?; - - Ok(translation.to_string()) -} - // in some cases, posts can not be embedded, but telegram still // includes their content in metadata tags for content previews. // @@ -306,7 +267,7 @@ fn fetch_with_cache(cache: &Cache, link: &TgLink) -> Result<TgPost> { if let Some(message) = &msg.message { if link.translated { println!("translating {}#{}", link.username, link.message_id); - msg.message = Some(fetch_translation(message)?); + msg.message = Some(translate::fetch_translation(message)?); } } @@ -410,7 +371,7 @@ if you see this message and think you did the above correctly, you didn't. try again. idiot. it can also translate posts from russian, ukrainian or whatever other -dumb language you speak into english, by adding `/translate/`, for +dumb language you speak into english by adding `/translate/`, for example: https://tgsa.tazj.in/translate/https://t.me/strelkovii/4329 diff --git a/users/tazjin/tgsa/src/translate.rs b/users/tazjin/tgsa/src/translate.rs new file mode 100644 index 000000000000..84c74cc5b36c --- /dev/null +++ b/users/tazjin/tgsa/src/translate.rs @@ -0,0 +1,191 @@ +//! integration with yandex cloud translate api, for automatically +//! translating telegram posts. +//! +//! most of this module is concerned with handling the authentication +//! tokens for yandex cloud, as jwt signing needs to be handled +//! manually (none of the rust jwt libraries that i tried actually +//! work). + +use anyhow::{anyhow, Context, Result}; +use base64::prelude::BASE64_URL_SAFE_NO_PAD as B64; +use base64::Engine; +use lazy_static::lazy_static; +use ring::signature as sig; +use serde::Deserialize; +use serde_json::{json, Value}; +use std::sync::Mutex; +use std::time::{Duration, SystemTime}; + +/// token exchange url (exchanging a signed jwt for an iam token +/// understood by the translation service) +const TOKEN_URL: &str = "https://iam.api.cloud.yandex.net/iam/v1/tokens"; + +/// translation endpoint +const TRANSLATE_URL: &str = "https://translate.api.cloud.yandex.net/translate/v2/translate"; + +/// describes the private key as downloaded from yandex, pem-encoded. +#[derive(Deserialize)] +struct AuthorizedKey { + id: String, + service_account_id: String, + private_key: String, +} + +/// cached iam token for yandex cloud +struct Token { + token: String, + expiry: SystemTime, +} + +impl Token { + fn is_expired(&self) -> bool { + self.expiry < SystemTime::now() + } +} + +lazy_static! { + static ref KEY_FILE: String = + std::env::var("YANDEX_KEY_FILE").expect("`YANDEX_KEY_FILE` variable should be set"); + static ref CACHED_TOKEN: Mutex<Token> = { + let token = refresh_token().expect("fetching initial translation token must not fail"); + Mutex::new(token) + }; +} + +/// wrap all the authentication logic below into a single function. +fn refresh_token() -> Result<Token> { + let file = std::fs::File::open(KEY_FILE.as_str())?; + let key: AuthorizedKey = serde_json::from_reader(file)?; + let jwt = sign_yandex_jwt(&key)?; + let token = fetch_iam_token(&jwt)?; + + Ok(Token { + token, + expiry: SystemTime::now() + Duration::from_secs(3600), + }) +} + +/// wrapper around the cached token that refreshes if required. +fn current_token() -> Result<String> { + let mut token = CACHED_TOKEN + .lock() + .expect("thread operating on token should never fail"); + + if token.is_expired() { + println!("refreshing translation token"); + *token = refresh_token().context("refreshing translation token")?; + } + + Ok(token.token.clone()) +} + +/// use openssl to read the pem-encoded key, as ring itself is not +/// capable of this. +fn read_pem_key(key: &AuthorizedKey) -> Result<sig::RsaKeyPair> { + let rsa = openssl::rsa::Rsa::private_key_from_pem(key.private_key.as_bytes()) + .context("parsing RSA key")?; + + let der = rsa + .private_key_to_der() + .context("encoding key as DER for ring")?; + + sig::RsaKeyPair::from_der(&der).map_err(|err| anyhow!("decoding DER key in ring: {}", err)) +} + +/// manually construct and sign the jwt required to perform the +/// iam-token key exchange with yandex. +fn sign_yandex_jwt(key: &AuthorizedKey) -> Result<String> { + let iat = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH)? + .as_secs(); + + let header = json!({ + "typ": "JWT", + "alg": "PS256", + "kid": key.id, + }) + .to_string(); + + let payload = json!({ + "iss": key.service_account_id, + "aud": TOKEN_URL, + "iat": iat, + "exp": iat + 60, + }) + .to_string(); + + let unsigned = format!("{}.{}", B64.encode(header), B64.encode(payload)); + let key_pair = read_pem_key(key)?; + + let rng = ring::rand::SystemRandom::new(); + let mut signature = vec![0; key_pair.public_modulus_len()]; + key_pair + .sign( + &sig::RSA_PSS_SHA256, + &rng, + unsigned.as_bytes(), + &mut signature, + ) + .map_err(|err| anyhow!("while signing JWT: {}", err))?; + + Ok(format!("{}.{}", unsigned, B64.encode(&signature))) +} + +/// exchange the jwt for an iam token +fn fetch_iam_token(token: &str) -> Result<String> { + #[derive(Deserialize)] + #[serde(rename_all = "camelCase")] + struct TokenResponse { + iam_token: String, + } + + let response = crimp::Request::post(TOKEN_URL) + .json(&json!({ + "jwt": token, + }))? + .send()? + .error_for_status(|resp| { + anyhow::anyhow!("{} ({})", String::from_utf8_lossy(&resp.body), resp.status) + })? + .as_json::<TokenResponse>() + .context("deserialising IAM token")?; + + Ok(response.body.iam_token) +} + +pub fn fetch_translation(message: &str) -> Result<String> { + let request_body = json!({ + "folderId": "b1gq41rsbggeum4qafnh", + "texts": [ message ], + "targetLanguageCode": "en", + }); + + let response = crimp::Request::post(TRANSLATE_URL) + .bearer_auth(¤t_token()?) + .context("adding 'Bearer' token")? + .json(&request_body) + .context("preparing JSON body")? + .send() + .context("failed to fetch translation from yandex")? + .error_for_status(|resp| { + anyhow!( + "translation request failed: {} ({})", + String::from_utf8_lossy(&resp.body), + resp.status + ) + })? + .as_json::<Value>()? + .body; + + let translation = response + .get("translations") + .ok_or_else(|| anyhow!("missing 'translations' key"))? + .get(0) + .ok_or_else(|| anyhow!("translations list is empty"))? + .get("text") + .ok_or_else(|| anyhow!("translation missing 'text' key"))? + .as_str() + .ok_or_else(|| anyhow!("'text' was not a string"))?; + + Ok(translation.to_string()) +} |