about summary refs log tree commit diff
path: root/users/tazjin/tgsa/src
diff options
context:
space:
mode:
authorVincent Ambo <mail@tazj.in>2023-06-18T12·15+0300
committertazjin <tazjin@tvl.su>2023-06-18T12·43+0000
commit332a821100be288863ad0bac5f655ed512e4fd19 (patch)
tree44469c3578b4418bb368e0942edffb5b00b4d6ff /users/tazjin/tgsa/src
parent6678e768a08e993c586756ad1815a5fcf1f520b6 (diff)
feat(tazjin/tgsa): replace translation backend r/6327
The GPT backend is cool, but it's also very slow, prone to request
errors and quite expensive.

This switches to Yandex Translate instead which for all posts that I
tested seems to be totally fine.

Change-Id: I5217113995b701508a83e7782eb1325957996719
Reviewed-on: https://cl.tvl.fyi/c/depot/+/8826
Tested-by: BuildkiteCI
Reviewed-by: tazjin <tazjin@tvl.su>
Diffstat (limited to 'users/tazjin/tgsa/src')
-rw-r--r--users/tazjin/tgsa/src/main.rs47
-rw-r--r--users/tazjin/tgsa/src/translate.rs191
2 files changed, 195 insertions, 43 deletions
diff --git a/users/tazjin/tgsa/src/main.rs b/users/tazjin/tgsa/src/main.rs
index 1c7ee408abe4..cbdb272e0e3f 100644
--- a/users/tazjin/tgsa/src/main.rs
+++ b/users/tazjin/tgsa/src/main.rs
@@ -1,10 +1,11 @@
 use anyhow::{anyhow, Context, Result};
 use scraper::{Html, Selector};
-use serde_json::Value;
 use std::collections::HashMap;
 use std::sync::RwLock;
 use std::time::{Duration, Instant};
 
+mod translate;
+
 #[derive(Clone, Debug, Eq, Hash, PartialEq)]
 struct TgLink {
     username: String,
@@ -63,46 +64,6 @@ fn fetch_post(link: &TgLink, embed: bool) -> Result<String> {
     Ok(response.body)
 }
 
-fn fetch_translation(message: &str) -> Result<String> {
-    let request = serde_json::json!({
-        "model": "gpt-3.5-turbo",
-        "messages": [
-            {"role": "user", "content": "Please translate the following message from a Telegram channel into English. If the post is already partially in English, please leave those bits intact as they are. Please respond only with the translation."},
-            {"role": "user", "content": message}
-        ]
-    });
-
-    let response: Value = crimp::Request::post("https://api.openai.com/v1/chat/completions")
-        .bearer_auth(&std::env::var("OPENAPI_KEY").context("no openapi key set")?)?
-        .json(&request)?
-        .send()
-        .context("failed to fetch translation from openai")?
-        .as_json::<Value>()?
-        .error_for_status(|resp| {
-            anyhow!(
-                "translation request failed: {} ({})",
-                resp.body,
-                resp.status
-            )
-        })?
-        .body;
-
-    // we want choices[0].message.content, and inshallah it's the right thing.
-    let translation = response
-        .get("choices")
-        .ok_or_else(|| anyhow!("missing 'choices' key"))?
-        .get(0)
-        .ok_or_else(|| anyhow!("empty 'choices' or something"))?
-        .get("message")
-        .ok_or_else(|| anyhow!("missing 'message' key"))?
-        .get("content")
-        .ok_or_else(|| anyhow!("missing 'content' key"))?
-        .as_str()
-        .ok_or_else(|| anyhow!("'content' was not a string"))?;
-
-    Ok(translation.to_string())
-}
-
 // in some cases, posts can not be embedded, but telegram still
 // includes their content in metadata tags for content previews.
 //
@@ -306,7 +267,7 @@ fn fetch_with_cache(cache: &Cache, link: &TgLink) -> Result<TgPost> {
     if let Some(message) = &msg.message {
         if link.translated {
             println!("translating {}#{}", link.username, link.message_id);
-            msg.message = Some(fetch_translation(message)?);
+            msg.message = Some(translate::fetch_translation(message)?);
         }
     }
 
@@ -410,7 +371,7 @@ if you see this message and think you did the above correctly, you
 didn't. try again. idiot.
 
 it can also translate posts from russian, ukrainian or whatever other
-dumb language you speak into english, by adding `/translate/`, for
+dumb language you speak into english by adding `/translate/`, for
 example:
 
   https://tgsa.tazj.in/translate/https://t.me/strelkovii/4329
diff --git a/users/tazjin/tgsa/src/translate.rs b/users/tazjin/tgsa/src/translate.rs
new file mode 100644
index 000000000000..84c74cc5b36c
--- /dev/null
+++ b/users/tazjin/tgsa/src/translate.rs
@@ -0,0 +1,191 @@
+//! integration with yandex cloud translate api, for automatically
+//! translating telegram posts.
+//!
+//! most of this module is concerned with handling the authentication
+//! tokens for yandex cloud, as jwt signing needs to be handled
+//! manually (none of the rust jwt libraries that i tried actually
+//! work).
+
+use anyhow::{anyhow, Context, Result};
+use base64::prelude::BASE64_URL_SAFE_NO_PAD as B64;
+use base64::Engine;
+use lazy_static::lazy_static;
+use ring::signature as sig;
+use serde::Deserialize;
+use serde_json::{json, Value};
+use std::sync::Mutex;
+use std::time::{Duration, SystemTime};
+
+/// token exchange url (exchanging a signed jwt for an iam token
+/// understood by the translation service)
+const TOKEN_URL: &str = "https://iam.api.cloud.yandex.net/iam/v1/tokens";
+
+/// translation endpoint
+const TRANSLATE_URL: &str = "https://translate.api.cloud.yandex.net/translate/v2/translate";
+
+/// describes the private key as downloaded from yandex, pem-encoded.
+#[derive(Deserialize)]
+struct AuthorizedKey {
+    id: String,
+    service_account_id: String,
+    private_key: String,
+}
+
+/// cached iam token for yandex cloud
+struct Token {
+    token: String,
+    expiry: SystemTime,
+}
+
+impl Token {
+    fn is_expired(&self) -> bool {
+        self.expiry < SystemTime::now()
+    }
+}
+
+lazy_static! {
+    static ref KEY_FILE: String =
+        std::env::var("YANDEX_KEY_FILE").expect("`YANDEX_KEY_FILE` variable should be set");
+    static ref CACHED_TOKEN: Mutex<Token> = {
+        let token = refresh_token().expect("fetching initial translation token must not fail");
+        Mutex::new(token)
+    };
+}
+
+/// wrap all the authentication logic below into a single function.
+fn refresh_token() -> Result<Token> {
+    let file = std::fs::File::open(KEY_FILE.as_str())?;
+    let key: AuthorizedKey = serde_json::from_reader(file)?;
+    let jwt = sign_yandex_jwt(&key)?;
+    let token = fetch_iam_token(&jwt)?;
+
+    Ok(Token {
+        token,
+        expiry: SystemTime::now() + Duration::from_secs(3600),
+    })
+}
+
+/// wrapper around the cached token that refreshes if required.
+fn current_token() -> Result<String> {
+    let mut token = CACHED_TOKEN
+        .lock()
+        .expect("thread operating on token should never fail");
+
+    if token.is_expired() {
+        println!("refreshing translation token");
+        *token = refresh_token().context("refreshing translation token")?;
+    }
+
+    Ok(token.token.clone())
+}
+
+/// use openssl to read the pem-encoded key, as ring itself is not
+/// capable of this.
+fn read_pem_key(key: &AuthorizedKey) -> Result<sig::RsaKeyPair> {
+    let rsa = openssl::rsa::Rsa::private_key_from_pem(key.private_key.as_bytes())
+        .context("parsing RSA key")?;
+
+    let der = rsa
+        .private_key_to_der()
+        .context("encoding key as DER for ring")?;
+
+    sig::RsaKeyPair::from_der(&der).map_err(|err| anyhow!("decoding DER key in ring: {}", err))
+}
+
+/// manually construct and sign the jwt required to perform the
+/// iam-token key exchange with yandex.
+fn sign_yandex_jwt(key: &AuthorizedKey) -> Result<String> {
+    let iat = SystemTime::now()
+        .duration_since(SystemTime::UNIX_EPOCH)?
+        .as_secs();
+
+    let header = json!({
+        "typ": "JWT",
+        "alg": "PS256",
+        "kid": key.id,
+    })
+    .to_string();
+
+    let payload = json!({
+        "iss": key.service_account_id,
+        "aud": TOKEN_URL,
+        "iat": iat,
+        "exp": iat + 60,
+    })
+    .to_string();
+
+    let unsigned = format!("{}.{}", B64.encode(header), B64.encode(payload));
+    let key_pair = read_pem_key(key)?;
+
+    let rng = ring::rand::SystemRandom::new();
+    let mut signature = vec![0; key_pair.public_modulus_len()];
+    key_pair
+        .sign(
+            &sig::RSA_PSS_SHA256,
+            &rng,
+            unsigned.as_bytes(),
+            &mut signature,
+        )
+        .map_err(|err| anyhow!("while signing JWT: {}", err))?;
+
+    Ok(format!("{}.{}", unsigned, B64.encode(&signature)))
+}
+
+/// exchange the jwt for an iam token
+fn fetch_iam_token(token: &str) -> Result<String> {
+    #[derive(Deserialize)]
+    #[serde(rename_all = "camelCase")]
+    struct TokenResponse {
+        iam_token: String,
+    }
+
+    let response = crimp::Request::post(TOKEN_URL)
+        .json(&json!({
+            "jwt": token,
+        }))?
+        .send()?
+        .error_for_status(|resp| {
+            anyhow::anyhow!("{} ({})", String::from_utf8_lossy(&resp.body), resp.status)
+        })?
+        .as_json::<TokenResponse>()
+        .context("deserialising IAM token")?;
+
+    Ok(response.body.iam_token)
+}
+
+pub fn fetch_translation(message: &str) -> Result<String> {
+    let request_body = json!({
+        "folderId": "b1gq41rsbggeum4qafnh",
+        "texts": [ message ],
+        "targetLanguageCode": "en",
+    });
+
+    let response = crimp::Request::post(TRANSLATE_URL)
+        .bearer_auth(&current_token()?)
+        .context("adding 'Bearer' token")?
+        .json(&request_body)
+        .context("preparing JSON body")?
+        .send()
+        .context("failed to fetch translation from yandex")?
+        .error_for_status(|resp| {
+            anyhow!(
+                "translation request failed: {} ({})",
+                String::from_utf8_lossy(&resp.body),
+                resp.status
+            )
+        })?
+        .as_json::<Value>()?
+        .body;
+
+    let translation = response
+        .get("translations")
+        .ok_or_else(|| anyhow!("missing 'translations' key"))?
+        .get(0)
+        .ok_or_else(|| anyhow!("translations list is empty"))?
+        .get("text")
+        .ok_or_else(|| anyhow!("translation missing 'text' key"))?
+        .as_str()
+        .ok_or_else(|| anyhow!("'text' was not a string"))?;
+
+    Ok(translation.to_string())
+}