about summary refs log tree commit diff
path: root/users
diff options
context:
space:
mode:
author Vincent Ambo <mail@tazj.in> 2023-06-18T12·15+0300
committer tazjin <tazjin@tvl.su> 2023-06-18T12·43+0000
commit 332a821100be288863ad0bac5f655ed512e4fd19 (patch)
tree 44469c3578b4418bb368e0942edffb5b00b4d6ff /users
parent 6678e768a08e993c586756ad1815a5fcf1f520b6 (diff)
feat(tazjin/tgsa): replace translation backend r/6327
The GPT backend is cool, but it's also very slow, prone to request
errors and quite expensive.

This switches to Yandex Translate instead, which seems to be totally
fine for all posts that I tested.

Change-Id: I5217113995b701508a83e7782eb1325957996719
Reviewed-on: https://cl.tvl.fyi/c/depot/+/8826
Tested-by: BuildkiteCI
Reviewed-by: tazjin <tazjin@tvl.su>
Diffstat (limited to 'users')
-rw-r--r-- users/tazjin/tgsa/Cargo.lock 145
-rw-r--r-- users/tazjin/tgsa/Cargo.toml 6
-rw-r--r-- users/tazjin/tgsa/src/main.rs 47
-rw-r--r-- users/tazjin/tgsa/src/translate.rs 191
4 files changed, 327 insertions, 62 deletions
diff --git a/users/tazjin/tgsa/Cargo.lock b/users/tazjin/tgsa/Cargo.lock
index 51d11135f37f..bcea1a9831d3 100644
--- a/users/tazjin/tgsa/Cargo.lock
+++ b/users/tazjin/tgsa/Cargo.lock
@@ -36,6 +36,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd"
 
 [[package]]
+name = "base64"
+version = "0.21.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d"
+
+[[package]]
 name = "bitflags"
 version = "1.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -130,7 +136,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "smallvec",
- "syn",
+ "syn 1.0.101",
 ]
 
 [[package]]
@@ -140,7 +146,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dfae75de57f2b2e85e8768c3ea840fd159c8f33e2b6522c7835b7abac81be16e"
 dependencies = [
  "quote",
- "syn",
+ "syn 1.0.101",
 ]
 
 [[package]]
@@ -183,7 +189,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "rustc_version",
- "syn",
+ "syn 1.0.101",
 ]
 
 [[package]]
@@ -229,6 +235,21 @@ dependencies = [
 ]
 
 [[package]]
+name = "foreign-types"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
+dependencies = [
+ "foreign-types-shared",
+]
+
+[[package]]
+name = "foreign-types-shared"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
+
+[[package]]
 name = "form_urlencoded"
 version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -307,7 +328,7 @@ dependencies = [
  "markup5ever",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.101",
 ]
 
 [[package]]
@@ -535,6 +556,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1"
 
 [[package]]
+name = "openssl"
+version = "0.10.54"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69b3f656a17a6cbc115b5c7a40c616947d213ba182135b014d6051b73ab6f019"
+dependencies = [
+ "bitflags",
+ "cfg-if",
+ "foreign-types",
+ "libc",
+ "once_cell",
+ "openssl-macros",
+ "openssl-sys",
+]
+
+[[package]]
+name = "openssl-macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.18",
+]
+
+[[package]]
 name = "openssl-probe"
 version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -542,11 +589,10 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
 
 [[package]]
 name = "openssl-sys"
-version = "0.9.76"
+version = "0.9.88"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5230151e44c0f05157effb743e8d517472843121cf9243e8b81393edb5acd9ce"
+checksum = "c2ce0f250f34a308dcfdbb351f511359857d4ed2134ba715a4eadd46e1ffd617"
 dependencies = [
- "autocfg",
  "cc",
  "libc",
  "pkg-config",
@@ -653,7 +699,7 @@ dependencies = [
  "proc-macro-hack",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.101",
 ]
 
 [[package]]
@@ -700,9 +746,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.46"
+version = "1.0.60"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b"
+checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406"
 dependencies = [
  "unicode-ident",
 ]
@@ -715,9 +761,9 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
 
 [[package]]
 name = "quote"
-version = "1.0.21"
+version = "1.0.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
+checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
 dependencies = [
  "proc-macro2",
 ]
@@ -822,12 +868,27 @@ dependencies = [
 ]
 
 [[package]]
+name = "ring"
+version = "0.16.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc"
+dependencies = [
+ "cc",
+ "libc",
+ "once_cell",
+ "spin",
+ "untrusted",
+ "web-sys",
+ "winapi",
+]
+
+[[package]]
 name = "rouille"
 version = "3.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "18b2380c42510ef4a28b5f228a174c801e0dec590103e215e60812e2e2f34d05"
 dependencies = [
- "base64",
+ "base64 0.13.0",
  "chrono",
  "filetime",
  "multipart",
@@ -928,6 +989,9 @@ name = "serde"
 version = "1.0.145"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "728eb6351430bccb993660dfffc5a72f91ccc1295abaa8ce19b27ebe4f75568b"
+dependencies = [
+ "serde_derive",
+]
 
 [[package]]
 name = "serde_derive"
@@ -937,7 +1001,7 @@ checksum = "81fa1584d3d1bcacd84c277a0dfe21f5b0f6accf4a23d04d4c6d61f1af522b4c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.101",
 ]
 
 [[package]]
@@ -999,6 +1063,12 @@ dependencies = [
 ]
 
 [[package]]
+name = "spin"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
+
+[[package]]
 name = "stable_deref_trait"
 version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1042,6 +1112,17 @@ dependencies = [
 ]
 
 [[package]]
+name = "syn"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
 name = "tempfile"
 version = "3.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1071,8 +1152,12 @@ name = "tgsa"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "base64 0.21.2",
  "crimp",
  "ego-tree",
+ "lazy_static",
+ "openssl",
+ "ring",
  "rouille",
  "scraper",
  "serde",
@@ -1097,15 +1182,23 @@ dependencies = [
 
 [[package]]
 name = "time"
-version = "0.3.15"
+version = "0.3.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d634a985c4d4238ec39cacaed2e7ae552fbd3c476b552c1deac3021b7d7eaf0c"
+checksum = "ea9e1b3cf1243ae005d9e74085d4d542f3125458f3a81af210d901dcd7411efd"
 dependencies = [
  "libc",
  "num_threads",
+ "serde",
+ "time-core",
 ]
 
 [[package]]
+name = "time-core"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"
+
+[[package]]
 name = "tiny_http"
 version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1179,6 +1272,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
 
 [[package]]
+name = "untrusted"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
+
+[[package]]
 name = "url"
 version = "2.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1240,7 +1339,7 @@ dependencies = [
  "once_cell",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.101",
  "wasm-bindgen-shared",
 ]
 
@@ -1262,7 +1361,7 @@ checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.101",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
@@ -1274,6 +1373,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f"
 
 [[package]]
+name = "web-sys"
+version = "0.3.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
 name = "winapi"
 version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/users/tazjin/tgsa/Cargo.toml b/users/tazjin/tgsa/Cargo.toml
index 0b1529805864..8764ef652491 100644
--- a/users/tazjin/tgsa/Cargo.toml
+++ b/users/tazjin/tgsa/Cargo.toml
@@ -10,5 +10,9 @@ rouille = { version = "3.5", default-features = false }
 url = "2.3"
 scraper = "0.13"
 ego-tree = "0.6" # in tandem with 'scraper'
-serde = "1.0"
+serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
+ring = "0.16.20"
+openssl = "0.10.54"
+base64 = "0.21.2"
+lazy_static = "1.4.0"
diff --git a/users/tazjin/tgsa/src/main.rs b/users/tazjin/tgsa/src/main.rs
index 1c7ee408abe4..cbdb272e0e3f 100644
--- a/users/tazjin/tgsa/src/main.rs
+++ b/users/tazjin/tgsa/src/main.rs
@@ -1,10 +1,11 @@
 use anyhow::{anyhow, Context, Result};
 use scraper::{Html, Selector};
-use serde_json::Value;
 use std::collections::HashMap;
 use std::sync::RwLock;
 use std::time::{Duration, Instant};
 
+mod translate;
+
 #[derive(Clone, Debug, Eq, Hash, PartialEq)]
 struct TgLink {
     username: String,
@@ -63,46 +64,6 @@ fn fetch_post(link: &TgLink, embed: bool) -> Result<String> {
     Ok(response.body)
 }
 
-fn fetch_translation(message: &str) -> Result<String> {
-    let request = serde_json::json!({
-        "model": "gpt-3.5-turbo",
-        "messages": [
-            {"role": "user", "content": "Please translate the following message from a Telegram channel into English. If the post is already partially in English, please leave those bits intact as they are. Please respond only with the translation."},
-            {"role": "user", "content": message}
-        ]
-    });
-
-    let response: Value = crimp::Request::post("https://api.openai.com/v1/chat/completions")
-        .bearer_auth(&std::env::var("OPENAPI_KEY").context("no openapi key set")?)?
-        .json(&request)?
-        .send()
-        .context("failed to fetch translation from openai")?
-        .as_json::<Value>()?
-        .error_for_status(|resp| {
-            anyhow!(
-                "translation request failed: {} ({})",
-                resp.body,
-                resp.status
-            )
-        })?
-        .body;
-
-    // we want choices[0].message.content, and inshallah it's the right thing.
-    let translation = response
-        .get("choices")
-        .ok_or_else(|| anyhow!("missing 'choices' key"))?
-        .get(0)
-        .ok_or_else(|| anyhow!("empty 'choices' or something"))?
-        .get("message")
-        .ok_or_else(|| anyhow!("missing 'message' key"))?
-        .get("content")
-        .ok_or_else(|| anyhow!("missing 'content' key"))?
-        .as_str()
-        .ok_or_else(|| anyhow!("'content' was not a string"))?;
-
-    Ok(translation.to_string())
-}
-
 // in some cases, posts can not be embedded, but telegram still
 // includes their content in metadata tags for content previews.
 //
@@ -306,7 +267,7 @@ fn fetch_with_cache(cache: &Cache, link: &TgLink) -> Result<TgPost> {
     if let Some(message) = &msg.message {
         if link.translated {
             println!("translating {}#{}", link.username, link.message_id);
-            msg.message = Some(fetch_translation(message)?);
+            msg.message = Some(translate::fetch_translation(message)?);
         }
     }
 
@@ -410,7 +371,7 @@ if you see this message and think you did the above correctly, you
 didn't. try again. idiot.
 
 it can also translate posts from russian, ukrainian or whatever other
-dumb language you speak into english, by adding `/translate/`, for
+dumb language you speak into english by adding `/translate/`, for
 example:
 
   https://tgsa.tazj.in/translate/https://t.me/strelkovii/4329
diff --git a/users/tazjin/tgsa/src/translate.rs b/users/tazjin/tgsa/src/translate.rs
new file mode 100644
index 000000000000..84c74cc5b36c
--- /dev/null
+++ b/users/tazjin/tgsa/src/translate.rs
@@ -0,0 +1,191 @@
+//! integration with yandex cloud translate api, for automatically
+//! translating telegram posts.
+//!
+//! most of this module is concerned with handling the authentication
+//! tokens for yandex cloud, as jwt signing needs to be handled
+//! manually (none of the rust jwt libraries that i tried actually
+//! work).
+
+use anyhow::{anyhow, Context, Result};
+use base64::prelude::BASE64_URL_SAFE_NO_PAD as B64;
+use base64::Engine;
+use lazy_static::lazy_static;
+use ring::signature as sig;
+use serde::Deserialize;
+use serde_json::{json, Value};
+use std::sync::Mutex;
+use std::time::{Duration, SystemTime};
+
+/// token exchange url (exchanging a signed jwt for an iam token
+/// understood by the translation service)
+const TOKEN_URL: &str = "https://iam.api.cloud.yandex.net/iam/v1/tokens";
+
+/// translation endpoint
+const TRANSLATE_URL: &str = "https://translate.api.cloud.yandex.net/translate/v2/translate";
+
+/// describes the private key as downloaded from yandex, pem-encoded.
+#[derive(Deserialize)]
+struct AuthorizedKey {
+    id: String,
+    service_account_id: String,
+    private_key: String,
+}
+
+/// cached iam token for yandex cloud
+struct Token {
+    token: String,
+    expiry: SystemTime,
+}
+
+impl Token {
+    fn is_expired(&self) -> bool {
+        self.expiry < SystemTime::now()
+    }
+}
+
+lazy_static! {
+    static ref KEY_FILE: String =
+        std::env::var("YANDEX_KEY_FILE").expect("`YANDEX_KEY_FILE` variable should be set");
+    static ref CACHED_TOKEN: Mutex<Token> = {
+        let token = refresh_token().expect("fetching initial translation token must not fail");
+        Mutex::new(token)
+    };
+}
+
+/// wrap all the authentication logic below into a single function.
+fn refresh_token() -> Result<Token> {
+    let file = std::fs::File::open(KEY_FILE.as_str())?;
+    let key: AuthorizedKey = serde_json::from_reader(file)?;
+    let jwt = sign_yandex_jwt(&key)?;
+    let token = fetch_iam_token(&jwt)?;
+
+    Ok(Token {
+        token,
+        expiry: SystemTime::now() + Duration::from_secs(3600),
+    })
+}
+
+/// wrapper around the cached token that refreshes if required.
+fn current_token() -> Result<String> {
+    let mut token = CACHED_TOKEN
+        .lock()
+        .expect("thread operating on token should never fail");
+
+    if token.is_expired() {
+        println!("refreshing translation token");
+        *token = refresh_token().context("refreshing translation token")?;
+    }
+
+    Ok(token.token.clone())
+}
+
+/// use openssl to read the pem-encoded key, as ring itself is not
+/// capable of this.
+fn read_pem_key(key: &AuthorizedKey) -> Result<sig::RsaKeyPair> {
+    let rsa = openssl::rsa::Rsa::private_key_from_pem(key.private_key.as_bytes())
+        .context("parsing RSA key")?;
+
+    let der = rsa
+        .private_key_to_der()
+        .context("encoding key as DER for ring")?;
+
+    sig::RsaKeyPair::from_der(&der).map_err(|err| anyhow!("decoding DER key in ring: {}", err))
+}
+
+/// manually construct and sign the jwt required to perform the
+/// iam-token key exchange with yandex.
+fn sign_yandex_jwt(key: &AuthorizedKey) -> Result<String> {
+    let iat = SystemTime::now()
+        .duration_since(SystemTime::UNIX_EPOCH)?
+        .as_secs();
+
+    let header = json!({
+        "typ": "JWT",
+        "alg": "PS256",
+        "kid": key.id,
+    })
+    .to_string();
+
+    let payload = json!({
+        "iss": key.service_account_id,
+        "aud": TOKEN_URL,
+        "iat": iat,
+        "exp": iat + 60,
+    })
+    .to_string();
+
+    let unsigned = format!("{}.{}", B64.encode(header), B64.encode(payload));
+    let key_pair = read_pem_key(key)?;
+
+    let rng = ring::rand::SystemRandom::new();
+    let mut signature = vec![0; key_pair.public_modulus_len()];
+    key_pair
+        .sign(
+            &sig::RSA_PSS_SHA256,
+            &rng,
+            unsigned.as_bytes(),
+            &mut signature,
+        )
+        .map_err(|err| anyhow!("while signing JWT: {}", err))?;
+
+    Ok(format!("{}.{}", unsigned, B64.encode(&signature)))
+}
+
+/// exchange the jwt for an iam token
+fn fetch_iam_token(token: &str) -> Result<String> {
+    #[derive(Deserialize)]
+    #[serde(rename_all = "camelCase")]
+    struct TokenResponse {
+        iam_token: String,
+    }
+
+    let response = crimp::Request::post(TOKEN_URL)
+        .json(&json!({
+            "jwt": token,
+        }))?
+        .send()?
+        .error_for_status(|resp| {
+            anyhow::anyhow!("{} ({})", String::from_utf8_lossy(&resp.body), resp.status)
+        })?
+        .as_json::<TokenResponse>()
+        .context("deserialising IAM token")?;
+
+    Ok(response.body.iam_token)
+}
+
+pub fn fetch_translation(message: &str) -> Result<String> {
+    let request_body = json!({
+        "folderId": "b1gq41rsbggeum4qafnh",
+        "texts": [ message ],
+        "targetLanguageCode": "en",
+    });
+
+    let response = crimp::Request::post(TRANSLATE_URL)
+        .bearer_auth(&current_token()?)
+        .context("adding 'Bearer' token")?
+        .json(&request_body)
+        .context("preparing JSON body")?
+        .send()
+        .context("failed to fetch translation from yandex")?
+        .error_for_status(|resp| {
+            anyhow!(
+                "translation request failed: {} ({})",
+                String::from_utf8_lossy(&resp.body),
+                resp.status
+            )
+        })?
+        .as_json::<Value>()?
+        .body;
+
+    let translation = response
+        .get("translations")
+        .ok_or_else(|| anyhow!("missing 'translations' key"))?
+        .get(0)
+        .ok_or_else(|| anyhow!("translations list is empty"))?
+        .get("text")
+        .ok_or_else(|| anyhow!("translation missing 'text' key"))?
+        .as_str()
+        .ok_or_else(|| anyhow!("'text' was not a string"))?;
+
+    Ok(translation.to_string())
+}