diff options
author | Vincent Ambo <mail@tazj.in> | 2023-05-24T16·08+0300 |
---|---|---|
committer | tazjin <tazjin@tvl.su> | 2023-05-24T16·28+0000 |
commit | 38042ea4452b224499b24ee5390a56774d74375c (patch) | |
tree | 13b4da979070388bc62f3a418d8d9f4e6a4a8df5 /users/tazjin | |
parent | 63047449d7ac5d92a50c67b5f14b0cce0da5ba81 (diff) |
feat(tazjin/tgsa): add gpt-3 powered message translation feature r/6194
this is slow and often overloaded, but it's kind of cool when it works.

this translation method deals much better with the kind of slang you'd see in telegram posts than any other method.

Change-Id: I7e4c845eb382f0eac627c4237b492c8e40dae574
Reviewed-on: https://cl.tvl.fyi/c/depot/+/8625
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Diffstat (limited to 'users/tazjin')
-rw-r--r-- | users/tazjin/tgsa/Cargo.lock | 2 | ||||
-rw-r--r-- | users/tazjin/tgsa/Cargo.toml | 2 | ||||
-rw-r--r-- | users/tazjin/tgsa/src/main.rs | 86 |
3 files changed, 83 insertions, 7 deletions
diff --git a/users/tazjin/tgsa/Cargo.lock b/users/tazjin/tgsa/Cargo.lock index 386355cddaeb..51d11135f37f 100644 --- a/users/tazjin/tgsa/Cargo.lock +++ b/users/tazjin/tgsa/Cargo.lock @@ -1075,6 +1075,8 @@ dependencies = [ "ego-tree", "rouille", "scraper", + "serde", + "serde_json", "url", ] diff --git a/users/tazjin/tgsa/Cargo.toml b/users/tazjin/tgsa/Cargo.toml index b589a8174e8e..0b1529805864 100644 --- a/users/tazjin/tgsa/Cargo.toml +++ b/users/tazjin/tgsa/Cargo.toml @@ -10,3 +10,5 @@ rouille = { version = "3.5", default-features = false } url = "2.3" scraper = "0.13" ego-tree = "0.6" # in tandem with 'scraper' +serde = "1.0" +serde_json = "1.0" diff --git a/users/tazjin/tgsa/src/main.rs b/users/tazjin/tgsa/src/main.rs index 508b9872a254..bff02924038f 100644 --- a/users/tazjin/tgsa/src/main.rs +++ b/users/tazjin/tgsa/src/main.rs @@ -1,13 +1,15 @@ use anyhow::{anyhow, Context, Result}; +use scraper::{Html, Selector}; +use serde_json::Value; use std::collections::HashMap; use std::sync::RwLock; use std::time::{Duration, Instant}; -use scraper::{Html, Selector}; #[derive(Clone, Debug, Eq, Hash, PartialEq)] struct TgLink { username: String, message_id: usize, + translated: bool, } impl TgLink { @@ -16,10 +18,15 @@ impl TgLink { } fn to_url(&self, embed: bool) -> String { - format!("https://t.me/{}/{}{}", self.username, self.message_id, if embed { "?embed=1" } else { "" }) + format!( + "https://t.me/{}/{}{}", + self.username, + self.message_id, + if embed { "?embed=1" } else { "" } + ) } - fn parse(url: &str) -> Option<Self> { + fn parse(url: &str, translated: bool) -> Option<Self> { let url = url.strip_prefix("/")?; let parsed = url::Url::parse(url).ok()?; @@ -37,6 +44,7 @@ impl TgLink { Some(TgLink { username: parts[0].into(), message_id: parts[1].parse().ok()?, + translated, }) } } @@ -55,6 +63,46 @@ fn fetch_post(link: &TgLink, embed: bool) -> Result<String> { Ok(response.body) } +fn fetch_translation(message: &str) -> Result<String> { + let request = 
serde_json::json!({ + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "user", "content": "Please translate the following message from a Telegram channel into English. If the post is already partially in English, please leave those bits intact as they are. Please respond only with the translation."}, + {"role": "user", "content": message} + ] + }); + + let response: Value = crimp::Request::post("https://api.openai.com/v1/chat/completions") + .bearer_auth(&std::env::var("OPENAPI_KEY").context("no openapi key set")?)? + .json(&request)? + .send() + .context("failed to fetch translation from openai")? + .as_json::<Value>()? + .error_for_status(|resp| { + anyhow!( + "translation request failed: {} ({})", + resp.body, + resp.status + ) + })? + .body; + + // we want choices[0].message.content, and inshallah it's the right thing. + let translation = response + .get("choices") + .ok_or_else(|| anyhow!("missing 'choices' key"))? + .get(0) + .ok_or_else(|| anyhow!("empty 'choices' or something"))? + .get("message") + .ok_or_else(|| anyhow!("missing 'message' key"))? + .get("content") + .ok_or_else(|| anyhow!("missing 'content' key"))? + .as_str() + .ok_or_else(|| anyhow!("'content' was not a string"))?; + + Ok(translation.to_string()) +} + // in some cases, posts can not be embedded, but telegram still // includes their content in metadata tags for content previews. 
// @@ -255,6 +303,12 @@ fn fetch_with_cache(cache: &Cache, link: &TgLink) -> Result<TgPost> { msg.message = fetch_fallback(&link)?; } + if let Some(message) = &msg.message { + if link.translated { + msg.message = Some(fetch_translation(message)?); + } + } + let bbcode = to_bbcode(&link, &msg); let mut media = vec![]; @@ -292,6 +346,7 @@ fn handle_img_redirect(cache: &Cache, img_path: &str) -> Result<rouille::Respons let link = TgLink { username: img_parts[0].into(), message_id: img_parts[1].parse().context("failed to parse message_id")?, + translated: false, }; let img_idx: usize = img_parts[2].parse().context("failed to parse img_idx")?; @@ -320,12 +375,20 @@ fn main() { let cache: Cache = RwLock::new(HashMap::new()); rouille::start_server("0.0.0.0:8472", move |request| { + let mut raw_url = request.raw_url(); + let mut translate = false; + let response = loop { - if request.raw_url().starts_with("/img/") { - break handle_img_redirect(&cache, &request.raw_url()[5..]); + if raw_url.starts_with("/img/") { + break handle_img_redirect(&cache, &raw_url[5..]); } - break match TgLink::parse(request.raw_url()) { + if raw_url.starts_with("/translate/") { + translate = true; + raw_url = &raw_url[10..]; + } + + break match TgLink::parse(raw_url, translate) { None => Ok(rouille::Response::text( r#"tgsa ---- @@ -345,7 +408,16 @@ yes, that looks stupid, but it works if you see this message and think you did the above correctly, you didn't. try again. idiot. -pm me on the forums if this makes you mad or something. +it can also translate posts from russian, ukrainian or whatever other +dumb language you speak into english, by adding `/translate/`, for +example: + + https://tgsa.tazj.in/translate/https://t.me/strelkovii/4329 + +expect this to be slow though. that's the price to pay for translating +shitty slang. + +pm me on the forums if any of this makes you mad or something. "#, )), Some(link) => handle_tg_link(&cache, &link), |