From 38042ea4452b224499b24ee5390a56774d74375c Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Wed, 24 May 2023 19:08:40 +0300 Subject: feat(tazjin/tgsa): add gpt-3 powered message translation feature this is slow and often overloaded, but it's kind of cool when it works. this translation method deals much better with the kind of slang you'd see in telegram posts than any other method. Change-Id: I7e4c845eb382f0eac627c4237b492c8e40dae574 Reviewed-on: https://cl.tvl.fyi/c/depot/+/8625 Reviewed-by: tazjin Tested-by: BuildkiteCI --- users/tazjin/tgsa/Cargo.lock | 2 + users/tazjin/tgsa/Cargo.toml | 2 + users/tazjin/tgsa/src/main.rs | 86 +++++++++++++++++++++++++++++++++++++++---- 3 files changed, 83 insertions(+), 7 deletions(-) (limited to 'users/tazjin') diff --git a/users/tazjin/tgsa/Cargo.lock b/users/tazjin/tgsa/Cargo.lock index 386355cdda..51d11135f3 100644 --- a/users/tazjin/tgsa/Cargo.lock +++ b/users/tazjin/tgsa/Cargo.lock @@ -1075,6 +1075,8 @@ dependencies = [ "ego-tree", "rouille", "scraper", + "serde", + "serde_json", "url", ] diff --git a/users/tazjin/tgsa/Cargo.toml b/users/tazjin/tgsa/Cargo.toml index b589a8174e..0b15298058 100644 --- a/users/tazjin/tgsa/Cargo.toml +++ b/users/tazjin/tgsa/Cargo.toml @@ -10,3 +10,5 @@ rouille = { version = "3.5", default-features = false } url = "2.3" scraper = "0.13" ego-tree = "0.6" # in tandem with 'scraper' +serde = "1.0" +serde_json = "1.0" diff --git a/users/tazjin/tgsa/src/main.rs b/users/tazjin/tgsa/src/main.rs index 508b9872a2..bff0292403 100644 --- a/users/tazjin/tgsa/src/main.rs +++ b/users/tazjin/tgsa/src/main.rs @@ -1,13 +1,15 @@ use anyhow::{anyhow, Context, Result}; +use scraper::{Html, Selector}; +use serde_json::Value; use std::collections::HashMap; use std::sync::RwLock; use std::time::{Duration, Instant}; -use scraper::{Html, Selector}; #[derive(Clone, Debug, Eq, Hash, PartialEq)] struct TgLink { username: String, message_id: usize, + translated: bool, } impl TgLink { @@ -16,10 +18,15 @@ impl TgLink { } fn to_url(&self, embed: bool) -> String { - format!("https://t.me/{}/{}{}", self.username, self.message_id, if embed { "?embed=1" } else { "" }) + format!( + "https://t.me/{}/{}{}", + self.username, + self.message_id, + if embed { "?embed=1" } else { "" } + ) } - fn parse(url: &str) -> Option { + fn parse(url: &str, translated: bool) -> Option { let url = url.strip_prefix("/")?; let parsed = url::Url::parse(url).ok()?; @@ -37,6 +44,7 @@ impl TgLink { Some(TgLink { username: parts[0].into(), message_id: parts[1].parse().ok()?, + translated, }) } } @@ -55,6 +63,46 @@ fn fetch_post(link: &TgLink, embed: bool) -> Result { Ok(response.body) } +fn fetch_translation(message: &str) -> Result { + let request = serde_json::json!({ + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "user", "content": "Please translate the following message from a Telegram channel into English. If the post is already partially in English, please leave those bits intact as they are. Please respond only with the translation."}, + {"role": "user", "content": message} + ] + }); + + let response: Value = crimp::Request::post("https://api.openai.com/v1/chat/completions") + .bearer_auth(&std::env::var("OPENAPI_KEY").context("no openapi key set")?)? + .json(&request)? + .send() + .context("failed to fetch translation from openai")? + .as_json::()? + .error_for_status(|resp| { + anyhow!( + "translation request failed: {} ({})", + resp.body, + resp.status + ) + })? + .body; + + // we want choices[0].message.content, and inshallah it's the right thing. + let translation = response + .get("choices") + .ok_or_else(|| anyhow!("missing 'choices' key"))? + .get(0) + .ok_or_else(|| anyhow!("empty 'choices' or something"))? + .get("message") + .ok_or_else(|| anyhow!("missing 'message' key"))? + .get("content") + .ok_or_else(|| anyhow!("missing 'content' key"))? + .as_str() + .ok_or_else(|| anyhow!("'content' was not a string"))?; + + Ok(translation.to_string()) +} + // in some cases, posts can not be embedded, but telegram still // includes their content in metadata tags for content previews. // @@ -255,6 +303,12 @@ fn fetch_with_cache(cache: &Cache, link: &TgLink) -> Result { msg.message = fetch_fallback(&link)?; } + if let Some(message) = &msg.message { + if link.translated { + msg.message = Some(fetch_translation(message)?); + } + } + let bbcode = to_bbcode(&link, &msg); let mut media = vec![]; @@ -292,6 +346,7 @@ fn handle_img_redirect(cache: &Cache, img_path: &str) -> Result Ok(rouille::Response::text( r#"tgsa ---- @@ -345,7 +408,16 @@ yes, that looks stupid, but it works if you see this message and think you did the above correctly, you didn't. try again. idiot. -pm me on the forums if this makes you mad or something. +it can also translate posts from russian, ukrainian or whatever other +dumb language you speak into english, by adding `/translate/`, for +example: + + https://tgsa.tazj.in/translate/https://t.me/strelkovii/4329 + +expect this to be slow though. that's the price to pay for translating +shitty slang. + +pm me on the forums if any of this makes you mad or something. "#, )), Some(link) => handle_tg_link(&cache, &link), -- cgit 1.4.1