From eadcfbbfab12edd7b956a31a88330dbc0ba0b038 Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Wed, 12 Apr 2023 15:25:20 +0300 Subject: feat(tazjin/tgsa): support extracting fallback message from preview some telegram channels do not allow embedding of messages, but do allow a preview to be shown on twitter. this preview is just embedded in the html, and can be scraped out if no message was found. technically this preview also contains image links, but they are to very low resolution, thumbnail-style images so i decided not to include them here. Change-Id: Ifb89f9fbde8140d577a5ee3af6e60b04232e53e3 Reviewed-on: https://cl.tvl.fyi/c/depot/+/8480 Autosubmit: tazjin Reviewed-by: tazjin Tested-by: BuildkiteCI --- users/tazjin/tgsa/src/main.rs | 48 +++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/users/tazjin/tgsa/src/main.rs b/users/tazjin/tgsa/src/main.rs index ed72569f9204..508b9872a254 100644 --- a/users/tazjin/tgsa/src/main.rs +++ b/users/tazjin/tgsa/src/main.rs @@ -2,6 +2,7 @@ use anyhow::{anyhow, Context, Result}; use std::collections::HashMap; use std::sync::RwLock; use std::time::{Duration, Instant}; +use scraper::{Html, Selector}; #[derive(Clone, Debug, Eq, Hash, PartialEq)] struct TgLink { @@ -14,8 +15,8 @@ impl TgLink { format!("t.me/{}/{}", self.username, self.message_id) } - fn to_url(&self) -> String { - format!("https://t.me/{}/{}?embed=1", self.username, self.message_id) + fn to_url(&self, embed: bool) -> String { + format!("https://t.me/{}/{}{}", self.username, self.message_id, if embed { "?embed=1" } else { "" }) } fn parse(url: &str) -> Option { @@ -40,9 +41,9 @@ impl TgLink { } } -fn fetch_embed(link: &TgLink) -> Result { +fn fetch_post(link: &TgLink, embed: bool) -> Result { println!("fetching {}#{}", link.username, link.message_id); - let response = crimp::Request::get(&link.to_url()) + let response = crimp::Request::get(&link.to_url(embed)) .send() .context("failed to fetch embed data")? .as_string() @@ -54,6 +55,28 @@ fn fetch_embed(link: &TgLink) -> Result { Ok(response.body) } +// in some cases, posts can not be embedded, but telegram still +// includes their content in metadata tags for content previews. +// +// we skip images in this case, as they are scaled down to thumbnail +// size and not useful. +fn fetch_fallback(link: &TgLink) -> Result> { + let post = fetch_post(link, false)?; + let doc = Html::parse_document(&post); + let desc_sel = Selector::parse("meta[property=\"og:description\"]").unwrap(); + let desc_elem = match doc.select(&desc_sel).next() { + None => return Ok(None), + Some(elem) => elem, + }; + + let content = match desc_elem.value().attr("content") { + None => return Ok(None), + Some(content) => content.to_string(), + }; + + return Ok(Some(content)); +} + #[derive(Debug)] struct TgMessage { author: String, @@ -71,8 +94,6 @@ fn extract_photo_url(style: &str) -> Option<&str> { } fn parse_tgmessage(embed: &str) -> Result { - use scraper::{Html, Selector}; - let doc = Html::parse_document(embed); let author_sel = Selector::parse("a.tgme_widget_message_owner_name").unwrap(); @@ -164,7 +185,7 @@ fn to_bbcode(link: &TgLink, msg: &TgMessage) -> String { out.push_str(&format!("[quote=\"{}\"]\n", msg.author)); for video in 0..msg.videos.len() { - out.push_str(&format!("[url=\"{}\"]", link.to_url())); + out.push_str(&format!("[url=\"{}\"]", link.to_url(true))); // video thumbnail links are appended to the photos, hence the // addition here @@ -184,7 +205,7 @@ fn to_bbcode(link: &TgLink, msg: &TgMessage) -> String { if msg.has_audio { out.push_str(&format!( "[i]This message has audio attached. Go [url=\"{}\"]to Telegram[/url] to listen.[/i]", - link.to_url(), + link.to_url(true), )); } @@ -196,7 +217,7 @@ fn to_bbcode(link: &TgLink, msg: &TgMessage) -> String { out.push_str(&format!( "[sub](from [url=\"{}\"]{}[/url], via [url=\"https://tgsa.tazj.in\"]tgsa[/url])[/sub]\n", - link.to_url(), + link.to_url(true), link.human_friendly_url(), )); @@ -227,8 +248,13 @@ fn fetch_with_cache(cache: &Cache, link: &TgLink) -> Result { // TODO(tazjin): per link? let mut writer = cache.write().unwrap(); - let embed = fetch_embed(&link)?; - let mut msg = parse_tgmessage(&embed)?; + let post = fetch_post(&link, true)?; + let mut msg = parse_tgmessage(&post)?; + + if msg.message.is_none() { + msg.message = fetch_fallback(&link)?; + } + let bbcode = to_bbcode(&link, &msg); let mut media = vec![]; -- cgit 1.4.1