rewrite markdown links (fixes #2987)

Felix Ableitner 2024-09-25 15:53:57 +02:00
parent b803504f09
commit e8e105dd29
6 changed files with 75 additions and 37 deletions
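For context, the helper this commit generalizes, `markdown_rewrite_image_links`, rewrites remote image links so they go through the local image proxy and returns the URLs it found. A minimal sketch of that observable behavior (the exact proxied URL depends on the instance hostname in the settings, so only the stable part is asserted here):

```rust
// Sketch only: a remote image URL is rewritten to point at the local
// /api/v3/image_proxy endpoint (percent-encoded), and every parsed link
// URL is also returned to the caller.
let input = "![logo](http://example.com/image.jpg)".to_string();
let (rewritten, links) = markdown_rewrite_image_links(input);
assert!(rewritten.contains("/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg"));
assert_eq!(links[0].as_str(), "http://example.com/image.jpg");
```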

@@ -76,7 +76,7 @@ impl LemmyContext {
       .app_data(context)
       .debug(true)
       // Dont allow any network fetches
-      .http_fetch_limit(0)
+      .http_fetch_limit(10)
       .build()
       .await
       .expect("build federation config")

@@ -50,7 +50,7 @@ use lemmy_utils::{
   rate_limit::{ActionType, BucketConfig},
   settings::structs::{PictrsImageMode, Settings},
   utils::{
-    markdown::{markdown_check_for_blocked_urls, markdown_rewrite_image_links},
+    markdown::{image_links::markdown_rewrite_image_links, markdown_check_for_blocked_urls},
     slurs::{build_slur_regex, remove_slurs},
     validation::clean_urls_in_text,
   },

@@ -26,6 +26,7 @@ pub mod fetcher;
 pub mod http;
 pub(crate) mod mentions;
 pub mod objects;
+mod post_links;
 pub mod protocol;
 /// Maximum number of outgoing HTTP requests to fetch a single object. Needs to be high enough

@@ -3,6 +3,7 @@ use crate::{
   check_apub_id_valid_with_strictness,
   local_site_data_cached,
   objects::{read_from_string_or_source_opt, verify_is_remote_object},
+  post_links::markdown_rewrite_remote_post_links_opt,
   protocol::{
     objects::{
       page::{Attachment, AttributedTo, Hashtag, HashtagType, Page, PageType},
@@ -237,6 +238,7 @@ impl Object for ApubPost {
     let body = read_from_string_or_source_opt(&page.content, &page.media_type, &page.source);
     let body = process_markdown_opt(&body, slur_regex, &url_blocklist, context).await?;
+    let body = markdown_rewrite_remote_post_links_opt(body, context).await;
     let language_id =
       LanguageTag::to_language_id_single(page.language, &mut context.pool()).await?;

@@ -1,46 +1,17 @@
-use super::MARKDOWN_PARSER;
+use super::{link_rule::Link, MARKDOWN_PARSER};
 use crate::settings::SETTINGS;
-use markdown_it::plugins::cmark::inline::image::Image;
+use markdown_it::{plugins::cmark::inline::image::Image, NodeValue};
 use url::Url;
 use urlencoding::encode;
 /// Rewrites all links to remote domains in markdown, so they go through `/api/v3/image_proxy`.
 pub fn markdown_rewrite_image_links(mut src: String) -> (String, Vec<Url>) {
-  let ast = MARKDOWN_PARSER.parse(&src);
-  let mut links_offsets = vec![];
-  // Walk the syntax tree to find positions of image links
-  ast.walk(|node, _depth| {
-    if let Some(image) = node.cast::<Image>() {
-      // srcmap is always present for image
-      // https://github.com/markdown-it-rust/markdown-it/issues/36#issuecomment-1777844387
-      let node_offsets = node.srcmap.expect("srcmap is none").get_byte_offsets();
-      // necessary for custom emojis which look like `![name](url "title")`
-      let start_offset = node_offsets.1
-        - image.url.len()
-        - 1
-        - image
-          .title
-          .as_ref()
-          .map(|t| t.len() + 3)
-          .unwrap_or_default();
-      let end_offset = node_offsets.1 - 1;
-      links_offsets.push((start_offset, end_offset));
-    }
-  });
+  let links_offsets = find_urls::<Image>(&src);
   let mut links = vec![];
   // Go through the collected links in reverse order
-  while let Some((start, end)) = links_offsets.pop() {
-    let content = src.get(start..end).unwrap_or_default();
-    // necessary for custom emojis which look like `![name](url "title")`
-    let (url, extra) = if content.contains(' ') {
-      let split = content.split_once(' ').expect("split is valid");
-      (split.0, Some(split.1))
-    } else {
-      (content, None)
-    };
+  for (start, end) in links_offsets.into_iter().rev() {
+    let (url, extra) = markdown_handle_title(&src, start, end);
     match Url::parse(url) {
       Ok(parsed) => {
         links.push(parsed.clone());
@@ -68,6 +39,61 @@ pub fn markdown_rewrite_image_links(mut src: String) -> (String, Vec<Url>) {
   (src, links)
 }
+pub fn markdown_handle_title(src: &String, start: usize, end: usize) -> (&str, Option<&str>) {
+  let content = src.get(start..end).unwrap_or_default();
+  // necessary for custom emojis which look like `![name](url "title")`
+  let (url, extra) = if content.contains(' ') {
+    let split = content.split_once(' ').expect("split is valid");
+    (split.0, Some(split.1))
+  } else {
+    (content, None)
+  };
+  (url, extra)
+}
+pub fn markdown_find_links(src: &str) -> Vec<(usize, usize)> {
+  find_urls::<Link>(src)
+}
+// Walk the syntax tree to find positions of image or link urls
+fn find_urls<T: NodeValue + UrlAndTitle>(src: &str) -> Vec<(usize, usize)> {
+  let ast = MARKDOWN_PARSER.parse(src);
+  let mut links_offsets = vec![];
+  ast.walk(|node, _depth| {
+    if let Some(image) = node.cast::<T>() {
+      let node_offsets = node.srcmap.expect("srcmap is none").get_byte_offsets();
+      let start_offset = node_offsets.1 - image.url_len() - 1 - image.title_len();
+      let end_offset = node_offsets.1 - 1;
+      links_offsets.push((start_offset, end_offset));
+    }
+  });
+  links_offsets
+}
+pub trait UrlAndTitle {
+  fn url_len(&self) -> usize;
+  fn title_len(&self) -> usize;
+}
+impl UrlAndTitle for Image {
+  fn url_len(&self) -> usize {
+    self.url.len()
+  }
+  fn title_len(&self) -> usize {
+    self.title.as_ref().map(|t| t.len() + 3).unwrap_or_default()
+  }
+}
+impl UrlAndTitle for Link {
+  fn url_len(&self) -> usize {
+    self.url.len()
+  }
+  fn title_len(&self) -> usize {
+    self.title.as_ref().map(|t| t.len() + 3).unwrap_or_default()
+  }
+}
 #[cfg(test)]
 #[expect(clippy::unwrap_used)]
 mod tests {
@@ -75,6 +101,15 @@ mod tests {
   use super::*;
   use pretty_assertions::assert_eq;
+  #[test]
+  fn test_find_links() {
+    let links = markdown_find_links("[test](https://example.com)");
+    assert_eq!(vec![(7, 26)], links);
+    let links = find_urls::<Image>("![test](https://example.com)");
+    assert_eq!(vec![(8, 27)], links);
+  }
   #[test]
   fn test_markdown_proxy_images() {
     let tests: Vec<_> =

@@ -3,7 +3,7 @@ use markdown_it::MarkdownIt;
 use regex::RegexSet;
 use std::sync::LazyLock;
-mod image_links;
+pub mod image_links;
 mod link_rule;
 mod spoiler_rule;
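With `image_links` now public, downstream crates import these helpers through the full module path, as the updated `lemmy_utils` import earlier in this commit shows; for example:

```rust
use lemmy_utils::utils::markdown::image_links::{markdown_find_links, markdown_rewrite_image_links};
```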