rewrite markdown links (fixes #2987)

Felix Ableitner 2024-09-25 15:53:57 +02:00
parent b803504f09
commit e8e105dd29
6 changed files with 75 additions and 37 deletions
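For context, the helper this commit generalizes, `markdown_rewrite_image_links`, rewrites remote image links so they go through the local image proxy and returns the URLs it found. A minimal sketch of that observable behavior (the exact proxied URL depends on the instance hostname in the settings, so only the stable part is asserted here):

```rust
// Sketch only: a remote image URL is rewritten to point at the local
// /api/v3/image_proxy endpoint (percent-encoded), and every parsed link
// URL is also returned to the caller.
let input = "![logo](http://example.com/image.jpg)".to_string();
let (rewritten, links) = markdown_rewrite_image_links(input);
assert!(rewritten.contains("/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg"));
assert_eq!(links[0].as_str(), "http://example.com/image.jpg");
```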

@@ -76,7 +76,7 @@ impl LemmyContext {
       .app_data(context)
       .debug(true)
       // Dont allow any network fetches
-      .http_fetch_limit(0)
+      .http_fetch_limit(10)
       .build()
       .await
       .expect("build federation config")

@@ -50,7 +50,7 @@ use lemmy_utils::{
   rate_limit::{ActionType, BucketConfig},
   settings::structs::{PictrsImageMode, Settings},
   utils::{
-    markdown::{markdown_check_for_blocked_urls, markdown_rewrite_image_links},
+    markdown::{image_links::markdown_rewrite_image_links, markdown_check_for_blocked_urls},
     slurs::{build_slur_regex, remove_slurs},
     validation::clean_urls_in_text,
   },

@@ -26,6 +26,7 @@ pub mod fetcher;
 pub mod http;
 pub(crate) mod mentions;
 pub mod objects;
+mod post_links;
 pub mod protocol;
 /// Maximum number of outgoing HTTP requests to fetch a single object. Needs to be high enough

@@ -3,6 +3,7 @@ use crate::{
   check_apub_id_valid_with_strictness,
   local_site_data_cached,
   objects::{read_from_string_or_source_opt, verify_is_remote_object},
+  post_links::markdown_rewrite_remote_post_links_opt,
   protocol::{
     objects::{
       page::{Attachment, AttributedTo, Hashtag, HashtagType, Page, PageType},
@@ -237,6 +238,7 @@ impl Object for ApubPost {
     let body = read_from_string_or_source_opt(&page.content, &page.media_type, &page.source);
     let body = process_markdown_opt(&body, slur_regex, &url_blocklist, context).await?;
+    let body = markdown_rewrite_remote_post_links_opt(body, context).await;
     let language_id =
       LanguageTag::to_language_id_single(page.language, &mut context.pool()).await?;

@@ -1,46 +1,17 @@
-use super::MARKDOWN_PARSER;
+use super::{link_rule::Link, MARKDOWN_PARSER};
 use crate::settings::SETTINGS;
-use markdown_it::plugins::cmark::inline::image::Image;
+use markdown_it::{plugins::cmark::inline::image::Image, NodeValue};
 use url::Url;
 use urlencoding::encode;
 /// Rewrites all links to remote domains in markdown, so they go through `/api/v3/image_proxy`.
 pub fn markdown_rewrite_image_links(mut src: String) -> (String, Vec<Url>) {
-  let ast = MARKDOWN_PARSER.parse(&src);
-  let mut links_offsets = vec![];
-  // Walk the syntax tree to find positions of image links
-  ast.walk(|node, _depth| {
-    if let Some(image) = node.cast::<Image>() {
-      // srcmap is always present for image
-      // https://github.com/markdown-it-rust/markdown-it/issues/36#issuecomment-1777844387
-      let node_offsets = node.srcmap.expect("srcmap is none").get_byte_offsets();
-      // necessary for custom emojis which look like `![name](url "title")`
-      let start_offset = node_offsets.1
-        - image.url.len()
-        - 1
-        - image
-          .title
-          .as_ref()
-          .map(|t| t.len() + 3)
-          .unwrap_or_default();
-      let end_offset = node_offsets.1 - 1;
-      links_offsets.push((start_offset, end_offset));
-    }
-  });
+  let links_offsets = find_urls::<Image>(&src);
   let mut links = vec![];
   // Go through the collected links in reverse order
-  while let Some((start, end)) = links_offsets.pop() {
-    let content = src.get(start..end).unwrap_or_default();
-    // necessary for custom emojis which look like `![name](url "title")`
-    let (url, extra) = if content.contains(' ') {
-      let split = content.split_once(' ').expect("split is valid");
-      (split.0, Some(split.1))
-    } else {
-      (content, None)
-    };
+  for (start, end) in links_offsets.into_iter().rev() {
+    let (url, extra) = markdown_handle_title(&src, start, end);
     match Url::parse(url) {
       Ok(parsed) => {
         links.push(parsed.clone());
@@ -68,6 +39,61 @@ pub fn markdown_rewrite_image_links(mut src: String) -> (String, Vec<Url>) {
   (src, links)
 }
+pub fn markdown_handle_title(src: &String, start: usize, end: usize) -> (&str, Option<&str>) {
+  let content = src.get(start..end).unwrap_or_default();
+  // necessary for custom emojis which look like `![name](url "title")`
+  let (url, extra) = if content.contains(' ') {
+    let split = content.split_once(' ').expect("split is valid");
+    (split.0, Some(split.1))
+  } else {
+    (content, None)
+  };
+  (url, extra)
+}
+pub fn markdown_find_links(src: &str) -> Vec<(usize, usize)> {
+  find_urls::<Link>(src)
+}
+// Walk the syntax tree to find positions of image or link urls
+fn find_urls<T: NodeValue + UrlAndTitle>(src: &str) -> Vec<(usize, usize)> {
+  let ast = MARKDOWN_PARSER.parse(src);
+  let mut links_offsets = vec![];
+  ast.walk(|node, _depth| {
+    if let Some(image) = node.cast::<T>() {
+      let node_offsets = node.srcmap.expect("srcmap is none").get_byte_offsets();
+      let start_offset = node_offsets.1 - image.url_len() - 1 - image.title_len();
+      let end_offset = node_offsets.1 - 1;
+      links_offsets.push((start_offset, end_offset));
+    }
+  });
+  links_offsets
+}
+pub trait UrlAndTitle {
+  fn url_len(&self) -> usize;
+  fn title_len(&self) -> usize;
+}
+impl UrlAndTitle for Image {
+  fn url_len(&self) -> usize {
+    self.url.len()
+  }
+  fn title_len(&self) -> usize {
+    self.title.as_ref().map(|t| t.len() + 3).unwrap_or_default()
+  }
+}
+impl UrlAndTitle for Link {
+  fn url_len(&self) -> usize {
+    self.url.len()
+  }
+  fn title_len(&self) -> usize {
+    self.title.as_ref().map(|t| t.len() + 3).unwrap_or_default()
+  }
+}
 #[cfg(test)]
 #[expect(clippy::unwrap_used)]
 mod tests {
@@ -75,6 +101,15 @@ mod tests {
   use super::*;
   use pretty_assertions::assert_eq;
+  #[test]
+  fn test_find_links() {
+    let links = markdown_find_links("[test](https://example.com)");
+    assert_eq!(vec![(7, 26)], links);
+    let links = find_urls::<Image>("![test](https://example.com)");
+    assert_eq!(vec![(8, 27)], links);
+  }
   #[test]
   fn test_markdown_proxy_images() {
     let tests: Vec<_> =

@@ -3,7 +3,7 @@ use markdown_it::MarkdownIt;
 use regex::RegexSet;
 use std::sync::LazyLock;
-mod image_links;
+pub mod image_links;
 mod link_rule;
 mod spoiler_rule;
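With `image_links` now public, downstream crates import these helpers through the full module path, as the updated `lemmy_utils` import earlier in this commit shows; for example:

```rust
use lemmy_utils::utils::markdown::image_links::{markdown_find_links, markdown_rewrite_image_links};
```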