Clean-up logic for rebasing URLs during URL preview. (#12219)

By using urljoin from the standard library and reducing the number of places URLs are rebased.
2025-11-28 10:50:31 -05:00 · 2022-03-16 07:21:36 -04:00 · 2022-03-16 07:21:36 -04:00 · 4587b35929
commit 4587b35929
parent dda9b7fc4d
4 changed files with 26 additions and 91 deletions
--- a/synapse/rest/media/v1/preview_html.py
+++ b/synapse/rest/media/v1/preview_html.py
@ -16,7 +16,6 @@ import itertools
 import logging
 import re
 from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
-from urllib import parse as urlparse

 if TYPE_CHECKING:
    from lxml import etree
@ -144,9 +143,7 @@ def decode_body(
    return etree.fromstring(body, parser)


-def parse_html_to_open_graph(
-    tree: "etree.Element", media_uri: str
-) -> Dict[str, Optional[str]]:
+def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
    """
    Parse the HTML document into an Open Graph response.

@ -155,7 +152,6 @@ def parse_html_to_open_graph(

    Args:
        tree: The parsed HTML document.
-        media_url: The URI used to download the body.

    Returns:
        The Open Graph response as a dictionary.
@ -209,7 +205,7 @@ def parse_html_to_open_graph(
            "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
        )
        if meta_image:
-            og["og:image"] = rebase_url(meta_image[0], media_uri)
+            og["og:image"] = meta_image[0]
        else:
            # TODO: consider inlined CSS styles as well as width & height attribs
            images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
@ -320,37 +316,6 @@ def _iterate_over_text(
            )


-def rebase_url(url: str, base: str) -> str:
-    """
-    Resolves a potentially relative `url` against an absolute `base` URL.
-
-    For example:
-
-        >>> rebase_url("subpage", "https://example.com/foo/")
-        'https://example.com/foo/subpage'
-        >>> rebase_url("sibling", "https://example.com/foo")
-        'https://example.com/sibling'
-        >>> rebase_url("/bar", "https://example.com/foo/")
-        'https://example.com/bar'
-        >>> rebase_url("https://alice.com/a/", "https://example.com/foo/")
-        'https://alice.com/a'
-    """
-    base_parts = urlparse.urlparse(base)
-    # Convert the parsed URL to a list for (potential) modification.
-    url_parts = list(urlparse.urlparse(url))
-    # Add a scheme, if one does not exist.
-    if not url_parts[0]:
-        url_parts[0] = base_parts.scheme or "http"
-    # Fix up the hostname, if this is not a data URL.
-    if url_parts[0] != "data" and not url_parts[1]:
-        url_parts[1] = base_parts.netloc
-        # If the path does not start with a /, nest it under the base path's last
-        # directory.
-        if not url_parts[2].startswith("/"):
-            url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts.path) + url_parts[2]
-    return urlparse.urlunparse(url_parts)
-
-
 def summarize_paragraphs(
    text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
 ) -> Optional[str]: