mirror of
https://git.anonymousland.org/anonymousland/synapse.git
synced 2025-06-01 13:34:18 -04:00
Clean-up logic for rebasing URLs during URL preview. (#12219)
By using urljoin from the standard library and reducing the number of places URLs are rebased.
This commit is contained in:
parent
dda9b7fc4d
commit
4587b35929
4 changed files with 26 additions and 91 deletions
|
@ -16,7 +16,6 @@ import itertools
|
|||
import logging
|
||||
import re
|
||||
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
|
||||
from urllib import parse as urlparse
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from lxml import etree
|
||||
|
@ -144,9 +143,7 @@ def decode_body(
|
|||
return etree.fromstring(body, parser)
|
||||
|
||||
|
||||
def parse_html_to_open_graph(
|
||||
tree: "etree.Element", media_uri: str
|
||||
) -> Dict[str, Optional[str]]:
|
||||
def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
|
||||
"""
|
||||
Parse the HTML document into an Open Graph response.
|
||||
|
||||
|
@ -155,7 +152,6 @@ def parse_html_to_open_graph(
|
|||
|
||||
Args:
|
||||
tree: The parsed HTML document.
|
||||
media_url: The URI used to download the body.
|
||||
|
||||
Returns:
|
||||
The Open Graph response as a dictionary.
|
||||
|
@ -209,7 +205,7 @@ def parse_html_to_open_graph(
|
|||
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
|
||||
)
|
||||
if meta_image:
|
||||
og["og:image"] = rebase_url(meta_image[0], media_uri)
|
||||
og["og:image"] = meta_image[0]
|
||||
else:
|
||||
# TODO: consider inlined CSS styles as well as width & height attribs
|
||||
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
|
||||
|
@ -320,37 +316,6 @@ def _iterate_over_text(
|
|||
)
|
||||
|
||||
|
||||
def rebase_url(url: str, base: str) -> str:
|
||||
"""
|
||||
Resolves a potentially relative `url` against an absolute `base` URL.
|
||||
|
||||
For example:
|
||||
|
||||
>>> rebase_url("subpage", "https://example.com/foo/")
|
||||
'https://example.com/foo/subpage'
|
||||
>>> rebase_url("sibling", "https://example.com/foo")
|
||||
'https://example.com/sibling'
|
||||
>>> rebase_url("/bar", "https://example.com/foo/")
|
||||
'https://example.com/bar'
|
||||
>>> rebase_url("https://alice.com/a/", "https://example.com/foo/")
|
||||
'https://alice.com/a'
|
||||
"""
|
||||
base_parts = urlparse.urlparse(base)
|
||||
# Convert the parsed URL to a list for (potential) modification.
|
||||
url_parts = list(urlparse.urlparse(url))
|
||||
# Add a scheme, if one does not exist.
|
||||
if not url_parts[0]:
|
||||
url_parts[0] = base_parts.scheme or "http"
|
||||
# Fix up the hostname, if this is not a data URL.
|
||||
if url_parts[0] != "data" and not url_parts[1]:
|
||||
url_parts[1] = base_parts.netloc
|
||||
# If the path does not start with a /, nest it under the base path's last
|
||||
# directory.
|
||||
if not url_parts[2].startswith("/"):
|
||||
url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts.path) + url_parts[2]
|
||||
return urlparse.urlunparse(url_parts)
|
||||
|
||||
|
||||
def summarize_paragraphs(
|
||||
text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
|
||||
) -> Optional[str]:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue