mirror of
https://git.anonymousland.org/anonymousland/synapse.git
synced 2025-12-17 11:13:55 -05:00
Merge remote-tracking branch 'upstream/release-v1.27.0'
This commit is contained in:
commit
7f7fb9b566
146 changed files with 4893 additions and 1484 deletions
|
|
@ -375,7 +375,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
"""
|
||||
Check whether the URL should be downloaded as oEmbed content instead.
|
||||
|
||||
Params:
|
||||
Args:
|
||||
url: The URL to check.
|
||||
|
||||
Returns:
|
||||
|
|
@ -392,7 +392,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
"""
|
||||
Request content from an oEmbed endpoint.
|
||||
|
||||
Params:
|
||||
Args:
|
||||
endpoint: The oEmbed API endpoint.
|
||||
url: The URL to pass to the API.
|
||||
|
||||
|
|
@ -681,27 +681,51 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
def decode_and_calc_og(
|
||||
body: bytes, media_uri: str, request_encoding: Optional[str] = None
|
||||
) -> Dict[str, Optional[str]]:
|
||||
"""
|
||||
Calculate metadata for an HTML document.
|
||||
|
||||
This uses lxml to parse the HTML document into the OG response. If errors
|
||||
occur during processing of the document, an empty response is returned.
|
||||
|
||||
Args:
|
||||
body: The HTML document, as bytes.
|
||||
media_url: The URI used to download the body.
|
||||
request_encoding: The character encoding of the body, as a string.
|
||||
|
||||
Returns:
|
||||
The OG response as a dictionary.
|
||||
"""
|
||||
# If there's no body, nothing useful is going to be found.
|
||||
if not body:
|
||||
return {}
|
||||
|
||||
from lxml import etree
|
||||
|
||||
# Create an HTML parser. If this fails, log and return no metadata.
|
||||
try:
|
||||
parser = etree.HTMLParser(recover=True, encoding=request_encoding)
|
||||
tree = etree.fromstring(body, parser)
|
||||
og = _calc_og(tree, media_uri)
|
||||
except LookupError:
|
||||
# blindly consider the encoding as utf-8.
|
||||
parser = etree.HTMLParser(recover=True, encoding="utf-8")
|
||||
except Exception as e:
|
||||
logger.warning("Unable to create HTML parser: %s" % (e,))
|
||||
return {}
|
||||
|
||||
def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
|
||||
# Attempt to parse the body. If this fails, log and return no metadata.
|
||||
tree = etree.fromstring(body_attempt, parser)
|
||||
return _calc_og(tree, media_uri)
|
||||
|
||||
# Attempt to parse the body. If this fails, log and return no metadata.
|
||||
try:
|
||||
return _attempt_calc_og(body)
|
||||
except UnicodeDecodeError:
|
||||
# blindly try decoding the body as utf-8, which seems to fix
|
||||
# the charset mismatches on https://google.com
|
||||
parser = etree.HTMLParser(recover=True, encoding=request_encoding)
|
||||
tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
|
||||
og = _calc_og(tree, media_uri)
|
||||
|
||||
return og
|
||||
return _attempt_calc_og(body.decode("utf-8", "ignore"))
|
||||
|
||||
|
||||
def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]:
|
||||
def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
|
||||
# suck our tree into lxml and define our OG response.
|
||||
|
||||
# if we see any image URLs in the OG response, then spider them
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue