Merge remote-tracking branch 'upstream/release-v1.27.0'

This commit is contained in:
Tulir Asokan 2021-02-02 16:06:33 +02:00
commit 7f7fb9b566
146 changed files with 4893 additions and 1484 deletions

View file

@ -375,7 +375,7 @@ class PreviewUrlResource(DirectServeJsonResource):
"""
Check whether the URL should be downloaded as oEmbed content instead.
Params:
Args:
url: The URL to check.
Returns:
@ -392,7 +392,7 @@ class PreviewUrlResource(DirectServeJsonResource):
"""
Request content from an oEmbed endpoint.
Params:
Args:
endpoint: The oEmbed API endpoint.
url: The URL to pass to the API.
@ -681,27 +681,51 @@ class PreviewUrlResource(DirectServeJsonResource):
def decode_and_calc_og(
body: bytes, media_uri: str, request_encoding: Optional[str] = None
) -> Dict[str, Optional[str]]:
"""
Calculate metadata for an HTML document.
This uses lxml to parse the HTML document into the OG response. If errors
occur during processing of the document, an empty response is returned.
Args:
body: The HTML document, as bytes.
media_url: The URI used to download the body.
request_encoding: The character encoding of the body, as a string.
Returns:
The OG response as a dictionary.
"""
# If there's no body, nothing useful is going to be found.
if not body:
return {}
from lxml import etree
# Create an HTML parser. If this fails, log and return no metadata.
try:
parser = etree.HTMLParser(recover=True, encoding=request_encoding)
tree = etree.fromstring(body, parser)
og = _calc_og(tree, media_uri)
except LookupError:
# blindly consider the encoding as utf-8.
parser = etree.HTMLParser(recover=True, encoding="utf-8")
except Exception as e:
logger.warning("Unable to create HTML parser: %s" % (e,))
return {}
def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
# Attempt to parse the body. If this fails, log and return no metadata.
tree = etree.fromstring(body_attempt, parser)
return _calc_og(tree, media_uri)
# Attempt to parse the body. If this fails, log and return no metadata.
try:
return _attempt_calc_og(body)
except UnicodeDecodeError:
# blindly try decoding the body as utf-8, which seems to fix
# the charset mismatches on https://google.com
parser = etree.HTMLParser(recover=True, encoding=request_encoding)
tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
og = _calc_og(tree, media_uri)
return og
return _attempt_calc_og(body.decode("utf-8", "ignore"))
def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]:
def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
# suck our tree into lxml and define our OG response.
# if we see any image URLs in the OG response, then spider them