mirror of
https://git.anonymousland.org/anonymousland/synapse.git
synced 2025-05-03 01:04:49 -04:00
Handle additional errors when previewing URLs. (#9333)
* Handle the case of lxml not finding a document tree. * Parse the document encoding from the XML tag.
This commit is contained in:
parent
b0b2cac057
commit
0963d39ea6
3 changed files with 145 additions and 30 deletions
|
@ -58,7 +58,10 @@ if TYPE_CHECKING:
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
|
||||
_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
|
||||
_xml_encoding_match = re.compile(
|
||||
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
|
||||
)
|
||||
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
|
||||
|
||||
OG_TAG_NAME_MAXLEN = 50
|
||||
|
@ -300,24 +303,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
with open(media_info["filename"], "rb") as file:
|
||||
body = file.read()
|
||||
|
||||
encoding = None
|
||||
|
||||
# Let's try and figure out if it has an encoding set in a meta tag.
|
||||
# Limit it to the first 1kb, since it ought to be in the meta tags
|
||||
# at the top.
|
||||
match = _charset_match.search(body[:1000])
|
||||
|
||||
# If we find a match, it should take precedence over the
|
||||
# Content-Type header, so set it here.
|
||||
if match:
|
||||
encoding = match.group(1).decode("ascii")
|
||||
|
||||
# If we don't find a match, we'll look at the HTTP Content-Type, and
|
||||
# if that doesn't exist, we'll fall back to UTF-8.
|
||||
if not encoding:
|
||||
content_match = _content_type_match.match(media_info["media_type"])
|
||||
encoding = content_match.group(1) if content_match else "utf-8"
|
||||
|
||||
encoding = get_html_media_encoding(body, media_info["media_type"])
|
||||
og = decode_and_calc_og(body, media_info["uri"], encoding)
|
||||
|
||||
# pre-cache the image for posterity
|
||||
|
@ -689,6 +675,48 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
logger.debug("No media removed from url cache")
|
||||
|
||||
|
||||
def get_html_media_encoding(body: bytes, content_type: str) -> str:
    """
    Get the encoding of the body based on the (presumably) HTML body or media_type.

    The precedence used for finding a character encoding is:

    1. meta tag with a charset declared.
    2. The XML document's character encoding attribute.
    3. The Content-Type header.
    4. Fallback to UTF-8.

    The returned name is whatever the document or header declared; it is not
    checked here against the codecs that Python actually knows about.

    Args:
        body: The HTML document, as bytes.
        content_type: The Content-Type header.

    Returns:
        The character encoding of the body, as a string. Never empty.
    """
    # Limit searches to the first 1kb, since it ought to be at the top.
    body_start = body[:1024]

    # Let's try and figure out if it has an encoding set in a meta tag.
    match = _charset_match.search(body_start)
    if match:
        # The pattern only captures [a-z0-9-], so decoding as ASCII cannot fail.
        return match.group(1).decode("ascii")

    # TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>

    # If we didn't find a match, see if it is an XML document with an encoding.
    match = _xml_encoding_match.match(body_start)
    if match:
        return match.group(1).decode("ascii")

    # If we don't find a match, we'll look at the HTTP Content-Type.
    content_match = _content_type_match.match(content_type)
    if content_match:
        # The header pattern captures lazily and may capture nothing at all
        # (e.g. "text/html; charset="). An empty name is not a usable
        # encoding, so only return it if something was actually declared.
        charset = content_match.group(1).strip()
        if charset:
            return charset

    # Nothing usable was declared anywhere: fall back to UTF-8.
    return "utf-8"
|
||||
|
||||
|
||||
def decode_and_calc_og(
|
||||
body: bytes, media_uri: str, request_encoding: Optional[str] = None
|
||||
) -> Dict[str, Optional[str]]:
|
||||
|
@ -725,6 +753,11 @@ def decode_and_calc_og(
|
|||
def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
|
||||
# Attempt to parse the body. If this fails, log and return no metadata.
|
||||
tree = etree.fromstring(body_attempt, parser)
|
||||
|
||||
# The data was successfully parsed, but no tree was found.
|
||||
if tree is None:
|
||||
return {}
|
||||
|
||||
return _calc_og(tree, media_uri)
|
||||
|
||||
# Attempt to parse the body. If this fails, log and return no metadata.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue