Merge remote-tracking branch 'upstream/release-v1.27.0'

2025-12-17 11:13:55 -05:00 · 2021-02-02 16:06:33 +02:00 · 2021-02-02 16:06:33 +02:00 · 7f7fb9b566
commit 7f7fb9b566
parent 4c65f9a9f7 eec9ab3225
146 changed files with 4893 additions and 1484 deletions
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@ -375,7 +375,7 @@ class PreviewUrlResource(DirectServeJsonResource):
        """
        Check whether the URL should be downloaded as oEmbed content instead.

-        Params:
+        Args:
            url: The URL to check.

        Returns:
@ -392,7 +392,7 @@ class PreviewUrlResource(DirectServeJsonResource):
        """
        Request content from an oEmbed endpoint.

-        Params:
+        Args:
            endpoint: The oEmbed API endpoint.
            url: The URL to pass to the API.

@ -681,27 +681,51 @@ class PreviewUrlResource(DirectServeJsonResource):
 def decode_and_calc_og(
    body: bytes, media_uri: str, request_encoding: Optional[str] = None
 ) -> Dict[str, Optional[str]]:
+    """
+    Calculate metadata for an HTML document.
+
+    This uses lxml to parse the HTML document into the OG response. If errors
+    occur during processing of the document, an empty response is returned.
+
+    Args:
+        body: The HTML document, as bytes.
+        media_uri: The URI used to download the body.
+        request_encoding: The character encoding of the body, as a string.
+
+    Returns:
+        The OG response as a dictionary.
+    """
    # If there's no body, nothing useful is going to be found.
    if not body:
        return {}

    from lxml import etree

+    # Create an HTML parser. If this fails, log and return no metadata.
    try:
        parser = etree.HTMLParser(recover=True, encoding=request_encoding)
-        tree = etree.fromstring(body, parser)
-        og = _calc_og(tree, media_uri)
+    except LookupError:
+        # blindly consider the encoding as utf-8.
+        parser = etree.HTMLParser(recover=True, encoding="utf-8")
+    except Exception as e:
+        logger.warning("Unable to create HTML parser: %s" % (e,))
+        return {}
+
+    def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
+        # Parse the body with the parser above and calculate the OG metadata.
+        tree = etree.fromstring(body_attempt, parser)
+        return _calc_og(tree, media_uri)
+
+    # Attempt to parse the body as-is. If decoding fails, retry it as UTF-8.
+    try:
+        return _attempt_calc_og(body)
    except UnicodeDecodeError:
        # blindly try decoding the body as utf-8, which seems to fix
        # the charset mismatches on https://google.com
-        parser = etree.HTMLParser(recover=True, encoding=request_encoding)
-        tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
-        og = _calc_og(tree, media_uri)
-
-    return og
+        return _attempt_calc_og(body.decode("utf-8", "ignore"))


-def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]:
+def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
    # suck our tree into lxml and define our OG response.

    # if we see any image URLs in the OG response, then spider them