Refactor oEmbed previews (#10814)

The major change is moving the decision of whether to use oEmbed
further up the call-stack. This reverts the _download_url method to
being a "dumb" function which takes a single URL and downloads it
(as it was before #7920).

This also makes some other, more minor refactorings:

* Renames internal variables for clarity.
* Factors out shared code between the HTML and rich oEmbed
  previews.
* Fixes tests to preview an oEmbed image.
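
In rough terms, the call-site logic now looks like this (a sketch paraphrasing the diff below, not verbatim code):

```python
# The caller decides about oEmbed up front; _download_url just downloads.
oembed_url = self._oembed.get_oembed_url(url)        # None if no endpoint matches
url_to_download = oembed_url if oembed_url else url
media_info = await self._download_url(url_to_download, user)
```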
Patrick Cloke, 2021-09-21 12:09:57 -04:00 (committed by GitHub)
parent 2843058a8b
commit ba7a91aea5
5 changed files with 299 additions and 220 deletions

changelog.d/10814.feature (new file)

@@ -0,0 +1 @@
+Improve oEmbed previews by processing the author name, photo, and video information.

docs/development/url_previews.md

@@ -25,16 +25,14 @@ When Synapse is asked to preview a URL it does the following:
 3. Kicks off a background process to generate a preview:
    1. Checks the database cache by URL and timestamp and returns the result if it
       has not expired and was successful (a 2xx return code).
-   2. Checks if the URL matches an oEmbed pattern. If it does, fetch the oEmbed
-      response. If this is an image, replace the URL to fetch and continue. If
-      it is HTML content, use the HTML as the document and continue.
-   3. If it doesn't match an oEmbed pattern, downloads the URL and stores it
-      into a file via the media storage provider and saves the local media
-      metadata.
-   5. If the media is an image:
+   2. Checks if the URL matches an [oEmbed](https://oembed.com/) pattern. If it
+      does, update the URL to download.
+   3. Downloads the URL and stores it into a file via the media storage provider
+      and saves the local media metadata.
+   4. If the media is an image:
       1. Generates thumbnails.
       2. Generates an Open Graph response based on image properties.
-   6. If the media is HTML:
+   5. If the media is HTML:
       1. Decodes the HTML via the stored file.
       2. Generates an Open Graph response from the HTML.
       3. If an image exists in the Open Graph response:
@@ -42,6 +40,13 @@ When Synapse is asked to preview a URL it does the following:
          provider and saves the local media metadata.
       2. Generates thumbnails.
       3. Updates the Open Graph response based on image properties.
+   6. If the media is JSON and an oEmbed URL was found:
+      1. Convert the oEmbed response to an Open Graph response.
+      2. If a thumbnail or image is in the oEmbed response:
+         1. Downloads the URL and stores it into a file via the media storage
+            provider and saves the local media metadata.
+         2. Generates thumbnails.
+         3. Updates the Open Graph response based on image properties.
    7. Stores the result in the database cache.
 4. Returns the result.
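
For illustration, here is a hypothetical oEmbed "photo" response and the Open Graph data the flow above would derive from it (all values invented; field names per https://oembed.com/):

```python
# Hypothetical oEmbed response for step 6 above (invented values).
oembed = {
    "version": "1.0",
    "type": "photo",
    "title": "A cat",
    "url": "https://example.com/cat-full.jpg",             # the full-size photo
    "thumbnail_url": "https://example.com/cat-thumb.jpg",  # superseded for photos
    "cache_age": 86400,
}

# Resulting Open Graph response: for "photo" the full image wins over the
# thumbnail; the image is then downloaded, thumbnailed, and its dimensions added.
open_graph = {
    "og:title": "A cat",
    "og:image": "https://example.com/cat-full.jpg",
}
```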

synapse/rest/media/v1/oembed.py

@@ -12,11 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import urllib.parse
 from typing import TYPE_CHECKING, Optional
 
 import attr
 
 from synapse.http.client import SimpleHttpClient
+from synapse.types import JsonDict
+from synapse.util import json_decoder
 
 if TYPE_CHECKING:
     from synapse.server import HomeServer

@@ -24,18 +27,15 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
 
-@attr.s(slots=True, auto_attribs=True)
+@attr.s(slots=True, frozen=True, auto_attribs=True)
 class OEmbedResult:
-    # Either HTML content or URL must be provided.
-    html: Optional[str]
-    url: Optional[str]
-    title: Optional[str]
-    # Number of seconds to cache the content.
-    cache_age: int
-
-
-class OEmbedError(Exception):
-    """An error occurred processing the oEmbed object."""
+    # The Open Graph result (converted from the oEmbed result).
+    open_graph_result: JsonDict
+    # Number of seconds to cache the content, according to the oEmbed response.
+    #
+    # This will be None if no cache-age is provided in the oEmbed response (or
+    # if the oEmbed response cannot be turned into an Open Graph response).
+    cache_age: Optional[int]
 
 
 class OEmbedProvider:

@@ -81,75 +81,106 @@ class OEmbedProvider:
         """
         for url_pattern, endpoint in self._oembed_patterns.items():
             if url_pattern.fullmatch(url):
-                return endpoint
+                # TODO Specify max height / width.
+
+                # Note that only the JSON format is supported, some endpoints want
+                # this in the URL, others want it as an argument.
+                endpoint = endpoint.replace("{format}", "json")
+
+                args = {"url": url, "format": "json"}
+                query_str = urllib.parse.urlencode(args, True)
+                return f"{endpoint}?{query_str}"
 
         # No match.
         return None
 
-    async def get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult:
+    def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
         """
-        Request content from an oEmbed endpoint.
+        Parse the oEmbed response into an Open Graph response.
 
         Args:
-            endpoint: The oEmbed API endpoint.
-            url: The URL to pass to the API.
+            url: The URL which is being previewed (not the one which was
+                requested).
+            raw_body: The oEmbed response as JSON encoded as bytes.
 
         Returns:
-            An object representing the metadata returned.
-
-        Raises:
-            OEmbedError if fetching or parsing of the oEmbed information fails.
+            The Open Graph response and its cache age, wrapped in an OEmbedResult.
         """
         try:
-            logger.debug("Trying to get oEmbed content for url '%s'", url)
-
-            # Note that only the JSON format is supported, some endpoints want
-            # this in the URL, others want it as an argument.
-            endpoint = endpoint.replace("{format}", "json")
-
-            result = await self._client.get_json(
-                endpoint,
-                # TODO Specify max height / width.
-                args={"url": url, "format": "json"},
-            )
+            # oEmbed responses *must* be UTF-8 according to the spec.
+            oembed = json_decoder.decode(raw_body.decode("utf-8"))
 
             # Ensure there's a version of 1.0.
-            if result.get("version") != "1.0":
-                raise OEmbedError("Invalid version: %s" % (result.get("version"),))
-
-            oembed_type = result.get("type")
+            oembed_version = oembed["version"]
+            if oembed_version != "1.0":
+                raise RuntimeError(f"Invalid version: {oembed_version}")
 
             # Ensure the cache age is None or an int.
-            cache_age = result.get("cache_age")
+            cache_age = oembed.get("cache_age")
             if cache_age:
                 cache_age = int(cache_age)
 
-            oembed_result = OEmbedResult(None, None, result.get("title"), cache_age)
+            # The results.
+            open_graph_response = {"og:title": oembed.get("title")}
 
-            # HTML content.
+            # If a thumbnail exists, use it. Note that dimensions will be calculated later.
+            if "thumbnail_url" in oembed:
+                open_graph_response["og:image"] = oembed["thumbnail_url"]
+
+            # Process each type separately.
+            oembed_type = oembed["type"]
             if oembed_type == "rich":
-                oembed_result.html = result.get("html")
-                return oembed_result
+                calc_description_and_urls(open_graph_response, oembed["html"])
 
-            if oembed_type == "photo":
-                oembed_result.url = result.get("url")
-                return oembed_result
+            elif oembed_type == "photo":
+                # If this is a photo, use the full image, not the thumbnail.
+                open_graph_response["og:image"] = oembed["url"]
 
-            # TODO Handle link and video types.
+            else:
+                raise RuntimeError(f"Unknown oEmbed type: {oembed_type}")
 
-            if "thumbnail_url" in result:
-                oembed_result.url = result.get("thumbnail_url")
-                return oembed_result
-
-            raise OEmbedError("Incompatible oEmbed information.")
-
-        except OEmbedError as e:
-            # Trap OEmbedErrors first so we can directly re-raise them.
-            logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
-            raise
-
         except Exception as e:
             # Trap any exception and let the code follow as usual.
-            # FIXME: pass through 404s and other error messages nicely
-            logger.warning("Error downloading oEmbed metadata from %s: %r", url, e)
-            raise OEmbedError() from e
+            logger.warning(f"Error parsing oEmbed metadata from {url}: {e!r}")
+            open_graph_response = {}
+            cache_age = None
+
+        return OEmbedResult(open_graph_response, cache_age)
+
+
+def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
+    """
+    Calculate description for an HTML document.
+
+    This uses lxml to convert the HTML document into plaintext. If errors
+    occur during processing of the document, an empty response is returned.
+
+    Args:
+        open_graph_response: The current Open Graph summary. This is updated
+            with additional fields.
+        html_body: The HTML document, as a string.
+    """
+    # If there's no body, nothing useful is going to be found.
+    if not html_body:
+        return
+
+    from lxml import etree
+
+    # Create an HTML parser. If this fails, log and return no metadata.
+    parser = etree.HTMLParser(recover=True, encoding="utf-8")
+
+    # Attempt to parse the body. If this fails, log and return no metadata.
+    tree = etree.fromstring(html_body, parser)
+
+    # The data was successfully parsed, but no tree was found.
+    if tree is None:
+        return
+
+    from synapse.rest.media.v1.preview_url_resource import _calc_description
+
+    description = _calc_description(tree)
+    if description:
+        open_graph_response["og:description"] = description
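
Taken together, the provider now exposes a two-step API. A usage sketch (URLs and response bytes invented; constructor arguments as elsewhere in this module):

```python
provider = OEmbedProvider(hs, client)

# Step 1: map the page URL to an oEmbed request URL (None if no pattern matches),
# e.g. something like "https://publish.twitter.com/oembed?url=...&format=json".
request_url = provider.get_oembed_url("https://twitter.com/matrixdotorg/status/12345")

# Step 2: the caller downloads request_url itself, then hands the bytes back.
if request_url is not None:
    # In reality these bytes are fetched from request_url; inlined for the sketch.
    raw_body = b'{"version": "1.0", "type": "photo", "title": "A cat", "url": "https://example.com/cat.jpg"}'
    result = provider.parse_oembed_response(
        "https://twitter.com/matrixdotorg/status/12345", raw_body
    )
    og = result.open_graph_result  # {} if parsing failed
    cache_age = result.cache_age   # None unless the response supplied one
```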

synapse/rest/media/v1/preview_url_resource.py

@@ -44,7 +44,7 @@ from synapse.logging.context import make_deferred_yieldable, run_in_background
 from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.rest.media.v1._base import get_filename_from_headers
 from synapse.rest.media.v1.media_storage import MediaStorage
-from synapse.rest.media.v1.oembed import OEmbedError, OEmbedProvider
+from synapse.rest.media.v1.oembed import OEmbedProvider
 from synapse.types import JsonDict
 from synapse.util import json_encoder
 from synapse.util.async_helpers import ObservableDeferred
@@ -73,6 +73,7 @@ OG_TAG_NAME_MAXLEN = 50
 OG_TAG_VALUE_MAXLEN = 1000
 
 ONE_HOUR = 60 * 60 * 1000
+ONE_DAY = 24 * ONE_HOUR
 
 
 @attr.s(slots=True, frozen=True, auto_attribs=True)
@@ -255,10 +256,19 @@ class PreviewUrlResource(DirectServeJsonResource):
                 og = og.encode("utf8")
             return og
 
-        media_info = await self._download_url(url, user)
+        # If this URL can be accessed via oEmbed, use that instead.
+        url_to_download = url
+        oembed_url = self._oembed.get_oembed_url(url)
+        if oembed_url:
+            url_to_download = oembed_url
+
+        media_info = await self._download_url(url_to_download, user)
 
         logger.debug("got media_info of '%s'", media_info)
 
+        # The number of milliseconds that the response should be considered valid.
+        expiration_ms = media_info.expires
+
         if _is_media(media_info.media_type):
             file_id = media_info.filesystem_id
             dims = await self.media_repo._generate_thumbnails(
@@ -288,34 +298,22 @@ class PreviewUrlResource(DirectServeJsonResource):
             encoding = get_html_media_encoding(body, media_info.media_type)
             og = decode_and_calc_og(body, media_info.uri, encoding)
 
-            # pre-cache the image for posterity
-            # FIXME: it might be cleaner to use the same flow as the main /preview_url
-            # request itself and benefit from the same caching etc. But for now we
-            # just rely on the caching on the master request to speed things up.
-            if "og:image" in og and og["og:image"]:
-                image_info = await self._download_url(
-                    _rebase_url(og["og:image"], media_info.uri), user
-                )
-
-                if _is_media(image_info.media_type):
-                    # TODO: make sure we don't choke on white-on-transparent images
-                    file_id = image_info.filesystem_id
-                    dims = await self.media_repo._generate_thumbnails(
-                        None, file_id, file_id, image_info.media_type, url_cache=True
-                    )
-                    if dims:
-                        og["og:image:width"] = dims["width"]
-                        og["og:image:height"] = dims["height"]
-                    else:
-                        logger.warning("Couldn't get dims for %s", og["og:image"])
-
-                    og[
-                        "og:image"
-                    ] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
-                    og["og:image:type"] = image_info.media_type
-                    og["matrix:image:size"] = image_info.media_length
-                else:
-                    del og["og:image"]
+            await self._precache_image_url(user, media_info, og)
+
+        elif oembed_url and _is_json(media_info.media_type):
+            # Handle an oEmbed response.
+            with open(media_info.filename, "rb") as file:
+                body = file.read()
+
+            oembed_response = self._oembed.parse_oembed_response(media_info.uri, body)
+            og = oembed_response.open_graph_result
+
+            # Use the cache age from the oEmbed result, instead of the HTTP response.
+            if oembed_response.cache_age is not None:
+                expiration_ms = oembed_response.cache_age
+
+            await self._precache_image_url(user, media_info, og)
+
         else:
             logger.warning("Failed to find any OG data in %s", url)
             og = {}
@@ -336,12 +334,15 @@ class PreviewUrlResource(DirectServeJsonResource):
 
         jsonog = json_encoder.encode(og)
 
+        # Cap the amount of time to consider a response valid.
+        expiration_ms = min(expiration_ms, ONE_DAY)
+
         # store OG in history-aware DB cache
         await self.store.store_url_cache(
             url,
             media_info.response_code,
             media_info.etag,
-            media_info.expires + media_info.created_ts_ms,
+            media_info.created_ts_ms + expiration_ms,
             jsonog,
             media_info.filesystem_id,
             media_info.created_ts_ms,
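
A worked example of the new cap (all values in milliseconds):

```python
ONE_HOUR = 60 * 60 * 1000  # 3,600,000 ms
ONE_DAY = 24 * ONE_HOUR    # 86,400,000 ms

# An upstream response claiming to be valid for a week is clamped to one day:
expiration_ms = 7 * ONE_DAY
expiration_ms = min(expiration_ms, ONE_DAY)
assert expiration_ms == ONE_DAY
```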
@@ -358,88 +359,52 @@ class PreviewUrlResource(DirectServeJsonResource):
         file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)
 
-        # If this URL can be accessed via oEmbed, use that instead.
-        url_to_download: Optional[str] = url
-        oembed_url = self._oembed.get_oembed_url(url)
-        if oembed_url:
-            # The result might be a new URL to download, or it might be HTML content.
-            try:
-                oembed_result = await self._oembed.get_oembed_content(oembed_url, url)
-                if oembed_result.url:
-                    url_to_download = oembed_result.url
-                elif oembed_result.html:
-                    url_to_download = None
-            except OEmbedError:
-                # If an error occurs, try doing a normal preview.
-                pass
-
-        if url_to_download:
-            with self.media_storage.store_into_file(file_info) as (f, fname, finish):
-                try:
-                    logger.debug("Trying to get preview for url '%s'", url_to_download)
-                    length, headers, uri, code = await self.client.get_file(
-                        url_to_download,
-                        output_stream=f,
-                        max_size=self.max_spider_size,
-                        headers={"Accept-Language": self.url_preview_accept_language},
-                    )
-                except SynapseError:
-                    # Pass SynapseErrors through directly, so that the servlet
-                    # handler will return a SynapseError to the client instead of
-                    # blank data or a 500.
-                    raise
-                except DNSLookupError:
-                    # DNS lookup returned no results
-                    # Note: This will also be the case if one of the resolved IP
-                    # addresses is blacklisted
-                    raise SynapseError(
-                        502,
-                        "DNS resolution failure during URL preview generation",
-                        Codes.UNKNOWN,
-                    )
-                except Exception as e:
-                    # FIXME: pass through 404s and other error messages nicely
-                    logger.warning("Error downloading %s: %r", url_to_download, e)
-
-                    raise SynapseError(
-                        500,
-                        "Failed to download content: %s"
-                        % (traceback.format_exception_only(sys.exc_info()[0], e),),
-                        Codes.UNKNOWN,
-                    )
-                await finish()
-
-                if b"Content-Type" in headers:
-                    media_type = headers[b"Content-Type"][0].decode("ascii")
-                else:
-                    media_type = "application/octet-stream"
-
-                download_name = get_filename_from_headers(headers)
-
-                # FIXME: we should calculate a proper expiration based on the
-                # Cache-Control and Expire headers. But for now, assume 1 hour.
-                expires = ONE_HOUR
-                etag = (
-                    headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
-                )
-        else:
-            # we can only get here if we did an oembed request and have an oembed_result.html
-            assert oembed_result.html is not None
-            assert oembed_url is not None
-
-            html_bytes = oembed_result.html.encode("utf-8")
-            with self.media_storage.store_into_file(file_info) as (f, fname, finish):
-                f.write(html_bytes)
-                await finish()
-
-            media_type = "text/html"
-            download_name = oembed_result.title
-            length = len(html_bytes)
-            # If a specific cache age was not given, assume 1 hour.
-            expires = oembed_result.cache_age or ONE_HOUR
-            uri = oembed_url
-            code = 200
-            etag = None
+        with self.media_storage.store_into_file(file_info) as (f, fname, finish):
+            try:
+                logger.debug("Trying to get preview for url '%s'", url)
+                length, headers, uri, code = await self.client.get_file(
+                    url,
+                    output_stream=f,
+                    max_size=self.max_spider_size,
+                    headers={"Accept-Language": self.url_preview_accept_language},
+                )
+            except SynapseError:
+                # Pass SynapseErrors through directly, so that the servlet
+                # handler will return a SynapseError to the client instead of
+                # blank data or a 500.
+                raise
+            except DNSLookupError:
+                # DNS lookup returned no results
+                # Note: This will also be the case if one of the resolved IP
+                # addresses is blacklisted
+                raise SynapseError(
+                    502,
+                    "DNS resolution failure during URL preview generation",
+                    Codes.UNKNOWN,
+                )
+            except Exception as e:
+                # FIXME: pass through 404s and other error messages nicely
+                logger.warning("Error downloading %s: %r", url, e)
+
+                raise SynapseError(
+                    500,
+                    "Failed to download content: %s"
+                    % (traceback.format_exception_only(sys.exc_info()[0], e),),
+                    Codes.UNKNOWN,
+                )
+            await finish()
+
+            if b"Content-Type" in headers:
+                media_type = headers[b"Content-Type"][0].decode("ascii")
+            else:
+                media_type = "application/octet-stream"
+
+            download_name = get_filename_from_headers(headers)
+
+            # FIXME: we should calculate a proper expiration based on the
+            # Cache-Control and Expire headers. But for now, assume 1 hour.
+            expires = ONE_HOUR
+            etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
 
         try:
             time_now_ms = self.clock.time_msec()
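
The error surface of the reverted _download_url is unchanged; a sketch of what a caller observes (placeholder URL and user):

```python
try:
    media_info = await self._download_url("http://unresolvable.example/", user)
except SynapseError as e:
    # DNS failures map to 502, other download errors are wrapped in a 500,
    # and pre-existing SynapseErrors pass through untouched.
    logger.info("Preview failed with HTTP status %d", e.code)
```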
@@ -474,6 +439,46 @@ class PreviewUrlResource(DirectServeJsonResource):
             etag=etag,
         )
 
+    async def _precache_image_url(
+        self, user: str, media_info: MediaInfo, og: JsonDict
+    ) -> None:
+        """
+        Pre-cache the image (if one exists) for posterity
+
+        Args:
+            user: The user requesting the preview.
+            media_info: The media being previewed.
+            og: The Open Graph dictionary. This is modified with image information.
+        """
+        # If there's no image or it is blank, there's nothing to do.
+        if "og:image" not in og or not og["og:image"]:
+            return
+
+        # FIXME: it might be cleaner to use the same flow as the main /preview_url
+        # request itself and benefit from the same caching etc. But for now we
+        # just rely on the caching on the master request to speed things up.
+        image_info = await self._download_url(
+            _rebase_url(og["og:image"], media_info.uri), user
+        )
+
+        if _is_media(image_info.media_type):
+            # TODO: make sure we don't choke on white-on-transparent images
+            file_id = image_info.filesystem_id
+            dims = await self.media_repo._generate_thumbnails(
+                None, file_id, file_id, image_info.media_type, url_cache=True
+            )
+            if dims:
+                og["og:image:width"] = dims["width"]
+                og["og:image:height"] = dims["height"]
+            else:
+                logger.warning("Couldn't get dims for %s", og["og:image"])
+
+            og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
+            og["og:image:type"] = image_info.media_type
+            og["matrix:image:size"] = image_info.media_length
+        else:
+            del og["og:image"]
+
     def _start_expire_url_cache_data(self) -> Deferred:
         return run_as_background_process(
             "expire_url_cache_data", self._expire_url_cache_data
@@ -527,7 +532,7 @@ class PreviewUrlResource(DirectServeJsonResource):
             # These may be cached for a bit on the client (i.e., they
             # may have a room open with a preview url thing open).
             # So we wait a couple of days before deleting, just in case.
-            expire_before = now - 2 * 24 * ONE_HOUR
+            expire_before = now - 2 * ONE_DAY
             media_ids = await self.store.get_url_cache_media_before(expire_before)
 
             removed_media = []
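
Note that the og:image value handled by _precache_image_url may be relative to the page URL; _rebase_url (defined elsewhere in this file) resolves it first. Roughly, using the standard library as an approximation of that helper:

```python
from urllib.parse import urljoin

page_url = "https://example.com/articles/cats.html"  # invented example
og_image = "/images/cat.jpg"                          # relative og:image value
absolute = urljoin(page_url, og_image)                # "https://example.com/images/cat.jpg"

# After the download succeeds, the OG response is rewritten to point at the
# locally cached copy, e.g. og["og:image"] = f"mxc://{server_name}/{file_id}".
```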
@@ -669,7 +674,18 @@ def decode_and_calc_og(
 
 
 def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
-    # suck our tree into lxml and define our OG response.
+    """
+    Calculate metadata for an HTML document.
+
+    This uses lxml to search the HTML document for Open Graph data.
+
+    Args:
+        tree: The parsed HTML document.
+        media_uri: The URI used to download the body.
+
+    Returns:
+        The Open Graph response as a dictionary.
+    """
 
     # if we see any image URLs in the OG response, then spider them
     # (although the client could choose to do this by asking for previews of those
@@ -743,35 +759,7 @@ def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
         if meta_description:
             og["og:description"] = meta_description[0]
         else:
-            # grab any text nodes which are inside the <body/> tag...
-            # unless they are within an HTML5 semantic markup tag...
-            # <header/>, <nav/>, <aside/>, <footer/>
-            # ...or if they are within a <script/> or <style/> tag.
-            # This is a very very very coarse approximation to a plain text
-            # render of the page.
-            # We don't just use XPATH here as that is slow on some machines.
-
-            from lxml import etree
-
-            TAGS_TO_REMOVE = (
-                "header",
-                "nav",
-                "aside",
-                "footer",
-                "script",
-                "noscript",
-                "style",
-                etree.Comment,
-            )
-
-            # Split all the text nodes into paragraphs (by splitting on new
-            # lines)
-            text_nodes = (
-                re.sub(r"\s+", "\n", el).strip()
-                for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
-            )
-            og["og:description"] = summarize_paragraphs(text_nodes)
+            og["og:description"] = _calc_description(tree)
     elif og["og:description"]:
         # This must be a non-empty string at this point.
         assert isinstance(og["og:description"], str)
@@ -782,6 +770,46 @@ def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
 
     return og
 
 
+def _calc_description(tree: "etree.Element") -> Optional[str]:
+    """
+    Calculate a text description based on an HTML document.
+
+    Grabs any text nodes which are inside the <body/> tag, unless they are within
+    an HTML5 semantic markup tag (<header/>, <nav/>, <aside/>, <footer/>), or
+    if they are within a <script/> or <style/> tag.
+
+    This is a very very very coarse approximation to a plain text render of the page.
+
+    Args:
+        tree: The parsed HTML document.
+
+    Returns:
+        The plain text description, or None if one cannot be generated.
+    """
+    # We don't just use XPATH here as that is slow on some machines.
+
+    from lxml import etree
+
+    TAGS_TO_REMOVE = (
+        "header",
+        "nav",
+        "aside",
+        "footer",
+        "script",
+        "noscript",
+        "style",
+        etree.Comment,
+    )
+
+    # Split all the text nodes into paragraphs (by splitting on new
+    # lines)
+    text_nodes = (
+        re.sub(r"\s+", "\n", el).strip()
+        for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
+    )
+    return summarize_paragraphs(text_nodes)
+
+
 def _iterate_over_text(
     tree: "etree.Element", *tags_to_ignore: Iterable[Union[str, "etree.Comment"]]
 ) -> Generator[str, None, None]:
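
A rough standalone approximation of what _calc_description does (the real code uses this module's _iterate_over_text and summarize_paragraphs helpers; this sketch leans on lxml directly):

```python
from lxml import etree

html = b"<html><body><p>Hello   world.</p><script>ignored()</script></body></html>"
tree = etree.fromstring(html, etree.HTMLParser(recover=True, encoding="utf-8"))
body = tree.find("body")

# Drop non-content tags, then flatten and re-normalize the remaining text nodes.
etree.strip_elements(body, "script", "style", etree.Comment, with_tail=False)
description = " ".join(" ".join(body.itertext()).split())  # -> "Hello world."
```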
@@ -841,11 +869,25 @@ def _is_html(content_type: str) -> bool:
     )
 
 
+def _is_json(content_type: str) -> bool:
+    return content_type.lower().startswith("application/json")
+
+
 def summarize_paragraphs(
     text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
 ) -> Optional[str]:
-    # Try to get a summary of between 200 and 500 words, respecting
-    # first paragraph and then word boundaries.
+    """
+    Try to get a summary respecting first paragraph and then word boundaries.
+
+    Args:
+        text_nodes: The paragraphs to summarize.
+        min_size: The minimum number of characters to include.
+        max_size: The maximum number of characters to include.
+
+    Returns:
+        A summary of the text nodes, or None if that was not possible.
+    """
+
     # TODO: Respect sentences?
 
     description = ""
@@ -868,7 +910,7 @@ def summarize_paragraphs(
             new_desc = ""
 
             # This splits the paragraph into words, but keeping the
-            # (preceeding) whitespace intact so we can easily concat
+            # (preceding) whitespace intact so we can easily concat
             # words back together.
             for match in re.finditer(r"\s*\S+", description):
                 word = match.group()
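
The size bounds are compared against len() of the accumulated text. A hypothetical call (invented input):

```python
summary = summarize_paragraphs(
    ["A short first paragraph.", "A much longer second paragraph. " * 30],
    min_size=200,
    max_size=500,
)
# Whole paragraphs are appended until the text reaches min_size; anything past
# max_size is cut at a word boundary and an ellipsis is appended.
assert summary is not None and len(summary) <= 501
```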

tests/rest/media/v1/test_url_preview.py

@@ -24,6 +24,7 @@ from synapse.config.oembed import OEmbedEndpointConfig
 
 from tests import unittest
 from tests.server import FakeTransport
+from tests.test_utils import SMALL_PNG
 
 try:
     import lxml
@@ -576,13 +577,6 @@ class URLPreviewTests(unittest.HomeserverTestCase):
         }
         oembed_content = json.dumps(result).encode("utf-8")
 
-        end_content = (
-            b"<html><head>"
-            b"<title>Some Title</title>"
-            b'<meta property="og:description" content="hi" />'
-            b"</head></html>"
-        )
-
         channel = self.make_request(
             "GET",
             "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
@@ -606,6 +600,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
 
         self.pump()
 
+        # Ensure a second request is made to the photo URL.
         client = self.reactor.tcpClients[1][2].buildProtocol(None)
         server = AccumulatingProtocol()
         server.makeConnection(FakeTransport(client, self.reactor))
@@ -613,18 +608,23 @@ class URLPreviewTests(unittest.HomeserverTestCase):
         client.dataReceived(
             (
                 b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+                b"Content-Type: image/png\r\n\r\n"
             )
-            % (len(end_content),)
-            + end_content
+            % (len(SMALL_PNG),)
+            + SMALL_PNG
         )
 
         self.pump()
 
+        # Ensure the URL is what was requested.
+        self.assertIn(b"/matrixdotorg", server.data)
+
         self.assertEqual(channel.code, 200)
-        self.assertEqual(
-            channel.json_body, {"og:title": "Some Title", "og:description": "hi"}
-        )
+        self.assertIsNone(channel.json_body["og:title"])
+        self.assertTrue(channel.json_body["og:image"].startswith("mxc://"))
+        self.assertEqual(channel.json_body["og:image:height"], 1)
+        self.assertEqual(channel.json_body["og:image:width"], 1)
+        self.assertEqual(channel.json_body["og:image:type"], "image/png")
 
     def test_oembed_rich(self):
         """Test an oEmbed endpoint which returns HTML content via the 'rich' type."""